diff --git a/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp b/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp new file mode 100644 index 0000000000..ed651db33e --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp @@ -0,0 +1,2053 @@ +/* BPlusTree - BFS B+Tree implementation +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** Roughly based on 'btlib' written by Marcus J. Ranum +** +** Copyright (c) 2001-2002 pinc Software. All Rights Reserved. +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include "Debug.h" +#include "cpp.h" +#include "BPlusTree.h" +#include "Inode.h" +#include "Utility.h" +#include "Stack.h" + +#include + +#include +#include +#include + + +// Node Caching for the BPlusTree class +// +// With write support, there is the need for a function that allocates new +// nodes by either returning empty nodes, or by growing the file's data stream +// +// !! The CachedNode class assumes that you have properly locked the stream +// !! before asking for nodes. +// +// Note: This code will fail if the block size is smaller than the node size! +// Since BFS supports block sizes of 1024 bytes or greater, and the node size +// is hard-coded to 1024 bytes, that's not an issue now. 
+ +void +CachedNode::Unset() +{ + if (fTree == NULL || fTree->fStream == NULL) + return; + + if (fBlock != NULL) { + release_block(fTree->fStream->GetVolume()->Device(),fBlockNumber); + + fBlock = NULL; + fNode = NULL; + } +} + + +bplustree_node * +CachedNode::SetTo(off_t offset,bool check) +{ + if (fTree == NULL || fTree->fStream == NULL) { + REPORT_ERROR(B_BAD_VALUE); + return NULL; + } + + Unset(); + + // You can only ask for nodes at valid positions - you can't + // even access the b+tree header with this method (use SetToHeader() + // instead) + if (offset > fTree->fHeader->maximum_size - fTree->fNodeSize + || offset <= 0 + || (offset % fTree->fNodeSize) != 0) + return NULL; + + if (InternalSetTo(offset) != NULL && check) { + // sanity checks (links, all_key_count) + bplustree_header *header = fTree->fHeader; + if (!header->IsValidLink(fNode->left_link) + || !header->IsValidLink(fNode->right_link) + || !header->IsValidLink(fNode->overflow_link) + || (int8 *)fNode->Values() + fNode->all_key_count * sizeof(off_t) > + (int8 *)fNode + fTree->fNodeSize) { + FATAL(("invalid node read from offset %Ld, inode at %Ld\n", + offset,fTree->fStream->ID())); + return NULL; + } + } + return fNode; +} + + +bplustree_header * +CachedNode::SetToHeader() +{ + if (fTree == NULL || fTree->fStream == NULL) { + REPORT_ERROR(B_BAD_VALUE); + return NULL; + } + + Unset(); + + InternalSetTo(0LL); + return (bplustree_header *)fNode; +} + + +bplustree_node * +CachedNode::InternalSetTo(off_t offset) +{ + fNode = NULL; + + off_t fileOffset; + block_run run; + if (offset < fTree->fStream->Size() + && fTree->fStream->FindBlockRun(offset,run,fileOffset) == B_OK) { + Volume *volume = fTree->fStream->GetVolume(); + + int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); + fBlockNumber = volume->ToBlock(run) + blockOffset; + + fBlock = (uint8 *)get_block(volume->Device(),fBlockNumber,volume->BlockSize()); + if (fBlock) { + // the node is somewhere in that block... 
(confusing offset calculation) + fNode = (bplustree_node *)(fBlock + offset - + (fileOffset + blockOffset * volume->BlockSize())); + } else + REPORT_ERROR(B_IO_ERROR); + } + return fNode; +} + + +status_t +CachedNode::Free(Transaction *transaction,off_t offset) +{ + if (transaction == NULL || fTree == NULL || fTree->fStream == NULL + || offset == BPLUSTREE_NULL) + RETURN_ERROR(B_BAD_VALUE); + + // ToDo: scan the free nodes list and remove all nodes at the end + // of the tree - perhaps that shouldn't be done everytime that + // function is called, perhaps it should be done when the directory + // inode is closed or based on some calculation or whatever... + + // if the node is the last one in the tree, we shrink + // the tree and file size by one node + off_t lastOffset = fTree->fHeader->maximum_size - fTree->fNodeSize; + if (offset == lastOffset) { + fTree->fHeader->maximum_size = lastOffset; + + status_t status = fTree->fStream->SetFileSize(transaction,lastOffset); + if (status < B_OK) + return status; + + return fTree->fCachedHeader.WriteBack(transaction); + } + + // add the node to the free nodes list + fNode->left_link = fTree->fHeader->free_node_pointer; + fNode->overflow_link = BPLUSTREE_FREE; + + if (WriteBack(transaction) == B_OK) { + fTree->fHeader->free_node_pointer = offset; + return fTree->fCachedHeader.WriteBack(transaction); + } + return B_ERROR; +} + + +status_t +CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset) +{ + if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL + || fTree->fStream == NULL) { + RETURN_ERROR(B_BAD_VALUE); + } + + status_t status; + + // if there are any free nodes, recycle them + if (SetTo(fTree->fHeader->free_node_pointer,false) != NULL) { + *_offset = fTree->fHeader->free_node_pointer; + + // set new free node pointer + fTree->fHeader->free_node_pointer = fNode->left_link; + if ((status = fTree->fCachedHeader.WriteBack(transaction)) == B_OK) { + fNode->Initialize(); + *_node 
= fNode; + return B_OK; + } + return status; + } + // allocate space for a new node + Inode *stream = fTree->fStream; + if ((status = stream->Append(transaction,fTree->fNodeSize)) < B_OK) + return status; + + // the maximum_size has to be changed before the call to SetTo() - or + // else it will fail because the requested node is out of bounds + off_t offset = fTree->fHeader->maximum_size; + fTree->fHeader->maximum_size += fTree->fNodeSize; + + if (SetTo(offset,false) != NULL) { + *_offset = offset; + + if (fTree->fCachedHeader.WriteBack(transaction) >= B_OK) { + fNode->Initialize(); + *_node = fNode; + return B_OK; + } + } + RETURN_ERROR(B_ERROR); +} + + +status_t +CachedNode::WriteBack(Transaction *transaction) +{ + if (transaction == NULL || fTree == NULL || fTree->fStream == NULL || fNode == NULL) + RETURN_ERROR(B_BAD_VALUE); + + return transaction->WriteBlocks(fBlockNumber,fBlock); +} + + +// #pragma mark - + + +BPlusTree::BPlusTree(Transaction *transaction,Inode *stream,int32 nodeSize) + : + fStream(NULL), + fHeader(NULL), + fCachedHeader(this) +{ + SetTo(transaction,stream); +} + + +BPlusTree::BPlusTree(Inode *stream) + : + fStream(NULL), + fHeader(NULL), + fCachedHeader(this) +{ + SetTo(stream); +} + + +BPlusTree::BPlusTree() + : + fStream(NULL), + fHeader(NULL), + fCachedHeader(this), + fNodeSize(BPLUSTREE_NODE_SIZE), + fAllowDuplicates(true), + fStatus(B_NO_INIT) +{ +} + + +BPlusTree::~BPlusTree() +{ + // if there are any TreeIterators left, we need to stop them + // (can happen when the tree's inode gets deleted while + // traversing the tree - a TreeIterator doesn't lock the inode) + if (fIteratorLock.Lock() < B_OK) + return; + + TreeIterator *iterator = NULL; + while ((iterator = fIterators.Next(iterator)) != NULL) + iterator->Stop(); + + fIteratorLock.Unlock(); +} + + +status_t +BPlusTree::SetTo(Transaction *transaction,Inode *stream,int32 nodeSize) +{ + // initializes in-memory B+Tree + + fCachedHeader.Unset(); + fStream = stream; + + fHeader = 
fCachedHeader.SetToHeader(); + if (fHeader == NULL) { + // allocate space for new header + node! + fStatus = stream->SetFileSize(transaction,nodeSize * 2); + if (fStatus < B_OK) + RETURN_ERROR(fStatus); + + fHeader = fCachedHeader.SetToHeader(); + if (fHeader == NULL) + RETURN_ERROR(fStatus = B_ERROR); + } + + fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR + && stream->BlockRun() != stream->Parent()) + || (stream->Mode() & S_ALLOW_DUPS) != 0; + + fNodeSize = nodeSize; + + // initialize b+tree header + fHeader->magic = BPLUSTREE_MAGIC; + fHeader->node_size = fNodeSize; + fHeader->max_number_of_levels = 1; + fHeader->data_type = ModeToKeyType(stream->Mode()); + fHeader->root_node_pointer = nodeSize; + fHeader->free_node_pointer = BPLUSTREE_NULL; + fHeader->maximum_size = nodeSize * 2; + + if (fCachedHeader.WriteBack(transaction) < B_OK) + RETURN_ERROR(fStatus = B_ERROR); + + // initialize b+tree root node + CachedNode cached(this,fHeader->root_node_pointer,false); + if (cached.Node() == NULL) + RETURN_ERROR(B_ERROR); + + cached.Node()->Initialize(); + return fStatus = cached.WriteBack(transaction); +} + + +status_t +BPlusTree::SetTo(Inode *stream) +{ + if (stream == NULL || stream->Node() == NULL) + RETURN_ERROR(fStatus = B_BAD_VALUE); + + // get on-disk B+Tree header + + fCachedHeader.Unset(); + fStream = stream; + + fHeader = fCachedHeader.SetToHeader(); + if (fHeader == NULL) + RETURN_ERROR(fStatus = B_NO_INIT); + + // is header valid? 
+ + if (fHeader->magic != BPLUSTREE_MAGIC + || fHeader->maximum_size != stream->Size() + || (fHeader->root_node_pointer % fHeader->node_size) != 0 + || !fHeader->IsValidLink(fHeader->root_node_pointer) + || !fHeader->IsValidLink(fHeader->free_node_pointer)) + RETURN_ERROR(fStatus = B_BAD_DATA); + + fNodeSize = fHeader->node_size; + + { + uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, + S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; + uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX + | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); + + if (fHeader->data_type > BPLUSTREE_DOUBLE_TYPE + || (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->data_type] != mode + || !stream->IsDirectory()) { + D( dump_bplustree_header(fHeader); + dump_inode(stream->Node()); + ); + RETURN_ERROR(fStatus = B_BAD_TYPE); + } + + // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused + // in the original BFS code - we will honour it nevertheless + fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR + && stream->BlockRun() != stream->Parent()) + || (stream->Mode() & S_ALLOW_DUPS) != 0; + } + + CachedNode cached(this,fHeader->root_node_pointer); + RETURN_ERROR(fStatus = cached.Node() ? 
B_OK : B_BAD_DATA); +} + + +status_t +BPlusTree::InitCheck() +{ + return fStatus; +} + + +int32 +BPlusTree::TypeCodeToKeyType(type_code code) +{ + switch (code) { + case B_STRING_TYPE: + return BPLUSTREE_STRING_TYPE; + case B_INT32_TYPE: + return BPLUSTREE_INT32_TYPE; + case B_UINT32_TYPE: + return BPLUSTREE_UINT32_TYPE; + case B_INT64_TYPE: + return BPLUSTREE_INT64_TYPE; + case B_UINT64_TYPE: + return BPLUSTREE_UINT64_TYPE; + case B_FLOAT_TYPE: + return BPLUSTREE_FLOAT_TYPE; + case B_DOUBLE_TYPE: + return BPLUSTREE_DOUBLE_TYPE; + } + return -1; +} + + +int32 +BPlusTree::ModeToKeyType(mode_t mode) +{ + switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX + | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { + case S_INT_INDEX: + return BPLUSTREE_INT32_TYPE; + case S_UINT_INDEX: + return BPLUSTREE_UINT32_TYPE; + case S_LONG_LONG_INDEX: + return BPLUSTREE_INT64_TYPE; + case S_ULONG_LONG_INDEX: + return BPLUSTREE_UINT64_TYPE; + case S_FLOAT_INDEX: + return BPLUSTREE_FLOAT_TYPE; + case S_DOUBLE_INDEX: + return BPLUSTREE_DOUBLE_TYPE; + case S_STR_INDEX: + default: + // default is for standard directories + return BPLUSTREE_STRING_TYPE; + } +} + + +// #pragma mark - + + +void +BPlusTree::UpdateIterators(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change) +{ + // Although every iterator which is affected by this update currently + // waits on a semaphore, other iterators could be added/removed at + // any time, so we need to protect this loop + if (fIteratorLock.Lock() < B_OK) + return; + + TreeIterator *iterator = NULL; + while ((iterator = fIterators.Next(iterator)) != NULL) + iterator->Update(offset,nextOffset,keyIndex,splitAt,change); + + fIteratorLock.Unlock(); +} + + +void +BPlusTree::AddIterator(TreeIterator *iterator) +{ + if (fIteratorLock.Lock() < B_OK) + return; + + fIterators.Add(iterator); + + fIteratorLock.Unlock(); +} + + +void +BPlusTree::RemoveIterator(TreeIterator *iterator) +{ + if 
(fIteratorLock.Lock() < B_OK) + return; + + fIterators.Remove(iterator); + + fIteratorLock.Unlock(); +} + + +int32 +BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) +{ + type_code type = 0; + switch (fHeader->data_type) + { + case BPLUSTREE_STRING_TYPE: + type = B_STRING_TYPE; + break; + case BPLUSTREE_INT32_TYPE: + type = B_INT32_TYPE; + break; + case BPLUSTREE_UINT32_TYPE: + type = B_UINT32_TYPE; + break; + case BPLUSTREE_INT64_TYPE: + type = B_INT64_TYPE; + break; + case BPLUSTREE_UINT64_TYPE: + type = B_UINT64_TYPE; + break; + case BPLUSTREE_FLOAT_TYPE: + type = B_FLOAT_TYPE; + break; + case BPLUSTREE_DOUBLE_TYPE: + type = B_DOUBLE_TYPE; + break; + } + return compareKeys(type,key1,keyLength1,key2,keyLength2); +} + + +status_t +BPlusTree::FindKey(bplustree_node *node,const uint8 *key,uint16 keyLength,uint16 *index,off_t *next) +{ + if (node->all_key_count == 0) + { + if (index) + *index = 0; + if (next) + *next = node->overflow_link; + return B_ENTRY_NOT_FOUND; + } + + off_t *values = node->Values(); + int16 saveIndex; + + // binary search in the key array + for (int16 first = 0,last = node->all_key_count - 1;first <= last;) + { + uint16 i = (first + last) >> 1; + + uint16 searchLength; + uint8 *searchKey = node->KeyAt(i,&searchLength); + if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize + || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { + fStream->GetVolume()->Panic(); + RETURN_ERROR(B_BAD_DATA); + } + + int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength); + if (cmp < 0) + { + last = i - 1; + saveIndex = i; + } + else if (cmp > 0) + { + saveIndex = first = i + 1; + } + else + { + if (index) + *index = i; + if (next) + *next = values[i]; + return B_OK; + } + } + + if (index) + *index = saveIndex; + if (next) + { + if (saveIndex == node->all_key_count) + *next = node->overflow_link; + else + *next = values[saveIndex]; + } + return B_ENTRY_NOT_FOUND; +} + + +/** Prepares the 
stack to contain all nodes that were passed while + * following the key, from the root node to the leaf node that could + * or should contain that key. + */ + +status_t +BPlusTree::SeekDown(Stack &stack,const uint8 *key,uint16 keyLength) +{ + // set the root node to begin with + node_and_key nodeAndKey; + nodeAndKey.nodeOffset = fHeader->root_node_pointer; + + CachedNode cached(this); + bplustree_node *node; + while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { + // if we are already on leaf level, we're done + if (node->overflow_link == BPLUSTREE_NULL) { + // node that the keyIndex is not properly set here (but it's not + // needed in the calling functions anyway)! + nodeAndKey.keyIndex = 0; + stack.Push(nodeAndKey); + return B_OK; + } + + off_t nextOffset; + status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset); + + if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) + RETURN_ERROR(B_ERROR); + + // put the node offset & the correct keyIndex on the stack + stack.Push(nodeAndKey); + + nodeAndKey.nodeOffset = nextOffset; + } + RETURN_ERROR(B_ERROR); +} + + +status_t +BPlusTree::FindFreeDuplicateFragment(bplustree_node *node,CachedNode *cached,off_t *_offset,bplustree_node **_fragment,uint32 *_index) +{ + off_t *values = node->Values(); + for (int32 i = 0;i < node->all_key_count;i++) { + // does the value link to a duplicate fragment? 
+ if (bplustree_node::LinkType(values[i]) != BPLUSTREE_DUPLICATE_FRAGMENT) + continue; + + bplustree_node *fragment = cached->SetTo(bplustree_node::FragmentOffset(values[i]),false); + if (fragment == NULL) { + FATAL(("Could not get duplicate fragment at %Ld\n",values[i])); + continue; + } + + // see if there is some space left for us + int32 num = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1); + for (int32 j = 0;j < num;j++) { + duplicate_array *array = fragment->FragmentAt(j); + + if (array->count == 0) { + *_offset = bplustree_node::FragmentOffset(values[i]); + *_fragment = fragment; + *_index = j; + return B_OK; + } + } + } + return B_ENTRY_NOT_FOUND; +} + + +status_t +BPlusTree::InsertDuplicate(Transaction *transaction,CachedNode *cached,bplustree_node *node,uint16 index,off_t value) +{ + CachedNode cachedDuplicate(this); + off_t *values = node->Values(); + off_t oldValue = values[index]; + status_t status; + off_t offset; + + if (bplustree_node::IsDuplicate(oldValue)) { + // + // If it's a duplicate fragment, try to insert it into that, or if it + // doesn't fit anymore, create a new duplicate node + // + if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { + bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue),false); + if (duplicate == NULL) + return B_IO_ERROR; + + duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); + if (array->count > NUM_FRAGMENT_VALUES + || array->count < 1) { + FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),bplustree_node::FragmentOffset(oldValue),array->count)); + return B_BAD_DATA; + } + + if (array->count < NUM_FRAGMENT_VALUES) { + array->Insert(value); + } else { + // test if the fragment will be empty if we remove this key's values + if (duplicate->FragmentsUsed(fNodeSize) < 2) { + // the node will be empty without our values, so let us + // reuse it as a duplicate node 
+ offset = bplustree_node::FragmentOffset(oldValue); + + memmove(duplicate->DuplicateArray(),array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); + duplicate->left_link = duplicate->right_link = BPLUSTREE_NULL; + + array = duplicate->DuplicateArray(); + array->Insert(value); + } else { + // create a new duplicate node + CachedNode cachedNewDuplicate(this); + bplustree_node *newDuplicate; + status = cachedNewDuplicate.Allocate(transaction,&newDuplicate,&offset); + if (status < B_OK) + return status; + + // copy the array from the fragment node to the duplicate node + // and free the old entry (by zero'ing all values) + newDuplicate->overflow_link = array->count; + memcpy(&newDuplicate->all_key_count,&array->values[0],array->count * sizeof(off_t)); + memset(array,0,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); + + array = newDuplicate->DuplicateArray(); + array->Insert(value); + + // if this fails, the old fragments node will contain wrong + // data... (but since it couldn't be written, it shouldn't + // be fatal) + if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK) + return status; + } + + // update the main pointer to link to a duplicate node + values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE,offset); + if ((status = cached->WriteBack(transaction)) < B_OK) + return status; + } + + return cachedDuplicate.WriteBack(transaction); + } + + // + // Put the value into a dedicated duplicate node + // + + // search for free space in the duplicate nodes of that key + duplicate_array *array; + bplustree_node *duplicate; + off_t duplicateOffset; + do { + duplicateOffset = bplustree_node::FragmentOffset(oldValue); + duplicate = cachedDuplicate.SetTo(duplicateOffset,false); + if (duplicate == NULL) + return B_IO_ERROR; + + array = duplicate->DuplicateArray(); + if (array->count > NUM_DUPLICATE_VALUES + || array->count < 0) { + FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",duplicateOffset,array->count)); + return B_BAD_DATA; + } 
+ } while (array->count >= NUM_DUPLICATE_VALUES && (oldValue = duplicate->right_link) != BPLUSTREE_NULL); + + if (array->count < NUM_DUPLICATE_VALUES) { + array->Insert(value); + } else { + // no space left - add a new duplicate node + + CachedNode cachedNewDuplicate(this); + bplustree_node *newDuplicate; + status = cachedNewDuplicate.Allocate(transaction,&newDuplicate,&offset); + if (status < B_OK) + return status; + + // link the two nodes together + duplicate->right_link = offset; + newDuplicate->left_link = duplicateOffset; + + array = newDuplicate->DuplicateArray(); + array->count = 0; + array->Insert(value); + + status = cachedNewDuplicate.WriteBack(transaction); + if (status < B_OK) + return status; + } + return cachedDuplicate.WriteBack(transaction); + } + + // + // Search for a free duplicate fragment or create a new one + // to insert the duplicate value into + // + + uint32 fragmentIndex = 0; + bplustree_node *fragment; + if (FindFreeDuplicateFragment(node,&cachedDuplicate,&offset,&fragment,&fragmentIndex) < B_OK) { + // allocate a new duplicate fragment node + if ((status = cachedDuplicate.Allocate(transaction,&fragment,&offset)) < B_OK) + return status; + + memset(fragment,0,fNodeSize); + } + duplicate_array *array = fragment->FragmentAt(fragmentIndex); + array->Insert(oldValue); + array->Insert(value); + + if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK) + return status; + + values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,offset,fragmentIndex); + + return cached->WriteBack(transaction); +} + + +void +BPlusTree::InsertKey(bplustree_node *node,uint16 index,uint8 *key,uint16 keyLength,off_t value) +{ + // should never happen, but who knows? 
+ if (index > node->all_key_count) + return; + + off_t *values = node->Values(); + uint16 *keyLengths = node->KeyLengths(); + uint8 *keys = node->Keys(); + + node->all_key_count++; + node->all_key_length += keyLength; + + off_t *newValues = node->Values(); + uint16 *newKeyLengths = node->KeyLengths(); + + // move values and copy new value into them + memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index)); + memmove(newValues,values,sizeof(off_t) * index); + + newValues[index] = value; + + // move and update key length index + for (uint16 i = node->all_key_count;i-- > index + 1;) + newKeyLengths[i] = keyLengths[i - 1] + keyLength; + memmove(newKeyLengths,keyLengths,sizeof(uint16) * index); + + int32 keyStart; + newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); + + // move keys and copy new key into them + int32 size = node->all_key_length - newKeyLengths[index]; + if (size > 0) + memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size); + + memcpy(keys + keyStart,key,keyLength); +} + + +status_t +BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,bplustree_node *other,off_t otherOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value) +{ + if (*_keyIndex > node->all_key_count + 1) + return B_BAD_VALUE; + + uint16 *inKeyLengths = node->KeyLengths(); + off_t *inKeyValues = node->Values(); + uint8 *inKeys = node->Keys(); + uint8 *outKeys = other->Keys(); + int32 keyIndex = *_keyIndex; // can become less than zero! + + // how many keys will fit in one (half) page? 
+ // that loop will find the answer to this question and + // change the key lengths indices for their new home + + // "bytes" is the number of bytes written for the new key, + // "bytesBefore" are the bytes before that key + // "bytesAfter" are the bytes after the new key, if any + int32 bytes = 0,bytesBefore = 0,bytesAfter = 0; + + size_t size = fNodeSize >> 1; + int32 out,in; + for (in = out = 0;in < node->all_key_count + 1;) { + if (!bytes) + bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; + + if (in == keyIndex && !bytes) { + bytes = *_keyLength; + } else { + if (keyIndex < out) + bytesAfter = inKeyLengths[in] - bytesBefore; + + in++; + } + out++; + + if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + + out * (sizeof(uint16) + sizeof(off_t)) >= size) { + // we have found the number of keys in the new node! + break; + } + } + + // if the new key was not inserted, set the length of the keys + // that can be copied directly + if (keyIndex >= out && in > 0) + bytesBefore = inKeyLengths[in - 1]; + + if (bytesBefore < 0 || bytesAfter < 0) + return B_BAD_DATA; + + other->left_link = node->left_link; + other->right_link = nodeOffset; + other->all_key_length = bytes + bytesBefore + bytesAfter; + other->all_key_count = out; + + uint16 *outKeyLengths = other->KeyLengths(); + off_t *outKeyValues = other->Values(); + int32 keys = out > keyIndex ? 
keyIndex : out; + + if (bytesBefore) { + // copy the keys + memcpy(outKeys,inKeys,bytesBefore); + memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16)); + memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t)); + } + if (bytes) { + // copy the newly inserted key + memcpy(outKeys + bytesBefore,key,bytes); + outKeyLengths[keyIndex] = bytes + bytesBefore; + outKeyValues[keyIndex] = *_value; + + if (bytesAfter) { + // copy the keys after the new key + memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter); + keys = out - keyIndex - 1; + for (int32 i = 0;i < keys;i++) + outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes; + memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t)); + } + } + + // if the new key was already inserted, we shouldn't use it again + if (in != out) + keyIndex--; + + int32 total = bytesBefore + bytesAfter; + + // these variables are for the key that will be returned + // to the parent node + uint8 *newKey = NULL; + uint16 newLength; + bool newAllocated = false; + + // If we have split an index node, we have to drop the first key + // of the next node (which can also be the new key to insert). + // The dropped key is also the one which has to be inserted in + // the parent node, so we will set the "newKey" already here. + if (node->overflow_link != BPLUSTREE_NULL) { + if (in == keyIndex) { + newKey = key; + newLength = *_keyLength; + + other->overflow_link = *_value; + keyIndex--; + } else { + // If a key is dropped (is not the new key), we have to copy + // it, because it would be lost if not. 
+ uint8 *droppedKey = node->KeyAt(in,&newLength); + if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize + || newLength > BPLUSTREE_MAX_KEY_LENGTH) { + fStream->GetVolume()->Panic(); + RETURN_ERROR(B_BAD_DATA); + } + newKey = (uint8 *)malloc(newLength); + if (newKey == NULL) + return B_NO_MEMORY; + memcpy(newKey,droppedKey,newLength); + + other->overflow_link = inKeyValues[in]; + total = inKeyLengths[in++]; + } + } + + // and now the same game for the other page and the rest of the keys + // (but with memmove() instead of memcpy(), because they may overlap) + + bytesBefore = bytesAfter = bytes = 0; + out = 0; + int32 skip = in; + while (in < node->all_key_count + 1) { + if (in == keyIndex && !bytes) { + // it's enough to set bytesBefore once here, because we do + // not need to know the exact length of all keys in this + // loop + bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; + bytes = *_keyLength; + } else { + if (in < node->all_key_count) { + inKeyLengths[in] -= total; + if (bytes) { + inKeyLengths[in] += bytes; + bytesAfter = inKeyLengths[in] - bytesBefore - bytes; + } + } + in++; + } + + out++; + + // break out when all keys are done + if (in > node->all_key_count && keyIndex < in) + break; + } + + // adjust the byte counts (since we were a bit lazy in the loop) + if (keyIndex >= in && keyIndex - skip < out) + bytesAfter = inKeyLengths[in] - bytesBefore - total; + else if (keyIndex < skip) + bytesBefore = node->all_key_length - total; + + if (bytesBefore < 0 || bytesAfter < 0) + return B_BAD_DATA; + + node->left_link = otherOffset; + // right link, and overflow link can stay the same + node->all_key_length = bytes + bytesBefore + bytesAfter; + node->all_key_count = out - 1; + + // array positions have changed + outKeyLengths = node->KeyLengths(); + outKeyValues = node->Values(); + + // move the keys in the old node: the order is important here, + // because we don't want to overwrite any contents + + keys = keyIndex <= 
skip ? out : keyIndex - skip; + keyIndex -= skip; + + if (bytesBefore) + memmove(inKeys,inKeys + total,bytesBefore); + if (bytesAfter) + memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter); + + if (bytesBefore) + memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16)); + in = out - keyIndex - 1; + if (bytesAfter) + memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16)); + + if (bytesBefore) + memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t)); + if (bytesAfter) + memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t)); + + if (bytes) { + // finally, copy the newly inserted key (don't overwrite anything) + memcpy(inKeys + bytesBefore,key,bytes); + outKeyLengths[keyIndex] = bytes + bytesBefore; + outKeyValues[keyIndex] = *_value; + } + + // Prepare the key that will be inserted in the parent node which + // is either the dropped key or the last of the other node. + // If it's the dropped key, "newKey" was already set earlier. 
+ + if (newKey == NULL) + newKey = other->KeyAt(other->all_key_count - 1,&newLength); + + memcpy(key,newKey,newLength); + *_keyLength = newLength; + *_value = otherOffset; + + if (newAllocated) + free(newKey); + + return B_OK; +} + + +status_t +BPlusTree::Insert(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value) +{ + if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) + RETURN_ERROR(B_BAD_VALUE); + + // lock access to stream + WriteLocked locked(fStream->Lock()); + + Stack stack; + if (SeekDown(stack,key,keyLength) != B_OK) + RETURN_ERROR(B_ERROR); + + uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; + + memcpy(keyBuffer,key,keyLength); + keyBuffer[keyLength] = 0; + + node_and_key nodeAndKey; + bplustree_node *node; + + CachedNode cached(this); + while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { + if (node->IsLeaf()) { + // first round, check for duplicate entries + status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); + + // is this a duplicate entry? + if (status == B_OK) { + if (fAllowDuplicates) + return InsertDuplicate(transaction,&cached,node,nodeAndKey.keyIndex,value); + else + RETURN_ERROR(B_NAME_IN_USE); + } + } + + // is the node big enough to hold the pair? + if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength) + + (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) + { + InsertKey(node,nodeAndKey.keyIndex,keyBuffer,keyLength,value); + UpdateIterators(nodeAndKey.nodeOffset,BPLUSTREE_NULL,nodeAndKey.keyIndex,0,1); + + return cached.WriteBack(transaction); + } else { + CachedNode cachedNewRoot(this); + CachedNode cachedOther(this); + + // do we need to allocate a new root node? 
if so, then do + // it now + off_t newRoot = BPLUSTREE_NULL; + if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) { + bplustree_node *root; + status_t status = cachedNewRoot.Allocate(transaction,&root,&newRoot); + if (status < B_OK) { + // The tree is most likely corrupted! + // But it's still sane at leaf level - we could set + // a flag in the header that forces the tree to be + // rebuild next time... + // But since we will have journaling, that's not a big + // problem anyway. + RETURN_ERROR(status); + } + } + + // reserve space for the other node + bplustree_node *other; + off_t otherOffset; + status_t status = cachedOther.Allocate(transaction,&other,&otherOffset); + if (status < B_OK) { + cachedNewRoot.Free(transaction,newRoot); + RETURN_ERROR(status); + } + + if (SplitNode(node,nodeAndKey.nodeOffset,other,otherOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&value) < B_OK) { + // free root node & other node here + cachedNewRoot.Free(transaction,newRoot); + cachedOther.Free(transaction,otherOffset); + + RETURN_ERROR(B_ERROR); + } + + // write the updated nodes back + + if (cached.WriteBack(transaction) < B_OK + || cachedOther.WriteBack(transaction) < B_OK) + RETURN_ERROR(B_ERROR); + + UpdateIterators(nodeAndKey.nodeOffset,otherOffset,nodeAndKey.keyIndex,node->all_key_count,1); + + // update the right link of the node in the left of the new node + if ((other = cachedOther.SetTo(other->left_link)) != NULL) { + other->right_link = otherOffset; + if (cachedOther.WriteBack(transaction) < B_OK) + RETURN_ERROR(B_ERROR); + } + + // create a new root if necessary + if (newRoot != BPLUSTREE_NULL) { + bplustree_node *root = cachedNewRoot.Node(); + + InsertKey(root,0,keyBuffer,keyLength,node->left_link); + root->overflow_link = nodeAndKey.nodeOffset; + + if (cachedNewRoot.WriteBack(transaction) < B_OK) + RETURN_ERROR(B_ERROR); + + // finally, update header to point to the new root + fHeader->root_node_pointer = newRoot; + fHeader->max_number_of_levels++; + + 
return fCachedHeader.WriteBack(transaction); + } + } + } + RETURN_ERROR(B_ERROR); +} + + +status_t +BPlusTree::RemoveDuplicate(Transaction *transaction,bplustree_node *node,CachedNode *cached,uint16 index,off_t value) +{ + CachedNode cachedDuplicate(this); + off_t *values = node->Values(); + off_t oldValue = values[index]; + status_t status; + + off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); + bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset,false); + if (duplicate == NULL) + return B_IO_ERROR; + + // if it's a duplicate fragment, remove the entry from there + if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { + duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); + + if (array->count > NUM_FRAGMENT_VALUES + || array->count < 1) { + FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),duplicateOffset,array->count)); + return B_BAD_DATA; + } + if (!array->Remove(value)) + FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",value,duplicateOffset)); + + // remove the array from the fragment node if it is empty + if (array->count == 1) { + // set the link to the remaining value + values[index] = array->values[0]; + + // Remove the whole fragment node, if this was the only array, + // otherwise free the array and write the changes back + if (duplicate->FragmentsUsed(fNodeSize) == 1) + status = cachedDuplicate.Free(transaction,duplicateOffset); + else { + array->count = 0; + status = cachedDuplicate.WriteBack(transaction); + } + if (status < B_OK) + return status; + + return cached->WriteBack(transaction); + } + return cachedDuplicate.WriteBack(transaction); + } + + // + // Remove value from a duplicate node! 
+ // + + duplicate_array *array; + + if (duplicate->left_link != BPLUSTREE_NULL) { + FATAL(("invalid duplicate node: first left link points to %Ld!\n",duplicate->left_link)); + return B_BAD_DATA; + } + + // Search the duplicate nodes until the entry could be found (and removed) + while (duplicate != NULL) { + array = duplicate->DuplicateArray(); + if (array->count > NUM_DUPLICATE_VALUES + || array->count < 0) { + FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",duplicateOffset,array->count)); + return B_BAD_DATA; + } + + if (array->Remove(value)) + break; + + if ((duplicateOffset = duplicate->right_link) == BPLUSTREE_NULL) + RETURN_ERROR(B_ENTRY_NOT_FOUND); + + duplicate = cachedDuplicate.SetTo(duplicateOffset,false); + } + if (duplicate == NULL) + RETURN_ERROR(B_IO_ERROR); + + while (true) { + off_t left = duplicate->left_link; + off_t right = duplicate->right_link; + bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; + + if (isLast && array->count == 1 || array->count == 0) { + // Free empty duplicate page, link their siblings together, and + // update the duplicate link if needed (which should not be, if + // we are the only one working on that tree...) 
+ + if (duplicateOffset == bplustree_node::FragmentOffset(oldValue) + || array->count == 1) { + if (array->count == 1 && isLast) + values[index] = array->values[0]; + else if (isLast) { + FATAL(("removed last value from duplicate!\n")); + } else + values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE,right); + + if ((status = cached->WriteBack(transaction)) < B_OK) + return status; + } + + if ((status = cachedDuplicate.Free(transaction,duplicateOffset)) < B_OK) + return status; + + if (left != BPLUSTREE_NULL + && (duplicate = cachedDuplicate.SetTo(left,false)) != NULL) { + duplicate->right_link = right; + + // If the next node is the last node, we need to free that node + // and convert the duplicate entry back into a normal entry + if (right == BPLUSTREE_NULL && duplicate->left_link == BPLUSTREE_NULL + && duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) { + duplicateOffset = left; + continue; + } + + status = cachedDuplicate.WriteBack(transaction); + if (status < B_OK) + return status; + } + if (right != BPLUSTREE_NULL + && (duplicate = cachedDuplicate.SetTo(right,false)) != NULL) { + duplicate->left_link = left; + + // Again, we may need to turn the duplicate entry back into a normal entry + array = duplicate->DuplicateArray(); + if (left == BPLUSTREE_NULL && duplicate->right_link == BPLUSTREE_NULL + && duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) { + duplicateOffset = right; + continue; + } + + return cachedDuplicate.WriteBack(transaction); + } + return status; + } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { + // If the number of entries fits in a duplicate fragment, then + // either find a free fragment node, or convert this node to a + // fragment node. 
+ CachedNode cachedOther(this); + + bplustree_node *fragment = NULL; + uint32 fragmentIndex = 0; + off_t offset; + if (FindFreeDuplicateFragment(node,&cachedOther,&offset,&fragment,&fragmentIndex) < B_OK) { + // convert node + memmove(duplicate,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); + memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1,0,fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); + } else { + // move to other node + duplicate_array *target = fragment->FragmentAt(fragmentIndex); + memcpy(target,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); + + cachedDuplicate.Free(transaction,duplicateOffset); + duplicateOffset = offset; + } + values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,duplicateOffset,fragmentIndex); + + if ((status = cached->WriteBack(transaction)) < B_OK) + return status; + + if (fragment != NULL) + return cachedOther.WriteBack(transaction); + } + return cachedDuplicate.WriteBack(transaction); + } +} + + +/** Removes the key with the given index from the specified node. + * Since it has to get the key from the node anyway (to obtain it's + * pointer), it's not needed to pass the key & its length, although + * the calling method (BPlusTree::Remove()) have this data. + */ + +void +BPlusTree::RemoveKey(bplustree_node *node,uint16 index) +{ + // should never happen, but who knows? 
+ if (index > node->all_key_count && node->all_key_count > 0) { + FATAL(("Asked me to remove key outer limits: %u\n",index)); + return; + } + + off_t *values = node->Values(); + + // if we would have to drop the overflow link, drop + // the last key instead and update the overflow link + // to the value of that one + if (!node->IsLeaf() && index == node->all_key_count) + node->overflow_link = values[--index]; + + uint16 length; + uint8 *key = node->KeyAt(index,&length); + if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize + || length > BPLUSTREE_MAX_KEY_LENGTH) { + FATAL(("Key length to long: %s, %u (inode at %ld,%u [%s])\n",key,length,fStream->BlockRun().allocation_group,fStream->BlockRun().start,fStream->Name())); + fStream->GetVolume()->Panic(); + return; + } + + uint16 *keyLengths = node->KeyLengths(); + uint8 *keys = node->Keys(); + + node->all_key_count--; + node->all_key_length -= length; + + off_t *newValues = node->Values(); + uint16 *newKeyLengths = node->KeyLengths(); + + // move key data + memmove(key,key + length,node->all_key_length - (key - keys)); + + // move and update key lengths + if (index > 0 && newKeyLengths != keyLengths) + memmove(newKeyLengths,keyLengths,index * sizeof(uint16)); + for (uint16 i = index;i < node->all_key_count;i++) + newKeyLengths[i] = keyLengths[i + 1] - length; + + // move values + if (index > 0) + memmove(newValues,values,index * sizeof(off_t)); + if (node->all_key_count > index) + memmove(newValues + index,values + index + 1,(node->all_key_count - index) * sizeof(off_t)); +} + + +/** Removes the specified key from the tree. The "value" parameter is only used + * for trees which allow duplicates, so you may safely ignore it. + * It's not an optional parameter, so at least you have to think about it. 
+ */ + +status_t +BPlusTree::Remove(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value) +{ + if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) + RETURN_ERROR(B_BAD_VALUE); + + // lock access to stream + WriteLocked locked(fStream->Lock()); + + Stack stack; + if (SeekDown(stack,key,keyLength) != B_OK) + RETURN_ERROR(B_ERROR); + + node_and_key nodeAndKey; + bplustree_node *node; + + CachedNode cached(this); + while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) + { + if (node->IsLeaf()) // first round, check for duplicate entries + { + status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); + if (status < B_OK) + RETURN_ERROR(status); + + // If we will remove the last key, the iterator will be set + // to the next node after the current - if there aren't any + // more nodes, we need a way to prevent the TreeIterators to + // touch the old node again, we use BPLUSTREE_FREE for this + off_t next = node->right_link == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->right_link; + UpdateIterators(nodeAndKey.nodeOffset,node->all_key_count == 1 ? + next : BPLUSTREE_NULL,nodeAndKey.keyIndex,0,-1); + + // is this a duplicate entry? 
+ if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) { + if (fAllowDuplicates) + return RemoveDuplicate(transaction,node,&cached,nodeAndKey.keyIndex,value); + else + RETURN_ERROR(B_NAME_IN_USE); + } + } + + // if it's an empty root node, we have to convert it + // to a leaf node by dropping the overflow link, or, + // if it's a leaf node, just empty it + if (nodeAndKey.nodeOffset == fHeader->root_node_pointer + && node->all_key_count == 0 + || node->all_key_count == 1 && node->IsLeaf()) { + node->overflow_link = BPLUSTREE_NULL; + node->all_key_count = 0; + node->all_key_length = 0; + + if (cached.WriteBack(transaction) < B_OK) + return B_IO_ERROR; + + fHeader->max_number_of_levels = 1; + return fCachedHeader.WriteBack(transaction); + } + + // if there is only one key left, we don't have to remove + // it, we can just dump the node (index nodes still have + // the overflow link, so we have to drop the last key) + if (node->all_key_count > 1 + || !node->IsLeaf() && node->all_key_count == 1) { + RemoveKey(node,nodeAndKey.keyIndex); + return cached.WriteBack(transaction); + } + + // when we are here, we can just free the node, but + // we have to update the right/left link of the + // siblings first + CachedNode otherCached(this); + bplustree_node *other = otherCached.SetTo(node->left_link); + if (other != NULL) { + other->right_link = node->right_link; + if (otherCached.WriteBack(transaction) < B_OK) + return B_IO_ERROR; + } + + if ((other = otherCached.SetTo(node->right_link)) != NULL) { + other->left_link = node->left_link; + if (otherCached.WriteBack(transaction) < B_OK) + return B_IO_ERROR; + } + + cached.Free(transaction,nodeAndKey.nodeOffset); + } + RETURN_ERROR(B_ERROR); +} + + +/** Replaces the value for the key in the tree. + * Returns B_OK if the key could be found and its value replaced, + * B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors + * to indicate that something went terribly wrong. 
+ * Note that this doesn't work with duplicates - it will just + * return B_BAD_TYPE if you call this function on a tree where + * duplicates are allowed. + */ + +status_t +BPlusTree::Replace(Transaction *transaction,const uint8 *key,uint16 keyLength,off_t value) +{ + if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH + || key == NULL) + RETURN_ERROR(B_BAD_VALUE); + + if (fAllowDuplicates) + RETURN_ERROR(B_BAD_TYPE); + + // lock access to stream (a read lock is okay for this purpose) + ReadLocked locked(fStream->Lock()); + + off_t nodeOffset = fHeader->root_node_pointer; + CachedNode cached(this); + bplustree_node *node; + + while ((node = cached.SetTo(nodeOffset)) != NULL) { + uint16 keyIndex = 0; + off_t nextOffset; + status_t status = FindKey(node,key,keyLength,&keyIndex,&nextOffset); + + if (node->overflow_link == BPLUSTREE_NULL) { + if (status == B_OK) { + node->Values()[keyIndex] = value; + return cached.WriteBack(transaction); + } + + return status; + } else if (nextOffset == nodeOffset) + RETURN_ERROR(B_ERROR); + + nodeOffset = nextOffset; + } + RETURN_ERROR(B_ERROR); +} + + +/** Searches the key in the tree, and stores the offset found in + * _value, if successful. + * It's very similar to BPlusTree::SeekDown(), but doesn't fill + * a stack while it descends the tree. + * Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND + * if not. It can also return other errors to indicate that + * something went wrong. + * Note that this doesn't work with duplicates - it will just + * return B_BAD_TYPE if you call this function on a tree where + * duplicates are allowed. 
+ */ + +status_t +BPlusTree::Find(const uint8 *key,uint16 keyLength,off_t *_value) +{ + if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH + || key == NULL) + RETURN_ERROR(B_BAD_VALUE); + + if (fAllowDuplicates) + RETURN_ERROR(B_BAD_TYPE); + + // lock access to stream + ReadLocked locked(fStream->Lock()); + + off_t nodeOffset = fHeader->root_node_pointer; + CachedNode cached(this); + bplustree_node *node; + + while ((node = cached.SetTo(nodeOffset)) != NULL) { + uint16 keyIndex = 0; + off_t nextOffset; + status_t status = FindKey(node,key,keyLength,&keyIndex,&nextOffset); + + if (node->overflow_link == BPLUSTREE_NULL) { + if (status == B_OK && _value != NULL) + *_value = node->Values()[keyIndex]; + + return status; + } else if (nextOffset == nodeOffset) + RETURN_ERROR(B_ERROR); + + nodeOffset = nextOffset; + } + RETURN_ERROR(B_ERROR); +} + + +// #pragma mark - + + +TreeIterator::TreeIterator(BPlusTree *tree) + : + fTree(tree), + fCurrentNodeOffset(BPLUSTREE_NULL), + fNext(NULL) +{ + tree->AddIterator(this); +} + + +TreeIterator::~TreeIterator() +{ + if (fTree) + fTree->RemoveIterator(this); +} + + +status_t +TreeIterator::Goto(int8 to) +{ + if (fTree == NULL || fTree->fHeader == NULL) + RETURN_ERROR(B_BAD_VALUE); + + // lock access to stream + ReadLocked locked(fTree->fStream->Lock()); + + off_t nodeOffset = fTree->fHeader->root_node_pointer; + CachedNode cached(fTree); + bplustree_node *node; + + while ((node = cached.SetTo(nodeOffset)) != NULL) { + // is the node a leaf node? + if (node->overflow_link == BPLUSTREE_NULL) { + fCurrentNodeOffset = nodeOffset; + fCurrentKey = to == BPLUSTREE_BEGIN ? 
-1 : node->all_key_count; + fDuplicateNode = BPLUSTREE_NULL; + + return B_OK; + } + + // get the next node offset depending on the direction (and if there + // are any keys in that node at all) + off_t nextOffset; + if (to == BPLUSTREE_END || node->all_key_count == 0) + nextOffset = node->overflow_link; + else { + if (node->all_key_length > fTree->fNodeSize + || (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->all_key_count) + RETURN_ERROR(B_ERROR); + + nextOffset = node->Values()[0]; + } + if (nextOffset == nodeOffset) + break; + + nodeOffset = nextOffset; + } + FATAL(("%s fails\n",__PRETTY_FUNCTION__)); + RETURN_ERROR(B_ERROR); +} + + +/** Iterates through the tree in the specified direction. + * When it iterates through duplicates, the "key" is only updated for the + * first entry - if you need to know when this happens, use the "duplicate" + * parameter which is 0 for no duplicate, 1 for the first, and 2 for all + * the other duplicates. + * That's not too nice, but saves the 256 bytes that would be needed to + * store the last key - if this will ever become an issue, it will be + * easy to change. + * The other advantage of this is, that the queries can skip all duplicates + * at once when they are not relevant to them. + */ + +status_t +TreeIterator::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate) +{ + if (fTree == NULL) + return B_INTERRUPTED; + if (fCurrentNodeOffset == BPLUSTREE_NULL + && Goto(direction == BPLUSTREE_FORWARD ? 
BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) + RETURN_ERROR(B_ERROR); + + // if the tree was emptied since the last call + if (fCurrentNodeOffset == BPLUSTREE_FREE) + return B_ENTRY_NOT_FOUND; + + // lock access to stream + ReadLocked locked(fTree->fStream->Lock()); + + CachedNode cached(fTree); + bplustree_node *node; + + if (fDuplicateNode != BPLUSTREE_NULL) + { + // regardless of traverse direction the duplicates are always presented in + // the same order; since they are all considered as equal, this shouldn't + // cause any problems + + if (!fIsFragment || fDuplicate < fNumDuplicates) + node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),false); + else + node = NULL; + + if (node != NULL) + { + if (!fIsFragment && fDuplicate >= fNumDuplicates) + { + // if the node is out of duplicates, we go directly to the next one + fDuplicateNode = node->right_link; + if (fDuplicateNode != BPLUSTREE_NULL + && (node = cached.SetTo(fDuplicateNode,false)) != NULL) + { + fNumDuplicates = node->CountDuplicates(fDuplicateNode,false); + fDuplicate = 0; + } + } + if (fDuplicate < fNumDuplicates) + { + *value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++); + if (duplicate) + *duplicate = 2; + return B_OK; + } + } + fDuplicateNode = BPLUSTREE_NULL; + } + + off_t savedNodeOffset = fCurrentNodeOffset; + if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) + RETURN_ERROR(B_ERROR); + + if (duplicate) + *duplicate = 0; + + fCurrentKey += direction; + + // is the current key in the current node? + while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count) + || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) + { + fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link; + + // are there any more nodes? + if (fCurrentNodeOffset != BPLUSTREE_NULL) + { + node = cached.SetTo(fCurrentNodeOffset); + if (!node) + RETURN_ERROR(B_ERROR); + + // reset current key + fCurrentKey = direction == BPLUSTREE_FORWARD ? 
0 : node->all_key_count; + } + else + { + // there are no nodes left, so turn back to the last key + fCurrentNodeOffset = savedNodeOffset; + fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1; + + return B_ENTRY_NOT_FOUND; + } + } + + if (node->all_key_count == 0) + RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? + + uint16 length; + uint8 *keyStart = node->KeyAt(fCurrentKey,&length); + if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize + || length > BPLUSTREE_MAX_KEY_LENGTH) { + fTree->fStream->GetVolume()->Panic(); + RETURN_ERROR(B_BAD_DATA); + } + + length = min_c(length,maxLength); + memcpy(key,keyStart,length); + + if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) // terminate string type + { + if (length == maxLength) + length--; + ((char *)key)[length] = '\0'; + } + *keyLength = length; + + off_t offset = node->Values()[fCurrentKey]; + + // duplicate fragments? + uint8 type = bplustree_node::LinkType(offset); + if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) + { + fDuplicateNode = offset; + + node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),false); + if (node == NULL) + RETURN_ERROR(B_ERROR); + + fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; + + fNumDuplicates = node->CountDuplicates(offset,fIsFragment); + if (fNumDuplicates) + { + offset = node->DuplicateAt(offset,fIsFragment,0); + fDuplicate = 1; + if (duplicate) + *duplicate = 1; + } + else + { + // shouldn't happen, but we're dealing here with potentially corrupt disks... + fDuplicateNode = BPLUSTREE_NULL; + offset = 0; + } + } + *value = offset; + + return B_OK; +} + + +/** This is more or less a copy of BPlusTree::Find() - but it just + * sets the current position in the iterator, regardless of if the + * key could be found or not. 
+ */ + +status_t +TreeIterator::Find(const uint8 *key, uint16 keyLength) +{ + if (fTree == NULL) + return B_INTERRUPTED; + if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH + || key == NULL) + RETURN_ERROR(B_BAD_VALUE); + + // lock access to stream + ReadLocked locked(fTree->fStream->Lock()); + + off_t nodeOffset = fTree->fHeader->root_node_pointer; + + CachedNode cached(fTree); + bplustree_node *node; + while ((node = cached.SetTo(nodeOffset)) != NULL) { + uint16 keyIndex = 0; + off_t nextOffset; + status_t status = fTree->FindKey(node,key,keyLength,&keyIndex,&nextOffset); + + if (node->overflow_link == BPLUSTREE_NULL) { + fCurrentNodeOffset = nodeOffset; + fCurrentKey = keyIndex - 1; + fDuplicateNode = BPLUSTREE_NULL; + + return status; + } else if (nextOffset == nodeOffset) + RETURN_ERROR(B_ERROR); + + nodeOffset = nextOffset; + } + RETURN_ERROR(B_ERROR); +} + + +void +TreeIterator::SkipDuplicates() +{ + fDuplicateNode = BPLUSTREE_NULL; +} + + +void +TreeIterator::Update(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change) +{ + if (offset != fCurrentNodeOffset) + return; + + if (nextOffset != BPLUSTREE_NULL) { + fCurrentNodeOffset = nextOffset; + if (splitAt <= fCurrentKey) { + fCurrentKey -= splitAt; + keyIndex -= splitAt; + } + } + + // Adjust fCurrentKey to point to the same key as before. + // Note, that if a key is inserted at the current position + // it won't be included in this tree transition. + if (keyIndex <= fCurrentKey) + fCurrentKey += change; + + // ToDo: duplicate handling! 
+} + + +void +TreeIterator::Stop() +{ + fTree = NULL; +} + + +#ifdef DEBUG +void +TreeIterator::Dump() +{ + __out("TreeIterator at %p:\n",this); + __out("\tfTree = %p\n",fTree); + __out("\tfCurrentNodeOffset = %Ld\n",fCurrentNodeOffset); + __out("\tfCurrentKey = %ld\n",fCurrentKey); + __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",bplustree_node::FragmentOffset(fDuplicateNode),fDuplicateNode,fDuplicateNode); + __out("\tfDuplicate = %u\n",fDuplicate); + __out("\tfNumDuplicates = %u\n",fNumDuplicates); + __out("\tfIsFragment = %s\n",fIsFragment ? "true" : "false"); +} +#endif + + +// #pragma mark - + + +void +bplustree_node::Initialize() +{ + left_link = right_link = overflow_link = BPLUSTREE_NULL; + all_key_count = 0; + all_key_length = 0; +} + + +uint8 * +bplustree_node::KeyAt(int32 index,uint16 *keyLength) const +{ + if (index < 0 || index > all_key_count) + return NULL; + + uint8 *keyStart = Keys(); + uint16 *keyLengths = KeyLengths(); + + *keyLength = keyLengths[index] - (index != 0 ? 
keyLengths[index - 1] : 0); + if (index > 0) + keyStart += keyLengths[index - 1]; + + return keyStart; +} + + +uint8 +bplustree_node::CountDuplicates(off_t offset,bool isFragment) const +{ + // the duplicate fragment handling is currently hard-coded to a node size + // of 1024 bytes - with future versions of BFS, this may be a problem + + if (isFragment) { + uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); + + return ((off_t *)this)[fragment]; + } + return overflow_link; +} + + +off_t +bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const +{ + uint32 start; + if (isFragment) + start = 8 * ((uint64)offset & 0x3ff); + else + start = 2; + + return ((off_t *)this)[start + 1 + index]; +} + + +/** Although the name suggests it, this function doesn't return the real + * used fragment count; at least, it can only count to two: it returns + * 0, if there is no fragment used, 1 if there is only one fragment + * used, and 2 if there are at least 2 fragments used. 
+ */ + +int32 +bplustree_node::FragmentsUsed(uint32 nodeSize) +{ + uint32 used = 0; + for (int32 i = 0;i < nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));i++) { + duplicate_array *array = FragmentAt(i); + if (array->count > 0 && ++used > 1) + return used; + } + return used; +} + + +// #pragma mark - + + +int32 +compareKeys(type_code type,const void *key1, int keyLength1, const void *key2, int keyLength2) +{ + // if one of the keys is NULL, bail out gracefully + if (key1 == NULL || key2 == NULL) + return -1; + + switch (type) + { + case B_STRING_TYPE: + { + int len = min_c(keyLength1,keyLength2); + int result = strncmp((const char *)key1,(const char *)key2,len); + + if (result == 0 + && !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0')) + result = keyLength1 - keyLength2; + + return result; + } + + case B_INT32_TYPE: + return *(int32 *)key1 - *(int32 *)key2; + + case B_UINT32_TYPE: + { + if (*(uint32 *)key1 == *(uint32 *)key2) + return 0; + else if (*(uint32 *)key1 > *(uint32 *)key2) + return 1; + + return -1; + } + + case B_INT64_TYPE: + { + if (*(int64 *)key1 == *(int64 *)key2) + return 0; + else if (*(int64 *)key1 > *(int64 *)key2) + return 1; + + return -1; + } + + case B_UINT64_TYPE: + { + if (*(uint64 *)key1 == *(uint64 *)key2) + return 0; + else if (*(uint64 *)key1 > *(uint64 *)key2) + return 1; + + return -1; + } + + case B_FLOAT_TYPE: + { + float result = *(float *)key1 - *(float *)key2; + if (result == 0.0f) + return 0; + + return (result < 0.0f) ? -1 : 1; + } + + case B_DOUBLE_TYPE: + { + double result = *(double *)key1 - *(double *)key2; + if (result == 0.0) + return 0; + + return (result < 0.0) ? 
-1 : 1; + } + } + return 0; +} + + diff --git a/src/add-ons/kernel/file_systems/bfs/BPlusTree.h b/src/add-ons/kernel/file_systems/bfs/BPlusTree.h new file mode 100644 index 0000000000..402db41f84 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/BPlusTree.h @@ -0,0 +1,436 @@ +#ifndef B_PLUS_TREE_H +#define B_PLUS_TREE_H +/* BPlusTree - BFS B+Tree implementation +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** Roughly based on 'btlib' written by Marcus J. Ranum +** +** Copyright (c) 2001-2002 pinc Software. All Rights Reserved. +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include "bfs.h" +#include "Journal.h" +#include "Chain.h" + + +//****************** on-disk structures ******************** + +#define BPLUSTREE_NULL -1LL +#define BPLUSTREE_FREE -2LL + +struct bplustree_header { + uint32 magic; + uint32 node_size; + uint32 max_number_of_levels; + uint32 data_type; + off_t root_node_pointer; + off_t free_node_pointer; + off_t maximum_size; + + inline bool IsValidLink(off_t link); +}; + +#define BPLUSTREE_MAGIC 0x69f6c2e8 +#define BPLUSTREE_NODE_SIZE 1024 +#define BPLUSTREE_MAX_KEY_LENGTH 256 +#define BPLUSTREE_MIN_KEY_LENGTH 1 + +enum bplustree_types { + BPLUSTREE_STRING_TYPE = 0, + BPLUSTREE_INT32_TYPE = 1, + BPLUSTREE_UINT32_TYPE = 2, + BPLUSTREE_INT64_TYPE = 3, + BPLUSTREE_UINT64_TYPE = 4, + BPLUSTREE_FLOAT_TYPE = 5, + BPLUSTREE_DOUBLE_TYPE = 6 +}; + +struct sorted_array; +typedef sorted_array duplicate_array; + +struct bplustree_node { + off_t left_link; + off_t right_link; + off_t overflow_link; + uint16 all_key_count; + uint16 all_key_length; + + inline uint16 *KeyLengths() const; + inline off_t *Values() const; + inline uint8 *Keys() const; + inline int32 Used() const; + uint8 *KeyAt(int32 index,uint16 *keyLength) const; + + inline bool IsLeaf() const; + + void Initialize(); + uint8 CountDuplicates(off_t offset,bool isFragment) const; + off_t DuplicateAt(off_t offset,bool isFragment,int8 index) const; 
+ int32 FragmentsUsed(uint32 nodeSize); + inline duplicate_array *FragmentAt(int8 index); + inline duplicate_array *DuplicateArray(); + + static inline uint8 LinkType(off_t link); + static inline off_t MakeLink(uint8 type, off_t link, uint32 fragmentIndex = 0); + static inline bool IsDuplicate(off_t link); + static inline off_t FragmentOffset(off_t link); + static inline uint32 FragmentIndex(off_t link); +}; + +//#define BPLUSTREE_NODE 0 +#define BPLUSTREE_DUPLICATE_NODE 2 +#define BPLUSTREE_DUPLICATE_FRAGMENT 3 + +#define NUM_FRAGMENT_VALUES 7 +#define NUM_DUPLICATE_VALUES 125 + +//************************************** + +enum bplustree_traversing { + BPLUSTREE_FORWARD = 1, + BPLUSTREE_BACKWARD = -1, + + BPLUSTREE_BEGIN = 0, + BPLUSTREE_END = 1 +}; + + +//****************** in-memory structures ******************** + +template class Stack; +class BPlusTree; +class TreeIterator; +class CachedNode; +class Inode; + +// needed for searching (utilizing a stack) +struct node_and_key { + off_t nodeOffset; + uint16 keyIndex; +}; + + +//***** Cache handling ***** + +class CachedNode { + public: + CachedNode(BPlusTree *tree) + : + fTree(tree), + fNode(NULL), + fBlock(NULL) + { + } + + CachedNode(BPlusTree *tree,off_t offset,bool check = true) + : + fTree(tree), + fNode(NULL), + fBlock(NULL) + { + SetTo(offset,check); + } + + ~CachedNode() + { + Unset(); + } + + bplustree_node *SetTo(off_t offset,bool check = true); + bplustree_header *SetToHeader(); + void Unset(); + + status_t Free(Transaction *transaction, off_t offset); + status_t Allocate(Transaction *transaction,bplustree_node **node,off_t *offset); + status_t WriteBack(Transaction *transaction); + + bplustree_node *Node() const { return fNode; } + + protected: + bplustree_node *InternalSetTo(off_t offset); + + BPlusTree *fTree; + bplustree_node *fNode; + uint8 *fBlock; + off_t fBlockNumber; +}; + + +//******** B+tree class ********* + +class BPlusTree { + public: + BPlusTree(Transaction *transaction,Inode 
*stream,int32 nodeSize = BPLUSTREE_NODE_SIZE); + BPlusTree(Inode *stream); + BPlusTree(); + ~BPlusTree(); + + status_t SetTo(Transaction *transaction,Inode *stream,int32 nodeSize = BPLUSTREE_NODE_SIZE); + status_t SetTo(Inode *stream); + status_t SetStream(Inode *stream); + + status_t InitCheck(); + status_t Validate(); + + status_t Remove(Transaction *transaction,const uint8 *key, uint16 keyLength, off_t value); + status_t Insert(Transaction *transaction,const uint8 *key, uint16 keyLength, off_t value); + + status_t Insert(Transaction *transaction,const char *key, off_t value); + status_t Insert(Transaction *transaction,int32 key, off_t value); + status_t Insert(Transaction *transaction,uint32 key, off_t value); + status_t Insert(Transaction *transaction,int64 key, off_t value); + status_t Insert(Transaction *transaction,uint64 key, off_t value); + status_t Insert(Transaction *transaction,float key, off_t value); + status_t Insert(Transaction *transaction,double key, off_t value); + + status_t Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value); + status_t Find(const uint8 *key, uint16 keyLength, off_t *value); + + static int32 TypeCodeToKeyType(type_code code); + static int32 ModeToKeyType(mode_t mode); + + private: + int32 CompareKeys(const void *key1, int keylength1, const void *key2, int keylength2); + status_t FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index = NULL, off_t *next = NULL); + status_t SeekDown(Stack &stack, const uint8 *key, uint16 keyLength); + + status_t FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset, bplustree_node **_fragment,uint32 *_index); + status_t InsertDuplicate(Transaction *transaction,CachedNode *cached,bplustree_node *node,uint16 index,off_t value); + void InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength, off_t value); + status_t SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other, off_t 
otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value); + + status_t RemoveDuplicate(Transaction *transaction,bplustree_node *node,CachedNode *cached,uint16 keyIndex, off_t value); + void RemoveKey(bplustree_node *node, uint16 index); + + void UpdateIterators(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change); + void AddIterator(TreeIterator *iterator); + void RemoveIterator(TreeIterator *iterator); + + private: + friend TreeIterator; + friend CachedNode; + + Inode *fStream; + bplustree_header *fHeader; + CachedNode fCachedHeader; + int32 fNodeSize; + bool fAllowDuplicates; + status_t fStatus; + SimpleLock fIteratorLock; + Chain fIterators; +}; + + +//***** helper classes/functions ***** + +extern int32 compareKeys(type_code type,const void *key1, int keyLength1, const void *key2, int keyLength2); + +class TreeIterator { + public: + TreeIterator(BPlusTree *tree); + ~TreeIterator(); + + status_t Goto(int8 to); + status_t Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength, off_t *value,uint16 *duplicate = NULL); + status_t Find(const uint8 *key, uint16 keyLength); + + status_t Rewind(); + status_t GetNextEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate = NULL); + status_t GetPreviousEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate = NULL); + void SkipDuplicates(); + +#ifdef DEBUG + void Dump(); +#endif + + private: + BPlusTree *fTree; + + off_t fCurrentNodeOffset; // traverse position + int32 fCurrentKey; + off_t fDuplicateNode; + uint16 fDuplicate, fNumDuplicates; + bool fIsFragment; + + private: + friend Chain; + friend BPlusTree; + + void Update(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change); + void Stop(); + TreeIterator *fNext; +}; + +// BPlusTree's inline functions (most of them may not be needed) + +inline status_t +BPlusTree::Insert(Transaction *transaction,const char *key,off_t value) +{ + if 
(fHeader->data_type != BPLUSTREE_STRING_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)key, strlen(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,int32 key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_INT32_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,uint32 key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_UINT32_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,int64 key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_INT64_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,uint64 key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_UINT64_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,float key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_FLOAT_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + +inline status_t +BPlusTree::Insert(Transaction *transaction,double key, off_t value) +{ + if (fHeader->data_type != BPLUSTREE_DOUBLE_TYPE) + return B_BAD_TYPE; + return Insert(transaction,(uint8 *)&key, sizeof(key), value); +} + + +/************************ TreeIterator inline functions ************************/ +// #pragma mark - + +inline status_t +TreeIterator::Rewind() +{ + return Goto(BPLUSTREE_BEGIN); +} + +inline status_t +TreeIterator::GetNextEntry(void *key,uint16 *keyLength,uint16 maxLength,off_t *value,uint16 *duplicate) +{ + return Traverse(BPLUSTREE_FORWARD,key,keyLength,maxLength,value,duplicate); +} + +inline status_t +TreeIterator::GetPreviousEntry(void *key,uint16 *keyLength,uint16 
maxLength,off_t *value,uint16 *duplicate) +{ + return Traverse(BPLUSTREE_BACKWARD,key,keyLength,maxLength,value,duplicate); +} + +/************************ bplustree_header inline functions ************************/ +// #pragma mark - + + +inline bool +bplustree_header::IsValidLink(off_t link) +{ + return link == BPLUSTREE_NULL || (link > 0 && link <= maximum_size - node_size); +} + + +/************************ bplustree_node inline functions ************************/ +// #pragma mark - + + +inline uint16 * +bplustree_node::KeyLengths() const +{ + return (uint16 *)(((char *)this) + round_up(sizeof(bplustree_node) + all_key_length)); +} + +inline off_t * +bplustree_node::Values() const +{ + return (off_t *)((char *)KeyLengths() + all_key_count * sizeof(uint16)); +} + +inline uint8 * +bplustree_node::Keys() const +{ + return (uint8 *)this + sizeof(bplustree_node); +} + +inline int32 +bplustree_node::Used() const +{ + return round_up(sizeof(bplustree_node) + all_key_length) + all_key_count * (sizeof(uint16) + sizeof(off_t)); +} + +inline bool +bplustree_node::IsLeaf() const +{ + return overflow_link == BPLUSTREE_NULL; +} + + +inline duplicate_array * +bplustree_node::FragmentAt(int8 index) +{ + return (duplicate_array *)((off_t *)this + index * (NUM_FRAGMENT_VALUES + 1)); +} + + +inline duplicate_array * +bplustree_node::DuplicateArray() +{ + return (duplicate_array *)&this->overflow_link; +} + + +inline uint8 +bplustree_node::LinkType(off_t link) +{ + return *(uint64 *)&link >> 62; +} + +inline off_t +bplustree_node::MakeLink(uint8 type,off_t link,uint32 fragmentIndex) +{ + return ((off_t)type << 62) | (link & 0x3ffffffffffffc00LL) | (fragmentIndex & 0x3ff); +} + +inline bool +bplustree_node::IsDuplicate(off_t link) +{ + return (LinkType(link) & (BPLUSTREE_DUPLICATE_NODE | BPLUSTREE_DUPLICATE_FRAGMENT)) > 0; +} + +inline off_t +bplustree_node::FragmentOffset(off_t link) +{ + return link & 0x3ffffffffffffc00LL; +} + +inline uint32 +bplustree_node::FragmentIndex(off_t 
link) +{ + return (uint32)(link & 0x3ff); +} + +#endif /* B_PLUS_TREE_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp new file mode 100644 index 0000000000..027f2fb1d5 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp @@ -0,0 +1,599 @@ +/* BlockAllocator - block bitmap handling and allocation policies +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include "cpp.h" +#include "Debug.h" +#include "BlockAllocator.h" +#include "Volume.h" +#include "Inode.h" + +#ifdef USER +# define spawn_kernel_thread spawn_thread +#endif + +// Things the BlockAllocator should do: + +// - find a range of blocks of a certain size nearby a specific position +// - allocating a unsharp range of blocks for pre-allocation +// - free blocks +// - know how to deal with each allocation, special handling for directories, +// files, symlinks, etc. (type sensitive allocation policies) + +// What makes the code complicated is the fact that we are not just reading +// in the whole bitmap and operate on that in memory - e.g. a 13 GB partition +// with a block size of 2048 bytes already has a 800kB bitmap, and the size +// of partitions will grow even more - so that's not an option. +// Instead we are reading in every block when it's used - since an allocation +// group can span several blocks in the block bitmap, the AllocationBlock +// class is there to make handling those easier. + +// The current implementation is very basic and will be heavily optimized +// in the future. +// Furthermore, the allocation policies used here (when they will be in place) +// should have some real world tests. 
+ + +class AllocationBlock : public CachedBlock { + public: + AllocationBlock(Volume *volume); + + void Allocate(uint16 start,uint16 numBlocks = 0xffff); + void Free(uint16 start,uint16 numBlocks = 0xffff); + inline bool IsUsed(uint16 block); + + status_t SetTo(AllocationGroup &group,uint16 block); + + int32 NumBlockBits() const { return fNumBits; } + + private: + int32 fNumBits; +}; + + +class AllocationGroup { + public: + AllocationGroup(); + + void AddFreeRange(int32 start,int32 blocks); + bool IsFull() const { return fFreeBits == 0; } + + int32 fNumBits; + int32 fStart; + int32 fFirstFree,fLargest,fLargestFirst; + int32 fFreeBits; +}; + + +AllocationBlock::AllocationBlock(Volume *volume) + : CachedBlock(volume) +{ +} + + +status_t +AllocationBlock::SetTo(AllocationGroup &group, uint16 block) +{ + // 8 blocks per byte + fNumBits = fVolume->BlockSize() << 3; + // the last group may have less bits in the last block + if ((group.fNumBits % fNumBits) != 0) + fNumBits = group.fNumBits % fNumBits; + + return CachedBlock::SetTo(group.fStart + block) != NULL ? 
B_OK : B_ERROR; +} + + +bool +AllocationBlock::IsUsed(uint16 block) +{ + if (block > fNumBits) + return true; + return ((uint32 *)fBlock)[block >> 5] & (1UL << (block % 32)); +} + + +void +AllocationBlock::Allocate(uint16 start,uint16 numBlocks) +{ + start = start % fNumBits; + if (numBlocks == 0xffff) { + // allocate all blocks after "start" + numBlocks = fNumBits - start; + } else if (start + numBlocks > fNumBits) { + FATAL(("should allocate more blocks than there are in a block!\n")); + numBlocks = fNumBits - start; + } + + int32 block = start >> 5; + + while (numBlocks > 0) { + uint32 mask = 0; + for (int32 i = start % 32;i < 32 && numBlocks;i++,numBlocks--) + mask |= 1UL << (i % 32); + + ((uint32 *)fBlock)[block++] |= mask; + start = 0; + } +} + + +void +AllocationBlock::Free(uint16 start,uint16 numBlocks) +{ + start = start % fNumBits; + if (numBlocks == 0xffff) { + // free all blocks after "start" + numBlocks = fNumBits - start; + } else if (start + numBlocks > fNumBits) { + FATAL(("should free more blocks than there are in a block!\n")); + numBlocks = fNumBits - start; + } + + int32 block = start >> 5; + + while (numBlocks > 0) { + uint32 mask = 0; + for (int32 i = start % 32;i < 32 && numBlocks;i++,numBlocks--) + mask |= 1UL << (i % 32); + + ((uint32 *)fBlock)[block++] &= ~mask; + start = 0; + } +} + + +// #pragma mark - + + +AllocationGroup::AllocationGroup() + : + fFirstFree(-1), + fLargest(-1), + fLargestFirst(-1), + fFreeBits(0) +{ +} + + +void +AllocationGroup::AddFreeRange(int32 start, int32 blocks) +{ + D(if (blocks > 512) + PRINT(("range of %ld blocks starting at %ld\n",blocks,start))); + + if (fFirstFree == -1) + fFirstFree = start; + + if (fLargest < blocks) { + fLargest = blocks; + fLargestFirst = start; + } + + fFreeBits += blocks; +} + + +// #pragma mark - + + +BlockAllocator::BlockAllocator(Volume *volume) + : + fVolume(volume), + fGroups(NULL) +{ +} + + +BlockAllocator::~BlockAllocator() +{ + delete[] fGroups; +} + + +status_t 
+BlockAllocator::Initialize() +{ + if (fLock.InitCheck() < B_OK) + return B_ERROR; + + fNumGroups = fVolume->AllocationGroups(); + fBlocksPerGroup = fVolume->SuperBlock().blocks_per_ag; + fGroups = new AllocationGroup[fNumGroups]; + if (fGroups == NULL) + return B_NO_MEMORY; + + thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,"bfs block allocator",B_LOW_PRIORITY,(void *)this); + if (id < B_OK) + return initialize(this); + + return resume_thread(id); +} + + +status_t +BlockAllocator::initialize(BlockAllocator *allocator) +{ + Locker lock(allocator->fLock); + + Volume *volume = allocator->fVolume; + uint32 blocks = allocator->fBlocksPerGroup; + uint32 numBits = 8 * blocks * volume->BlockSize(); + off_t freeBlocks = 0; + + uint32 *buffer = (uint32 *)malloc(numBits >> 3); + if (buffer == NULL) + RETURN_ERROR(B_NO_MEMORY); + + AllocationGroup *groups = allocator->fGroups; + off_t offset = 1; + int32 num = allocator->fNumGroups; + + for (int32 i = 0;i < num;i++) { + if (cached_read(volume->Device(),offset,buffer,blocks,volume->BlockSize()) < B_OK) + break; + + // the last allocation group may contain less blocks than the others + groups[i].fNumBits = i == num - 1 ? 
allocator->fVolume->NumBlocks() - i * numBits : numBits; + groups[i].fStart = offset; + + // finds all free ranges in this allocation group + int32 start,range = 0; + int32 size = groups[i].fNumBits,num = 0; + + for (int32 k = 0;k < (size >> 2);k++) { + for (int32 j = 0;j < 32 && num < size;j++,num++) { + if (buffer[k] & (1UL << j)) { + if (range > 0) { + groups[i].AddFreeRange(start,range); + range = 0; + } + } else if (range++ == 0) + start = num; + } + } + if (range) + groups[i].AddFreeRange(start,range); + + freeBlocks += groups[i].fFreeBits; + + offset += blocks; + } + free(buffer); + + off_t usedBlocks = volume->NumBlocks() - freeBlocks; + if (volume->UsedBlocks() != usedBlocks) { + // If the disk in a dirty state at mount time, it's + // normal that the values don't match + INFORM(("volume reports %Ld used blocks, correct is %Ld\n",volume->UsedBlocks(),usedBlocks)); + volume->SuperBlock().used_blocks = usedBlocks; + } + + return B_OK; +} + + +status_t +BlockAllocator::AllocateBlocks(Transaction *transaction,int32 group,uint16 start,uint16 maximum,uint16 minimum, block_run &run) +{ + AllocationBlock cached(fVolume); + Locker lock(fLock); + + // the first scan through all allocation groups will look for the + // wanted maximum of blocks, the second scan will just look to + // satisfy the minimal requirement + uint16 numBlocks = maximum; + + for (int32 i = 0;i < fNumGroups * 2;i++,group++,start = 0) { + group = group % fNumGroups; + + if (start >= fGroups[group].fNumBits || fGroups[group].IsFull()) + continue; + + if (i >= fNumGroups) { + // if the minimum is the same as the maximum, it's not necessary to + // search for in the allocation groups a second time + if (maximum == minimum) + return B_DEVICE_FULL; + + numBlocks = minimum; + } + + // The wanted maximum is smaller than the largest free block in the group + // or already smaller than the minimum + // ToDo: disabled because it's currently not maintained after the first allocation + //if (numBlocks > 
fGroups[group].fLargest) + // continue; + + if (start < fGroups[group].fFirstFree) + start = fGroups[group].fFirstFree; + + // there may be more than one block per allocation group - and + // we iterate through it to find a place for the allocation. + // (one allocation can't exceed one allocation group) + + uint32 block = start / (fVolume->BlockSize() << 3); + int32 range = 0, rangeStart = 0,rangeBlock = 0; + + for (;block < fBlocksPerGroup;block++) { + if (cached.SetTo(fGroups[group],block) < B_OK) + RETURN_ERROR(B_ERROR); + + // find a block large enough to hold the allocation + for (int32 bit = start % cached.NumBlockBits();bit < cached.NumBlockBits();bit++) { + if (!cached.IsUsed(bit)) { + if (range == 0) { + // start new range + rangeStart = block * cached.NumBlockBits() + bit; + rangeBlock = block; + } + + // have we found a range large enough to hold numBlocks? + if (++range >= maximum) + break; + } else if (i >= fNumGroups && range >= minimum) { + // we have found a block larger than the required minimum (second pass) + break; + } else { + // end of a range + range = 0; + } + } + + // if we found a suitable block, mark the blocks as in use, and write + // the updated block bitmap back to disk + if (range >= numBlocks) { + // adjust allocation size + if (numBlocks < maximum) + numBlocks = range; + + // Update the allocation group info + // Note, the fFirstFree block doesn't have to be really free + if (rangeStart == fGroups[group].fFirstFree) + fGroups[group].fFirstFree = rangeStart + numBlocks; + fGroups[group].fFreeBits -= numBlocks; + + if (block != rangeBlock) { + // allocate the part that's in the current block + cached.Allocate(0,(rangeStart + numBlocks) % cached.NumBlockBits()); + if (cached.WriteBack(transaction) < B_OK) + RETURN_ERROR(B_ERROR); + + // set the blocks in the previous block + if (cached.SetTo(fGroups[group],block - 1) < B_OK) + cached.Allocate(rangeStart); + else + RETURN_ERROR(B_ERROR); + } else { + // just allocate the bits in the 
current block + cached.Allocate(rangeStart,numBlocks); + } + run.allocation_group = group; + run.start = rangeStart; + run.length = numBlocks; + + fVolume->SuperBlock().used_blocks += numBlocks; + // We are not writing back the disk's super block - it's + // either done by the journaling code, or when the disk + // is unmounted. + // If the value is not correct at mount time, it will be + // fixed anyway. + + return cached.WriteBack(transaction); + } + + // start from the beginning of the next block + start = 0; + } + } + return B_DEVICE_FULL; +} + + +status_t +BlockAllocator::AllocateForInode(Transaction *transaction,const block_run *parent, mode_t type, block_run &run) +{ + // apply some allocation policies here (AllocateBlocks() will break them + // if necessary) - we will start with those described in Dominic Giampaolo's + // "Practical File System Design", and see how good they work + + // files are going in the same allocation group as its parent, sub-directories + // will be inserted 8 allocation groups after the one of the parent + uint16 group = parent->allocation_group; + if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY) + group += 8; + + return AllocateBlocks(transaction,group,0,1,1,run); +} + + +status_t +BlockAllocator::Allocate(Transaction *transaction,const Inode *inode, off_t numBlocks, block_run &run, uint16 minimum) +{ + if (numBlocks <= 0) + return B_ERROR; + + // one block_run can't hold more data than it is in one allocation group + if (numBlocks > fGroups[0].fNumBits) + numBlocks = fGroups[0].fNumBits; + + // apply some allocation policies here (AllocateBlocks() will break them + // if necessary) + uint16 group = inode->BlockRun().allocation_group; + uint16 start = 0; + + // are there already allocated blocks? 
(then just allocate near the last) + if (inode->Size() > 0) { + data_stream *data = &inode->Node()->data; + // we currently don't care for when the data stream is + // already grown into the indirect ranges + if (data->max_double_indirect_range == 0 + && data->max_indirect_range == 0) { + int32 last = 0; + for (;last < NUM_DIRECT_BLOCKS - 1;last++) + if (data->direct[last + 1].IsZero()) + break; + + group = data->direct[last].allocation_group; + start = data->direct[last].start + data->direct[last].length; + } + } else if (inode->IsDirectory()) { + // directory data will go in the same allocation group as the inode is in + // but after the inode data + start = inode->BlockRun().start; + } else { + // file data will start in the next allocation group + group = inode->BlockRun().allocation_group + 1; + } + + return AllocateBlocks(transaction,group,start,numBlocks,minimum,run); +} + + +status_t +BlockAllocator::Free(Transaction *transaction,block_run &run) +{ + Locker lock(fLock); + + int32 group = run.allocation_group; + uint16 start = run.start; + uint16 length = run.length; + + // doesn't use Volume::IsValidBlockRun() here because it can check better + // against the group size (the last group may have a different length) + if (group < 0 || group >= fNumGroups + || start > fGroups[group].fNumBits + || start + length > fGroups[group].fNumBits + || length == 0) { + FATAL(("someone tried to free an invalid block_run (%ld, %u, %u)\n",group,start,length)); + return B_BAD_VALUE; + } + // check if someone tries to free reserved areas at the beginning of the drive + if (group == 0 && start < fVolume->Log().start + fVolume->Log().length) { + FATAL(("someone tried to free a reserved block_run (%ld, %u, %u)\n",group,start,length)); + return B_BAD_VALUE; + } +#ifdef DEBUG + if (CheckBlockRun(run) < B_OK) + return B_BAD_DATA; +#endif + + AllocationBlock cached(fVolume); + + uint32 block = run.start / (fVolume->BlockSize() << 3); + + if (fGroups[group].fFirstFree > start) + 
fGroups[group].fFirstFree = start; + fGroups[group].fFreeBits += length; + + for (;block < fBlocksPerGroup;block++) { + if (cached.SetTo(fGroups[group],block) < B_OK) + RETURN_ERROR(B_IO_ERROR); + + uint16 freeLength = length; + if (start + length > cached.NumBlockBits()) + freeLength = cached.NumBlockBits() - start; + + cached.Free(start,freeLength); + + if (cached.WriteBack(transaction) < B_OK) + return B_IO_ERROR; + + length -= freeLength; + if (length <= 0) + break; + + start = 0; + } + + fVolume->SuperBlock().used_blocks -= run.length; + return B_OK; +} + +#ifdef DEBUG +#include "BPlusTree.h" + +status_t +BlockAllocator::CheckBlockRun(block_run run) +{ + uint32 block = run.start / (fVolume->BlockSize() << 3); + uint32 start = run.start; + uint32 pos = 0; + + AllocationBlock cached(fVolume); + + for (;block < fBlocksPerGroup;block++) { + if (cached.SetTo(fGroups[run.allocation_group],block) < B_OK) + RETURN_ERROR(B_IO_ERROR); + + start = start % cached.NumBlockBits(); + while (pos < run.length && start + pos < cached.NumBlockBits()) { + if (!cached.IsUsed(start + pos)) { + PRINT(("block_run(%ld,%u,%u) is only partially allocated!\n",run.allocation_group,run.start,run.length)); + fVolume->Panic(); + return B_BAD_DATA; + } + pos++; + } + start = 0; + } + return B_OK; +} + + +status_t +BlockAllocator::CheckInode(Inode *inode) +{ + status_t status = CheckBlockRun(inode->BlockRun()); + if (status < B_OK) + return status; + + // only checks the direct range for now... 
+ + data_stream *data = &inode->Node()->data; + for (int32 i = 0;i < NUM_DIRECT_BLOCKS;i++) { + if (data->direct[i].IsZero()) + break; + + status = CheckBlockRun(data->direct[i]); + if (status < B_OK) + return status; + } + return B_OK; +} + + +status_t +BlockAllocator::Check(Inode *inode) +{ + if (!inode || !inode->IsDirectory()) + return B_BAD_VALUE; + + BPlusTree *tree; + status_t status = inode->GetTree(&tree); + if (status < B_OK) + return status; + + TreeIterator iterator(tree); + char key[BPLUSTREE_MAX_KEY_LENGTH]; + uint16 length; + off_t offset; + while (iterator.GetNextEntry(key,&length,BPLUSTREE_MAX_KEY_LENGTH,&offset) == B_OK) { + Vnode vnode(fVolume,offset); + Inode *entry; + if (vnode.Get(&entry) < B_OK) { + FATAL(("could not get inode in tree at: %Ld\n",offset)); + continue; + } + block_run run = entry->BlockRun(); + PRINT(("check allocations of inode \"%s\" (%ld,%u,%u)\n",key,run.allocation_group,run.start,run.length)); + status = CheckInode(entry); + if (status < B_OK) + return status; + } + return B_OK; +} +#endif /* DEBUG */ diff --git a/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h new file mode 100644 index 0000000000..8f2ea0ba26 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/BlockAllocator.h @@ -0,0 +1,49 @@ +#ifndef BLOCK_ALLOCATOR_H +#define BLOCK_ALLOCATOR_H +/* BlockAllocator - block bitmap handling and allocation policies +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + + +class AllocationGroup; +class Transaction; +class Volume; +class Inode; +struct disk_super_block; +struct block_run; + + +class BlockAllocator { + public: + BlockAllocator(Volume *volume); + ~BlockAllocator(); + + status_t Initialize(); + + status_t AllocateForInode(Transaction *transaction,const block_run *parent,mode_t type,block_run &run); + status_t Allocate(Transaction *transaction,const Inode *inode,off_t numBlocks,block_run &run,uint16 minimum = 1); + status_t Free(Transaction *transaction,block_run &run); + + status_t AllocateBlocks(Transaction *transaction,int32 group, uint16 start, uint16 numBlocks, uint16 minimum, block_run &run); + +#ifdef DEBUG + status_t CheckBlockRun(block_run run); + status_t CheckInode(Inode *inode); + status_t Check(Inode *inode); +#endif + + private: + static status_t initialize(BlockAllocator *); + + Volume *fVolume; + Benaphore fLock; + AllocationGroup *fGroups; + int32 fNumGroups,fBlocksPerGroup; +}; + +#endif /* BLOCK_ALLOCATOR_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Chain.h b/src/add-ons/kernel/file_systems/bfs/Chain.h new file mode 100644 index 0000000000..7d7e3e87ef --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Chain.h @@ -0,0 +1,55 @@ +#ifndef CHAIN_H +#define CHAIN_H +/* Chain - a chain implementation; it's used for the callback management +** throughout the code (currently TreeIterator, and AttributeIterator). +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +/** The Link class you want to use with the Chain class needs to have + * a "fNext" member which is accessable from within the Chain class. 
+ */ + +template class Chain { + public: + Chain() + : + fFirst(NULL) + { + } + + void Add(Link *link) + { + link->fNext = fFirst; + fFirst = link; + } + + void Remove(Link *link) + { + // search list for the correct callback to remove + Link *last = NULL,*entry; + for (entry = fFirst;link != entry;entry = entry->fNext) + last = entry; + if (link == entry) { + if (last) + last->fNext = link->fNext; + else + fFirst = link->fNext; + } + } + + Link *Next(Link *last) + { + if (last == NULL) + return fFirst; + + return last->fNext; + } + + private: + Link *fFirst; +}; + +#endif /* CHAIN_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Debug.cpp b/src/add-ons/kernel/file_systems/bfs/Debug.cpp new file mode 100644 index 0000000000..89bed31f9a --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Debug.cpp @@ -0,0 +1,241 @@ +/* Debug - debug stuff +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** Some code is based on work previously done by Marcus Overhagen +** +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include "Debug.h" +#include "BPlusTree.h" + +#include + +#include + +#define Print __out + + +char * +get_tupel(uint32 id) +{ + static unsigned char tupel[5]; + + tupel[0] = 0xff & (id >> 24); + tupel[1] = 0xff & (id >> 16); + tupel[2] = 0xff & (id >> 8); + tupel[3] = 0xff & (id); + tupel[4] = 0; + for (int16 i = 0;i < 4;i++) + if (tupel[i] < ' ' || tupel[i] > 128) + tupel[i] = '.'; + + return (char *)tupel; +} + + +void +dump_block_run(const char *prefix,block_run &run) +{ + Print("%s(%ld, %d, %d)\n",prefix,run.allocation_group,run.start,run.length); +} + + +void +dump_super_block(disk_super_block *superBlock) +{ + Print("disk_super_block:\n"); + Print(" name = %s\n",superBlock->name); + Print(" magic1 = %#08lx (%s) %s\n",superBlock->magic1, get_tupel(superBlock->magic1), (superBlock->magic1 == SUPER_BLOCK_MAGIC1 ? 
"valid" : "INVALID")); + Print(" fs_byte_order = %#08lx (%s)\n",superBlock->fs_byte_order, get_tupel(superBlock->fs_byte_order)); + Print(" block_size = %lu\n",superBlock->block_size); + Print(" block_shift = %lu\n",superBlock->block_shift); + Print(" num_blocks = %Lu\n",superBlock->num_blocks); + Print(" used_blocks = %Lu\n",superBlock->used_blocks); + Print(" inode_size = %lu\n",superBlock->inode_size); + Print(" magic2 = %#08lx (%s) %s\n",superBlock->magic2, get_tupel(superBlock->magic2), (superBlock->magic2 == (int)SUPER_BLOCK_MAGIC2 ? "valid" : "INVALID")); + Print(" blocks_per_ag = %lu\n",superBlock->blocks_per_ag); + Print(" ag_shift = %lu (%ld bytes)\n",superBlock->ag_shift, 1LL << superBlock->ag_shift); + Print(" num_ags = %lu\n",superBlock->num_ags); + Print(" flags = %#08lx (%s)\n",superBlock->flags, get_tupel(superBlock->flags)); + dump_block_run(" log_blocks = ",superBlock->log_blocks); + Print(" log_start = %Lu\n",superBlock->log_start); + Print(" log_end = %Lu\n",superBlock->log_end); + Print(" magic3 = %#08lx (%s) %s\n",superBlock->magic3, get_tupel(superBlock->magic3), (superBlock->magic3 == SUPER_BLOCK_MAGIC3 ? 
"valid" : "INVALID")); + dump_block_run(" root_dir = ",superBlock->root_dir); + dump_block_run(" indices = ",superBlock->indices); +} + + +void +dump_data_stream(data_stream *stream) +{ + Print("data_stream:\n"); + for (int i = 0; i < NUM_DIRECT_BLOCKS; i++) { + if (!stream->direct[i].IsZero()) { + Print(" direct[%02d] = ",i); + dump_block_run("",stream->direct[i]); + } + } + Print(" max_direct_range = %Lu\n",stream->max_direct_range); + + if (!stream->indirect.IsZero()) + dump_block_run(" indirect = ",stream->indirect); + + Print(" max_indirect_range = %Lu\n",stream->max_indirect_range); + + if (!stream->double_indirect.IsZero()) + dump_block_run(" double_indirect = ",stream->double_indirect); + + Print(" max_double_indirect_range = %Lu\n",stream->max_double_indirect_range); + Print(" size = %Lu\n",stream->size); +} + + +void +dump_inode(bfs_inode *inode) +{ + Print("inode:\n"); + Print(" magic1 = %08lx (%s) %s\n",inode->magic1, + get_tupel(inode->magic1), (inode->magic1 == INODE_MAGIC1 ? "valid" : "INVALID")); + dump_block_run( " inode_num = ",inode->inode_num); + Print(" uid = %lu\n",inode->uid); + Print(" gid = %lu\n",inode->gid); + Print(" mode = %08lx\n",inode->mode); + Print(" flags = %08lx\n",inode->flags); + Print(" create_time = %Ld (%Ld)\n",inode->create_time,inode->create_time >> INODE_TIME_SHIFT); + Print(" last_modified_time = %Ld (%Ld)\n",inode->last_modified_time,inode->last_modified_time >> INODE_TIME_SHIFT); + dump_block_run( " parent = ",inode->parent); + dump_block_run( " attributes = ",inode->attributes); + Print(" type = %lu\n",inode->type); + Print(" inode_size = %lu\n",inode->inode_size); + Print(" etc = %#08lx\n",inode->etc); + Print(" short_symlink = %s\n", + S_ISLNK(inode->mode) && (inode->flags & INODE_LONG_SYMLINK) == 0? 
inode->short_symlink : "-"); + dump_data_stream(&(inode->data)); + Print(" --\n pad[0] = %08lx\n",inode->pad[0]); + Print(" pad[1] = %08lx\n",inode->pad[1]); + Print(" pad[2] = %08lx\n",inode->pad[2]); + Print(" pad[3] = %08lx\n",inode->pad[3]); +} + + +void +dump_bplustree_header(bplustree_header *header) +{ + Print("bplustree_header:\n"); + Print(" magic = %#08lx (%s) %s\n",header->magic, + get_tupel(header->magic), (header->magic == BPLUSTREE_MAGIC ? "valid" : "INVALID")); + Print(" node_size = %lu\n",header->node_size); + Print(" max_number_of_levels = %lu\n",header->max_number_of_levels); + Print(" data_type = %lu\n",header->data_type); + Print(" root_node_pointer = %Ld\n",header->root_node_pointer); + Print(" free_node_pointer = %Ld\n",header->free_node_pointer); + Print(" maximum_size = %Lu\n",header->maximum_size); +} + + +#define DUMPED_BLOCK_SIZE 16 + +void +dump_block(const char *buffer,int size) +{ + for(int i = 0;i < size;) { + int start = i; + + for(;i < start+DUMPED_BLOCK_SIZE;i++) { + if (!(i % 4)) + Print(" "); + + if (i >= size) + Print(" "); + else + Print("%02x",*(unsigned char *)(buffer+i)); + } + Print(" "); + + for(i = start;i < start + DUMPED_BLOCK_SIZE;i++) { + if (i < size) { + char c = *(buffer+i); + + if (c < 30) + Print("."); + else + Print("%c",c); + } + else + break; + } + Print("\n"); + } +} + + +void +dump_bplustree_node(bplustree_node *node,bplustree_header *header,Volume *volume) +{ + Print("bplustree_node:\n"); + Print(" left_link = %Ld\n",node->left_link); + Print(" right_link = %Ld\n",node->right_link); + Print(" overflow_link = %Ld\n",node->overflow_link); + Print(" all_key_count = %u\n",node->all_key_count); + Print(" all_key_length = %u\n",node->all_key_length); + + if (header == NULL) + return; + + if (node->all_key_count > node->all_key_length + || uint32(node->all_key_count * 10) > (uint32)header->node_size + || node->all_key_count == 0) { + Print("\n"); + dump_block((char *)node,header->node_size/*,sizeof(off_t)*/); + 
return; + } + + Print("\n"); + for (int32 i = 0;i < node->all_key_count;i++) { + uint16 length; + char buffer[256],*key = (char *)node->KeyAt(i,&length); + if (length > 255 || length == 0) { + Print(" %2ld. Invalid length (%u)!!\n",i,length); + dump_block((char *)node,header->node_size/*,sizeof(off_t)*/); + break; + } + memcpy(buffer,key,length); + buffer[length] = '\0'; + + off_t *value = node->Values() + i; + if ((uint32)value < (uint32)node || (uint32)value > (uint32)node + header->node_size) + Print(" %2ld. Invalid Offset!!\n",i); + else { + Print(" %2ld. ",i); + if (header->data_type == BPLUSTREE_STRING_TYPE) + Print("\"%s\"",buffer); + else if (header->data_type == BPLUSTREE_INT32_TYPE) + Print("int32 = %ld (0x%lx)",*(int32 *)&buffer,*(int32 *)&buffer); + else if (header->data_type == BPLUSTREE_UINT32_TYPE) + Print("uint32 = %lu (0x%lx)",*(uint32 *)&buffer,*(uint32 *)&buffer); + else if (header->data_type == BPLUSTREE_INT64_TYPE) + Print("int64 = %Ld (0x%Lx)",*(int64 *)&buffer,*(int64 *)&buffer); + else + Print("???"); + + off_t offset = *value & 0x3fffffffffffffffLL; + Print(" (%d bytes) -> %Ld",length,offset); + if (volume != NULL) + { + block_run run = volume->ToBlockRun(offset); + Print(" (%ld, %d)",run.allocation_group,run.start); + } + if (bplustree_node::LinkType(*value) == BPLUSTREE_DUPLICATE_FRAGMENT) + Print(" (duplicate fragment %Ld)\n",*value & 0x3ff); + else if (bplustree_node::LinkType(*value) == BPLUSTREE_DUPLICATE_NODE) + Print(" (duplicate node)\n"); + else + Print("\n"); + } + } +} + + diff --git a/src/add-ons/kernel/file_systems/bfs/Debug.h b/src/add-ons/kernel/file_systems/bfs/Debug.h new file mode 100644 index 0000000000..dfde5cdc92 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Debug.h @@ -0,0 +1,74 @@ +#ifndef DEBUG_H +#define DEBUG_H +/* Debug - debug stuff +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include +#ifdef USER +# include +# define __out printf +#else +# include +# define __out dprintf +#endif + +// Short overview over the debug output macros: +// PRINT() +// is for general messages that very unlikely should appear in a release build +// FATAL() +// this is for fatal messages, when something has really gone wrong +// INFORM() +// general information, as disk size, etc. +// REPORT_ERROR(status_t) +// prints out error information +// RETURN_ERROR(status_t) +// calls REPORT_ERROR() and return the value +// D() +// the statements in D() are only included if DEBUG is defined + +#ifdef DEBUG + #define PRINT(x) { __out("bfs: "); __out x; } + #define REPORT_ERROR(status) __out("bfs: %s:%ld: %s\n",__FUNCTION__,__LINE__,strerror(status)); + #define RETURN_ERROR(err) { status_t _status = err; if (_status < B_OK) REPORT_ERROR(_status); return _status;} + #define FATAL(x) { __out("bfs: "); __out x; } + #define INFORM(x) { __out("bfs: "); __out x; } +// #define FUNCTION() __out("bfs: %s()\n",__FUNCTION__); + #define FUNCTION_START(x) { __out("bfs: %s() ",__FUNCTION__); __out x; } + #define FUNCTION() ; +// #define FUNCTION_START(x) ; + #define D(x) {x;}; +#else + #define PRINT(x) ; + #define REPORT_ERROR(status) ; + #define RETURN_ERROR(status) return status; + #define FATAL(x) { __out("bfs: "); __out x; } + #define INFORM(x) { __out("bfs: "); __out x; } + #define FUNCTION() ; + #define FUNCTION_START(x) ; + #define D(x) ; +#endif + +#ifdef DEBUG + struct block_run; + struct bplustree_header; + struct bplustree_node; + struct data_stream; + struct bfs_inode; + struct disk_super_block; + class Volume; + + // some structure dump functions + extern void dump_block_run(const char *prefix, block_run &run); + extern void dump_super_block(disk_super_block *superBlock); + extern void dump_data_stream(data_stream *stream); + extern void dump_inode(bfs_inode *inode); + extern void dump_bplustree_header(bplustree_header *header); + extern void 
dump_bplustree_node(bplustree_node *node,bplustree_header *header = NULL,Volume *volume = NULL); + extern void dump_block(const char *buffer, int size); +#endif + +#endif /* DEBUG_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Index.cpp b/src/add-ons/kernel/file_systems/bfs/Index.cpp new file mode 100644 index 0000000000..48a0d07727 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Index.cpp @@ -0,0 +1,335 @@ +/* Index - index access functions +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include "Debug.h" +#include "cpp.h" +#include "Index.h" +#include "Volume.h" +#include "Inode.h" +#include "BPlusTree.h" + +#include + + +Index::Index(Volume *volume) + : + fVolume(volume), + fNode(NULL) +{ +} + + +Index::~Index() +{ + if (fNode == NULL) + return; + + put_vnode(fVolume->ID(),fNode->ID()); +} + + +void +Index::Unset() +{ + if (fNode == NULL) + return; + + put_vnode(fVolume->ID(),fNode->ID()); + fNode = NULL; +} + + +status_t +Index::SetTo(const char *name) +{ + // remove the old node, if the index is set for the second time + Unset(); + + Inode *indices = fVolume->IndicesNode(); + if (indices == NULL) + return B_ENTRY_NOT_FOUND; + + BPlusTree *tree; + if (indices->GetTree(&tree) != B_OK) + return B_BAD_VALUE; + + vnode_id id; + status_t status = tree->Find((uint8 *)name,(uint16)strlen(name),&id); + if (status != B_OK) + return status; + + if (get_vnode(fVolume->ID(),id,(void **)&fNode) != B_OK) + return B_ENTRY_NOT_FOUND; + + if (fNode == NULL) { + FATAL(("fatal error at Index::InitCheck(), get_vnode() returned NULL pointer\n")); + put_vnode(fVolume->ID(),id); + return B_ERROR; + } + fName = name; + // only stores the pointer, so it assumes that it will stay constant + // in further comparisons (currently only used in Index::Update()) + + return B_OK; +} + + +uint32 +Index::Type() +{ + if (fNode == NULL) + return 0; + + switch (fNode->Mode() & (S_STR_INDEX | 
S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX | + S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { + case S_INT_INDEX: + return B_INT32_TYPE; + case S_UINT_INDEX: + return B_UINT32_TYPE; + case S_LONG_LONG_INDEX: + return B_INT64_TYPE; + case S_ULONG_LONG_INDEX: + return B_UINT64_TYPE; + case S_FLOAT_INDEX: + return B_FLOAT_TYPE; + case S_DOUBLE_INDEX: + return B_DOUBLE_TYPE; + case S_STR_INDEX: + return B_STRING_TYPE; + } + FATAL(("index has unknown type!\n")); + return 0; +} + + +size_t +Index::KeySize() +{ + if (fNode == NULL) + return 0; + + int32 mode = fNode->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX | + S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); + + if (mode == S_STR_INDEX) + // string indices don't have a fixed key size + return 0; + + switch (mode) { + case S_INT_INDEX: + case S_UINT_INDEX: + return sizeof(int32); + case S_LONG_LONG_INDEX: + case S_ULONG_LONG_INDEX: + return sizeof(int64); + case S_FLOAT_INDEX: + return sizeof(float); + case S_DOUBLE_INDEX: + return sizeof(double); + } + FATAL(("index has unknown type!\n")); + return 0; +} + + +status_t +Index::Create(Transaction *transaction,const char *name,uint32 type) +{ + Unset(); + + int32 mode = 0; + switch (type) { + case B_INT32_TYPE: + mode = S_INT_INDEX; + break; + case B_UINT32_TYPE: + mode = S_UINT_INDEX; + break; + case B_INT64_TYPE: + mode = S_LONG_LONG_INDEX; + break; + case B_UINT64_TYPE: + mode = S_ULONG_LONG_INDEX; + break; + case B_FLOAT_TYPE: + mode = S_FLOAT_INDEX; + break; + case B_DOUBLE_TYPE: + mode = S_DOUBLE_INDEX; + break; + case B_STRING_TYPE: + mode = S_STR_INDEX; + break; + default: + return B_BAD_TYPE; + } + + status_t status; + + // do we need to create the index directory first? 
+ if (fVolume->IndicesNode() == NULL) { + if ((status = fVolume->CreateIndicesRoot(transaction)) < B_OK) + RETURN_ERROR(status); + } + + vnode_id id; + status = Inode::Create(transaction,fVolume->IndicesNode(),name,S_INDEX_DIR | S_DIRECTORY | mode,0,type,&id); + if (status == B_OK) { + // since Inode::Create() lets the created inode open if "id" is specified, + // we don't need to call Vnode::Keep() here + Vnode vnode(fVolume,id); + return vnode.Get(&fNode); + } + return status; +} + + +/** Updates the specified index, the oldKey will be removed from, the newKey + * inserted into the tree. + * If the method returns B_BAD_INDEX, it means the index couldn't be found - + * the most common reason will be that the index doesn't exist. + * You may not want to let the whole transaction fail because of that. + */ + +status_t +Index::Update(Transaction *transaction,const char *name,int32 type,const uint8 *oldKey,uint16 oldLength,const uint8 *newKey,uint16 newLength,Inode *inode) +{ + if (name == NULL + || oldKey == NULL && newKey == NULL + || oldKey != NULL && oldLength == 0 + || newKey != NULL && newLength == 0) + return B_BAD_VALUE; + + // if the two keys are identical, don't do anything + if (type != 0 && !compareKeys(type,oldKey,oldLength,newKey,newLength)) + return B_OK; + + // update all live queries about the change, if they have an index or not + fVolume->UpdateLiveQueries(inode,name,type,oldKey,oldLength,newKey,newLength); + + status_t status; + if (name != fName && (status = SetTo(name)) < B_OK) + return B_BAD_INDEX; + + // now that we have the type, check again for equality + if (type == 0 && !compareKeys(Type(),oldKey,oldLength,newKey,newLength)) + return B_OK; + + BPlusTree *tree; + if ((status = Node()->GetTree(&tree)) < B_OK) + return status; + + // remove the old key from the tree + + if (oldKey != NULL) { + status = tree->Remove(transaction,(const uint8 *)oldKey,oldLength,inode->ID()); + if (status == B_ENTRY_NOT_FOUND) { + // That's not nice, but should be 
no reason to let the whole thing fail + FATAL(("Could not find value in index \"%s\"!\n",name)); + } else if (status < B_OK) + return status; + } + + // add the new key to the key + + if (newKey != NULL) + status = tree->Insert(transaction,(const uint8 *)newKey,newLength,inode->ID()); + + return status; +} + + +status_t +Index::InsertName(Transaction *transaction,const char *name,Inode *inode) +{ + return UpdateName(transaction,NULL,name,inode); +} + + +status_t +Index::RemoveName(Transaction *transaction,const char *name,Inode *inode) +{ + return UpdateName(transaction,name,NULL,inode); +} + + +status_t +Index::UpdateName(Transaction *transaction,const char *oldName, const char *newName,Inode *inode) +{ + uint16 oldLength = oldName ? strlen(oldName) : 0; + uint16 newLength = newName ? strlen(newName) : 0; + return Update(transaction,"name",B_STRING_TYPE,(uint8 *)oldName,oldLength,(uint8 *)newName,newLength,inode); +} + + +status_t +Index::InsertSize(Transaction *transaction, Inode *inode) +{ + off_t size = inode->Size(); + return Update(transaction,"size",B_INT64_TYPE,NULL,0,(uint8 *)&size,sizeof(int64),inode); +} + + +status_t +Index::RemoveSize(Transaction *transaction, Inode *inode) +{ + // Inode::OldSize() is the size that's in the index + off_t size = inode->OldSize(); + return Update(transaction,"size",B_INT64_TYPE,(uint8 *)&size,sizeof(int64),NULL,0,inode); +} + + +status_t +Index::UpdateSize(Transaction *transaction,Inode *inode) +{ + off_t oldSize = inode->OldSize(); + off_t newSize = inode->Size(); + status_t status = Update(transaction,"size",B_INT64_TYPE,(uint8 *)&oldSize,sizeof(int64), + (uint8 *)&newSize,sizeof(int64),inode); + + if (status == B_OK) + inode->UpdateOldSize(); + + return status; +} + + +status_t +Index::InsertLastModified(Transaction *transaction, Inode *inode) +{ + off_t modified = inode->Node()->last_modified_time; + return Update(transaction,"last_modified",B_INT64_TYPE,NULL,0,(uint8 *)&modified,sizeof(int64),inode); +} + + 
+status_t +Index::RemoveLastModified(Transaction *transaction, Inode *inode) +{ + // Inode::OldLastModified() is the value which is in the index + off_t modified = inode->OldLastModified(); + return Update(transaction,"last_modified",B_INT64_TYPE,(uint8 *)&modified,sizeof(int64),NULL,0,inode); +} + + +status_t +Index::UpdateLastModified(Transaction *transaction, Inode *inode, off_t modified) +{ + off_t oldModified = inode->OldLastModified(); + if (modified == -1) + modified = (bigtime_t)time(NULL) << INODE_TIME_SHIFT; + modified |= fVolume->GetUniqueID() & INODE_TIME_MASK; + + status_t status = Update(transaction,"last_modified",B_INT64_TYPE,(uint8 *)&oldModified,sizeof(int64), + (uint8 *)&modified,sizeof(int64),inode); + + inode->Node()->last_modified_time = modified; + if (status == B_OK) + inode->UpdateOldLastModified(); + + return status; +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Index.h b/src/add-ons/kernel/file_systems/bfs/Index.h new file mode 100644 index 0000000000..5e65953614 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Index.h @@ -0,0 +1,51 @@ +#ifndef INDEX_H +#define INDEX_H +/* Index - index access functions +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + +class Transaction; +class Volume; +class Inode; + + +class Index { + public: + Index(Volume *volume); + ~Index(); + + status_t SetTo(const char *name); + void Unset(); + + Inode *Node() const { return fNode; }; + uint32 Type(); + size_t KeySize(); + + status_t Create(Transaction *transaction, const char *name, uint32 type); + + status_t Update(Transaction *transaction, const char *name, int32 type, const uint8 *oldKey, uint16 oldLength, const uint8 *newKey, uint16 newLength, Inode *inode); + + status_t InsertName(Transaction *transaction,const char *name,Inode *inode); + status_t RemoveName(Transaction *transaction,const char *name,Inode *inode); + status_t UpdateName(Transaction *transaction,const char *oldName,const char *newName,Inode *inode); + + status_t InsertSize(Transaction *transaction, Inode *inode); + status_t RemoveSize(Transaction *transaction, Inode *inode); + status_t UpdateSize(Transaction *transaction, Inode *inode); + + status_t InsertLastModified(Transaction *transaction, Inode *inode); + status_t RemoveLastModified(Transaction *transaction, Inode *inode); + status_t UpdateLastModified(Transaction *transaction, Inode *inode,off_t modified = -1); + + private: + Volume *fVolume; + Inode *fNode; + const char *fName; +}; + +#endif /* INDEX_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Inode.cpp b/src/add-ons/kernel/file_systems/bfs/Inode.cpp new file mode 100644 index 0000000000..dc6212f639 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Inode.cpp @@ -0,0 +1,2107 @@ +/* Inode - inode access functions +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include "Debug.h" +#include "cpp.h" +#include "Inode.h" +#include "BPlusTree.h" +#include "Index.h" + +#include + + +class InodeAllocator { + public: + InodeAllocator(Transaction *transaction); + ~InodeAllocator(); + + status_t New(block_run *parentRun,mode_t mode,block_run &run,Inode **inode); + void Keep(); + + private: + Transaction *fTransaction; + block_run fRun; + Inode *fInode; +}; + + +InodeAllocator::InodeAllocator(Transaction *transaction) + : + fTransaction(transaction), + fInode(NULL) +{ +} + + +InodeAllocator::~InodeAllocator() +{ + delete fInode; + + if (fTransaction) + fTransaction->GetVolume()->Free(fTransaction,fRun); +} + + +status_t +InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **inode) +{ + Volume *volume = fTransaction->GetVolume(); + + status_t status = volume->AllocateForInode(fTransaction,parentRun,mode,fRun); + if (status < B_OK) { + // don't free the space in the destructor, because + // the allocation failed + fTransaction = NULL; + RETURN_ERROR(status); + } + + run = fRun; + fInode = new Inode(volume,volume->ToVnode(run),true); + if (fInode == NULL) + RETURN_ERROR(B_NO_MEMORY); + + *inode = fInode; + return B_OK; +} + + +void InodeAllocator::Keep() +{ + fTransaction = NULL; + fInode = NULL; +} + + +// #pragma mark - + + +Inode::Inode(Volume *volume,vnode_id id,bool empty,uint8 reenter) + : CachedBlock(volume,volume->VnodeToBlock(id),empty), + fTree(NULL), + fLock("bfs inode") +{ + Node()->flags &= INODE_PERMANENT_FLAGS; + + // these two will help to maintain the indices + fOldSize = Size(); + fOldLastModified = Node()->last_modified_time; +} + + +Inode::~Inode() +{ + delete fTree; +} + + +status_t +Inode::InitCheck() +{ + if (!Node()) + RETURN_ERROR(B_IO_ERROR); + + // test inode magic and flags + if (Node()->magic1 != INODE_MAGIC1 + || !(Node()->flags & INODE_IN_USE) + || Node()->inode_num.length != 1 + // matches inode size? 
+ || Node()->inode_size != fVolume->InodeSize() + // parent resides on disk? + || Node()->parent.allocation_group > fVolume->AllocationGroups() + || Node()->parent.allocation_group < 0 + || Node()->parent.start > (1L << fVolume->AllocationGroupShift()) + || Node()->parent.length != 1 + // attributes, too? + || Node()->attributes.allocation_group > fVolume->AllocationGroups() + || Node()->attributes.allocation_group < 0 + || Node()->attributes.start > (1L << fVolume->AllocationGroupShift())) { + FATAL(("inode at block %Ld corrupt!\n",fBlockNumber)); + RETURN_ERROR(B_BAD_DATA); + } + + // ToDo: Add some tests to check the integrity of the other stuff here, + // especially for the data_stream! + + // it's more important to know that the inode is corrupt + // so we check for the lock not until here + return fLock.InitCheck(); +} + + +status_t +Inode::CheckPermissions(int accessMode) const +{ + uid_t user = geteuid(); + gid_t group = getegid(); + + // you never have write access to a read-only volume + if (accessMode & W_OK && fVolume->IsReadOnly()) + return B_READ_ONLY_DEVICE; + + // root users always have full access (but they can't execute anything) + if (user == 0 && !((accessMode & X_OK) && (Mode() & S_IXUSR) == 0)) + return B_OK; + + // shift mode bits, to check directly against accessMode + mode_t mode = Mode(); + if (user == Node()->uid) + mode >>= 6; + else if (group == Node()->gid) + mode >>= 3; + + if (accessMode & ~(mode & S_IRWXO)) + return B_NOT_ALLOWED; + + return B_OK; +} + + +// #pragma mark - + + +void +Inode::AddIterator(AttributeIterator *iterator) +{ + if (fSmallDataLock.Lock() < B_OK) + return; + + fIterators.Add(iterator); + + fSmallDataLock.Unlock(); +} + + +void +Inode::RemoveIterator(AttributeIterator *iterator) +{ + if (fSmallDataLock.Lock() < B_OK) + return; + + fIterators.Remove(iterator); + + fSmallDataLock.Unlock(); +} + + +/** Tries to free up "bytes" space in the small_data section by moving + * attributes to real files. 
Used for system attributes like the name. + * You need to hold the fSmallDataLock when you call this method + */ + +status_t +Inode::MakeSpaceForSmallData(Transaction *transaction,const char *name,int32 bytes) +{ + while (bytes > 0) { + small_data *item = Node()->small_data_start,*max = NULL; + int32 index = 0,maxIndex = 0; + for (;!item->IsLast(Node());item = item->Next(),index++) { + // should not remove those + if (*item->Name() == FILE_NAME_NAME || !strcmp(name,item->Name())) + continue; + + if (max == NULL || max->Size() < item->Size()) { + maxIndex = index; + max = item; + } + + // remove the first one large enough to free the needed amount of bytes + if (bytes < item->Size()) + break; + } + + if (item->IsLast(Node()) || item->Size() < bytes) + return B_ERROR; + + bytes -= max->Size(); + + // Move the attribute to a real attribute file + // Luckily, this doesn't cause any index updates + + Inode *attribute; + status_t status = CreateAttribute(transaction,item->Name(),item->type,&attribute); + if (status < B_OK) + RETURN_ERROR(status); + + size_t length = item->data_size; + status = attribute->WriteAt(transaction,0,item->Data(),&length); + + ReleaseAttribute(attribute); + + if (status < B_OK) { + Vnode vnode(fVolume,Attributes()); + Inode *attributes; + if (vnode.Get(&attributes) < B_OK + || attributes->Remove(transaction,name) < B_OK) { + FATAL(("Could not remove newly created attribute!\n")); + } + + RETURN_ERROR(status); + } + + RemoveSmallData(max,maxIndex); + } + return B_OK; +} + + +/** Private function which removes the given attribute from the small_data + * section. 
+ * You need to hold the fSmallDataLock when you call this method + */ + +status_t +Inode::RemoveSmallData(small_data *item,int32 index) +{ + small_data *next = item->Next(); + if (!next->IsLast(Node())) { + // find the last attribute + small_data *last = next; + while (!last->IsLast(Node())) + last = last->Next(); + + int32 size = (uint8 *)last - (uint8 *)next; + if (size < 0 || size > (uint8 *)Node() + fVolume->BlockSize() - (uint8 *)next) + return B_BAD_DATA; + + memmove(item,next,size); + + // Move the "last" one to its new location and + // correctly terminate the small_data section + last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item)); + memset(last,0,(uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last); + } else + memset(item,0,item->Size()); + + // update all current iterators + AttributeIterator *iterator = NULL; + while ((iterator = fIterators.Next(iterator)) != NULL) + iterator->Update(index,-1); + + return B_OK; +} + + +/** Removes the given attribute from the small_data section. + * Note that you need to write back the inode yourself after having called + * that method. + */ + +status_t +Inode::RemoveSmallData(Transaction *transaction,const char *name) +{ + if (name == NULL) + return B_BAD_VALUE; + + SimpleLocker locker(fSmallDataLock); + + // search for the small_data item + + small_data *item = Node()->small_data_start; + int32 index = 0; + while (!item->IsLast(Node()) && strcmp(item->Name(),name)) { + item = item->Next(); + index++; + } + + if (item->IsLast(Node())) + return B_ENTRY_NOT_FOUND; + + return RemoveSmallData(item,index); +} + + +/** Try to place the given attribute in the small_data section - if the + * new attribute is too big to fit in that section, it returns B_DEVICE_FULL. 
+ * In that case, the attribute should be written to a real attribute file; + * if the attribute was already part of the small_data section, but the new + * one wouldn't fit, the old one is automatically removed from the small_data + * section. + * Note that you need to write back the inode yourself after having called that + * method - it's a bad API decision that it needs a transaction but enforces you + * to write back the inode all by yourself, but it's just more efficient in most + * cases... + */ + +status_t +Inode::AddSmallData(Transaction *transaction,const char *name,uint32 type,const uint8 *data,size_t length,bool force) +{ + if (name == NULL || data == NULL || type == 0) + return B_BAD_VALUE; + + // reject any requests that can't fit into the small_data section + uint32 nameLength = strlen(name); + uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1; + if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode)) + return B_DEVICE_FULL; + + SimpleLocker locker(fSmallDataLock); + + small_data *item = Node()->small_data_start; + int32 index = 0; + while (!item->IsLast(Node()) && strcmp(item->Name(),name)) { + item = item->Next(); + index++; + } + + // is the attribute already in the small_data section? 
+ // then just replace the data part of that one + if (!item->IsLast(Node())) { + // find last attribute + small_data *last = item; + while (!last->IsLast(Node())) + last = last->Next(); + + // try to change the attributes value + if (item->data_size > length + || force + || ((uint8 *)last + length - item->data_size) <= ((uint8 *)Node() + fVolume->InodeSize())) { + // make room for the new attribute if needed (and we are forced to do so) + if (force + && ((uint8 *)last + length - item->data_size) > ((uint8 *)Node() + fVolume->InodeSize())) { + // We also take the free space at the end of the small_data section + // into account, and request only what's really needed + uint32 needed = length - item->data_size - + (uint32)((uint8 *)Node() + fVolume->InodeSize() - (uint8 *)last); + + if (MakeSpaceForSmallData(transaction,name,needed) < B_OK) + return B_ERROR; + + // reset our pointers + item = Node()->small_data_start; + index = 0; + while (!item->IsLast(Node()) && strcmp(item->Name(),name)) { + item = item->Next(); + index++; + } + + last = item; + while (!last->IsLast(Node())) + last = last->Next(); + } + + // move the attributes after the current one + small_data *next = item->Next(); + if (!next->IsLast(Node())) + memmove((uint8 *)item + spaceNeeded,next,(uint8 *)last - (uint8 *)next); + + // Move the "last" one to its new location and + // correctly terminate the small_data section + last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded))); + if ((uint8 *)last < (uint8 *)Node() + fVolume->BlockSize()) + memset(last,0,(uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last); + + item->type = type; + item->data_size = length; + memcpy(item->Data(),data,length); + item->Data()[length] = '\0'; + + return B_OK; + } + + // Could not replace the old attribute, so remove it to let + // let the calling function create an attribute file for it + if (RemoveSmallData(item,index) < B_OK) + return B_ERROR; + + return B_DEVICE_FULL; + } + + // try 
to add the new attribute! + + if ((uint8 *)item + spaceNeeded > (uint8 *)Node() + fVolume->InodeSize()) { + // there is not enough space for it! + if (!force) + return B_DEVICE_FULL; + + // make room for the new attribute + if (MakeSpaceForSmallData(transaction,name,spaceNeeded) < B_OK) + return B_ERROR; + + // get new last item! + item = Node()->small_data_start; + index = 0; + while (!item->IsLast(Node())) { + item = item->Next(); + index++; + } + } + + memset(item,0,spaceNeeded); + item->type = type; + item->name_size = nameLength; + item->data_size = length; + strcpy(item->Name(),name); + memcpy(item->Data(),data,length); + + // correctly terminate the small_data section + item = item->Next(); + if (!item->IsLast(Node())) + memset(item,0,(uint8 *)Node() + fVolume->InodeSize() - (uint8 *)item); + + // update all current iterators + AttributeIterator *iterator = NULL; + while ((iterator = fIterators.Next(iterator)) != NULL) + iterator->Update(index,1); + + return B_OK; +} + + +/** Iterates through the small_data section of an inode. + * To start at the beginning of this section, you let smallData + * point to NULL, like: + * small_data *data = NULL; + * while (inode->GetNextSmallData(&data) { ... } + * + * This function is reentrant and doesn't allocate any memory; + * you can safely stop calling it at any point (you don't need + * to iterate through the whole list). + * You need to hold the fSmallDataLock when you call this method + */ + +status_t +Inode::GetNextSmallData(small_data **smallData) const +{ + if (!Node()) + RETURN_ERROR(B_ERROR); + + small_data *data = *smallData; + + // begin from the start? + if (data == NULL) + data = Node()->small_data_start; + else + data = data->Next(); + + // is already last item? + if (data->IsLast(Node())) + return B_ENTRY_NOT_FOUND; + + *smallData = data; + + return B_OK; +} + + +/** Finds the attribute "name" in the small data section, and + * returns a pointer to it (or NULL if it doesn't exist). 
+ * You need to hold the fSmallDataLock when you call this method + */ + +small_data * +Inode::FindSmallData(const char *name) const +{ + small_data *smallData = NULL; + while (GetNextSmallData(&smallData) == B_OK) { + if (!strcmp(smallData->Name(),name)) + return smallData; + } + return NULL; +} + + +const char * +Inode::Name() const +{ + SimpleLocker locker(fSmallDataLock); + + small_data *smallData = NULL; + while (GetNextSmallData(&smallData) == B_OK) { + if (*smallData->Name() == FILE_NAME_NAME && smallData->name_size == FILE_NAME_NAME_LENGTH) + return (const char *)smallData->Data(); + } + return NULL; +} + + +/** Changes or set the name of a file: in the inode small_data section only, it + * doesn't change it in the parent directory's b+tree. + * Note that you need to write back the inode yourself after having called + * that method. It suffers from the same API decision as AddSmallData() does + * (and for the same reason). + */ + +status_t +Inode::SetName(Transaction *transaction,const char *name) +{ + if (name == NULL || *name == '\0') + return B_BAD_VALUE; + + const char nameTag[2] = {FILE_NAME_NAME, 0}; + + return AddSmallData(transaction,nameTag,FILE_NAME_TYPE,(uint8 *)name,strlen(name),true); +} + + +/** Reads data from the specified attribute. + * This is a high-level attribute function that understands attributes + * in the small_data section as well as real attribute files. 
+ */ + +status_t +Inode::ReadAttribute(const char *name,int32 type,off_t pos,uint8 *buffer,size_t *_length) +{ + if (pos < 0) + pos = 0; + + // search in the small_data section (which has to be locked first) + { + SimpleLocker locker(fSmallDataLock); + + small_data *smallData = FindSmallData(name); + if (smallData != NULL) { + size_t length = *_length; + if (pos >= smallData->data_size) { + *_length = 0; + return B_OK; + } + if (length + pos > smallData->data_size) + length = smallData->data_size - pos; + + memcpy(buffer,smallData->Data() + pos,length); + *_length = length; + return B_OK; + } + } + + // search in the attribute directory + Inode *attribute; + status_t status = GetAttribute(name,&attribute); + if (status == B_OK) { + if (attribute->Lock().Lock() == B_OK) { + status = attribute->ReadAt(pos,(uint8 *)buffer,_length); + attribute->Lock().Unlock(); + } else + status = B_ERROR; + + ReleaseAttribute(attribute); + } + + RETURN_ERROR(status); +} + + +/** Writes data to the specified attribute. + * This is a high-level attribute function that understands attributes + * in the small_data section as well as real attribute files. 
+ */ + +status_t +Inode::WriteAttribute(Transaction *transaction,const char *name,int32 type,off_t pos,const uint8 *buffer,size_t *_length) +{ + // needed to maintain the index + uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH],*oldData = NULL; + size_t oldLength = 0; + + Index index(fVolume); + bool hasIndex = index.SetTo(name) == B_OK; + + Inode *attribute = NULL; + status_t status; + if (GetAttribute(name,&attribute) < B_OK) { + // save the old attribute data + if (hasIndex) { + fSmallDataLock.Lock(); + + small_data *smallData = FindSmallData(name); + if (smallData != NULL) { + oldLength = smallData->data_size; + if (oldLength > BPLUSTREE_MAX_KEY_LENGTH) + oldLength = BPLUSTREE_MAX_KEY_LENGTH; + memcpy(oldData = oldBuffer,smallData->Data(),oldLength); + } + fSmallDataLock.Unlock(); + } + + // if the attribute doesn't exist yet (as a file), try to put it in the + // small_data section first - if that fails (due to insufficent space), + // create a real attribute file + status = AddSmallData(transaction,name,type,buffer,*_length); + if (status == B_DEVICE_FULL) { + status = CreateAttribute(transaction,name,type,&attribute); + if (status < B_OK) + RETURN_ERROR(status); + } else if (status == B_OK) + status = WriteBack(transaction); + } + + if (attribute != NULL) { + if (attribute->Lock().LockWrite() == B_OK) { + // save the old attribute data (if this fails, oldLength will reflect it) + if (hasIndex) { + oldLength = BPLUSTREE_MAX_KEY_LENGTH; + if (attribute->ReadAt(0,oldBuffer,&oldLength) == B_OK) + oldData = oldBuffer; + } + status = attribute->WriteAt(transaction,pos,buffer,_length); + + attribute->Lock().UnlockWrite(); + } else + status = B_ERROR; + + ReleaseAttribute(attribute); + } + + if (status == B_OK) { + // ToDo: find a better way for that "pos" thing... 
+ // Update index + if (hasIndex && pos == 0) { + // index only the first BPLUSTREE_MAX_KEY_LENGTH bytes + uint16 length = *_length; + if (length > BPLUSTREE_MAX_KEY_LENGTH) + length = BPLUSTREE_MAX_KEY_LENGTH; + + index.Update(transaction,name,0,oldData,oldLength,buffer,length,this); + } + } + return status; +} + + +/** Removes the specified attribute from the inode. + * This is a high-level attribute function that understands attributes + * in the small_data section as well as real attribute files. + */ + +status_t +Inode::RemoveAttribute(Transaction *transaction,const char *name) +{ + Index index(fVolume); + bool hasIndex = index.SetTo(name) == B_OK; + + // update index for attributes in the small_data section + if (hasIndex) { + fSmallDataLock.Lock(); + + small_data *smallData = FindSmallData(name); + if (smallData != NULL) { + uint32 length = smallData->data_size; + if (length > BPLUSTREE_MAX_KEY_LENGTH) + length = BPLUSTREE_MAX_KEY_LENGTH; + index.Update(transaction,name,0,smallData->Data(),length,NULL,0,this); + } + fSmallDataLock.Unlock(); + } + + status_t status = RemoveSmallData(transaction,name); + if (status == B_OK) { + status = WriteBack(transaction); + } else if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) { + // remove the attribute file if it exists + Vnode vnode(fVolume,Attributes()); + Inode *attributes; + if ((status = vnode.Get(&attributes)) < B_OK) + return status; + + // update index + Inode *attribute; + if (hasIndex && GetAttribute(name,&attribute) == B_OK) { + uint8 data[BPLUSTREE_MAX_KEY_LENGTH]; + size_t length = BPLUSTREE_MAX_KEY_LENGTH; + if (attribute->ReadAt(0,data,&length) == B_OK) + index.Update(transaction,name,0,data,length,NULL,0,this); + + ReleaseAttribute(attribute); + } + + if ((status = attributes->Remove(transaction,name)) < B_OK) + return status; + + if (attributes->IsEmpty()) { + // remove attribute directory (don't fail if that can't be done) + if (remove_vnode(fVolume->ID(),attributes->ID()) == B_OK) { + // 
update the inode, so that no one will ever doubt it's deleted :-) + attributes->Node()->flags |= INODE_DELETED; + if (attributes->WriteBack(transaction) == B_OK) { + Attributes().SetTo(0,0,0); + WriteBack(transaction); + } else + unremove_vnode(fVolume->ID(),attributes->ID()); + } + } + } + return status; +} + + +status_t +Inode::GetAttribute(const char *name,Inode **attribute) +{ + // does this inode even have attributes? + if (Attributes().IsZero()) + return B_ENTRY_NOT_FOUND; + + Vnode vnode(fVolume,Attributes()); + Inode *attributes; + if (vnode.Get(&attributes) < B_OK) { + FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",name)); + return B_ERROR; + } + + BPlusTree *tree; + status_t status = attributes->GetTree(&tree); + if (status == B_OK) { + vnode_id id; + if ((status = tree->Find((uint8 *)name,(uint16)strlen(name),&id)) == B_OK) + return get_vnode(fVolume->ID(),id,(void **)attribute); + } + return status; +} + + +void +Inode::ReleaseAttribute(Inode *attribute) +{ + if (attribute == NULL) + return; + + put_vnode(fVolume->ID(),attribute->ID()); +} + + +status_t +Inode::CreateAttribute(Transaction *transaction,const char *name,uint32 type,Inode **attribute) +{ + // do we need to create the attribute directory first? + if (Attributes().IsZero()) { + status_t status = Inode::Create(transaction,this,NULL,S_ATTR_DIR | 0666,0,0,NULL); + if (status < B_OK) + RETURN_ERROR(status); + } + Vnode vnode(fVolume,Attributes()); + Inode *attributes; + if (vnode.Get(&attributes) < B_OK) + return B_ERROR; + + // Inode::Create() locks the inode if we provide the "id" parameter + vnode_id id; + return Inode::Create(transaction,attributes,name,S_ATTR | 0666,0,type,&id,attribute); +} + + +// #pragma mark - + + +/** Gives the caller direct access to the b+tree for a given directory. + * The tree is created on demand, but lasts until the inode is + * deleted. 
+ */ + +status_t +Inode::GetTree(BPlusTree **tree) +{ + if (fTree) { + *tree = fTree; + return B_OK; + } + + if (IsDirectory()) { + fTree = new BPlusTree(this); + if (!fTree) + RETURN_ERROR(B_NO_MEMORY); + + *tree = fTree; + status_t status = fTree->InitCheck(); + if (status < B_OK) { + delete fTree; + fTree = NULL; + } + RETURN_ERROR(status); + } + RETURN_ERROR(B_BAD_VALUE); +} + + +bool +Inode::IsEmpty() +{ + BPlusTree *tree; + status_t status = GetTree(&tree); + if (status < B_OK) + return status; + + TreeIterator iterator(tree); + + // index and attribute directories are really empty when they are + // empty - directories for standard files always contain ".", and + // "..", so we need to ignore those two + + uint32 count = 0; + char name[BPLUSTREE_MAX_KEY_LENGTH]; + uint16 length; + vnode_id id; + while (iterator.GetNextEntry(name,&length,B_FILE_NAME_LENGTH,&id) == B_OK) { + if (Mode() & (S_ATTR_DIR | S_INDEX_DIR)) + return false; + + if (++count > 2 || strcmp(".",name) && strcmp("..",name)) + return false; + } + return true; +} + + +/** Finds the block_run where "pos" is located in the data_stream of + * the inode. + * If successful, "offset" will then be set to the file offset + * of the block_run returned; so "pos - offset" is for the block_run + * what "pos" is for the whole stream. 
+ */ + +status_t +Inode::FindBlockRun(off_t pos,block_run &run,off_t &offset) +{ + data_stream *data = &Node()->data; + + // Inode::ReadAt() does already does this + //if (pos > data->size) + // return B_ENTRY_NOT_FOUND; + + // find matching block run + + if (data->max_direct_range > 0 && pos >= data->max_direct_range) { + if (data->max_double_indirect_range > 0 && pos >= data->max_indirect_range) { + // access to double indirect blocks + + CachedBlock cached(fVolume); + + off_t start = pos - data->max_indirect_range; + int32 indirectSize = (16 << fVolume->BlockShift()) * (fVolume->BlockSize() / sizeof(block_run)); + int32 directSize = 4 << fVolume->BlockShift(); + int32 index = start / indirectSize; + int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run); + + block_run *indirect = (block_run *)cached.SetTo( + fVolume->ToBlock(data->double_indirect) + index / runsPerBlock); + if (indirect == NULL) + RETURN_ERROR(B_ERROR); + + //printf("\tstart = %Ld, indirectSize = %ld, directSize = %ld, index = %ld\n",start,indirectSize,directSize,index); + //printf("\tlook for indirect block at %ld,%d\n",indirect[index].allocation_group,indirect[index].start); + + int32 current = (start % indirectSize) / directSize; + + indirect = (block_run *)cached.SetTo( + fVolume->ToBlock(indirect[index % runsPerBlock]) + current / runsPerBlock); + if (indirect == NULL) + RETURN_ERROR(B_ERROR); + + run = indirect[current % runsPerBlock]; + offset = data->max_indirect_range + (index * indirectSize) + (current * directSize); + //printf("\tfCurrent = %ld, fRunFileOffset = %Ld, fRunBlockEnd = %Ld, fRun = %ld,%d\n",fCurrent,fRunFileOffset,fRunBlockEnd,fRun.allocation_group,fRun.start); + } else { + // access to indirect blocks + + int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run); + off_t runBlockEnd = data->max_direct_range; + + CachedBlock cached(fVolume); + off_t block = fVolume->ToBlock(data->indirect); + + for (int32 i = 0;i < data->indirect.length;i++) { + block_run 
*indirect = (block_run *)cached.SetTo(block + i); + if (indirect == NULL) + RETURN_ERROR(B_IO_ERROR); + + int32 current = -1; + while (++current < runsPerBlock) { + if (indirect[current].IsZero()) + break; + + runBlockEnd += indirect[current].length << fVolume->BlockShift(); + if (runBlockEnd > pos) { + run = indirect[current]; + offset = runBlockEnd - (run.length << fVolume->BlockShift()); + //printf("reading from indirect block: %ld,%d\n",fRun.allocation_group,fRun.start); + //printf("### indirect-run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.length,fRunFileOffset); + return fVolume->IsValidBlockRun(run); + } + } + } + RETURN_ERROR(B_ERROR); + } + } else { + // access from direct blocks + + off_t runBlockEnd = 0LL; + int32 current = -1; + + while (++current < NUM_DIRECT_BLOCKS) { + if (data->direct[current].IsZero()) + break; + + runBlockEnd += data->direct[current].length << fVolume->BlockShift(); + if (runBlockEnd > pos) { + run = data->direct[current]; + offset = runBlockEnd - (run.length << fVolume->BlockShift()); + //printf("### run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.length,fRunFileOffset); + return fVolume->IsValidBlockRun(run); + } + } + //PRINT(("FindBlockRun() failed in direct range: size = %Ld, pos = %Ld\n",data->size,pos)); + return B_ENTRY_NOT_FOUND; + } + return fVolume->IsValidBlockRun(run); +} + + +status_t +Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length) +{ + // set/check boundaries for pos/length + + if (pos < 0) + pos = 0; + else if (pos >= Node()->data.size) { + *_length = 0; + return B_NO_ERROR; + } + + size_t length = *_length; + + if (pos + length > Node()->data.size) + length = Node()->data.size - pos; + + block_run run; + off_t offset; + if (FindBlockRun(pos,run,offset) < B_OK) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + uint32 bytesRead = 0; + uint32 blockSize = fVolume->BlockSize(); + uint32 blockShift = fVolume->BlockShift(); + 
uint8 *block; + + // the first block_run we read could not be aligned to the block_size boundary + // (read partial block at the beginning) + + // pos % block_size == (pos - offset) % block_size, offset % block_size == 0 + if (pos % blockSize != 0) { + run.start += (pos - offset) / blockSize; + run.length -= (pos - offset) / blockSize; + + CachedBlock cached(fVolume,run); + if ((block = cached.Block()) == NULL) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + bytesRead = blockSize - (pos % blockSize); + if (length < bytesRead) + bytesRead = length; + + memcpy(buffer,block + (pos % blockSize),bytesRead); + pos += bytesRead; + + length -= bytesRead; + if (length == 0) { + *_length = bytesRead; + return B_OK; + } + + if (FindBlockRun(pos,run,offset) < B_OK) { + *_length = bytesRead; + RETURN_ERROR(B_BAD_VALUE); + } + } + + // the first block_run is already filled in at this point + // read the following complete blocks using cached_read(), + // the last partial block is read using the CachedBlock class + + bool partial = false; + + while (length > 0) { + // offset is the offset to the current pos in the block_run + run.start += (pos - offset) >> blockShift; + run.length -= (pos - offset) >> blockShift; + + if ((run.length << blockShift) > length) { + if (length < blockSize) { + CachedBlock cached(fVolume,run); + if ((block = cached.Block()) == NULL) { + *_length = bytesRead; + RETURN_ERROR(B_BAD_VALUE); + } + memcpy(buffer + bytesRead,block,length); + bytesRead += length; + break; + } + run.length = length >> blockShift; + partial = true; + } + + if (cached_read(fVolume->Device(),fVolume->ToBlock(run),buffer + bytesRead, + run.length,blockSize) != B_OK) { + *_length = bytesRead; + RETURN_ERROR(B_BAD_VALUE); + } + + int32 bytes = run.length << blockShift; + length -= bytes; + bytesRead += bytes; + if (length == 0) + break; + + pos += bytes; + + if (partial) { + // if the last block was read only partially, point block_run + // to the remaining part + run.start += 
run.length; + run.length = 1; + offset = pos; + } else if (FindBlockRun(pos,run,offset) < B_OK) { + *_length = bytesRead; + RETURN_ERROR(B_BAD_VALUE); + } + } + + *_length = bytesRead; + return B_NO_ERROR; +} + + +status_t +Inode::WriteAt(Transaction *transaction,off_t pos,const uint8 *buffer,size_t *_length) +{ + size_t length = *_length; + + // set/check boundaries for pos/length + if (pos < 0) + pos = 0; + else if (pos + length > Node()->data.size) { + off_t oldSize = Size(); + + // the transaction doesn't have to be started already + if ((Flags() & INODE_NO_TRANSACTION) == 0) + transaction->Start(fVolume,BlockNumber()); + + // let's grow the data stream to the size needed + status_t status = SetFileSize(transaction,pos + length); + if (status < B_OK) { + *_length = 0; + RETURN_ERROR(status); + } + // If the position of the write was beyond the file size, we + // have to fill the gap between that position and the old file + // size with zeros. + FillGapWithZeros(oldSize,pos); + } + + block_run run; + off_t offset; + if (FindBlockRun(pos,run,offset) < B_OK) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + bool logStream = (Flags() & INODE_LOGGED) == INODE_LOGGED; + if (logStream) + transaction->Start(fVolume,BlockNumber()); + + uint32 bytesWritten = 0; + uint32 blockSize = fVolume->BlockSize(); + uint32 blockShift = fVolume->BlockShift(); + uint8 *block; + + // the first block_run we write could not be aligned to the block_size boundary + // (write partial block at the beginning) + + // pos % block_size == (pos - offset) % block_size, offset % block_size == 0 + if (pos % blockSize != 0) { + run.start += (pos - offset) / blockSize; + run.length -= (pos - offset) / blockSize; + + CachedBlock cached(fVolume,run); + if ((block = cached.Block()) == NULL) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + bytesWritten = blockSize - (pos % blockSize); + if (length < bytesWritten) + bytesWritten = length; + + memcpy(block + (pos % 
blockSize),buffer,bytesWritten); + + // either log the stream or write it directly to disk + if (logStream) + cached.WriteBack(transaction); + else + fVolume->WriteBlocks(cached.BlockNumber(),block,1); + + pos += bytesWritten; + + length -= bytesWritten; + if (length == 0) { + *_length = bytesWritten; + return B_OK; + } + + if (FindBlockRun(pos,run,offset) < B_OK) { + *_length = bytesWritten; + RETURN_ERROR(B_BAD_VALUE); + } + } + + // the first block_run is already filled in at this point + // write the following complete blocks using Volume::WriteBlocks(), + // the last partial block is written using the CachedBlock class + + bool partial = false; + + while (length > 0) { + // offset is the offset to the current pos in the block_run + run.start += (pos - offset) >> blockShift; + run.length -= (pos - offset) >> blockShift; + + if ((run.length << blockShift) > length) { + if (length < blockSize) { + CachedBlock cached(fVolume,run); + if ((block = cached.Block()) == NULL) { + *_length = bytesWritten; + RETURN_ERROR(B_BAD_VALUE); + } + memcpy(block,buffer + bytesWritten,length); + + if (logStream) + cached.WriteBack(transaction); + else + fVolume->WriteBlocks(cached.BlockNumber(),block,1); + + bytesWritten += length; + break; + } + run.length = length >> blockShift; + partial = true; + } + + status_t status; + if (logStream) { + status = transaction->WriteBlocks(fVolume->ToBlock(run), + buffer + bytesWritten,run.length); + } else { + status = fVolume->WriteBlocks(fVolume->ToBlock(run), + buffer + bytesWritten,run.length); + } + if (status != B_OK) { + *_length = bytesWritten; + RETURN_ERROR(B_BAD_VALUE); + } + + int32 bytes = run.length << blockShift; + length -= bytes; + bytesWritten += bytes; + if (length == 0) + break; + + pos += bytes; + + if (partial) { + // if the last block was written only partially, point block_run + // to the remaining part + run.start += run.length; + run.length = 1; + offset = pos; + } else if (FindBlockRun(pos,run,offset) < B_OK) { + 
			*_length = bytesWritten;
			RETURN_ERROR(B_BAD_VALUE);
		}
	}

	*_length = bytesWritten;

	return B_NO_ERROR;
}


/**	Fills the gap between the old file size and the new file size
 *	with zeros.
 *	It's more or less a copy of Inode::WriteAt() but it can handle
 *	length differences of more than just 4 GB, and it never uses
 *	the log, even if the INODE_LOGGED flag is set.
 *
 *	"pos" is the position to start clearing from, "newSize" the position
 *	at which the gap ends.
 *
 *	NOTE(review): the condition of the first statement is commented out,
 *	which makes the "return B_OK;" unconditional - the function currently
 *	does nothing, and everything below that statement is unreachable.
 *	This looks like a deliberate way to switch the function off; if so,
 *	the gap is not cleared at all. Confirm before re-enabling the check.
 */

status_t
Inode::FillGapWithZeros(off_t pos,off_t newSize)
{
	// with the condition disabled, this always leaves the function
	//if (pos >= newSize)
		return B_OK;

	block_run run;
	off_t offset;
	if (FindBlockRun(pos,run,offset) < B_OK)
		RETURN_ERROR(B_BAD_VALUE);

	// the number of bytes that still have to be cleared
	off_t length = newSize - pos;
	uint32 bytesWritten = 0;
	uint32 blockSize = fVolume->BlockSize();
	uint32 blockShift = fVolume->BlockShift();
	uint8 *block;

	// the first block_run we write could not be aligned to the block_size boundary
	// (write partial block at the beginning)

	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
	if (pos % blockSize != 0) {
		// let the run start at the block that contains "pos"
		run.start += (pos - offset) / blockSize;
		run.length -= (pos - offset) / blockSize;

		CachedBlock cached(fVolume,run);
		if ((block = cached.Block()) == NULL)
			RETURN_ERROR(B_BAD_VALUE);

		// clear from "pos" to the end of the block, or to the end of the
		// gap, whatever comes first
		bytesWritten = blockSize - (pos % blockSize);
		if (length < bytesWritten)
			bytesWritten = length;

		memset(block + (pos % blockSize),0,bytesWritten);
		fVolume->WriteBlocks(cached.BlockNumber(),block,1);

		pos += bytesWritten;

		length -= bytesWritten;
		if (length == 0)
			return B_OK;

		if (FindBlockRun(pos,run,offset) < B_OK)
			RETURN_ERROR(B_BAD_VALUE);
	}

	while (length > 0) {
		// offset is the offset to the current pos in the block_run
		run.start += (pos - offset) >> blockShift;
		run.length -= (pos - offset) >> blockShift;

		// write every remaining block of the run, one block at a time;
		// the blocks are requested with SetTo()'s "empty" flag set
		CachedBlock cached(fVolume);
		off_t blockNumber = fVolume->ToBlock(run);
		for (int32 i = 0;i < run.length;i++) {
			if ((block = cached.SetTo(blockNumber + i,true)) == NULL)
				RETURN_ERROR(B_IO_ERROR);

			if (fVolume->WriteBlocks(cached.BlockNumber(),block,1) < B_OK)
				RETURN_ERROR(B_IO_ERROR);
		}

		int32 bytes = run.length << blockShift;
		length -= bytes;
		bytesWritten += bytes;

		// since we don't respect a last partial block, length can be lower
		if (length <= 0)
			break;

		pos += bytes;

		if (FindBlockRun(pos,run,offset) < B_OK)
			RETURN_ERROR(B_BAD_VALUE);
	}
	return B_OK;
}


status_t
Inode::GrowStream(Transaction *transaction, off_t size)
{
	data_stream *data = &Node()->data;

	// is the data stream already large enough to hold the new size?
	// (can be the case with preallocated blocks)
	if (size < data->max_direct_range
		|| size < data->max_indirect_range
		|| size < data->max_double_indirect_range) {
		data->size = size;
		return B_OK;
	}

	// how many bytes are still needed? (unused ranges are always zero)
	off_t bytes;
	if (data->size < data->max_double_indirect_range)
		bytes = size - data->max_double_indirect_range;
	else if (data->size < data->max_indirect_range)
		bytes = size - data->max_indirect_range;
	else if (data->size < data->max_direct_range)
		bytes = size - data->max_direct_range;
	else
		bytes = size - data->size;

	// do we have enough free blocks on the disk?
	off_t blocks = (bytes + fVolume->BlockSize() - 1) / fVolume->BlockSize();
	if (blocks > fVolume->FreeBlocks())
		return B_DEVICE_FULL;

	// should we preallocate some blocks (currently, always 64k)?
+ off_t blocksNeeded = blocks; + if (blocks < 65536 / fVolume->BlockSize() && fVolume->FreeBlocks() > 128) + blocks = 65536 / fVolume->BlockSize(); + + while (blocksNeeded > 0) { + // the requested blocks do not need to be returned with a + // single allocation, so we need to iterate until we have + // enough blocks allocated + block_run run; + status_t status = fVolume->Allocate(transaction,this,blocks,run); + if (status < B_OK) + return status; + + // okay, we have the needed blocks, so just distribute them to the + // different ranges of the stream (direct, indirect & double indirect) + + blocksNeeded -= run.length; + // don't preallocate if the first allocation was already too small + blocks = blocksNeeded; + + if (data->size <= data->max_direct_range) { + // let's try to put them into the direct block range + int32 free = 0; + for (;free < NUM_DIRECT_BLOCKS;free++) + if (data->direct[free].IsZero()) + break; + + if (free < NUM_DIRECT_BLOCKS) { + // can we merge the last allocated run with the new one? + int32 last = free - 1; + if (free > 0 + && data->direct[last].allocation_group == run.allocation_group + && data->direct[last].start + data->direct[last].length == run.start) { + data->direct[last].length += run.length; + } else { + data->direct[free] = run; + } + data->max_direct_range += run.length * fVolume->BlockSize(); + data->size = blocksNeeded > 0 ? 
data->max_direct_range : size; + continue; + } + } + + if (data->size <= data->max_indirect_range || !data->max_indirect_range) { + CachedBlock cached(fVolume); + block_run *runs = NULL; + int32 free = 0; + off_t block; + + // if there is no indirect block yet, create one + if (data->indirect.IsZero()) { + status = fVolume->Allocate(transaction,this,4,data->indirect,4); + if (status < B_OK) + return status; + + // make sure those blocks are empty + block = fVolume->ToBlock(data->indirect); + for (int32 i = 1;i < data->indirect.length;i++) { + block_run *runs = (block_run *)cached.SetTo(block + i,true); + if (runs == NULL) + return B_IO_ERROR; + + cached.WriteBack(transaction); + } + data->max_indirect_range = data->max_direct_range; + // insert the block_run in the first block + runs = (block_run *)cached.SetTo(block,true); + } else { + uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run); + block = fVolume->ToBlock(data->indirect); + + // search first empty entry + int32 i = 0; + for (;i < data->indirect.length;i++) { + if ((runs = (block_run *)cached.SetTo(block + i)) == NULL) + return B_IO_ERROR; + + for (free = 0;free < numberOfRuns;free++) + if (runs[free].IsZero()) + break; + + if (free < numberOfRuns) + break; + } + if (i == data->indirect.length) + runs = NULL; + } + + if (runs != NULL) { + // try to insert the run to the last one - note that this doesn't + // take block borders into account, so it could be further optimized + int32 last = free - 1; + if (free > 0 + && runs[last].allocation_group == run.allocation_group + && runs[last].start + runs[last].length == run.start) { + runs[last].length += run.length; + } else { + runs[free] = run; + } + data->max_indirect_range += run.length * fVolume->BlockSize(); + data->size = blocksNeeded > 0 ? 
data->max_indirect_range : size; + + cached.WriteBack(transaction); + continue; + } + } + + // when we are here, we need to grow into the double indirect + // range - but that's not yet implemented, so bail out! + + if (data->size <= data->max_double_indirect_range || !data->max_double_indirect_range) { + FATAL(("growing in the double indirect range is not yet implemented!\n")); + // ToDo: implement growing into the double indirect range, please! + } + + RETURN_ERROR(EFBIG); + } + // update the size of the data stream + data->size = size; + + return B_OK; +} + + +status_t +Inode::FreeStaticStreamArray(Transaction *transaction,int32 level,block_run run,off_t size,off_t offset,off_t &max) +{ + int32 indirectSize; + if (level == 0) + indirectSize = (16 << fVolume->BlockShift()) * (fVolume->BlockSize() / sizeof(block_run)); + else if (level == 1) + indirectSize = 4 << fVolume->BlockShift(); + + off_t start; + if (size > offset) + start = size - offset; + else + start = 0; + + int32 index = start / indirectSize; + int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run); + + CachedBlock cached(fVolume); + off_t blockNumber = fVolume->ToBlock(run); + + // set the file offset to the current block run + offset += (off_t)index * indirectSize; + + for (int32 i = index / runsPerBlock;i < run.length;i++) { + block_run *array = (block_run *)cached.SetTo(blockNumber + i); + if (array == NULL) + RETURN_ERROR(B_ERROR); + + for (index = index % runsPerBlock;index < runsPerBlock;index++) { + if (array[index].IsZero()) { + // we also want to break out of the outer loop + i = run.length; + break; + } + + status_t status = B_OK; + if (level == 0) + status = FreeStaticStreamArray(transaction,1,array[index],size,offset,max); + else if (offset >= size) + status = fVolume->Free(transaction,array[index]); + else + max = offset + indirectSize; + + if (status < B_OK) + RETURN_ERROR(status); + + if (offset >= size) + array[index].SetTo(0,0,0); + + offset += indirectSize; + } + index = 0; 
+ + cached.WriteBack(transaction); + } + return B_OK; +} + + +/** Frees all block_runs in the array which come after the specified size. + * It also trims the last block_run that contain the size. + * "offset" and "max" are maintained until the last block_run that doesn't + * have to be freed - after this, the values won't be correct anymore, but + * will still assure correct function for all subsequent calls. + */ + +status_t +Inode::FreeStreamArray(Transaction *transaction,block_run *array,uint32 arrayLength,off_t size,off_t &offset,off_t &max) +{ + off_t newOffset = offset; + uint32 i = 0; + for (;i < arrayLength;i++,offset = newOffset) { + if (array[i].IsZero()) + break; + + newOffset += (off_t)array[i].length << fVolume->BlockShift(); + if (newOffset <= size) + continue; + + block_run run = array[i]; + + // determine the block_run to be freed + if (newOffset > size && offset < size) { + // free partial block_run (and update the original block_run) + run.start = array[i].start + ((size - offset) >> fVolume->BlockShift()) + 1; + array[i].length = run.start - array[i].start; + run.length -= array[i].length; + + if (run.length == 0) + continue; + + // update maximum range + max = offset + ((off_t)array[i].length << fVolume->BlockShift()); + } else { + // free the whole block_run + array[i].SetTo(0,0,0); + + if (max > offset) + max = offset; + } + + if (fVolume->Free(transaction,run) < B_OK) + return B_IO_ERROR; + } + return B_OK; +} + + +status_t +Inode::ShrinkStream(Transaction *transaction, off_t size) +{ + data_stream *data = &Node()->data; + + if (data->max_double_indirect_range > size) { + FreeStaticStreamArray(transaction,0,data->double_indirect,size,data->max_indirect_range,data->max_double_indirect_range); + + if (size <= data->max_indirect_range) { + fVolume->Free(transaction,data->double_indirect); + data->double_indirect.SetTo(0,0,0); + data->max_double_indirect_range = 0; + } + } + if (data->max_indirect_range > size) { + CachedBlock cached(fVolume); + 
off_t block = fVolume->ToBlock(data->indirect); + off_t offset = data->max_direct_range; + + for (int32 i = 0;i < data->indirect.length;i++) { + block_run *array = (block_run *)cached.SetTo(block + i); + if (array == NULL) + break; + + if (FreeStreamArray(transaction,array,fVolume->BlockSize() / sizeof(block_run),size,offset,data->max_indirect_range) == B_OK) + cached.WriteBack(transaction); + } + if (data->max_direct_range == data->max_indirect_range) { + fVolume->Free(transaction,data->indirect); + data->indirect.SetTo(0,0,0); + data->max_indirect_range = 0; + } + } + if (data->max_direct_range > size) { + off_t offset = 0; + FreeStreamArray(transaction,data->direct,NUM_DIRECT_BLOCKS,size,offset,data->max_direct_range); + } + + data->size = size; + return B_OK; +} + + +status_t +Inode::SetFileSize(Transaction *transaction, off_t size) +{ + if (size < 0) + return B_BAD_VALUE; + + off_t oldSize = Node()->data.size; + + if (size == oldSize) + return B_OK; + + // should the data stream grow or shrink? + status_t status; + if (size > oldSize) { + status = GrowStream(transaction,size); + if (status < B_OK) { + // if the growing of the stream fails, the whole operation + // fails, so we should shrink the stream to its former size + ShrinkStream(transaction,oldSize); + } + } + else + status = ShrinkStream(transaction,size); + + if (status < B_OK) + return status; + + return WriteBack(transaction); +} + + +status_t +Inode::Append(Transaction *transaction,off_t bytes) +{ + return SetFileSize(transaction,Size() + bytes); +} + + +status_t +Inode::Trim(Transaction *transaction) +{ + return ShrinkStream(transaction,Size()); +} + + +status_t +Inode::Sync() +{ + // We may also want to flush the attribute's data stream to + // disk here... (do we?) 
+ + data_stream *data = &Node()->data; + status_t status; + + // flush direct range + + for (int32 i = 0;i < NUM_DIRECT_BLOCKS;i++) { + if (data->direct[i].IsZero()) + return B_OK; + + status = flush_blocks(fVolume->Device(),fVolume->ToBlock(data->direct[i]),data->direct[i].length); + if (status != B_OK) + return status; + } + + // flush indirect range + + if (data->max_indirect_range == 0) + return B_OK; + + CachedBlock cached(fVolume); + off_t block = fVolume->ToBlock(data->indirect); + int32 count = fVolume->BlockSize() / sizeof(block_run); + + for (int32 j = 0;j < data->indirect.length;j++) { + block_run *runs = (block_run *)cached.SetTo(block + j); + if (runs == NULL) + break; + + for (int32 i = 0;i < count;i++) { + if (runs[i].IsZero()) + return B_OK; + + status = flush_blocks(fVolume->Device(),fVolume->ToBlock(runs[i]),runs[i].length); + if (status != B_OK) + return status; + } + } + + // flush double indirect range + + if (data->max_double_indirect_range == 0) + return B_OK; + + off_t indirectBlock = fVolume->ToBlock(data->double_indirect); + + for (int32 l = 0;l < data->double_indirect.length;l++) { + block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l); + if (indirectRuns == NULL) + return B_FILE_ERROR; + + CachedBlock directCached(fVolume); + + for (int32 k = 0;k < count;k++) { + if (indirectRuns[k].IsZero()) + return B_OK; + + block = fVolume->ToBlock(indirectRuns[k]); + for (int32 j = 0;j < indirectRuns[k].length;j++) { + block_run *runs = (block_run *)directCached.SetTo(block + j); + if (runs == NULL) + return B_FILE_ERROR; + + for (int32 i = 0;i < count;i++) { + if (runs[i].IsZero()) + return B_OK; + + // ToDo: combine single block_runs to bigger ones when + // they are adjacent + status = flush_blocks(fVolume->Device(),fVolume->ToBlock(runs[i]),runs[i].length); + if (status != B_OK) + return status; + } + } + } + } + return B_OK; +} + + +status_t +Inode::Remove(Transaction *transaction,const char *name,off_t *_id,bool isDirectory) 
+{ + BPlusTree *tree; + if (GetTree(&tree) != B_OK) + RETURN_ERROR(B_BAD_VALUE); + + // does the file even exists? + off_t id; + if (tree->Find((uint8 *)name,(uint16)strlen(name),&id) < B_OK) + return B_ENTRY_NOT_FOUND; + + if (_id) + *_id = id; + + Vnode vnode(fVolume,id); + Inode *inode; + status_t status = vnode.Get(&inode); + if (status < B_OK) { + REPORT_ERROR(status); + return B_ENTRY_NOT_FOUND; + } + + // It's a bit stupid, but indices are regarded as directories + // in BFS - so a test for a directory always succeeds, but you + // should really be able to do whatever you want with your indices + // without having to remove all files first :) + if (!inode->IsIndex()) { + // if it's not of the correct type, don't delete it! + if (inode->IsDirectory() != isDirectory) + return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY; + + // only delete empty directories + if (isDirectory && !inode->IsEmpty()) + return B_DIRECTORY_NOT_EMPTY; + } + + // remove_vnode() allows the inode to be accessed until the last put_vnode() + if (remove_vnode(fVolume->ID(),id) != B_OK) + return B_ERROR; + + if (tree->Remove(transaction,(uint8 *)name,(uint16)strlen(name),id) < B_OK) { + unremove_vnode(fVolume->ID(),id); + RETURN_ERROR(B_ERROR); + } + + // update the inode, so that no one will ever doubt it's deleted :-) + inode->Node()->flags |= INODE_DELETED; + + // In balance to the Inode::Create() method, the main indices + // are updated here (name, size, & last_modified) + + Index index(fVolume); + if ((inode->Mode() & (S_ATTR_DIR | S_ATTR | S_INDEX_DIR)) == 0) { + index.RemoveName(transaction,name,inode); + // If removing from the index fails, it is not regarded as a + // fatal error and will not be reported back! + // Deleted inodes won't be visible in queries anyway. 
+ } + + if ((inode->Mode() & (S_FILE | S_SYMLINK)) != 0) { + index.RemoveSize(transaction,inode); + index.RemoveLastModified(transaction,inode); + } + + if (inode->WriteBack(transaction) < B_OK) + return B_ERROR; + + return B_OK; +} + + +/** Creates the inode with the specified parent directory, and automatically + * adds the created inode to that parent directory. If an attribute directory + * is created, it will also automatically added to the parent inode as such. + * However, the indices root node, and the regular root node won't be added + * to the super block. + * It will also create the initial B+tree for the inode if it's a directory + * of any kind. + * If the "id" variable is given to store the inode's ID, the inode stays + * locked - you have to call put_vnode() if you don't use it anymore. + */ + +status_t +Inode::Create(Transaction *transaction,Inode *parent, const char *name, int32 mode, int omode, uint32 type, off_t *_id, Inode **_inode) +{ + block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0,0,0); + Volume *volume = transaction->GetVolume(); + BPlusTree *tree = NULL; + + if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsDirectory()) { + // check if the file already exists in the directory + if (parent->GetTree(&tree) != B_OK) + RETURN_ERROR(B_BAD_VALUE); + + // does the file already exist? + off_t offset; + if (tree->Find((uint8 *)name,(uint16)strlen(name),&offset) == B_OK) { + // return if the file should be a directory or opened in exclusive mode + if (mode & S_DIRECTORY || omode & O_EXCL) + return B_FILE_EXISTS; + + Vnode vnode(volume,offset); + Inode *inode; + status_t status = vnode.Get(&inode); + if (status < B_OK) { + REPORT_ERROR(status); + return B_ENTRY_NOT_FOUND; + } + + // if it's a directory, bail out! 
+ if (inode->IsDirectory()) + return B_IS_A_DIRECTORY; + + // if omode & O_TRUNC, truncate the existing file + if (omode & O_TRUNC) { + WriteLocked locked(inode->Lock()); + + status_t status = inode->SetFileSize(transaction,0); + if (status < B_OK) + return status; + } + + // only keep the vnode in memory if the vnode_id pointer is provided + if (_id) { + *_id = offset; + vnode.Keep(); + } + if (_inode) + *_inode = inode; + + return B_OK; + } + } else if (parent && (mode & S_ATTR_DIR) == 0) + return B_BAD_VALUE; + + // allocate space for the new inode + InodeAllocator allocator(transaction); + block_run run; + Inode *inode; + status_t status = allocator.New(&parentRun,mode,run,&inode); + if (status < B_OK) + return status; + + // initialize the on-disk bfs_inode structure + + bfs_inode *node = inode->Node(); + + node->magic1 = INODE_MAGIC1; + node->inode_num = run; + node->parent = parentRun; + + node->uid = geteuid(); + node->gid = parent ? parent->Node()->gid : getegid(); + // the group ID is inherited from the parent, if available + node->mode = mode; + node->flags = INODE_IN_USE; + node->type = type; + + node->create_time = (bigtime_t)time(NULL) << INODE_TIME_SHIFT; + node->last_modified_time = node->create_time | (volume->GetUniqueID() & INODE_TIME_MASK); + // we use Volume::GetUniqueID() to avoid having too many duplicates in the + // last_modified index + + node->inode_size = volume->InodeSize(); + + // only add the name to regular files, directories, or symlinks + // don't add it to attributes, or indices + if (tree && (mode & (S_INDEX_DIR | S_ATTR_DIR | S_ATTR)) == 0 + && inode->SetName(transaction,name) < B_OK) + return B_ERROR; + + // initialize b+tree if it's a directory (and add "." & ".." 
if it's + // a standard directory for files - not for attributes or indices) + if (mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) { + BPlusTree *tree = inode->fTree = new BPlusTree(transaction,inode); + if (tree == NULL || tree->InitCheck() < B_OK) + return B_ERROR; + + if ((mode & (S_INDEX_DIR | S_ATTR_DIR)) == 0) { + if (tree->Insert(transaction,".",inode->BlockNumber()) < B_OK + || tree->Insert(transaction,"..",volume->ToBlock(inode->Parent())) < B_OK) + return B_ERROR; + } + } + + // update the main indices (name, size & last_modified) + Index index(volume); + if ((mode & (S_ATTR_DIR | S_ATTR | S_INDEX_DIR)) == 0) { + status = index.InsertName(transaction,name,inode); + if (status < B_OK && status != B_BAD_INDEX) + return status; + } + + inode->UpdateOldLastModified(); + + // The "size" & "last_modified" indices don't contain directories + if ((mode & (S_FILE | S_SYMLINK)) != 0) { + // if adding to these indices fails, the inode creation will not be harmed + index.InsertSize(transaction,inode); + index.InsertLastModified(transaction,inode); + } + + if ((status = inode->WriteBack(transaction)) < B_OK) + return status; + + if (new_vnode(volume->ID(),inode->ID(),inode) != B_OK) + return B_ERROR; + + // add a link to the inode from the parent, depending on its type + if (tree && tree->Insert(transaction,name,volume->ToBlock(run)) < B_OK) { + put_vnode(volume->ID(),inode->ID()); + RETURN_ERROR(B_ERROR); + } else if (parent && mode & S_ATTR_DIR) { + parent->Attributes() = run; + parent->WriteBack(transaction); + } + + allocator.Keep(); + + if (_id != NULL) + *_id = inode->ID(); + else + put_vnode(volume->ID(),inode->ID()); + + if (_inode != NULL) + *_inode = inode; + + return B_OK; +} + + +// #pragma mark - + + +AttributeIterator::AttributeIterator(Inode *inode) + : + fCurrentSmallData(0), + fInode(inode), + fAttributes(NULL), + fIterator(NULL), + fBuffer(NULL) +{ + inode->AddIterator(this); +} + + +AttributeIterator::~AttributeIterator() +{ + if (fAttributes) + 
put_vnode(fAttributes->GetVolume()->ID(),fAttributes->ID()); + + delete fIterator; + fInode->RemoveIterator(this); +} + + +status_t +AttributeIterator::Rewind() +{ + fCurrentSmallData = 0; + + if (fIterator != NULL) + fIterator->Rewind(); + + return B_OK; +} + + +status_t +AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type, vnode_id *_id) +{ + // read attributes out of the small data section + + if (fCurrentSmallData >= 0) { + small_data *item = fInode->Node()->small_data_start; + + fInode->SmallDataLock().Lock(); + + int32 i = 0; + for (;;item = item->Next()) { + if (item->IsLast(fInode->Node())) + break; + + if (item->name_size == FILE_NAME_NAME_LENGTH + && *item->Name() == FILE_NAME_NAME) + continue; + + if (i++ == fCurrentSmallData) + break; + } + + if (!item->IsLast(fInode->Node())) { + strncpy(name,item->Name(),B_FILE_NAME_LENGTH); + *_type = item->type; + *_length = item->name_size; + *_id = (vnode_id)fCurrentSmallData; + + fCurrentSmallData = i; + } + else { + // stop traversing the small_data section + fCurrentSmallData = -1; + } + + fInode->SmallDataLock().Unlock(); + + if (fCurrentSmallData != -1) + return B_OK; + } + + // read attributes out of the attribute directory + + if (fInode->Attributes().IsZero()) + return B_ENTRY_NOT_FOUND; + + Volume *volume = fInode->GetVolume(); + + // if you haven't yet access to the attributes directory, get it + if (fAttributes == NULL) { + if (get_vnode(volume->ID(),volume->ToVnode(fInode->Attributes()),(void **)&fAttributes) != 0 + || fAttributes == NULL) { + FATAL(("get_vnode() failed in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name)); + return B_ENTRY_NOT_FOUND; + } + + BPlusTree *tree; + if (fAttributes->GetTree(&tree) < B_OK + || (fIterator = new TreeIterator(tree)) == NULL) { + FATAL(("could not get tree in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name)); + return B_ENTRY_NOT_FOUND; + } + } + + block_run run; + uint16 length; + 
vnode_id id; + status_t status = fIterator->GetNextEntry(name,&length,B_FILE_NAME_LENGTH,&id); + if (status < B_OK) + return status; + + Vnode vnode(volume,id); + Inode *attribute; + if ((status = vnode.Get(&attribute)) == B_OK) { + *_type = attribute->Node()->type; + *_length = attribute->Node()->data.size; + *_id = id; + } + + return status; +} + + +void +AttributeIterator::Update(uint16 index, int8 change) +{ + // fCurrentSmallData points already to the next item + if (index < fCurrentSmallData) + fCurrentSmallData += change; +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Inode.h b/src/add-ons/kernel/file_systems/bfs/Inode.h new file mode 100644 index 0000000000..5148270997 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Inode.h @@ -0,0 +1,309 @@ +#ifndef INODE_H +#define INODE_H +/* Inode - inode access functions +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include +#ifdef USER +# include "myfs.h" +# include +#endif + +#ifndef _IMPEXP_KERNEL +# define _IMPEXP_KERNEL +#endif + +extern "C" { + #include + #include +} + +#include + +#include "Volume.h" +#include "Journal.h" +#include "Lock.h" +#include "Chain.h" +#include "Debug.h" + + +class BPlusTree; +class TreeIterator; +class AttributeIterator; + + +enum inode_type { + S_DIRECTORY = S_IFDIR, + S_FILE = S_IFREG, + S_SYMLINK = S_IFLNK +}; + + +// The CachedBlock class is completely implemented as inlines. +// It should be used when cache single blocks to make sure they +// will be properly released after use (and it's also very +// convenient to use them). 
+ +class CachedBlock { + public: + CachedBlock(Volume *volume) + : + fVolume(volume), + fBlock(NULL) + { + } + + CachedBlock(Volume *volume,off_t block,bool empty = false) + : + fVolume(volume), + fBlock(NULL) + { + SetTo(block,empty); + } + + CachedBlock(Volume *volume,block_run run,bool empty = false) + : + fVolume(volume), + fBlock(NULL) + { + SetTo(volume->ToBlock(run),empty); + } + + ~CachedBlock() + { + Unset(); + } + + void Unset() + { + if (fBlock != NULL) + release_block(fVolume->Device(),fBlockNumber); + } + + uint8 *SetTo(off_t block,bool empty = false) + { + Unset(); + fBlockNumber = block; + return fBlock = empty ? (uint8 *)get_empty_block(fVolume->Device(),block,fVolume->BlockSize()) + : (uint8 *)get_block(fVolume->Device(),block,fVolume->BlockSize()); + } + + uint8 *SetTo(block_run run,bool empty = false) + { + return SetTo(fVolume->ToBlock(run),empty); + } + + status_t WriteBack(Transaction *transaction) + { + if (transaction == NULL || fBlock == NULL) + RETURN_ERROR(B_BAD_VALUE); + + return transaction->WriteBlocks(fBlockNumber,fBlock); + } + + uint8 *Block() const { return fBlock; } + off_t BlockNumber() const { return fBlockNumber; } + + protected: + Volume *fVolume; + off_t fBlockNumber; + uint8 *fBlock; +}; + + +class Inode : public CachedBlock { + public: + Inode(Volume *volume,vnode_id id,bool empty = false,uint8 reenter = 0); + ~Inode(); + + bfs_inode *Node() const { return (bfs_inode *)fBlock; } + vnode_id ID() const { return fVolume->ToVnode(fBlockNumber); } + + ReadWriteLock &Lock() { return fLock; } + SimpleLock &SmallDataLock() { return fSmallDataLock; } + + mode_t Mode() const { return Node()->mode; } + int32 Flags() const { return Node()->flags; } + bool IsDirectory() const { return Mode() & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR); } + // note, that this test will also be true for S_IFBLK (not that it's used in the fs :) + bool IsIndex() const { return (Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR; } + // that's a stupid check, but 
AFAIK the only possible method... + + bool IsSymLink() const { return S_ISLNK(Mode()); } + bool HasUserAccessableStream() const { return S_ISREG(Mode()); } + // currently only files can be accessed with bfs_read()/bfs_write() + + off_t Size() const { return Node()->data.size; } + + block_run &BlockRun() const { return Node()->inode_num; } + block_run &Parent() const { return Node()->parent; } + block_run &Attributes() const { return Node()->attributes; } + Volume *GetVolume() const { return fVolume; } + + status_t InitCheck(); + + status_t CheckPermissions(int accessMode) const; + + // small_data access methods + status_t MakeSpaceForSmallData(Transaction *transaction,const char *name, int32 length); + status_t RemoveSmallData(Transaction *transaction,const char *name); + status_t AddSmallData(Transaction *transaction,const char *name,uint32 type,const uint8 *data,size_t length,bool force = false); + status_t GetNextSmallData(small_data **smallData) const; + small_data *FindSmallData(const char *name) const; + const char *Name() const; + status_t SetName(Transaction *transaction,const char *name); + + // high-level attribute methods + status_t ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer, size_t *_length); + status_t WriteAttribute(Transaction *transaction, const char *name, int32 type, off_t pos, const uint8 *buffer, size_t *_length); + status_t RemoveAttribute(Transaction *transaction, const char *name); + + // attribute methods + status_t GetAttribute(const char *name,Inode **attribute); + void ReleaseAttribute(Inode *attribute); + status_t CreateAttribute(Transaction *transaction,const char *name,uint32 type,Inode **attribute); + + // for directories only: + status_t GetTree(BPlusTree **); + bool IsEmpty(); + + // manipulating the data stream + status_t FindBlockRun(off_t pos,block_run &run,off_t &offset); + + status_t ReadAt(off_t pos,uint8 *buffer,size_t *length); + status_t WriteAt(Transaction *transaction,off_t pos,const uint8 
// The Vnode class provides a convenience layer upon get_vnode(), so that
// you don't have to call put_vnode() anymore, which may make code more
// readable in some cases.
//
// The constructors only remember the volume and the vnode ID; the vnode is
// actually retrieved by Get(). The destructor calls Put(), which calls
// put_vnode() as long as Keep() or Put() has not been called before.
//
// NOTE(review): Put() does not know whether Get() was ever called or whether
// it succeeded - it puts the vnode in either case. Whether callers rely on
// that (e.g. to drop a reference they already own) cannot be seen from this
// header; confirm before changing it.

class Vnode {
	public:
		Vnode(Volume *volume,vnode_id id)
			:
			fVolume(volume),
			fID(id)
		{
		}

		Vnode(Volume *volume,block_run run)
			:
			fVolume(volume),
			fID(volume->ToVnode(run))
		{
		}

		~Vnode()
		{
			Put();
		}

		// Retrieves the inode via get_vnode() and returns its status;
		// on success, *inode points to the Inode object.
		status_t Get(Inode **inode)
		{
			// should we check inode against NULL here? it should not be necessary
			return get_vnode(fVolume->ID(),fID,(void **)inode);
		}

		// Calls put_vnode() once; later calls (including the one from
		// the destructor) do nothing, because fVolume is reset.
		void Put()
		{
			if (fVolume)
				put_vnode(fVolume->ID(),fID);
			fVolume = NULL;
		}

		// Detaches the object from the vnode without calling put_vnode();
		// the destructor will then do nothing.
		void Keep()
		{
			fVolume = NULL;
		}

	private:
		Volume		*fVolume;
			// NULL once Put() or Keep() has been called
		vnode_id	fID;
};
/**	Translates the open mode passed to bfs_open() into the access mode
 *	that has to be checked for it: read-only maps to R_OK, write-only to
 *	W_OK, and everything else to R_OK | W_OK. Only the O_RWMASK bits of
 *	\a omode are looked at.
 */

inline int oModeToAccess(int omode)
{
	switch (omode & O_RWMASK) {
		case O_RDONLY:
			return R_OK;
		case O_WRONLY:
			return W_OK;
		default:
			return R_OK | W_OK;
	}
}
+*/ + + +#include "Journal.h" +#include "Inode.h" +#include "Debug.h" +#include "cpp.h" + + +Journal::Journal(Volume *volume) + : + fVolume(volume), + fLock("bfs journal"), + fOwner(NULL), + fOwningThread(-1), + fArray(volume->BlockSize()), + fLogSize(volume->Log().length), + fMaxTransactionSize(fLogSize / 4 - 5), + fUsed(0), + fTransactionsInEntry(0) +{ + if (fMaxTransactionSize > fLogSize / 2) + fMaxTransactionSize = fLogSize / 2 - 5; +} + + +Journal::~Journal() +{ + FlushLogAndBlocks(); +} + + +status_t +Journal::InitCheck() +{ + if (fVolume->LogStart() != fVolume->LogEnd()) { + if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) + FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n")); + + return ReplayLog(); + } + + return B_OK; +} + + +status_t +Journal::CheckLogEntry(int32 count,off_t *array) +{ + // ToDo: check log entry integrity (block numbers and entry size) + PRINT(("Log entry has %ld entries (%Ld)\n",count)); + return B_OK; +} + + +status_t +Journal::ReplayLogEntry(int32 *_start) +{ + PRINT(("ReplayLogEntry(start = %u)\n",*_start)); + + off_t logOffset = fVolume->ToBlock(fVolume->Log()); + off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log()); + int32 blockSize = fVolume->BlockSize(); + int32 count = 1,valuesInBlock = blockSize / sizeof(off_t); + int32 numArrayBlocks; + off_t blockNumber; + bool first = true; + + CachedBlock cached(fVolume); + while (count > 0) { + off_t *array = (off_t *)cached.SetTo(arrayBlock); + if (array == NULL) + return B_IO_ERROR; + + int32 index = 0; + if (first) { + count = array[0]; + if (count < 1 || count >= fLogSize) + return B_BAD_DATA; + + first = false; + + numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize; + blockNumber = (*_start + numArrayBlocks) % fLogSize; + // first real block in this log entry + *_start += count; + index++; + // the first entry in the first block is the number + // of blocks in that log entry + } + 
(*_start)++; + + if (CheckLogEntry(count,array + 1) < B_OK) + return B_BAD_DATA; + + CachedBlock cachedCopy(fVolume); + for (;index < valuesInBlock && count-- > 0;index++) { + PRINT(("replay block %Ld in log at %Ld!\n",array[index],blockNumber)); + + uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber); + if (copy == NULL) + RETURN_ERROR(B_IO_ERROR); + + ssize_t written = write_pos(fVolume->Device(),array[index] << fVolume->BlockShift(),copy,blockSize); + if (written != blockSize) + RETURN_ERROR(B_IO_ERROR); + + blockNumber = (blockNumber + 1) % fLogSize; + } + arrayBlock++; + if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize) + arrayBlock = fVolume->ToBlock(fVolume->Log()); + } + return B_OK; +} + + +/** Replays all log entries - this will put the disk into a + * consistent and clean state, if it was not correctly unmounted + * before. + * This method is called by Journal::InitCheck() if the log start + * and end pointer don't match. + */ + +status_t +Journal::ReplayLog() +{ + INFORM(("Replay log, disk was not correctly unmounted...\n")); + + int32 start = fVolume->LogStart(); + int32 lastStart = -1; + while (true) { + + // stop if the log is completely flushed + if (start == fVolume->LogEnd()) + break; + + if (start == lastStart) { + // strange, flushing the log hasn't changed the log_start pointer + return B_ERROR; + } + lastStart = start; + + status_t status = ReplayLogEntry(&start); + if (status < B_OK) { + FATAL(("replaying log entry from %u failed: %s\n",start,strerror(status))); + return B_ERROR; + } + start = start % fLogSize; + } + + PRINT(("replaying worked fine!\n")); + fVolume->SuperBlock().log_start = fVolume->LogEnd(); + fVolume->LogStart() = fVolume->LogEnd(); + fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN; + + return fVolume->WriteSuperBlock(); +} + + +/** This is a callback function that is called by the cache, whenever + * a block is flushed to disk that was updated as part of a transaction. 
+ * This is necessary to keep track of completed transactions, to be + * able to update the log start pointer. + */ + +void +Journal::blockNotify(off_t blockNumber,size_t numBlocks,void *arg) +{ + log_entry *logEntry = (log_entry *)arg; + + logEntry->cached_blocks -= numBlocks; + if (logEntry->cached_blocks > 0) { + // nothing to do yet... + return; + } + + Journal *journal = logEntry->journal; + disk_super_block &superBlock = journal->fVolume->SuperBlock(); + bool update = false; + + // Set log_start pointer if possible... + + if (logEntry == journal->fEntries.head) { + if (logEntry->Next() != NULL) { + int32 length = logEntry->next->start - logEntry->start; + superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize; + } else + superBlock.log_start = journal->fVolume->LogEnd(); + + update = true; + } + journal->fUsed -= logEntry->length; + + journal->fEntriesLock.Lock(); + logEntry->Remove(); + journal->fEntriesLock.Unlock(); + + free(logEntry); + + // update the super block, and change the disk's state, if necessary + + if (update) { + journal->fVolume->LogStart() = superBlock.log_start; + + if (superBlock.log_start == superBlock.log_end) + superBlock.flags = SUPER_BLOCK_DISK_CLEAN; + + journal->fVolume->WriteSuperBlock(); + } +} + + +status_t +Journal::WriteLogEntry() +{ + fTransactionsInEntry = 0; + fHasChangedBlocks = false; + + sorted_array *array = fArray.Array(); + if (array == NULL || array->count == 0) + return B_OK; + + // Make sure there is enough space in the log. + // If that fails for whatever reason, panic! 
+ force_cache_flush(fVolume->Device(),false); + int32 tries = fLogSize / 2 + 1; + while (TransactionSize() > FreeLogBlocks() && tries-- > 0) + force_cache_flush(fVolume->Device(),true); + + if (tries <= 0) { + fVolume->Panic(); + return B_BAD_DATA; + } + + int32 blockShift = fVolume->BlockShift(); + off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; + off_t logStart = fVolume->LogEnd(); + off_t logPosition = logStart % fLogSize; + + // Write disk block array + + uint8 *arrayBlock = (uint8 *)array; + + for (int32 size = fArray.BlocksUsed();size-- > 0;) { + write_pos(fVolume->Device(),logOffset + (logPosition << blockShift),arrayBlock,fVolume->BlockSize()); + + logPosition = (logPosition + 1) % fLogSize; + arrayBlock += fVolume->BlockSize(); + } + + // Write logged blocks into the log + + CachedBlock cached(fVolume); + for (int32 i = 0;i < array->count;i++) { + uint8 *block = cached.SetTo(array->values[i]); + if (block == NULL) + return B_IO_ERROR; + + write_pos(fVolume->Device(),logOffset + (logPosition << blockShift),block,fVolume->BlockSize()); + logPosition = (logPosition + 1) % fLogSize; + } + + log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry)); + if (logEntry != NULL) { + logEntry->start = logStart; + logEntry->length = TransactionSize(); + logEntry->cached_blocks = array->count; + logEntry->journal = this; + + fEntriesLock.Lock(); + fEntries.Add(logEntry); + fEntriesLock.Unlock(); + + fCurrent = logEntry; + fUsed += logEntry->length; + + set_blocks_info(fVolume->Device(),&array->values[0],array->count,blockNotify,logEntry); + } + + // If the log goes to the next round (the log is written as a + // circular buffer), all blocks will be flushed out which is + // possible because we don't have any locked blocks at this + // point. + if (logPosition < logStart) + fVolume->FlushDevice(); + + // We need to flush the drives own cache here to ensure + // disk consistency. 
+ // If that call fails, we can't do anything about it anyway + ioctl(fVolume->Device(),B_FLUSH_DRIVE_CACHE); + + fArray.MakeEmpty(); + + // Update the log end pointer in the super block + fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; + fVolume->SuperBlock().log_end = logPosition; + fVolume->LogEnd() = logPosition; + + fVolume->WriteSuperBlock(); +} + + +status_t +Journal::FlushLogAndBlocks() +{ + status_t status = Lock((Transaction *)this); + if (status != B_OK) + return status; + + // write the current log entry to disk + + if (TransactionSize() != 0) { + status = WriteLogEntry(); + if (status < B_OK) + FATAL(("writing current log entry failed: %s\n",status)); + } + status = fVolume->FlushDevice(); + + Unlock((Transaction *)this,true); + return status; +} + + +status_t +Journal::Lock(Transaction *owner) +{ + if (owner == fOwner) + return B_OK; + + status_t status = fLock.Lock(); + if (status == B_OK) { + fOwner = owner; + fOwningThread = find_thread(NULL); + } + + // if the last transaction is older than 2 secs, start a new one + if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L) + WriteLogEntry(); + + return B_OK; +} + + +void +Journal::Unlock(Transaction *owner,bool success) +{ + if (owner != fOwner) + return; + + TransactionDone(success); + + fTimestamp = system_time(); + fOwner = NULL; + fOwningThread = -1; + fLock.Unlock(); +} + + +status_t +Journal::TransactionDone(bool success) +{ + if (!success && fTransactionsInEntry == 0) { + // we can safely abort the transaction + // ToDo: abort the transaction + PRINT(("should abort transaction...\n")); + } + + // Up to a maximum size, we will just batch several + // transactions together to improve speed + if (TransactionSize() < fMaxTransactionSize) { + fTransactionsInEntry++; + fHasChangedBlocks = false; + + return B_OK; + } + + return WriteLogEntry(); +} + + +status_t +Journal::LogBlocks(off_t blockNumber,const uint8 *buffer,size_t numBlocks) +{ + // ToDo: that's for now - we should 
change the log file size here + if (TransactionSize() + numBlocks + 1 > fLogSize) + return B_DEVICE_FULL; + + fHasChangedBlocks = true; + int32 blockSize = fVolume->BlockSize(); + + for (;numBlocks-- > 0;blockNumber++,buffer += blockSize) { + if (fArray.Find(blockNumber) >= 0) + continue; + + // Insert the block into the transaction's array, and write the changes + // back into the locked cache buffer + fArray.Insert(blockNumber); + status_t status = cached_write_locked(fVolume->Device(),blockNumber,buffer,1,blockSize); + if (status < B_OK) + return status; + } + + // If necessary, flush the log, so that we have enough space for this transaction + if (TransactionSize() > FreeLogBlocks()) + force_cache_flush(fVolume->Device(),true); + + return B_OK; +} + + +// #pragma mark - + + +status_t +Transaction::Start(Volume *volume,off_t refBlock) +{ + // has it already been started? + if (fJournal != NULL) + return B_OK; + + fJournal = volume->GetJournal(refBlock); + if (fJournal != NULL && fJournal->Lock(this) == B_OK) + return B_OK; + + fJournal = NULL; + return B_ERROR; +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Journal.h b/src/add-ons/kernel/file_systems/bfs/Journal.h new file mode 100644 index 0000000000..f9195d780e --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Journal.h @@ -0,0 +1,152 @@ +#ifndef JOURNAL_H +#define JOURNAL_H +/* Journal - transaction and logging +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
// Describes one entry that has been written to the log, but whose blocks
// are not yet all flushed to their final location (see
// Journal::WriteLogEntry() and Journal::blockNotify()).
// NOTE(review): "node" (and "list" below) appear without template
// arguments in this view - they are presumably templates from Chain.h or
// Utility.h; confirm against the real header.
// NOTE(review): start/length are 16 bit wide, while WriteLogEntry() assigns
// an off_t log position and a uint32 size to them - verify that the log can
// never exceed 65535 blocks.

struct log_entry : node {
	uint16		start;
		// log position where the entry begins
	uint16		length;
		// size of the entry in blocks, as reported by TransactionSize()
	uint32		cached_blocks;
		// number of blocks of this entry blockNotify() is still waiting for
	Journal		*journal;
};


// The Journal collects the blocks changed by transactions, writes them
// to the log area of the volume, and replays the log if necessary.

class Journal {
	public:
		Journal(Volume *);
		~Journal();

		status_t InitCheck();

		// only one transaction can own the journal at a time
		status_t Lock(Transaction *owner);
		void Unlock(Transaction *owner,bool success);

		// log replay
		status_t CheckLogEntry(int32 count, off_t *array);
		status_t ReplayLogEntry(int32 *start);
		status_t ReplayLog();

		// log writing
		status_t WriteLogEntry();
		status_t LogBlocks(off_t blockNumber,const uint8 *buffer, size_t numBlocks);

		thread_id CurrentThread() const { return fOwningThread; }
		Transaction *CurrentTransaction() const { return fOwner; }
		// the sum of fArray.CountItems() and fArray.BlocksUsed()
		uint32 TransactionSize() const { return fArray.CountItems() + fArray.BlocksUsed(); }

		status_t FlushLogAndBlocks();
		Volume *GetVolume() const { return fVolume; }

		inline int32 FreeLogBlocks() const;

	private:
		friend log_entry;

		static void blockNotify(off_t blockNumber, size_t numBlocks, void *arg);
		status_t TransactionDone(bool success);

		Volume		*fVolume;
		Benaphore	fLock;
		Transaction *fOwner;
		thread_id	fOwningThread;
		BlockArray	fArray;
			// block numbers of the current (not yet written) log entry
		uint32		fLogSize,fMaxTransactionSize,fUsed;
		int32		fTransactionsInEntry;
			// number of finished transactions batched in the current entry
		SimpleLock	fEntriesLock;
			// guards adding to/removing from fEntries
		list fEntries;
		log_entry	*fCurrent;
		bool		fHasChangedBlocks;
		bigtime_t	fTimestamp;
			// system_time() of the last Unlock()
};


// Returns the number of blocks between the log end and the log start
// pointer, taking the wrap-around of the circular log into account.
// If both pointers are equal, the whole log size is returned.

inline int32
Journal::FreeLogBlocks() const
{
	return fVolume->LogStart() <= fVolume->LogEnd() ?
		fLogSize - fVolume->LogEnd() + fVolume->LogStart()
		: fVolume->LogStart() - fVolume->LogEnd();
}
+ +class Transaction { + public: + Transaction(Volume *volume,off_t refBlock) + : + fJournal(NULL) + { + Start(volume,refBlock); + } + + Transaction(Volume *volume,block_run refRun) + : + fJournal(NULL) + { + Start(volume,volume->ToBlock(refRun)); + } + + Transaction() + : + fJournal(NULL) + { + } + + ~Transaction() + { + if (fJournal) + fJournal->Unlock(this,false); + } + + status_t Start(Volume *volume,off_t refBlock); + + void Done() + { + if (fJournal != NULL) + fJournal->Unlock(this,true); + fJournal = NULL; + } + + status_t WriteBlocks(off_t blockNumber,const uint8 *buffer,size_t numBlocks = 1) + { + if (fJournal == NULL) + return B_NO_INIT; + + return fJournal->LogBlocks(blockNumber,buffer,numBlocks); + //status_t status = cached_write/*_locked*/(fVolume->Device(),blockNumber,buffer,numBlocks,fVolume->BlockSize()); + //return status; + } + + Volume *GetVolume() { return fJournal != NULL ? fJournal->GetVolume() : NULL; } + + protected: + Journal *fJournal; +}; + +#endif /* JOURNAL_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Lock.h b/src/add-ons/kernel/file_systems/bfs/Lock.h new file mode 100644 index 0000000000..8ffd9dc133 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Lock.h @@ -0,0 +1,337 @@ +#ifndef LOCK_H +#define LOCK_H +/* Lock - benaphores, read/write lock implementation +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** Roughly based on a Be sample code written by Nathan Schrenk. +** +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + + +class Benaphore { + public: + Benaphore(const char *name = "bfs benaphore") + : + fSemaphore(create_sem(0, name)), + fCount(1) + { + } + + ~Benaphore() + { + delete_sem(fSemaphore); + } + + status_t InitCheck() + { + if (fSemaphore < B_OK) + return fSemaphore; + + return B_OK; + } + + status_t Lock() + { + if (atomic_add(&fCount, -1) <= 0) + return acquire_sem(fSemaphore); + + return B_OK; + } + + void Unlock() + { + if (atomic_add(&fCount, 1) < 0) + release_sem(fSemaphore); + } + + private: + sem_id fSemaphore; + vint32 fCount; +}; + +// a convenience class to lock the benaphore + +class Locker { + public: + Locker(Benaphore &lock) + : fLock(lock) + { + fStatus = lock.Lock(); + } + + ~Locker() + { + if (fStatus == B_OK) + fLock.Unlock(); + } + + private: + Benaphore &fLock; + status_t fStatus; +}; + + +//**** Many Reader/Single Writer Lock + +// This is a "fast" implementation of a single writer/many reader +// locking scheme. It's fast because it uses the benaphore idea +// to do lazy semaphore locking - in most cases it will only have +// to do some simple integer arithmetic. +// The second semaphore (fWriteLock) is needed to prevent the situation +// that a second writer can acquire the lock when there are still readers +// holding it. + +#define MAX_READERS 100000 + +// Note: this code will break if you actually have 100000 readers +// at once. With the current thread/... limits in BeOS you can't +// touch that value, but it might be possible in the future. +// Also, you can only have about 20000 concurrent writers until +// the semaphore count exceeds the int32 bounds + +// Timeouts: +// It may be a good idea to have timeouts for the WriteLocked class, +// in case something went wrong - we'll see if this is necessary, +// but it would be a somewhat poor work-around for a deadlock... 
// Define FAST_LOCK if you want to have fast locks as the foundation for
// the ReadWriteLock class - the benefit is that acquire_sem() doesn't
// have to be called when there is no one waiting.
// The disadvantage is the use of 2 real semaphores which is quite
// expensive regarding that BeOS only allows for a total of 64k
// semaphores.

//#define FAST_LOCK
#ifdef FAST_LOCK
// Variant based on an atomic counter: fCount starts at MAX_READERS, every
// reader subtracts 1, a writer subtracts MAX_READERS. The semaphore (created
// with a count of 0) is only used when a thread actually has to wait.
class ReadWriteLock {
	public:
		ReadWriteLock(const char *name = "bfs r/w lock")
			:
			fSemaphore(create_sem(0, name)),
			fCount(MAX_READERS),
			fWriteLock()
		{
		}

		~ReadWriteLock()
		{
			delete_sem(fSemaphore);
		}

		// Returns the error from create_sem(), or B_OK.
		status_t InitCheck()
		{
			if (fSemaphore < B_OK)
				return fSemaphore;

			return B_OK;
		}

		// Acquires the lock for reading; only waits on the semaphore
		// if the counter was already used up (i.e. a writer is active
		// or waiting).
		status_t Lock()
		{
			if (atomic_add(&fCount, -1) <= 0)
				return acquire_sem(fSemaphore);

			return B_OK;
		}

		// Releases a read lock; wakes up a waiting thread if the counter
		// was negative before.
		void Unlock()
		{
			if (atomic_add(&fCount, 1) < 0)
				release_sem(fSemaphore);
		}

		// Acquires the lock for writing. fWriteLock serializes the
		// writers while they adjust the counter and wait for the readers.
		// Returns B_ERROR if fWriteLock could not be locked, otherwise
		// the status of acquire_sem_etc() (or B_OK if no waiting was
		// necessary).
		status_t LockWrite()
		{
			if (fWriteLock.Lock() < B_OK)
				return B_ERROR;

			// "readers" is the counter value before we claimed all slots
			int32 readers = atomic_add(&fCount, -MAX_READERS);
			status_t status = B_OK;

			if (readers < MAX_READERS) {
				// Acquire sem for all readers currently not using a semaphore.
				// But if we are not the only write lock in the queue, just get
				// the one for us
				status = acquire_sem_etc(fSemaphore,readers <= 0 ? 1 : MAX_READERS - readers,0,0);
			}
			fWriteLock.Unlock();

			return status;
		}

		// Releases the write lock by giving the MAX_READERS slots back.
		void UnlockWrite()
		{
			int32 readers = atomic_add(&fCount,MAX_READERS);
			if (readers < 0) {
				// release sem for all readers only when we were the only writer
				release_sem_etc(fSemaphore,readers <= -MAX_READERS ? 1 : -readers,0);
			}
		}

	private:
		friend class ReadLocked;
		friend class WriteLocked;

		sem_id		fSemaphore;
		vint32		fCount;
		Benaphore	fWriteLock;
};
#else	// FAST_LOCK
// Variant based on a single semaphore that is created with a count of
// MAX_READERS: a reader acquires it once, a writer acquires it
// MAX_READERS times.
class ReadWriteLock {
	public:
		ReadWriteLock(const char *name = "bfs r/w lock")
			:
			fSemaphore(create_sem(MAX_READERS, name))
		{
		}

		~ReadWriteLock()
		{
			delete_sem(fSemaphore);
		}

		// Returns the error from create_sem(), or B_OK.
		status_t InitCheck()
		{
			if (fSemaphore < B_OK)
				return fSemaphore;

			return B_OK;
		}

		// read lock
		status_t Lock()
		{
			return acquire_sem(fSemaphore);
		}

		void Unlock()
		{
			release_sem(fSemaphore);
		}

		// write lock: takes all MAX_READERS units of the semaphore
		status_t LockWrite()
		{
			return acquire_sem_etc(fSemaphore,MAX_READERS,0,0);
		}

		void UnlockWrite()
		{
			release_sem_etc(fSemaphore,MAX_READERS,0);
		}

	private:
		friend class ReadLocked;
		friend class WriteLocked;

		sem_id		fSemaphore;
};
#endif	// FAST_LOCK


// Holds a read lock on the ReadWriteLock for the lifetime of the object;
// the lock is only released if it could be acquired.

class ReadLocked {
	public:
		ReadLocked(ReadWriteLock &lock)
			:
			fLock(lock)
		{
			fStatus = lock.Lock();
		}

		~ReadLocked()
		{
			if (fStatus == B_OK)
				fLock.Unlock();
		}

	private:
		ReadWriteLock	&fLock;
		status_t		fStatus;
};


// Holds a write lock on the ReadWriteLock for the lifetime of the object;
// the lock is only released if it could be acquired.

class WriteLocked {
	public:
		WriteLocked(ReadWriteLock &lock)
			:
			fLock(lock)
		{
			fStatus = lock.LockWrite();
		}

		~WriteLocked()
		{
			if (fStatus == B_OK)
				fLock.UnlockWrite();
		}

		// Note: despite its name, this returns the status of LockWrite(),
		// i.e. B_OK (and not "true") if the lock is held.
		status_t IsLocked()
		{
			return fStatus;
		}

	private:
		ReadWriteLock	&fLock;
		status_t		fStatus;
};


// A simple locking structure that doesn't use a semaphore - it's useful
// if you have to protect critical parts with a short runtime.
// Every Lock() call draws the next number from fLock and polls until
// fUnlock has reached that number; Unlock() increments fUnlock.

class SimpleLock {
	public:
		SimpleLock()
			:
			fLock(0),
			fUnlock(0)
		{
		}

		// "time" is the interval passed to snooze() between two checks
		status_t Lock(bigtime_t time = 500)
		{
			int32 turn = atomic_add(&fLock,1);
			while (turn != fUnlock)
				snooze(time);

			// ToDo: the lock cannot fail currently! We may want
			// to change this
			return B_OK;
		}

		void Unlock()
		{
			atomic_add(&fUnlock,1);
		}

	private:
		vint32	fLock;
			// the next number to be handed out
		vint32	fUnlock;
			// the number that may currently hold the lock
};

// A convenience class to lock the SimpleLock, note the
// different timing compared to the direct call (the default polling
// interval is 1000 here, as opposed to 500 in SimpleLock::Lock())

class SimpleLocker {
	public:
		SimpleLocker(SimpleLock &lock,bigtime_t time = 1000)
			: fLock(lock)
		{
			lock.Lock(time);
		}

		~SimpleLocker()
		{
			fLock.Unlock();
		}

	private:
		SimpleLock	&fLock;
};
+ + +enum ops { + OP_NONE, + + OP_AND, + OP_OR, + + OP_EQUATION, + + OP_EQUAL, + OP_UNEQUAL, + OP_GREATER_THAN, + OP_LESS_THAN, + OP_GREATER_THAN_OR_EQUAL, + OP_LESS_THAN_OR_EQUAL, +}; + +enum match { + NO_MATCH = 0, + MATCH_OK = 1, + + MATCH_BAD_PATTERN = -2, + MATCH_INVALID_CHARACTER +}; + +// return values from isValidPattern() +enum { + PATTERN_INVALID_ESCAPE = -3, + PATTERN_INVALID_RANGE, + PATTERN_INVALID_SET +}; + +union value { + int64 Int64; + uint64 Uint64; + int32 Int32; + uint32 Uint32; + float Float; + double Double; + char String[INODE_FILE_NAME_LENGTH]; +}; + +class Term { + public: + Term(int8 op) : fOp(op), fParent(NULL) {} + + int8 Op() const { return fOp; } + + void SetParent(Term *parent) { fParent = parent; } + Term *Parent() const { return fParent; } + + virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0, + const uint8 *key = NULL,size_t size = 0) = 0; + virtual void Complement() = 0; + + virtual void CalculateScore(Index &index) = 0; + virtual int32 Score() const = 0; + + virtual status_t InitCheck() = 0; + +#ifdef DEBUG + virtual void PrintToStream() = 0; +#endif + + protected: + int8 fOp; + Term *fParent; +}; + +// Although an Equation object is quite independent from the volume on which +// the query is run, there are some dependencies that are produced while +// querying: +// The type/size of the value, the score, and if it has an index or not. +// So you could run more than one query on the same volume, but it might return +// wrong values when it runs concurrently on another volume. +// That's not an issue right now, because we run single-threaded and don't use +// queries more than once. 
+ +class Equation : public Term { + public: + Equation(char **expr); + ~Equation(); + + virtual status_t InitCheck(); + + status_t ParseQuotedString(char **_start,char **_end); + char *CopyString(char *start, char *end); + + virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0,const uint8 *key = NULL,size_t size = 0); + virtual void Complement(); + + status_t PrepareQuery(Volume *volume, Index &index, TreeIterator **iterator); + status_t GetNextMatching(Volume *volume,TreeIterator *iterator,struct dirent *dirent,size_t bufferSize); + + virtual void CalculateScore(Index &index); + virtual int32 Score() const { return fScore; } + +#ifdef DEBUG + virtual void PrintToStream(); +#endif + + private: + status_t ConvertValue(type_code type); + bool CompareTo(const uint8 *value, uint16 size); + uint8 *Value() const { return (uint8 *)&fValue; } + status_t MatchEmptyString(); + + char *fAttribute; + char *fString; + union value fValue; + type_code fType; + size_t fSize; + bool fIsPattern; + bool fIsSpecialTime; + + int32 fScore; + bool fHasIndex; +}; + +class Operator : public Term { + public: + Operator(Term *,int8,Term *); + ~Operator(); + + Term *Left() const { return fLeft; } + Term *Right() const { return fRight; } + + virtual status_t Match(Inode *inode,const char *attribute = NULL,int32 type = 0,const uint8 *key = NULL,size_t size = 0); + virtual void Complement(); + + virtual void CalculateScore(Index &index); + virtual int32 Score() const; + + virtual status_t InitCheck(); + + //Term *Copy() const; +#ifdef DEBUG + virtual void PrintToStream(); +#endif + + protected: + Term *fLeft,*fRight; +}; + + +//--------------------------------- + + +void +skipWhitespace(char **expr, int32 skip = 0) +{ + char *string = (*expr) + skip; + while (*string == ' ' || *string == '\t') string++; + *expr = string; +} + + +void +skipWhitespaceReverse(char **expr,char *stop) +{ + char *string = *expr; + while (string > stop && (*string == ' ' || *string == '\t')) 
string--; + *expr = string; +} + + +// #pragma mark - + + +uint32 +utf8ToUnicode(char **string) +{ + uint8 *bytes = (uint8 *)*string; + int32 length; + uint8 mask = 0x1f; + + switch (bytes[0] & 0xf0) { + case 0xc0: + case 0xd0: length = 2; break; + case 0xe0: length = 3; break; + case 0xf0: + mask = 0x0f; + length = 4; + break; + default: + // valid 1-byte character + // and invalid characters + (*string)++; + return bytes[0]; + } + uint32 c = bytes[0] & mask; + int32 i = 1; + for (;i < length && (bytes[i] & 0x80) > 0;i++) + c = (c << 6) | (bytes[i] & 0x3f); + + if (i < length) { + // invalid character + (*string)++; + return (uint32)bytes[0]; + } + *string += length; + return c; +} + + +int32 +getFirstPatternSymbol(char *string) +{ + char c; + + for (int32 index = 0;(c = *string++);index++) { + if (c == '*' || c == '?' || c == '[') + return index; + } + return -1; +} + + +bool +isPattern(char *string) +{ + return getFirstPatternSymbol(string) >= 0 ? true : false; +} + + +status_t +isValidPattern(char *pattern) +{ + while (*pattern) { + switch (*pattern++) { + case '\\': + // the escape character must not be at the end of the pattern + if (!*pattern++) + return PATTERN_INVALID_ESCAPE; + break; + + case '[': + if (pattern[0] == ']' || !pattern[0]) + return PATTERN_INVALID_SET; + + while (*pattern != ']') { + if (*pattern == '\\' && !*++pattern) + return PATTERN_INVALID_ESCAPE; + + if (!*pattern) + return PATTERN_INVALID_SET; + + if (pattern[0] == '-' && pattern[1] == '-') + return PATTERN_INVALID_RANGE; + + pattern++; + } + break; + } + } + return B_OK; +} + + +/** Matches the string against the given wildcard pattern. + * Returns either MATCH_OK, or NO_MATCH when everything went fine, + * or values < 0 (see enum at the top of Query.cpp) if an error + * occurs + */ + +status_t +matchString(char *pattern,char *string) +{ + while (*pattern) { + // end of string == valid end of pattern? + if (!string[0]) { + while (pattern[0] == '*') + pattern++; + return !pattern[0] ? 
MATCH_OK : NO_MATCH; + } + + switch (*pattern++) { + case '?': + { + // match exactly one UTF-8 character; we are + // not interested in the result + utf8ToUnicode(&string); + break; + } + + case '*': + { + // compact pattern + while (true) { + if (pattern[0] == '?') { + if (!*++string) + return NO_MATCH; + } else if (pattern[0] != '*') + break; + + pattern++; + } + + // if the pattern is done, we have matched the string + if (!pattern[0]) + return MATCH_OK; + + while(true) { + // we have removed all occurences of '*' and '?' + if (pattern[0] == string[0] + || pattern[0] == '[' + || pattern[0] == '\\') { + status_t status = matchString(pattern,string); + if (status < B_OK || status == MATCH_OK) + return status; + } + + // we could be nice here and just jump to the next + // UTF-8 character - but we wouldn't gain that much + // and it'd be slower (since we're checking for + // equality before entering the recursion) + if (!*++string) + return NO_MATCH; + } + break; + } + + case '[': + { + bool invert = false; + if (pattern[0] == '^' || pattern[0] == '!') { + invert = true; + pattern++; + } + + if (!pattern[0] || pattern[0] == ']') + return MATCH_BAD_PATTERN; + + uint32 c = utf8ToUnicode(&string); + bool matched = false; + + while (pattern[0] != ']') { + if (!pattern[0]) + return MATCH_BAD_PATTERN; + + if (pattern[0] == '\\') + pattern++; + + uint32 first = utf8ToUnicode(&pattern); + + // Does this character match, or is this a range? 
+ if (first == c) { + matched = true; + break; + } else if (pattern[0] == '-' && pattern[1] != ']' && pattern[1]) { + pattern++; + + if (pattern[0] == '\\') { + pattern++; + if (!pattern[0]) + return MATCH_BAD_PATTERN; + } + uint32 last = utf8ToUnicode(&pattern); + + if (c >= first && c <= last) { + matched = true; + break; + } + } + } + + if (invert) + matched = !matched; + + if (matched) { + while (pattern[0] != ']') { + if (!pattern[0]) + return MATCH_BAD_PATTERN; + pattern++; + } + pattern++; + break; + } + return NO_MATCH; + } + + case '\\': + if (!pattern[0]) + return MATCH_BAD_PATTERN; + // supposed to fall through + default: + if (pattern[-1] != string[0]) + return NO_MATCH; + string++; + break; + } + } + + if (string[0]) + return NO_MATCH; + + return MATCH_OK; +} + + +// #pragma mark - + + +Equation::Equation(char **expr) + : Term(OP_EQUATION), + fAttribute(NULL), + fString(NULL), + fType(0), + fIsPattern(false) +{ + char *string = *expr; + char *start = string; + char *end = NULL; + + // Since the equation is the integral part of any query, we're just parsing + // the whole thing here. + // The whitespace at the start is already removed in Expression::ParseEquation() + + if (*start == '"' || *start == '\'') { + // string is quoted (start has to be on the beginning of a string) + if (ParseQuotedString(&start,&end) < B_OK) + return; + + // set string to a valid start of the equation symbol + string = end + 2; + skipWhitespace(&string); + if (*string != '=' && *string != '<' && *string != '>' && *string != '!') { + *expr = string; + return; + } + } else { + // search the (in)equation for the actual equation symbol (and for other operators + // in case the equation is malformed) + while (*string && *string != '=' && *string != '<' && *string != '>' && *string != '!' 
+ && *string != '&' && *string != '|') + string++; + + // get the attribute string (and trim whitespace), in case + // the string was not quoted + end = string - 1; + skipWhitespaceReverse(&end,start); + } + + // attribute string is empty (which is not allowed) + if (start > end) + return; + + // at this point, "start" points to the beginning of the string, "end" points + // to the last character of the string, and "string" points to the first + // character of the equation symbol + + // test for the right symbol (as this doesn't need any memory) + switch (*string) { + case '=': + fOp = OP_EQUAL; + break; + case '>': + fOp = *(string + 1) == '=' ? OP_GREATER_THAN_OR_EQUAL : OP_GREATER_THAN; + break; + case '<': + fOp = *(string + 1) == '=' ? OP_LESS_THAN_OR_EQUAL : OP_LESS_THAN; + break; + case '!': + if (*(string + 1) != '=') + return; + fOp = OP_UNEQUAL; + break; + + // any invalid characters will be rejected + default: + *expr = string; + return; + } + // lets change "start" to point to the first character after the symbol + if (*(string + 1) == '=') + string++; + string++; + skipWhitespace(&string); + + // allocate & copy the attribute string + + fAttribute = CopyString(start,end); + if (fAttribute == NULL) + return; + + start = string; + if (*start == '"' || *start == '\'') { + // string is quoted (start has to be on the beginning of a string) + if (ParseQuotedString(&start,&end) < B_OK) + return; + + string = end + 2; + skipWhitespace(&string); + } else { + while (*string && *string != '&' && *string != '|' && *string != ')') + string++; + + end = string - 1; + skipWhitespaceReverse(&end,start); + } + + // at this point, "start" will point to the first character of the value, + // "end" will point to its last character, and "start" to the first non- + // whitespace character after the value string + + fString = CopyString(start,end); + if (fString == NULL) + return; + + // patterns are only allowed for these operations (and strings) + if (fOp == OP_EQUAL || 
fOp == OP_UNEQUAL) { + fIsPattern = isPattern(fString); + if (fIsPattern && isValidPattern(fString) < B_OK) { + // we only want to have valid patterns; setting fString + // to NULL will cause InitCheck() to fail + free(fString); + fString = NULL; + } + } + + // The special time flag is set if the time values are shifted + // 64-bit values to reduce the number of duplicates. + // We have to be able to compare them against unshifted values + // later. The only index which needs this is the last_modified + // index, but we may want to open that feature for other indices, + // too one day. + fIsSpecialTime = !strcmp(fAttribute,"last_modified"); + + *expr = string; +} + + +Equation::~Equation() +{ + if (fAttribute != NULL) + free(fAttribute); + if (fString != NULL) + free(fString); +} + + +status_t +Equation::InitCheck() +{ + if (fAttribute == NULL + || fString == NULL + || fOp == OP_NONE) + return B_BAD_VALUE; + + return B_OK; +} + + +status_t +Equation::ParseQuotedString(char **_start, char **_end) +{ + char *start = *_start; + char quote = *start++; + char *end = start; + + for (;*end && *end != quote;end++) { + if (*end == '\\') + end++; + } + if (*end == '\0') + return B_BAD_VALUE; + + *_start = start; + *_end = end - 1; + + return B_OK; +} + + +char * +Equation::CopyString(char *start,char *end) +{ + // end points to the last character of the string - and the length + // also has to include the null-termination + int32 length = end + 2 - start; + // just to make sure; since that's the max. attribute name length and + // the max. string in an index, it make sense to have it that way + if (length > INODE_FILE_NAME_LENGTH || length <= 0) + return NULL; + + char *copy = (char *)malloc(length); + if (copy == NULL) + return NULL; + + memcpy(copy,start,length - 1); + copy[length - 1] = '\0'; + + return copy; +} + + +status_t +Equation::ConvertValue(type_code type) +{ + // Has the type already been converted? 
+ if (type == fType) + return B_OK; + + fType = type; + char *string = fString; + + switch (type) { + // B_MIME_STRING_TYPE is defined in Mime.h which I didn't want to include just for that + case 'MIMS': + type = B_STRING_TYPE; + // supposed to fall through + case B_STRING_TYPE: + strncpy(fValue.String,string,INODE_FILE_NAME_LENGTH); + fValue.String[INODE_FILE_NAME_LENGTH - 1] = '\0'; + fSize = strlen(fValue.String); + break; + case B_INT32_TYPE: + fValue.Int32 = strtol(string,&string,0); + fSize = sizeof(int32); + break; + case B_UINT32_TYPE: + fValue.Int32 = strtoul(string,&string,0); + fSize = sizeof(uint32); + break; + case B_INT64_TYPE: + fValue.Int64 = strtoll(string,&string,0); + fSize = sizeof(int64); + break; + case B_UINT64_TYPE: + fValue.Uint64 = strtoull(string,&string,0); + fSize = sizeof(uint64); + break; + case B_FLOAT_TYPE: + fValue.Float = strtod(string,&string); + fSize = sizeof(float); + break; + case B_DOUBLE_TYPE: + fValue.Double = strtod(string,&string); + fSize = sizeof(double); + break; + default: + FATAL(("query value conversion to 0x%lx requested!\n",type)); + // should we fail here or just do a safety int32 conversion? + return B_ERROR; + } + + // patterns are only allowed for string types + if (fType != B_STRING_TYPE && fIsPattern) + fIsPattern = false; + + return B_OK; +} + + +/** Returns true when the key matches the equation. You have to + * call ConvertValue() before this one. + */ + +bool +Equation::CompareTo(const uint8 *value,uint16 size) +{ + int32 compare; + + // fIsPattern is only true if it's a string type, and fOp OP_EQUAL, or OP_UNEQUAL + if (fIsPattern) { + // we have already validated the pattern, so we don't check for failing + // here - if something is broken, and matchString() returns an error, + // we just don't match + compare = matchString(fValue.String,(char *)value) == MATCH_OK ? 
0 : 1; + } else if (fIsSpecialTime) { + // the index is a shifted int64 index, but we have to match + // against an unshifted value (i.e. the last_modified index) + int64 timeValue = *(int64 *)value >> INODE_TIME_SHIFT; + compare = compareKeys(fType,&timeValue,sizeof(int64),&fValue.Int64,sizeof(int64)); + } else + compare = compareKeys(fType,value,size,Value(),fSize); + + switch (fOp) { + case OP_EQUAL: + return compare == 0; + case OP_UNEQUAL: + return compare != 0; + case OP_LESS_THAN: + return compare < 0; + case OP_LESS_THAN_OR_EQUAL: + return compare <= 0; + case OP_GREATER_THAN: + return compare > 0; + case OP_GREATER_THAN_OR_EQUAL: + return compare >= 0; + } + FATAL(("Unknown/Unsupported operation: %d\n",fOp)); + return false; +} + + +void +Equation::Complement() +{ + D(if (fOp <= OP_EQUATION || fOp > OP_LESS_THAN_OR_EQUAL) { + FATAL(("op out of range!")); + return; + }); + + int8 complementOp[] = {OP_UNEQUAL, OP_EQUAL, OP_LESS_THAN_OR_EQUAL, + OP_GREATER_THAN_OR_EQUAL, OP_LESS_THAN, OP_GREATER_THAN}; + fOp = complementOp[fOp - OP_EQUAL]; +} + + +status_t +Equation::MatchEmptyString() +{ + // there is no matching attribute, we will just bail out if we + // already know that our value is not of a string type. + // If not, it will be converted to a string - and then be compared with "". + // That's why we have to call ConvertValue() here - but it will be + // a cheap call for the next time + // Should we do this only for OP_UNEQUAL? + if (fType != 0 && fType != B_STRING_TYPE) + return NO_MATCH; + + status_t status = ConvertValue(B_STRING_TYPE); + if (status == B_OK) + status = CompareTo((const uint8 *)"",fSize) ? MATCH_OK : NO_MATCH; + + return status; +} + + +/** Matches the inode's attribute value with the equation. 
+ * Returns MATCH_OK if it matches, NO_MATCH if not, < 0 if something went wrong + */ + +status_t +Equation::Match(Inode *inode,const char *attributeName,int32 type,const uint8 *key,size_t size) +{ + // get a pointer to the attribute in question + union value value; + uint8 *buffer; + + // first, check if we are matching for a live query and use that value + if (attributeName != NULL && !strcmp(fAttribute,attributeName)) { + if (key == NULL) { + if (type == B_STRING_TYPE) + return MatchEmptyString(); + + return NO_MATCH; + } + buffer = const_cast(key); + } else if (!strcmp(fAttribute,"name")) { + // if not, check for "fake" attributes, "name", "size", "last_modified", + buffer = (uint8 *)inode->Name(); + if (buffer == NULL) + return B_ERROR; + + type = B_STRING_TYPE; + size = strlen((const char *)buffer); + } else if (!strcmp(fAttribute,"size")) { + buffer = (uint8 *)&inode->Node()->data.size; + type = B_INT64_TYPE; + } else if (!strcmp(fAttribute,"last_modified")) { + buffer = (uint8 *)&inode->Node()->last_modified_time; + type = B_INT64_TYPE; + } else { + // then for attributes in the small_data section, and finally for the + // real attributes + Inode *attribute; + + inode->SmallDataLock().Lock(); + small_data *smallData = inode->FindSmallData(fAttribute); + if (smallData != NULL) { + buffer = smallData->Data(); + type = smallData->type; + size = smallData->data_size; + inode->SmallDataLock().Unlock(); + } else { + // needed to unlock the small_data section as fast as possible + inode->SmallDataLock().Unlock(); + + if (inode->GetAttribute(fAttribute,&attribute) == B_OK) { + buffer = (uint8 *)&value; + type = attribute->Node()->type; + size = attribute->Size(); + + if (size > INODE_FILE_NAME_LENGTH) + size = INODE_FILE_NAME_LENGTH; + + if (attribute->ReadAt(0,buffer,&size) < B_OK) { + inode->ReleaseAttribute(attribute); + return B_IO_ERROR; + } + inode->ReleaseAttribute(attribute); + } else + return MatchEmptyString(); + } + } + // prepare own value for use, if 
it is possible to convert it + status_t status = ConvertValue(type); + if (status == B_OK) + status = CompareTo(buffer,size) ? MATCH_OK : NO_MATCH; + + RETURN_ERROR(status); +} + + +void +Equation::CalculateScore(Index &index) +{ + // As always, these values could be tuned and refined. + // And the code could also need some real world testing :-) + + // do we have to operate on a "foreign" index? + if (fOp == OP_UNEQUAL || index.SetTo(fAttribute) < B_OK) { + fScore = 0; + return; + } + + // if we have a pattern, how much does it help our search? + if (fIsPattern) + fScore = getFirstPatternSymbol(fString) << 3; + else { + // Score by operator + if (fOp == OP_EQUAL) + // higher than pattern="255 chars+*" + fScore = 2048; + else + // the pattern search is regarded cheaper when you have at + // least one character to set your index to + fScore = 5; + } + + // take index size into account (1024 is the current node size + // in our B+trees) + // 2048 * 2048 == 4194304 is the maximum score (for an empty + // tree, since the header + 1 node are already 2048 bytes) + fScore = fScore * ((2048 * 1024LL) / index.Node()->Size()); +} + + +status_t +Equation::PrepareQuery(Volume */*volume*/, Index &index, TreeIterator **iterator) +{ + type_code type; + status_t status = index.SetTo(fAttribute); + + // special case for OP_UNEQUAL - it will always operate through the whole index + // but we need the call to the original index to get the correct type + if (status < B_OK || fOp == OP_UNEQUAL) { + // Try to get an index that holds all files (name) + // Also sets the default type for all attributes without index + // to string. + type = status < B_OK ? 
B_STRING_TYPE : index.Type(); + + if (index.SetTo("name") < B_OK) + return B_ENTRY_NOT_FOUND; + + fHasIndex = false; + } else { + fHasIndex = true; + type = index.Type(); + } + + if (ConvertValue(type) < B_OK) + return B_BAD_VALUE; + + BPlusTree *tree; + if (index.Node()->GetTree(&tree) < B_OK) + return B_ERROR; + + *iterator = new TreeIterator(tree); + if (*iterator == NULL) + return B_NO_MEMORY; + + if ((fOp == OP_EQUAL || fOp == OP_GREATER_THAN || fOp == OP_GREATER_THAN_OR_EQUAL + || fIsPattern) + && fHasIndex) { + // set iterator to the exact position + + int32 keySize = index.KeySize(); + + // at this point, fIsPattern is only true if it's a string type, and fOp + // is either OP_EQUAL or OP_UNEQUAL + if (fIsPattern) { + // let's see if we can use the beginning of the key for positioning + // the iterator and adjust the key size; if not, just leave the + // iterator at the start and return success + keySize = getFirstPatternSymbol(fString); + if (keySize <= 0) + return B_OK; + } + + if (keySize == 0) { + if (fType == B_STRING_TYPE) + keySize = strlen(fValue.String); + else + RETURN_ERROR(B_ENTRY_NOT_FOUND); + } + + if (fIsSpecialTime) { + // we have to find the first matching shifted value + off_t value = fValue.Int64 << INODE_TIME_SHIFT; + status = (*iterator)->Find((uint8 *)&value,keySize); + if (status == B_ENTRY_NOT_FOUND) + return B_OK; + } else { + status = (*iterator)->Find(Value(),keySize); + if (fOp == OP_EQUAL && !fIsPattern) + return status; + else if (status == B_ENTRY_NOT_FOUND && (fIsPattern || fOp == OP_GREATER_THAN || fOp == OP_GREATER_THAN_OR_EQUAL)) + return B_OK; + } + + RETURN_ERROR(status); + } + + return B_OK; +} + + +status_t +Equation::GetNextMatching(Volume *volume, TreeIterator *iterator, + struct dirent *dirent, size_t bufferSize) +{ + while (true) { + union value indexValue; + uint16 keyLength; + uint16 duplicate; + off_t offset; + + status_t status = 
iterator->GetNextEntry(&indexValue,&keyLength,(uint16)sizeof(indexValue),&offset,&duplicate); + if (status < B_OK) + return status; + + // only compare against the index entry when this is the correct + // index for the equation + if (fHasIndex && duplicate < 2 && !CompareTo((uint8 *)&indexValue,keyLength)) { + // They aren't equal? let the operation decide what to do + // Since we always start at the beginning of the index (or the correct + // position), only some needs to be stopped if the entry doesn't fit. + if (fOp == OP_LESS_THAN + || fOp == OP_LESS_THAN_OR_EQUAL + || (fOp == OP_EQUAL && !fIsPattern)) + return B_ENTRY_NOT_FOUND; + + if (duplicate > 0) + iterator->SkipDuplicates(); + continue; + } + + Inode *inode; + if ((status = get_vnode(volume->ID(),offset,(void **)&inode)) != B_OK) { + REPORT_ERROR(status); + FATAL(("could not get inode %Ld in index \"%s\"!\n",offset,fAttribute)); + // try with next + continue; + } + + // check user permissions here - but which one?! + // we could filter out all those where we don't have + // read access... (we should check for every parent + // directory if the X_OK is allowed) + // Although it's quite expensive to open all parents, + // it's likely that the application that runs the + // query will do something similar (and we don't have + // to do it for root, either). + + // go up in the tree until a &&-operator is found, and check if the + // inode matches with the rest of the expression - we don't have to + // check ||-operators for that + Term *term = this; + status = MATCH_OK; + + if (!fHasIndex) + status = Match(inode); + + while (term != NULL && status == MATCH_OK) { + Operator *parent = (Operator *)term->Parent(); + if (parent == NULL) + break; + + if (parent->Op() == OP_AND) { + // choose the other child of the parent + Term *other = parent->Right(); + if (other == term) + other = parent->Left(); + + if (other == NULL) { + FATAL(("&&-operator has only one child... 
(parent = %p)\n",parent)); + break; + } + status = other->Match(inode); + if (status < 0) { + REPORT_ERROR(status); + status = NO_MATCH; + } + } + term = (Term *)parent; + } + + if (status == MATCH_OK) { + dirent->d_dev = volume->ID(); + dirent->d_ino = offset; + dirent->d_pdev = volume->ID(); + dirent->d_pino = volume->ToVnode(inode->Parent()); + strcpy(dirent->d_name,inode->Name()); + dirent->d_reclen = strlen(dirent->d_name); + } + + put_vnode(volume->ID(), inode->ID()); + + if (status == MATCH_OK) + return B_OK; + } + RETURN_ERROR(B_ERROR); +} + + +// #pragma mark - + + +Operator::Operator(Term *left, int8 op, Term *right) + : Term(op), + fLeft(left), + fRight(right) +{ + if (left) + left->SetParent(this); + if (right) + right->SetParent(this); +} + + +Operator::~Operator() +{ + delete fLeft; + delete fRight; +} + + +status_t +Operator::Match(Inode *inode,const char *attribute,int32 type,const uint8 *key,size_t size) +{ + if (fOp == OP_AND) { + status_t status = fLeft->Match(inode,attribute,type,key,size); + if (status != MATCH_OK) + return status; + + return fRight->Match(inode,attribute,type,key,size); + } else { + // choose the term with the better score for OP_OR + if (fRight->Score() > fLeft->Score()) { + status_t status = fRight->Match(inode,attribute,type,key,size); + if (status != NO_MATCH) + return status; + } + return fLeft->Match(inode,attribute,type,key,size); + } +} + + +void +Operator::Complement() +{ + if (fOp == OP_AND) + fOp = OP_OR; + else + fOp = OP_AND; + + fLeft->Complement(); + fRight->Complement(); +} + + +void +Operator::CalculateScore(Index &index) +{ + fLeft->CalculateScore(index); + fRight->CalculateScore(index); +} + + +int32 +Operator::Score() const +{ + if (fOp == OP_AND) { + // return the one with the better score + if (fRight->Score() > fLeft->Score()) + return fRight->Score(); + + return fLeft->Score(); + } + + // for OP_OR, be honest, and return the one with the worse score + if (fRight->Score() < fLeft->Score()) + return 
fRight->Score(); + + return fLeft->Score(); +} + + +status_t +Operator::InitCheck() +{ + if (fOp != OP_AND && fOp != OP_OR + || fLeft == NULL || fLeft->InitCheck() < B_OK + || fRight == NULL || fRight->InitCheck() < B_OK) + return B_ERROR; + + return B_OK; +} + + +#if 0 +Term * +Operator::Copy() const +{ + if (fEquation != NULL) { + Equation *equation = new Equation(*fEquation); + if (equation == NULL) + return NULL; + + Term *term = new Term(equation); + if (term == NULL) + delete equation; + + return term; + } + + Term *left = NULL, *right = NULL; + + if (fLeft != NULL && (left = fLeft->Copy()) == NULL) + return NULL; + if (fRight != NULL && (right = fRight->Copy()) == NULL) { + delete left; + return NULL; + } + + Term *term = new Term(left,fOp,right); + if (term == NULL) { + delete left; + delete right; + return NULL; + } + return term; +} +#endif + + +// #pragma mark - + +#ifdef DEBUG +void +Operator::PrintToStream() +{ + D(__out("( ")); + if (fLeft != NULL) + fLeft->PrintToStream(); + + char *op; + switch (fOp) { + case OP_OR: op = "OR"; break; + case OP_AND: op = "AND"; break; + default: op = "?"; break; + } + D(__out(" %s ",op)); + + if (fRight != NULL) + fRight->PrintToStream(); + + D(__out(" )")); +} + + +void +Equation::PrintToStream() +{ + char *symbol = "???"; + switch (fOp) { + case OP_EQUAL: symbol = "=="; break; + case OP_UNEQUAL: symbol = "!="; break; + case OP_GREATER_THAN: symbol = ">"; break; + case OP_GREATER_THAN_OR_EQUAL: symbol = ">="; break; + case OP_LESS_THAN: symbol = "<"; break; + case OP_LESS_THAN_OR_EQUAL: symbol = "<="; break; + } + D(__out("[\"%s\" %s \"%s\"]",fAttribute,symbol,fString)); +} + +#endif /* DEBUG */ + +// #pragma mark - + + +Expression::Expression(char *expr) +{ + if (expr == NULL) + return; + + fTerm = ParseOr(&expr); + if (fTerm != NULL && fTerm->InitCheck() < B_OK) { + FATAL(("Corrupt tree in expression!\n")); + delete fTerm; + fTerm = NULL; + } + D(if (fTerm != NULL) { + fTerm->PrintToStream(); + D(__out("\n")); + 
if (*expr != '\0') + PRINT(("Unexpected end of string: \"%s\"!\n",expr)); + }); + fPosition = expr; +} + + +Expression::~Expression() +{ + delete fTerm; +} + + +Term * +Expression::ParseEquation(char **expr) +{ + skipWhitespace(expr); + + bool not = false; + if (**expr == '!') { + skipWhitespace(expr, 1); + if (**expr != '(') + return NULL; + + not = true; + } + + if (**expr == ')') { + // shouldn't be handled here + return NULL; + } else if (**expr == '(') { + skipWhitespace(expr, 1); + + Term *term = ParseOr(expr); + + skipWhitespace(expr); + + if (**expr != ')') { + delete term; + return NULL; + } + + // If the term is negated, we just complement the tree, to get + // rid of the not, a.k.a. DeMorgan's Law. + if (not) + term->Complement(); + + skipWhitespace(expr, 1); + + return term; + } + + Equation *equation = new Equation(expr); + if (equation == NULL || equation->InitCheck() < B_OK) { + delete equation; + return NULL; + } + return equation; +} + + +Term * +Expression::ParseAnd(char **expr) +{ + Term *left = ParseEquation(expr); + if (left == NULL) + return NULL; + + while (IsOperator(expr,'&')) { + Term *right = ParseAnd(expr); + Term *newParent = NULL; + + if (right == NULL || (newParent = new Operator(left,OP_AND,right)) == NULL) { + delete left; + delete right; + + return NULL; + } + left = newParent; + } + + return left; +} + + +Term * +Expression::ParseOr(char **expr) +{ + Term *left = ParseAnd(expr); + if (left == NULL) + return NULL; + + while (IsOperator(expr,'|')) { + Term *right = ParseAnd(expr); + Term *newParent = NULL; + + if (right == NULL || (newParent = new Operator(left,OP_OR,right)) == NULL) { + delete left; + delete right; + + return NULL; + } + left = newParent; + } + + return left; +} + + +bool +Expression::IsOperator(char **expr, char op) +{ + char *string = *expr; + + if (*string == op && *(string + 1) == op) { + *expr += 2; + return true; + } + return false; +} + + +status_t +Expression::InitCheck() +{ + if (fTerm == NULL) + return 
B_BAD_VALUE; + + return B_OK; +} + + +// #pragma mark - + + +Query::Query(Volume *volume,Expression *expression) + : + fVolume(volume), + fExpression(expression), + fCurrent(NULL), + fIterator(NULL), + fIndex(volume), + fPort(-1) +{ + // if the expression has a valid root pointer, the whole tree has + // already passed the sanity check, so that we don't have to check + // every pointer + if (volume == NULL || expression == NULL || expression->Root() == NULL) + return; + + // create index on the stack and delete it afterwards + fExpression->Root()->CalculateScore(fIndex); + fIndex.Unset(); + + Stack stack; + stack.Push(fExpression->Root()); + + Term *term; + while (stack.Pop(&term)) { + if (term->Op() < OP_EQUATION) { + Operator *op = (Operator *)term; + + if (op->Op() == OP_OR) { + stack.Push(op->Left()); + stack.Push(op->Right()); + } else { + // For OP_AND, we can use the scoring system to decide which path to add + if (op->Right()->Score() > op->Left()->Score()) + stack.Push(op->Right()); + else + stack.Push(op->Left()); + } + } else if (term->Op() == OP_EQUATION || fStack.Push((Equation *)term) < B_OK) + FATAL(("Unknown term on stack or stack error")); + } + + volume->AddQuery(this); +} + + +Query::~Query() +{ + fVolume->RemoveQuery(this); +} + + +status_t +Query::GetNextEntry(struct dirent *dirent, size_t size) +{ + // If we don't have an equation to use yet/anymore, get a new one + // from the stack + while (true) { + if (fIterator == NULL) { + if (!fStack.Pop(&fCurrent) + || fCurrent == NULL + || fCurrent->PrepareQuery(fVolume,fIndex,&fIterator) < B_OK) + return B_ENTRY_NOT_FOUND; + } + if (fCurrent == NULL) + RETURN_ERROR(B_ERROR); + + status_t status = fCurrent->GetNextMatching(fVolume,fIterator,dirent,size); + if (status < B_OK) { + delete fIterator; + fIterator = NULL; + fCurrent = NULL; + } else { + // only return if we have another entry + return B_OK; + } + } +} + + +void +Query::SetLiveMode(port_id port,int32 token) +{ + fPort = port; + fToken = 
token; +} + + +void +Query::LiveUpdate(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength) +{ + if (fPort < 0 || fExpression == NULL || attribute == NULL) + return; + + // ToDo: check if the attribute is part of the query at all... + + status_t oldStatus = fExpression->Root()->Match(inode,attribute,type,oldKey,oldLength); + status_t newStatus = fExpression->Root()->Match(inode,attribute,type,newKey,newLength); + int32 op; + if (oldStatus == MATCH_OK && newStatus == MATCH_OK) { + // only send out a notification if the name was changed + if (oldKey == NULL || strcmp(attribute,"name")) + return; + + send_notification(fPort,fToken,B_QUERY_UPDATE,B_ENTRY_REMOVED,fVolume->ID(),0,fVolume->ToVnode(inode->Parent()),0,inode->ID(),(const char *)oldKey); + op = B_ENTRY_CREATED; + } else if (oldStatus != MATCH_OK && newStatus != MATCH_OK) { + // nothing has changed + return; + } else if (oldStatus == MATCH_OK && newStatus != MATCH_OK) + op = B_ENTRY_REMOVED; + else + op = B_ENTRY_CREATED; + + // if "value" is NULL, send_notification() crashes... + const char *value = (const char *)newKey; + if (type != B_STRING_TYPE || value == NULL) + value = ""; + + send_notification(fPort,fToken,B_QUERY_UPDATE,op,fVolume->ID(),0,fVolume->ToVnode(inode->Parent()),0,inode->ID(),value); +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Query.h b/src/add-ons/kernel/file_systems/bfs/Query.h new file mode 100644 index 0000000000..50f3779063 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Query.h @@ -0,0 +1,72 @@ +#ifndef QUERY_H +#define QUERY_H +/* Query - query parsing and evaluation +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + +#include "Index.h" +#include "Stack.h" +#include "Chain.h" + +class Volume; +class Term; +class Equation; +class TreeIterator; +class Query; + + +class Expression { + public: + Expression(char *expr); + ~Expression(); + + status_t InitCheck(); + const char *Position() const { return fPosition; } + Term *Root() const { return fTerm; } + + protected: + Term *ParseOr(char **expr); + Term *ParseAnd(char **expr); + Term *ParseEquation(char **expr); + + bool IsOperator(char **expr,char op); + + private: + char *fPosition; + Term *fTerm; +}; + +class Query { + public: + Query(Volume *volume,Expression *expression); + ~Query(); + + status_t GetNextEntry(struct dirent *,size_t size); + + void SetLiveMode(port_id port,int32 token); + void LiveUpdate(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength); + + Expression *GetExpression() const { return fExpression; } + + private: + Volume *fVolume; + Expression *fExpression; + Equation *fCurrent; + TreeIterator *fIterator; + Index fIndex; + Stack fStack; + + port_id fPort; + int32 fToken; + + private: + friend Chain; + Query *fNext; +}; + +#endif /* QUERY_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/Stack.h b/src/add-ons/kernel/file_systems/bfs/Stack.h new file mode 100644 index 0000000000..9793eb2491 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Stack.h @@ -0,0 +1,58 @@ +#ifndef STACK_H +#define STACK_H +/* Stack - a template stack class +** +** Copyright 2001 pinc Software. All Rights Reserved. +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + + +template class Stack { + public: + Stack() + : + fArray(NULL), + fUsed(0), + fMax(0) + { + } + + ~Stack() + { + if (fArray) + free(fArray); + } + + status_t Push(T value) + { + if (fUsed >= fMax) { + fMax += 16; + T *newArray = (T *)realloc(fArray,fMax * sizeof(T)); + if (newArray == NULL) + return B_NO_MEMORY; + + fArray = newArray; + } + fArray[fUsed++] = value; + return B_OK; + } + + bool Pop(T *value) + { + if (fUsed == 0) + return false; + + *value = fArray[--fUsed]; + return true; + } + + private: + T *fArray; + int32 fUsed; + int32 fMax; +}; + +#endif /* STACK_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/ToDo b/src/add-ons/kernel/file_systems/bfs/ToDo new file mode 100644 index 0000000000..9badbc44cc --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/ToDo @@ -0,0 +1,74 @@ +BFS - ToDo, June 5th, 2002 +----- + +BlockAllocator + + - the BlockAllocator is only slightly optimized and probably slow + - the first free and the largest range are currently not correctly maintained (only efficiency suffers - it does work correctly) + - the allocation policies will have to stand against some real world tests + - the access to the block bitmap is currently managed using a global lock + + +DataStream + + - growing/shrinking the stream size is not implemented for the double indirect range + - only files are trimmed back (in bfs_close()), but every inode has a preallocated stream... 
+ - merging of block_runs doesn't work between range/block boundaries + + +Queries + + - There shouldn't be any cases where you can speed up a query by reordering the query expression - test it + - Check permissions of the parent directories + - Add protection against crashing applications which had a query open - at least the original BeOS kernel does not free the cookie (which throws some memory away *and* prevents unmounting the disk) + + +Journal + + - Check if there are any standard and often-happening cases for a transaction to fail, and if so, start the transaction only when necessary + - if the system crashes between bfs_unlink() and bfs_remove_vnode(), the inode can be removed from the tree, but its memory is still allocated - this can happen if the inode is still in use by someone (and that's what the "chkbfs" utility is for, mainly). + - add delayed index updating (+ delete actions to solve the issue above) + - multiple log files, parallel transactions? + - variable sized log file + - as long as we have a fixed-sized log file, it should be possible to reserve space for a transaction to be able to decide if batching it is possible + + +BPlusTree + + - BPlusTree::Remove() could trigger CachedNode::Free() to go through the free nodes list and free all pages at the end of the data stream + - updating the TreeIterators doesn't work yet for duplicates (which may be a problem if a duplicate node will go away after a remove) + - BPlusTree::RemoveDuplicate() could spread the contents of a duplicate node with only a few entries to save some space (right now, only empty nodes are freed) + + +Inode + + - sometimes the inode's last modified time seems to be wrong, and is therefore not found in the b+tree (assuming that the b+tree is working correctly, which I do) + - Inode::FillGapWithZeros() is currently disabled; apart from being slow, it really shouldn't be executed while a transaction is running, because that stops all other threads from doing anything (which can be 
a long time for a 100 MB file) + + +Indices + + + +Attributes + + - bfs_write_attr() doesn't check if the attribute data may fit into the small_data region if there already is that attribute as an attribute file + + +Volume + + +kernel_interface + + - missing functions, maybe they are not all needed (but most of them are): bfs_rename_attr(), bfs_rename_index(), bfs_initialize(), bfs_setflags(), bfs_link() + - bfs_rename() currently doesn't respect any permissions + + +general stuff + + - There are also some comments with a leading "ToDo:" directly in the code which may not be mentioned here. + + +----- +Axel Dörfler +axeld@pinc-software.de diff --git a/src/add-ons/kernel/file_systems/bfs/Utility.cpp b/src/add-ons/kernel/file_systems/bfs/Utility.cpp new file mode 100644 index 0000000000..4e1d1b91e2 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Utility.cpp @@ -0,0 +1,138 @@ +/* Utility - some helper classes +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include "Utility.h" +#include "Debug.h" +#include "cpp.h" + +#include +#include + + +bool +sorted_array::FindInternal(off_t value, int32 &index) const +{ + int32 min = 0, max = count-1; + off_t cmp; + while (min <= max) { + index = (min + max) / 2; + + cmp = values[index] - value; + if (cmp < 0) + min = index + 1; + else if (cmp > 0) + max = index - 1; + else + return true; + } + return false; +} + + +void +sorted_array::Insert(off_t value) +{ + // if there are more than 8 values in this array, use a + // binary search, if not, just iterate linearly to find + // the insertion point + int32 i; + if (count > 8 ) { + if (!FindInternal(value,i) + && values[i] <= value) + i++; + } else { + for (i = 0;i < count; i++) + if (values[i] > value) + break; + } + + memmove(&values[i+1],&values[i],(count - i) * sizeof(off_t)); + values[i] = value; + count++; +} + + +bool +sorted_array::Remove(off_t value) +{ + int32 index = Find(value); + if (index == -1) + return false; + + memmove(&values[index],&values[index + 1],(count - index) * sizeof(off_t)); + count--; + + return true; +} + + +// #pragma mark - + + +BlockArray::BlockArray(int32 blockSize) + : + fArray(NULL), + fSize(0), + fBlockSize(blockSize) +{ +} + + +BlockArray::~BlockArray() +{ + if (fArray) + free(fArray); +} + + +int32 +BlockArray::Find(off_t value) +{ + if (fArray == NULL) + return -1; + + return fArray->Find(value); +} + + +status_t +BlockArray::Insert(off_t value) +{ + if (fArray == NULL || fArray->count + 1 > fMaxBlocks) { + sorted_array *array = (sorted_array *)realloc(fArray,fSize + fBlockSize); + if (array == NULL) + return B_NO_MEMORY; + + if (fArray == NULL) + array->count = 0; + + fArray = array; + fSize += fBlockSize; + fMaxBlocks = fSize / sizeof(off_t) - 1; + } + + fArray->Insert(value); + return B_OK; +} + + +status_t +BlockArray::Remove(off_t value) +{ + if (fArray == NULL) + return B_ENTRY_NOT_FOUND; + + return fArray->Remove(value) ? 
B_OK : B_ENTRY_NOT_FOUND; +} + + +void +BlockArray::MakeEmpty() +{ + fArray->count = 0; +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Utility.h b/src/add-ons/kernel/file_systems/bfs/Utility.h new file mode 100644 index 0000000000..f095545a16 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Utility.h @@ -0,0 +1,110 @@ +#ifndef UTILITY_H +#define UTILITY_H +/* Utility - some helper classes +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include + + +// Simple array, used for the duplicate handling in the B+Tree, +// and for the log entries. + +struct sorted_array { + public: + off_t count; + off_t values[0]; + + inline int32 Find(off_t value) const; + void Insert(off_t value); + bool Remove(off_t value); + + private: + bool FindInternal(off_t value,int32 &index) const; +}; + + +inline int32 +sorted_array::Find(off_t value) const +{ + int32 i; + return FindInternal(value,i) ? i : -1; +} + + +// The BlockArray reserves a multiple of "blockSize" and +// maintain array size for new entries. +// This is used for the in-memory log entries before they +// are written to disk. + +class BlockArray { + public: + BlockArray(int32 blockSize); + ~BlockArray(); + + int32 Find(off_t value); + status_t Insert(off_t value); + status_t Remove(off_t value); + + void MakeEmpty(); + + int32 CountItems() const { return fArray != NULL ? fArray->count : 0; } + int32 BlocksUsed() const { return fArray != NULL ? 
// Doubly linked list

// Base for the entries of a "list"; Node is the type of the entries.
template<class Node> struct node {
	Node *next,*prev;

	// Unlinks this entry from its neighbours; both are accessed without
	// a NULL check.
	void
	Remove()
	{
		prev->next = next;
		next->prev = prev;
	}

	// Returns the entry following this one, or NULL if the successor has
	// no successor itself.
	// NOTE(review): that looks like the successor then is the list's
	// end marker (see list::Add()) - confirm.
	Node *
	Next()
	{
		if (next && next->next != NULL)
			return next;

		return NULL;
	}
};

// List head; the three pointers are set up so that "head" initially points
// to the address of "tail", "tail" stays NULL, and "last" points to the
// address of "head".
template<class Node> struct list {
	Node *head,*tail,*last;

	list()
	{
		head = (Node *)&tail;
		tail = NULL;
		last = (Node *)&head;
	}

	// Appends "entry" behind the current last entry.
	void
	Add(Node *entry)
	{
		entry->next = (Node *)&tail;
		entry->prev = last;
		last->next = entry;
		last = entry;
	}
};
+*/ + + +#include "Debug.h" +#include "cpp.h" +#include "Volume.h" +#include "Journal.h" +#include "Inode.h" +#include "Query.h" + +#include + +#include +#include +#include +#include +#include + + +Volume::Volume(nspace_id id) + : + fID(id), + fBlockAllocator(this), + fLock("bfs volume"), + fDirtyCachedBlocks(0), + fUniqueID(0), + fFlags(0) +{ +} + + +Volume::~Volume() +{ +} + + +bool +Volume::IsValidSuperBlock() +{ + if (fSuperBlock.magic1 != (int32)SUPER_BLOCK_MAGIC1 + || fSuperBlock.magic2 != (int32)SUPER_BLOCK_MAGIC2 + || fSuperBlock.magic3 != (int32)SUPER_BLOCK_MAGIC3 + || (int32)fSuperBlock.block_size != fSuperBlock.inode_size + || fSuperBlock.fs_byte_order != SUPER_BLOCK_FS_LENDIAN + || (1UL << fSuperBlock.block_shift) != fSuperBlock.block_size + || fSuperBlock.num_ags < 1 + || fSuperBlock.ag_shift < 1 + || fSuperBlock.blocks_per_ag < 1 + || fSuperBlock.num_blocks < 10 + || fSuperBlock.num_ags != divide_roundup(fSuperBlock.num_blocks,1L << fSuperBlock.ag_shift)) + return false; + + return true; +} + + +void +Volume::Panic() +{ + FATAL(("we have to panic... switch to read-only mode!\n")); + fFlags |= VOLUME_READ_ONLY; +#ifdef USER + debugger("BFS panics!"); +#endif +} + + +status_t +Volume::Mount(const char *deviceName,uint32 flags) +{ + if (flags & B_MOUNT_READ_ONLY) + fFlags |= VOLUME_READ_ONLY; + + fDevice = open(deviceName,flags & B_MOUNT_READ_ONLY ? 
O_RDONLY : O_RDWR); + + // if we couldn't open the device, try read-only (don't rely on a specific error code) + if (fDevice < B_OK && (flags & B_MOUNT_READ_ONLY) == 0) { + fDevice = open(deviceName,O_RDONLY); + fFlags |= VOLUME_READ_ONLY; + } + + if (fDevice < B_OK) + RETURN_ERROR(fDevice); + + // check if it's a regular file, and if so, disable the cache for the + // underlaying file system + struct stat stat; + if (fstat(fDevice,&stat) < 0) + RETURN_ERROR(B_ERROR); + +//#ifndef USER + if (stat.st_mode & S_FILE && ioctl(fDevice,IOCTL_FILE_UNCACHED_IO,NULL) < 0) { + // mount read-only if the cache couldn't be disabled +# ifdef DEBUG + FATAL(("couldn't disable cache for image file - system may dead-lock!\n")); +# else + FATAL(("couldn't disable cache for image file!\n")); + Panic(); +# endif + } +//#endif + + // read the super block + char buffer[1024]; + if (read_pos(fDevice,0,buffer,sizeof(buffer)) != sizeof(buffer)) + return B_IO_ERROR; + + status_t status = B_OK; + + // Note: that does work only for x86, for PowerPC, the super block + // is located at offset 0! + memcpy(&fSuperBlock,buffer + 512,sizeof(disk_super_block)); + + if (IsValidSuperBlock()) { + // set the current log pointers, so that journaling will work correctly + fLogStart = fSuperBlock.log_start; + fLogEnd = fSuperBlock.log_end; + + if (init_cache_for_device(fDevice, NumBlocks()) == B_OK) { + fJournal = new Journal(this); + // replaying the log is the first thing we will do on this disk + if (fJournal && fJournal->InitCheck() == B_OK + && fBlockAllocator.Initialize() == B_OK) { + fRootNode = new Inode(this,ToVnode(Root())); + + if (fRootNode && fRootNode->InitCheck() == B_OK) { + if (new_vnode(fID,ToVnode(Root()),(void *)fRootNode) == B_OK) { + // try to get indices root dir + + // question: why doesn't get_vnode() work here?? + // answer: we have not yet backpropagated the pointer to the + // volume in bfs_mount(), so bfs_read_vnode() can't get it. + // But it's not needed to do that anyway. 
+ + fIndicesNode = new Inode(this,ToVnode(Indices())); + if (fIndicesNode == NULL + || fIndicesNode->InitCheck() < B_OK + || !fIndicesNode->IsDirectory()) { + INFORM(("bfs: volume doesn't have indices!\n")); + + if (fIndicesNode) { + // if this is the case, the index root node is gone bad, and + // BFS switch to read-only mode + fFlags |= VOLUME_READ_ONLY; + fIndicesNode = NULL; + } + } + + // all went fine + return B_OK; + } else + status = B_NO_MEMORY; + } else + status = B_BAD_VALUE; + + FATAL(("could not create root node: new_vnode() failed!\n")); + } else { + // ToDo: improve error reporting for a bad journal + status = B_NO_MEMORY; + FATAL(("could not initialize journal/block bitmap allocator!\n")); + } + + remove_cached_device_blocks(fDevice,NO_WRITES); + } else { + FATAL(("could not initialize cache!\n")); + status = B_IO_ERROR; + } + FATAL(("invalid super block!\n")); + } + else + status = B_BAD_VALUE; + + close(fDevice); + + return status; +} + + +status_t +Volume::Unmount() +{ + // This will also flush the log & all blocks to disk + delete fJournal; + fJournal = NULL; + + delete fIndicesNode; + + remove_cached_device_blocks(fDevice,ALLOW_WRITES); + close(fDevice); + + return B_OK; +} + + +status_t +Volume::Sync() +{ + return fJournal->FlushLogAndBlocks(); +} + + +status_t +Volume::IsValidBlockRun(block_run run) +{ + if (run.allocation_group < 0 || run.allocation_group > AllocationGroups() + || run.start > (1LL << AllocationGroupShift()) + || run.length == 0 + || (uint32)run.length + run.start > (1LL << AllocationGroupShift())) { + Panic(); + FATAL(("*** invalid run(%ld,%d,%d)\n",run.allocation_group,run.start,run.length)); + return B_BAD_DATA; + } + return B_OK; +} + + +block_run +Volume::ToBlockRun(off_t block) const +{ + block_run run; + run.allocation_group = block >> fSuperBlock.ag_shift; + run.start = block & ~((1LL << fSuperBlock.ag_shift) - 1); + run.length = 1; + return run; +} + + +status_t +Volume::CreateIndicesRoot(Transaction *transaction) +{ 
+ off_t id; + status_t status = Inode::Create(transaction,NULL,NULL, + S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700,0,0,&id); + if (status < B_OK) + RETURN_ERROR(status); + + fSuperBlock.indices = ToBlockRun(id); + WriteSuperBlock(); + + // The Vnode destructor will unlock the inode, but it has already been + // locked by the Inode::Create() call. + Vnode vnode(this,id); + return vnode.Get(&fIndicesNode); +} + + +status_t +Volume::AllocateForInode(Transaction *transaction, const Inode *parent, mode_t type, block_run &run) +{ + return fBlockAllocator.AllocateForInode(transaction,&parent->BlockRun(),type,run); +} + + +status_t +Volume::WriteSuperBlock() +{ + if (write_pos(fDevice,512,&fSuperBlock,sizeof(disk_super_block)) != sizeof(disk_super_block)) + return B_IO_ERROR; + + return B_OK; +} + + +void +Volume::UpdateLiveQueries(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength) +{ + if (fQueryLock.Lock() < B_OK) + return; + + Query *query = NULL; + while ((query = fQueries.Next(query)) != NULL) + query->LiveUpdate(inode,attribute,type,oldKey,oldLength,newKey,newLength); + + fQueryLock.Unlock(); +} + + +void +Volume::AddQuery(Query *query) +{ + if (fQueryLock.Lock() < B_OK) + return; + + fQueries.Add(query); + + fQueryLock.Unlock(); +} + + +void +Volume::RemoveQuery(Query *query) +{ + if (fQueryLock.Lock() < B_OK) + return; + + fQueries.Remove(query); + + fQueryLock.Unlock(); +} + diff --git a/src/add-ons/kernel/file_systems/bfs/Volume.h b/src/add-ons/kernel/file_systems/bfs/Volume.h new file mode 100644 index 0000000000..1c6a143e10 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/Volume.h @@ -0,0 +1,176 @@ +#ifndef VOLUME_H +#define VOLUME_H +/* Volume - BFS super block, mounting, etc. +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include + +extern "C" { + #ifndef _IMPEXP_KERNEL + # define _IMPEXP_KERNEL + #endif + #include "fsproto.h" + #include "lock.h" + #include "cache.h" +} + +#include "bfs.h" +#include "BlockAllocator.h" +#include "Chain.h" + +class Journal; +class Inode; +class Query; + +enum volume_flags { + VOLUME_READ_ONLY = 0x0001 +}; + + +class Volume { + public: + Volume(nspace_id id); + ~Volume(); + + status_t Mount(const char *device,uint32 flags); + status_t Unmount(); + + bool IsValidSuperBlock(); + bool IsReadOnly() const { return fFlags & VOLUME_READ_ONLY; } + void Panic(); + Benaphore &Lock() { return fLock; } + + block_run Root() const { return fSuperBlock.root_dir; } + Inode *RootNode() const { return fRootNode; } + block_run Indices() const { return fSuperBlock.indices; } + Inode *IndicesNode() const { return fIndicesNode; } + block_run Log() const { return fSuperBlock.log_blocks; } + vint32 &LogStart() { return fLogStart; } + vint32 &LogEnd() { return fLogEnd; } + int Device() const { return fDevice; } + + nspace_id ID() const { return fID; } + const char *Name() const { return fSuperBlock.name; } + + off_t NumBlocks() const { return fSuperBlock.num_blocks; } + off_t UsedBlocks() const { return fSuperBlock.used_blocks; } + off_t FreeBlocks() const { return fSuperBlock.num_blocks - fSuperBlock.used_blocks; } + + uint32 BlockSize() const { return fSuperBlock.block_size; } + uint32 BlockShift() const { return fSuperBlock.block_shift; } + uint32 InodeSize() const { return fSuperBlock.inode_size; } + uint32 AllocationGroups() const { return fSuperBlock.num_ags; } + uint32 AllocationGroupShift() const { return fSuperBlock.ag_shift; } + disk_super_block &SuperBlock() { return fSuperBlock; } + + off_t ToOffset(block_run run) const { return ToBlock(run) << fSuperBlock.block_shift; } + off_t ToBlock(block_run run) const { return ((((off_t)run.allocation_group) << fSuperBlock.ag_shift) | (off_t)run.start); } + block_run ToBlockRun(off_t block) const; + status_t 
IsValidBlockRun(block_run run); + + off_t ToVnode(block_run run) const { return ToBlock(run); } + off_t ToVnode(off_t block) const { return block; } + off_t VnodeToBlock(vnode_id id) const { return (off_t)id; } + + status_t CreateIndicesRoot(Transaction *transaction); + + status_t AllocateForInode(Transaction *transaction,const Inode *parent,mode_t type,block_run &run); + status_t AllocateForInode(Transaction *transaction,const block_run *parent,mode_t type,block_run &run); + status_t Allocate(Transaction *transaction,const Inode *inode,off_t numBlocks,block_run &run,uint16 minimum = 1); + status_t Free(Transaction *transaction,block_run &run); + +#ifdef DEBUG + BlockAllocator &Allocator() { return fBlockAllocator; } +#endif + + status_t Sync(); + Journal *GetJournal(off_t /*refBlock*/) const { return fJournal; } + + status_t WriteSuperBlock(); + status_t WriteBlocks(off_t blockNumber,const uint8 *block,uint32 numBlocks); + void WriteCachedBlocksIfNecessary(); + status_t FlushDevice(); + + void UpdateLiveQueries(Inode *inode,const char *attribute,int32 type,const uint8 *oldKey,size_t oldLength,const uint8 *newKey,size_t newLength); + void AddQuery(Query *query); + void RemoveQuery(Query *query); + + uint32 GetUniqueID() { return atomic_add(&fUniqueID,1); } + + + protected: + nspace_id fID; + int fDevice; + disk_super_block fSuperBlock; + BlockAllocator fBlockAllocator; + Benaphore fLock; + Journal *fJournal; + vint32 fLogStart,fLogEnd; + + Inode *fRootNode; + Inode *fIndicesNode; + + vint32 fDirtyCachedBlocks; + + SimpleLock fQueryLock; + Chain fQueries; + + int32 fUniqueID; + uint32 fFlags; +}; + +// inline functions + +inline status_t +Volume::AllocateForInode(Transaction *transaction, const block_run *parent, mode_t type, block_run &run) +{ + return fBlockAllocator.AllocateForInode(transaction,parent,type,run); +} + + +inline status_t +Volume::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks, block_run &run, uint16 minimum) +{ + return 
// Triggers a cache flush for the device once more than 128 blocks have been
// counted as dirty (WriteBlocks() adds the number of written blocks to
// fDirtyCachedBlocks), and lowers the counter by 64 afterwards.
inline void
Volume::WriteCachedBlocksIfNecessary()
{
	// the specific values are only valid for the current BeOS cache
	if (fDirtyCachedBlocks > 128) {
		force_cache_flush(fDevice,false);
		atomic_add(&fDirtyCachedBlocks,-64);
	}
}
// Addresses a run of blocks on disk: "start" is the block offset within the
// allocation group "allocation_group" (Volume::ToBlock() combines the two as
// (allocation_group << ag_shift) | start).
// NOTE(review): "length" presumably is the number of consecutive blocks of
// the run - Volume::IsValidBlockRun() checks start + length against the
// size of an allocation group; confirm.
struct block_run
{
	int32 allocation_group;
	uint16 start;
	uint16 length;

	inline bool operator==(const block_run &run) const;
	inline bool operator!=(const block_run &run) const;
	// true if all three fields are zero
	inline bool IsZero();
	inline void SetTo(int32 group,uint16 start,uint16 length = 1);

	// returns a block_run built from the given values
	inline static block_run Run(int32 group,uint16 start,uint16 length = 1);
};
+#define FILE_NAME_NAME 0x13 +#define FILE_NAME_NAME_LENGTH 1 + +//************************************** + +#define SHORT_SYMLINK_NAME_LENGTH 144 // length incl. terminating '\0' + +struct bfs_inode +{ + int32 magic1; + inode_addr inode_num; + int32 uid; + int32 gid; + int32 mode; // see sys/stat.h + int32 flags; + bigtime_t create_time; + bigtime_t last_modified_time; + inode_addr parent; + inode_addr attributes; + uint32 type; // attribute type + + int32 inode_size; + uint32 etc; // for in-memory structures (unused in OpenBeOS' fs) + + union { + data_stream data; + char short_symlink[SHORT_SYMLINK_NAME_LENGTH]; + }; + int32 pad[4]; + small_data small_data_start[0]; +}; + +#define INODE_MAGIC1 0x3bbe0ad9 +#define INODE_TIME_SHIFT 16 +#define INODE_TIME_MASK 0xffff +#define INODE_FILE_NAME_LENGTH 256 + +enum inode_flags +{ + INODE_IN_USE = 0x00000001, // always set + INODE_ATTR_INODE = 0x00000004, + INODE_LOGGED = 0x00000008, // log changes to the data stream + INODE_DELETED = 0x00000010, + INODE_EMPTY = 0x00000020, + INODE_LONG_SYMLINK = 0x00000040, // symlink in data stream + + INODE_PERMANENT_FLAGS = 0x0000ffff, + + INODE_NO_CACHE = 0x00010000, + INODE_WAS_WRITTEN = 0x00020000, + INODE_NO_TRANSACTION = 0x00040000, +}; + +//************************************** + +struct file_cookie { + bigtime_t last_notification; + off_t last_size; + int open_mode; +}; + +// notify every second if the file size has changed +#define INODE_NOTIFICATION_INTERVAL 1000000LL + +//************************************** + + +inline int32 +divide_roundup(int32 num,int32 divisor) +{ + return (num + divisor - 1) / divisor; +} + +inline int64 +divide_roundup(int64 num,int32 divisor) +{ + return (num + divisor - 1) / divisor; +} + +inline int +get_shift(uint64 i) +{ + int c; + c = 0; + while (i > 1) { + i >>= 1; + c++; + } + return c; +} + +inline int32 +round_up(uint32 data) +{ + // rounds up to the next off_t boundary + return (data + sizeof(off_t) - 1) & ~(sizeof(off_t) - 1); +} + + 
+/************************ block_run inline functions ************************/ +// #pragma mark - + + +inline bool +block_run::operator==(const block_run &run) const +{ + return allocation_group == run.allocation_group + && start == run.start + && length == run.length; +} + + +inline bool +block_run::operator!=(const block_run &run) const +{ + return allocation_group != run.allocation_group + || start != run.start + || length != run.length; +} + + +inline bool +block_run::IsZero() +{ + return allocation_group == 0 && start == 0 && length == 0; +} + + +inline void +block_run::SetTo(int32 _group,uint16 _start,uint16 _length) +{ + allocation_group = _group; + start = _start; + length = _length; +} + + +inline block_run +block_run::Run(int32 group, uint16 start, uint16 length) +{ + block_run run; + run.allocation_group = group; + run.start = start; + run.length = length; + return run; +} + + +/************************ small_data inline functions ************************/ +// #pragma mark - + + +inline char * +small_data::Name() +{ + return name; +} + + +inline uint8 * +small_data::Data() +{ + return (uint8 *)name + name_size + 3; +} + + +inline uint32 +small_data::Size() +{ + return sizeof(small_data) + name_size + 3 + data_size + 1; +} + + +inline small_data * +small_data::Next() +{ + return (small_data *)((uint8 *)this + Size()); +} + + +inline bool +small_data::IsLast(bfs_inode *inode) +{ + // we need to check the location first, because if name_size is already beyond + // the block, we would touch invalid memory (although that can't cause wrong + // results) + return (uint32)this > (uint32)inode + inode->inode_size - sizeof(small_data) || name_size == 0; +} + +#endif /* BFS_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/cache.h b/src/add-ons/kernel/file_systems/bfs/cache.h new file mode 100644 index 0000000000..a0e913840e --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/cache.h @@ -0,0 +1,108 @@ +/* + Copyright 1999-2001, Be Incorporated. 
All Rights Reserved. + This file may be used under the terms of the Be Sample Code License. +*/ + +#ifndef _CACHE_H_ +#define _CACHE_H_ + +#include + +typedef struct hash_ent { + int dev; + off_t bnum; + off_t hash_val; + void *data; + struct hash_ent *next; +} hash_ent; + + +typedef struct hash_table { + hash_ent **table; + int max; + int mask; /* == max - 1 */ + int num_elements; +} hash_table; + + +#define HT_DEFAULT_MAX 128 + + +typedef struct cache_ent { + int dev; + off_t block_num; + int bsize; + volatile int flags; + + void *data; + void *clone; /* copy of data by set_block_info() */ + int lock; + + void (*func)(off_t bnum, size_t num_blocks, void *arg); + off_t logged_bnum; + void *arg; + + struct cache_ent *next, /* points toward mru end of list */ + *prev; /* points toward lru end of list */ + +} cache_ent; + +#define CE_NORMAL 0x0000 /* a nice clean pristine page */ +#define CE_DIRTY 0x0002 /* needs to be written to disk */ +#define CE_BUSY 0x0004 /* this block has i/o happening, don't touch it */ + + +typedef struct cache_ent_list { + cache_ent *lru; /* tail of the list */ + cache_ent *mru; /* head of the list */ +} cache_ent_list; + + +typedef struct block_cache { + struct lock lock; + int flags; + int cur_blocks; + int max_blocks; + hash_table ht; + + cache_ent_list normal, /* list of "normal" blocks (clean & dirty) */ + locked; /* list of clean and locked blocks */ +} block_cache; + +#if 0 /* XXXdbg -- need to deal with write through caches */ +#define DC_WRITE_THROUGH 0x0001 /* cache is write-through (for floppies) */ +#endif + +#define ALLOW_WRITES 1 +#define NO_WRITES 0 + +extern _IMPEXP_KERNEL int init_block_cache(int max_blocks, int flags); +extern _IMPEXP_KERNEL void shutdown_block_cache(void); + +extern _IMPEXP_KERNEL void force_cache_flush(int dev, int prefer_log_blocks); +extern _IMPEXP_KERNEL int flush_blocks(int dev, off_t bnum, int nblocks); +extern _IMPEXP_KERNEL int flush_device(int dev, int warn_locked); + +extern _IMPEXP_KERNEL int 
init_cache_for_device(int fd, off_t max_blocks); +extern _IMPEXP_KERNEL int remove_cached_device_blocks(int dev, int allow_write); + +extern _IMPEXP_KERNEL void *get_block(int dev, off_t bnum, int bsize); +extern _IMPEXP_KERNEL void *get_empty_block(int dev, off_t bnum, int bsize); +extern _IMPEXP_KERNEL int release_block(int dev, off_t bnum); +extern _IMPEXP_KERNEL int mark_blocks_dirty(int dev, off_t bnum, int nblocks); + + +extern _IMPEXP_KERNEL int cached_read(int dev, off_t bnum, void *data, off_t num_blocks, int bsize); +extern _IMPEXP_KERNEL int cached_write(int dev, off_t bnum, const void *data, + off_t num_blocks, int bsize); +extern _IMPEXP_KERNEL int cached_write_locked(int dev, off_t bnum, const void *data, + off_t num_blocks, int bsize); +extern _IMPEXP_KERNEL int set_blocks_info(int dev, off_t *blocks, int nblocks, + void (*func)(off_t bnum, size_t nblocks, void *arg), + void *arg); + + +extern _IMPEXP_KERNEL size_t read_phys_blocks (int fd, off_t bnum, void *data, uint num_blocks, int bsize); +extern _IMPEXP_KERNEL size_t write_phys_blocks(int fd, off_t bnum, void *data, uint num_blocks, int bsize); + +#endif /* _CACHE_H_ */ diff --git a/src/add-ons/kernel/file_systems/bfs/cpp.cpp b/src/add-ons/kernel/file_systems/bfs/cpp.cpp new file mode 100644 index 0000000000..47d5ca110b --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/cpp.cpp @@ -0,0 +1,17 @@ +/* cpp - C++ in the kernel +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include "cpp.h" + + +nothrow_t _dontthrow; + +extern "C" void __pure_virtual() +{ + //printf("pure virtual function call"); +} + diff --git a/src/add-ons/kernel/file_systems/bfs/cpp.h b/src/add-ons/kernel/file_systems/bfs/cpp.h new file mode 100644 index 0000000000..c9fd65fdf3 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/cpp.h @@ -0,0 +1,52 @@ +#ifndef CPP_H +#define CPP_H +/* cpp - C++ in the kernel +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. +*/ + + +#include +#include + + +// Oh no! C++ in the kernel! Are you nuts? +// +// - no exceptions +// - (almost) no virtuals (well, the Query code now uses them) +// - it's basically only the C++ syntax, and type checking +// - since one tend to encapsulate everything in classes, it has a slightly +// higher memory overhead +// - nicer code +// - easier to maintain + + +inline void *operator new(size_t size, const nothrow_t&) throw() +{ + return malloc(size); +} + +inline void *operator new[](size_t size, const nothrow_t&) throw() +{ + return malloc(size); +} + +inline void operator delete(void *ptr) +{ + free(ptr); +} + +inline void operator delete[](void *ptr) +{ + free(ptr); +} + +// now we're using virtuals +extern "C" void __pure_virtual(); + +extern nothrow_t _dontthrow; +#define new new (_dontthrow) + + +#endif /* CPP_H */ diff --git a/src/add-ons/kernel/file_systems/bfs/fsproto.h b/src/add-ons/kernel/file_systems/bfs/fsproto.h new file mode 100644 index 0000000000..1fc15ddc7c --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/fsproto.h @@ -0,0 +1,249 @@ +/* + Copyright 1999-2001, Be Incorporated. All Rights Reserved. + This file may be used under the terms of the Be Sample Code License. 
+*/ + +#ifndef _FSPROTO_H +#define _FSPROTO_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +typedef dev_t nspace_id; +typedef ino_t vnode_id; + +/* + * PUBLIC PART OF THE FILE SYSTEM PROTOCOL + */ + +#define WSTAT_MODE 0x0001 +#define WSTAT_UID 0x0002 +#define WSTAT_GID 0x0004 +#define WSTAT_SIZE 0x0008 +#define WSTAT_ATIME 0x0010 +#define WSTAT_MTIME 0x0020 +#define WSTAT_CRTIME 0x0040 + +#define WFSSTAT_NAME 0x0001 + +#define B_ENTRY_CREATED 1 +#define B_ENTRY_REMOVED 2 +#define B_ENTRY_MOVED 3 +#define B_STAT_CHANGED 4 +#define B_ATTR_CHANGED 5 +#define B_DEVICE_MOUNTED 6 +#define B_DEVICE_UNMOUNTED 7 + +#define B_STOP_WATCHING 0x0000 +#define B_WATCH_NAME 0x0001 +#define B_WATCH_STAT 0x0002 +#define B_WATCH_ATTR 0x0004 +#define B_WATCH_DIRECTORY 0x0008 + +#define SELECT_READ 1 +#define SELECT_WRITE 2 +#define SELECT_EXCEPTION 3 + +// missing ioctl() call added +#define IOCTL_FILE_UNCACHED_IO 10000 + +#define B_CUR_FS_API_VERSION 2 + +struct attr_info; +struct index_info; + +typedef int op_read_vnode(void *ns, vnode_id vnid, char r, void **node); +typedef int op_write_vnode(void *ns, void *node, char r); +typedef int op_remove_vnode(void *ns, void *node, char r); +typedef int op_secure_vnode(void *ns, void *node); + +typedef int op_walk(void *ns, void *base, const char *file, char **newpath, + vnode_id *vnid); + +typedef int op_access(void *ns, void *node, int mode); + +typedef int op_create(void *ns, void *dir, const char *name, + int omode, int perms, vnode_id *vnid, void **cookie); +typedef int op_mkdir(void *ns, void *dir, const char *name, int perms); +typedef int op_symlink(void *ns, void *dir, const char *name, + const char *path); +typedef int op_link(void *ns, void *dir, const char *name, void *node); + +typedef int op_rename(void *ns, void *olddir, const char *oldname, + void *newdir, const char *newname); +typedef int op_unlink(void *ns, void *dir, const char *name); +typedef int 
op_rmdir(void *ns, void *dir, const char *name); + +typedef int op_readlink(void *ns, void *node, char *buf, size_t *bufsize); + +typedef int op_opendir(void *ns, void *node, void **cookie); +typedef int op_closedir(void *ns, void *node, void *cookie); +typedef int op_rewinddir(void *ns, void *node, void *cookie); +typedef int op_readdir(void *ns, void *node, void *cookie, long *num, + struct dirent *buf, size_t bufsize); + +typedef int op_open(void *ns, void *node, int omode, void **cookie); +typedef int op_close(void *ns, void *node, void *cookie); +typedef int op_free_cookie(void *ns, void *node, void *cookie); +typedef int op_read(void *ns, void *node, void *cookie, off_t pos, void *buf, + size_t *len); +typedef int op_write(void *ns, void *node, void *cookie, off_t pos, + const void *buf, size_t *len); +typedef int op_readv(void *ns, void *node, void *cookie, off_t pos, const iovec *vec, + size_t count, size_t *len); +typedef int op_writev(void *ns, void *node, void *cookie, off_t pos, const iovec *vec, + size_t count, size_t *len); +typedef int op_ioctl(void *ns, void *node, void *cookie, int cmd, void *buf, + size_t len); +typedef int op_setflags(void *ns, void *node, void *cookie, int flags); + +typedef int op_rstat(void *ns, void *node, struct stat *); +typedef int op_wstat(void *ns, void *node, struct stat *, long mask); +typedef int op_fsync(void *ns, void *node); + +typedef int op_select(void *ns, void *node, void *cookie, uint8 event, + uint32 ref, selectsync *sync); +typedef int op_deselect(void *ns, void *node, void *cookie, uint8 event, + selectsync *sync); + +typedef int op_initialize(const char *devname, void *parms, size_t len); +typedef int op_mount(nspace_id nsid, const char *devname, ulong flags, + void *parms, size_t len, void **data, vnode_id *vnid); +typedef int op_unmount(void *ns); +typedef int op_sync(void *ns); +typedef int op_rfsstat(void *ns, struct fs_info *); +typedef int op_wfsstat(void *ns, struct fs_info *, long mask); + + 
+typedef int op_open_attrdir(void *ns, void *node, void **cookie); +typedef int op_close_attrdir(void *ns, void *node, void *cookie); +typedef int op_rewind_attrdir(void *ns, void *node, void *cookie); +typedef int op_read_attrdir(void *ns, void *node, void *cookie, long *num, + struct dirent *buf, size_t bufsize); +typedef int op_remove_attr(void *ns, void *node, const char *name); +typedef int op_rename_attr(void *ns, void *node, const char *oldname, + const char *newname); +typedef int op_stat_attr(void *ns, void *node, const char *name, + struct attr_info *buf); + +typedef int op_write_attr(void *ns, void *node, const char *name, int type, + const void *buf, size_t *len, off_t pos); +typedef int op_read_attr(void *ns, void *node, const char *name, int type, + void *buf, size_t *len, off_t pos); + +typedef int op_open_indexdir(void *ns, void **cookie); +typedef int op_close_indexdir(void *ns, void *cookie); +typedef int op_rewind_indexdir(void *ns, void *cookie); +typedef int op_read_indexdir(void *ns, void *cookie, long *num, + struct dirent *buf, size_t bufsize); +typedef int op_create_index(void *ns, const char *name, int type, int flags); +typedef int op_remove_index(void *ns, const char *name); +typedef int op_rename_index(void *ns, const char *oldname, + const char *newname); +typedef int op_stat_index(void *ns, const char *name, struct index_info *buf); + +typedef int op_open_query(void *ns, const char *query, ulong flags, + port_id port, long token, void **cookie); +typedef int op_close_query(void *ns, void *cookie); +typedef int op_read_query(void *ns, void *cookie, long *num, + struct dirent *buf, size_t bufsize); + +typedef struct vnode_ops { + op_read_vnode (*read_vnode); + op_write_vnode (*write_vnode); + op_remove_vnode (*remove_vnode); + op_secure_vnode (*secure_vnode); + op_walk (*walk); + op_access (*access); + op_create (*create); + op_mkdir (*mkdir); + op_symlink (*symlink); + op_link (*link); + op_rename (*rename); + op_unlink (*unlink); + 
op_rmdir (*rmdir); + op_readlink (*readlink); + op_opendir (*opendir); + op_closedir (*closedir); + op_free_cookie (*free_dircookie); + op_rewinddir (*rewinddir); + op_readdir (*readdir); + op_open (*open); + op_close (*close); + op_free_cookie (*free_cookie); + op_read (*read); + op_write (*write); + op_readv (*readv); + op_writev (*writev); + op_ioctl (*ioctl); + op_setflags (*setflags); + op_rstat (*rstat); + op_wstat (*wstat); + op_fsync (*fsync); + op_initialize (*initialize); + op_mount (*mount); + op_unmount (*unmount); + op_sync (*sync); + op_rfsstat (*rfsstat); + op_wfsstat (*wfsstat); + op_select (*select); + op_deselect (*deselect); + op_open_indexdir (*open_indexdir); + op_close_indexdir (*close_indexdir); + op_free_cookie (*free_indexdircookie); + op_rewind_indexdir (*rewind_indexdir); + op_read_indexdir (*read_indexdir); + op_create_index (*create_index); + op_remove_index (*remove_index); + op_rename_index (*rename_index); + op_stat_index (*stat_index); + op_open_attrdir (*open_attrdir); + op_close_attrdir (*close_attrdir); + op_free_cookie (*free_attrdircookie); + op_rewind_attrdir (*rewind_attrdir); + op_read_attrdir (*read_attrdir); + op_write_attr (*write_attr); + op_read_attr (*read_attr); + op_remove_attr (*remove_attr); + op_rename_attr (*rename_attr); + op_stat_attr (*stat_attr); + op_open_query (*open_query); + op_close_query (*close_query); + op_free_cookie (*free_querycookie); + op_read_query (*read_query); +} vnode_ops; + +extern _IMPEXP_KERNEL int new_path(const char *path, char **copy); +extern _IMPEXP_KERNEL void free_path(char *p); + +extern _IMPEXP_KERNEL int notify_listener(int op, nspace_id nsid, + vnode_id vnida, vnode_id vnidb, + vnode_id vnidc, const char *name); +extern _IMPEXP_KERNEL void notify_select_event(selectsync *sync, uint32 ref); +extern _IMPEXP_KERNEL int send_notification(port_id port, long token, + ulong what, long op, nspace_id nsida, + nspace_id nsidb, vnode_id vnida, + vnode_id vnidb, vnode_id vnidc, + const 
char *name); +extern _IMPEXP_KERNEL int get_vnode(nspace_id nsid, vnode_id vnid, void **data); +extern _IMPEXP_KERNEL int put_vnode(nspace_id nsid, vnode_id vnid); +extern _IMPEXP_KERNEL int new_vnode(nspace_id nsid, vnode_id vnid, void *data); +extern _IMPEXP_KERNEL int remove_vnode(nspace_id nsid, vnode_id vnid); +extern _IMPEXP_KERNEL int unremove_vnode(nspace_id nsid, vnode_id vnid); +extern _IMPEXP_KERNEL int is_vnode_removed(nspace_id nsid, vnode_id vnid); + + +extern _EXPORT vnode_ops fs_entry; +extern _EXPORT int32 api_version; + +#endif diff --git a/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp b/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp new file mode 100644 index 0000000000..5dcaab7628 --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/kernel_interface.cpp @@ -0,0 +1,1880 @@ +/* kernel_interface - file system interface to BeOS' vnode layer +** +** Initial version by Axel Dörfler, axeld@pinc-software.de +** This file may be used under the terms of the OpenBeOS License. 
+*/ + + +#include "Debug.h" +#include "cpp.h" +#include "Volume.h" +#include "Inode.h" +#include "Index.h" +#include "BPlusTree.h" +#include "Query.h" + +#include +#include + +// BeOS vnode layer stuff +#include +#ifndef _IMPEXP_KERNEL +# define _IMPEXP_KERNEL +#endif + +extern "C" { + #include + #include + #include +} +#include +#include + + +#ifdef USER +# define dprintf printf +#endif + + +extern "C" { + static int bfs_mount(nspace_id nsid, const char *device, ulong flags, + void *parms, size_t len, void **data, vnode_id *vnid); + static int bfs_unmount(void *_ns); + static int bfs_read_fs_stat(void *_ns, struct fs_info *); + static int bfs_write_fs_stat(void *ns, struct fs_info *, long mode); + static int bfs_initialize(const char *devname, void *parms, size_t len); + + static int bfs_sync(void *ns); + + static int bfs_read_vnode(void *_ns, vnode_id vnid, char r, void **node); + static int bfs_release_vnode(void *_ns, void *_node, char r); + static int bfs_remove_vnode(void *ns, void *node, char r); + + static int bfs_walk(void *_ns, void *_base, const char *file, + char **newpath, vnode_id *vnid); + + static int bfs_ioctl(void *ns, void *node, void *cookie, int cmd, void *buf,size_t len); + static int bfs_setflags(void *ns, void *node, void *cookie, int flags); + + static int bfs_select(void *ns, void *node, void *cookie, uint8 event, + uint32 ref, selectsync *sync); + static int bfs_deselect(void *ns, void *node, void *cookie, uint8 event, + selectsync *sync); + static int bfs_fsync(void *ns,void *node); + + static int bfs_create(void *ns, void *dir, const char *name, + int perms, int omode, vnode_id *vnid, void **cookie); + static int bfs_symlink(void *ns, void *dir, const char *name, + const char *path); + static int bfs_link(void *ns, void *dir, const char *name, void *node); + static int bfs_unlink(void *ns, void *dir, const char *name); + static int bfs_rename(void *ns, void *oldDir, const char *oldName, void *newDir, const char *newName); + + static int 
bfs_read_stat(void *_ns, void *_node, struct stat *st); + static int bfs_write_stat(void *ns, void *node, struct stat *st, long mask); + + static int bfs_open(void *_ns, void *_node, int omode, void **cookie); + static int bfs_read(void *_ns, void *_node, void *cookie, off_t pos, + void *buf, size_t *len); + static int bfs_write(void *ns, void *node, void *cookie, off_t pos, + const void *buf, size_t *len); + static int bfs_free_cookie(void *ns, void *node, void *cookie); + static int bfs_close(void *ns, void *node, void *cookie); + + static int bfs_access(void *_ns, void *_node, int mode); + static int bfs_read_link(void *_ns, void *_node, char *buffer, size_t *bufferSize); + + // directory functions + static int bfs_mkdir(void *ns, void *dir, const char *name, int perms); + static int bfs_rmdir(void *ns, void *dir, const char *name); + static int bfs_open_dir(void *_ns, void *_node, void **cookie); + static int bfs_read_dir(void *_ns, void *_node, void *cookie, + long *num, struct dirent *dirent, size_t bufferSize); + static int bfs_rewind_dir(void *_ns, void *_node, void *cookie); + static int bfs_close_dir(void *_ns, void *_node, void *cookie); + static int bfs_free_dir_cookie(void *_ns, void *_node, void *cookie); + + // attribute support + static int bfs_open_attrdir(void *ns, void *node, void **cookie); + static int bfs_close_attrdir(void *ns, void *node, void *cookie); + static int bfs_free_attrdir_cookie(void *ns, void *node, void *cookie); + static int bfs_rewind_attrdir(void *ns, void *node, void *cookie); + static int bfs_read_attrdir(void *ns, void *node, void *cookie, long *num, + struct dirent *buf, size_t bufferSize); + static int bfs_remove_attr(void *ns, void *node, const char *name); + static int bfs_rename_attr(void *ns, void *node, const char *oldname, + const char *newname); + static int bfs_stat_attr(void *ns, void *node, const char *name, + struct attr_info *buf); + static int bfs_write_attr(void *ns, void *node, const char *name, int type, 
+ const void *buf, size_t *len, off_t pos); + static int bfs_read_attr(void *ns, void *node, const char *name, int type, + void *buf, size_t *len, off_t pos); + + // index support + static int bfs_open_indexdir(void *ns, void **cookie); + static int bfs_close_indexdir(void *ns, void *cookie); + static int bfs_free_indexdir_cookie(void *ns, void *node, void *cookie); + static int bfs_rewind_indexdir(void *ns, void *cookie); + static int bfs_read_indexdir(void *ns, void *cookie, long *num,struct dirent *dirent, + size_t bufferSize); + static int bfs_create_index(void *ns, const char *name, int type, int flags); + static int bfs_remove_index(void *ns, const char *name); + static int bfs_rename_index(void *ns, const char *oldname, const char *newname); + static int bfs_stat_index(void *ns, const char *name, struct index_info *indexInfo); + + // query support + static int bfs_open_query(void *ns, const char *query, ulong flags, + port_id port, long token, void **cookie); + static int bfs_close_query(void *ns, void *cookie); + static int bfs_free_query_cookie(void *ns, void *node, void *cookie); + static int bfs_read_query(void *ns, void *cookie, long *num, + struct dirent *buf, size_t bufsize); +} // extern "C" + + +/* vnode_ops struct. Fill this in to tell the kernel how to call + functions in your driver. 
+*/ + +vnode_ops fs_entry = { + &bfs_read_vnode, // read_vnode + &bfs_release_vnode, // write_vnode + &bfs_remove_vnode, // remove_vnode + NULL, // secure_vnode (not needed) + &bfs_walk, // walk + &bfs_access, // access + &bfs_create, // create + &bfs_mkdir, // mkdir + &bfs_symlink, // symlink + &bfs_link, // link + &bfs_rename, // rename + &bfs_unlink, // unlink + &bfs_rmdir, // rmdir + &bfs_read_link, // readlink + &bfs_open_dir, // opendir + &bfs_close_dir, // closedir + &bfs_free_dir_cookie, // free_dircookie + &bfs_rewind_dir, // rewinddir + &bfs_read_dir, // readdir + &bfs_open, // open file + &bfs_close, // close file + &bfs_free_cookie, // free cookie + &bfs_read, // read file + &bfs_write, // write file + NULL, // readv + NULL, // writev + &bfs_ioctl, // ioctl + &bfs_setflags, // setflags file + &bfs_read_stat, // read stat + &bfs_write_stat, // write stat + &bfs_fsync, // fsync + &bfs_initialize, // initialize + &bfs_mount, // mount + &bfs_unmount, // unmount + &bfs_sync, // sync + &bfs_read_fs_stat, // read fs stat + &bfs_write_fs_stat, // write fs stat + &bfs_select, // select + &bfs_deselect, // deselect + + &bfs_open_indexdir, // open index dir + &bfs_close_indexdir, // close index dir + &bfs_free_indexdir_cookie, // free index dir cookie + &bfs_rewind_indexdir, // rewind index dir + &bfs_read_indexdir, // read index dir + &bfs_create_index, // create index + &bfs_remove_index, // remove index + &bfs_rename_index, // rename index + &bfs_stat_index, // stat index + + &bfs_open_attrdir, // open attr dir + &bfs_close_attrdir, // close attr dir + &bfs_free_attrdir_cookie, // free attr dir cookie + &bfs_rewind_attrdir, // rewind attr dir + &bfs_read_attrdir, // read attr dir + &bfs_write_attr, // write attr + &bfs_read_attr, // read attr + &bfs_remove_attr, // remove attr + &bfs_rename_attr, // rename attr + &bfs_stat_attr, // stat attr + + &bfs_open_query, // open query + &bfs_close_query, // close query + &bfs_free_query_cookie, // free query cookie + 
&bfs_read_query // read query +}; + +#define BFS_IO_SIZE 65536 +int32 api_version = B_CUR_FS_API_VERSION; + + +static int +bfs_mount(nspace_id nsid, const char *device, ulong flags, void *parms, + size_t len, void **data, vnode_id *rootID) +{ + FUNCTION(); + +#ifndef USER + // If you can't build the file system because of this line, you can either + // add the prototype: + // extern int load_driver_symbols(const char *driver_name); + // to your KernelExport.h include (since it's missing there in some releases + // of BeOS R5), or just comment out the line, it won't do any harm and is + // used only for debugging purposes. + load_driver_symbols("obfs"); +#endif + + Volume *volume = new Volume(nsid); + if (volume == NULL) + return B_NO_MEMORY; + + status_t status; + if ((status = volume->Mount(device,flags)) == B_OK) { + *data = volume; + *rootID = volume->ToVnode(volume->Root()); + INFORM(("mounted \"%s\" (root node at %Ld, device = %s)\n",volume->Name(),*rootID,device)); + } + else + delete volume; + + RETURN_ERROR(status); +} + + +static int +bfs_unmount(void *ns) +{ + FUNCTION(); + Volume* volume = (Volume *)ns; + + status_t status = volume->Unmount(); + delete volume; + + RETURN_ERROR(status); +} + + +/** Fill in bfs_info struct for device. + */ + +static int +bfs_read_fs_stat(void *_ns, struct fs_info *info) +{ + FUNCTION(); + if (_ns == NULL || info == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + + // File system flags. + info->flags = B_FS_IS_PERSISTENT | B_FS_HAS_ATTR | B_FS_HAS_MIME | B_FS_HAS_QUERY | + (volume->IsReadOnly() ? B_FS_IS_READONLY : 0); + + info->io_size = BFS_IO_SIZE; + // whatever is appropriate here? 
Just use the same value as BFS (and iso9660) for now + + info->block_size = volume->BlockSize(); + info->total_blocks = volume->NumBlocks(); + info->free_blocks = volume->FreeBlocks(); + + // Volume name + strncpy(info->volume_name, volume->Name(), sizeof(info->volume_name) - 1); + info->volume_name[sizeof(info->volume_name) - 1] = '\0'; + + // File system name (ToDo: has to change to "bfs" later) + strcpy(info->fsh_name,"obfs"); + + return B_NO_ERROR; +} + + +static int +bfs_write_fs_stat(void *_ns, struct fs_info *info, long mask) +{ + FUNCTION_START(("mask = %ld\n",mask)); + Volume *volume = (Volume *)_ns; + disk_super_block &superBlock = volume->SuperBlock(); + + Locker locker(volume->Lock()); + + status_t status = B_BAD_VALUE; + + if (mask & WFSSTAT_NAME) { + strncpy(superBlock.name,info->volume_name,sizeof(superBlock.name) - 1); + superBlock.name[sizeof(superBlock.name) - 1] = '\0'; + + status = volume->WriteSuperBlock(); + } + return status; +} + + +int +bfs_initialize(const char *deviceName, void *parms, size_t len) +{ + FUNCTION_START(("deviceName = %s, parameter len = %ld\n",deviceName,len)); + + // ToDo: implement bfs_initialize()! + + return B_ERROR; +} + + +int +bfs_sync(void *_ns) +{ + FUNCTION(); + if (_ns == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + + return volume->Sync(); +} + + +// #pragma mark - + + +/** Using vnode id, read in vnode information into fs-specific struct, + * and return it in node. the reenter flag tells you if this function + * is being called via some other fs routine, so that things like + * double-locking can be avoided. 
+ */ + +static int +bfs_read_vnode(void *_ns, vnode_id id, char reenter, void **node) +{ + FUNCTION_START(("vnode_id = %Ld\n",id)); + Volume *volume = (Volume *)_ns; + + if (id < 0 || id > volume->NumBlocks()) { + FATAL(("inode at %Ld requested!\n",id)); + return B_ERROR; + } + + Inode *inode = new Inode(volume,id,false,reenter); + if (inode == NULL) + return B_NO_MEMORY; + + if (inode->InitCheck() == B_OK) { + *node = (void *)inode; + return B_OK; + } + + delete inode; + RETURN_ERROR(B_ERROR); +} + + +static int +bfs_release_vnode(void *ns, void *_node, char reenter) +{ + //FUNCTION_START(("node = %p\n",_node)); + Inode *inode = (Inode *)_node; + + delete inode; + + return B_NO_ERROR; +} + + +int +bfs_remove_vnode(void *_ns, void *_node, char reenter) +{ + FUNCTION(); + + if (_ns == NULL || _node == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + // If the inode isn't in use anymore, we were called before + // bfs_unlink() returns - in this case, we can just use the + // transaction which has already deleted the inode. 
+ Transaction localTransaction,*transaction = &localTransaction; + Journal *journal = volume->GetJournal(volume->ToBlock(inode->Parent())); + + if (journal != NULL && journal->CurrentThread() == find_thread(NULL)) + transaction = journal->CurrentTransaction(); + else + localTransaction.Start(volume,inode->BlockNumber()); + + // Perhaps there should be an implementation of Inode::ShrinkStream() that + // just frees the data_stream, but doesn't change the inode (since it is + // freed anyway) - that would make an undelete command possible + status_t status = inode->SetFileSize(transaction,0); + if (status < B_OK) + return status; + + // Free all attributes, and remove their indices + { + // We have to limit the scope of AttributeIterator, so that its + // destructor is not called after the inode is deleted + AttributeIterator iterator(inode); + + char name[B_FILE_NAME_LENGTH]; + uint32 type; + size_t length; + vnode_id id; + while ((status = iterator.GetNext(name,&length,&type,&id)) == B_OK) + inode->RemoveAttribute(transaction,name); + } + + if ((status = volume->Free(transaction,inode->BlockRun())) == B_OK) { + if (transaction == &localTransaction) + localTransaction.Done(); + + delete inode; + } + + return B_OK; +} + + +// #pragma mark - + + +/** the walk function just "walks" through a directory looking for the + * specified file. It calls get_vnode() on its vnode-id to init it + * for the kernel. 
+ */ + +static int +bfs_walk(void *_ns, void *_directory, const char *file, char **_resolvedPath, vnode_id *vnid) +{ + FUNCTION_START(("file = %s\n",file)); + + if (_ns == NULL || _directory == NULL || file == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + // check access permissions + status_t status = directory->CheckPermissions(X_OK); + if (status < B_OK) + RETURN_ERROR(status); + + BPlusTree *tree; + if (directory->GetTree(&tree) != B_OK) + RETURN_ERROR(B_BAD_VALUE); + + if ((status = tree->Find((uint8 *)file,(uint16)strlen(file),vnid)) < B_OK) + RETURN_ERROR(status); + + Inode *inode; + if ((status = get_vnode(volume->ID(),*vnid,(void **)&inode)) != B_OK) { + REPORT_ERROR(status); + return B_ENTRY_NOT_FOUND; + } + + // Is inode a symlink? Then resolve it, if we should + + if (inode->IsSymLink() && _resolvedPath != NULL) { + status_t status = B_OK; + char *newPath = NULL; + + // Symbolic links can store their target in the data stream (for links + // that take more than 144 bytes of storage [the size of the data_stream + // structure]), or directly instead of the data_stream class + // So we have to deal with both cases here. 
+ + // Note: we would naturally call bfs_read_link() here, but the API of the + // vnode layer would require us to always reserve a large chunk of memory + // for the path, so we're not going to do that + + if (inode->Flags() & INODE_LONG_SYMLINK) { + size_t readBytes = inode->Node()->data.size; + char *data = (char *)malloc(readBytes); + if (data != NULL) { + status = inode->ReadAt(0, (uint8 *)data, &readBytes); + if (status == B_OK && readBytes == inode->Node()->data.size) + status = new_path(data, &newPath); + + free(data); + } else + status = B_NO_MEMORY; + } else + status = new_path((char *)&inode->Node()->short_symlink, &newPath); + + put_vnode(volume->ID(), inode->ID()); + if (status == B_OK) + *_resolvedPath = newPath; + + RETURN_ERROR(status); + } + + return B_OK; +} + + +int +bfs_ioctl(void *_ns, void *_node, void *_cookie, int cmd, void *buffer, size_t bufferLength) +{ + FUNCTION_START(("node = %p, cmd = %d, buf = %p, len = %ld\n",_node,cmd,buffer,bufferLength)); + + if (_ns == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + switch (cmd) { + case IOCTL_FILE_UNCACHED_IO: + if (inode != NULL) + PRINT(("trying to make access to inode %lx uncached. Not yet implemented!\n",inode->ID())); + return B_ERROR; +#ifdef DEBUG + case 56742: + { + // allocate all free blocks and zero them out (a test for the BlockAllocator)! 
BlockAllocator &allocator = volume->Allocator();
			Transaction transaction(volume,0);
			CachedBlock cached(volume);
			block_run run;
			while (allocator.AllocateBlocks(&transaction,8,0,64,1,run) == B_OK) {
				PRINT(("write block_run(%ld, %d, %d)\n",run.allocation_group,run.start,run.length));
				// NOTE(review): every iteration calls SetTo(run) with the same
				// run and never uses "i" - presumably each block of the run
				// was meant to be zeroed; confirm against CachedBlock::SetTo()
				for (int32 i = 0;i < run.length;i++) {
					uint8 *block = cached.SetTo(run);
					if (block != NULL) {
						memset(block,0,volume->BlockSize());
						cached.WriteBack(&transaction);
					}
				}
			}
			return B_OK;
		}
		case 56743:
			// debug only: dump the volume's super block
			dump_super_block(&volume->SuperBlock());
			return B_OK;
		case 56744:
			// debug only: dump the on-disk inode structure (if a node was given)
			if (inode != NULL)
				dump_inode(inode->Node());
			return B_OK;
		case 56745:
			// debug only: dump one block size worth of bytes, starting at the
			// inode structure
			if (inode != NULL)
				dump_block((const char *)inode->Node(),volume->BlockSize());
			return B_OK;
#endif
	}
	// every command that is not handled above is rejected
	return B_BAD_VALUE;
}


/**	Not implemented yet: only logs a message, and returns B_OK
 *	without touching the node or the cookie.
 */

int
bfs_setflags(void *ns, void *node, void *cookie, int flags)
{
	FUNCTION_START(("node = %p, flags = %d",node,flags));

	// ToDo: implement bfs_setflags()!
	INFORM(("setflags not yet implemented...\n"));

	return B_OK;
}


/**	Calls notify_select_event() for \a sync right away, regardless of
 *	which \a event was asked for, and returns B_OK.
 */

int
bfs_select(void *ns, void *node, void *cookie, uint8 event, uint32 ref, selectsync *sync)
{
	FUNCTION_START(("event = %d, ref = %lu, sync = %p\n",event,ref,sync));
	notify_select_event(sync, ref);

	return B_OK;
}


/**	Nothing to undo here, since bfs_select() keeps no state;
 *	always returns B_OK.
 */

int
bfs_deselect(void *ns, void *node, void *cookie, uint8 event, selectsync *sync)
{
	FUNCTION();
	return B_OK;
}


/**	Forwards the request to Inode::Sync().
 *	Returns B_BAD_VALUE if \a _node is NULL.
 */

int
bfs_fsync(void *_ns, void *_node)
{
	FUNCTION();
	if (_node == NULL)
		return B_BAD_VALUE;

	Inode *inode = (Inode *)_node;
	return inode->Sync();
}


/**	Fills in the stat struct for a node
 */

static int
bfs_read_stat(void *_ns, void *_node, struct stat *st)
{
	FUNCTION();

	Volume *volume = (Volume *)_ns;
	Inode *inode = (Inode *)_node;
	bfs_inode *node = inode->Node();

	st->st_dev = volume->ID();
	st->st_ino = inode->ID();
	// the link count is always reported as 1
	st->st_nlink = 1;
	st->st_blksize = BFS_IO_SIZE;

	st->st_uid = node->uid;
	st->st_gid = node->gid;
	st->st_mode = node->mode;
st->st_size = node->data.size; + + st->st_atime = time(NULL); + st->st_mtime = st->st_ctime = (time_t)(node->last_modified_time >> INODE_TIME_SHIFT); + st->st_crtime = (time_t)(node->create_time >> INODE_TIME_SHIFT); + + return B_NO_ERROR; +} + + +int +bfs_write_stat(void *_ns, void *_node, struct stat *stat, long mask) +{ + FUNCTION(); + + if (_ns == NULL || _node == NULL || stat == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + // that may be incorrect here - I don't think we need write access to + // change most of the stat... + // we should definitely check a bit more if the new stats are correct and valid... + + status_t status = inode->CheckPermissions(W_OK); + if (status < B_OK) + RETURN_ERROR(status); + + WriteLocked locked(inode->Lock()); + if (locked.IsLocked() < B_OK) + RETURN_ERROR(B_ERROR); + + Transaction transaction(volume,inode->BlockNumber()); + + bfs_inode *node = inode->Node(); + + if (mask & WSTAT_MODE) { + PRINT(("original mode = %ld, stat->st_mode = %ld\n",node->mode,stat->st_mode)); + node->mode = node->mode & ~S_IUMSK | stat->st_mode & S_IUMSK; + } + + if (mask & WSTAT_UID) + node->uid = stat->st_uid; + if (mask & WSTAT_GID) + node->gid = stat->st_gid; + + if (mask & WSTAT_SIZE) { + if (inode->IsDirectory()) + return B_IS_A_DIRECTORY; + + if (inode->Size() != stat->st_size) { + status = inode->SetFileSize(&transaction,stat->st_size); + + // fill the new blocks (if any) with zeros + inode->FillGapWithZeros(inode->OldSize(),inode->Size()); + + Index index(volume); + index.UpdateSize(&transaction,inode); + + if ((mask & WSTAT_MTIME) == 0) + index.UpdateLastModified(&transaction,inode); + } + } + + if (mask & WSTAT_MTIME) { + // Index::UpdateLastModified() will set the new time in the inode + Index index(volume); + index.UpdateLastModified(&transaction,inode,(bigtime_t)stat->st_mtime << INODE_TIME_SHIFT); + } + if (mask & WSTAT_CRTIME) { + node->create_time = (bigtime_t)stat->st_crtime 
<< INODE_TIME_SHIFT; + } + + if ((status = inode->WriteBack(&transaction)) == B_OK) + transaction.Done(); + + notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL); + + return status; +} + + +int +bfs_create(void *_ns, void *_directory, const char *name, int omode, int mode, vnode_id *vnid, void **_cookie) +{ + FUNCTION_START(("name = \"%s\", perms = %ld, omode = %ld\n",name,mode,omode)); + + if (_ns == NULL || _directory == NULL || _cookie == NULL + || name == NULL || *name == '\0') + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + if (!directory->IsDirectory()) + RETURN_ERROR(B_BAD_TYPE); + + status_t status = directory->CheckPermissions(W_OK); + if (status < B_OK) + RETURN_ERROR(status); + + file_cookie *cookie = (file_cookie *)malloc(sizeof(file_cookie)); + if (cookie == NULL) + RETURN_ERROR(B_NO_MEMORY); + + // initialize the cookie + cookie->open_mode = omode; + cookie->last_size = 0; + cookie->last_notification = system_time(); + + Transaction transaction(volume,directory->BlockNumber()); + + status = Inode::Create(&transaction,directory,name,S_FILE | (mode & S_IUMSK),omode,0,vnid); + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,*vnid,name); + } + if (status < B_OK) + free(cookie); + else + *_cookie = cookie; + + return status; +} + + +int +bfs_symlink(void *_ns, void *_directory, const char *name, const char *path) +{ + FUNCTION(); + + if (_ns == NULL || _directory == NULL || path == NULL + || name == NULL || *name == '\0') + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + if (!directory->IsDirectory()) + RETURN_ERROR(B_BAD_TYPE); + + status_t status = directory->CheckPermissions(W_OK); + if (status < B_OK) + RETURN_ERROR(status); + + Transaction transaction(volume,directory->BlockNumber()); + + Inode *link; + off_t id; + status = 
Inode::Create(&transaction,directory,name,S_SYMLINK | 0777,0,0,&id,&link); + if (status < B_OK) + RETURN_ERROR(status); + + size_t length = strlen(path); + if (length < SHORT_SYMLINK_NAME_LENGTH) { + strcpy(link->Node()->short_symlink,path); + status = link->WriteBack(&transaction); + } else { + link->Node()->flags |= INODE_LONG_SYMLINK | INODE_LOGGED; + // The following call will have to write the inode back, so + // we don't have to do that here... + status = link->WriteAt(&transaction,0,(const uint8 *)path,&length); + } + + // Inode::Create() left the inode locked + put_vnode(volume->ID(),id); + + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,id,name); + } + + return status; +} + + +int +bfs_link(void *ns, void *dir, const char *name, void *node) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + // ToDo: implement bfs_link()?!? + + return B_ERROR; +} + + +int +bfs_unlink(void *_ns, void *_directory, const char *name) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + if (_ns == NULL || _directory == NULL || name == NULL || *name == '\0') + return B_BAD_VALUE; + if (!strcmp(name,"..") || !strcmp(name,".")) + return B_NOT_ALLOWED; + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + status_t status = directory->CheckPermissions(W_OK); + if (status < B_OK) + return status; + + Transaction transaction(volume,directory->BlockNumber()); + + off_t id; + if ((status = directory->Remove(&transaction,name,&id)) == B_OK) { + transaction.Done(); + + notify_listener(B_ENTRY_REMOVED,volume->ID(),directory->ID(),0,id,NULL); + } + return status; +} + + +int +bfs_rename(void *_ns, void *_oldDir, const char *oldName, void *_newDir, const char *newName) +{ + FUNCTION_START(("oldDir = %p, oldName = \"%s\", newDir = %p, newName = \"%s\"\n",_oldDir,oldName,_newDir,newName)); + + // there may be some more tests needed?! 
+ if (_ns == NULL || _oldDir == NULL || _newDir == NULL + || oldName == NULL || *oldName == '\0' + || newName == NULL || *newName == '\0' + || !strcmp(oldName,".") || !strcmp(oldName,"..") + || !strcmp(newName,".") || !strcmp(newName,"..")) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *oldDirectory = (Inode *)_oldDir; + Inode *newDirectory = (Inode *)_newDir; + + // get the directory's tree, and a pointer to the inode which should be changed + BPlusTree *tree; + status_t status = oldDirectory->GetTree(&tree); + if (status < B_OK) + RETURN_ERROR(status); + + off_t id; + status = tree->Find((const uint8 *)oldName,strlen(oldName),&id); + if (status < B_OK) + RETURN_ERROR(status); + + Vnode vnode(volume,id); + Inode *inode; + if (vnode.Get(&inode) < B_OK) + return B_IO_ERROR; + + // Don't move a directory into one of its children - we soar up + // from the newDirectory to either the root node or the old + // directory, whichever comes first. + // If we meet our inode on that way, we have to bail out. + + if (oldDirectory != newDirectory) { + vnode_id parent = volume->ToVnode(newDirectory->Parent()); + vnode_id root = volume->RootNode()->ID(); + while (true) + { + if (parent == id) + return B_BAD_VALUE; + else if (parent == root || parent == oldDirectory->ID()) + break; + + Vnode vnode(volume,parent); + Inode *parentNode; + if (vnode.Get(&parentNode) < B_OK) + return B_ERROR; + + parent = volume->ToVnode(parentNode->Parent()); + } + } + + // Everything okay? Then lets get to work... 
+ + Transaction transaction(volume,oldDirectory->BlockNumber()); + + // First, try to make sure there is nothing that will stop us in + // the target directory - since this is the only non-critical + // failure, we will test this case first + BPlusTree *newTree = tree; + if (newDirectory != oldDirectory) { + status = newDirectory->GetTree(&newTree); + if (status < B_OK) + RETURN_ERROR(status); + } + + status = newTree->Insert(&transaction,(const uint8 *)newName,strlen(newName),id); + if (status == B_NAME_IN_USE) { + // If there is already a file with that name, we have to remove + // it, as long it's not a directory with files in it + off_t clobber; + if (newTree->Find((const uint8 *)newName,strlen(newName),&clobber) < B_OK) + return B_NAME_IN_USE; + if (clobber == id) + return B_BAD_VALUE; + + Vnode vnode(volume,clobber); + Inode *other; + if (vnode.Get(&other) < B_OK) + return B_NAME_IN_USE; + + status = newDirectory->Remove(&transaction,newName,NULL,other->IsDirectory()); + if (status < B_OK) + return status; + + notify_listener(B_ENTRY_REMOVED,volume->ID(),newDirectory->ID(),0,clobber,NULL); + + status = newTree->Insert(&transaction,(const uint8 *)newName,strlen(newName),id); + } + if (status < B_OK) + return status; + + // If anything fails now, we have to remove the inode from the + // new directory in any case to restore the previous state + status_t bailStatus = B_OK; + + // update the name only when they differ + bool nameUpdated = false; + if (strcmp(oldName,newName)) { + status = inode->SetName(&transaction,newName); + if (status == B_OK) { + Index index(volume); + index.UpdateName(&transaction,oldName,newName,inode); + nameUpdated = true; + } + } + + if (status == B_OK) { + status = tree->Remove(&transaction,(const uint8 *)oldName,strlen(oldName),id); + if (status == B_OK) { + inode->Node()->parent = newDirectory->BlockRun(); + + // if it's a directory, update the parent directory pointer + // in its tree if necessary + BPlusTree *movedTree = NULL; + if 
(oldDirectory != newDirectory + && inode->IsDirectory() + && (status = inode->GetTree(&movedTree)) == B_OK) + status = movedTree->Replace(&transaction,(const uint8 *)"..",2,newDirectory->ID()); + + if (status == B_OK) { + status = inode->WriteBack(&transaction); + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ENTRY_MOVED,volume->ID(),oldDirectory->ID(),newDirectory->ID(),id,newName); + return B_OK; + } + } + // Those better don't fail, or we switch to a read-only + // device for safety reasons (Volume::Panic() does this + // for us) + // Anyway, if we overwrote a file in the target directory + // this is lost now (only in-memory, not on-disk)... + bailStatus = tree->Insert(&transaction,(const uint8 *)oldName,strlen(oldName),id); + if (movedTree != NULL) + movedTree->Replace(&transaction,(const uint8 *)"..",2,oldDirectory->ID()); + } + } + if (bailStatus == B_OK && nameUpdated) + bailStatus = inode->SetName(&transaction,oldName); + + if (bailStatus == B_OK) + bailStatus = newTree->Remove(&transaction,(const uint8 *)newName,strlen(newName),id); + + if (bailStatus < B_OK) + volume->Panic(); + + return status; +} + + +/** Opens the file with the specified mode. + */ + +static int +bfs_open(void *_ns, void *_node, int omode, void **_cookie) +{ + FUNCTION(); + if (_ns == NULL || _node == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + // opening a directory read-only is allowed, although you can't read + // any data from it. + if (inode->IsDirectory() && omode & O_RWMASK) { + omode = omode & ~O_RWMASK; + // ToDo: for compatibility reasons, we don't return an error here... + // e.g. 
"copyattr" tries to do that + //return B_IS_A_DIRECTORY; + } + + status_t status = inode->CheckPermissions(oModeToAccess(omode)); + if (status < B_OK) + RETURN_ERROR(status); + + // we could actually use the cookie to keep track of: + // - the last block_run + // - the location in the data_stream (indirect, double indirect, + // position in block_run array) + // + // This could greatly speed up continuous reads of big files, especially + // in the indirect block section. + + file_cookie *cookie = (file_cookie *)malloc(sizeof(file_cookie)); + if (cookie == NULL) + RETURN_ERROR(B_NO_MEMORY); + + // initialize the cookie + cookie->open_mode = omode; + // needed by e.g. bfs_write() for O_APPEND + cookie->last_size = inode->Size(); + cookie->last_notification = system_time(); + + // Should we truncate the file? + if (omode & O_TRUNC) { + Transaction transaction(volume,inode->BlockNumber()); + WriteLocked locked(inode->Lock()); + + status_t status = inode->SetFileSize(&transaction,0); + if (status < B_OK) { + // bfs_free_cookie() is only called if this function is successful + free(cookie); + return status; + } + + transaction.Done(); + } + + *_cookie = cookie; + return B_OK; +} + + +/** Read a file specified by node, using information in cookie + * and at offset specified by pos. read len bytes into buffer buf. 
+ */ + +static int +bfs_read(void *_ns, void *_node, void *_cookie, off_t pos, void *buffer, size_t *_length) +{ + //FUNCTION(); + Inode *inode = (Inode *)_node; + + if (!inode->HasUserAccessableStream()) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + ReadLocked locked(inode->Lock()); + return inode->ReadAt(pos,(uint8 *)buffer,_length); +} + + +int +bfs_write(void *_ns, void *_node, void *_cookie, off_t pos, const void *buffer, size_t *_length) +{ + //FUNCTION(); + // uncomment to be more robust against a buggy vnode layer ;-) + //if (_ns == NULL || _node == NULL || _cookie == NULL) + // return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + if (!inode->HasUserAccessableStream()) { + *_length = 0; + RETURN_ERROR(B_BAD_VALUE); + } + + file_cookie *cookie = (file_cookie *)_cookie; + + if (cookie->open_mode & O_APPEND) + pos = inode->Size(); + + WriteLocked locked(inode->Lock()); + if (locked.IsLocked() < B_OK) + RETURN_ERROR(B_ERROR); + + Transaction transaction; + // We are not starting the transaction here, since + // it might not be needed at all + + status_t status = inode->WriteAt(&transaction,pos,(const uint8 *)buffer,_length); + + if (status == B_OK) + transaction.Done(); + + // periodically notify if the file size has changed + if (cookie->last_size != inode->Size() + && system_time() > cookie->last_notification + INODE_NOTIFICATION_INTERVAL) { + notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL); + cookie->last_size = inode->Size(); + cookie->last_notification = system_time(); + } + + // This will flush the dirty blocks to disk from time to time. + // It's done here and not in Inode::WriteAt() so that it won't + // add to the duration of a transaction - it might even be a + // good idea to offload those calls to another thread + volume->WriteCachedBlocksIfNecessary(); + + return status; +} + + +/** Do whatever is necessary to close a file, EXCEPT for freeing + * the cookie! 
+ */ + +static int +bfs_close(void *_ns, void *_node, void *_cookie) +{ + FUNCTION(); + if (_ns == NULL || _node == NULL || _cookie == NULL) + return B_BAD_VALUE; + + file_cookie *cookie = (file_cookie *)_cookie; + + if (cookie->open_mode & O_RWMASK) { + // trim the preallocated blocks and update the size, + // and last_modified indices if needed + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + Transaction transaction(volume,inode->BlockNumber()); + + status_t status = inode->Trim(&transaction); + if (status < B_OK) + FATAL(("Could not trim preallocated blocks!")); + + Index index(volume); + index.UpdateSize(&transaction,inode); + index.UpdateLastModified(&transaction,inode); + + if (status == B_OK) + transaction.Done(); + + notify_listener(B_STAT_CHANGED,volume->ID(),0,0,inode->ID(),NULL); + } + + return B_OK; +} + + +static int +bfs_free_cookie(void * /*ns*/, void * /*node*/, void *cookie) +{ + FUNCTION(); + + if (cookie != NULL) + free(cookie); + + return B_OK; +} + + +/** Checks access permissions, return B_NOT_ALLOWED if the action + * is not allowed. 
+ */ + +static int +bfs_access(void *_ns, void *_node, int accessMode) +{ + FUNCTION(); + + if (_ns == NULL || _node == NULL) + return B_BAD_VALUE; + + Inode *inode = (Inode *)_node; + status_t status = inode->CheckPermissions(accessMode); + if (status < B_OK) + RETURN_ERROR(status); + + return B_OK; +} + + +static int +bfs_read_link(void *_ns, void *_node, char *buffer, size_t *bufferSize) +{ + FUNCTION(); + + Inode *inode = (Inode *)_node; + + if (!inode->IsSymLink()) + RETURN_ERROR(B_BAD_VALUE); + + if (inode->Flags() & INODE_LONG_SYMLINK) { + status_t status = inode->ReadAt(0, (uint8 *)buffer, bufferSize); + if (status < B_OK) + RETURN_ERROR(status); + + *bufferSize = inode->Size(); + return B_OK; + } + + size_t numBytes = strlen((char *)&inode->Node()->short_symlink); + uint32 bytes = numBytes; + if (bytes > *bufferSize) + bytes = *bufferSize; + + memcpy(buffer, inode->Node()->short_symlink, bytes); + *bufferSize = numBytes; + + return B_OK; +} + + +// #pragma mark - +// Directory functions + + +int +bfs_mkdir(void *_ns, void *_directory, const char *name, int mode) +{ + FUNCTION_START(("name = \"%s\", perms = %ld\n",name,mode)); + + if (_ns == NULL || _directory == NULL + || name == NULL || *name == '\0') + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + if (!directory->IsDirectory()) + RETURN_ERROR(B_BAD_TYPE); + + status_t status = directory->CheckPermissions(W_OK); + if (status < B_OK) + RETURN_ERROR(status); + + Transaction transaction(volume,directory->BlockNumber()); + + // Inode::Create() locks the inode if we pass the "id" parameter, but we + // need it anyway + off_t id; + status = Inode::Create(&transaction,directory,name,S_DIRECTORY | (mode & S_IUMSK),0,0,&id); + if (status == B_OK) { + put_vnode(volume->ID(),id); + transaction.Done(); + + notify_listener(B_ENTRY_CREATED,volume->ID(),directory->ID(),0,id,name); + } + + return status; +} + + +int +bfs_rmdir(void *_ns, void *_directory, 
const char *name) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + if (_ns == NULL || _directory == NULL || name == NULL || *name == '\0') + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *directory = (Inode *)_directory; + + Transaction transaction(volume,directory->BlockNumber()); + + off_t id; + status_t status = directory->Remove(&transaction,name,&id,true); + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ENTRY_REMOVED,volume->ID(),directory->ID(),0,id,NULL); + } + + return status; +} + + +/** creates fs-specific "cookie" struct that keeps track of where + * you are at in reading through directory entries in bfs_readdir. + */ + +static int +bfs_open_dir(void *_ns, void *_node, void **_cookie) +{ + FUNCTION(); + + if (_ns == NULL || _node == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Inode *inode = (Inode *)_node; + + if (!inode->IsDirectory()) + RETURN_ERROR(B_BAD_VALUE); + + BPlusTree *tree; + if (inode->GetTree(&tree) != B_OK) + RETURN_ERROR(B_BAD_VALUE); + + TreeIterator *iterator = new TreeIterator(tree); + if (iterator == NULL) + RETURN_ERROR(B_NO_MEMORY); + + *_cookie = iterator; + return B_OK; +} + + +static int +bfs_read_dir(void *_ns, void *_node, void *_cookie, long *num, + struct dirent *dirent, size_t bufferSize) +{ + FUNCTION(); + + TreeIterator *iterator = (TreeIterator *)_cookie; + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + uint16 length; + vnode_id id; + status_t status = iterator->GetNextEntry(dirent->d_name,&length,bufferSize,&id); + if (status == B_ENTRY_NOT_FOUND) { + *num = 0; + return B_OK; + } else if (status != B_OK) + RETURN_ERROR(status); + + Volume *volume = (Volume *)_ns; + + dirent->d_dev = volume->ID(); + dirent->d_ino = id; + dirent->d_reclen = length; + + *num = 1; + return B_OK; +} + + +/** Sets the TreeIterator back to the beginning of the directory + */ + +static int +bfs_rewind_dir(void * /*ns*/, void * /*node*/, void *_cookie) +{ + FUNCTION(); + TreeIterator 
*iterator = (TreeIterator *)_cookie; + + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + return iterator->Rewind(); +} + + +static int +bfs_close_dir(void * /*ns*/, void * /*node*/, void * /*_cookie*/) +{ + FUNCTION(); + // Do whatever you need to to close a directory, but DON'T free the cookie! + return B_OK; +} + + +static int +bfs_free_dir_cookie(void *ns, void *node, void *_cookie) +{ + TreeIterator *iterator = (TreeIterator *)_cookie; + + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + delete iterator; + return B_OK; +} + + +// #pragma mark - +// Attribute functions + + +int +bfs_open_attrdir(void *_ns, void *_node, void **cookie) +{ + FUNCTION(); + + Inode *inode = (Inode *)_node; + if (inode == NULL || inode->Node() == NULL) + RETURN_ERROR(B_ERROR); + + AttributeIterator *iterator = new AttributeIterator(inode); + if (iterator == NULL) + RETURN_ERROR(B_NO_MEMORY); + + *cookie = iterator; + return B_OK; +} + + +int +bfs_close_attrdir(void *ns, void *node, void *cookie) +{ + FUNCTION(); + return B_OK; +} + + +int +bfs_free_attrdir_cookie(void *ns, void *node, void *_cookie) +{ + FUNCTION(); + AttributeIterator *iterator = (AttributeIterator *)_cookie; + + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + delete iterator; + return B_OK; +} + + +int +bfs_rewind_attrdir(void *_ns, void *_node, void *_cookie) +{ + FUNCTION(); + + AttributeIterator *iterator = (AttributeIterator *)_cookie; + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + RETURN_ERROR(iterator->Rewind()); +} + + +int +bfs_read_attrdir(void *_ns, void *node, void *_cookie, long *num, struct dirent *dirent, size_t bufsize) +{ + FUNCTION(); + AttributeIterator *iterator = (AttributeIterator *)_cookie; + + if (iterator == NULL) + RETURN_ERROR(B_BAD_VALUE); + + uint32 type; + size_t length; + status_t status = iterator->GetNext(dirent->d_name,&length,&type,&dirent->d_ino); + if (status == B_ENTRY_NOT_FOUND) { + *num = 0; + return B_OK; + } else if (status != B_OK) + 
RETURN_ERROR(status); + + Volume *volume = (Volume *)_ns; + + dirent->d_dev = volume->ID(); + dirent->d_reclen = length; + + *num = 1; + return B_OK; +} + + +int +bfs_remove_attr(void *_ns, void *_node, const char *name) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + if (_ns == NULL || _node == NULL || name == NULL) + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + status_t status = inode->CheckPermissions(W_OK); + if (status < B_OK) + return status; + + Transaction transaction(volume,inode->BlockNumber()); + + status = inode->RemoveAttribute(&transaction,name); + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ATTR_CHANGED,volume->ID(),0,0,inode->ID(),name); + } + + RETURN_ERROR(status); +} + + +int +bfs_rename_attr(void *ns, void *node, const char *oldname,const char *newname) +{ + FUNCTION_START(("name = \"%s\",to = \"%s\"\n",oldname,newname)); + + // ToDo: implement bfs_rename_attr()! + // Does anybody need this? :-) + + RETURN_ERROR(B_ENTRY_NOT_FOUND); +} + + +int +bfs_stat_attr(void *ns, void *_node, const char *name,struct attr_info *attrInfo) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + Inode *inode = (Inode *)_node; + if (inode == NULL || inode->Node() == NULL) + RETURN_ERROR(B_ERROR); + + small_data *smallData = NULL; + if (inode->SmallDataLock().Lock() == B_OK) + { + if ((smallData = inode->FindSmallData((const char *)name)) != NULL) { + attrInfo->type = smallData->type; + attrInfo->size = smallData->data_size; + } + inode->SmallDataLock().Unlock(); + } + if (smallData != NULL) + return B_OK; + + // search in the attribute directory + Inode *attribute; + status_t status = inode->GetAttribute(name,&attribute); + if (status == B_OK) { + attrInfo->type = attribute->Node()->type; + attrInfo->size = attribute->Node()->data.size; + + inode->ReleaseAttribute(attribute); + return B_OK; + } + + RETURN_ERROR(status); +} + + +int +bfs_write_attr(void *_ns, void *_node, const char *name, int 
type,const void *buffer, size_t *_length, off_t pos) +{ + FUNCTION_START(("name = \"%s\"\n",name)); + + if (_ns == NULL || _node == NULL || name == NULL || *name == '\0') + RETURN_ERROR(B_BAD_VALUE); + + // Writing the name attribute using this function is not allowed, + // also using the reserved indices name, last_modified, and size + // shouldn't be allowed. + if (name[0] == FILE_NAME_NAME && name[1] == '\0' + || !strcmp(name,"name") + || !strcmp(name,"last_modified") + || !strcmp(name,"size")) + RETURN_ERROR(B_NOT_ALLOWED); + + Volume *volume = (Volume *)_ns; + Inode *inode = (Inode *)_node; + + status_t status = inode->CheckPermissions(W_OK); + if (status < B_OK) + return status; + + Transaction transaction(volume,inode->BlockNumber()); + + status = inode->WriteAttribute(&transaction,name,type,pos,(const uint8 *)buffer,_length); + if (status == B_OK) { + transaction.Done(); + + notify_listener(B_ATTR_CHANGED,volume->ID(),0,0,inode->ID(),name); + } + + return status; +} + + +int +bfs_read_attr(void *_ns, void *_node, const char *name, int type,void *buffer, size_t *_length, off_t pos) +{ + FUNCTION(); + Inode *inode = (Inode *)_node; + + if (inode == NULL || name == NULL || *name == '\0' || buffer == NULL) + RETURN_ERROR(B_BAD_VALUE); + + status_t status = inode->CheckPermissions(R_OK); + if (status < B_OK) + return status; + + return inode->ReadAttribute(name,type,pos,(uint8 *)buffer,_length); +} + + +// #pragma mark - +// Index functions + + +int +bfs_open_indexdir(void *_ns, void **_cookie) +{ + FUNCTION(); + + if (_ns == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + + if (volume->IndicesNode() == NULL) + RETURN_ERROR(B_ENTRY_NOT_FOUND); + + // Since the indices root node is just a directory, and we are storing + // a pointer to it in our Volume object, we can just use the directory + // traversal functions. + // In fact we're storing it in the Volume object for that reason. 
+ + RETURN_ERROR(bfs_open_dir(_ns,volume->IndicesNode(),_cookie)); +} + + +int +bfs_close_indexdir(void *_ns, void *_cookie) +{ + FUNCTION(); + if (_ns == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + RETURN_ERROR(bfs_close_dir(_ns,volume->IndicesNode(),_cookie)); +} + + +int +bfs_free_indexdir_cookie(void *_ns, void *_node, void *_cookie) +{ + FUNCTION(); + if (_ns == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + RETURN_ERROR(bfs_free_dir_cookie(_ns,volume->IndicesNode(),_cookie)); +} + + +int +bfs_rewind_indexdir(void *_ns, void *_cookie) +{ + FUNCTION(); + if (_ns == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + RETURN_ERROR(bfs_rewind_dir(_ns,volume->IndicesNode(),_cookie)); +} + + +int +bfs_read_indexdir(void *_ns, void *_cookie, long *num, struct dirent *dirent, size_t bufferSize) +{ + FUNCTION(); + if (_ns == NULL || _cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + RETURN_ERROR(bfs_read_dir(_ns,volume->IndicesNode(),_cookie,num,dirent,bufferSize)); +} + + +int +bfs_create_index(void *_ns, const char *name, int type, int flags) +{ + FUNCTION_START(("name = \"%s\", type = %ld, flags = %ld\n",name,type,flags)); + if (_ns == NULL || name == NULL || *name == '\0') + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + + if (volume->IsReadOnly()) + return B_READ_ONLY_DEVICE; + + // only root users are allowed to create indices + if (geteuid() != 0) + return B_NOT_ALLOWED; + + Transaction transaction(volume,volume->Indices()); + + Index index(volume); + status_t status = index.Create(&transaction,name,type); + + if (status == B_OK) + transaction.Done(); + + RETURN_ERROR(status); +} + + +int +bfs_remove_index(void *_ns, const char *name) +{ + FUNCTION(); + if (_ns == NULL || name == NULL || *name == '\0') + return B_BAD_VALUE; + + Volume *volume = (Volume *)_ns; + + if (volume->IsReadOnly()) 
+ return B_READ_ONLY_DEVICE; + + // only root users are allowed to remove indices + if (geteuid() != 0) + return B_NOT_ALLOWED; + + Inode *indices; + if ((indices = volume->IndicesNode()) == NULL) + return B_ENTRY_NOT_FOUND; + + Transaction transaction(volume,volume->Indices()); + + status_t status = indices->Remove(&transaction,name); + if (status == B_OK) + transaction.Done(); + + RETURN_ERROR(status); +} + + +int +bfs_rename_index(void *ns, const char *oldname, const char *newname) +{ + FUNCTION_START(("from = %s, to = %s\n",oldname,newname)); + + // ToDo: implement bfs_rename_index()?! + // Well, renaming an index doesn't make that much sense, as you + // would also need to remove every file in it (or the index + // would contain wrong data) + // But in that case, you can better remove the old one, and + // create a new one... + // There is also no way to call this function from a userland + // application. + + RETURN_ERROR(B_ENTRY_NOT_FOUND); +} + + +int +bfs_stat_index(void *_ns, const char *name, struct index_info *indexInfo) +{ + FUNCTION_START(("name = %s\n",name)); + if (_ns == NULL || name == NULL || indexInfo == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Volume *volume = (Volume *)_ns; + Index index(volume); + status_t status = index.SetTo(name); + if (status < B_OK) + RETURN_ERROR(status); + + bfs_inode *node = index.Node()->Node(); + + indexInfo->type = index.Type(); + indexInfo->size = node->data.size; + indexInfo->modification_time = (time_t)(node->last_modified_time >> INODE_TIME_SHIFT); + indexInfo->creation_time = (time_t)(node->create_time >> INODE_TIME_SHIFT); + indexInfo->uid = node->uid; + indexInfo->gid = node->gid; + + return B_OK; +} + + +// #pragma mark - +// Query functions + + +int +bfs_open_query(void *_ns,const char *queryString,ulong flags,port_id port,long token,void **cookie) +{ + FUNCTION(); + if (_ns == NULL || queryString == NULL || cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + PRINT(("query = \"%s\", flags = %lu, port_id = %ld, 
token = %ld\n",queryString,flags,port,token)); + + Volume *volume = (Volume *)_ns; + + Expression *expression = new Expression((char *)queryString); + if (expression == NULL) + RETURN_ERROR(B_NO_MEMORY); + + if (expression->InitCheck() < B_OK) { + FATAL(("Could not parse query, stopped at: \"%s\"\n",expression->Position())); + delete expression; + RETURN_ERROR(B_BAD_VALUE); + } + + Query *query = new Query(volume,expression); + if (query == NULL) { + delete expression; + RETURN_ERROR(B_NO_MEMORY); + } + + if (flags & B_LIVE_QUERY) + query->SetLiveMode(port,token); + + *cookie = (void *)query; + + return B_OK; +} + + +int +bfs_close_query(void *ns, void *cookie) +{ + FUNCTION(); + return B_OK; +} + + +int +bfs_free_query_cookie(void *ns, void *node, void *cookie) +{ + FUNCTION(); + if (cookie == NULL) + RETURN_ERROR(B_BAD_VALUE); + + Query *query = (Query *)cookie; + Expression *expression = query->GetExpression(); + delete query; + delete expression; + + return B_OK; +} + + +int +bfs_read_query(void */*ns*/,void *cookie,long *num,struct dirent *dirent,size_t bufferSize) +{ + FUNCTION(); + Query *query = (Query *)cookie; + if (query == NULL) + RETURN_ERROR(B_BAD_VALUE); + + status_t status = query->GetNextEntry(dirent,bufferSize); + if (status == B_OK) + *num = 1; + else if (status == B_ENTRY_NOT_FOUND) + *num = 0; + else + return status; + + return B_OK; +} + diff --git a/src/add-ons/kernel/file_systems/bfs/lock.h b/src/add-ons/kernel/file_systems/bfs/lock.h new file mode 100644 index 0000000000..b05adaa21b --- /dev/null +++ b/src/add-ons/kernel/file_systems/bfs/lock.h @@ -0,0 +1,47 @@ +/* + Copyright 1999-2001, Be Incorporated. All Rights Reserved. + This file may be used under the terms of the Be Sample Code License. 
+*/ + +#ifndef _LOCK_H +#define _LOCK_H + +#include + +#include + +#ifdef __cplusplus + extern "C" { +#else + typedef struct lock lock; + typedef struct mlock mlock; +#endif + + +struct lock { + sem_id s; + long c; +}; + +struct mlock { + sem_id s; +}; + +extern _IMPEXP_KERNEL int new_lock(lock *l, const char *name); +extern _IMPEXP_KERNEL int free_lock(lock *l); + +#define LOCK(l) if (atomic_add(&l.c, -1) <= 0) acquire_sem(l.s); +#define UNLOCK(l) if (atomic_add(&l.c, 1) < 0) release_sem(l.s); + +extern _IMPEXP_KERNEL int new_mlock(mlock *l, long c, const char *name); +extern _IMPEXP_KERNEL int free_mlock(mlock *l); + +#define LOCKM(l,cnt) acquire_sem_etc(l.s, cnt, 0, 0) +#define UNLOCKM(l,cnt) release_sem_etc(l.s, cnt, 0) + + +#ifdef __cplusplus + } // extern "C" +#endif + +#endif