Axel Dörfler c0614f3291 Ported r13366 back from the R5 BFS to the Haiku BFS: fixed a bad bug in the
journaling code where the super block log data could have been wrong. Moved the doubly linked
list code to the kernel's util/DoublyLinkedList.h.
Also removed Journal::fCurrent, as it's not really used.


git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13367 a95241bf-73f2-0310-859d-f6bbb57e9c96
2005-06-30 16:38:37 +00:00
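A minimal sketch of the intrusive list pattern from the kernel's util/DoublyLinkedList.h that this
change switches to. Only the DoublyLinkedListLinkImpl base class and the Add()/Remove()/First()/
GetNext() calls are confirmed by the file below; the DoublyLinkedList class name and the fEntries
declaration are assumptions, since Journal.h is not shown here:

    #include <util/DoublyLinkedList.h>

    // the element type inherits its list link from DoublyLinkedListLinkImpl
    struct log_entry : public DoublyLinkedListLinkImpl<log_entry> {
        uint16 start;
        uint16 length;
    };

    // presumed member declaration in Journal.h
    DoublyLinkedList<log_entry> fEntries;

    // usage as seen in Journal.cpp below:
    //   fEntries.Add(entry); fEntries.First();
    //   fEntries.GetNext(entry); fEntries.Remove(entry);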


/* Journal - transaction and logging
 *
 * Copyright 2001-2005, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */


#include "Journal.h"
#include "Inode.h"
#include "Debug.h"

#include <Drivers.h>
#include <util/kernel_cpp.h>

#include <errno.h>


struct log_entry : public DoublyLinkedListLinkImpl<log_entry> {
    uint16      start;
    uint16      length;
    uint32      cached_blocks;
    Journal     *journal;
};


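// The constructor limits fMaxTransactionSize to a fraction of the log size,
// presumably so that a single log entry (including its block run array)
// always fits into the circular log.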
Journal::Journal(Volume *volume)
    :
    fVolume(volume),
    fLock("bfs journal"),
    fOwner(NULL),
    fArray(volume->BlockSize()),
    fLogSize(volume->Log().length),
    fMaxTransactionSize(fLogSize / 4 - 5),
    fUsed(0),
    fTransactionsInEntry(0)
{
    if (fMaxTransactionSize > fLogSize / 2)
        fMaxTransactionSize = fLogSize / 2 - 5;
}


Journal::~Journal()
{
    FlushLogAndBlocks();
}


status_t
Journal::InitCheck()
{
    if (fVolume->LogStart() != fVolume->LogEnd()) {
        if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
            FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));

        return ReplayLog();
    }

    return B_OK;
}


status_t
Journal::CheckLogEntry(int32 count, const off_t *array)
{
    // ToDo: check log entry integrity (block numbers and entry size)
    PRINT(("Log entry has %ld entries (%Ld)\n", count, array[0]));
    return B_OK;
}


/** Replays an entry in the log.
 *  \a _start points to the entry in the log, and will be bumped to the next
 *  one if replaying succeeded.
 */

status_t
Journal::ReplayLogEntry(int32 *_start)
{
    PRINT(("ReplayLogEntry(start = %ld)\n", *_start));

    off_t logOffset = fVolume->ToBlock(fVolume->Log());
    off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log());
    int32 blockSize = fVolume->BlockSize();
    int32 count = 1, valuesInBlock = blockSize / sizeof(off_t);
    int32 numArrayBlocks;
    off_t blockNumber = 0;
    bool first = true;

    CachedBlock cached(fVolume);
    while (count > 0) {
        const off_t *array = (const off_t *)cached.SetTo(arrayBlock);
        if (array == NULL)
            return B_IO_ERROR;

        int32 index = 0;
        if (first) {
            if (array[0] < 1 || array[0] >= fLogSize)
                return B_BAD_DATA;

            count = array[0];
            first = false;

            numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize;
            blockNumber = (*_start + numArrayBlocks) % fLogSize;
                // first real block in this log entry
            *_start += count;

            index++;
                // the first entry in the first block is the number
                // of blocks in that log entry
        }
        (*_start)++;

        if (CheckLogEntry(count, array + 1) < B_OK)
            return B_BAD_DATA;

        CachedBlock cachedCopy(fVolume);
        for (; index < valuesInBlock && count-- > 0; index++) {
            PRINT(("replay block %Ld in log at %Ld!\n", array[index], blockNumber));

            const uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber);
            if (copy == NULL)
                RETURN_ERROR(B_IO_ERROR);

            ssize_t written = write_pos(fVolume->Device(),
                array[index] << fVolume->BlockShift(), copy, blockSize);
            if (written != blockSize)
                RETURN_ERROR(B_IO_ERROR);

            blockNumber = (blockNumber + 1) % fLogSize;
        }
        arrayBlock++;
        if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize)
            arrayBlock = fVolume->ToBlock(fVolume->Log());
    }
    return B_OK;
}


/** Replays all log entries - this will put the disk into a
 *  consistent and clean state, if it was not correctly unmounted
 *  before.
 *  This method is called by Journal::InitCheck() if the log start
 *  and end pointer don't match.
 */

status_t
Journal::ReplayLog()
{
    INFORM(("Replay log, disk was not correctly unmounted...\n"));

    int32 start = fVolume->LogStart();
    int32 lastStart = -1;
    while (true) {
        // stop if the log is completely flushed
        if (start == fVolume->LogEnd())
            break;

        if (start == lastStart) {
            // strange, flushing the log hasn't changed the log_start pointer
            return B_ERROR;
        }
        lastStart = start;

        status_t status = ReplayLogEntry(&start);
        if (status < B_OK) {
            FATAL(("replaying log entry from %ld failed: %s\n", start, strerror(status)));
            return B_ERROR;
        }
        start = start % fLogSize;
    }

    PRINT(("replaying worked fine!\n"));
    fVolume->SuperBlock().log_start = fVolume->LogEnd();
    fVolume->LogStart() = fVolume->LogEnd();
    fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

    return fVolume->WriteSuperBlock();
}


/** This is a callback function that is called by the cache, whenever
 *  a block is flushed to disk that was updated as part of a transaction.
 *  This is necessary to keep track of completed transactions, to be
 *  able to update the log start pointer.
 */

void
Journal::blockNotify(int32 transactionID, void *arg)
{
    log_entry *logEntry = (log_entry *)arg;

    PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry, transactionID));

    Journal *journal = logEntry->journal;
    disk_super_block &superBlock = journal->fVolume->SuperBlock();
    bool update = false;

    // Set log_start pointer if possible...

    if (logEntry == journal->fEntries.First()) {
        log_entry *next = journal->fEntries.GetNext(logEntry);
        if (next != NULL) {
            int32 length = next->start - logEntry->start;
            superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize;
        } else
            superBlock.log_start = journal->fVolume->LogEnd();

        update = true;
    }

    journal->fEntriesLock.Lock();
    journal->fUsed -= logEntry->length;
    journal->fEntries.Remove(logEntry);
    journal->fEntriesLock.Unlock();

    free(logEntry);

    // update the super block, and change the disk's state, if necessary
    if (update) {
        journal->fVolume->LogStart() = superBlock.log_start;

        if (superBlock.log_start == superBlock.log_end)
            superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

        status_t status = journal->fVolume->WriteSuperBlock();
        if (status != B_OK) {
            FATAL(("blockNotify: could not write back super block: %s\n",
                strerror(status)));
        }
    }
}


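/** Writes the current transaction into the on-disk log: the changed block
 *  numbers are collected from the block cache into the run array, the array
 *  and the block data are written into the circular log area, a log_entry is
 *  queued so that blockNotify() can later advance the log start pointer, and
 *  the new log end is written back to the super block before the cache
 *  transaction is ended.
 */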
status_t
Journal::WriteLogEntry()
{
    fTransactionsInEntry = 0;
    fHasChangedBlocks = false;

    // insert all changed blocks into the log array
    uint32 cookie = 0;
    {
        off_t blockNumber;
        while (cache_next_block_in_transaction(fVolume->BlockCache(), fTransactionID, &cookie,
                &blockNumber, NULL, NULL) == B_OK) {
            fArray.Insert(blockNumber);
        }
    }

    sorted_array *array = fArray.Array();
    if (array == NULL || array->count == 0) {
        // nothing has changed during this transaction
        cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL, NULL);
        return B_OK;
    }

    // Make sure there is enough space in the log.
    // If that fails for whatever reason, panic!
    // ToDo:
/*  force_cache_flush(fVolume->Device(), false);
    int32 tries = fLogSize / 2 + 1;
    while (TransactionSize() > FreeLogBlocks() && tries-- > 0)
        force_cache_flush(fVolume->Device(), true);

    if (tries <= 0) {
        fVolume->Panic();
        return B_BAD_DATA;
    }
*/
    int32 blockShift = fVolume->BlockShift();
    off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
    off_t logStart = fVolume->LogEnd();
    off_t logPosition = logStart % fLogSize;

    // Write disk block array

    uint8 *arrayBlock = (uint8 *)array;

    // ToDo: the single writes should be combined!
    for (int32 size = fArray.BlocksUsed(); size-- > 0;) {
        write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
            arrayBlock, fVolume->BlockSize());

        logPosition = (logPosition + 1) % fLogSize;
        arrayBlock += fVolume->BlockSize();
    }

    // Write logged blocks into the log

    for (int32 i = 0; i < array->count; i++) {
        const uint8 *block = (const uint8 *)block_cache_get(fVolume->BlockCache(), array->values[i]);
        if (block == NULL) {
            FATAL(("Could not get block %Ld\n", array->values[i]));
            continue;
        }

        // ToDo: combine blocks whenever possible (using iovecs)!
        write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
            block, fVolume->BlockSize());

        block_cache_put(fVolume->BlockCache(), array->values[i]);

        logPosition = (logPosition + 1) % fLogSize;
    }

    // create and add log entry

    log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry));
    if (logEntry == NULL) {
        DIE(("Could not create next log entry (out of memory)\n"));
        return B_NO_MEMORY;
    }

    logEntry->start = logStart;
    logEntry->length = TransactionSize();
    logEntry->journal = this;

    fEntriesLock.Lock();
    fEntries.Add(logEntry);
    fUsed += logEntry->length;
    fEntriesLock.Unlock();

    // Update the log end pointer in the super block

    fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
    fVolume->SuperBlock().log_end = logPosition;
    fVolume->LogEnd() = logPosition;

    status_t status = fVolume->WriteSuperBlock();

    // We need to flush the drive's own cache here to ensure
    // disk consistency.
    // If that call fails, we can't do anything about it anyway.
    ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

    // At this point, we can finally end the transaction - we're in
    // a guaranteed valid state.
    cache_end_transaction(fVolume->BlockCache(), fTransactionID, blockNotify, logEntry);

    fArray.MakeEmpty();

    // If the log goes to the next round (the log is written as a
    // circular buffer), all blocks will be flushed out, which is
    // possible because we don't have any locked blocks at this
    // point.
    if (logPosition < logStart)
        fVolume->FlushDevice();

    return status;
}


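/** Writes the current log entry (if there is an active transaction with
 *  changed blocks) and then flushes the underlying device, so that both the
 *  log and the data blocks end up on disk. Does nothing if it is called from
 *  inside a running transaction.
 */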
status_t
Journal::FlushLogAndBlocks()
{
    status_t status = fLock.Lock();
    if (status != B_OK)
        return status;

    if (fLock.OwnerCount() > 1) {
        // whoa, FlushLogAndBlocks() was called from inside a transaction
        fLock.Unlock();
        return B_OK;
    }

    // write the current log entry to disk
    if (fTransactionID != -1 && TransactionSize() != 0) {
        status = WriteLogEntry();
        if (status < B_OK)
            FATAL(("writing current log entry failed: %s\n", strerror(status)));
    }
    status = fVolume->FlushDevice();

    fLock.Unlock();
    return status;
}


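/** Starts a new cache transaction on behalf of the given Transaction object,
 *  or, if the (recursive) journal lock is already held, simply joins the
 *  transaction that is currently running.
 */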
status_t
Journal::Lock(Transaction *owner)
{
    status_t status = fLock.Lock();
    if (status != B_OK)
        return status;

    /* ToDo:
    // if the last transaction is older than 2 secs, start a new one
    if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L)
        WriteLogEntry();
    */

    if (fLock.OwnerCount() > 1) {
        // we'll just use the current transaction again
        return B_OK;
    }

    fOwner = owner;

    fTransactionID = cache_start_transaction(fVolume->BlockCache());
    if (fTransactionID < B_OK) {
        fLock.Unlock();
        return fTransactionID;
    }
    return B_OK;
}


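/** Releases the journal lock. When the outermost owner unlocks, the running
 *  transaction is finished via TransactionDone() and the journal is reset
 *  for the next transaction.
 */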
void
Journal::Unlock(Transaction *owner, bool success)
{
    if (fLock.OwnerCount() == 1) {
        // we only end the transaction if we would really unlock it
        // ToDo: what about failing transactions that do not unlock?
        TransactionDone(success);

        fTransactionID = -1;
        fTimestamp = system_time();
        fOwner = NULL;
    }
    fLock.Unlock();
}


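/** Ends the current transaction: on failure the cache transaction is aborted
 *  and the block array is emptied, on success the log entry is written to
 *  disk. Batching several transactions into one log entry is still a ToDo,
 *  see the commented out code below.
 */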
status_t
Journal::TransactionDone(bool success)
{
    if (!success) {
        fArray.MakeEmpty();

        cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
        return B_OK;
    }

    // ToDo:
/*
    if (!success && fTransactionsInEntry == 0) {
        // we can safely abort the transaction
        sorted_array *array = fArray.Array();
        if (array != NULL) {
            // release the lock for all blocks in the array (we don't need
            // to be notified when they are actually written to disk)
            for (int32 i = 0; i < array->count; i++)
                release_block(fVolume->Device(), array->values[i]);
        }
        return B_OK;
    }

    // Up to a maximum size, we will just batch several
    // transactions together to improve speed
    if (TransactionSize() < fMaxTransactionSize) {
        fTransactionsInEntry++;
        fHasChangedBlocks = false;

        return B_OK;
    }
*/
    return WriteLogEntry();
}


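/** Apparently a leftover from the old cache API: with the block cache, the
 *  changed blocks are picked up directly from the transaction in
 *  WriteLogEntry(), so this method is not supposed to be called anymore,
 *  hence the panic() below.
 */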
status_t
Journal::LogBlocks(off_t blockNumber, const uint8 *buffer, size_t numBlocks)
{
    panic("LogBlocks() called!\n");

    // ToDo: that's for now - we should change the log file size here
    if (TransactionSize() + numBlocks + 1 > fLogSize)
        return B_DEVICE_FULL;

    fHasChangedBlocks = true;
    int32 blockSize = fVolume->BlockSize();

    for (; numBlocks-- > 0; blockNumber++, buffer += blockSize) {
        if (fArray.Find(blockNumber) >= 0) {
            // The block is already in the log, so just update its data
            // Note, this is only necessary if this method is called with a buffer
            // different from the cached block buffer - which is unlikely but
            // we'll make sure this way (costs one cache lookup, though).
            // ToDo:
/*          status_t status = cached_write(fVolume->Device(), blockNumber, buffer, 1, blockSize);
            if (status < B_OK)
                return status;
*/
            continue;
        }

        // Insert the block into the transaction's array, and write the changes
        // back into the locked cache buffer
        fArray.Insert(blockNumber);

        // ToDo:
/*      status_t status = cached_write_locked(fVolume->Device(), blockNumber, buffer, 1, blockSize);
        if (status < B_OK)
            return status;
*/
    }

    // ToDo:
    // If necessary, flush the log, so that we have enough space for this transaction
/*  if (TransactionSize() > FreeLogBlocks())
        force_cache_flush(fVolume->Device(), true);
*/
    return B_OK;
}


//  #pragma mark -


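/** Binds this Transaction to the volume's journal (as returned by
 *  Volume::GetJournal() for the given reference block) and locks it. If the
 *  journal cannot be obtained or locked, B_ERROR is returned and the
 *  Transaction stays unstarted.
 */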
status_t
Transaction::Start(Volume *volume, off_t refBlock)
{
    // has it already been started?
    if (fJournal != NULL)
        return B_OK;

    fJournal = volume->GetJournal(refBlock);
    if (fJournal != NULL && fJournal->Lock(this) == B_OK)
        return B_OK;

    fJournal = NULL;
    return B_ERROR;
}