Axel Dörfler c0614f3291 Ported r13366 back from the R5 BFS to the Haiku BFS: fixed a bad bug in the
journaling code where the super block log data could have been wrong. Moved the doubly linked
list code to the kernel's util/DoublyLinkedList.h.
Also removed Journal::fCurrent, as it's not really used.


git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@13367 a95241bf-73f2-0310-859d-f6bbb57e9c96
2005-06-30 16:38:37 +00:00
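A minimal sketch of the intrusive list pattern from the kernel's util/DoublyLinkedList.h that this
change switches to. Only the DoublyLinkedListLinkImpl base class and the Add()/Remove()/First()/
GetNext() calls are confirmed by the file below; the DoublyLinkedList class name and the fEntries
declaration are assumptions, since Journal.h is not shown here:

    #include <util/DoublyLinkedList.h>

    // the element type inherits its list link from DoublyLinkedListLinkImpl
    struct log_entry : public DoublyLinkedListLinkImpl<log_entry> {
        uint16 start;
        uint16 length;
    };

    // presumed member declaration in Journal.h
    DoublyLinkedList<log_entry> fEntries;

    // usage as seen in Journal.cpp below:
    //   fEntries.Add(entry); fEntries.First();
    //   fEntries.GetNext(entry); fEntries.Remove(entry);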


/* Journal - transaction and logging
 *
 * Copyright 2001-2005, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */


#include "Journal.h"
#include "Inode.h"
#include "Debug.h"

#include <Drivers.h>
#include <util/kernel_cpp.h>

#include <errno.h>


struct log_entry : public DoublyLinkedListLinkImpl<log_entry> {
    uint16      start;
    uint16      length;
    uint32      cached_blocks;
    Journal     *journal;
};


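// The constructor limits fMaxTransactionSize to a fraction of the log size,
// presumably so that a single log entry (including its block run array)
// always fits into the circular log.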
Journal::Journal(Volume *volume)
    :
    fVolume(volume),
    fLock("bfs journal"),
    fOwner(NULL),
    fArray(volume->BlockSize()),
    fLogSize(volume->Log().length),
    fMaxTransactionSize(fLogSize / 4 - 5),
    fUsed(0),
    fTransactionsInEntry(0)
{
    if (fMaxTransactionSize > fLogSize / 2)
        fMaxTransactionSize = fLogSize / 2 - 5;
}


Journal::~Journal()
{
    FlushLogAndBlocks();
}


status_t
Journal::InitCheck()
{
    if (fVolume->LogStart() != fVolume->LogEnd()) {
        if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
            FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));

        return ReplayLog();
    }

    return B_OK;
}


status_t
Journal::CheckLogEntry(int32 count, const off_t *array)
{
    // ToDo: check log entry integrity (block numbers and entry size)
    PRINT(("Log entry has %ld entries (%Ld)\n", count, array[0]));
    return B_OK;
}


/** Replays an entry in the log.
 *  \a _start points to the entry in the log, and will be bumped to the next
 *  one if replaying succeeded.
 */

status_t
Journal::ReplayLogEntry(int32 *_start)
{
    PRINT(("ReplayLogEntry(start = %ld)\n", *_start));

    off_t logOffset = fVolume->ToBlock(fVolume->Log());
    off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log());
    int32 blockSize = fVolume->BlockSize();
    int32 count = 1, valuesInBlock = blockSize / sizeof(off_t);
    int32 numArrayBlocks;
    off_t blockNumber = 0;
    bool first = true;

    CachedBlock cached(fVolume);
    while (count > 0) {
        const off_t *array = (const off_t *)cached.SetTo(arrayBlock);
        if (array == NULL)
            return B_IO_ERROR;

        int32 index = 0;
        if (first) {
            if (array[0] < 1 || array[0] >= fLogSize)
                return B_BAD_DATA;

            count = array[0];
            first = false;

            numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize;
            blockNumber = (*_start + numArrayBlocks) % fLogSize;
                // first real block in this log entry
            *_start += count;

            index++;
                // the first entry in the first block is the number
                // of blocks in that log entry
        }
        (*_start)++;

        if (CheckLogEntry(count, array + 1) < B_OK)
            return B_BAD_DATA;

        CachedBlock cachedCopy(fVolume);
        for (; index < valuesInBlock && count-- > 0; index++) {
            PRINT(("replay block %Ld in log at %Ld!\n", array[index], blockNumber));

            const uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber);
            if (copy == NULL)
                RETURN_ERROR(B_IO_ERROR);

            ssize_t written = write_pos(fVolume->Device(),
                array[index] << fVolume->BlockShift(), copy, blockSize);
            if (written != blockSize)
                RETURN_ERROR(B_IO_ERROR);

            blockNumber = (blockNumber + 1) % fLogSize;
        }
        arrayBlock++;
        if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize)
            arrayBlock = fVolume->ToBlock(fVolume->Log());
    }
    return B_OK;
}


/** Replays all log entries - this will put the disk into a
 *  consistent and clean state, if it was not correctly unmounted
 *  before.
 *  This method is called by Journal::InitCheck() if the log start
 *  and end pointer don't match.
 */

status_t
Journal::ReplayLog()
{
    INFORM(("Replay log, disk was not correctly unmounted...\n"));

    int32 start = fVolume->LogStart();
    int32 lastStart = -1;
    while (true) {
        // stop if the log is completely flushed
        if (start == fVolume->LogEnd())
            break;

        if (start == lastStart) {
            // strange, flushing the log hasn't changed the log_start pointer
            return B_ERROR;
        }
        lastStart = start;

        status_t status = ReplayLogEntry(&start);
        if (status < B_OK) {
            FATAL(("replaying log entry from %ld failed: %s\n", start, strerror(status)));
            return B_ERROR;
        }
        start = start % fLogSize;
    }

    PRINT(("replaying worked fine!\n"));
    fVolume->SuperBlock().log_start = fVolume->LogEnd();
    fVolume->LogStart() = fVolume->LogEnd();
    fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

    return fVolume->WriteSuperBlock();
}


/** This is a callback function that is called by the cache, whenever
 *  a block is flushed to disk that was updated as part of a transaction.
 *  This is necessary to keep track of completed transactions, to be
 *  able to update the log start pointer.
 */

void
Journal::blockNotify(int32 transactionID, void *arg)
{
    log_entry *logEntry = (log_entry *)arg;

    PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry, transactionID));

    Journal *journal = logEntry->journal;
    disk_super_block &superBlock = journal->fVolume->SuperBlock();
    bool update = false;

    // Set log_start pointer if possible...

    if (logEntry == journal->fEntries.First()) {
        log_entry *next = journal->fEntries.GetNext(logEntry);
        if (next != NULL) {
            int32 length = next->start - logEntry->start;
            superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize;
        } else
            superBlock.log_start = journal->fVolume->LogEnd();

        update = true;
    }

    journal->fEntriesLock.Lock();
    journal->fUsed -= logEntry->length;
    journal->fEntries.Remove(logEntry);
    journal->fEntriesLock.Unlock();

    free(logEntry);

    // update the super block, and change the disk's state, if necessary
    if (update) {
        journal->fVolume->LogStart() = superBlock.log_start;

        if (superBlock.log_start == superBlock.log_end)
            superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

        status_t status = journal->fVolume->WriteSuperBlock();
        if (status != B_OK) {
            FATAL(("blockNotify: could not write back super block: %s\n",
                strerror(status)));
        }
    }
}


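/** Writes the current transaction into the on-disk log: the changed block
 *  numbers are collected from the block cache into the run array, the array
 *  and the block data are written into the circular log area, a log_entry is
 *  queued so that blockNotify() can later advance the log start pointer, and
 *  the new log end is written back to the super block before the cache
 *  transaction is ended.
 */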
status_t
Journal::WriteLogEntry()
{
    fTransactionsInEntry = 0;
    fHasChangedBlocks = false;

    // insert all changed blocks into the log array
    uint32 cookie = 0;
    {
        off_t blockNumber;
        while (cache_next_block_in_transaction(fVolume->BlockCache(), fTransactionID, &cookie,
                &blockNumber, NULL, NULL) == B_OK) {
            fArray.Insert(blockNumber);
        }
    }

    sorted_array *array = fArray.Array();
    if (array == NULL || array->count == 0) {
        // nothing has changed during this transaction
        cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL, NULL);
        return B_OK;
    }

    // Make sure there is enough space in the log.
    // If that fails for whatever reason, panic!
    // ToDo:
/*  force_cache_flush(fVolume->Device(), false);
    int32 tries = fLogSize / 2 + 1;
    while (TransactionSize() > FreeLogBlocks() && tries-- > 0)
        force_cache_flush(fVolume->Device(), true);

    if (tries <= 0) {
        fVolume->Panic();
        return B_BAD_DATA;
    }
*/
    int32 blockShift = fVolume->BlockShift();
    off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
    off_t logStart = fVolume->LogEnd();
    off_t logPosition = logStart % fLogSize;

    // Write disk block array

    uint8 *arrayBlock = (uint8 *)array;

    // ToDo: the single writes should be combined!
    for (int32 size = fArray.BlocksUsed(); size-- > 0;) {
        write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
            arrayBlock, fVolume->BlockSize());

        logPosition = (logPosition + 1) % fLogSize;
        arrayBlock += fVolume->BlockSize();
    }

    // Write logged blocks into the log

    for (int32 i = 0; i < array->count; i++) {
        const uint8 *block = (const uint8 *)block_cache_get(fVolume->BlockCache(), array->values[i]);
        if (block == NULL) {
            FATAL(("Could not get block %Ld\n", array->values[i]));
            continue;
        }

        // ToDo: combine blocks whenever possible (using iovecs)!
        write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
            block, fVolume->BlockSize());

        block_cache_put(fVolume->BlockCache(), array->values[i]);

        logPosition = (logPosition + 1) % fLogSize;
    }

    // create and add log entry

    log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry));
    if (logEntry == NULL) {
        DIE(("Could not create next log entry (out of memory)\n"));
        return B_NO_MEMORY;
    }

    logEntry->start = logStart;
    logEntry->length = TransactionSize();
    logEntry->journal = this;

    fEntriesLock.Lock();
    fEntries.Add(logEntry);
    fUsed += logEntry->length;
    fEntriesLock.Unlock();

    // Update the log end pointer in the super block

    fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
    fVolume->SuperBlock().log_end = logPosition;
    fVolume->LogEnd() = logPosition;

    status_t status = fVolume->WriteSuperBlock();

    // We need to flush the drive's own cache here to ensure
    // disk consistency.
    // If that call fails, we can't do anything about it anyway.
    ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

    // At this point, we can finally end the transaction - we're in
    // a guaranteed valid state.
    cache_end_transaction(fVolume->BlockCache(), fTransactionID, blockNotify, logEntry);

    fArray.MakeEmpty();

    // If the log goes to the next round (the log is written as a
    // circular buffer), all blocks will be flushed out, which is
    // possible because we don't have any locked blocks at this
    // point.
    if (logPosition < logStart)
        fVolume->FlushDevice();

    return status;
}


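/** Writes the current log entry (if there is an active transaction with
 *  changed blocks) and then flushes the underlying device, so that both the
 *  log and the data blocks end up on disk. Does nothing if it is called from
 *  inside a running transaction.
 */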
status_t
Journal::FlushLogAndBlocks()
{
    status_t status = fLock.Lock();
    if (status != B_OK)
        return status;

    if (fLock.OwnerCount() > 1) {
        // whoa, FlushLogAndBlocks() was called from inside a transaction
        fLock.Unlock();
        return B_OK;
    }

    // write the current log entry to disk
    if (fTransactionID != -1 && TransactionSize() != 0) {
        status = WriteLogEntry();
        if (status < B_OK)
            FATAL(("writing current log entry failed: %s\n", strerror(status)));
    }
    status = fVolume->FlushDevice();

    fLock.Unlock();
    return status;
}


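/** Starts a new cache transaction on behalf of the given Transaction object,
 *  or, if the (recursive) journal lock is already held, simply joins the
 *  transaction that is currently running.
 */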
status_t
Journal::Lock(Transaction *owner)
{
    status_t status = fLock.Lock();
    if (status != B_OK)
        return status;

    /* ToDo:
    // if the last transaction is older than 2 secs, start a new one
    if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L)
        WriteLogEntry();
    */

    if (fLock.OwnerCount() > 1) {
        // we'll just use the current transaction again
        return B_OK;
    }

    fOwner = owner;

    fTransactionID = cache_start_transaction(fVolume->BlockCache());
    if (fTransactionID < B_OK) {
        fLock.Unlock();
        return fTransactionID;
    }
    return B_OK;
}


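/** Releases the journal lock. When the outermost owner unlocks, the running
 *  transaction is finished via TransactionDone() and the journal is reset
 *  for the next transaction.
 */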
void
Journal::Unlock(Transaction *owner, bool success)
{
    if (fLock.OwnerCount() == 1) {
        // we only end the transaction if we would really unlock it
        // ToDo: what about failing transactions that do not unlock?
        TransactionDone(success);

        fTransactionID = -1;
        fTimestamp = system_time();
        fOwner = NULL;
    }
    fLock.Unlock();
}


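/** Ends the current transaction: on failure the cache transaction is aborted
 *  and the block array is emptied, on success the log entry is written to
 *  disk. Batching several transactions into one log entry is still a ToDo,
 *  see the commented out code below.
 */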
status_t
Journal::TransactionDone(bool success)
{
    if (!success) {
        fArray.MakeEmpty();

        cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
        return B_OK;
    }

    // ToDo:
/*
    if (!success && fTransactionsInEntry == 0) {
        // we can safely abort the transaction
        sorted_array *array = fArray.Array();
        if (array != NULL) {
            // release the lock for all blocks in the array (we don't need
            // to be notified when they are actually written to disk)
            for (int32 i = 0; i < array->count; i++)
                release_block(fVolume->Device(), array->values[i]);
        }
        return B_OK;
    }

    // Up to a maximum size, we will just batch several
    // transactions together to improve speed
    if (TransactionSize() < fMaxTransactionSize) {
        fTransactionsInEntry++;
        fHasChangedBlocks = false;

        return B_OK;
    }
*/
    return WriteLogEntry();
}


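/** Apparently a leftover from the old cache API: with the block cache, the
 *  changed blocks are picked up directly from the transaction in
 *  WriteLogEntry(), so this method is not supposed to be called anymore,
 *  hence the panic() below.
 */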
status_t
Journal::LogBlocks(off_t blockNumber, const uint8 *buffer, size_t numBlocks)
{
    panic("LogBlocks() called!\n");

    // ToDo: that's for now - we should change the log file size here
    if (TransactionSize() + numBlocks + 1 > fLogSize)
        return B_DEVICE_FULL;

    fHasChangedBlocks = true;
    int32 blockSize = fVolume->BlockSize();

    for (; numBlocks-- > 0; blockNumber++, buffer += blockSize) {
        if (fArray.Find(blockNumber) >= 0) {
            // The block is already in the log, so just update its data
            // Note, this is only necessary if this method is called with a buffer
            // different from the cached block buffer - which is unlikely but
            // we'll make sure this way (costs one cache lookup, though).
            // ToDo:
/*          status_t status = cached_write(fVolume->Device(), blockNumber, buffer, 1, blockSize);
            if (status < B_OK)
                return status;
*/
            continue;
        }

        // Insert the block into the transaction's array, and write the changes
        // back into the locked cache buffer
        fArray.Insert(blockNumber);

        // ToDo:
/*      status_t status = cached_write_locked(fVolume->Device(), blockNumber, buffer, 1, blockSize);
        if (status < B_OK)
            return status;
*/
    }

    // ToDo:
    // If necessary, flush the log, so that we have enough space for this transaction
/*  if (TransactionSize() > FreeLogBlocks())
        force_cache_flush(fVolume->Device(), true);
*/
    return B_OK;
}


//  #pragma mark -


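/** Binds this Transaction to the volume's journal (as returned by
 *  Volume::GetJournal() for the given reference block) and locks it. If the
 *  journal cannot be obtained or locked, B_ERROR is returned and the
 *  Transaction stays unstarted.
 */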
status_t
Transaction::Start(Volume *volume, off_t refBlock)
{
    // has it already been started?
    if (fJournal != NULL)
        return B_OK;

    fJournal = volume->GetJournal(refBlock);
    if (fJournal != NULL && fJournal->Lock(this) == B_OK)
        return B_OK;

    fJournal = NULL;
    return B_ERROR;
}