kernel/vm: Completely replace mlock() implementation.

The old implementation used the real lock_memory(). This is problematic
and simply does not work, for a number of reasons:

1) Various parts of the kernel assume memory is locked only very
   temporarily, and will often wait for locked memory to become unlocked.
   The transient nature of locks is further demonstrated by the fact that
   lock_memory acquires references to structures, like the address space,
   which are only released by unlock_memory.

2) The VM has a hard assumption that all lock_memory calls will be
   exactly balanced, and maintains internal "WiredRange" structures
   on areas, etc., corresponding to the original lock_memory calls.
   Maintaining separate data structures, as this code did, is a recipe
   for even more problems when the two sets of structures are manipulated
   separately, leading to confusing or incorrect behavior on unlocks.

3) Areas with locked memory cannot be deleted, nor can the locked pages
   be removed from the areas/caches. This is of course most notable when
   destroying teams that have locked memory, but the problem also occurs
   when just using delete_area, resize_area, mmap/munmap, etc.

Because of (2) and especially (3), adding support for mlock()-like semantics
to the existing memory locking system is just not a good option. A further
reason is that our lock_memory is much stricter than mlock(), which only
requires that the pages in question remain resident in RAM and not be
swapped out (or, it seems, otherwise written back to disk).
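
To illustrate the semantics being targeted, here is a minimal, purely
illustrative userland sketch using the standard POSIX calls (not part of
this change); a page-aligned buffer is allocated since mlock()
implementations are permitted to require one:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    int
    main()
    {
        const size_t size = 16 * 4096;
        void* buffer = NULL;
        if (posix_memalign(&buffer, 4096, size) != 0)
            return 1;

        // Ask the kernel to keep these pages resident in RAM.
        if (mlock(buffer, size) != 0) {
            perror("mlock");
            return 1;
        }

        memset(buffer, 0xAB, size);
        // ... use the buffer while it cannot be swapped out ...

        // Allow the pages to be swapped out again.
        if (munlock(buffer, size) != 0)
            perror("munlock");

        free(buffer);
        return 0;
    }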

Thus, this commit completely removes the old implementation (which was
seriously broken anyway: among other things, it did not actually unlock
memory automatically on team exit or area destruction) and instead adds a
new feature to VMAnonymousCache that blocks specified pages from being
written out. The mlock()/munlock() syscalls then just invoke this to do
their work.
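
Conceptually, the new mechanism amounts to the anonymous cache keeping a
per-page "do not swap" bitmap and consulting it before a page may be
written back to swap. A simplified, standalone sketch of the idea follows
(hypothetical names and types; the real VMAnonymousCache code is in the
diff below):

    #include <cstddef>
    #include <vector>

    // Toy model of a swap-backed cache, one flag per page in the cache.
    class SwappableCache {
    public:
        // Mark pages [firstPage, firstPage + pageCount) as (non-)swappable.
        void SetCanSwapPages(std::size_t firstPage, std::size_t pageCount,
            bool canSwap)
        {
            if (fNoSwapPages.size() < firstPage + pageCount)
                fNoSwapPages.resize(firstPage + pageCount, false);
            for (std::size_t i = 0; i < pageCount; i++)
                fNoSwapPages[firstPage + i] = !canSwap;
        }

        // The page writer asks this before writing a page out to swap.
        bool CanWritePage(std::size_t pageIndex) const
        {
            if (pageIndex < fNoSwapPages.size() && fNoSwapPages[pageIndex])
                return false; // page is mlock()ed, keep it resident
            return true;
        }

    private:
        std::vector<bool> fNoSwapPages;
    };

mlock() walks the areas covering the requested range and sets the bits in
the corresponding caches; munlock() clears them again.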

Fixes #17674. Related to #13651.

Change-Id: Id2745c51796bcf9a74ba5325fe686a95623cd521
Reviewed-on: https://review.haiku-os.org/c/haiku/+/5147
Reviewed-by: waddlesplash <waddlesplash@gmail.com>
Author: Augustin Cavalier, 2022-03-29 20:09:36 -04:00 (committed by waddlesplash)
Parent: 88275138ba
Commit: c25f6f53b5
4 changed files with 119 additions and 228 deletions


@@ -66,9 +66,6 @@ struct select_info;
struct user_thread; // defined in libroot/user_thread.h
struct VMAddressSpace;
struct xsi_sem_context; // defined in xsi_semaphore.cpp
struct LockedPages;
typedef DoublyLinkedList<LockedPages> LockedPagesList;
namespace Scheduler {
struct ThreadData;
@@ -248,8 +245,6 @@ struct Team : TeamThreadIteratorEntry<team_id>, KernelReferenceable,
struct team_death_entry *death_entry; // protected by fLock
struct list dead_threads;
LockedPagesList locked_pages_list;
// protected by the team's fLock
team_dead_children dead_children;
team_job_control_children stopped_children;


@@ -47,6 +47,7 @@
#include <thread.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/Bitmap.h>
#include <util/DoublyLinkedList.h>
#include <util/OpenHashTable.h>
#include <util/RadixBitmap.h>
@@ -445,6 +446,9 @@ private:
VMAnonymousCache::~VMAnonymousCache()
{
delete fNoSwapPages;
fNoSwapPages = NULL;
_FreeSwapPageRange(virtual_base, virtual_end, false);
swap_space_unreserve(fCommittedSwapSize);
if (committed_size > fCommittedSwapSize)
@@ -468,6 +472,7 @@ VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
fCanOvercommit = canOvercommit;
fHasPrecommitted = false;
fPrecommittedPages = min_c(numPrecommittedPages, 255);
fNoSwapPages = NULL;
fGuardedSize = numGuardPages * B_PAGE_SIZE;
fCommittedSwapSize = 0;
fAllocatedSwapSize = 0;
@@ -476,6 +481,42 @@ VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
}
status_t
VMAnonymousCache::SetCanSwapPages(off_t base, size_t size, bool canSwap)
{
const page_num_t first = base >> PAGE_SHIFT;
const size_t count = PAGE_ALIGN(size + ((first << PAGE_SHIFT) - base)) >> PAGE_SHIFT;
if (count == 0)
return B_OK;
if (canSwap && fNoSwapPages == NULL)
return B_OK;
if (fNoSwapPages == NULL)
fNoSwapPages = new(std::nothrow) Bitmap(0);
if (fNoSwapPages == NULL)
return B_NO_MEMORY;
const page_num_t pageCount = PAGE_ALIGN(virtual_end) >> PAGE_SHIFT;
if (fNoSwapPages->Resize(pageCount) != B_OK)
return B_NO_MEMORY;
for (size_t i = 0; i < count; i++) {
if (canSwap)
fNoSwapPages->Clear(first + i);
else
fNoSwapPages->Set(first + i);
}
if (fNoSwapPages->GetHighestSet() < 0) {
delete fNoSwapPages;
fNoSwapPages = NULL;
}
return B_OK;
}
void
VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
bool skipBusyPages)
@@ -541,6 +582,11 @@ VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
status_t
VMAnonymousCache::Resize(off_t newSize, int priority)
{
if (fNoSwapPages != NULL) {
if (fNoSwapPages->Resize(PAGE_ALIGN(newSize) >> PAGE_SHIFT) != B_OK)
return B_NO_MEMORY;
}
_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
virtual_end + B_PAGE_SIZE - 1);
return VMCache::Resize(newSize, priority);
@@ -550,6 +596,11 @@ VMAnonymousCache::Resize(off_t newSize, int priority)
status_t
VMAnonymousCache::Rebase(off_t newBase, int priority)
{
if (fNoSwapPages != NULL) {
const ssize_t sizeDifference = (newBase >> PAGE_SHIFT) - (virtual_base >> PAGE_SHIFT);
fNoSwapPages->Shift(sizeDifference);
}
_FreeSwapPageRange(virtual_base, newBase);
return VMCache::Rebase(newBase, priority);
}
@@ -891,10 +942,14 @@ VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
bool
VMAnonymousCache::CanWritePage(off_t offset)
{
const off_t pageIndex = offset >> PAGE_SHIFT;
if (fNoSwapPages != NULL && fNoSwapPages->Get(pageIndex))
return false;
// We can write the page, if we have not used all of our committed swap
// space or the page already has a swap slot assigned.
return fAllocatedSwapSize < fCommittedSwapSize
|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
|| _SwapBlockGetAddress(pageIndex) != SWAP_SLOT_NONE;
}


@@ -19,6 +19,7 @@ typedef uint32 swap_addr_t;
// TODO: Should be wider, but RadixBitmap supports only a 32 bit type ATM!
struct swap_block;
struct system_memory_info;
namespace BKernel { class Bitmap; }
extern "C" {
@@ -39,6 +40,8 @@
int32 numGuardPages,
uint32 allocationFlags);
status_t SetCanSwapPages(off_t base, size_t size, bool canSwap);
virtual status_t Resize(off_t newSize, int priority);
virtual status_t Rebase(off_t newBase, int priority);
virtual status_t Adopt(VMCache* source, off_t offset,
@@ -102,6 +105,7 @@ private:
bool fHasPrecommitted;
uint8 fPrecommittedPages;
int32 fGuardedSize;
BKernel::Bitmap* fNoSwapPages;
off_t fCommittedSwapSize;
off_t fAllocatedSwapSize;
};


@@ -6925,52 +6925,10 @@ _user_get_memory_properties(team_id teamID, const void* address,
}
// An ordered list of non-overlapping ranges to track mlock/munlock locking.
// It is allowed to call mlock/munlock in unbalanced ways (lock a range
// multiple times, unlock a part of it, lock several consecutive ranges and
// unlock them in one go, etc). However the low level lock_memory and
// unlock_memory calls require the locks/unlocks to be balanced (you lock a
// fixed range, and then unlock exactly the same range). This list allows to
// keep track of what was locked exactly so we can unlock the correct things.
struct LockedPages : DoublyLinkedListLinkImpl<LockedPages> {
addr_t start;
addr_t end;
status_t LockMemory()
{
return lock_memory((void*)start, end - start, 0);
}
status_t UnlockMemory()
{
return unlock_memory((void*)start, end - start, 0);
}
status_t Move(addr_t start, addr_t end)
{
status_t result = lock_memory((void*)start, end - start, 0);
if (result != B_OK)
return result;
result = UnlockMemory();
if (result != B_OK) {
// What can we do if the unlock fails?
panic("Failed to unlock memory: %s", strerror(result));
return result;
}
this->start = start;
this->end = end;
return B_OK;
}
};
status_t
_user_mlock(const void* _address, size_t size)
static status_t
user_set_memory_swappable(const void* _address, size_t size, bool swappable)
{
#if ENABLE_SWAP_SUPPORT
// check address range
addr_t address = (addr_t)_address;
size = PAGE_ALIGN(size);
@@ -6980,198 +6938,77 @@ _user_mlock(const void* _address, size_t size)
if (!validate_user_memory_range(_address, size))
return EINVAL;
addr_t endAddress = address + size;
const addr_t endAddress = address + size;
// Pre-allocate a linked list element we may need (it's simpler to do it
now than run out of memory in the middle of changing things)
LockedPages* newRange = new(std::nothrow) LockedPages();
if (newRange == NULL)
return ENOMEM;
ObjectDeleter<LockedPages> newRangeDeleter(newRange);
// Get and lock the team
Team* team = thread_get_current_thread()->team;
TeamLocker teamLocker(team);
teamLocker.Lock();
status_t error = B_OK;
LockedPagesList* lockedPages = &team->locked_pages_list;
// Locate the first locked range possibly overlapping ours
LockedPages* currentRange = lockedPages->Head();
while (currentRange != NULL && currentRange->end <= address)
currentRange = lockedPages->GetNext(currentRange);
if (currentRange == NULL || currentRange->start >= endAddress) {
// No existing range is overlapping with ours. We can just lock our
// range and stop here.
newRange->start = address;
newRange->end = endAddress;
error = newRange->LockMemory();
if (error != B_OK)
return error;
lockedPages->InsertBefore(currentRange, newRange);
newRangeDeleter.Detach();
return B_OK;
}
// We get here when there is at least one existing overlapping range.
if (currentRange->start <= address) {
if (currentRange->end >= endAddress) {
// An existing range is already fully covering the pages we need to
// lock. Nothing to do then.
return B_OK;
} else {
// An existing range covers the start of the area we want to lock.
// Advance our start address to avoid it.
address = currentRange->end;
// Move on to the next range for the next step
currentRange = lockedPages->GetNext(currentRange);
}
}
// First, lock the new range
newRange->start = address;
newRange->end = endAddress;
error = newRange->LockMemory();
AddressSpaceReadLocker addressSpaceLocker;
status_t error = addressSpaceLocker.SetTo(team_get_current_team_id());
if (error != B_OK)
return error;
VMAddressSpace* addressSpace = addressSpaceLocker.AddressSpace();
// Unlock all ranges fully overlapping with the area we need to lock
while (currentRange != NULL && currentRange->end < endAddress) {
// The existing range is fully contained inside the new one we're
// trying to lock. Delete/unlock it, and replace it with a new one
// (this limits fragmentation of the range list, and is simpler to
// manage)
error = currentRange->UnlockMemory();
// iterate through all concerned areas
addr_t nextAddress = address;
while (nextAddress != endAddress) {
// get the next area
VMArea* area = addressSpace->LookupArea(nextAddress);
if (area == NULL) {
error = B_BAD_ADDRESS;
break;
}
const addr_t areaStart = nextAddress;
const addr_t areaEnd = std::min(endAddress, area->Base() + area->Size());
nextAddress = areaEnd;
error = lock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
if (error != B_OK) {
panic("Failed to unlock a memory range: %s", strerror(error));
newRange->UnlockMemory();
return error;
// We don't need to unset or reset things on failure.
break;
}
LockedPages* temp = currentRange;
currentRange = lockedPages->GetNext(currentRange);
lockedPages->Remove(temp);
delete temp;
VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
VMAnonymousCache* anonCache = NULL;
if (dynamic_cast<VMAnonymousNoSwapCache*>(area->cache) != NULL) {
// This memory will already never be swapped. Nothing to do.
} else if ((anonCache = dynamic_cast<VMAnonymousCache*>(area->cache)) != NULL) {
error = anonCache->SetCanSwapPages(areaStart - area->Base(),
areaEnd - areaStart, swappable);
} else {
// Some other cache type? We cannot affect anything here.
error = EINVAL;
}
cacheChainLocker.Unlock();
unlock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
if (error != B_OK)
break;
}
if (currentRange != NULL) {
// One last range may cover the end of the area we're trying to lock
if (currentRange->start == address) {
// In case two overlapping ranges (one at the start and the other
// at the end) already cover the area we're after, there's nothing
// more to do. So we destroy our new extra allocation
error = newRange->UnlockMemory();
return error;
}
if (currentRange->start < endAddress) {
// Make sure the last range is not overlapping, by moving its start
error = currentRange->Move(endAddress, currentRange->end);
if (error != B_OK) {
panic("Failed to move a memory range: %s", strerror(error));
newRange->UnlockMemory();
return error;
}
}
}
// Finally, store the new range in the locked list
lockedPages->InsertBefore(currentRange, newRange);
newRangeDeleter.Detach();
return error;
#else
// No swap support? Nothing to do.
return B_OK;
#endif
}
status_t
_user_mlock(const void* _address, size_t size)
{
return user_set_memory_swappable(_address, size, false);
}
status_t
_user_munlock(const void* _address, size_t size)
{
// check address range
addr_t address = (addr_t)_address;
size = PAGE_ALIGN(size);
if ((address % B_PAGE_SIZE) != 0)
return EINVAL;
if (!validate_user_memory_range(_address, size))
return EINVAL;
addr_t endAddress = address + size;
// Get and lock the team
Team* team = thread_get_current_thread()->team;
TeamLocker teamLocker(team);
teamLocker.Lock();
LockedPagesList* lockedPages = &team->locked_pages_list;
status_t error = B_OK;
// Locate the first locked range possibly overlapping ours
LockedPages* currentRange = lockedPages->Head();
while (currentRange != NULL && currentRange->end <= address)
currentRange = lockedPages->GetNext(currentRange);
if (currentRange == NULL || currentRange->start >= endAddress) {
// No range is intersecting, nothing to unlock
return B_OK;
}
if (currentRange->start < address) {
if (currentRange->end > endAddress) {
// There is a range fully covering the area we want to unlock,
// and it extends on both sides. We need to split it in two
LockedPages* newRange = new(std::nothrow) LockedPages();
if (newRange == NULL)
return ENOMEM;
newRange->start = endAddress;
newRange->end = currentRange->end;
error = newRange->LockMemory();
if (error != B_OK) {
delete newRange;
return error;
}
error = currentRange->Move(currentRange->start, address);
if (error != B_OK) {
delete newRange;
return error;
}
lockedPages->InsertAfter(currentRange, newRange);
return B_OK;
} else {
// There is a range that overlaps and extends before the one we
// want to unlock, we need to shrink it
error = currentRange->Move(currentRange->start, address);
if (error != B_OK)
return error;
}
}
while (currentRange != NULL && currentRange->end <= endAddress) {
// Unlock all fully overlapping ranges
error = currentRange->UnlockMemory();
if (error != B_OK)
return error;
LockedPages* temp = currentRange;
currentRange = lockedPages->GetNext(currentRange);
lockedPages->Remove(temp);
delete temp;
}
// Finally split the last partially overlapping range if any
if (currentRange != NULL && currentRange->start < endAddress) {
error = currentRange->Move(endAddress, currentRange->end);
if (error != B_OK)
return error;
}
return B_OK;
// TODO: B_SHARED_AREAs need to be handled a bit differently:
// if multiple clones of an area had mlock() called on them,
// munlock() must also be called on all of them to actually unlock.
// (At present, the first munlock() will unlock all.)
// TODO: fork() should automatically unlock memory in the child.
return user_set_memory_swappable(_address, size, true);
}