kernel/vm: Completely replace mlock() implementation.

The old implementation used the real lock_memory(). This is problematic
and simply does not work, for a number of reasons:

1) Various parts of the kernel assume memory is locked only very
   temporarily, and will often wait for locked memory to become unlocked.
   The transient nature of locks is further demonstrated by the fact that
   lock_memory acquires references to structures, like the address space,
   which are only released by unlock_memory.

2) The VM has a hard assumption that all lock_memory calls will be
   exactly balanced, and maintains internal "WiredRange" structures
   on areas, etc., corresponding to the original lock_memory calls.
   Maintaining separate data structures, as this code did, is a recipe
   for even more problems when the two sets of structures are manipulated
   separately, leading to confusing or incorrect behavior on unlocks.

3) Areas with locked memory cannot be deleted, nor can the locked pages
   be removed from the areas/caches. This is of course most notable when
   destroying teams that have locked memory, but the problem also occurs
   when just using delete_area, resize_area, mmap/munmap, etc.

Because of (2) and especially (3), adding support for mlock()-like semantics
to the existing memory locking system is just not a good option. A further
reason is that our lock_memory is much stricter than mlock(), which only
requires that the pages in question remain resident in RAM and not be
swapped out (or, it seems, otherwise written back to disk).
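
To illustrate the semantics being targeted, here is a minimal, purely
illustrative userland sketch using the standard POSIX calls (not part of
this change); a page-aligned buffer is allocated since mlock()
implementations are permitted to require one:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>

    int
    main()
    {
        const size_t size = 16 * 4096;
        void* buffer = NULL;
        if (posix_memalign(&buffer, 4096, size) != 0)
            return 1;

        // Ask the kernel to keep these pages resident in RAM.
        if (mlock(buffer, size) != 0) {
            perror("mlock");
            return 1;
        }

        memset(buffer, 0xAB, size);
        // ... use the buffer while it cannot be swapped out ...

        // Allow the pages to be swapped out again.
        if (munlock(buffer, size) != 0)
            perror("munlock");

        free(buffer);
        return 0;
    }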

Thus, this commit completely removes the old implementation (which was
seriously broken anyway: among other things, it did not actually unlock
memory automatically on team exit or area destruction) and instead adds a
new feature to VMAnonymousCache that blocks specified pages from being
written out. The mlock()/munlock() syscalls then just invoke this to do
their work.
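
Conceptually, the new mechanism amounts to the anonymous cache keeping a
per-page "do not swap" bitmap and consulting it before a page may be
written back to swap. A simplified, standalone sketch of the idea follows
(hypothetical names and types; the real VMAnonymousCache code is in the
diff below):

    #include <cstddef>
    #include <vector>

    // Toy model of a swap-backed cache, one flag per page in the cache.
    class SwappableCache {
    public:
        // Mark pages [firstPage, firstPage + pageCount) as (non-)swappable.
        void SetCanSwapPages(std::size_t firstPage, std::size_t pageCount,
            bool canSwap)
        {
            if (fNoSwapPages.size() < firstPage + pageCount)
                fNoSwapPages.resize(firstPage + pageCount, false);
            for (std::size_t i = 0; i < pageCount; i++)
                fNoSwapPages[firstPage + i] = !canSwap;
        }

        // The page writer asks this before writing a page out to swap.
        bool CanWritePage(std::size_t pageIndex) const
        {
            if (pageIndex < fNoSwapPages.size() && fNoSwapPages[pageIndex])
                return false; // page is mlock()ed, keep it resident
            return true;
        }

    private:
        std::vector<bool> fNoSwapPages;
    };

mlock() walks the areas covering the requested range and sets the bits in
the corresponding caches; munlock() clears them again.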

Fixes #17674. Related to #13651.

Change-Id: Id2745c51796bcf9a74ba5325fe686a95623cd521
Reviewed-on: https://review.haiku-os.org/c/haiku/+/5147
Reviewed-by: waddlesplash <waddlesplash@gmail.com>
Author: Augustin Cavalier, 2022-03-29 20:09:36 -04:00 (committed by waddlesplash)
Parent: 88275138ba
Commit: c25f6f53b5
4 changed files with 119 additions and 228 deletions


@@ -66,9 +66,6 @@ struct select_info;
struct user_thread; // defined in libroot/user_thread.h
struct VMAddressSpace;
struct xsi_sem_context; // defined in xsi_semaphore.cpp
struct LockedPages;
typedef DoublyLinkedList<LockedPages> LockedPagesList;
namespace Scheduler {
struct ThreadData;
@@ -248,8 +245,6 @@ struct Team : TeamThreadIteratorEntry<team_id>, KernelReferenceable,
struct team_death_entry *death_entry; // protected by fLock
struct list dead_threads;
LockedPagesList locked_pages_list;
// protected by the team's fLock
team_dead_children dead_children;
team_job_control_children stopped_children;


@@ -47,6 +47,7 @@
#include <thread.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/Bitmap.h>
#include <util/DoublyLinkedList.h>
#include <util/OpenHashTable.h>
#include <util/RadixBitmap.h>
@@ -445,6 +446,9 @@ private:
VMAnonymousCache::~VMAnonymousCache()
{
delete fNoSwapPages;
fNoSwapPages = NULL;
_FreeSwapPageRange(virtual_base, virtual_end, false);
swap_space_unreserve(fCommittedSwapSize);
if (committed_size > fCommittedSwapSize)
@@ -468,6 +472,7 @@ VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
fCanOvercommit = canOvercommit;
fHasPrecommitted = false;
fPrecommittedPages = min_c(numPrecommittedPages, 255);
fNoSwapPages = NULL;
fGuardedSize = numGuardPages * B_PAGE_SIZE;
fCommittedSwapSize = 0;
fAllocatedSwapSize = 0;
@@ -476,6 +481,42 @@ VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
}
status_t
VMAnonymousCache::SetCanSwapPages(off_t base, size_t size, bool canSwap)
{
const page_num_t first = base >> PAGE_SHIFT;
const size_t count = PAGE_ALIGN(size + ((first << PAGE_SHIFT) - base)) >> PAGE_SHIFT;
if (count == 0)
return B_OK;
if (canSwap && fNoSwapPages == NULL)
return B_OK;
if (fNoSwapPages == NULL)
fNoSwapPages = new(std::nothrow) Bitmap(0);
if (fNoSwapPages == NULL)
return B_NO_MEMORY;
const page_num_t pageCount = PAGE_ALIGN(virtual_end) >> PAGE_SHIFT;
if (fNoSwapPages->Resize(pageCount) != B_OK)
return B_NO_MEMORY;
for (size_t i = 0; i < count; i++) {
if (canSwap)
fNoSwapPages->Clear(first + i);
else
fNoSwapPages->Set(first + i);
}
if (fNoSwapPages->GetHighestSet() < 0) {
delete fNoSwapPages;
fNoSwapPages = NULL;
}
return B_OK;
}
void
VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
bool skipBusyPages)
@@ -541,6 +582,11 @@ VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
status_t
VMAnonymousCache::Resize(off_t newSize, int priority)
{
if (fNoSwapPages != NULL) {
if (fNoSwapPages->Resize(PAGE_ALIGN(newSize) >> PAGE_SHIFT) != B_OK)
return B_NO_MEMORY;
}
_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
virtual_end + B_PAGE_SIZE - 1);
return VMCache::Resize(newSize, priority);
@@ -550,6 +596,11 @@ VMAnonymousCache::Resize(off_t newSize, int priority)
status_t
VMAnonymousCache::Rebase(off_t newBase, int priority)
{
if (fNoSwapPages != NULL) {
const ssize_t sizeDifference = (newBase >> PAGE_SHIFT) - (virtual_base >> PAGE_SHIFT);
fNoSwapPages->Shift(sizeDifference);
}
_FreeSwapPageRange(virtual_base, newBase);
return VMCache::Rebase(newBase, priority);
}
@@ -891,10 +942,14 @@ VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
bool
VMAnonymousCache::CanWritePage(off_t offset)
{
const off_t pageIndex = offset >> PAGE_SHIFT;
if (fNoSwapPages != NULL && fNoSwapPages->Get(pageIndex))
return false;
// We can write the page, if we have not used all of our committed swap
// space or the page already has a swap slot assigned.
return fAllocatedSwapSize < fCommittedSwapSize
|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
|| _SwapBlockGetAddress(pageIndex) != SWAP_SLOT_NONE;
}


@@ -19,6 +19,7 @@ typedef uint32 swap_addr_t;
// TODO: Should be wider, but RadixBitmap supports only a 32 bit type ATM!
struct swap_block;
struct system_memory_info;
namespace BKernel { class Bitmap; }
extern "C" {
@@ -39,6 +40,8 @@
int32 numGuardPages,
uint32 allocationFlags);
status_t SetCanSwapPages(off_t base, size_t size, bool canSwap);
virtual status_t Resize(off_t newSize, int priority);
virtual status_t Rebase(off_t newBase, int priority);
virtual status_t Adopt(VMCache* source, off_t offset,
@@ -102,6 +105,7 @@ private:
bool fHasPrecommitted;
uint8 fPrecommittedPages;
int32 fGuardedSize;
BKernel::Bitmap* fNoSwapPages;
off_t fCommittedSwapSize;
off_t fAllocatedSwapSize;
};


@@ -6925,52 +6925,10 @@ _user_get_memory_properties(team_id teamID, const void* address,
}
// An ordered list of non-overlapping ranges to track mlock/munlock locking.
// It is allowed to call mlock/munlock in unbalanced ways (lock a range
// multiple times, unlock a part of it, lock several consecutive ranges and
// unlock them in one go, etc). However the low level lock_memory and
// unlock_memory calls require the locks/unlocks to be balanced (you lock a
// fixed range, and then unlock exactly the same range). This list allows to
// keep track of what was locked exactly so we can unlock the correct things.
struct LockedPages : DoublyLinkedListLinkImpl<LockedPages> {
addr_t start;
addr_t end;
status_t LockMemory()
{
return lock_memory((void*)start, end - start, 0);
}
status_t UnlockMemory()
{
return unlock_memory((void*)start, end - start, 0);
}
status_t Move(addr_t start, addr_t end)
{
status_t result = lock_memory((void*)start, end - start, 0);
if (result != B_OK)
return result;
result = UnlockMemory();
if (result != B_OK) {
// What can we do if the unlock fails?
panic("Failed to unlock memory: %s", strerror(result));
return result;
}
this->start = start;
this->end = end;
return B_OK;
}
};
status_t
_user_mlock(const void* _address, size_t size)
static status_t
user_set_memory_swappable(const void* _address, size_t size, bool swappable)
{
#if ENABLE_SWAP_SUPPORT
// check address range
addr_t address = (addr_t)_address;
size = PAGE_ALIGN(size);
@@ -6980,198 +6938,77 @@ _user_mlock(const void* _address, size_t size)
if (!validate_user_memory_range(_address, size))
return EINVAL;
addr_t endAddress = address + size;
const addr_t endAddress = address + size;
// Pre-allocate a linked list element we may need (it's simpler to do it
now than run out of memory in the middle of changing things)
LockedPages* newRange = new(std::nothrow) LockedPages();
if (newRange == NULL)
return ENOMEM;
ObjectDeleter<LockedPages> newRangeDeleter(newRange);
// Get and lock the team
Team* team = thread_get_current_thread()->team;
TeamLocker teamLocker(team);
teamLocker.Lock();
status_t error = B_OK;
LockedPagesList* lockedPages = &team->locked_pages_list;
// Locate the first locked range possibly overlapping ours
LockedPages* currentRange = lockedPages->Head();
while (currentRange != NULL && currentRange->end <= address)
currentRange = lockedPages->GetNext(currentRange);
if (currentRange == NULL || currentRange->start >= endAddress) {
// No existing range is overlapping with ours. We can just lock our
// range and stop here.
newRange->start = address;
newRange->end = endAddress;
error = newRange->LockMemory();
if (error != B_OK)
return error;
lockedPages->InsertBefore(currentRange, newRange);
newRangeDeleter.Detach();
return B_OK;
}
// We get here when there is at least one existing overlapping range.
if (currentRange->start <= address) {
if (currentRange->end >= endAddress) {
// An existing range is already fully covering the pages we need to
// lock. Nothing to do then.
return B_OK;
} else {
// An existing range covers the start of the area we want to lock.
// Advance our start address to avoid it.
address = currentRange->end;
// Move on to the next range for the next step
currentRange = lockedPages->GetNext(currentRange);
}
}
// First, lock the new range
newRange->start = address;
newRange->end = endAddress;
error = newRange->LockMemory();
AddressSpaceReadLocker addressSpaceLocker;
status_t error = addressSpaceLocker.SetTo(team_get_current_team_id());
if (error != B_OK)
return error;
VMAddressSpace* addressSpace = addressSpaceLocker.AddressSpace();
// Unlock all ranges fully overlapping with the area we need to lock
while (currentRange != NULL && currentRange->end < endAddress) {
// The existing range is fully contained inside the new one we're
// trying to lock. Delete/unlock it, and replace it with a new one
// (this limits fragmentation of the range list, and is simpler to
// manage)
error = currentRange->UnlockMemory();
// iterate through all concerned areas
addr_t nextAddress = address;
while (nextAddress != endAddress) {
// get the next area
VMArea* area = addressSpace->LookupArea(nextAddress);
if (area == NULL) {
error = B_BAD_ADDRESS;
break;
}
const addr_t areaStart = nextAddress;
const addr_t areaEnd = std::min(endAddress, area->Base() + area->Size());
nextAddress = areaEnd;
error = lock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
if (error != B_OK) {
panic("Failed to unlock a memory range: %s", strerror(error));
newRange->UnlockMemory();
return error;
// We don't need to unset or reset things on failure.
break;
}
LockedPages* temp = currentRange;
currentRange = lockedPages->GetNext(currentRange);
lockedPages->Remove(temp);
delete temp;
VMCacheChainLocker cacheChainLocker(vm_area_get_locked_cache(area));
VMAnonymousCache* anonCache = NULL;
if (dynamic_cast<VMAnonymousNoSwapCache*>(area->cache) != NULL) {
// This memory will already never be swapped. Nothing to do.
} else if ((anonCache = dynamic_cast<VMAnonymousCache*>(area->cache)) != NULL) {
error = anonCache->SetCanSwapPages(areaStart - area->Base(),
areaEnd - areaStart, swappable);
} else {
// Some other cache type? We cannot affect anything here.
error = EINVAL;
}
cacheChainLocker.Unlock();
unlock_memory_etc(addressSpace->ID(), (void*)areaStart, areaEnd - areaStart, 0);
if (error != B_OK)
break;
}
if (currentRange != NULL) {
// One last range may cover the end of the area we're trying to lock
if (currentRange->start == address) {
// In case two overlapping ranges (one at the start and the other
// at the end) already cover the area we're after, there's nothing
// more to do. So we destroy our new extra allocation
error = newRange->UnlockMemory();
return error;
}
if (currentRange->start < endAddress) {
// Make sure the last range is not overlapping, by moving its start
error = currentRange->Move(endAddress, currentRange->end);
if (error != B_OK) {
panic("Failed to move a memory range: %s", strerror(error));
newRange->UnlockMemory();
return error;
}
}
}
// Finally, store the new range in the locked list
lockedPages->InsertBefore(currentRange, newRange);
newRangeDeleter.Detach();
return error;
#else
// No swap support? Nothing to do.
return B_OK;
#endif
}
status_t
_user_mlock(const void* _address, size_t size)
{
return user_set_memory_swappable(_address, size, false);
}
status_t
_user_munlock(const void* _address, size_t size)
{
// check address range
addr_t address = (addr_t)_address;
size = PAGE_ALIGN(size);
if ((address % B_PAGE_SIZE) != 0)
return EINVAL;
if (!validate_user_memory_range(_address, size))
return EINVAL;
addr_t endAddress = address + size;
// Get and lock the team
Team* team = thread_get_current_thread()->team;
TeamLocker teamLocker(team);
teamLocker.Lock();
LockedPagesList* lockedPages = &team->locked_pages_list;
status_t error = B_OK;
// Locate the first locked range possibly overlapping ours
LockedPages* currentRange = lockedPages->Head();
while (currentRange != NULL && currentRange->end <= address)
currentRange = lockedPages->GetNext(currentRange);
if (currentRange == NULL || currentRange->start >= endAddress) {
// No range is intersecting, nothing to unlock
return B_OK;
}
if (currentRange->start < address) {
if (currentRange->end > endAddress) {
// There is a range fully covering the area we want to unlock,
// and it extends on both sides. We need to split it in two
LockedPages* newRange = new(std::nothrow) LockedPages();
if (newRange == NULL)
return ENOMEM;
newRange->start = endAddress;
newRange->end = currentRange->end;
error = newRange->LockMemory();
if (error != B_OK) {
delete newRange;
return error;
}
error = currentRange->Move(currentRange->start, address);
if (error != B_OK) {
delete newRange;
return error;
}
lockedPages->InsertAfter(currentRange, newRange);
return B_OK;
} else {
// There is a range that overlaps and extends before the one we
// want to unlock, we need to shrink it
error = currentRange->Move(currentRange->start, address);
if (error != B_OK)
return error;
}
}
while (currentRange != NULL && currentRange->end <= endAddress) {
// Unlock all fully overlapping ranges
error = currentRange->UnlockMemory();
if (error != B_OK)
return error;
LockedPages* temp = currentRange;
currentRange = lockedPages->GetNext(currentRange);
lockedPages->Remove(temp);
delete temp;
}
// Finally split the last partially overlapping range if any
if (currentRange != NULL && currentRange->start < endAddress) {
error = currentRange->Move(endAddress, currentRange->end);
if (error != B_OK)
return error;
}
return B_OK;
// TODO: B_SHARED_AREAs need to be handled a bit differently:
// if multiple clones of an area had mlock() called on them,
// munlock() must also be called on all of them to actually unlock.
// (At present, the first munlock() will unlock all.)
// TODO: fork() should automatically unlock memory in the child.
return user_set_memory_swappable(_address, size, true);
}