scheduler: Code refactoring

Pawel Dziepak 2013-12-05 22:47:30 +01:00
parent 2e3cbcfa8a
commit d287274dce
12 changed files with 1555 additions and 1435 deletions

View File

@ -17,7 +17,6 @@
#include <heap.h>
#include <ksignal.h>
#include <lock.h>
#include <RunQueueLink.h>
#include <smp.h>
#include <thread_defs.h>
#include <timer.h>
@ -58,12 +57,15 @@ struct cpu_ent;
struct image; // defined in image.c
struct io_context;
struct realtime_sem_context; // defined in realtime_sem.cpp
struct scheduler_thread_data;
struct select_info;
struct user_thread; // defined in libroot/user_thread.h
struct VMAddressSpace;
struct xsi_sem_context; // defined in xsi_semaphore.cpp
namespace Scheduler {
struct ThreadData;
}
namespace BKernel {
struct Team;
struct Thread;
@ -412,8 +414,7 @@ private:
};
struct Thread : TeamThreadIteratorEntry<thread_id>, KernelReferenceable,
RunQueueLinkImpl<Thread> {
struct Thread : TeamThreadIteratorEntry<thread_id>, KernelReferenceable {
int32 flags; // summary of events relevant in interrupt
// handlers (signals pending, user debugging
// enabled, etc.)
@ -444,7 +445,7 @@ struct Thread : TeamThreadIteratorEntry<thread_id>, KernelReferenceable,
bool in_kernel; // protected by time_lock, only written by
// this thread
bool has_yielded; // protected by scheduler lock
struct scheduler_thread_data* scheduler_data; // protected by scheduler lock
Scheduler::ThreadData* scheduler_data; // protected by scheduler lock
struct user_thread* user_thread; // write-protected by fLock, only
// modified by the thread itself and

View File

@ -10,6 +10,8 @@
#include <SupportDefs.h>
#include <debug.h>
#ifdef __cplusplus

View File

@ -65,6 +65,8 @@ KernelMergeObject kernel_core.o :
low_latency.cpp
power_saving.cpp
scheduler.cpp
scheduler_cpu.cpp
scheduler_thread.cpp
scheduler_tracing.cpp
scheduling_analysis.cpp

View File

@ -7,7 +7,9 @@
#include <util/AutoLock.h>
#include "scheduler_common.h"
#include "scheduler_cpu.h"
#include "scheduler_modes.h"
#include "scheduler_thread.h"
using namespace Scheduler;
@ -29,16 +31,12 @@ set_cpu_enabled(int32 /* cpu */, bool /* enabled */)
static bool
has_cache_expired(Thread* thread)
has_cache_expired(const ThreadData* threadData)
{
ASSERT(!gSingleCore);
scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
ASSERT(schedulerThreadData->previous_core >= 0);
CoreEntry* coreEntry = &gCoreEntries[schedulerThreadData->previous_core];
return atomic_get64(&coreEntry->fActiveTime)
- schedulerThreadData->went_sleep_active > kCacheExpire;
return atomic_get64(&threadData->GetCore()->fActiveTime)
- threadData->fWentSleepActive > kCacheExpire;
}
@ -58,51 +56,45 @@ get_most_idle_package(void)
}
static int32
choose_core(Thread* thread)
static CoreEntry*
choose_core(const ThreadData* /* threadData */)
{
CoreEntry* entry = NULL;
ReadSpinLocker locker(gIdlePackageLock);
// wake new package
PackageEntry* package = gIdlePackageList->Last();
PackageEntry* package = gIdlePackageList.Last();
if (package == NULL) {
// wake new core
package = get_most_idle_package();
}
locker.Unlock();
CoreEntry* core = NULL;
if (package != NULL) {
ReadSpinLocker _(package->fCoreLock);
entry = package->fIdleCores.Last();
core = package->fIdleCores.Last();
}
if (entry == NULL) {
if (core == NULL) {
ReadSpinLocker coreLocker(gCoreHeapsLock);
// no idle cores, use least occupied core
entry = gCoreLoadHeap->PeekMinimum();
if (entry == NULL)
entry = gCoreHighLoadHeap->PeekMinimum();
core = gCoreLoadHeap.PeekMinimum();
if (core == NULL)
core = gCoreHighLoadHeap.PeekMinimum();
}
ASSERT(entry != NULL);
return entry->fCoreID;
ASSERT(core != NULL);
return core;
}
static bool
should_rebalance(Thread* thread)
should_rebalance(const ThreadData* threadData)
{
scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
ASSERT(schedulerThreadData->previous_core >= 0);
CoreEntry* coreEntry = &gCoreEntries[schedulerThreadData->previous_core];
int32 coreLoad = get_core_load(coreEntry);
int32 coreLoad = threadData->GetCore()->GetLoad();
// If the thread produces more than 50% of the load, leave it here. In
// such a situation it is better to move other threads away.
if (schedulerThreadData->load >= coreLoad / 2)
if (threadData->GetLoad() >= coreLoad / 2)
return false;
// If there is high load on this core but this thread does not contribute
@ -110,22 +102,20 @@ should_rebalance(Thread* thread)
if (coreLoad > kHighLoad) {
ReadSpinLocker coreLocker(gCoreHeapsLock);
CoreEntry* other = gCoreLoadHeap->PeekMinimum();
if (other != NULL && coreLoad - get_core_load(other)
>= kLoadDifference) {
CoreEntry* other = gCoreLoadHeap.PeekMinimum();
if (other != NULL && coreLoad - other->GetLoad() >= kLoadDifference)
return true;
}
}
// No cpu bound threads - the situation is quite good. Make sure it
// won't get much worse...
ReadSpinLocker coreLocker(gCoreHeapsLock);
CoreEntry* other = gCoreLoadHeap->PeekMinimum();
CoreEntry* other = gCoreLoadHeap.PeekMinimum();
if (other == NULL)
other = gCoreHighLoadHeap->PeekMinimum();
other = gCoreHighLoadHeap.PeekMinimum();
ASSERT(other != NULL);
return coreLoad - get_core_load(other) >= kLoadDifference * 2;
return coreLoad - other->GetLoad() >= kLoadDifference * 2;
}
@ -155,26 +145,22 @@ rebalance_irqs(bool idle)
return;
ReadSpinLocker coreLocker(gCoreHeapsLock);
CoreEntry* other = gCoreLoadHeap->PeekMinimum();
CoreEntry* other = gCoreLoadHeap.PeekMinimum();
if (other == NULL)
other = gCoreHighLoadHeap->PeekMinimum();
other = gCoreHighLoadHeap.PeekMinimum();
coreLocker.Unlock();
SpinLocker cpuLocker(other->fCPULock);
int32 newCPU = gCPUPriorityHeaps[other->fCoreID].PeekMinimum()->fCPUNumber;
int32 newCPU = other->fCPUHeap.PeekMinimum()->fCPUNumber;
cpuLocker.Unlock();
ASSERT(other != NULL);
int32 thisCore = gCPUToCore[smp_get_current_cpu()];
if (other->fCoreID == thisCore)
CoreEntry* core = CoreEntry::GetCore(cpu->cpu_num);
if (other == core)
return;
if (get_core_load(other) + kLoadDifference
>= get_core_load(&gCoreEntries[thisCore])) {
if (other->GetLoad() + kLoadDifference >= core->GetLoad())
return;
}
assign_io_interrupt_to_cpu(chosen->irq, newCPU);
}
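
The low-latency policy above boils down to two heuristics: choose_core() prefers a core from an idle package (waking a fresh package before crowding an active one), and should_rebalance() migrates a thread only when doing so can close a significant load gap. The following standalone sketch of the rebalance decision is illustrative only and not part of this commit; "Core" stands in for Scheduler::CoreEntry, and kMaxLoad is assumed to be 1000 (the debug output elsewhere in this commit prints fLoad / 10 as a percentage).

#include <cstdint>
#include <vector>

// Illustrative stand-ins; kLoadDifference matches scheduler_common.h.
struct Core { int32_t load; };			// load in the range 0..kMaxLoad

const int32_t kMaxLoad = 1000;			// assumed value
const int32_t kLoadDifference = kMaxLoad * 20 / 100;

static bool
should_rebalance_sketch(int32_t threadLoad, const Core& current,
	const std::vector<Core>& allCores)
{
	// A thread responsible for at least half of its core's load stays put;
	// it is cheaper to move the lighter threads away instead.
	if (threadLoad >= current.load / 2)
		return false;

	// Find the least loaded core (the role of gCoreLoadHeap.PeekMinimum()).
	const Core* least = &current;
	for (const Core& core : allCores) {
		if (core.load < least->load)
			least = &core;
	}

	// Migrate only when the gap is large enough to justify losing cache
	// affinity; the kernel uses kLoadDifference or twice that value.
	return current.load - least->load >= kLoadDifference * 2;
}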

View File

@ -4,10 +4,13 @@
*/
#include <util/atomic.h>
#include <util/AutoLock.h>
#include "scheduler_common.h"
#include "scheduler_cpu.h"
#include "scheduler_modes.h"
#include "scheduler_thread.h"
using namespace Scheduler;
@ -15,13 +18,13 @@ using namespace Scheduler;
const bigtime_t kCacheExpire = 100000;
static int32 sSmallTaskCore;
static CoreEntry* sSmallTaskCore;
static void
switch_to_mode(void)
{
sSmallTaskCore = -1;
sSmallTaskCore = NULL;
}
@ -29,34 +32,32 @@ static void
set_cpu_enabled(int32 cpu, bool enabled)
{
if (!enabled)
sSmallTaskCore = -1;
sSmallTaskCore = NULL;
}
static bool
has_cache_expired(Thread* thread)
has_cache_expired(const ThreadData* threadData)
{
ASSERT(!gSingleCore);
scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
ASSERT(schedulerThreadData->previous_core >= 0);
return system_time() - schedulerThreadData->went_sleep > kCacheExpire;
return system_time() - threadData->fWentSleep > kCacheExpire;
}
static int32
static CoreEntry*
choose_small_task_core(void)
{
ReadSpinLocker locker(gCoreHeapsLock);
CoreEntry* candidate = gCoreLoadHeap->PeekMaximum();
CoreEntry* core = gCoreLoadHeap.PeekMaximum();
locker.Unlock();
if (candidate == NULL)
if (core == NULL)
return sSmallTaskCore;
int32 core = candidate->fCoreID;
int32 smallTaskCore = atomic_test_and_set(&sSmallTaskCore, core, -1);
if (smallTaskCore == -1)
CoreEntry* smallTaskCore
= atomic_pointer_test_and_set(&sSmallTaskCore, core, (CoreEntry*)NULL);
if (smallTaskCore == NULL)
return core;
return smallTaskCore;
}
@ -65,121 +66,118 @@ choose_small_task_core(void)
static CoreEntry*
choose_idle_core(void)
{
PackageEntry* current = NULL;
PackageEntry* package = NULL;
for (int32 i = 0; i < gPackageCount; i++) {
if (gPackageEntries[i].fIdleCoreCount != 0 && (current == NULL
|| gPackageEntries[i].fIdleCoreCount
< current->fIdleCoreCount)) {
current = &gPackageEntries[i];
PackageEntry* current = &gPackageEntries[i];
if (current->fIdleCoreCount != 0 && (package == NULL
|| current->fIdleCoreCount < package->fIdleCoreCount)) {
package = current;
}
}
if (current == NULL) {
if (package == NULL) {
ReadSpinLocker _(gIdlePackageLock);
current = gIdlePackageList->Last();
package = gIdlePackageList.Last();
}
if (current != NULL) {
ReadSpinLocker _(current->fCoreLock);
return current->fIdleCores.Last();
if (package != NULL) {
ReadSpinLocker _(package->fCoreLock);
return package->fIdleCores.Last();
}
return NULL;
}
static int32
choose_core(Thread* thread)
static CoreEntry*
choose_core(const ThreadData* threadData)
{
CoreEntry* entry;
CoreEntry* core = NULL;
int32 core = -1;
// try to pack all threads on one core
core = choose_small_task_core();
if (core != -1
&& get_core_load(&gCoreEntries[core]) + thread->scheduler_data->load
< kHighLoad) {
entry = &gCoreEntries[core];
} else {
if (core == NULL || core->GetLoad() + threadData->GetLoad() >= kHighLoad) {
ReadSpinLocker coreLocker(gCoreHeapsLock);
// run immediately on already woken core
entry = gCoreLoadHeap->PeekMinimum();
if (entry == NULL) {
core = gCoreLoadHeap.PeekMinimum();
if (core == NULL) {
coreLocker.Unlock();
entry = choose_idle_core();
core = choose_idle_core();
if (entry == NULL) {
if (core == NULL) {
coreLocker.Lock();
entry = gCoreHighLoadHeap->PeekMinimum();
core = gCoreHighLoadHeap.PeekMinimum();
}
}
}
ASSERT(entry != NULL);
return entry->fCoreID;
ASSERT(core != NULL);
return core;
}
static bool
should_rebalance(Thread* thread)
should_rebalance(const ThreadData* threadData)
{
ASSERT(!gSingleCore);
scheduler_thread_data* schedulerThreadData = thread->scheduler_data;
ASSERT(schedulerThreadData->previous_core >= 0);
CoreEntry* core = threadData->GetCore();
int32 core = schedulerThreadData->previous_core;
CoreEntry* coreEntry = &gCoreEntries[core];
int32 coreLoad = get_core_load(coreEntry);
int32 coreLoad = core->GetLoad();
if (coreLoad > kHighLoad) {
ReadSpinLocker coreLocker(gCoreHeapsLock);
if (sSmallTaskCore == core) {
sSmallTaskCore = -1;
sSmallTaskCore = NULL;
choose_small_task_core();
if (schedulerThreadData->load > coreLoad / 3)
if (threadData->GetLoad() > coreLoad / 3)
return false;
return coreLoad > kVeryHighLoad;
}
if (schedulerThreadData->load >= coreLoad / 2)
if (threadData->GetLoad() >= coreLoad / 2)
return false;
CoreEntry* other = gCoreLoadHeap->PeekMaximum();
CoreEntry* other = gCoreLoadHeap.PeekMaximum();
if (other == NULL)
other = gCoreHighLoadHeap->PeekMinimum();
other = gCoreHighLoadHeap.PeekMinimum();
ASSERT(other != NULL);
return coreLoad - get_core_load(other) >= kLoadDifference / 2;
return coreLoad - other->GetLoad() >= kLoadDifference / 2;
}
if (coreLoad >= kMediumLoad)
return false;
int32 smallTaskCore = choose_small_task_core();
if (smallTaskCore == -1)
CoreEntry* smallTaskCore = choose_small_task_core();
if (smallTaskCore == NULL)
return false;
return smallTaskCore != core
&& get_core_load(&gCoreEntries[smallTaskCore])
+ thread->scheduler_data->load < kHighLoad;
&& smallTaskCore->GetLoad() + threadData->GetLoad() < kHighLoad;
}
static inline void
pack_irqs(void)
{
CoreEntry* smallTaskCore = atomic_pointer_get(&sSmallTaskCore);
if (smallTaskCore == NULL)
return;
cpu_ent* cpu = get_cpu_struct();
int32 core = gCPUToCore[cpu->cpu_num];
if (smallTaskCore == CoreEntry::GetCore(cpu->cpu_num))
return;
SpinLocker locker(cpu->irqs_lock);
while (sSmallTaskCore != core && list_get_first_item(&cpu->irqs) != NULL) {
while (list_get_first_item(&cpu->irqs) != NULL) {
irq_assignment* irq = (irq_assignment*)list_get_first_item(&cpu->irqs);
locker.Unlock();
ReadSpinLocker coreLocker(gCoreHeapsLock);
int32 newCPU
= gCPUPriorityHeaps[sSmallTaskCore].PeekMinimum()->fCPUNumber;
int32 newCPU = smallTaskCore->fCPUHeap.PeekMinimum()->fCPUNumber;
coreLocker.Unlock();
if (newCPU != cpu->cpu_num)
@ -193,12 +191,12 @@ pack_irqs(void)
static void
rebalance_irqs(bool idle)
{
if (idle && sSmallTaskCore != -1) {
if (idle && sSmallTaskCore != NULL) {
pack_irqs();
return;
}
if (idle || sSmallTaskCore != -1)
if (idle || sSmallTaskCore != NULL)
return;
cpu_ent* cpu = get_cpu_struct();
@ -219,22 +217,19 @@ rebalance_irqs(bool idle)
return;
ReadSpinLocker coreLocker(gCoreHeapsLock);
CoreEntry* other = gCoreLoadHeap->PeekMinimum();
CoreEntry* other = gCoreLoadHeap.PeekMinimum();
coreLocker.Unlock();
if (other == NULL)
return;
SpinLocker cpuLocker(other->fCPULock);
int32 newCPU = gCPUPriorityHeaps[other->fCoreID].PeekMinimum()->fCPUNumber;
int32 newCPU = other->fCPUHeap.PeekMinimum()->fCPUNumber;
cpuLocker.Unlock();
int32 thisCore = gCPUToCore[smp_get_current_cpu()];
if (other->fCoreID == thisCore)
CoreEntry* core = CoreEntry::GetCore(smp_get_current_cpu());
if (other == core)
return;
if (get_core_load(other) + kLoadDifference
>= get_core_load(&gCoreEntries[thisCore])) {
if (other->GetLoad() + kLoadDifference >= core->GetLoad())
return;
}
assign_io_interrupt_to_cpu(chosen->irq, newCPU);
}
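
Power-saving mode packs light work onto a single "small task core", elected lock-free with atomic_pointer_test_and_set(): the first CPU that finds sSmallTaskCore empty installs its candidate, every other CPU reuses whatever was installed. A minimal sketch of that election using std::atomic (an illustration, not the kernel primitive):

#include <atomic>

struct Core { int id; };	// stand-in for Scheduler::CoreEntry

static std::atomic<Core*> sSmallTaskCore{nullptr};

// Mirrors choose_small_task_core() above: compare-and-swap the candidate in
// only if no core has been chosen yet, otherwise return the existing winner.
static Core*
choose_small_task_core_sketch(Core* candidate)
{
	Core* expected = nullptr;
	if (sSmallTaskCore.compare_exchange_strong(expected, candidate))
		return candidate;	// this caller won the election
	return expected;		// another CPU already picked a core
}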

File diff suppressed because it is too large

View File

@ -7,6 +7,8 @@
#define KERNEL_SCHEDULER_COMMON_H
#include <algorithm>
#include <debug.h>
#include <kscheduler.h>
#include <load_tracking.h>
@ -29,6 +31,9 @@
namespace Scheduler {
struct CPUEntry;
struct CoreEntry;
const int kLowLoad = kMaxLoad * 20 / 100;
const int kTargetLoad = kMaxLoad * 55 / 100;
const int kHighLoad = kMaxLoad * 70 / 100;
@ -39,179 +44,12 @@ const int kLoadDifference = kMaxLoad * 20 / 100;
extern bool gSingleCore;
// Heaps in sCPUPriorityHeaps are used for load balancing on the core that the
// logical processors in the heap belong to. Since there are no cache affinity
// issues at this level and the run queue is shared among all logical
// processors on the core, the only real concern is to make lower priority
// threads give way to the higher priority threads.
struct CPUEntry : public MinMaxHeapLinkImpl<CPUEntry, int32> {
CPUEntry();
int32 fCPUNumber;
int32 fPriority;
bigtime_t fMeasureActiveTime;
bigtime_t fMeasureTime;
int32 fLoad;
rw_spinlock fSchedulerModeLock;
} CACHE_LINE_ALIGN;
typedef MinMaxHeap<CPUEntry, int32> CPUHeap CACHE_LINE_ALIGN;
extern CPUEntry* gCPUEntries;
extern CPUHeap* gCPUPriorityHeaps;
struct CoreEntry : public MinMaxHeapLinkImpl<CoreEntry, int32>,
DoublyLinkedListLinkImpl<CoreEntry> {
CoreEntry();
int32 fCoreID;
int32 fCPUCount;
spinlock fCPULock;
spinlock fQueueLock;
int32 fStarvationCounter;
int32 fThreadCount;
DoublyLinkedList<scheduler_thread_data> fThreadList;
bigtime_t fActiveTime;
int32 fLoad;
bool fHighLoad;
} CACHE_LINE_ALIGN;
typedef MinMaxHeap<CoreEntry, int32> CoreLoadHeap;
extern CoreEntry* gCoreEntries;
extern CoreLoadHeap* gCoreLoadHeap;
extern CoreLoadHeap* gCoreHighLoadHeap;
extern rw_spinlock gCoreHeapsLock;
extern int32 gCoreCount;
// gPackageEntries are used to decide which core should be woken up from the
// idle state. When aiming for performance we should use as many packages as
// possible with as few cores active in each package as possible (so that the
// package can enter any boost mode if it has one and the active cores have
// more of the shared cache for themselves). If power saving is the main
// priority we should keep active cores on as few packages as possible (so
// that other packages can go to the deep state of sleep). The heap stores
// only packages with at least one core active and one core idle. The packages
// with all cores idle are stored in sPackageIdleList (in LIFO manner).
struct PackageEntry : public DoublyLinkedListLinkImpl<PackageEntry> {
PackageEntry();
int32 fPackageID;
rw_spinlock fCoreLock;
DoublyLinkedList<CoreEntry> fIdleCores;
int32 fIdleCoreCount;
int32 fCoreCount;
} CACHE_LINE_ALIGN;
typedef DoublyLinkedList<PackageEntry> IdlePackageList;
extern PackageEntry* gPackageEntries;
extern IdlePackageList* gIdlePackageList;
extern rw_spinlock gIdlePackageLock;
extern int32 gPackageCount;
// The run queues. Holds the threads ready to run ordered by priority.
// One queue per schedulable target per core. Additionally, each
// logical processor has its sPinnedRunQueues used for scheduling
// pinned threads.
typedef RunQueue<Thread, THREAD_MAX_SET_PRIORITY> CACHE_LINE_ALIGN
ThreadRunQueue;
extern ThreadRunQueue* gRunQueues;
extern ThreadRunQueue* gPinnedRunQueues;
// Since CPU IDs used internally by the kernel bear no relation to the actual
// CPU topology the following arrays are used to efficiently get the core
// and the package that CPU in question belongs to.
extern int32* gCPUToCore;
extern int32* gCPUToPackage;
void init_debug_commands(void);
} // namespace Scheduler
struct scheduler_thread_data :
public DoublyLinkedListLinkImpl<scheduler_thread_data> {
inline scheduler_thread_data();
void Init();
int32 priority_penalty;
int32 additional_penalty;
bigtime_t time_left;
bigtime_t stolen_time;
bigtime_t quantum_start;
bigtime_t last_interrupt_time;
bigtime_t measure_active_time;
bigtime_t measure_time;
int32 load;
bigtime_t went_sleep;
bigtime_t went_sleep_active;
int32 went_sleep_count;
int32 previous_core;
bool enqueued;
};
static inline int32
get_core_load(struct Scheduler::CoreEntry* core)
{
return core->fLoad / core->fCPUCount;
}
static inline int32
get_minimal_priority(Thread* thread)
{
return max_c(min_c(thread->priority, 25) / 5, 1);
}
static inline int32
get_thread_penalty(Thread* thread)
{
int32 penalty = thread->scheduler_data->priority_penalty;
const int kMinimalPriority = get_minimal_priority(thread);
if (kMinimalPriority > 0) {
penalty
+= thread->scheduler_data->additional_penalty % kMinimalPriority;
}
return penalty;
}
static inline int32
get_effective_priority(Thread* thread)
{
if (thread->priority == B_IDLE_PRIORITY)
return thread->priority;
if (thread->priority >= B_FIRST_REAL_TIME_PRIORITY)
return thread->priority;
int32 effectivePriority = thread->priority;
effectivePriority -= get_thread_penalty(thread);
ASSERT(effectivePriority < B_FIRST_REAL_TIME_PRIORITY);
ASSERT(effectivePriority >= B_LOWEST_ACTIVE_PRIORITY);
return effectivePriority;
}
#endif // KERNEL_SCHEDULER_COMMON_H
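
The helpers removed here (get_minimal_priority(), get_thread_penalty(), get_effective_priority()) come back as ThreadData methods in scheduler_thread.h later in this commit. Since the arithmetic is easy to lose in the diff, here is a standalone rendition of it; the Haiku priority constants are written out as assumptions and the helper names are mine, not the kernel's.

#include <algorithm>
#include <cstdint>

// Assumed Haiku priority levels, for illustration only.
const int32_t B_IDLE_PRIORITY = 0;
const int32_t B_FIRST_REAL_TIME_PRIORITY = 100;

// max(min(priority, 25) / 5, 1): a higher base priority tolerates a deeper
// penalty before the additional penalty starts wrapping around.
static int32_t
minimal_priority(int32_t priority)
{
	return std::max(std::min(priority, int32_t(25)) / 5, int32_t(1));
}

// Effective priority is the base priority minus the accumulated penalty;
// idle and real-time threads are never penalized.
static int32_t
effective_priority(int32_t priority, int32_t priorityPenalty,
	int32_t additionalPenalty)
{
	if (priority == B_IDLE_PRIORITY || priority >= B_FIRST_REAL_TIME_PRIORITY)
		return priority;

	int32_t penalty = priorityPenalty;
	penalty += additionalPenalty % minimal_priority(priority);

	return priority - penalty;	// the real code asserts this stays in range
}

For instance, a priority 10 thread with a priority penalty of 3 and no additional penalty runs at effective priority 7.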

View File

@ -0,0 +1,466 @@
/*
* Copyright 2013, Paweł Dziepak, pdziepak@quarnos.org.
* Distributed under the terms of the MIT License.
*/
#include "scheduler_cpu.h"
#include <util/AutoLock.h>
#include <algorithm>
#include "scheduler_thread.h"
using namespace Scheduler;
static CPUPriorityHeap sDebugCPUHeap;
static CoreLoadHeap sDebugCoreHeap;
void
ThreadRunQueue::Dump() const
{
ThreadRunQueue::ConstIterator iterator = GetConstIterator();
if (!iterator.HasNext())
kprintf("Run queue is empty.\n");
else {
kprintf("thread id priority penalty name\n");
while (iterator.HasNext()) {
ThreadData* threadData = iterator.Next();
Thread* thread = threadData->GetThread();
kprintf("%p %-7" B_PRId32 " %-8" B_PRId32 " %-8" B_PRId32 " %s\n",
thread, thread->id, thread->priority,
threadData->GetEffectivePriority(), thread->name);
}
}
}
CPUEntry::CPUEntry()
:
fPriority(B_IDLE_PRIORITY),
fLoad(0),
fMeasureActiveTime(0),
fMeasureTime(0)
{
B_INITIALIZE_RW_SPINLOCK(&fSchedulerModeLock);
}
void
CPUEntry::UpdatePriority(int32 priority)
{
int32 corePriority = CPUPriorityHeap::GetKey(fCore->fCPUHeap.PeekMaximum());
fCore->fCPUHeap.ModifyKey(this, priority);
if (gSingleCore)
return;
int32 maxPriority = CPUPriorityHeap::GetKey(fCore->fCPUHeap.PeekMaximum());
if (corePriority == maxPriority)
return;
PackageEntry* packageEntry = fCore->fPackage;
if (maxPriority == B_IDLE_PRIORITY) {
WriteSpinLocker _(packageEntry->fCoreLock);
// core goes idle
ASSERT(packageEntry->fIdleCoreCount >= 0);
ASSERT(packageEntry->fIdleCoreCount < packageEntry->fCoreCount);
packageEntry->fIdleCoreCount++;
packageEntry->fIdleCores.Add(fCore);
if (packageEntry->fIdleCoreCount == packageEntry->fCoreCount) {
// package goes idle
WriteSpinLocker _(gIdlePackageLock);
gIdlePackageList.Add(packageEntry);
}
} else if (corePriority == B_IDLE_PRIORITY) {
WriteSpinLocker _(packageEntry->fCoreLock);
// core wakes up
ASSERT(packageEntry->fIdleCoreCount > 0);
ASSERT(packageEntry->fIdleCoreCount <= packageEntry->fCoreCount);
packageEntry->fIdleCoreCount--;
packageEntry->fIdleCores.Remove(fCore);
if (packageEntry->fIdleCoreCount + 1 == packageEntry->fCoreCount) {
// package wakes up
WriteSpinLocker _(gIdlePackageLock);
gIdlePackageList.Remove(packageEntry);
}
}
}
void
CPUEntry::ComputeLoad()
{
ASSERT(!gSingleCore);
ASSERT(fCPUNumber == smp_get_current_cpu());
int oldLoad = compute_load(fMeasureTime, fMeasureActiveTime, fLoad);
if (oldLoad < 0)
return;
if (oldLoad != fLoad) {
int32 delta = fLoad - oldLoad;
atomic_add(&fCore->fLoad, delta);
fCore->UpdateLoad();
}
if (fLoad > kVeryHighLoad)
gCurrentMode->rebalance_irqs(false);
}
ThreadData*
CPUEntry::ChooseNextThread(ThreadData* oldThread, bool putAtBack)
{
SpinLocker runQueueLocker(fCore->fQueueLock);
ThreadData* sharedThread = fCore->fRunQueue.PeekMaximum();
ThreadData* pinnedThread = fRunQueue.PeekMaximum();
ASSERT(sharedThread != NULL || pinnedThread != NULL || oldThread != NULL);
int32 pinnedPriority = -1;
if (pinnedThread != NULL)
pinnedPriority = pinnedThread->GetEffectivePriority();
int32 sharedPriority = -1;
if (sharedThread != NULL)
sharedPriority = sharedThread->GetEffectivePriority();
int32 oldPriority = -1;
if (oldThread != NULL)
oldPriority = oldThread->GetEffectivePriority();
int32 rest = std::max(pinnedPriority, sharedPriority);
if (oldPriority > rest || (!putAtBack && oldPriority == rest))
return oldThread;
if (sharedPriority > pinnedPriority) {
sharedThread->fEnqueued = false;
fCore->fRunQueue.Remove(sharedThread);
if (thread_is_idle_thread(sharedThread->GetThread())
|| fCore->fThreadList.Head() == sharedThread) {
atomic_add(&fCore->fStarvationCounter, 1);
}
if (sharedThread->fWentSleepCount == 0)
fCore->fThreadList.Remove(sharedThread);
atomic_add(&fCore->fThreadCount, -1);
return sharedThread;
}
pinnedThread->fEnqueued = false;
fRunQueue.Remove(pinnedThread);
return pinnedThread;
}
void
CPUEntry::TrackActivity(ThreadData* oldThreadData, ThreadData* nextThreadData)
{
cpu_ent* cpuEntry = &gCPU[fCPUNumber];
Thread* oldThread = oldThreadData->GetThread();
if (!thread_is_idle_thread(oldThread)) {
bigtime_t active
= (oldThread->kernel_time - cpuEntry->last_kernel_time)
+ (oldThread->user_time - cpuEntry->last_user_time);
atomic_add64(&cpuEntry->active_time, active);
oldThreadData->UpdateActivity(active);
}
oldThreadData->ComputeLoad();
nextThreadData->ComputeLoad();
if (!gSingleCore && !cpuEntry->disabled)
ComputeLoad();
Thread* nextThread = nextThreadData->GetThread();
if (!thread_is_idle_thread(nextThread)) {
cpuEntry->last_kernel_time = nextThread->kernel_time;
cpuEntry->last_user_time = nextThread->user_time;
nextThreadData->fLastInterruptTime = cpuEntry->interrupt_time;
_RequestPerformanceLevel(nextThreadData);
}
}
inline void
CPUEntry::_RequestPerformanceLevel(ThreadData* threadData)
{
int32 load = std::max(threadData->GetLoad(), fCore->GetLoad());
load = std::min(std::max(load, int32(0)), kMaxLoad);
if (load < kTargetLoad) {
int32 delta = kTargetLoad - load;
delta *= kTargetLoad;
delta /= kCPUPerformanceScaleMax;
decrease_cpu_performance(delta);
} else {
bool allowBoost = !gCurrentMode->avoid_boost;
int32 delta = load - kTargetLoad;
delta *= kMaxLoad - kTargetLoad;
delta /= kCPUPerformanceScaleMax;
increase_cpu_performance(delta, allowBoost);
}
}
CPUPriorityHeap::CPUPriorityHeap(int32 cpuCount)
:
MinMaxHeap<CPUEntry, int32>(cpuCount)
{
}
void
CPUPriorityHeap::Dump()
{
kprintf("cpu priority load\n");
CPUEntry* entry = PeekMinimum();
while (entry) {
int32 cpu = entry->fCPUNumber;
int32 key = GetKey(entry);
kprintf("%3" B_PRId32 " %8" B_PRId32 " %3" B_PRId32 "%%\n", cpu, key,
entry->fLoad / 10);
RemoveMinimum();
sDebugCPUHeap.Insert(entry, key);
entry = PeekMinimum();
}
entry = sDebugCPUHeap.PeekMinimum();
while (entry) {
int32 key = GetKey(entry);
sDebugCPUHeap.RemoveMinimum();
Insert(entry, key);
entry = sDebugCPUHeap.PeekMinimum();
}
}
CoreEntry::CoreEntry()
:
fCPUCount(0),
fStarvationCounter(0),
fThreadCount(0),
fActiveTime(0),
fLoad(0),
fHighLoad(false)
{
B_INITIALIZE_SPINLOCK(&fCPULock);
B_INITIALIZE_SPINLOCK(&fQueueLock);
}
void
CoreEntry::UpdateLoad()
{
ASSERT(!gSingleCore);
if (fCPUCount == 0) {
fLoad = 0;
return;
}
WriteSpinLocker coreLocker(gCoreHeapsLock);
int32 newKey = GetLoad();
int32 oldKey = CoreLoadHeap::GetKey(this);
ASSERT(oldKey >= 0 && oldKey <= kMaxLoad);
ASSERT(newKey >= 0 && newKey <= kMaxLoad);
if (oldKey == newKey)
return;
if (newKey > kHighLoad) {
if (!fHighLoad) {
gCoreLoadHeap.ModifyKey(this, -1);
ASSERT(gCoreLoadHeap.PeekMinimum() == this);
gCoreLoadHeap.RemoveMinimum();
gCoreHighLoadHeap.Insert(this, newKey);
fHighLoad = true;
} else
gCoreHighLoadHeap.ModifyKey(this, newKey);
} else if (newKey < kMediumLoad) {
if (fHighLoad) {
gCoreHighLoadHeap.ModifyKey(this, -1);
ASSERT(gCoreHighLoadHeap.PeekMinimum() == this);
gCoreHighLoadHeap.RemoveMinimum();
gCoreLoadHeap.Insert(this, newKey);
fHighLoad = false;
} else
gCoreLoadHeap.ModifyKey(this, newKey);
} else {
if (fHighLoad)
gCoreHighLoadHeap.ModifyKey(this, newKey);
else
gCoreLoadHeap.ModifyKey(this, newKey);
}
}
CoreLoadHeap::CoreLoadHeap(int32 coreCount)
:
MinMaxHeap<CoreEntry, int32>(coreCount)
{
}
void
CoreLoadHeap::Dump()
{
CoreEntry* entry = PeekMinimum();
while (entry) {
int32 key = GetKey(entry);
kprintf("%4" B_PRId32 " %3" B_PRId32 "%%\n", entry->fCoreID,
entry->GetLoad() / 10);
RemoveMinimum();
sDebugCoreHeap.Insert(entry, key);
entry = PeekMinimum();
}
entry = sDebugCoreHeap.PeekMinimum();
while (entry) {
int32 key = GetKey(entry);
sDebugCoreHeap.RemoveMinimum();
Insert(entry, key);
entry = sDebugCoreHeap.PeekMinimum();
}
}
PackageEntry::PackageEntry()
:
fIdleCoreCount(0),
fCoreCount(0)
{
B_INITIALIZE_RW_SPINLOCK(&fCoreLock);
}
static int
dump_run_queue(int argc, char **argv)
{
int32 cpuCount = smp_get_num_cpus();
int32 coreCount = gCoreCount;
for (int32 i = 0; i < coreCount; i++) {
kprintf("%sCore %" B_PRId32 " run queue:\n", i > 0 ? "\n" : "", i);
gCoreEntries[i].fRunQueue.Dump();
}
for (int32 i = 0; i < cpuCount; i++) {
CPUEntry* cpu = &gCPUEntries[i];
ThreadRunQueue::ConstIterator iterator
= cpu->fRunQueue.GetConstIterator();
if (iterator.HasNext()
&& !thread_is_idle_thread(iterator.Next()->GetThread())) {
kprintf("\nCPU %" B_PRId32 " run queue:\n", i);
cpu->fRunQueue.Dump();
}
}
return 0;
}
static int
dump_cpu_heap(int argc, char** argv)
{
kprintf("core load\n");
gCoreLoadHeap.Dump();
kprintf("\n");
gCoreHighLoadHeap.Dump();
for (int32 i = 0; i < gCoreCount; i++) {
if (gCoreEntries[i].fCPUCount < 2)
continue;
kprintf("\nCore %" B_PRId32 " heap:\n", i);
gCoreEntries[i].fCPUHeap.Dump();
}
return 0;
}
static int
dump_idle_cores(int argc, char** argv)
{
kprintf("Idle packages:\n");
IdlePackageList::ReverseIterator idleIterator
= gIdlePackageList.GetReverseIterator();
if (idleIterator.HasNext()) {
kprintf("package cores\n");
while (idleIterator.HasNext()) {
PackageEntry* entry = idleIterator.Next();
kprintf("%-7" B_PRId32 " ", entry->fPackageID);
DoublyLinkedList<CoreEntry>::ReverseIterator iterator
= entry->fIdleCores.GetReverseIterator();
if (iterator.HasNext()) {
while (iterator.HasNext()) {
CoreEntry* coreEntry = iterator.Next();
kprintf("%" B_PRId32 "%s", coreEntry->fCoreID,
iterator.HasNext() ? ", " : "");
}
} else
kprintf("-");
kprintf("\n");
}
} else
kprintf("No idle packages.\n");
return 0;
}
void Scheduler::init_debug_commands(void)
{
new(&sDebugCPUHeap) CPUPriorityHeap(smp_get_num_cpus());
new(&sDebugCoreHeap) CoreLoadHeap(smp_get_num_cpus());
add_debugger_command_etc("run_queue", &dump_run_queue,
"List threads in run queue", "\nLists threads in run queue", 0);
if (!gSingleCore) {
add_debugger_command_etc("cpu_heap", &dump_cpu_heap,
"List CPUs in CPU priority heap",
"\nList CPUs in CPU priority heap", 0);
add_debugger_command_etc("idle_cores", &dump_idle_cores,
"List idle cores", "\nList idle cores", 0);
}
}
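
CoreEntry::UpdateLoad() above keeps every core in exactly one of two heaps, gCoreLoadHeap and gCoreHighLoadHeap, and moves a core between them with hysteresis: it enters the high-load heap above kHighLoad but only leaves it again below kMediumLoad, so a core hovering near the boundary does not bounce back and forth. A simplified sketch of that scheme, using ordered sets instead of the kernel's MinMaxHeap (the value of kMediumLoad is assumed):

#include <set>
#include <utility>

const int kMaxLoad = 1000;						// assumed, see earlier sketch
const int kHighLoad = kMaxLoad * 70 / 100;		// as in scheduler_common.h
const int kMediumLoad = kMaxLoad * 35 / 100;	// assumed value

struct CoreSets {
	std::set<std::pair<int, int> > normal;	// (load, coreID)
	std::set<std::pair<int, int> > high;	// (load, coreID)
};

// Re-key a core after its load changed, moving it between the two sets only
// when it crosses the upper or lower threshold (hysteresis).
static void
update_load_sketch(CoreSets& sets, int coreID, int oldLoad, int newLoad,
	bool& highLoad)
{
	(highLoad ? sets.high : sets.normal).erase(std::make_pair(oldLoad, coreID));

	if (!highLoad && newLoad > kHighLoad)
		highLoad = true;
	else if (highLoad && newLoad < kMediumLoad)
		highLoad = false;

	(highLoad ? sets.high : sets.normal).insert(std::make_pair(newLoad, coreID));
}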

View File

@ -0,0 +1,170 @@
/*
* Copyright 2013, Paweł Dziepak, pdziepak@quarnos.org.
* Distributed under the terms of the MIT License.
*/
#ifndef KERNEL_SCHEDULER_CPU_H
#define KERNEL_SCHEDULER_CPU_H
#include <OS.h>
#include <thread.h>
#include <util/MinMaxHeap.h>
#include <cpufreq.h>
#include "RunQueue.h"
#include "scheduler_common.h"
#include "scheduler_modes.h"
namespace Scheduler {
struct ThreadData;
struct CPUEntry;
struct CoreEntry;
struct PackageEntry;
// The run queues. Holds the threads ready to run ordered by priority.
// One queue per schedulable target per core. Additionally, each
// logical processor has its sPinnedRunQueues used for scheduling
// pinned threads.
class ThreadRunQueue : public RunQueue<ThreadData, THREAD_MAX_SET_PRIORITY> {
public:
void Dump() const;
};
struct CPUEntry : public MinMaxHeapLinkImpl<CPUEntry, int32> {
CPUEntry();
void UpdatePriority(int32 priority);
void ComputeLoad();
ThreadData* ChooseNextThread(ThreadData* oldThread,
bool putAtBack);
void TrackActivity(ThreadData* oldThreadData,
ThreadData* nextThreadData);
int32 fCPUNumber;
CoreEntry* fCore;
rw_spinlock fSchedulerModeLock;
int32 fPriority;
ThreadRunQueue fRunQueue;
int32 fLoad;
bigtime_t fMeasureActiveTime;
bigtime_t fMeasureTime;
private:
inline void _RequestPerformanceLevel(
ThreadData* threadData);
} CACHE_LINE_ALIGN;
class CPUPriorityHeap : public MinMaxHeap<CPUEntry, int32> {
public:
CPUPriorityHeap() { }
CPUPriorityHeap(int32 cpuCount);
void Dump();
};
struct CoreEntry : public MinMaxHeapLinkImpl<CoreEntry, int32>,
DoublyLinkedListLinkImpl<CoreEntry> {
CoreEntry();
inline int32 GetLoad() const;
void UpdateLoad();
static inline CoreEntry* GetCore(int32 cpu);
int32 fCoreID;
PackageEntry* fPackage;
int32 fCPUCount;
CPUPriorityHeap fCPUHeap;
spinlock fCPULock;
int32 fStarvationCounter;
DoublyLinkedList<ThreadData> fThreadList;
int32 fThreadCount;
ThreadRunQueue fRunQueue;
spinlock fQueueLock;
bigtime_t fActiveTime;
int32 fLoad;
bool fHighLoad;
} CACHE_LINE_ALIGN;
class CoreLoadHeap : public MinMaxHeap<CoreEntry, int32> {
public:
CoreLoadHeap() { }
CoreLoadHeap(int32 coreCount);
void Dump();
};
// gPackageEntries are used to decide which core should be woken up from the
// idle state. When aiming for performance we should use as many packages as
// possible with as few cores active in each package as possible (so that the
// package can enter any boost mode if it has one and the active cores have
// more of the shared cache for themselves). If power saving is the main
// priority we should keep active cores on as few packages as possible (so
// that other packages can go to the deep state of sleep). The heap stores
// only packages with at least one core active and one core idle. The packages
// with all cores idle are stored in gPackageIdleList (in LIFO manner).
struct PackageEntry : public DoublyLinkedListLinkImpl<PackageEntry> {
PackageEntry();
int32 fPackageID;
DoublyLinkedList<CoreEntry> fIdleCores;
int32 fIdleCoreCount;
int32 fCoreCount;
rw_spinlock fCoreLock;
} CACHE_LINE_ALIGN;
typedef DoublyLinkedList<PackageEntry> IdlePackageList;
extern CPUEntry* gCPUEntries;
extern CoreEntry* gCoreEntries;
extern CoreLoadHeap gCoreLoadHeap;
extern CoreLoadHeap gCoreHighLoadHeap;
extern rw_spinlock gCoreHeapsLock;
extern int32 gCoreCount;
extern PackageEntry* gPackageEntries;
extern IdlePackageList gIdlePackageList;
extern rw_spinlock gIdlePackageLock;
extern int32 gPackageCount;
inline int32
CoreEntry::GetLoad() const
{
ASSERT(fCPUCount >= 0);
return fLoad / fCPUCount;
}
/* static */ inline CoreEntry*
CoreEntry::GetCore(int32 cpu)
{
return gCPUEntries[cpu].fCore;
}
} // namespace Scheduler
#endif // KERNEL_SCHEDULER_CPU_H
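
The gPackageEntries comment above describes two opposite wake-up policies: performance mode spreads work across packages (get_most_idle_package(), whose body is not shown here, presumably picks the package with the most idle cores), while power-saving mode concentrates it (choose_idle_core() picks the package with the fewest idle cores that still has one). A small illustration of the selection, with stand-in types rather than PackageEntry:

#include <vector>

struct Package {
	int id;
	int idleCoreCount;
};

// Pick the package a newly woken core should come from. With preferBusy set,
// this mimics the power-saving policy (fewest idle cores wins); without it,
// the performance policy (most idle cores wins). Returns -1 if no package
// has an idle core.
static int
choose_package_sketch(const std::vector<Package>& packages, bool preferBusy)
{
	int best = -1;
	for (size_t i = 0; i < packages.size(); i++) {
		const Package& package = packages[i];
		if (package.idleCoreCount == 0)
			continue;
		if (best < 0
			|| (preferBusy
				&& package.idleCoreCount < packages[best].idleCoreCount)
			|| (!preferBusy
				&& package.idleCoreCount > packages[best].idleCoreCount)) {
			best = int(i);
		}
	}
	return best;
}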

View File

@ -11,26 +11,40 @@
struct scheduler_mode_operations {
const char* name;
const char* name;
bool avoid_boost;
bool avoid_boost;
bigtime_t base_quantum;
bigtime_t minimal_quantum;
bigtime_t quantum_multipliers[2];
bigtime_t base_quantum;
bigtime_t minimal_quantum;
bigtime_t quantum_multipliers[2];
bigtime_t maximum_latency;
bigtime_t maximum_latency;
void (*switch_to_mode)(void);
void (*set_cpu_enabled)(int32 cpu, bool enabled);
bool (*has_cache_expired)(Thread* thread);
int32 (*choose_core)(Thread* thread);
bool (*should_rebalance)(Thread* thread);
void (*rebalance_irqs)(bool idle);
void (*switch_to_mode)(void);
void (*set_cpu_enabled)(int32 cpu, bool enabled);
bool (*has_cache_expired)(
const Scheduler::ThreadData* threadData);
Scheduler::CoreEntry* (*choose_core)(
const Scheduler::ThreadData* threadData);
bool (*should_rebalance)(
const Scheduler::ThreadData* threadData);
void (*rebalance_irqs)(bool idle);
};
extern struct scheduler_mode_operations gSchedulerLowLatencyMode;
extern struct scheduler_mode_operations gSchedulerPowerSavingMode;
namespace Scheduler {
extern scheduler_mode gCurrentModeID;
extern scheduler_mode_operations* gCurrentMode;
}
#endif // KERNEL_SCHEDULER_MODES_H
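
Each policy file (low_latency.cpp, power_saving.cpp) fills one scheduler_mode_operations table with its callbacks, and the scheduler dispatches through gCurrentMode. A reduced, self-contained sketch of that pattern follows; the struct, the callback body, and the values here are illustrative stand-ins, not the real table from this commit.

#include <cstdint>

typedef int64_t bigtime_t;

// Cut-down stand-in for scheduler_mode_operations: just enough to show the
// dispatch-through-function-pointers design.
struct mode_ops {
	const char*	name;
	bigtime_t	base_quantum;
	bool		(*should_rebalance)(int32_t threadLoad, int32_t coreLoad);
};

static bool
low_latency_should_rebalance(int32_t threadLoad, int32_t coreLoad)
{
	// Hypothetical callback body; the real heuristics live in low_latency.cpp.
	return threadLoad < coreLoad / 2;
}

static mode_ops sLowLatencyMode = {
	"low latency",
	3000,						// microseconds, illustrative value only
	low_latency_should_rebalance,
};

static mode_ops* sCurrentMode = &sLowLatencyMode;

// The scheduler core never knows which policy is active; it just calls
// through the table, e.g. sCurrentMode->should_rebalance(load, coreLoad).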

View File

@ -0,0 +1,158 @@
/*
* Copyright 2013, Paweł Dziepak, pdziepak@quarnos.org.
* Distributed under the terms of the MIT License.
*/
#include "scheduler_thread.h"
using namespace Scheduler;
ThreadData::ThreadData(Thread* thread)
:
fThread(thread)
{
Init();
}
void
ThreadData::Init()
{
fPriorityPenalty = 0;
fAdditionalPenalty = 0;
fTimeLeft = 0;
fStolenTime = 0;
fMeasureActiveTime = 0;
fMeasureTime = 0;
fLoad = 0;
fWentSleep = 0;
fWentSleepActive = 0;
fWentSleepCount = -1;
fEnqueued = false;
fCore = NULL;
}
void
ThreadData::Init(CoreEntry* core)
{
Init();
fCore = core;
}
void
ThreadData::Dump() const
{
kprintf("\tpriority_penalty:\t%" B_PRId32 "\n", fPriorityPenalty);
int32 additionalPenalty = 0;
const int kMinimalPriority = _GetMinimalPriority();
if (kMinimalPriority > 0)
additionalPenalty = fAdditionalPenalty % kMinimalPriority;
kprintf("\tadditional_penalty:\t%" B_PRId32 " (%" B_PRId32 ")\n",
additionalPenalty, fAdditionalPenalty);
kprintf("\tstolen_time:\t\t%" B_PRId64 "\n", fStolenTime);
kprintf("\tload:\t\t\t%" B_PRId32 "%%\n", fLoad / 10);
kprintf("\twent_sleep:\t\t%" B_PRId64 "\n", fWentSleep);
kprintf("\twent_sleep_active:\t%" B_PRId64 "\n", fWentSleepActive);
kprintf("\twent_sleep_count:\t%" B_PRId32 "\n", fWentSleepCount);
kprintf("\tcore:\t\t\t%" B_PRId32 "\n",
fCore != NULL ? fCore->fCoreID : -1);
if (fCore != NULL && HasCacheExpired())
kprintf("\tcache affinity has expired\n");
}
bool
ThreadData::ChooseCoreAndCPU(CoreEntry*& targetCore, CPUEntry*& targetCPU)
{
bool rescheduleNeeded = false;
if (targetCore == NULL && targetCPU != NULL)
targetCore = targetCPU->fCore;
else if (targetCore != NULL && targetCPU == NULL)
targetCPU = _ChooseCPU(targetCore, rescheduleNeeded);
else if (targetCore == NULL && targetCPU == NULL) {
targetCore = _ChooseCore();
targetCPU = _ChooseCPU(targetCore, rescheduleNeeded);
}
ASSERT(targetCore != NULL);
ASSERT(targetCPU != NULL);
fCore = targetCore;
return rescheduleNeeded;
}
bigtime_t
ThreadData::ComputeQuantum()
{
bigtime_t quantum;
if (fTimeLeft != 0)
quantum = fTimeLeft;
else
quantum = _GetBaseQuantum();
if (fThread->priority >= B_FIRST_REAL_TIME_PRIORITY)
return quantum;
quantum += fStolenTime;
fStolenTime = 0;
int32 threadCount = (fCore->fThreadCount + 1) / fCore->fCPUCount;
threadCount = max_c(threadCount, 1);
quantum = std::min(gCurrentMode->maximum_latency / threadCount, quantum);
quantum = std::max(quantum, gCurrentMode->minimal_quantum);
fTimeLeft = quantum;
fQuantumStart = system_time();
return quantum;
}
inline bigtime_t
ThreadData::_GetBaseQuantum() const
{
int32 priority = GetEffectivePriority();
const bigtime_t kQuantum0 = gCurrentMode->base_quantum;
if (priority >= B_URGENT_DISPLAY_PRIORITY)
return kQuantum0;
const bigtime_t kQuantum1
= kQuantum0 * gCurrentMode->quantum_multipliers[0];
if (priority > B_NORMAL_PRIORITY) {
return _ScaleQuantum(kQuantum1, kQuantum0, B_URGENT_DISPLAY_PRIORITY,
B_NORMAL_PRIORITY, priority);
}
const bigtime_t kQuantum2
= kQuantum0 * gCurrentMode->quantum_multipliers[1];
return _ScaleQuantum(kQuantum2, kQuantum1, B_NORMAL_PRIORITY,
B_IDLE_PRIORITY, priority);
}
/* static */ bigtime_t
ThreadData::_ScaleQuantum(bigtime_t maxQuantum, bigtime_t minQuantum,
int32 maxPriority, int32 minPriority, int32 priority)
{
ASSERT(priority <= maxPriority);
ASSERT(priority >= minPriority);
bigtime_t result = (maxQuantum - minQuantum) * (priority - minPriority);
result /= maxPriority - minPriority;
return maxQuantum - result;
}
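
_ScaleQuantum() above is a plain linear interpolation: minPriority maps to maxQuantum and maxPriority maps to minQuantum, so lower-priority threads get longer time slices. A tiny standalone copy with a worked call; the concrete quantum and priority numbers are made up for the example (Haiku's B_NORMAL_PRIORITY is 10 and B_URGENT_DISPLAY_PRIORITY is 20).

#include <cassert>
#include <cstdint>
#include <cstdio>

typedef int64_t bigtime_t;

// Same interpolation as ThreadData::_ScaleQuantum() above.
static bigtime_t
scale_quantum(bigtime_t maxQuantum, bigtime_t minQuantum, int32_t maxPriority,
	int32_t minPriority, int32_t priority)
{
	assert(priority <= maxPriority && priority >= minPriority);

	bigtime_t result = (maxQuantum - minQuantum) * (priority - minPriority);
	result /= maxPriority - minPriority;
	return maxQuantum - result;
}

int
main()
{
	// Assume a 10000 us quantum at priority 10 and 1000 us at priority 20;
	// a priority 15 thread then gets 5500 us.
	std::printf("%lld\n", (long long)scale_quantum(10000, 1000, 20, 10, 15));
	return 0;
}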

View File

@ -0,0 +1,360 @@
/*
* Copyright 2013, Paweł Dziepak, pdziepak@quarnos.org.
* Distributed under the terms of the MIT License.
*/
#ifndef KERNEL_SCHEDULER_THREAD_H
#define KERNEL_SCHEDULER_THREAD_H
#include <thread.h>
#include <util/AutoLock.h>
#include "scheduler_common.h"
#include "scheduler_cpu.h"
namespace Scheduler {
struct ThreadData : public DoublyLinkedListLinkImpl<ThreadData>,
RunQueueLinkImpl<ThreadData> {
public:
ThreadData(Thread* thread);
void Init();
void Init(CoreEntry* core);
void Dump() const;
inline bool HasCacheExpired() const;
inline bool ShouldRebalance() const;
inline int32 GetEffectivePriority() const;
inline void IncreasePenalty();
inline void CancelPenalty();
inline bool ShouldCancelPenalty() const;
bool ChooseCoreAndCPU(CoreEntry*& targetCore,
CPUEntry*& targetCPU);
inline void GoesAway();
inline void PutBack();
inline void Enqueue();
inline bool Dequeue();
inline void UpdateActivity(bigtime_t active);
inline void ComputeLoad();
inline bool HasQuantumEnded(bool wasPreempted, bool hasYielded);
bigtime_t ComputeQuantum();
inline Thread* GetThread() const { return fThread; }
inline int32 GetLoad() const { return fLoad; }
inline CoreEntry* GetCore() const { return fCore; }
inline void UnassignCore() { fCore = NULL; }
bigtime_t fStolenTime;
bigtime_t fQuantumStart;
bigtime_t fLastInterruptTime;
bigtime_t fWentSleep;
bigtime_t fWentSleepActive;
int32 fWentSleepCount;
bool fEnqueued;
private:
inline int32 _GetPenalty() const;
inline int32 _GetMinimalPriority() const;
inline CoreEntry* _ChooseCore() const;
inline CPUEntry* _ChooseCPU(CoreEntry* core,
bool& rescheduleNeeded) const;
inline bigtime_t _GetBaseQuantum() const;
static bigtime_t _ScaleQuantum(bigtime_t maxQuantum,
bigtime_t minQuantum, int32 maxPriority,
int32 minPriority, int32 priority);
Thread* fThread;
int32 fPriorityPenalty;
int32 fAdditionalPenalty;
bigtime_t fTimeLeft;
bigtime_t fMeasureActiveTime;
bigtime_t fMeasureTime;
int32 fLoad;
CoreEntry* fCore;
};
inline bool
ThreadData::HasCacheExpired() const
{
return gCurrentMode->has_cache_expired(this);
}
inline bool
ThreadData::ShouldRebalance() const
{
ASSERT(!gSingleCore);
return gCurrentMode->should_rebalance(this);
}
inline int32
ThreadData::GetEffectivePriority() const
{
if (thread_is_idle_thread(fThread))
return B_IDLE_PRIORITY;
if (fThread->priority >= B_FIRST_REAL_TIME_PRIORITY)
return fThread->priority;
int32 effectivePriority = fThread->priority;
effectivePriority -= _GetPenalty();
ASSERT(effectivePriority < B_FIRST_REAL_TIME_PRIORITY);
ASSERT(effectivePriority >= B_LOWEST_ACTIVE_PRIORITY);
return effectivePriority;
}
inline void
ThreadData::IncreasePenalty()
{
if (fThread->priority < B_LOWEST_ACTIVE_PRIORITY)
return;
if (fThread->priority >= B_FIRST_REAL_TIME_PRIORITY)
return;
TRACE("increasing thread %ld penalty\n", fThread->id);
int32 oldPenalty = fPriorityPenalty++;
ASSERT(fThread->priority - oldPenalty >= B_LOWEST_ACTIVE_PRIORITY);
const int kMinimalPriority = _GetMinimalPriority();
if (fThread->priority - oldPenalty <= kMinimalPriority) {
fPriorityPenalty = oldPenalty;
fAdditionalPenalty++;
}
}
inline void
ThreadData::CancelPenalty()
{
if (fPriorityPenalty != 0)
TRACE("cancelling thread %ld penalty\n", fThread->id);
fAdditionalPenalty = 0;
fPriorityPenalty = 0;
}
inline bool
ThreadData::ShouldCancelPenalty() const
{
if (fCore == NULL)
return false;
return atomic_get(&fCore->fStarvationCounter) != fWentSleepCount
&& system_time() - fWentSleep > gCurrentMode->base_quantum;
}
inline void
ThreadData::GoesAway()
{
fLastInterruptTime = 0;
fWentSleep = system_time();
fWentSleepActive = atomic_get64(&fCore->fActiveTime);
fWentSleepCount = atomic_get(&fCore->fStarvationCounter);
}
inline void
ThreadData::PutBack()
{
ComputeLoad();
fWentSleepCount = -1;
int32 priority = GetEffectivePriority();
SpinLocker runQueueLocker(fCore->fQueueLock);
ASSERT(!fEnqueued);
fEnqueued = true;
if (fThread->pinned_to_cpu > 0) {
ASSERT(fThread->cpu != NULL);
CPUEntry* cpu = &gCPUEntries[fThread->cpu->cpu_num];
cpu->fRunQueue.PushFront(this, priority);
} else {
fCore->fRunQueue.PushFront(this, priority);
atomic_add(&fCore->fThreadCount, 1);
}
}
inline void
ThreadData::Enqueue()
{
fThread->state = B_THREAD_READY;
ComputeLoad();
fWentSleepCount = 0;
int32 priority = GetEffectivePriority();
SpinLocker runQueueLocker(fCore->fQueueLock);
ASSERT(!fEnqueued);
fEnqueued = true;
if (fThread->pinned_to_cpu > 0) {
ASSERT(fThread->previous_cpu != NULL);
CPUEntry* cpu = &gCPUEntries[fThread->previous_cpu->cpu_num];
cpu->fRunQueue.PushBack(this, priority);
} else {
fCore->fRunQueue.PushBack(this, priority);
fCore->fThreadList.Insert(this);
atomic_add(&fCore->fThreadCount, 1);
}
}
inline bool
ThreadData::Dequeue()
{
SpinLocker runQueueLocker(fCore->fQueueLock);
if (!fEnqueued)
return false;
fEnqueued = false;
if (fThread->pinned_to_cpu > 0) {
ASSERT(fThread->previous_cpu != NULL);
CPUEntry* cpu = &gCPUEntries[fThread->previous_cpu->cpu_num];
cpu->fRunQueue.Remove(this);
} else {
fCore->fRunQueue.Remove(this);
ASSERT(fWentSleepCount < 1);
if (fWentSleepCount == 0)
fCore->fThreadList.Remove(this);
atomic_add(&fCore->fThreadCount, -1);
}
return true;
}
inline void
ThreadData::UpdateActivity(bigtime_t active)
{
fMeasureActiveTime += active;
gCPUEntries[smp_get_current_cpu()].fMeasureActiveTime += active;
atomic_add64(&fCore->fActiveTime, active);
}
inline void
ThreadData::ComputeLoad()
{
if (fLastInterruptTime > 0) {
bigtime_t interruptTime = gCPU[smp_get_current_cpu()].interrupt_time;
interruptTime -= fLastInterruptTime;
fMeasureActiveTime -= interruptTime;
}
compute_load(fMeasureTime, fMeasureActiveTime, fLoad);
}
inline bool
ThreadData::HasQuantumEnded(bool wasPreempted, bool hasYielded)
{
if (hasYielded) {
fTimeLeft = 0;
return true;
}
bigtime_t timeUsed = system_time() - fQuantumStart;
fTimeLeft -= timeUsed;
fTimeLeft = std::max(fTimeLeft, bigtime_t(0));
// too little time left; it's better to make the next quantum a bit longer
if (wasPreempted || fTimeLeft <= gCurrentMode->minimal_quantum) {
fStolenTime += fTimeLeft;
fTimeLeft = 0;
}
return fTimeLeft == 0;
}
inline int32
ThreadData::_GetPenalty() const
{
int32 penalty = fPriorityPenalty;
const int kMinimalPriority = _GetMinimalPriority();
if (kMinimalPriority > 0)
penalty += fAdditionalPenalty % kMinimalPriority;
return penalty;
}
inline int32
ThreadData::_GetMinimalPriority() const
{
const int32 kDivisor = 5;
const int32 kMaximalPriority = 25;
const int32 kMinimalPriority = B_LOWEST_ACTIVE_PRIORITY;
int32 priority = fThread->priority / kDivisor;
return std::max(std::min(priority, kMaximalPriority), kMinimalPriority);
}
inline CoreEntry*
ThreadData::_ChooseCore() const
{
ASSERT(!gSingleCore);
return gCurrentMode->choose_core(this);
}
inline CPUEntry*
ThreadData::_ChooseCPU(CoreEntry* core, bool& rescheduleNeeded) const
{
SpinLocker cpuLocker(core->fCPULock);
CPUEntry* cpu = core->fCPUHeap.PeekMinimum();
ASSERT(cpu != NULL);
int32 threadPriority = GetEffectivePriority();
if (CPUPriorityHeap::GetKey(cpu) < threadPriority) {
cpu->UpdatePriority(threadPriority);
rescheduleNeeded = true;
} else
rescheduleNeeded = false;
return cpu;
}
} // namespace Scheduler
#endif // KERNEL_SCHEDULER_THREAD_H
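
One detail worth calling out in HasQuantumEnded()/ComputeQuantum() above: a thread preempted near the end of its slice does not simply lose the remainder, the leftover is banked in fStolenTime and repaid on the next quantum. A simplified, self-contained sketch of that accounting (no locking, the clamping against maximum_latency is omitted, and the minimal quantum value is assumed):

#include <algorithm>
#include <cstdint>

typedef int64_t bigtime_t;

const bigtime_t kMinimalQuantum = 500;	// microseconds, assumed for the sketch

struct QuantumState {
	bigtime_t timeLeft = 0;
	bigtime_t stolenTime = 0;
};

// Called when a thread stops running: returns true if its quantum is over.
// A leftover too small to be useful is "stolen" and repaid later.
static bool
has_quantum_ended(QuantumState& state, bigtime_t timeUsed, bool wasPreempted)
{
	state.timeLeft = std::max<bigtime_t>(state.timeLeft - timeUsed, 0);

	if (wasPreempted || state.timeLeft <= kMinimalQuantum) {
		state.stolenTime += state.timeLeft;
		state.timeLeft = 0;
	}
	return state.timeLeft == 0;
}

// Called when the thread is scheduled again: the banked remainder is added
// to the fresh base quantum, so short slices are not simply lost.
static bigtime_t
compute_quantum(QuantumState& state, bigtime_t baseQuantum)
{
	bigtime_t quantum = state.timeLeft != 0 ? state.timeLeft : baseQuantum;
	quantum += state.stolenTime;
	state.stolenTime = 0;
	state.timeLeft = quantum;
	return quantum;
}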