diff --git a/share/man/man9/pcu.9 b/share/man/man9/pcu.9
index 322a16514820..e21c48f2aa3e 100644
--- a/share/man/man9/pcu.9
+++ b/share/man/man9/pcu.9
@@ -1,4 +1,4 @@
-.\"	$NetBSD: pcu.9,v 1.6 2014/05/15 23:59:05 wiz Exp $
+.\"	$NetBSD: pcu.9,v 1.7 2014/05/16 00:48:41 rmind Exp $
 .\"
 .\" Copyright (c) 2012-2014 The NetBSD Foundation, Inc.
 .\" All rights reserved.
@@ -27,7 +27,7 @@
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd May 12, 2014
+.Dd May 15, 2014
 .Dt PCU 9
 .Os
 .Sh NAME
@@ -42,14 +42,14 @@
 .Ft void
 .Fn pcu_save_all "lwp_t *l"
 .Ft void
-.Fn pcu_discard "const pcu_ops_t *pcu"
+.Fn pcu_discard "const pcu_ops_t *pcu" "bool valid"
 .Ft bool
-.Fn pcu_used_p "const pcu_ops_t *pcu"
+.Fn pcu_valid_p "const pcu_ops_t *pcu"
 .\" -----
 .Sh DESCRIPTION
 Per CPU Unit (PCU) is an interface to manage synchronization of any
 per-CPU context (unit) tied to an LWP context.
-Typical use of PCU is for "lazy-switch" synchronisation of FPU state.
+Typical use of PCU is for "lazy-switch" synchronization of the FPU state.
 Each PCU has its operations defined by a
 .Vt pcu_ops_t
 structure.
@@ -58,62 +58,55 @@ Members of
 are
 .Bd -literal
         u_int	pcu_id;
-        void	(*pcu_state_save)(lwp_t *l, u_int flags);
+        void	(*pcu_state_save)(lwp_t *l);
         void	(*pcu_state_load)(lwp_t *l, u_int flags);
-        void	(*pcu_state_release)(lwp_t *l, u_int flags);
+        void	(*pcu_state_release)(lwp_t *l);
 .Ed
 .Pp
-The PCU operations take a
-.Ar flags
-parameter which is a bitmask that always can have one of the general
-bits set:
-.Bl -tag -width PCU_KERNELXXX
-.It Dv PCU_USER
-PCU state is for the user
-.It Dv PCU_KERNEL
-PCU state is for the kernel
-.El
-.Pp
-.Bl -tag -width pcu_state_saveXXX
 .It Fn pcu_state_save
-save the current CPU's state into the given LWP's MD storage.
+indicate to MD code that the PCU state on the current CPU should be
+saved into the given LWP's MD storage.
 .It Fn pcu_state_load
 load PCU state from the given LWP's MD storage to the current CPU.
 The
 .Ar flags
-is a combination of the above general flags and one or more of
-the following:
-.Bl -tag -width PCU_LOADEDXXX
-.It Dv PCU_RELOAD
-Load registers into the PCU
-.It Dv PCU_ENABLE
-Enable the PCU
-.It Dv PCU_LOADED
-the current LWP has used this PCU before
+is a combination of one or more of the following:
+.Bl -tag -width PCU_VALIDXXX
+.It Dv PCU_VALID
+Indicates that the PCU state is considered valid and need not be initialized
+This is a case if the PCU state was already used (and thus loaded) by the LWP
+and has not been discarded since.
+.It Dv PCU_REENABLE
+Indicates that a fault reoccurred while the PCU state is loaded,
+therefore PCU should be re-enabled.
+This is applicable only in some cases, e.g. ARM NEON extensions.
 .El
 .It Fn pcu_state_release
 indicate to MD code that the PCU ownership by the LWP was released,
 therefore the next use of PCU on the LWP shall be detected and
 .Fn pcu_load
-be called to reacquire ownership.
-For example, this would often be the changing of a bit for a CPU to
+be called to reacquire the ownership.
+For example, this would normally be the changing of a bit for a CPU to
 trap on the execution of one of the PCU's instructions.
 .El
 .Sh FUNCTIONS
 .Bl -tag -width pcu_save_allXXX
 .It Fn pcu_load
-Load (initialize) the PCU state of the current LWP on the current CPU.
+Load or initialize the PCU state of the current LWP on the current CPU.
 .It Fn pcu_save
 Save the specified PCU state to the given LWP.
 .It Fn pcu_discard
 Discard the specified PCU state of the current LWP.
-.It Fn pcu_used_p
-Return true if PCU was used (i.e.,
+The PCU state will be considered invalid,
+unless the "valid" parameter is set to true.
+.It Fn pcu_valid_p
+Return true if PCU state is considered valid.
+Generally, it always becomes "valid" when
 .Fn pcu_load
-was called) by the LWP.
+is called by the LWP.
 Otherwise, return false.
 .It Fn pcu_save_all
-Save all PCU state of the given LWP, so that it could be used later.
+Save all PCU states of the given LWP, so that they could be used later.
 .El
 .\" -----
 .Sh CODE REFERENCES
diff --git a/sys/arch/alpha/alpha/fp_complete.c b/sys/arch/alpha/alpha/fp_complete.c
index 202cbc7e57ce..c02c1abd4993 100644
--- a/sys/arch/alpha/alpha/fp_complete.c
+++ b/sys/arch/alpha/alpha/fp_complete.c
@@ -1,4 +1,4 @@
-/* $NetBSD: fp_complete.c,v 1.15 2012/12/26 19:13:19 matt Exp $ */
+/* $NetBSD: fp_complete.c,v 1.16 2014/05/16 00:48:41 rmind Exp $ */
 
 /*-
  * Copyright (c) 2001 Ross Harvey
@@ -35,7 +35,7 @@
 
 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
 
-__KERNEL_RCSID(0, "$NetBSD: fp_complete.c,v 1.15 2012/12/26 19:13:19 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: fp_complete.c,v 1.16 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_compat_osf1.h"
 
@@ -731,7 +731,7 @@ fpu_state_load(struct lwp *l, u_int flags)
 	 * If a process has used FP, count a "used FP, and took
 	 * a trap to use it again" event.
 	 */
-	if (!fpu_used_p(l)) {
+	if ((flags & PCU_VALID) == 0) {
 		atomic_inc_ulong(&fpevent_use.ev_count);
 		fpu_mark_used(l);
 	} else
@@ -749,7 +749,7 @@ fpu_state_load(struct lwp *l, u_int flags)
  */
 
 void
-fpu_state_save(struct lwp *l, u_int flags)
+fpu_state_save(struct lwp *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
@@ -762,7 +762,7 @@ fpu_state_save(struct lwp *l, u_int flags)
  * Release the FPU.
  */
 void
-fpu_state_release(struct lwp *l, u_int flags)
+fpu_state_release(struct lwp *l)
 {
 	l->l_md.md_flags &= ~MDLWP_FPACTIVE;
 }
diff --git a/sys/arch/alpha/include/alpha.h b/sys/arch/alpha/include/alpha.h
index 16a62597a8ae..7f517023ed24 100644
--- a/sys/arch/alpha/include/alpha.h
+++ b/sys/arch/alpha/include/alpha.h
@@ -1,4 +1,4 @@
-/* $NetBSD: alpha.h,v 1.32 2013/08/22 19:50:54 drochner Exp $ */
+/* $NetBSD: alpha.h,v 1.33 2014/05/16 00:48:41 rmind Exp $ */
 
 /*
  * Copyright (c) 1988 University of Utah.
@@ -110,8 +110,8 @@ char *	dot_conv(unsigned long);
 extern const pcu_ops_t fpu_ops;
 
 void    fpu_state_load(struct lwp *, u_int);
-void    fpu_state_save(struct lwp *, u_int);
-void    fpu_state_release(struct lwp *, u_int);
+void    fpu_state_save(struct lwp *);
+void    fpu_state_release(struct lwp *);
 
 static inline void
 fpu_load(void)
diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S
index c433ed342c8c..94a38921488b 100644
--- a/sys/arch/amd64/amd64/locore.S
+++ b/sys/arch/amd64/amd64/locore.S
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.75 2013/12/01 01:05:16 christos Exp $	*/
+/*	$NetBSD: locore.S,v 1.76 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*
  * Copyright-o-rama!
@@ -1017,9 +1017,8 @@ ENTRY(cpu_switchto)
 	jne	5f
 
 	/*
-	 * Restore cr0 (including FPU state).  Raise the IPL to IPL_HIGH.
-	 * FPU IPIs can alter the LWP's saved cr0.  Dropping the priority
-	 * is deferred until mi_switch(), when cpu_switchto() returns.
+	 * Restore cr0 including FPU state (may have CR0_TS set).  Note that
+	 * IPL_SCHED prevents from FPU interrupt altering the LWP's saved cr0.
 	 */
 2:
 #ifndef XEN
diff --git a/sys/arch/arm/vfp/vfp_init.c b/sys/arch/arm/vfp/vfp_init.c
index ab4d05b51837..5780986c0dce 100644
--- a/sys/arch/arm/vfp/vfp_init.c
+++ b/sys/arch/arm/vfp/vfp_init.c
@@ -1,4 +1,4 @@
-/*      $NetBSD: vfp_init.c,v 1.38 2014/04/06 00:54:52 matt Exp $ */
+/*      $NetBSD: vfp_init.c,v 1.39 2014/05/16 00:48:41 rmind Exp $ */
 
 /*
  * Copyright (c) 2008 ARM Ltd
@@ -131,8 +131,8 @@ static int neon_handler(u_int, u_int, trapframe_t *, int);
 #endif
 
 static void vfp_state_load(lwp_t *, u_int);
-static void vfp_state_save(lwp_t *, u_int);
-static void vfp_state_release(lwp_t *, u_int);
+static void vfp_state_save(lwp_t *);
+static void vfp_state_release(lwp_t *);
 
 const pcu_ops_t arm_vfp_ops = {
 	.pcu_id = PCU_FPU,
@@ -475,24 +475,6 @@ static void
 vfp_state_load(lwp_t *l, u_int flags)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
-
-	KASSERT(flags & PCU_ENABLE);
-
-	if (flags & PCU_KERNEL) {
-		if ((flags & PCU_LOADED) == 0) {
-			pcb->pcb_kernel_vfp.vfp_fpexc = pcb->pcb_vfp.vfp_fpexc;
-		}
-		pcb->pcb_vfp.vfp_fpexc = VFP_FPEXC_EN;
-		armreg_fpexc_write(pcb->pcb_vfp.vfp_fpexc);
-		/*
-		 * Load the kernel registers (just the first 16) if they've
-		 * been used..
-		 */
-		if (flags & PCU_LOADED) {
-			load_vfpregs_lo(pcb->pcb_kernel_vfp.vfp_regs);
-		}
-		return;
-	}
 	struct vfpreg * const fregs = &pcb->pcb_vfp;
 
 	/*
@@ -503,55 +485,53 @@ vfp_state_load(lwp_t *l, u_int flags)
 	 * If a process has used the VFP, count a "used VFP, and took
 	 * a trap to use it again" event.
 	 */
-	if (__predict_false((flags & PCU_LOADED) == 0)) {
-		KASSERT(flags & PCU_RELOAD);
+	if (__predict_false((flags & PCU_VALID) == 0)) {
 		curcpu()->ci_vfp_evs[0].ev_count++;
 		pcb->pcb_vfp.vfp_fpscr = vfp_fpscr_default;
 	} else {
 		curcpu()->ci_vfp_evs[1].ev_count++;
 	}
 
-	uint32_t fpexc = armreg_fpexc_read();
-	if (flags & PCU_RELOAD) {
-		bool enabled = fregs->vfp_fpexc & VFP_FPEXC_EN;
-
-		/*
-		 * Load and Enable the VFP (so that we can write the
-		 * registers).
-		 */
-		fregs->vfp_fpexc |= VFP_FPEXC_EN;
-		armreg_fpexc_write(fregs->vfp_fpexc);
-		if (enabled) {
-			/*
-			 * If we think the VFP is enabled, it must have be
-			 * disabled by vfp_state_release for another LWP so
-			 * we can now just return.
-			 */
-			return;
-		}
-
-		load_vfpregs(fregs);
-		armreg_fpscr_write(fregs->vfp_fpscr);
-
-		if (fregs->vfp_fpexc & VFP_FPEXC_EX) {
-			/* Need to restore the exception handling state.  */
-			armreg_fpinst2_write(fregs->vfp_fpinst2);
-			if (fregs->vfp_fpexc & VFP_FPEXC_FP2V)
-				armreg_fpinst_write(fregs->vfp_fpinst);
-		}
-	} else {
-		/*
-		 * If the VFP is already enabled we must be bouncing an
-		 * instruction.
-		 */
+	/*
+	 * If the VFP is already enabled we must be bouncing an instruction.
+	 */
+	if (flags & PCU_REENABLE) {
+		uint32_t fpexc = armreg_fpexc_read();
 		armreg_fpexc_write(fpexc | VFP_FPEXC_EN);
+		return;
+	}
+
+	/*
+	 * Load and Enable the VFP (so that we can write the registers).
+	 */
+	bool enabled = fregs->vfp_fpexc & VFP_FPEXC_EN;
+	fregs->vfp_fpexc |= VFP_FPEXC_EN;
+	armreg_fpexc_write(fregs->vfp_fpexc);
+	if (enabled) {
+		/*
+		 * If we think the VFP is enabled, it must have be
+		 * disabled by vfp_state_release for another LWP so
+		 * we can now just return.
+		 */
+		return;
+	}
+
+	load_vfpregs(fregs);
+	armreg_fpscr_write(fregs->vfp_fpscr);
+
+	if (fregs->vfp_fpexc & VFP_FPEXC_EX) {
+		/* Need to restore the exception handling state.  */
+		armreg_fpinst2_write(fregs->vfp_fpinst2);
+		if (fregs->vfp_fpexc & VFP_FPEXC_FP2V)
+			armreg_fpinst_write(fregs->vfp_fpinst);
 	}
 }
 
 void
-vfp_state_save(lwp_t *l, u_int flags)
+vfp_state_save(lwp_t *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
+	struct vfpreg * const fregs = &pcb->pcb_vfp;
 	uint32_t fpexc = armreg_fpexc_read();
 
 	/*
@@ -561,17 +541,6 @@ vfp_state_save(lwp_t *l, u_int flags)
 	 */
 	armreg_fpexc_write((fpexc | VFP_FPEXC_EN) & ~VFP_FPEXC_EX);
 
-	if (flags & PCU_KERNEL) {
-		/*
-		 * Save the kernel set of VFP registers.
-		 * (just the first 16).
-		 */
-		save_vfpregs_lo(pcb->pcb_kernel_vfp.vfp_regs);
-		return;
-	}
-
-	struct vfpreg * const fregs = &pcb->pcb_vfp;
-
 	fregs->vfp_fpexc = fpexc;
 	if (fpexc & VFP_FPEXC_EX) {
 		/* Need to save the exception handling state */
@@ -587,22 +556,15 @@ vfp_state_save(lwp_t *l, u_int flags)
 }
 
 void
-vfp_state_release(lwp_t *l, u_int flags)
+vfp_state_release(lwp_t *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
-	if (flags & PCU_KERNEL) {
-		/*
-		 * Restore the FPEXC since we borrowed that field.
-		 */
-		pcb->pcb_vfp.vfp_fpexc = pcb->pcb_kernel_vfp.vfp_fpexc;
-	} else {
-		/*
-		 * Now mark the VFP as disabled (and our state
-		 * has been already saved or is being discarded).
-		 */
-		pcb->pcb_vfp.vfp_fpexc &= ~VFP_FPEXC_EN;
-	}
+	/*
+	 * Now mark the VFP as disabled (and our state
+	 * has been already saved or is being discarded).
+	 */
+	pcb->pcb_vfp.vfp_fpexc &= ~VFP_FPEXC_EN;
 
 	/*
 	 * Turn off the FPU so the next time a VFP instruction is issued
@@ -630,43 +592,7 @@ vfp_discardcontext(bool used_p)
 bool
 vfp_used_p(void)
 {
-	return pcu_used_p(&arm_vfp_ops);
-}
-
-void
-vfp_kernel_acquire(void)
-{
-	if (__predict_false(cpu_intr_p())) {
-		armreg_fpexc_write(VFP_FPEXC_EN);
-		if (curcpu()->ci_data.cpu_pcu_curlwp[PCU_FPU] != NULL) {
-			lwp_t * const l = curlwp;
-			struct pcb * const pcb = lwp_getpcb(l);
-			KASSERT((l->l_md.md_flags & MDLWP_VFPINTR) == 0);
-			l->l_md.md_flags |= MDLWP_VFPINTR;
-			save_vfpregs_lo(&pcb->pcb_kernel_vfp.vfp_regs[16]);
-		}
-	} else {
-		pcu_kernel_acquire(&arm_vfp_ops);
-	}
-}
-
-void
-vfp_kernel_release(void)
-{
-	if (__predict_false(cpu_intr_p())) {
-		uint32_t fpexc = 0;
-		if (curcpu()->ci_data.cpu_pcu_curlwp[PCU_FPU] != NULL) {
-			lwp_t * const l = curlwp;
-			struct pcb * const pcb = lwp_getpcb(l);
-			KASSERT(l->l_md.md_flags & MDLWP_VFPINTR);
-			load_vfpregs_lo(&pcb->pcb_kernel_vfp.vfp_regs[16]);
-			l->l_md.md_flags &= ~MDLWP_VFPINTR;
-			fpexc = pcb->pcb_vfp.vfp_fpexc;
-		}
-		armreg_fpexc_write(fpexc);
-	} else {
-		pcu_kernel_release(&arm_vfp_ops);
-	}
+	return pcu_valid_p(&arm_vfp_ops);
 }
 
 void
diff --git a/sys/arch/mips/mips/mips_dsp.c b/sys/arch/mips/mips/mips_dsp.c
index 7b25d8311a27..f2ec4cccce23 100644
--- a/sys/arch/mips/mips/mips_dsp.c
+++ b/sys/arch/mips/mips/mips_dsp.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: mips_dsp.c,v 1.3 2013/08/22 19:50:54 drochner Exp $	*/
+/*	$NetBSD: mips_dsp.c,v 1.4 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2011 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: mips_dsp.c,v 1.3 2013/08/22 19:50:54 drochner Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mips_dsp.c,v 1.4 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_multiprocessor.h"
 
@@ -46,9 +46,9 @@ __KERNEL_RCSID(0, "$NetBSD: mips_dsp.c,v 1.3 2013/08/22 19:50:54 drochner Exp $"
 #include <mips/regnum.h>
 #include <mips/pcb.h>
 
-static void mips_dsp_state_save(lwp_t *, u_int);
+static void mips_dsp_state_save(lwp_t *);
 static void mips_dsp_state_load(lwp_t *, u_int);
-static void mips_dsp_state_release(lwp_t *, u_int);
+static void mips_dsp_state_release(lwp_t *);
 
 const pcu_ops_t mips_dsp_ops = {
 	.pcu_id = PCU_DSP,
@@ -78,11 +78,11 @@ dsp_save(void)
 bool
 dsp_used_p(void)
 {
-	return pcu_used_p(&mips_dsp_ops);
+	return pcu_valid_p(&mips_dsp_ops);
 }
 
 void
-mips_dsp_state_save(lwp_t *l, u_int flags)
+mips_dsp_state_save(lwp_t *l)
 {
 	struct trapframe * const tf = l->l_md.md_utf;
 	struct pcb * const pcb = lwp_getpcb(l);
@@ -146,7 +146,7 @@ mips_dsp_state_load(lwp_t *l, u_int flags)
 	/*
 	 * If this is the first time the state is being loaded, zero it first.
 	 */
-	if (__predict_false((flags & PCU_LOADED) == 0)) {
+	if (__predict_false((flags & PCU_VALID) == 0)) {
 		memset(&pcb->pcb_dspregs, 0, sizeof(pcb->pcb_dspregs));
 	}
 
@@ -192,9 +192,8 @@ mips_dsp_state_load(lwp_t *l, u_int flags)
 }
 
 void
-mips_dsp_state_release(lwp_t *l, u_int flags)
+mips_dsp_state_release(lwp_t *l)
 {
-
 	KASSERT(l == curlwp);
 	l->l_md.md_utf->tf_regs[_R_SR] &= ~MIPS_SR_MX;
 }
diff --git a/sys/arch/mips/mips/mips_fpu.c b/sys/arch/mips/mips/mips_fpu.c
index 05795a063b0b..1a502fbf6bf4 100644
--- a/sys/arch/mips/mips/mips_fpu.c
+++ b/sys/arch/mips/mips/mips_fpu.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: mips_fpu.c,v 1.11 2013/08/22 19:50:54 drochner Exp $	*/
+/*	$NetBSD: mips_fpu.c,v 1.12 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: mips_fpu.c,v 1.11 2013/08/22 19:50:54 drochner Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mips_fpu.c,v 1.12 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_multiprocessor.h"
 
@@ -46,9 +46,9 @@ __KERNEL_RCSID(0, "$NetBSD: mips_fpu.c,v 1.11 2013/08/22 19:50:54 drochner Exp $
 #include <mips/regnum.h>
 #include <mips/pcb.h>
 
-static void mips_fpu_state_save(lwp_t *, u_int);
+static void mips_fpu_state_save(lwp_t *);
 static void mips_fpu_state_load(lwp_t *, u_int);
-static void mips_fpu_state_release(lwp_t *, u_int);
+static void mips_fpu_state_release(lwp_t *);
 
 const pcu_ops_t mips_fpu_ops = {
 	.pcu_id = PCU_FPU,
@@ -78,11 +78,11 @@ fpu_save(void)
 bool
 fpu_used_p(void)
 {
-	return pcu_used_p(&mips_fpu_ops);
+	return pcu_valid_p(&mips_fpu_ops);
 }
 
 void
-mips_fpu_state_save(lwp_t *l, u_int flags)
+mips_fpu_state_save(lwp_t *l)
 {
 	struct trapframe * const tf = l->l_md.md_utf;
 	struct pcb * const pcb = lwp_getpcb(l);
@@ -218,7 +218,7 @@ mips_fpu_state_load(lwp_t *l, u_int flags)
 	/*
 	 * If this is the first time the state is being loaded, zero it first.
 	 */
-	if (__predict_false((flags & PCU_LOADED) == 0)) {
+	if (__predict_false((flags & PCU_VALID) == 0)) {
 		memset(&pcb->pcb_fpregs, 0, sizeof(pcb->pcb_fpregs));
 	}
 
@@ -343,8 +343,7 @@ mips_fpu_state_load(lwp_t *l, u_int flags)
 }
 
 void
-mips_fpu_state_release(lwp_t *l, u_int flags)
+mips_fpu_state_release(lwp_t *l)
 {
-
 	l->l_md.md_utf->tf_regs[_R_SR] &= ~MIPS_SR_COP_1_BIT;
 }
diff --git a/sys/arch/powerpc/booke/spe.c b/sys/arch/powerpc/booke/spe.c
index 4d6a3805499b..c1a855d90141 100644
--- a/sys/arch/powerpc/booke/spe.c
+++ b/sys/arch/powerpc/booke/spe.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: spe.c,v 1.7 2013/08/23 06:19:46 matt Exp $	*/
+/*	$NetBSD: spe.c,v 1.8 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2011 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: spe.c,v 1.7 2013/08/23 06:19:46 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: spe.c,v 1.8 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_altivec.h"
 
@@ -50,8 +50,8 @@ __KERNEL_RCSID(0, "$NetBSD: spe.c,v 1.7 2013/08/23 06:19:46 matt Exp $");
 #include <powerpc/pcb.h>
 
 static void vec_state_load(lwp_t *, u_int);
-static void vec_state_save(lwp_t *, u_int);
-static void vec_state_release(lwp_t *, u_int);
+static void vec_state_save(lwp_t *);
+static void vec_state_release(lwp_t *);
 
 const pcu_ops_t vec_ops = {
 	.pcu_id = PCU_VEC,
@@ -63,7 +63,7 @@ const pcu_ops_t vec_ops = {
 bool
 vec_used_p(lwp_t *l)
 {
-	return pcu_used_p(&vec_ops);
+	return pcu_valid_p(&vec_ops);
 }
 
 void
@@ -77,7 +77,7 @@ vec_state_load(lwp_t *l, u_int flags)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
-	if (__predict_false(!vec_used_p(l))) {
+	if ((flags & PCU_VALID) == 0) {
 		memset(&pcb->pcb_vr, 0, sizeof(pcb->pcb_vr));
 		vec_mark_used(l);
 	}
@@ -109,7 +109,7 @@ vec_state_load(lwp_t *l, u_int flags)
 }
 
 void
-vec_state_save(lwp_t *l, u_int flags)
+vec_state_save(lwp_t *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
@@ -134,7 +134,7 @@ vec_state_save(lwp_t *l, u_int flags)
 }
 
 void
-vec_state_release(lwp_t *l, u_int flags)
+vec_state_release(lwp_t *l)
 {
 	/*
 	 * Turn off SPV so the next SPE instruction will cause a
diff --git a/sys/arch/powerpc/oea/altivec.c b/sys/arch/powerpc/oea/altivec.c
index 4e92b549bfcb..118d5b49275e 100644
--- a/sys/arch/powerpc/oea/altivec.c
+++ b/sys/arch/powerpc/oea/altivec.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: altivec.c,v 1.28 2013/08/23 06:21:33 matt Exp $	*/
+/*	$NetBSD: altivec.c,v 1.29 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*
  * Copyright (C) 1996 Wolfgang Solfrank.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: altivec.c,v 1.28 2013/08/23 06:21:33 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: altivec.c,v 1.29 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_multiprocessor.h"
 
@@ -50,8 +50,8 @@ __KERNEL_RCSID(0, "$NetBSD: altivec.c,v 1.28 2013/08/23 06:21:33 matt Exp $");
 #include <powerpc/psl.h>
 
 static void vec_state_load(lwp_t *, u_int);
-static void vec_state_save(lwp_t *, u_int);
-static void vec_state_release(lwp_t *, u_int);
+static void vec_state_save(lwp_t *);
+static void vec_state_release(lwp_t *);
 
 const pcu_ops_t vec_ops = {
 	.pcu_id = PCU_VEC,
@@ -63,7 +63,7 @@ const pcu_ops_t vec_ops = {
 bool
 vec_used_p(lwp_t *l)
 {
-	return pcu_used_p(&vec_ops);
+	return pcu_valid_p(&vec_ops);
 }
 
 void
@@ -77,7 +77,7 @@ vec_state_load(lwp_t *l, u_int flags)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
-	if (__predict_false(!vec_used_p(l))) {
+	if ((flags & PCU_VALID) == 0) {
 		memset(&pcb->pcb_vr, 0, sizeof(pcb->pcb_vr));
 		vec_mark_used(l);
 	}
@@ -114,7 +114,7 @@ vec_state_load(lwp_t *l, u_int flags)
 }
 
 void
-vec_state_save(lwp_t *l, u_int flags)
+vec_state_save(lwp_t *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
@@ -149,7 +149,7 @@ vec_state_save(lwp_t *l, u_int flags)
 }
 
 void
-vec_state_release(lwp_t *l, u_int flags)
+vec_state_release(lwp_t *l)
 {
 	__asm volatile("dssall;sync");
 	l->l_md.md_utf->tf_srr1 &= ~PSL_VEC;
diff --git a/sys/arch/powerpc/powerpc/fpu.c b/sys/arch/powerpc/powerpc/fpu.c
index cf1a465159e4..f0a615e8495b 100644
--- a/sys/arch/powerpc/powerpc/fpu.c
+++ b/sys/arch/powerpc/powerpc/fpu.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: fpu.c,v 1.34 2013/08/23 06:19:46 matt Exp $	*/
+/*	$NetBSD: fpu.c,v 1.35 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*
  * Copyright (C) 1996 Wolfgang Solfrank.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.34 2013/08/23 06:19:46 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.35 2014/05/16 00:48:41 rmind Exp $");
 
 #include "opt_multiprocessor.h"
 
@@ -49,8 +49,8 @@ __KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.34 2013/08/23 06:19:46 matt Exp $");
 
 #ifdef PPC_HAVE_FPU
 static void fpu_state_load(lwp_t *, u_int);
-static void fpu_state_save(lwp_t *, u_int);
-static void fpu_state_release(lwp_t *, u_int);
+static void fpu_state_save(lwp_t *);
+static void fpu_state_release(lwp_t *);
 #endif
 
 const pcu_ops_t fpu_ops = {
@@ -65,7 +65,7 @@ const pcu_ops_t fpu_ops = {
 bool
 fpu_used_p(lwp_t *l)
 {
-	return pcu_used_p(&fpu_ops);
+	return pcu_valid_p(&fpu_ops);
 }
 
 void
@@ -80,7 +80,7 @@ fpu_state_load(lwp_t *l, u_int flags)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
-	if (__predict_false(!fpu_used_p(l))) {
+	if ((flags & PCU_VALID) == 0) {
 		memset(&pcb->pcb_fpu, 0, sizeof(pcb->pcb_fpu));
 	}
 
@@ -102,7 +102,7 @@ fpu_state_load(lwp_t *l, u_int flags)
  * Save the contents of the current CPU's FPU to its PCB.
  */
 void
-fpu_state_save(lwp_t *l, u_int flags)
+fpu_state_save(lwp_t *l)
 {
 	struct pcb * const pcb = lwp_getpcb(l);
 
@@ -118,7 +118,7 @@ fpu_state_save(lwp_t *l, u_int flags)
 }
 
 void
-fpu_state_release(lwp_t *l, u_int flags)
+fpu_state_release(lwp_t *l)
 {
 	l->l_md.md_utf->tf_srr1 &= ~PSL_FP;
 }
@@ -211,7 +211,7 @@ fpu_save_to_mcontext(lwp_t *l, mcontext_t *mcp, unsigned int *flagp)
 {
 	KASSERT(l == curlwp);
 
-	if (!pcu_used_p(&fpu_ops))
+	if (!pcu_valid_p(&fpu_ops))
 		return false;
 
 	struct pcb * const pcb = lwp_getpcb(l);
diff --git a/sys/kern/subr_pcu.c b/sys/kern/subr_pcu.c
index 12b60bcba02e..5400d65ffad0 100644
--- a/sys/kern/subr_pcu.c
+++ b/sys/kern/subr_pcu.c
@@ -1,7 +1,7 @@
-/*	$NetBSD: subr_pcu.c,v 1.17 2014/01/23 17:32:03 skrll Exp $	*/
+/*	$NetBSD: subr_pcu.c,v 1.18 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*-
- * Copyright (c) 2011 The NetBSD Foundation, Inc.
+ * Copyright (c) 2011, 2014 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -38,26 +38,21 @@
  *	PCU state may be loaded only by the current LWP, that is, curlwp.
  *	Therefore, only LWP itself can set a CPU for lwp_t::l_pcu_cpu[id].
  *
- *	Request for a PCU release can be from owner LWP (whether PCU state
- *	is on current CPU or remote CPU) or any other LWP running on that
- *	CPU (in such case, owner LWP is on a remote CPU or sleeping).
+ *	There are some important rules about operation calls.  The request
+ *	for a PCU release can be from a) the owner LWP (regardless whether
+ *	the PCU state is on the current CPU or remote CPU) b) any other LWP
+ *	running on that CPU (in such case, the owner LWP is on a remote CPU
+ *	or sleeping).
  *
- *	In any case, PCU state can only be changed from the running CPU.
- *	If said PCU state is on the remote CPU, a cross-call will be sent
- *	by the owner LWP.  Therefore struct cpu_info::ci_pcu_curlwp[id]
- *	may only be changed by current CPU, and lwp_t::l_pcu_cpu[id] may
- *	only be unset by the CPU which has PCU state loaded.
- *
- *	There is a race condition: LWP may have a PCU state on a remote CPU,
- *	which it requests to be released via cross-call.  At the same time,
- *	other LWP on remote CPU might release existing PCU state and load
- *	its own one.  Cross-call may arrive after this and release different
- *	PCU state than intended.  In such case, such LWP would re-load its
- *	PCU state again.
+ *	In any case, the PCU state can *only* be changed from the current
+ *	CPU.  If said PCU state is on the remote CPU, a cross-call will be
+ *	sent by the owner LWP.  Therefore struct cpu_info::ci_pcu_curlwp[id]
+ *	may only be changed by the current CPU and lwp_t::l_pcu_cpu[id] may
+ *	only be cleared by the CPU which has the PCU state loaded.
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: subr_pcu.c,v 1.17 2014/01/23 17:32:03 skrll Exp $");
+__KERNEL_RCSID(0, "$NetBSD: subr_pcu.c,v 1.18 2014/05/16 00:48:41 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/cpu.h>
@@ -68,17 +63,25 @@ __KERNEL_RCSID(0, "$NetBSD: subr_pcu.c,v 1.17 2014/01/23 17:32:03 skrll Exp $");
 #if PCU_UNIT_COUNT > 0
 
 static inline void pcu_do_op(const pcu_ops_t *, lwp_t * const, const int);
-static void pcu_cpu_op(const pcu_ops_t *, const int);
 static void pcu_lwp_op(const pcu_ops_t *, lwp_t *, const int);
 
-__CTASSERT(PCU_KERNEL == 1);
+/*
+ * Internal PCU commands for the pcu_do_op() function.
+ */
+#define	PCU_CMD_SAVE		0x01	/* save PCU state to the LWP */
+#define	PCU_CMD_RELEASE		0x02	/* release PCU state on the CPU */
 
-#define	PCU_SAVE	(PCU_LOADED << 1) /* Save PCU state to the LWP. */
-#define	PCU_RELEASE	(PCU_SAVE << 1)	/* Release PCU state on the CPU. */
-#define	PCU_CLAIM	(PCU_RELEASE << 1)	/* CLAIM a PCU for a LWP. */
+/*
+ * Message structure for another CPU passed via xcall(9).
+ */
+typedef struct {
+	const pcu_ops_t *pcu;
+	lwp_t *		owner;
+	const int	flags;
+} pcu_xcall_msg_t;
 
-/* XXX */
-extern const pcu_ops_t * const	pcu_ops_md_defs[];
+/* PCU operations structure provided by the MD code. */
+extern const pcu_ops_t * const pcu_ops_md_defs[];
 
 /*
  * pcu_switchpoint: release PCU state if the LWP is being run on another CPU.
@@ -86,44 +89,22 @@ extern const pcu_ops_t * const	pcu_ops_md_defs[];
  * On each context switches, called by mi_switch() with IPL_SCHED.
  * 'l' is an LWP which is just we switched to.  (the new curlwp)
  */
-
 void
 pcu_switchpoint(lwp_t *l)
 {
-	const uint32_t pcu_kernel_inuse = l->l_pcu_used[PCU_KERNEL];
-	uint32_t pcu_user_inuse = l->l_pcu_used[PCU_USER];
+	const uint32_t pcu_valid = l->l_pcu_valid;
 	/* int s; */
 
 	KASSERTMSG(l == curlwp, "l %p != curlwp %p", l, curlwp);
 
-	if (__predict_false(pcu_kernel_inuse != 0)) {
-		for (u_int id = 0; id < PCU_UNIT_COUNT; id++) {
-			if ((pcu_kernel_inuse & (1 << id)) == 0) {
-				continue;
-			}
-			struct cpu_info * const pcu_ci = l->l_pcu_cpu[id];
-			if (pcu_ci == NULL || pcu_ci == l->l_cpu) {
-				continue;
-			}
-			const pcu_ops_t * const pcu = pcu_ops_md_defs[id];
-			/*
-			 * Steal the PCU away from the current owner and
-			 * take ownership of it.
-			 */
-			pcu_cpu_op(pcu, PCU_SAVE | PCU_RELEASE);
-			pcu_do_op(pcu, l, PCU_KERNEL | PCU_CLAIM | PCU_RELOAD);
-			pcu_user_inuse &= ~(1 << id);
-		}
-	}
-
-	if (__predict_true(pcu_user_inuse == 0)) {
+	if (__predict_true(pcu_valid == 0)) {
 		/* PCUs are not in use. */
 		return;
 	}
 	/* commented out as we know we are already at IPL_SCHED */
 	/* s = splsoftserial(); */
 	for (u_int id = 0; id < PCU_UNIT_COUNT; id++) {
-		if ((pcu_user_inuse & (1 << id)) == 0) {
+		if ((pcu_valid & (1U << id)) == 0) {
 			continue;
 		}
 		struct cpu_info * const pcu_ci = l->l_pcu_cpu[id];
@@ -131,7 +112,7 @@ pcu_switchpoint(lwp_t *l)
 			continue;
 		}
 		const pcu_ops_t * const pcu = pcu_ops_md_defs[id];
-		pcu->pcu_state_release(l, 0);
+		pcu->pcu_state_release(l);
 	}
 	/* splx(s); */
 }
@@ -141,35 +122,29 @@ pcu_switchpoint(lwp_t *l)
  *
  * Used by exec and LWP exit.
  */
-
 void
 pcu_discard_all(lwp_t *l)
 {
-	const uint32_t pcu_inuse = l->l_pcu_used[PCU_USER];
+	const uint32_t pcu_valid = l->l_pcu_valid;
 
-	KASSERT(l == curlwp || ((l->l_flag & LW_SYSTEM) && pcu_inuse == 0));
-	KASSERT(l->l_pcu_used[PCU_KERNEL] == 0);
+	KASSERT(l == curlwp || ((l->l_flag & LW_SYSTEM) && pcu_valid == 0));
 
-	if (__predict_true(pcu_inuse == 0)) {
+	if (__predict_true(pcu_valid == 0)) {
 		/* PCUs are not in use. */
 		return;
 	}
 	const int s = splsoftserial();
 	for (u_int id = 0; id < PCU_UNIT_COUNT; id++) {
-		if ((pcu_inuse & (1 << id)) == 0) {
+		if ((pcu_valid & (1U << id)) == 0) {
 			continue;
 		}
 		if (__predict_true(l->l_pcu_cpu[id] == NULL)) {
 			continue;
 		}
 		const pcu_ops_t * const pcu = pcu_ops_md_defs[id];
-		/*
-		 * We aren't releasing since this LWP isn't giving up PCU,
-		 * just saving it.
-		 */
-		pcu_lwp_op(pcu, l, PCU_RELEASE);
+		pcu_lwp_op(pcu, l, PCU_CMD_RELEASE);
 	}
-	l->l_pcu_used[PCU_USER] = 0;
+	l->l_pcu_valid = 0;
 	splx(s);
 }
 
@@ -177,35 +152,33 @@ pcu_discard_all(lwp_t *l)
  * pcu_save_all: save PCU state of the given LWP so that eg. coredump can
  * examine it.
  */
-
 void
 pcu_save_all(lwp_t *l)
 {
-	const uint32_t pcu_inuse = l->l_pcu_used[PCU_USER];
-	/*
-	 * Unless LW_WCORE, we aren't releasing since this LWP isn't giving
-	 * up PCU, just saving it.
-	 */
-	const int flags = PCU_SAVE | (l->l_flag & LW_WCORE ? PCU_RELEASE : 0);
+	const uint32_t pcu_valid = l->l_pcu_valid;
+	int flags = PCU_CMD_SAVE;
+
+	/* If LW_WCORE, we are also releasing the state. */
+	if (__predict_false(l->l_flag & LW_WCORE)) {
+		flags |= PCU_CMD_RELEASE;
+	}
 
 	/*
 	 * Normally we save for the current LWP, but sometimes we get called
 	 * with a different LWP (forking a system LWP or doing a coredump of
 	 * a process with multiple threads) and we need to deal with that.
 	 */
-	KASSERT(l == curlwp
-	    || (((l->l_flag & LW_SYSTEM)
-		 || (curlwp->l_proc == l->l_proc && l->l_stat == LSSUSPENDED))
-	        && pcu_inuse == 0));
-	KASSERT(l->l_pcu_used[PCU_KERNEL] == 0);
+	KASSERT(l == curlwp || (((l->l_flag & LW_SYSTEM) ||
+	    (curlwp->l_proc == l->l_proc && l->l_stat == LSSUSPENDED)) &&
+	    pcu_valid == 0));
 
-	if (__predict_true(pcu_inuse == 0)) {
+	if (__predict_true(pcu_valid == 0)) {
 		/* PCUs are not in use. */
 		return;
 	}
 	const int s = splsoftserial();
 	for (u_int id = 0; id < PCU_UNIT_COUNT; id++) {
-		if ((pcu_inuse & (1 << id)) == 0) {
+		if ((pcu_valid & (1U << id)) == 0) {
 			continue;
 		}
 		if (__predict_true(l->l_pcu_cpu[id] == NULL)) {
@@ -227,55 +200,42 @@ pcu_do_op(const pcu_ops_t *pcu, lwp_t * const l, const int flags)
 {
 	struct cpu_info * const ci = curcpu();
 	const u_int id = pcu->pcu_id;
-	u_int state_flags = flags & (PCU_KERNEL|PCU_RELOAD|PCU_ENABLE);
-	uint32_t id_mask = 1 << id;
-	const bool kernel_p = (l->l_pcu_used[PCU_KERNEL] & id_mask) != 0;
 
-	KASSERT(l->l_pcu_cpu[id] == (flags & PCU_CLAIM ? NULL : ci));
+	KASSERT(l->l_pcu_cpu[id] == ci);
 
-	if (flags & PCU_SAVE) {
-		pcu->pcu_state_save(l, (kernel_p ? PCU_KERNEL : 0));
+	if (flags & PCU_CMD_SAVE) {
+		pcu->pcu_state_save(l);
 	}
-	if (flags & PCU_RELEASE) {
-		pcu->pcu_state_release(l, state_flags);
-		if (flags & PCU_KERNEL) {
-			l->l_pcu_used[PCU_KERNEL] &= ~id_mask;
-		}
+	if (flags & PCU_CMD_RELEASE) {
+		pcu->pcu_state_release(l);
 		ci->ci_pcu_curlwp[id] = NULL;
 		l->l_pcu_cpu[id] = NULL;
 	}
-	if (flags & PCU_CLAIM) {
-		if (l->l_pcu_used[(flags & PCU_KERNEL)] & id_mask)
-			state_flags |= PCU_LOADED;
-		pcu->pcu_state_load(l, state_flags);
-		l->l_pcu_cpu[id] = ci;
-		ci->ci_pcu_curlwp[id] = l;
-		l->l_pcu_used[flags & PCU_KERNEL] |= id_mask;
-	}
-	if (flags == PCU_KERNEL) {
-		KASSERT(ci->ci_pcu_curlwp[id] == l);
-		pcu->pcu_state_save(l, 0);
-		l->l_pcu_used[PCU_KERNEL] |= id_mask;
-	}
 }
 
 /*
- * pcu_cpu_op: helper routine to call pcu_do_op() via xcall(9) or
- * by pcu_load.
+ * pcu_cpu_xcall: helper routine to call pcu_do_op() via xcall(9).
  */
 static void
-pcu_cpu_op(const pcu_ops_t *pcu, const int flags)
+pcu_cpu_xcall(void *arg1, void *arg2 __unused)
 {
+	const pcu_xcall_msg_t *pcu_msg = arg1;
+	const pcu_ops_t *pcu = pcu_msg->pcu;
 	const u_int id = pcu->pcu_id;
-	lwp_t * const l = curcpu()->ci_pcu_curlwp[id];
+	lwp_t *l = pcu_msg->owner;
 
-	//KASSERT(cpu_softintr_p());
+	KASSERT(cpu_softintr_p());
+	KASSERT(pcu_msg->owner != NULL);
 
-	/* If no state - nothing to do. */
-	if (l == NULL) {
+	if (curcpu()->ci_pcu_curlwp[id] != l) {
+		/*
+		 * Different ownership: another LWP raced with us and
+		 * perform save and release.  There is nothing to do.
+		 */
+		KASSERT(l->l_pcu_cpu[id] == NULL);
 		return;
 	}
-	pcu_do_op(pcu, l, flags);
+	pcu_do_op(pcu, l, pcu_msg->flags);
 }
 
 /*
@@ -300,7 +260,6 @@ pcu_lwp_op(const pcu_ops_t *pcu, lwp_t *l, const int flags)
 		/*
 		 * State is on the current CPU - just perform the operations.
 		 */
-		KASSERT((flags & PCU_CLAIM) == 0);
 		KASSERTMSG(ci->ci_pcu_curlwp[id] == l,
 		    "%s: cpu%u: pcu_curlwp[%u] (%p) != l (%p)",
 		     __func__, cpu_index(ci), id, ci->ci_pcu_curlwp[id], l);
@@ -308,27 +267,21 @@ pcu_lwp_op(const pcu_ops_t *pcu, lwp_t *l, const int flags)
 		splx(s);
 		return;
 	}
+	splx(s);
 
 	if (__predict_false(ci == NULL)) {
-		if (flags & PCU_CLAIM) {
-			pcu_do_op(pcu, l, flags);
-		}
 		/* Cross-call has won the race - no state to manage. */
-		splx(s);
 		return;
 	}
 
-	splx(s);
-
 	/*
-	 * State is on the remote CPU - perform the operations there.
-	 * Note: there is a race condition; see description in the top.
+	 * The state is on the remote CPU: perform the operation(s) there.
 	 */
-	where = xc_unicast(XC_HIGHPRI, (xcfunc_t)pcu_cpu_op,
-	    __UNCONST(pcu), (void *)(uintptr_t)flags, ci);
+	pcu_xcall_msg_t pcu_msg = { .pcu = pcu, .owner = l, .flags = flags };
+	where = xc_unicast(XC_HIGHPRI, pcu_cpu_xcall, &pcu_msg, NULL, ci);
 	xc_wait(where);
 
-	KASSERT((flags & PCU_RELEASE) == 0 || l->l_pcu_cpu[id] == NULL);
+	KASSERT((flags & PCU_CMD_RELEASE) == 0 || l->l_pcu_cpu[id] == NULL);
 }
 
 /*
@@ -337,9 +290,9 @@ pcu_lwp_op(const pcu_ops_t *pcu, lwp_t *l, const int flags)
 void
 pcu_load(const pcu_ops_t *pcu)
 {
+	lwp_t *oncpu_lwp, * const l = curlwp;
 	const u_int id = pcu->pcu_id;
 	struct cpu_info *ci, *curci;
-	lwp_t * const l = curlwp;
 	uint64_t where;
 	int s;
 
@@ -351,11 +304,10 @@ pcu_load(const pcu_ops_t *pcu)
 
 	/* Does this CPU already have our PCU state loaded? */
 	if (ci == curci) {
+		/* Fault happen: indicate re-enable. */
 		KASSERT(curci->ci_pcu_curlwp[id] == l);
-		KASSERT(pcu_used_p(pcu));
-
-		/* Re-enable */
-		pcu->pcu_state_load(l, PCU_LOADED | PCU_ENABLE);
+		KASSERT(pcu_valid_p(pcu));
+		pcu->pcu_state_load(l, PCU_VALID | PCU_REENABLE);
 		splx(s);
 		return;
 	}
@@ -363,9 +315,11 @@ pcu_load(const pcu_ops_t *pcu)
 	/* If PCU state of this LWP is on the remote CPU - save it there. */
 	if (ci) {
 		splx(s);
-		/* Note: there is a race; see description in the top. */
-		where = xc_unicast(XC_HIGHPRI, (xcfunc_t)pcu_cpu_op,
-		    __UNCONST(pcu), (void *)(PCU_SAVE | PCU_RELEASE), ci);
+
+		pcu_xcall_msg_t pcu_msg = { .pcu = pcu, .owner = l,
+		    .flags = PCU_CMD_SAVE | PCU_CMD_RELEASE };
+		where = xc_unicast(XC_HIGHPRI, pcu_cpu_xcall,
+		    &pcu_msg, NULL, ci);
 		xc_wait(where);
 
 		/* Enter IPL_SOFTSERIAL and re-fetch the current CPU. */
@@ -375,38 +329,44 @@ pcu_load(const pcu_ops_t *pcu)
 	KASSERT(l->l_pcu_cpu[id] == NULL);
 
 	/* Save the PCU state on the current CPU, if there is any. */
-	pcu_cpu_op(pcu, PCU_SAVE | PCU_RELEASE);
-	KASSERT(curci->ci_pcu_curlwp[id] == NULL);
+	if ((oncpu_lwp = curci->ci_pcu_curlwp[id]) != NULL) {
+		pcu_do_op(pcu, oncpu_lwp, PCU_CMD_SAVE | PCU_CMD_RELEASE);
+		KASSERT(curci->ci_pcu_curlwp[id] == NULL);
+	}
 
 	/*
 	 * Finally, load the state for this LWP on this CPU.  Indicate to
-	 * load function whether PCU was used before.  Note the usage.
+	 * the load function whether PCU state was valid before this call.
 	 */
-	pcu_do_op(pcu, l, PCU_CLAIM | PCU_ENABLE | PCU_RELOAD);
+	const bool valid = ((1U << id) & l->l_pcu_valid) != 0;
+	pcu->pcu_state_load(l, valid ? PCU_VALID : 0);
+	curci->ci_pcu_curlwp[id] = l;
+	l->l_pcu_cpu[id] = curci;
+	l->l_pcu_valid |= (1U << id);
 	splx(s);
 }
 
 /*
- * pcu_discard: discard the PCU state of current LWP.
- * If the "usesw" flag is set, pcu_used_p() will return "true".
+ * pcu_discard: discard the PCU state of current LWP.  If "valid"
+ * parameter is true, then keep considering the PCU state as valid.
  */
 void
-pcu_discard(const pcu_ops_t *pcu, bool usesw)
+pcu_discard(const pcu_ops_t *pcu, bool valid)
 {
 	const u_int id = pcu->pcu_id;
 	lwp_t * const l = curlwp;
 
 	KASSERT(!cpu_intr_p() && !cpu_softintr_p());
 
-	if (usesw)
-		l->l_pcu_used[PCU_USER] |= (1 << id);
-	else
-		l->l_pcu_used[PCU_USER] &= ~(1 << id);
-
+	if (__predict_false(valid)) {
+		l->l_pcu_valid |= (1U << id);
+	} else {
+		l->l_pcu_valid &= ~(1U << id);
+	}
 	if (__predict_true(l->l_pcu_cpu[id] == NULL)) {
 		return;
 	}
-	pcu_lwp_op(pcu, l, PCU_RELEASE);
+	pcu_lwp_op(pcu, l, PCU_CMD_RELEASE);
 }
 
 /*
@@ -423,71 +383,40 @@ pcu_save(const pcu_ops_t *pcu)
 	if (__predict_true(l->l_pcu_cpu[id] == NULL)) {
 		return;
 	}
-	pcu_lwp_op(pcu, l, PCU_SAVE | PCU_RELEASE);
+	pcu_lwp_op(pcu, l, PCU_CMD_SAVE | PCU_CMD_RELEASE);
 }
 
 /*
- * pcu_save_all_on_cpu: save all PCU state on current CPU
+ * pcu_save_all_on_cpu: save all PCU states on the current CPU.
  */
 void
 pcu_save_all_on_cpu(void)
 {
+	int s;
 
+	s = splsoftserial();
 	for (u_int id = 0; id < PCU_UNIT_COUNT; id++) {
-		pcu_cpu_op(pcu_ops_md_defs[id], PCU_SAVE | PCU_RELEASE);
+		const pcu_ops_t * const pcu = pcu_ops_md_defs[id];
+		lwp_t *l;
+
+		if ((l = curcpu()->ci_pcu_curlwp[id]) != NULL) {
+			pcu_do_op(pcu, l, PCU_CMD_SAVE | PCU_CMD_RELEASE);
+		}
 	}
+	splx(s);
 }
 
 /*
- * pcu_used: return true if PCU was used (pcu_load() case) by the LWP.
+ * pcu_valid_p: return true if PCU state is considered valid.  Generally,
+ * it always becomes "valid" when pcu_load() is called.
  */
 bool
-pcu_used_p(const pcu_ops_t *pcu)
+pcu_valid_p(const pcu_ops_t *pcu)
 {
 	const u_int id = pcu->pcu_id;
 	lwp_t * const l = curlwp;
 
-	return l->l_pcu_used[PCU_USER] & (1 << id);
-}
-
-void
-pcu_kernel_acquire(const pcu_ops_t *pcu)
-{
-	struct cpu_info * const ci = curcpu();
-	lwp_t * const l = curlwp;
-	const u_int id = pcu->pcu_id;
-
-	/*
-	 * If we own the PCU, save our user state.
-	 */
-	if (ci == l->l_pcu_cpu[id]) {
-		pcu_lwp_op(pcu, l, PCU_KERNEL);
-		return;
-	}
-	if (ci->ci_data.cpu_pcu_curlwp[id] != NULL) {
-		/*
-		 * The PCU is owned by another LWP so save its state.
-		 */
-		pcu_cpu_op(pcu, PCU_SAVE | PCU_RELEASE);
-	}
-	/*
-	 * Mark the PCU as hijacked and take ownership of it.
-	 */
-	pcu_lwp_op(pcu, l, PCU_KERNEL | PCU_CLAIM | PCU_ENABLE | PCU_RELOAD);
-}
-
-void
-pcu_kernel_release(const pcu_ops_t *pcu)
-{
-	lwp_t * const l = curlwp;
-
-	KASSERT(l->l_pcu_used[PCU_KERNEL] & (1 << pcu->pcu_id));
-
-	/*
-	 * Release the PCU, if the curlwp wants to use it, it will have incur
-	 * a trap to reenable it.
-	 */
-	pcu_lwp_op(pcu, l, PCU_KERNEL | PCU_RELEASE);
+	return (l->l_pcu_valid & (1U << id)) != 0;
 }
 
 #endif /* PCU_UNIT_COUNT > 0 */
diff --git a/sys/sys/pcu.h b/sys/sys/pcu.h
index 8d335511ea96..dbfa4cb693e5 100644
--- a/sys/sys/pcu.h
+++ b/sys/sys/pcu.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: pcu.h,v 1.11 2013/08/22 19:50:55 drochner Exp $	*/
+/*	$NetBSD: pcu.h,v 1.12 2014/05/16 00:48:41 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2011 The NetBSD Foundation, Inc.
@@ -50,31 +50,31 @@
 #if PCU_UNIT_COUNT > 0
 
 /*
- * pcu_state_save(lwp, flags)
- *	save the current CPU's state into the given LWP's MD storage.
+ * pcu_state_save(lwp)
+ *	Tells MD code to save the current CPU's state into the given
+ *	LWP's MD storage.
  *
  * pcu_state_load(lwp, flags)
- *	load PCU state from the given LWP's MD storage to the current CPU.
- *	the 'flags' argument contains PCU_LOADED if it isn't the first time
- *	the LWP has used the PCU.
+ *	Tells MD code to load PCU state from the given LWP's MD storage
+ *	to the current CPU.
  *
- * pcu_state_release(lwp, flags)
- *	tell MD code detect the next use of the PCU on the LWP, and call
- *	pcu_load().
+ * pcu_state_release(lwp)
+ *	Tells MD code detect the next use of the PCU on the LWP and
+ *	call pcu_load().
  */
 
 typedef struct {
 	u_int	pcu_id;
-	void	(*pcu_state_save)(lwp_t *, u_int);
-	void	(*pcu_state_load)(lwp_t *, u_int);
-	void	(*pcu_state_release)(lwp_t *, u_int);
+	void	(*pcu_state_save)(lwp_t *);
+	void	(*pcu_state_load)(lwp_t *, unsigned);
+	void	(*pcu_state_release)(lwp_t *);
 } pcu_ops_t;
 
-#define	PCU_USER	0x00		/* PCU state is for the user */
-#define	PCU_KERNEL	0x01		/* PCU state is for the kernel */
-#define	PCU_RELOAD	0x02		/* Load registers into the PCU, */
-#define	PCU_ENABLE	0x04		/* Enable the PCU, */
-#define	PCU_LOADED	0x08		/* LWP has used the PCU before, */
+/*
+ * Flags for the pcu_state_load() operation.
+ */
+#define	PCU_VALID	0x01	/* PCU state is considered valid */
+#define	PCU_REENABLE	0x02	/* the state is present, re-enable PCU */
 
 void	pcu_switchpoint(lwp_t *);
 void	pcu_discard_all(lwp_t *);
@@ -84,9 +84,7 @@ void	pcu_load(const pcu_ops_t *);
 void	pcu_save(const pcu_ops_t *);
 void	pcu_save_all_on_cpu(void);
 void	pcu_discard(const pcu_ops_t *, bool);
-void	pcu_kernel_acquire(const pcu_ops_t *);
-void	pcu_kernel_release(const pcu_ops_t *);
-bool	pcu_used_p(const pcu_ops_t *);
+bool	pcu_valid_p(const pcu_ops_t *);
 
 #else
 #define	pcu_switchpoint(l)