diff --git a/sys/arch/sparc/sparc/locore.s b/sys/arch/sparc/sparc/locore.s
index baeb49e84303..47f037ff91ff 100644
--- a/sys/arch/sparc/sparc/locore.s
+++ b/sys/arch/sparc/sparc/locore.s
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.s,v 1.163 2002/11/27 16:08:34 pk Exp $	*/
+/*	$NetBSD: locore.s,v 1.164 2002/11/28 15:32:15 pk Exp $	*/
 
 /*
  * Copyright (c) 1996 Paul Kranenburg
@@ -3803,7 +3803,7 @@ no_3mmu:
 	beq	3f
 	 nop
 	cmp	%g4, CPU_SUN4D
-	bne	5f
+	bne	4f
 
 3:
 	/*
@@ -3854,8 +3854,7 @@ no_3mmu:
 
 	sta	%o3, [%g0] ASI_SRMMU	! restore mmu-sreg
 	wr	%o4, 0x0, %psr		! restore psr
-	!b,a	startmap_done
-	b,a	4f
+	b,a	startmap_done
 
 	/*
 	 * The following is generic and should work on all
@@ -3874,71 +3873,6 @@ remap_notvik:
 	sta	%l4, [%o1] ASI_BYPASS
 	!b,a	startmap_done
 4:
-#define	OVERWRITE(rtn, v8_rtn, len) \
-	set	v8_rtn, %o0; \
-	set	rtn, %o1; \
-	call	_C_LABEL(bcopy); \
-	 mov	len, %o2
-
-	OVERWRITE(.mul, v8_smul, v8_smul_len)
-	OVERWRITE(.umul, v8_umul, v8_umul_len)
-	OVERWRITE(.div, v8_sdiv, v8_sdiv_len)
-	OVERWRITE(.udiv, v8_udiv, v8_udiv_len)
-	OVERWRITE(.rem, v8_srem, v8_srem_len)
-	OVERWRITE(.urem, v8_urem, v8_urem_len)
-
-#undef	OVERWRITE
-
-	b,a	startmap_done
-
-/*
- * V8 multiply and divide routines, to be copied over the code
- * for the V6/V7 routines.  Seems a shame to spend the call, but....
- * Note: while .umul and .smul return a 64-bit result in %o1%o0,
- * gcc only really cares about the low 32 bits in %o0.  This is
- * really just gcc output, cleaned up a bit.
- */
-v8_smul:
-	retl
-	 smul	%o0, %o1, %o0
-v8_smul_len = .-v8_smul
-v8_umul:
-	retl
-	 umul	%o0, %o1, %o0
-!v8_umul_len = 2 * 4
-v8_umul_len = .-v8_umul
-v8_sdiv:
-	sra	%o0, 31, %g2
-	wr	%g2, 0, %y
-	nop; nop; nop
-	retl
-	 sdiv	%o0, %o1, %o0
-v8_sdiv_len = .-v8_sdiv
-v8_udiv:
-	wr	%g0, 0, %y
-	nop; nop; nop
-	retl
-	 udiv	%o0, %o1, %o0
-v8_udiv_len = .-v8_udiv
-v8_srem:
-	sra	%o0, 31, %g3
-	wr	%g3, 0, %y
-	nop; nop; nop
-	sdiv	%o0, %o1, %g2
-	smul	%g2, %o1, %g2
-	retl
-	 sub	%o0, %g2, %o0
-v8_srem_len = .-v8_srem
-v8_urem:
-	wr	%g0, 0, %y
-	nop; nop; nop
-	udiv	%o0, %o1, %g2
-	smul	%g2, %o1, %g2
-	retl
-	 sub	%o0, %g2, %o0
-v8_urem_len = .-v8_urem
-
-5:
 #endif /* SUN4M || SUN4D */
 	! botch! We should blow up.
 
@@ -4137,6 +4071,84 @@ Lgandul:	nop
 	 clr	%o0			! our frame arg is ignored
 	/*NOTREACHED*/
 
+#if defined(SUN4M)
+	.globl	_C_LABEL(sparc_v8_muldiv)	! patch in the V8 hardware mul/div stubs
+_C_LABEL(sparc_v8_muldiv):
+	save    %sp, -CCFSZ, %sp		! own window and frame: bcopy is called below
+	/* OVERWRITE: bcopy len bytes of v8_rtn over rtn, then flush rtn from the I-cache */
+#define	OVERWRITE(rtn, v8_rtn, len)	\
+	set	v8_rtn, %o0;		\
+	set	rtn, %o1;		\
+	call	_C_LABEL(bcopy);	\
+	 mov	len, %o2;		\
+	/* now flush the insn cache; reload %o regs since bcopy may clobber them */	\
+	set	rtn, %o0;		\
+	mov	len, %o1;		\
+0:					\
+	flush	%o0;	/* acts on the doubleword containing %o0 */	\
+	subcc	%o1, 4, %o1;	/* step one insn: rtn may be only word-aligned */	\
+	bgu	0b;			\
+	 add	%o0, 4, %o0;		\
+
+	OVERWRITE(.mul, v8_smul, v8_smul_len)
+	OVERWRITE(.umul, v8_umul, v8_umul_len)
+	OVERWRITE(.div, v8_sdiv, v8_sdiv_len)
+	OVERWRITE(.udiv, v8_udiv, v8_udiv_len)
+	OVERWRITE(.rem, v8_srem, v8_srem_len)
+	OVERWRITE(.urem, v8_urem, v8_urem_len)
+#undef	OVERWRITE
+	ret
+	 restore
+
+/*
+ * V8 multiply and divide stubs, copied by sparc_v8_muldiv over the
+ * V6/V7 routines (each v8_*_len is the byte count to copy).  Seems a
+ * shame to spend the call, but....  Note: while .umul and .mul return
+ * a 64-bit result in %o1%o0, gcc only really cares about the low 32
+ * bits in %o0.  This is really just gcc output, cleaned up a bit.
+ */
+v8_smul:				! signed multiply
+	retl
+	 smul	%o0, %o1, %o0		! delay slot: %o0 = low word of %o0 * %o1
+v8_smul_len = .-v8_smul
+v8_umul:				! unsigned multiply
+	retl
+	 umul	%o0, %o1, %o0		! delay slot: %o0 = low word of %o0 * %o1
+!v8_umul_len = 2 * 4
+v8_umul_len = .-v8_umul
+v8_sdiv:				! signed divide
+	sra	%o0, 31, %g2		! %g2 = sign extension of the dividend
+	wr	%g2, 0, %y		! %y = high word of the 64-bit dividend
+	nop; nop; nop			! writes to %y are delayed; wait before dividing
+	retl
+	 sdiv	%o0, %o1, %o0		! delay slot: %o0 = %y:%o0 / %o1
+v8_sdiv_len = .-v8_sdiv
+v8_udiv:				! unsigned divide
+	wr	%g0, 0, %y		! %y = 0: high word of the 64-bit dividend
+	nop; nop; nop			! writes to %y are delayed; wait before dividing
+	retl
+	 udiv	%o0, %o1, %o0		! delay slot: %o0 = %y:%o0 / %o1
+v8_udiv_len = .-v8_udiv
+v8_srem:				! signed remainder
+	sra	%o0, 31, %g3		! %g3 = sign extension of the dividend
+	wr	%g3, 0, %y		! %y = high word of the 64-bit dividend
+	nop; nop; nop			! writes to %y are delayed; wait before dividing
+	sdiv	%o0, %o1, %g2		! %g2 = quotient
+	smul	%g2, %o1, %g2		! %g2 = quotient * divisor (low word)
+	retl
+	 sub	%o0, %g2, %o0		! delay slot: %o0 = dividend - quotient * divisor
+v8_srem_len = .-v8_srem
+v8_urem:				! unsigned remainder
+	wr	%g0, 0, %y		! %y = 0: high word of the 64-bit dividend
+	nop; nop; nop			! writes to %y are delayed; wait before dividing
+	udiv	%o0, %o1, %g2		! %g2 = quotient
+	smul	%g2, %o1, %g2		! low word is the same for smul and umul
+	retl
+	 sub	%o0, %g2, %o0		! delay slot: %o0 = dividend - quotient * divisor
+v8_urem_len = .-v8_urem
+
+#endif /* SUN4M */
+
 #if defined(MULTIPROCESSOR)
 	/*
 	 * Entry point for non-boot CPUs in MP systems.