From 01d8a9761383135b2a34757029a9a23988618b51 Mon Sep 17 00:00:00 2001 From: Stanislav Shwartsman Date: Mon, 4 Jul 2005 17:44:08 +0000 Subject: [PATCH] Try to cleanup/rewrite RepeatSpeedups optimization This code doesn't add new speedups, but it makes adding them very easy. After some validation, it should be no problem to enable the repeat speedups optimization for REP MOVSx with any address size, and for REP STOSx too. --- bochs/cpu/cpu.h | 18 +- bochs/cpu/string.cc | 1328 +++++++++++++++++++++++++------------------ 2 files changed, 783 insertions(+), 563 deletions(-) diff --git a/bochs/cpu/cpu.h b/bochs/cpu/cpu.h index d5c6d5656..289db5640 100644 --- a/bochs/cpu/cpu.h +++ b/bochs/cpu/cpu.h @@ -1,5 +1,5 @@ ///////////////////////////////////////////////////////////////////////// -// $Id: cpu.h,v 1.222 2005-06-16 17:24:50 sshwarts Exp $ +// $Id: cpu.h,v 1.223 2005-07-04 17:44:08 sshwarts Exp $ ///////////////////////////////////////////////////////////////////////// // // Copyright (C) 2001 MandrakeSoft S.A. @@ -2660,6 +2660,22 @@ public: // for now... 
BX_SMF void branch_near64(bxInstruction_c *i) BX_CPP_AttrRegparmN(1); #endif +#if BX_SupportRepeatSpeedups + BX_SMF Bit32u FastRepMOVSB(bxInstruction_c *i, unsigned srcSeg, bx_address srcOff, + unsigned dstSeg, bx_address dstOff, Bit32u count); + BX_SMF Bit32u FastRepMOVSW(bxInstruction_c *i, unsigned srcSeg, bx_address srcOff, + unsigned dstSeg, bx_address dstOff, Bit32u count); + BX_SMF Bit32u FastRepMOVSD(bxInstruction_c *i, unsigned srcSeg, bx_address srcOff, + unsigned dstSeg, bx_address dstOff, Bit32u count); + + BX_SMF Bit32u FastRepSTOSB(bxInstruction_c *i, unsigned dstSeg, bx_address dstOff, + Bit8u val, Bit32u count); + BX_SMF Bit32u FastRepSTOSW(bxInstruction_c *i, unsigned dstSeg, bx_address dstOff, + Bit16u val, Bit32u count); + BX_SMF Bit32u FastRepSTOSD(bxInstruction_c *i, unsigned dstSeg, bx_address dstOff, + Bit32u val, Bit32u count); +#endif + BX_SMF void access_linear(bx_address address, unsigned length, unsigned pl, unsigned rw, void *data) BX_CPP_AttrRegparmN(3); BX_SMF Bit32u translate_linear(bx_address laddr, diff --git a/bochs/cpu/string.cc b/bochs/cpu/string.cc index e1f8d593a..27fffa561 100644 --- a/bochs/cpu/string.cc +++ b/bochs/cpu/string.cc @@ -1,5 +1,5 @@ ///////////////////////////////////////////////////////////////////////// -// $Id: string.cc,v 1.29 2005-06-21 17:01:21 sshwarts Exp $ +// $Id: string.cc,v 1.30 2005-07-04 17:44:08 sshwarts Exp $ ///////////////////////////////////////////////////////////////////////// // // Copyright (C) 2001 MandrakeSoft S.A. 
@@ -38,10 +38,710 @@ #endif +#if BX_SupportRepeatSpeedups +Bit32u BX_CPU_C::FastRepMOVSB(bxInstruction_c *i, unsigned srcSeg, bx_address srcOff, unsigned dstSeg, bx_address dstOff, Bit32u count) +{ + Bit32u bytesFitSrc, bytesFitDst; + signed int pointerDelta; + bx_address laddrDst, laddrSrc; + Bit32u paddrDst, paddrSrc; + + bx_segment_reg_t *srcSegPtr = &BX_CPU_THIS_PTR sregs[srcSeg]; + bx_segment_reg_t *dstSegPtr = &BX_CPU_THIS_PTR sregs[dstSeg]; + + // Do segment checks for the 1st byte. We do not want to + // trip an exception beyond this, because the address would + // be incorrect. After we know how many bytes we will directly + // transfer, we can do the full segment limit check ourselves + // without generating an exception. + read_virtual_checks(srcSegPtr, srcOff, 1); + laddrSrc = BX_CPU_THIS_PTR get_segment_base(srcSeg) + srcOff; + if (BX_CPU_THIS_PTR cr0.pg) { + paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ); + } + else { + paddrSrc = laddrSrc; + } + + // If we want to write directly into the physical memory array, + // we need the A20 address. + paddrSrc = A20ADDR(paddrSrc); + Bit8u *hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, + paddrSrc, BX_READ); + + if (! hostAddrSrc) return 0; + + write_virtual_checks(dstSegPtr, dstOff, 1); + laddrDst = BX_CPU_THIS_PTR get_segment_base(dstSeg) + dstOff; + if (BX_CPU_THIS_PTR cr0.pg) { + paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); + } + else { + paddrDst = laddrDst; + } + + // If we want to write directly into the physical memory array, + // we need the A20 address. + paddrDst = A20ADDR(paddrDst); + Bit8u *hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! hostAddrDst) return 0; + + // See how many bytes can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. 
+ bytesFitSrc = 1 + (paddrSrc & 0xfff); + bytesFitDst = 1 + (paddrDst & 0xfff); + pointerDelta = (signed int) -1; + } + else { + // Counting upward. + bytesFitSrc = (0x1000 - (paddrSrc & 0xfff)); + bytesFitDst = (0x1000 - (paddrDst & 0xfff)); + pointerDelta = (signed int) 1; + } + + // Restrict word count to the number that will fit in either + // source or dest pages. + if (count > bytesFitSrc) + count = bytesFitSrc; + if (count > bytesFitDst) + count = bytesFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). + if ( !(srcSegPtr->cache.valid & SegAccessROK) || + !(dstSegPtr->cache.valid & SegAccessWOK) ) + { + return 0; + } + + if ( !IsLongMode() ) + { + Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using si/di + // rolling over 16-bit boundaries. + if (srcSegLimit > 0xffff) + srcSegLimit = 0xffff; + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + Bit32u minOffset = (count-1); + if ( srcOff < minOffset ) + return 0; + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. 
+ Bit32u srcMaxOffset = (srcSegLimit - count) + 1; + Bit32u dstMaxOffset = (dstSegLimit - count) + 1; + if ( srcOff > srcMaxOffset ) + return 0; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jgetHostMemAddr(BX_CPU_THIS, + paddrSrc, BX_READ); + + if (! hostAddrSrc) return 0; + + write_virtual_checks(dstSegPtr, dstOff, 2); + laddrDst = BX_CPU_THIS_PTR get_segment_base(dstSeg) + dstOff; + if (BX_CPU_THIS_PTR cr0.pg) { + paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); + } + else { + paddrDst = laddrDst; + } + + // If we want to write directly into the physical memory array, + // we need the A20 address. + paddrDst = A20ADDR(paddrDst); + Bit8u *hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! hostAddrDst) return 0; + + // See how many words can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + // Note: 1st word must not cross page boundary. + if ( ((paddrSrc & 0xfff) > 0xffe) || ((paddrDst & 0xfff) > 0xffe) ) + return 0; + wordsFitSrc = (2 + (paddrSrc & 0xfff)) >> 1; + wordsFitDst = (2 + (paddrDst & 0xfff)) >> 1; + pointerDelta = (signed int) -2; + } + else { + // Counting upward. + wordsFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 1; + wordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 1; + pointerDelta = (signed int) 2; + } + + // Restrict word count to the number that will fit in either + // source or dest pages. + if (count > wordsFitSrc) + count = wordsFitSrc; + if (count > wordsFitDst) + count = wordsFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. 
If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). + if ( !(srcSegPtr->cache.valid & SegAccessROK) || + !(dstSegPtr->cache.valid & SegAccessWOK) ) + { + return 0; + } + + if ( !IsLongMode() ) + { + Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using si/di + // rolling over 16-bit boundaries. + if (srcSegLimit > 0xffff) + srcSegLimit = 0xffff; + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + Bit32u minOffset = (count-1) << 1; + if ( srcOff < minOffset ) + return 0; + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. + Bit32u srcMaxOffset = (srcSegLimit - (count<<1)) + 1; + Bit32u dstMaxOffset = (dstSegLimit - (count<<1)) + 1; + if ( srcOff > srcMaxOffset ) + return 0; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jgetHostMemAddr(BX_CPU_THIS, + paddrSrc, BX_READ); + + if (! hostAddrSrc) return 0; + + write_virtual_checks(dstSegPtr, dstOff, 4); + laddrDst = BX_CPU_THIS_PTR get_segment_base(dstSeg) + dstOff; + if (BX_CPU_THIS_PTR cr0.pg) { + paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); + } + else { + paddrDst = laddrDst; + } + + // If we want to write directly into the physical memory array, + // we need the A20 address. + paddrDst = A20ADDR(paddrDst); + Bit8u *hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! 
hostAddrDst) return 0; + + // See how many dwords can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + // Note: 1st dword must not cross page boundary. + if ( ((paddrSrc & 0xfff) > 0xffc) || ((paddrDst & 0xfff) > 0xffc) ) + return 0; + dwordsFitSrc = (4 + (paddrSrc & 0xfff)) >> 2; + dwordsFitDst = (4 + (paddrDst & 0xfff)) >> 2; + pointerDelta = (signed int) -4; + } + else { + // Counting upward. + dwordsFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 2; + dwordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 2; + pointerDelta = (signed int) 4; + } + + // Restrict dword count to the number that will fit in either + // source or dest pages. + if (count > dwordsFitSrc) + count = dwordsFitSrc; + if (count > dwordsFitDst) + count = dwordsFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). + if ( !(srcSegPtr->cache.valid & SegAccessROK) || + !(dstSegPtr->cache.valid & SegAccessWOK) ) + { + return 0; + } + + if ( !IsLongMode() ) + { + Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using si/di + // rolling over 16-bit boundaries. 
+ if (srcSegLimit > 0xffff) + srcSegLimit = 0xffff; + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + Bit32u minOffset = (count-1) << 2; + if ( srcOff < minOffset ) + return 0; + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. + Bit32u srcMaxOffset = (srcSegLimit - (count<<2)) + 1; + Bit32u dstMaxOffset = (dstSegLimit - (count<<2)) + 1; + if ( srcOff > srcMaxOffset ) + return 0; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jgetHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! hostAddrDst) return 0; + + // See how many bytes can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + bytesFitDst = 1 + (paddrDst & 0xfff); + pointerDelta = (signed int) -1; + } + else { + // Counting upward. + bytesFitDst = (0x1000 - (paddrDst & 0xfff)); + pointerDelta = (signed int) 1; + } + + // Restrict word count to the number that will fit in either + // source or dest pages. + if (count > bytesFitDst) + count = bytesFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). + if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) return 0; + + if ( !IsLongMode() ) + { + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! 
i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using di + // rolling over 16-bit boundaries. + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + Bit32u minOffset = (count-1); + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. + Bit32u dstMaxOffset = (dstSegLimit - count) + 1; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jgetHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! hostAddrDst) return 0; + + // See how many words can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + // Note: 1st word must not cross page boundary. + if ((paddrDst & 0xfff) > 0xffe) return 0; + wordsFitDst = (2 + (paddrDst & 0xfff)) >> 1; + pointerDelta = (signed int) -2; + } + else { + // Counting upward. + wordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 1; + pointerDelta = (signed int) 2; + } + + // Restrict word count to the number that will fit in either + // source or dest pages. + if (count > wordsFitDst) + count = wordsFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). 
+ if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) return 0; + + if ( !IsLongMode() ) + { + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using di + // rolling over 16-bit boundaries. + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + Bit32u minOffset = (count-1) << 1; + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. + Bit32u dstMaxOffset = (dstSegLimit - (count<<1)) + 1; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jgetHostMemAddr(BX_CPU_THIS, + paddrDst, BX_WRITE); + + if (! hostAddrDst) return 0; + + // See how many dwords can fit in the rest of this page. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + // Note: 1st dword must not cross page boundary. + if ((paddrDst & 0xfff) > 0xffc) return 0; + dwordsFitDst = (4 + (paddrDst & 0xfff)) >> 2; + pointerDelta = (signed int) -4; + } + else { + // Counting upward. + dwordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 2; + pointerDelta = (signed int) 4; + } + + // Restrict dword count to the number that will fit in either + // source or dest pages. + if (count > dwordsFitDst) + count = dwordsFitDst; + if (count > bx_pc_system.getNumCpuTicksLeftNextEvent()) + count = bx_pc_system.getNumCpuTicksLeftNextEvent(); + + // If after all the restrictions, there is anything left to do... + if (count) { + // Before we copy memory, we need to make sure that the segments + // allow the accesses up to the given source and dest offset. 
If + // the cache.valid bits have SegAccessWOK and ROK, we know that + // the cache is valid for those operations, and that the segments + // are non expand-down (thus we can make a simple limit check). + if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) return 0; + + if ( !IsLongMode() ) + { + Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; + + if (! i->as32L()) { + // For 16-bit addressing mode, clamp the segment limits to 16bits + // so we don't have to worry about computations using di + // rolling over 16-bit boundaries. + if (dstSegLimit > 0xffff) + dstSegLimit = 0xffff; + } + + // Now make sure transfer will fit within the constraints of the + // segment boundaries, 0..limit for non expand-down. We know + // count >= 1 here. + if (BX_CPU_THIS_PTR get_DF ()) { + // Counting downward. + Bit32u minOffset = (count-1) << 2; + if ( dstOff < minOffset ) + return 0; + } + else { + // Counting upward. + Bit32u dstMaxOffset = (dstSegLimit - (count<<2)) + 1; + if ( dstOff > dstMaxOffset ) + return 0; + } + } + + // Transfer data directly using host addresses + for (unsigned j=0; jrepUsedL() && !BX_CPU_THIS_PTR async_event) { - Bit32u byteCount; + Bit32u byteCount = CX; + BX_ASSERT(byteCount > 0); + byteCount = FastRepMOVSB(i, seg, si, BX_SEG_REG_ES, di, byteCount); + if (byteCount) + { + // Decrement the ticks count by the number of iterations, minus + // one, since the main cpu loop will decrement one. Also, + // the count is predecremented before examined, so defintely + // don't roll it under zero. + BX_TICKN(byteCount-1); -#if BX_SUPPORT_X86_64 - if (i->as64L()) - byteCount = RCX; // Truncated to 32bits. (we're only doing 1 page) - else -#endif - if (i->as32L()) - byteCount = ECX; - else - byteCount = CX; + // Decrement eCX. Note, the main loop will decrement 1 also, so + // decrement by one less than expected, like the case above. 
+ CX -= (byteCount-1); - if (byteCount) { - Bit32u bytesFitSrc, bytesFitDst; - Bit8u *hostAddrSrc, *hostAddrDst; - signed int pointerDelta; - bx_segment_reg_t *srcSegPtr, *dstSegPtr; - bx_address laddrDst, laddrSrc; - Bit32u paddrDst, paddrSrc; - - srcSegPtr = &BX_CPU_THIS_PTR sregs[seg]; - dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SEG_REG_ES]; - - // Do segment checks for the 1st word. We do not want to - // trip an exception beyond this, because the address would - // be incorrect. After we know how many bytes we will directly - // transfer, we can do the full segment limit check ourselves - // without generating an exception. - read_virtual_checks(srcSegPtr, si, 1); - laddrSrc = BX_CPU_THIS_PTR get_segment_base(seg) + si; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ); - } - else { - paddrSrc = laddrSrc; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrSrc = A20ADDR(paddrSrc); - - write_virtual_checks(dstSegPtr, di, 1); - laddrDst = BX_CPU_THIS_PTR get_segment_base(BX_SEG_REG_ES) + di; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); - } - else { - paddrDst = laddrDst; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrDst = A20ADDR(paddrDst); - - hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrSrc, BX_READ); - hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrDst, BX_WRITE); - - if ( hostAddrSrc && hostAddrDst ) - { - // See how many bytes can fit in the rest of this page. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - bytesFitSrc = 1 + (paddrSrc & 0xfff); - bytesFitDst = 1 + (paddrDst & 0xfff); - pointerDelta = (signed int) -1; - } - else { - // Counting upward. 
- bytesFitSrc = (0x1000 - (paddrSrc & 0xfff)); - bytesFitDst = (0x1000 - (paddrDst & 0xfff)); - pointerDelta = (signed int) 1; - } - // Restrict count to the number that will fit in either - // source or dest pages. - if (byteCount > bytesFitSrc) - byteCount = bytesFitSrc; - if (byteCount > bytesFitDst) - byteCount = bytesFitDst; - if (byteCount > bx_pc_system.getNumCpuTicksLeftNextEvent()) - byteCount = bx_pc_system.getNumCpuTicksLeftNextEvent(); - - // If after all the restrictions, there is anything left to do... - if (byteCount) { - Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; - Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; - - // For 16-bit addressing mode, clamp the segment limits to 16bits - // so we don't have to worry about computations using si/di - // rolling over 16-bit boundaries. - if (!i->as32L()) { - if (srcSegLimit > 0xffff) - srcSegLimit = 0xffff; - if (dstSegLimit > 0xffff) - dstSegLimit = 0xffff; - } - - // Before we copy memory, we need to make sure that the segments - // allow the accesses up to the given source and dest offset. If - // the cache.valid bits have SegAccessWOK and ROK, we know that - // the cache is valid for those operations, and that the segments - // are non expand-down (thus we can make a simple limit check). - if ( !(srcSegPtr->cache.valid & SegAccessROK) || - !(dstSegPtr->cache.valid & SegAccessWOK) ) - { - goto noAcceleration16; - } - - // Now make sure transfer will fit within the constraints of the - // segment boundaries, 0..limit for non expand-down. We know - // byteCount >= 1 here. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - Bit32u minOffset = (byteCount-1); - if ( si < minOffset ) - goto noAcceleration16; - if ( di < minOffset ) - goto noAcceleration16; - } - else { - // Counting upward. 
- Bit32u srcMaxOffset = (srcSegLimit - byteCount) + 1; - Bit32u dstMaxOffset = (dstSegLimit - byteCount) + 1; - if ( si > srcMaxOffset ) - goto noAcceleration16; - if ( di > dstMaxOffset ) - goto noAcceleration16; - } - - // Transfer data directly using host addresses. - for (unsigned j=0; jas64L()) - RCX -= (byteCount-1); - else -#endif - if (i->as32L()) - ECX -= (byteCount-1); - else - CX -= (byteCount-1); - incr = byteCount; - goto doIncr16; - } - } + incr = byteCount; + goto doIncr16; } } - -noAcceleration16: - #endif // (BX_DEBUGGER == 0) #endif // BX_SupportRepeatSpeedups read_virtual_byte(seg, si, &temp8); - write_virtual_byte(BX_SEG_REG_ES, di, &temp8); - incr = 1; #if BX_SupportRepeatSpeedups #if (BX_DEBUGGER == 0) @@ -369,7 +924,7 @@ void BX_CPU_C::MOVSW_XwYw(bxInstruction_c *i) } else { /* 16bit address mode */ - unsigned incr; + unsigned incr = 2; Bit16u si = SI; Bit16u di = DI; @@ -382,155 +937,29 @@ void BX_CPU_C::MOVSW_XwYw(bxInstruction_c *i) if (i->repUsedL() && !BX_CPU_THIS_PTR async_event) { Bit32u wordCount = CX; + BX_ASSERT(wordCount > 0); + wordCount = FastRepMOVSW(i, seg, si, BX_SEG_REG_ES, di, wordCount); + if (wordCount) + { + // Decrement the ticks count by the number of iterations, minus + // one, since the main cpu loop will decrement one. Also, + // the count is predecremented before examined, so defintely + // don't roll it under zero. + BX_TICKN(wordCount-1); - if (wordCount) { - Bit32u wordsFitSrc, wordsFitDst; - Bit8u *hostAddrSrc, *hostAddrDst; - signed int pointerDelta; - bx_segment_reg_t *srcSegPtr, *dstSegPtr; - bx_address laddrDst, laddrSrc; - Bit32u paddrDst, paddrSrc; + // Decrement eCX. Note, the main loop will decrement 1 also, so + // decrement by one less than expected, like the case above. + CX -= (wordCount-1); - srcSegPtr = &BX_CPU_THIS_PTR sregs[seg]; - dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SEG_REG_ES]; - - // Do segment checks for the 1st word. 
We do not want to - // trip an exception beyond this, because the address would - // be incorrect. After we know how many bytes we will directly - // transfer, we can do the full segment limit check ourselves - // without generating an exception. - read_virtual_checks(srcSegPtr, si, 2); - laddrSrc = BX_CPU_THIS_PTR get_segment_base(seg) + si; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ); - } - else { - paddrSrc = laddrSrc; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrSrc = A20ADDR(paddrSrc); - - write_virtual_checks(dstSegPtr, di, 2); - laddrDst = BX_CPU_THIS_PTR get_segment_base(BX_SEG_REG_ES) + di; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); - } - else { - paddrDst = laddrDst; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrDst = A20ADDR(paddrDst); - - hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrSrc, BX_READ); - hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrDst, BX_WRITE); - - if ( hostAddrSrc && hostAddrDst ) { - // See how many words can fit in the rest of this page. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - // Note: 1st word must not cross page boundary. - if ( ((paddrSrc & 0xfff) > 0xffe) || - ((paddrDst & 0xfff) > 0xffe) ) - goto noAcceleration16; - wordsFitSrc = (2 + (paddrSrc & 0xfff)) >> 1; - wordsFitDst = (2 + (paddrDst & 0xfff)) >> 1; - pointerDelta = (signed int) -2; - } - else { - // Counting upward. - wordsFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 1; - wordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 1; - pointerDelta = (signed int) 2; - } - // Restrict word count to the number that will fit in either - // source or dest pages. 
- if (wordCount > wordsFitSrc) - wordCount = wordsFitSrc; - if (wordCount > wordsFitDst) - wordCount = wordsFitDst; - if (wordCount > bx_pc_system.getNumCpuTicksLeftNextEvent()) - wordCount = bx_pc_system.getNumCpuTicksLeftNextEvent(); - - // If after all the restrictions, there is anything left to do... - if (wordCount) { - Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; - Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; - - // For 16-bit addressing mode, clamp the segment limits to 16bits - // so we don't have to worry about computations using si/di - // rolling over 16-bit boundaries. - if (srcSegLimit > 0xffff) - srcSegLimit = 0xffff; - if (dstSegLimit > 0xffff) - dstSegLimit = 0xffff; - - // Before we copy memory, we need to make sure that the segments - // allow the accesses up to the given source and dest offset. If - // the cache.valid bits have SegAccessWOK and ROK, we know that - // the cache is valid for those operations, and that the segments - // are non expand-down (thus we can make a simple limit check). - if ( !(srcSegPtr->cache.valid & SegAccessROK) || - !(dstSegPtr->cache.valid & SegAccessWOK) ) - { - goto noAcceleration16; - } - - // Now make sure transfer will fit within the constraints of the - // segment boundaries, 0..limit for non expand-down. We know - // wordCount >= 1 here. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - Bit32u minOffset = (wordCount-1) << 1; - if ( si < minOffset ) - goto noAcceleration16; - if ( di < minOffset ) - goto noAcceleration16; - } - else { - // Counting upward. - Bit32u srcMaxOffset = (srcSegLimit - (wordCount<<1)) + 1; - Bit32u dstMaxOffset = (dstSegLimit - (wordCount<<1)) + 1; - if ( si > srcMaxOffset ) - goto noAcceleration16; - if ( di > dstMaxOffset ) - goto noAcceleration16; - } - - // Transfer data directly using host addresses. 
- for (unsigned j=0; jas32L()) { - unsigned incr; + unsigned incr = 4; Bit32u esi = ESI; Bit32u edi = EDI; @@ -604,150 +1033,29 @@ void BX_CPU_C::MOVSD_XdYd(bxInstruction_c *i) if (i->repUsedL() && !BX_CPU_THIS_PTR async_event) { Bit32u dwordCount = ECX; - + BX_ASSERT(dwordCount > 0); + dwordCount = FastRepMOVSD(i, seg, esi, BX_SEG_REG_ES, edi, dwordCount); if (dwordCount) { - Bit32u dwordsFitSrc, dwordsFitDst; - Bit8u *hostAddrSrc, *hostAddrDst; - signed int pointerDelta; - bx_segment_reg_t *srcSegPtr, *dstSegPtr; - bx_address laddrDst, laddrSrc; - Bit32u paddrDst, paddrSrc; + // Decrement the ticks count by the number of iterations, minus + // one, since the main cpu loop will decrement one. Also, + // the count is predecremented before examined, so defintely + // don't roll it under zero. + BX_TICKN(dwordCount-1); - srcSegPtr = &BX_CPU_THIS_PTR sregs[seg]; - dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SEG_REG_ES]; + // Decrement eCX. Note, the main loop will decrement 1 also, so + // decrement by one less than expected, like the case above. + ECX -= (dwordCount-1); - // Do segment checks for the 1st word. We do not want to - // trip an exception beyond this, because the address would - // be incorrect. After we know how many bytes we will directly - // transfer, we can do the full segment limit check ourselves - // without generating an exception. - read_virtual_checks(srcSegPtr, esi, 4); - laddrSrc = BX_CPU_THIS_PTR get_segment_base(seg) + esi; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ); - } - else { - paddrSrc = laddrSrc; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. 
- paddrSrc = A20ADDR(paddrSrc); - - write_virtual_checks(dstSegPtr, edi, 4); - laddrDst = BX_CPU_THIS_PTR get_segment_base(BX_SEG_REG_ES) + edi; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); - } - else { - paddrDst = laddrDst; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrDst = A20ADDR(paddrDst); - - hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrSrc, BX_READ); - hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrDst, BX_WRITE); - - if ( hostAddrSrc && hostAddrDst ) - { - // See how many dwords can fit in the rest of this page. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - // Note: 1st dword must not cross page boundary. - if ( ((paddrSrc & 0xfff) > 0xffc) || - ((paddrDst & 0xfff) > 0xffc) ) - goto noAcceleration32; - dwordsFitSrc = (4 + (paddrSrc & 0xfff)) >> 2; - dwordsFitDst = (4 + (paddrDst & 0xfff)) >> 2; - pointerDelta = (signed int) -4; - } - else { - // Counting upward. - dwordsFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 2; - dwordsFitDst = (0x1000 - (paddrDst & 0xfff)) >> 2; - pointerDelta = (signed int) 4; - } - // Restrict dword count to the number that will fit in either - // source or dest pages. - if (dwordCount > dwordsFitSrc) - dwordCount = dwordsFitSrc; - if (dwordCount > dwordsFitDst) - dwordCount = dwordsFitDst; - if (dwordCount > bx_pc_system.getNumCpuTicksLeftNextEvent()) - dwordCount = bx_pc_system.getNumCpuTicksLeftNextEvent(); - - // If after all the restrictions, there is anything left to do... - if (dwordCount) { - Bit32u srcSegLimit = srcSegPtr->cache.u.segment.limit_scaled; - Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; - - // Before we copy memory, we need to make sure that the segments - // allow the accesses up to the given source and dest offset. 
If - // the cache.valid bits have SegAccessWOK and ROK, we know that - // the cache is valid for those operations, and that the segments - // are non expand-down (thus we can make a simple limit check). - if ( !(srcSegPtr->cache.valid & SegAccessROK) || - !(dstSegPtr->cache.valid & SegAccessWOK) ) - { - goto noAcceleration32; - } - if ( !IsLongMode() ) { - // Now make sure transfer will fit within the constraints of the - // segment boundaries, 0..limit for non expand-down. We know - // dwordCount >= 1 here. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - Bit32u minOffset = (dwordCount-1) << 2; - if ( esi < minOffset ) - goto noAcceleration32; - if ( edi < minOffset ) - goto noAcceleration32; - } - else { - // Counting upward. - Bit32u srcMaxOffset = (srcSegLimit - (dwordCount<<2)) + 1; - Bit32u dstMaxOffset = (dstSegLimit - (dwordCount<<2)) + 1; - if ( esi > srcMaxOffset ) - goto noAcceleration32; - if ( edi > dstMaxOffset ) - goto noAcceleration32; - } - } - - // Transfer data directly using host addresses. - for (unsigned j=0; jas32L()) { @@ -1473,140 +1781,36 @@ void BX_CPU_C::STOSB_YbAL(bxInstruction_c *i) { Bit32u byteCount; -#if BX_SUPPORT_X86_64 - if (i->as64L()) - byteCount = RCX; // Truncated to 32bits. (we're only doing 1 page) - else -#endif if (i->as32L()) byteCount = ECX; else byteCount = CX; - if (byteCount) { - Bit32u bytesFitDst; - Bit8u *hostAddrDst; - signed int pointerDelta; - bx_segment_reg_t *dstSegPtr; - bx_address laddrDst; - Bit32u paddrDst; + BX_ASSERT(byteCount); + byteCount = FastRepSTOSB(i, BX_SEG_REG_ES, edi, al, byteCount); + if (byteCount) + { + // Decrement the ticks count by the number of iterations, minus + // one, since the main cpu loop will decrement one. Also, + // the count is predecremented before examined, so defintely + // don't roll it under zero. + BX_TICKN(byteCount-1); - dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SEG_REG_ES]; + // Decrement eCX. 
Note, the main loop will decrement 1 also, so + // decrement by one less than expected, like the case above. + if (i->as32L()) + ECX -= (byteCount-1); + else + CX -= (byteCount-1); - // Do segment checks for the 1st word. We do not want to - // trip an exception beyond this, because the address would - // be incorrect. After we know how many bytes we will directly - // transfer, we can do the full segment limit check ourselves - // without generating an exception. - write_virtual_checks(dstSegPtr, edi, 1); - laddrDst = BX_CPU_THIS_PTR get_segment_base(BX_SEG_REG_ES) + edi; - if (BX_CPU_THIS_PTR cr0.pg) { - paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE); - } - else { - paddrDst = laddrDst; - } - // If we want to write directly into the physical memory array, - // we need the A20 address. - paddrDst = A20ADDR(paddrDst); - - hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(BX_CPU_THIS, - paddrDst, BX_WRITE); - - if ( hostAddrDst ) - { - // See how many bytes can fit in the rest of this page. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - bytesFitDst = 1 + (paddrDst & 0xfff); - pointerDelta = (signed int) -1; - } - else { - // Counting upward. - bytesFitDst = (0x1000 - (paddrDst & 0xfff)); - pointerDelta = (signed int) 1; - } - // Restrict count to the number that will fit in either - // source or dest pages. - if (byteCount > bytesFitDst) - byteCount = bytesFitDst; - if (byteCount > bx_pc_system.getNumCpuTicksLeftNextEvent()) - byteCount = bx_pc_system.getNumCpuTicksLeftNextEvent(); - - // If after all the restrictions, there is anything left to do... - if (byteCount) { - Bit32u dstSegLimit = dstSegPtr->cache.u.segment.limit_scaled; - - // For 16-bit addressing mode, clamp the segment limits to 16bits - // so we don't have to worry about computations using si/di - // rolling over 16-bit boundaries. 
- if (!i->as32L()) { - if (dstSegLimit > 0xffff) - dstSegLimit = 0xffff; - } - - // Before we copy memory, we need to make sure that the segments - // allow the accesses up to the given source and dest offset. If - // the cache.valid bits have SegAccessWOK and ROK, we know that - // the cache is valid for those operations, and that the segments - // are non expand-down (thus we can make a simple limit check). - if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) { - goto noAcceleration16; - } - if ( !IsLongMode() ) { - // Now make sure transfer will fit within the constraints of the - // segment boundaries, 0..limit for non expand-down. We know - // byteCount >= 1 here. - if (BX_CPU_THIS_PTR get_DF ()) { - // Counting downward. - Bit32u minOffset = (byteCount-1); - if ( edi < minOffset ) - goto noAcceleration16; - } - else { - // Counting upward. - Bit32u dstMaxOffset = (dstSegLimit - byteCount) + 1; - if ( edi > dstMaxOffset ) - goto noAcceleration16; - } - } - - // Transfer data directly using host addresses. - for (unsigned j=0; jas64L()) - RCX -= (byteCount-1); - else -#endif - if (i->as32L()) - ECX -= (byteCount-1); - else - CX -= (byteCount-1); - incr = byteCount; - goto doIncr16; - } - } + incr = byteCount; + goto doIncr16; } } - -noAcceleration16: - #endif // (BX_DEBUGGER == 0) #endif // BX_SupportRepeatSpeedups write_virtual_byte(BX_SEG_REG_ES, edi, &al); - incr = 1; #if BX_SupportRepeatSpeedups #if (BX_DEBUGGER == 0) @@ -1625,7 +1829,7 @@ doIncr16: // zero extension of RDI RDI = edi; else - DI = edi; + DI = edi; } }