Bochs/bochs/patches/patch.optimize-rep

1024 lines
34 KiB
Plaintext
Raw Normal View History

----------------------------------------------------------------------
Patch name: patch.optimize-rep
Author: Kevin Lawton <kevinlawton2001@yahoo.com>
Date: 2002-08-29
Detailed description:
I made some more performance enhancements, this time
accelerating repeated IO instructions (to get some
better transfers out of the disk access mostly),
and repeated string moves and stores (for memcpy()
and memset() type operations). Enhanced some
variants of: INSW, OUTSW, MOVSB, MOVSW, STOSB.
These diffs depend slightly on the earlier diffs that eliminate
redundant segment type/access checks, which I sent a couple of
days ago, so you need to apply those first.
[ YOU MUST USE patch.seg-checks TOO OR THIS WON'T EVEN COMPILE. -bbd]
-Kevin
Notes:
- The speedup was roughly 7% of Win95 boot time.
The previous diffs gained an additional 3% of Win95 boot
time.
10% total so far, but hey, who's counting? :^)
- Some of the accelerations are bracketed with #if __i386__,
because of alignment and endian issues. The byte-oriented
variants are not affected by either, of course. The
bracketed variants could be extended to other hosts.
- On x86, could extend the copies even more by using
a native REP instruction since it knows about the
direction flag (eflags.DF).
- If you try to extend things further, keep in mind
overlapping buffers, and any semantic differences they
imply when the move size used differs from the size the
instruction requests.
- I moved one of the ticks fields (num_cpu_ticks_left) in
pc_system.h to the public area, since I needed to access
it from the CPU code.
Patch was created with:
cvs diff -u
Apply patch to what version:
current cvs WITH patch.seg-checks applied
Instructions:
To patch, go to main bochs directory.
Type "patch -p0 < THIS_PATCH_FILE".
----------------------------------------------------------------------
Index: pc_system.h
===================================================================
RCS file: /cvsroot/bochs/bochs/pc_system.h,v
retrieving revision 1.11
diff -u -r1.11 pc_system.h
--- pc_system.h 16 Jun 2002 15:02:27 -0000 1.11
+++ pc_system.h 29 Aug 2002 16:20:27 -0000
@@ -59,7 +59,6 @@
} timer[BX_MAX_TIMERS];
unsigned num_timers;
Bit64u num_cpu_ticks_in_period;
- Bit64u num_cpu_ticks_left;
void expire_ticks(void);
#if !defined(PROVIDE_M_IPS)
@@ -67,6 +66,7 @@
#endif
public:
+ Bit64u num_cpu_ticks_left;
Boolean HRQ; // Hold Request
//Boolean INTR; // Interrupt
Index: cpu/io.cc
===================================================================
RCS file: /cvsroot/bochs/bochs/cpu/io.cc,v
retrieving revision 1.5
diff -u -r1.5 io.cc
--- cpu/io.cc 3 Oct 2001 13:10:37 -0000 1.5
+++ cpu/io.cc 29 Aug 2002 16:20:35 -0000
@@ -36,8 +36,6 @@
-
-
void
BX_CPU_C::INSB_YbDX(BxInstruction_t *i)
{
@@ -125,6 +123,104 @@
}
}
+#if (BX_DEBUGGER == 0)
+#if (defined(__i386__) && __i386__)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u wordCount;
+ bx_segment_reg_t *dstSegPtr;
+
+ if (i->as_32)
+ wordCount = ECX;
+ else
+ wordCount = CX;
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ write_virtual_checks(dstSegPtr, edi, 2);
+ if (wordCount) {
+ Bit32u laddr, paddr, wordsCanFit;
+ Bit8u *hostAddrDst;
+
+ laddr = dstSegPtr->cache.u.segment.base + edi;
+ if (BX_CPU_THIS_PTR cr0.pg)
+ paddr = dtranslate_linear(laddr, CPL==3, BX_WRITE);
+ else
+ paddr = laddr;
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddr = A20ADDR(paddr);
+
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddr, BX_WRITE);
+
+ // Check that native host access was not vetoed for that page, and
+ // that the address is word aligned.
+ if ( hostAddrDst && ! (paddr & 1) ) {
+ // See how many words can fit in the rest of this page.
+ wordsCanFit = (0x1000 - (paddr & 0xfff)) >> 1;
+ // Restrict word count to the number that will fit in this page.
+ if (wordCount > wordsCanFit)
+ wordCount = wordsCanFit;
+
+ // If after all the restrictions, there is anything left to do...
+ if (wordCount) {
+ unsigned transferLen;
+ Bit32u roomDst;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = wordCount<<1; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) {
+ goto noAcceleration;
+ }
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
+ if ( roomDst < transferLen ) {
+ goto noAcceleration;
+ }
+
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -2;
+ else
+ pointerDelta = 2;
+ for (j=0; j<wordCount; ) {
+ Bit16u temp16;
+ temp16 = BX_INP(DX, 2);
+ * (Bit16u *) hostAddrDst = temp16;
+ hostAddrDst += pointerDelta;
+ j++;
+ BX_TICK1();
+ if ( BX_CPU_THIS_PTR async_event )
+ break;
+ }
+ wordCount = j;
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (wordCount-1);
+ else
+ CX -= (wordCount-1);
+ incr = wordCount << 1; // count * 2.
+ goto doIncr;
+ }
+ }
+ }
+ }
+#endif // __i386__
+#endif
+
+noAcceleration:
+
// Write a zero to memory, to trigger any segment or page
// faults before reading from IO port.
write_virtual_word(BX_SEG_REG_ES, edi, &value16);
@@ -136,6 +232,10 @@
incr = 2;
}
+#if (BX_DEBUGGER == 0)
+doIncr:
+#endif
+
if (i->as_32) {
if (BX_CPU_THIS_PTR eflags.df)
EDI = EDI - incr;
@@ -236,11 +336,113 @@
}
}
+#if (BX_DEBUGGER == 0)
+#if (defined(__i386__) && __i386__)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u wordCount;
+ bx_segment_reg_t *srcSegPtr;
+
+ if (i->as_32)
+ wordCount = ECX;
+ else
+ wordCount = CX;
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ read_virtual_checks(srcSegPtr, esi, 2);
+ if (wordCount) {
+ Bit32u laddr, paddr, wordsCanFit;
+ Bit8u *hostAddrSrc;
+
+ laddr = srcSegPtr->cache.u.segment.base + esi;
+ if (BX_CPU_THIS_PTR cr0.pg)
+ paddr = dtranslate_linear(laddr, CPL==3, BX_READ);
+ else
+ paddr = laddr;
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddr = A20ADDR(paddr);
+
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddr, BX_READ);
+
+ // Check that native host access was not vetoed for that page, and
+ // that the address is word aligned.
+ if ( hostAddrSrc && ! (paddr & 1) ) {
+ // See how many words can fit in the rest of this page.
+ wordsCanFit = (0x1000 - (paddr & 0xfff)) >> 1;
+ // Restrict word count to the number that will fit in this page.
+ if (wordCount > wordsCanFit)
+ wordCount = wordsCanFit;
+
+ // If after all the restrictions, there is anything left to do...
+ if (wordCount) {
+ unsigned transferLen;
+ Bit32u roomSrc;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = wordCount<<1; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ) {
+ goto noAcceleration;
+ }
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - esi) + 1;
+ if ( roomSrc < transferLen ) {
+ goto noAcceleration;
+ }
+
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -2;
+ else
+ pointerDelta = 2;
+ for (j=0; j<wordCount; ) {
+ Bit16u temp16;
+ temp16 = * (Bit16u *) hostAddrSrc;
+ hostAddrSrc += pointerDelta;
+ BX_OUTP(DX, temp16, 2);
+ j++;
+ BX_TICK1();
+ if ( BX_CPU_THIS_PTR async_event )
+ break;
+ }
+ wordCount = j;
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (wordCount-1);
+ else
+ CX -= (wordCount-1);
+ incr = wordCount << 1; // count * 2.
+ goto doIncr;
+ }
+ }
+ }
+ }
+#endif // __i386__
+#endif
+
+noAcceleration:
+
read_virtual_word(seg, esi, &value16);
BX_OUTP(DX, value16, 2);
incr = 2;
}
+
+#if (BX_DEBUGGER == 0)
+doIncr:
+#endif
if (i->as_32) {
if (BX_CPU_THIS_PTR eflags.df)
Index: cpu/string.cc
===================================================================
RCS file: /cvsroot/bochs/bochs/cpu/string.cc,v
retrieving revision 1.5
diff -u -r1.5 string.cc
--- cpu/string.cc 3 Oct 2001 13:10:37 -0000 1.5
+++ cpu/string.cc 29 Aug 2002 16:20:36 -0000
@@ -34,6 +34,8 @@
#define LOG_THIS BX_CPU_THIS_PTR
+
+
/* MOVSB ES:[EDI], DS:[ESI] DS may be overridden
* mov string from DS:[ESI] into ES:[EDI]
*/
@@ -80,24 +82,149 @@
else
#endif /* BX_CPU_LEVEL >= 3 */
{ /* 16 bit address mode */
+ unsigned incr;
Bit16u si, di;
si = SI;
di = DI;
+#if (BX_DEBUGGER == 0)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u byteCount;
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
+
+ if (i->as_32)
+ byteCount = ECX;
+ else
+ byteCount = CX;
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
+
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ read_virtual_checks(srcSegPtr, si, 1);
+ laddrSrc = srcSegPtr->cache.u.segment.base + si;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
+ }
+ else {
+ paddrSrc = laddrSrc;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrSrc = A20ADDR(paddrSrc);
+
+ write_virtual_checks(dstSegPtr, di, 1);
+ laddrDst = dstSegPtr->cache.u.segment.base + di;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
+ }
+ else {
+ paddrDst = laddrDst;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrDst = A20ADDR(paddrDst);
+
+ if (byteCount) {
+ Bit32u bytesCanFitSrc, bytesCanFitDst;
+ Bit8u *hostAddrSrc, *hostAddrDst;
+
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
+
+ if ( hostAddrSrc && hostAddrDst ) {
+ // See how many bytes can fit in the rest of this page.
+ bytesCanFitSrc = (0x1000 - (paddrSrc & 0xfff));
+ bytesCanFitDst = (0x1000 - (paddrDst & 0xfff));
+ // Restrict count to the number that will fit in either
+ // source or dest pages.
+ if (byteCount > bytesCanFitSrc)
+ byteCount = bytesCanFitSrc;
+ if (byteCount > bytesCanFitDst)
+ byteCount = bytesCanFitDst;
+ if (byteCount > bx_pc_system.num_cpu_ticks_left)
+ byteCount = bx_pc_system.num_cpu_ticks_left;
+
+ // If after all the restrictions, there is anything left to do...
+ if (byteCount) {
+ unsigned transferLen;
+ Bit32u roomSrc, roomDst;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = byteCount; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
+ goto noAcceleration16;
+ }
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - si) + 1;
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - di) + 1;
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
+ goto noAcceleration16;
+ }
+
+ // Transfer data directly using host addresses.
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -1;
+ else
+ pointerDelta = 1;
+ for (j=0; j<byteCount; j++) {
+ * (Bit8u *) hostAddrDst = * (Bit8u *) hostAddrSrc;
+ hostAddrDst += pointerDelta;
+ hostAddrSrc += pointerDelta;
+ }
+ // Decrement the ticks count by the number of iterations, minus
+ // one, since the main cpu loop will decrement one. Also,
+ // the count is predecremented before it is examined, so definitely
+ // don't roll it under zero.
+ bx_pc_system.num_cpu_ticks_left -= (byteCount-1);
+
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (byteCount-1);
+ else
+ CX -= (byteCount-1);
+ incr = byteCount;
+ goto doIncr16;
+ }
+ }
+ }
+ }
+#endif
+
+noAcceleration16:
+
read_virtual_byte(seg, si, &temp8);
write_virtual_byte(BX_SEG_REG_ES, di, &temp8);
+ incr = 1;
+
+doIncr16:
if (BX_CPU_THIS_PTR eflags.df) {
/* decrement SI, DI */
- si--;
- di--;
+ si -= incr;
+ di -= incr;
}
else {
/* increment SI, DI */
- si++;
- di++;
+ si += incr;
+ di += incr;
}
SI = si;
@@ -109,7 +236,7 @@
BX_CPU_C::MOVSW_XvYv(BxInstruction_t *i)
{
unsigned seg;
-
+ unsigned incr;
if (!BX_NULL_SEG_REG(i->seg)) {
seg = i->seg;
@@ -128,19 +255,146 @@
edi = EDI;
if (i->os_32) {
+
+#if (BX_DEBUGGER == 0)
+#if (defined(__i386__) && __i386__)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u dwordCount;
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
+
+ if (i->as_32)
+ dwordCount = ECX;
+ else
+ dwordCount = CX;
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
+
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ read_virtual_checks(srcSegPtr, esi, 4);
+ laddrSrc = srcSegPtr->cache.u.segment.base + esi;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
+ }
+ else {
+ paddrSrc = laddrSrc;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrSrc = A20ADDR(paddrSrc);
+
+ write_virtual_checks(dstSegPtr, edi, 4);
+ laddrDst = dstSegPtr->cache.u.segment.base + edi;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
+ }
+ else {
+ paddrDst = laddrDst;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrDst = A20ADDR(paddrDst);
+
+ if (dwordCount) {
+ Bit32u dwordsCanFitSrc, dwordsCanFitDst;
+ Bit8u *hostAddrSrc, *hostAddrDst;
+
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
+
+ if ( hostAddrSrc && hostAddrDst ) {
+ // See how many dwords can fit in the rest of this page.
+ dwordsCanFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 2;
+ dwordsCanFitDst = (0x1000 - (paddrDst & 0xfff)) >> 2;
+ // Restrict dword count to the number that will fit in either
+ // source or dest pages.
+ if (dwordCount > dwordsCanFitSrc)
+ dwordCount = dwordsCanFitSrc;
+ if (dwordCount > dwordsCanFitDst)
+ dwordCount = dwordsCanFitDst;
+ if (dwordCount > bx_pc_system.num_cpu_ticks_left)
+ dwordCount = bx_pc_system.num_cpu_ticks_left;
+
+ // If after all the restrictions, there is anything left to do...
+ if (dwordCount) {
+ unsigned transferLen;
+ Bit32u roomSrc, roomDst;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = dwordCount<<2; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
+ goto noAcceleration32;
+ }
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - esi) + 1;
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
+ goto noAcceleration32;
+ }
+
+ // Transfer data directly using host addresses.
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -4;
+ else
+ pointerDelta = 4;
+ for (j=0; j<dwordCount; j++) {
+ * (Bit32u *) hostAddrDst = * (Bit32u *) hostAddrSrc;
+ hostAddrDst += pointerDelta;
+ hostAddrSrc += pointerDelta;
+ }
+ // Decrement the ticks count by the number of iterations, minus
+ // one, since the main cpu loop will decrement one. Also,
+ // the count is predecremented before it is examined, so definitely
+ // don't roll it under zero.
+ bx_pc_system.num_cpu_ticks_left -= (dwordCount-1);
+
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (dwordCount-1);
+ else
+ CX -= (dwordCount-1);
+ incr = dwordCount << 2; // count * 4.
+ goto doIncr32;
+ }
+ }
+ }
+ }
+#endif // __i386__
+#endif
+
+noAcceleration32:
+
read_virtual_dword(seg, esi, &temp32);
write_virtual_dword(BX_SEG_REG_ES, edi, &temp32);
+ incr = 4;
+
+doIncr32:
if (BX_CPU_THIS_PTR eflags.df) {
/* decrement ESI */
- esi -= 4;
- edi -= 4;
+ esi -= incr;
+ edi -= incr;
}
else {
/* increment ESI */
- esi += 4;
- edi += 4;
+ esi += incr;
+ edi += incr;
}
} /* if (i->os_32) ... */
else { /* 16 bit opsize mode */
@@ -198,19 +452,145 @@
{ /* 16 bit opsize mode */
Bit16u temp16;
+#if (BX_DEBUGGER == 0)
+#if (defined(__i386__) && __i386__)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u wordCount;
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
+
+ if (i->as_32)
+ wordCount = ECX;
+ else
+ wordCount = CX;
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
+
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ read_virtual_checks(srcSegPtr, si, 2);
+ laddrSrc = srcSegPtr->cache.u.segment.base + si;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
+ }
+ else {
+ paddrSrc = laddrSrc;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrSrc = A20ADDR(paddrSrc);
+
+ write_virtual_checks(dstSegPtr, di, 2);
+ laddrDst = dstSegPtr->cache.u.segment.base + di;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
+ }
+ else {
+ paddrDst = laddrDst;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrDst = A20ADDR(paddrDst);
+
+ if (wordCount) {
+ Bit32u wordsCanFitSrc, wordsCanFitDst;
+ Bit8u *hostAddrSrc, *hostAddrDst;
+
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
+
+ if ( hostAddrSrc && hostAddrDst ) {
+ // See how many words can fit in the rest of this page.
+ wordsCanFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 1;
+ wordsCanFitDst = (0x1000 - (paddrDst & 0xfff)) >> 1;
+ // Restrict word count to the number that will fit in either
+ // source or dest pages.
+ if (wordCount > wordsCanFitSrc)
+ wordCount = wordsCanFitSrc;
+ if (wordCount > wordsCanFitDst)
+ wordCount = wordsCanFitDst;
+ if (wordCount > bx_pc_system.num_cpu_ticks_left)
+ wordCount = bx_pc_system.num_cpu_ticks_left;
+
+ // If after all the restrictions, there is anything left to do...
+ if (wordCount) {
+ unsigned transferLen;
+ Bit32u roomSrc, roomDst;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = wordCount<<1; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
+ goto noAcceleration16;
+ }
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - si) + 1;
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - di) + 1;
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
+ goto noAcceleration16;
+ }
+
+ // Transfer data directly using host addresses.
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -2;
+ else
+ pointerDelta = 2;
+ for (j=0; j<wordCount; j++) {
+ * (Bit16u *) hostAddrDst = * (Bit16u *) hostAddrSrc;
+ hostAddrDst += pointerDelta;
+ hostAddrSrc += pointerDelta;
+ }
+ // Decrement the ticks count by the number of iterations, minus
+ // one, since the main cpu loop will decrement one. Also,
+ // the count is predecremented before it is examined, so definitely
+ // don't roll it under zero.
+ bx_pc_system.num_cpu_ticks_left -= (wordCount-1);
+
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (wordCount-1);
+ else
+ CX -= (wordCount-1);
+ incr = wordCount << 1; // count * 2.
+ goto doIncr16;
+ }
+ }
+ }
+ }
+#endif // __i386__
+#endif
+
+noAcceleration16:
+
read_virtual_word(seg, si, &temp16);
write_virtual_word(BX_SEG_REG_ES, di, &temp16);
+ incr = 2;
+
+doIncr16:
if (BX_CPU_THIS_PTR eflags.df) {
/* decrement SI, DI */
- si -= 2;
- di -= 2;
+ si -= incr;
+ di -= incr;
}
else {
/* increment SI, DI */
- si += 2;
- di += 2;
+ si += incr;
+ di += incr;
}
}
@@ -593,48 +973,142 @@
BX_CPU_C::STOSB_YbAL(BxInstruction_t *i)
{
Bit8u al;
+ Bit32u edi;
+ unsigned incr;
#if BX_CPU_LEVEL >= 3
if (i->as_32) {
- Bit32u edi;
-
edi = EDI;
-
- al = AL;
- write_virtual_byte(BX_SEG_REG_ES, edi, &al);
-
- if (BX_CPU_THIS_PTR eflags.df) {
- /* decrement EDI */
- edi--;
- }
- else {
- /* increment EDI */
- edi++;
- }
-
- EDI = edi;
}
else
#endif /* BX_CPU_LEVEL >= 3 */
{ /* 16bit address size */
- Bit16u di;
+ edi = DI;
+ }
+ al = AL;
- di = DI;
- al = AL;
- write_virtual_byte(BX_SEG_REG_ES, di, &al);
+#if (BX_DEBUGGER == 0)
+ /* If conditions are right, we can transfer IO to physical memory
+ * in a batch, rather than one instruction at a time.
+ */
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
+ Bit32u byteCount;
+ bx_segment_reg_t *dstSegPtr;
+ Bit32u laddrDst, paddrDst;
+
+ if (i->as_32)
+ byteCount = ECX;
+ else
+ byteCount = CX;
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
+
+ // Do segment checks for the 1st word. We do not want to
+ // trip an exception beyond this, because the address would
+ // be incorrect. After we know how many bytes we will directly
+ // transfer, we can do the full segment limit check ourselves
+ // without generating an exception.
+ write_virtual_checks(dstSegPtr, edi, 1);
+ laddrDst = dstSegPtr->cache.u.segment.base + edi;
+ if (BX_CPU_THIS_PTR cr0.pg) {
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
+ }
+ else {
+ paddrDst = laddrDst;
+ }
+ // If we want to write directly into the physical memory array,
+ // we need the A20 address.
+ paddrDst = A20ADDR(paddrDst);
+
+ if (byteCount) {
+ Bit32u bytesCanFitDst;
+ Bit8u *hostAddrDst;
+
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
+
+ if ( hostAddrDst ) {
+ // See how many bytes can fit in the rest of this page.
+ bytesCanFitDst = (0x1000 - (paddrDst & 0xfff));
+ // Restrict count to the number of bytes that will fit in the
+ // dest page.
+ if (byteCount > bytesCanFitDst)
+ byteCount = bytesCanFitDst;
+ if (byteCount > bx_pc_system.num_cpu_ticks_left)
+ byteCount = bx_pc_system.num_cpu_ticks_left;
+
+ // If after all the restrictions, there is anything left to do...
+ if (byteCount) {
+ unsigned transferLen;
+ Bit32u roomDst;
+ unsigned j;
+ unsigned pointerDelta;
+
+ transferLen = byteCount; // Number bytes to transfer.
+
+ // Before we copy memory, we need to make sure that the segments
+ // allow the accesses up to the given source and dest offset. If
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
+ // the cache is valid for those operations, and that the segments
+ // are non-expand down (thus we can make a simple limit check).
+ if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) {
+ goto noAcceleration16;
+ }
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
+ if ( roomDst < transferLen ) {
+ goto noAcceleration16;
+ }
+
+ // Transfer data directly using host addresses.
+ if (BX_CPU_THIS_PTR eflags.df)
+ pointerDelta = (unsigned) -1;
+ else
+ pointerDelta = 1;
+ for (j=0; j<byteCount; j++) {
+ * (Bit8u *) hostAddrDst = al;
+ hostAddrDst += pointerDelta;
+ }
+ // Decrement the ticks count by the number of iterations, minus
+ // one, since the main cpu loop will decrement one. Also,
+ // the count is predecremented before it is examined, so definitely
+ // don't roll it under zero.
+ bx_pc_system.num_cpu_ticks_left -= (byteCount-1);
+
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
+ // decrement by one less than expected, like the case above.
+ if (i->as_32)
+ ECX -= (byteCount-1);
+ else
+ CX -= (byteCount-1);
+ incr = byteCount;
+ goto doIncr16;
+ }
+ }
+ }
+ }
+#endif
+
+noAcceleration16:
+
+ write_virtual_byte(BX_SEG_REG_ES, edi, &al);
+ incr = 1;
+
+doIncr16:
if (BX_CPU_THIS_PTR eflags.df) {
/* decrement EDI */
- di--;
+ edi -= incr;
}
else {
/* increment EDI */
- di++;
+ edi += incr;
}
- DI = di;
- }
+#if BX_CPU_LEVEL >= 3
+ if (i->as_32)
+ EDI = edi;
+ else
+#endif
+ DI = edi;
}
void
Index: memory/memory.h
===================================================================
RCS file: /cvsroot/bochs/bochs/memory/memory.h,v
retrieving revision 1.6
diff -u -r1.6 memory.h
--- memory/memory.h 3 Oct 2001 13:10:38 -0000 1.6
+++ memory/memory.h 29 Aug 2002 16:20:38 -0000
@@ -65,6 +65,7 @@
BX_MEM_SMF Boolean dbg_crc32(
unsigned long (*f)(unsigned char *buf, int len),
Bit32u addr1, Bit32u addr2, Bit32u *crc);
+ BX_MEM_SMF Bit8u * getHostMemAddr(Bit32u a20Addr, unsigned op);
};
#if BX_PROVIDE_CPU_MEMORY==1
Index: memory/misc_mem.cc
===================================================================
RCS file: /cvsroot/bochs/bochs/memory/misc_mem.cc,v
retrieving revision 1.23
diff -u -r1.23 misc_mem.cc
--- memory/misc_mem.cc 18 Aug 2002 08:53:26 -0000 1.23
+++ memory/misc_mem.cc 29 Aug 2002 16:20:39 -0000
@@ -282,3 +282,32 @@
return(1);
}
+
+ Bit8u *
+BX_MEM_C::getHostMemAddr(Bit32u a20Addr, unsigned op)
+ // Return a host pointer to the byte backing guest physical address
+ // 'a20Addr' (A20 masking must already be applied by the caller), or
+ // NULL if direct host access is not permitted for an 'op' access.
+ // Callers use the pointer to touch guest memory directly as an
+ // acceleration for a few instructions, like REP {MOV, INS, OUTS, etc}.
+ // Values of 'op' are { BX_READ, BX_WRITE, BX_RW }.
+
+ // The calling code is assumed to _only_ access memory directly
+ // within the page that encompasses the address requested.
+{
+#if BX_PCI_SUPPORT
+#error "Fix getHostMemAddr for PCI support."
+#endif
+ if ( a20Addr >= BX_MEM_THIS len )
+ return(NULL); // Error, requested addr is out of bounds.
+ if (op == BX_READ) {
+ if ( (a20Addr < 0xa0000) || (a20Addr > 0xbffff) ) // reads: only 0xa0000-0xbffff is refused
+ return( (Bit8u *) & vector[a20Addr] );
+ return(NULL); // Vetoed! Mem mapped IO (VGA)
+ }
+ else { // op == {BX_WRITE, BX_RW}; any value other than BX_READ lands here
+ if ( (a20Addr < 0xa0000) || (a20Addr > 0xfffff) ) // writes: all of 0xa0000-0xfffff is refused
+ return( (Bit8u *) & vector[a20Addr] );
+ return(NULL); // Vetoed! Mem mapped IO (VGA) and ROMs
+ }
+}