1024 lines
34 KiB
Plaintext
1024 lines
34 KiB
Plaintext
----------------------------------------------------------------------
|
|
Patch name: patch.optimize-rep
|
|
Author: Kevin Lawton <kevinlawton2001@yahoo.com>
|
|
Date: 2002-08-29
|
|
|
|
Detailed description:
|
|
|
|
I made some more performance enhancements, this time
|
|
accelerating repeated IO instructions (to get some
|
|
better transfers out of the disk access mostly),
|
|
and repeated string moves and stores (for memcpy()
|
|
and memset() type operations). Enhanced some
|
|
variants of: INSW, OUTSW, MOVSB, MOVSW, STOSB.
|
|
|
|
These diffs rely slightly on the previous segment
|
|
type access check redundancy elimination diffs I
|
|
sent a couple days ago, so you need those first.
|
|
[ YOU MUST USE patch.seg-checks TOO OR THIS WON'T EVEN COMPILE. -bbd]
|
|
|
|
-Kevin
|
|
|
|
Notes:
|
|
|
|
- Increase was something like 7% of a win95 boot time.
|
|
Previous diffs were an additional 3% of win95 boot
|
|
time.
|
|
10% total so far, but hey, who's counting? :^)
|
|
- Some of the accelerations are bracketed with #if __i386__,
|
|
because of alignment and endian issues. The byte
|
|
oriented variations don't care of course. Could extend
|
|
them.
|
|
- On x86, could extend the copies even more by using
|
|
a native REP instruction since it knows about the
|
|
direction flag (eflags.DF).
|
|
- Keep in mind, overlapping buffers if you try to extend
|
|
things further, and any semantic differences those
|
|
imply for move sizes different than the instruction
|
|
requests.
|
|
- I moved one of the ticks fields in pc_system.h to
|
|
the public
|
|
area, since I needed to read it.
|
|
|
|
|
|
|
|
Patch was created with:
|
|
cvs diff -u
|
|
Apply patch to what version:
|
|
current cvs WITH patch.seg-checks applied
|
|
Instructions:
|
|
To patch, go to main bochs directory.
|
|
Type "patch -p0 < THIS_PATCH_FILE".
|
|
----------------------------------------------------------------------
|
|
Index: pc_system.h
|
|
===================================================================
|
|
RCS file: /cvsroot/bochs/bochs/pc_system.h,v
|
|
retrieving revision 1.11
|
|
diff -u -r1.11 pc_system.h
|
|
--- pc_system.h 16 Jun 2002 15:02:27 -0000 1.11
|
|
+++ pc_system.h 29 Aug 2002 16:20:27 -0000
|
|
@@ -59,7 +59,6 @@
|
|
} timer[BX_MAX_TIMERS];
|
|
unsigned num_timers;
|
|
Bit64u num_cpu_ticks_in_period;
|
|
- Bit64u num_cpu_ticks_left;
|
|
void expire_ticks(void);
|
|
|
|
#if !defined(PROVIDE_M_IPS)
|
|
@@ -67,6 +66,7 @@
|
|
#endif
|
|
|
|
public:
|
|
+ Bit64u num_cpu_ticks_left;
|
|
|
|
Boolean HRQ; // Hold Request
|
|
//Boolean INTR; // Interrupt
|
|
Index: cpu/io.cc
|
|
===================================================================
|
|
RCS file: /cvsroot/bochs/bochs/cpu/io.cc,v
|
|
retrieving revision 1.5
|
|
diff -u -r1.5 io.cc
|
|
--- cpu/io.cc 3 Oct 2001 13:10:37 -0000 1.5
|
|
+++ cpu/io.cc 29 Aug 2002 16:20:35 -0000
|
|
@@ -36,8 +36,6 @@
|
|
|
|
|
|
|
|
-
|
|
-
|
|
void
|
|
BX_CPU_C::INSB_YbDX(BxInstruction_t *i)
|
|
{
|
|
@@ -125,6 +123,104 @@
|
|
}
|
|
}
|
|
|
|
+#if (BX_DEBUGGER == 0)
|
|
+#if (defined(__i386__) && __i386__)
|
|
+ /* If conditions are right, we can transfer IO to physical memory
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u wordCount;
|
|
+ bx_segment_reg_t *dstSegPtr;
|
|
+
|
|
+ if (i->as_32)
|
|
+ wordCount = ECX;
|
|
+ else
|
|
+ wordCount = CX;
|
|
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
|
|
+ // Do segment checks for the 1st word. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ write_virtual_checks(dstSegPtr, edi, 2);
|
|
+ if (wordCount) {
|
|
+ Bit32u laddr, paddr, wordsCanFit;
|
|
+ Bit8u *hostAddrDst;
|
|
+
|
|
+ laddr = dstSegPtr->cache.u.segment.base + edi;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg)
|
|
+ paddr = dtranslate_linear(laddr, CPL==3, BX_WRITE);
|
|
+ else
|
|
+ paddr = laddr;
|
|
+ // If we want to write directly into the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddr = A20ADDR(paddr);
|
|
+
|
|
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddr, BX_WRITE);
|
|
+
|
|
+ // Check that native host access was not vetoed for that page, and
|
|
+ // that the address is word aligned.
|
|
+ if ( hostAddrDst && ! (paddr & 1) ) {
|
|
+ // See how many words can fit in the rest of this page.
|
|
+ wordsCanFit = (0x1000 - (paddr & 0xfff)) >> 1;
|
|
+ // Restrict word count to the number that will fit in this page.
|
|
+ if (wordCount > wordsCanFit)
|
|
+ wordCount = wordsCanFit;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (wordCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomDst;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = wordCount<<1; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) {
|
|
+ goto noAcceleration;
|
|
+ }
|
|
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
|
|
+ if ( roomDst < transferLen ) {
|
|
+ goto noAcceleration;
|
|
+ }
|
|
+
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -2;
|
|
+ else
|
|
+ pointerDelta = 2;
|
|
+ for (j=0; j<wordCount; ) {
|
|
+ Bit16u temp16;
|
|
+ temp16 = BX_INP(DX, 2);
|
|
+ * (Bit16u *) hostAddrDst = temp16;
|
|
+ hostAddrDst += pointerDelta;
|
|
+ j++;
|
|
+ BX_TICK1();
|
|
+ if ( BX_CPU_THIS_PTR async_event )
|
|
+ break;
|
|
+ }
|
|
+ wordCount = j;
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (wordCount-1);
|
|
+ else
|
|
+ CX -= (wordCount-1);
|
|
+ incr = wordCount << 1; // count * 2.
|
|
+ goto doIncr;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif // __i386__
|
|
+#endif
|
|
+
|
|
+noAcceleration:
|
|
+
|
|
// Write a zero to memory, to trigger any segment or page
|
|
// faults before reading from IO port.
|
|
write_virtual_word(BX_SEG_REG_ES, edi, &value16);
|
|
@@ -136,6 +232,10 @@
|
|
incr = 2;
|
|
}
|
|
|
|
+#if (BX_DEBUGGER == 0)
|
|
+doIncr:
|
|
+#endif
|
|
+
|
|
if (i->as_32) {
|
|
if (BX_CPU_THIS_PTR eflags.df)
|
|
EDI = EDI - incr;
|
|
@@ -236,11 +336,113 @@
|
|
}
|
|
}
|
|
|
|
+#if (BX_DEBUGGER == 0)
|
|
+#if (defined(__i386__) && __i386__)
|
|
+ /* If conditions are right, we can transfer physical memory to IO
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u wordCount;
|
|
+ bx_segment_reg_t *srcSegPtr;
|
|
+
|
|
+ if (i->as_32)
|
|
+ wordCount = ECX;
|
|
+ else
|
|
+ wordCount = CX;
|
|
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
|
|
+ // Do segment checks for the 1st word. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ read_virtual_checks(srcSegPtr, esi, 2);
|
|
+ if (wordCount) {
|
|
+ Bit32u laddr, paddr, wordsCanFit;
|
|
+ Bit8u *hostAddrSrc;
|
|
+
|
|
+ laddr = srcSegPtr->cache.u.segment.base + esi;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg)
|
|
+ paddr = dtranslate_linear(laddr, CPL==3, BX_READ);
|
|
+ else
|
|
+ paddr = laddr;
|
|
+ // If we want to read directly from the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddr = A20ADDR(paddr);
|
|
+
|
|
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddr, BX_READ);
|
|
+
|
|
+ // Check that native host access was not vetoed for that page, and
|
|
+ // that the address is word aligned.
|
|
+ if ( hostAddrSrc && ! (paddr & 1) ) {
|
|
+ // See how many words can fit in the rest of this page.
|
|
+ wordsCanFit = (0x1000 - (paddr & 0xfff)) >> 1;
|
|
+ // Restrict word count to the number that will fit in this page.
|
|
+ if (wordCount > wordsCanFit)
|
|
+ wordCount = wordsCanFit;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (wordCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomSrc;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = wordCount<<1; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ) {
|
|
+ goto noAcceleration;
|
|
+ }
|
|
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - esi) + 1;
|
|
+ if ( roomSrc < transferLen ) {
|
|
+ goto noAcceleration;
|
|
+ }
|
|
+
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -2;
|
|
+ else
|
|
+ pointerDelta = 2;
|
|
+ for (j=0; j<wordCount; ) {
|
|
+ Bit16u temp16;
|
|
+ temp16 = * (Bit16u *) hostAddrSrc;
|
|
+ hostAddrSrc += pointerDelta;
|
|
+ BX_OUTP(DX, temp16, 2);
|
|
+ j++;
|
|
+ BX_TICK1();
|
|
+ if ( BX_CPU_THIS_PTR async_event )
|
|
+ break;
|
|
+ }
|
|
+ wordCount = j;
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (wordCount-1);
|
|
+ else
|
|
+ CX -= (wordCount-1);
|
|
+ incr = wordCount << 1; // count * 2.
|
|
+ goto doIncr;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif // __i386__
|
|
+#endif
|
|
+
|
|
+noAcceleration:
|
|
+
|
|
read_virtual_word(seg, esi, &value16);
|
|
|
|
BX_OUTP(DX, value16, 2);
|
|
incr = 2;
|
|
}
|
|
+
|
|
+#if (BX_DEBUGGER == 0)
|
|
+doIncr:
|
|
+#endif
|
|
|
|
if (i->as_32) {
|
|
if (BX_CPU_THIS_PTR eflags.df)
|
|
Index: cpu/string.cc
|
|
===================================================================
|
|
RCS file: /cvsroot/bochs/bochs/cpu/string.cc,v
|
|
retrieving revision 1.5
|
|
diff -u -r1.5 string.cc
|
|
--- cpu/string.cc 3 Oct 2001 13:10:37 -0000 1.5
|
|
+++ cpu/string.cc 29 Aug 2002 16:20:36 -0000
|
|
@@ -34,6 +34,8 @@
|
|
#define LOG_THIS BX_CPU_THIS_PTR
|
|
|
|
|
|
+
|
|
+
|
|
/* MOVSB ES:[EDI], DS:[ESI] DS may be overridden
|
|
* mov string from DS:[ESI] into ES:[EDI]
|
|
*/
|
|
@@ -80,24 +82,149 @@
|
|
else
|
|
#endif /* BX_CPU_LEVEL >= 3 */
|
|
{ /* 16 bit address mode */
|
|
+ unsigned incr;
|
|
Bit16u si, di;
|
|
|
|
si = SI;
|
|
di = DI;
|
|
|
|
+#if (BX_DEBUGGER == 0)
|
|
+ /* If conditions are right, we can copy bytes between guest memory pages
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u byteCount;
|
|
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
|
|
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
|
|
+
|
|
+ if (i->as_32)
|
|
+ byteCount = ECX;
|
|
+ else
|
|
+ byteCount = CX;
|
|
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
|
|
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
|
|
+
|
|
+ // Do segment checks for the 1st byte. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ read_virtual_checks(srcSegPtr, si, 1);
|
|
+ laddrSrc = srcSegPtr->cache.u.segment.base + si;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
|
|
+ }
|
|
+ else {
|
|
+ paddrSrc = laddrSrc;
|
|
+ }
|
|
+ // If we want to read directly from the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrSrc = A20ADDR(paddrSrc);
|
|
+
|
|
+ write_virtual_checks(dstSegPtr, di, 1);
|
|
+ laddrDst = dstSegPtr->cache.u.segment.base + di;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
|
|
+ }
|
|
+ else {
|
|
+ paddrDst = laddrDst;
|
|
+ }
|
|
+ // If we want to write directly into the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrDst = A20ADDR(paddrDst);
|
|
+
|
|
+ if (byteCount) {
|
|
+ Bit32u bytesCanFitSrc, bytesCanFitDst;
|
|
+ Bit8u *hostAddrSrc, *hostAddrDst;
|
|
+
|
|
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
|
|
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
|
|
+
|
|
+ if ( hostAddrSrc && hostAddrDst ) {
|
|
+ // See how many bytes can fit in the rest of this page.
|
|
+ bytesCanFitSrc = (0x1000 - (paddrSrc & 0xfff));
|
|
+ bytesCanFitDst = (0x1000 - (paddrDst & 0xfff));
|
|
+ // Restrict count to the number that will fit in either
|
|
+ // source or dest pages.
|
|
+ if (byteCount > bytesCanFitSrc)
|
|
+ byteCount = bytesCanFitSrc;
|
|
+ if (byteCount > bytesCanFitDst)
|
|
+ byteCount = bytesCanFitDst;
|
|
+ if (byteCount > bx_pc_system.num_cpu_ticks_left)
|
|
+ byteCount = bx_pc_system.num_cpu_ticks_left;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (byteCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomSrc, roomDst;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = byteCount; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
|
|
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - si) + 1;
|
|
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - di) + 1;
|
|
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+
|
|
+ // Transfer data directly using host addresses.
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -1;
|
|
+ else
|
|
+ pointerDelta = 1;
|
|
+ for (j=0; j<byteCount; j++) {
|
|
+ * (Bit8u *) hostAddrDst = * (Bit8u *) hostAddrSrc;
|
|
+ hostAddrDst += pointerDelta;
|
|
+ hostAddrSrc += pointerDelta;
|
|
+ }
|
|
+ // Decrement the ticks count by the number of iterations, minus
|
|
+ // one, since the main cpu loop will decrement one. Also,
|
|
+ // the count is predecremented before examined, so definitely
|
|
+ // don't roll it under zero.
|
|
+ bx_pc_system.num_cpu_ticks_left -= (byteCount-1);
|
|
+
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (byteCount-1);
|
|
+ else
|
|
+ CX -= (byteCount-1);
|
|
+ incr = byteCount;
|
|
+ goto doIncr16;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
+noAcceleration16:
|
|
+
|
|
read_virtual_byte(seg, si, &temp8);
|
|
|
|
write_virtual_byte(BX_SEG_REG_ES, di, &temp8);
|
|
+ incr = 1;
|
|
+
|
|
+doIncr16:
|
|
|
|
if (BX_CPU_THIS_PTR eflags.df) {
|
|
/* decrement SI, DI */
|
|
- si--;
|
|
- di--;
|
|
+ si -= incr;
|
|
+ di -= incr;
|
|
}
|
|
else {
|
|
/* increment SI, DI */
|
|
- si++;
|
|
- di++;
|
|
+ si += incr;
|
|
+ di += incr;
|
|
}
|
|
|
|
SI = si;
|
|
@@ -109,7 +236,7 @@
|
|
BX_CPU_C::MOVSW_XvYv(BxInstruction_t *i)
|
|
{
|
|
unsigned seg;
|
|
-
|
|
+ unsigned incr;
|
|
|
|
if (!BX_NULL_SEG_REG(i->seg)) {
|
|
seg = i->seg;
|
|
@@ -128,19 +255,146 @@
|
|
edi = EDI;
|
|
|
|
if (i->os_32) {
|
|
+
|
|
+#if (BX_DEBUGGER == 0)
|
|
+#if (defined(__i386__) && __i386__)
|
|
+ /* If conditions are right, we can copy dwords between guest memory pages
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u dwordCount;
|
|
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
|
|
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
|
|
+
|
|
+ if (i->as_32)
|
|
+ dwordCount = ECX;
|
|
+ else
|
|
+ dwordCount = CX;
|
|
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
|
|
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
|
|
+
|
|
+ // Do segment checks for the 1st dword. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ read_virtual_checks(srcSegPtr, esi, 4);
|
|
+ laddrSrc = srcSegPtr->cache.u.segment.base + esi;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
|
|
+ }
|
|
+ else {
|
|
+ paddrSrc = laddrSrc;
|
|
+ }
|
|
+ // If we want to read directly from the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrSrc = A20ADDR(paddrSrc);
|
|
+
|
|
+ write_virtual_checks(dstSegPtr, edi, 4);
|
|
+ laddrDst = dstSegPtr->cache.u.segment.base + edi;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
|
|
+ }
|
|
+ else {
|
|
+ paddrDst = laddrDst;
|
|
+ }
|
|
+ // If we want to write directly into the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrDst = A20ADDR(paddrDst);
|
|
+
|
|
+ if (dwordCount) {
|
|
+ Bit32u dwordsCanFitSrc, dwordsCanFitDst;
|
|
+ Bit8u *hostAddrSrc, *hostAddrDst;
|
|
+
|
|
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
|
|
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
|
|
+
|
|
+ if ( hostAddrSrc && hostAddrDst ) {
|
|
+ // See how many dwords can fit in the rest of this page.
|
|
+ dwordsCanFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 2;
|
|
+ dwordsCanFitDst = (0x1000 - (paddrDst & 0xfff)) >> 2;
|
|
+ // Restrict dword count to the number that will fit in either
|
|
+ // source or dest pages.
|
|
+ if (dwordCount > dwordsCanFitSrc)
|
|
+ dwordCount = dwordsCanFitSrc;
|
|
+ if (dwordCount > dwordsCanFitDst)
|
|
+ dwordCount = dwordsCanFitDst;
|
|
+ if (dwordCount > bx_pc_system.num_cpu_ticks_left)
|
|
+ dwordCount = bx_pc_system.num_cpu_ticks_left;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (dwordCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomSrc, roomDst;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = dwordCount<<2; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
|
|
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
|
|
+ goto noAcceleration32;
|
|
+ }
|
|
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - esi) + 1;
|
|
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
|
|
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
|
|
+ goto noAcceleration32;
|
|
+ }
|
|
+
|
|
+ // Transfer data directly using host addresses.
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -4;
|
|
+ else
|
|
+ pointerDelta = 4;
|
|
+ for (j=0; j<dwordCount; j++) {
|
|
+ * (Bit32u *) hostAddrDst = * (Bit32u *) hostAddrSrc;
|
|
+ hostAddrDst += pointerDelta;
|
|
+ hostAddrSrc += pointerDelta;
|
|
+ }
|
|
+ // Decrement the ticks count by the number of iterations, minus
|
|
+ // one, since the main cpu loop will decrement one. Also,
|
|
+ // the count is predecremented before examined, so definitely
|
|
+ // don't roll it under zero.
|
|
+ bx_pc_system.num_cpu_ticks_left -= (dwordCount-1);
|
|
+
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (dwordCount-1);
|
|
+ else
|
|
+ CX -= (dwordCount-1);
|
|
+ incr = dwordCount << 2; // count * 4.
|
|
+ goto doIncr32;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif // __i386__
|
|
+#endif
|
|
+
|
|
+noAcceleration32:
|
|
+
|
|
read_virtual_dword(seg, esi, &temp32);
|
|
|
|
write_virtual_dword(BX_SEG_REG_ES, edi, &temp32);
|
|
+ incr = 4;
|
|
+
|
|
+doIncr32:
|
|
|
|
if (BX_CPU_THIS_PTR eflags.df) {
|
|
/* decrement ESI */
|
|
- esi -= 4;
|
|
- edi -= 4;
|
|
+ esi -= incr;
|
|
+ edi -= incr;
|
|
}
|
|
else {
|
|
/* increment ESI */
|
|
- esi += 4;
|
|
- edi += 4;
|
|
+ esi += incr;
|
|
+ edi += incr;
|
|
}
|
|
} /* if (i->os_32) ... */
|
|
else { /* 16 bit opsize mode */
|
|
@@ -198,19 +452,145 @@
|
|
{ /* 16 bit opsize mode */
|
|
Bit16u temp16;
|
|
|
|
+#if (BX_DEBUGGER == 0)
|
|
+#if (defined(__i386__) && __i386__)
|
|
+ /* If conditions are right, we can copy words between guest memory pages
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u wordCount;
|
|
+ bx_segment_reg_t *srcSegPtr, *dstSegPtr;
|
|
+ Bit32u laddrDst, laddrSrc, paddrDst, paddrSrc;
|
|
+
|
|
+ if (i->as_32)
|
|
+ wordCount = ECX;
|
|
+ else
|
|
+ wordCount = CX;
|
|
+ srcSegPtr = &BX_CPU_THIS_PTR sregs[seg];
|
|
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
|
|
+
|
|
+ // Do segment checks for the 1st word. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ read_virtual_checks(srcSegPtr, si, 2);
|
|
+ laddrSrc = srcSegPtr->cache.u.segment.base + si;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrSrc = dtranslate_linear(laddrSrc, CPL==3, BX_READ);
|
|
+ }
|
|
+ else {
|
|
+ paddrSrc = laddrSrc;
|
|
+ }
|
|
+ // If we want to read directly from the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrSrc = A20ADDR(paddrSrc);
|
|
+
|
|
+ write_virtual_checks(dstSegPtr, di, 2);
|
|
+ laddrDst = dstSegPtr->cache.u.segment.base + di;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
|
|
+ }
|
|
+ else {
|
|
+ paddrDst = laddrDst;
|
|
+ }
|
|
+ // If we want to write directly into the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrDst = A20ADDR(paddrDst);
|
|
+
|
|
+ if (wordCount) {
|
|
+ Bit32u wordsCanFitSrc, wordsCanFitDst;
|
|
+ Bit8u *hostAddrSrc, *hostAddrDst;
|
|
+
|
|
+ hostAddrSrc = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrSrc, BX_READ);
|
|
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
|
|
+
|
|
+ if ( hostAddrSrc && hostAddrDst ) {
|
|
+ // See how many words can fit in the rest of this page.
|
|
+ wordsCanFitSrc = (0x1000 - (paddrSrc & 0xfff)) >> 1;
|
|
+ wordsCanFitDst = (0x1000 - (paddrDst & 0xfff)) >> 1;
|
|
+ // Restrict word count to the number that will fit in either
|
|
+ // source or dest pages.
|
|
+ if (wordCount > wordsCanFitSrc)
|
|
+ wordCount = wordsCanFitSrc;
|
|
+ if (wordCount > wordsCanFitDst)
|
|
+ wordCount = wordsCanFitDst;
|
|
+ if (wordCount > bx_pc_system.num_cpu_ticks_left)
|
|
+ wordCount = bx_pc_system.num_cpu_ticks_left;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (wordCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomSrc, roomDst;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = wordCount<<1; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(srcSegPtr->cache.valid & SegAccessROK) ||
|
|
+ !(dstSegPtr->cache.valid & SegAccessWOK) ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+ roomSrc = (srcSegPtr->cache.u.segment.limit_scaled - si) + 1;
|
|
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - di) + 1;
|
|
+ if ( (roomSrc < transferLen) || (roomDst < transferLen) ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+
|
|
+ // Transfer data directly using host addresses.
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -2;
|
|
+ else
|
|
+ pointerDelta = 2;
|
|
+ for (j=0; j<wordCount; j++) {
|
|
+ * (Bit16u *) hostAddrDst = * (Bit16u *) hostAddrSrc;
|
|
+ hostAddrDst += pointerDelta;
|
|
+ hostAddrSrc += pointerDelta;
|
|
+ }
|
|
+ // Decrement the ticks count by the number of iterations, minus
|
|
+ // one, since the main cpu loop will decrement one. Also,
|
|
+ // the count is predecremented before examined, so definitely
|
|
+ // don't roll it under zero.
|
|
+ bx_pc_system.num_cpu_ticks_left -= (wordCount-1);
|
|
+
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (wordCount-1);
|
|
+ else
|
|
+ CX -= (wordCount-1);
|
|
+ incr = wordCount << 1; // count * 2.
|
|
+ goto doIncr16;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif // __i386__
|
|
+#endif
|
|
+
|
|
+noAcceleration16:
|
|
+
|
|
read_virtual_word(seg, si, &temp16);
|
|
|
|
write_virtual_word(BX_SEG_REG_ES, di, &temp16);
|
|
+ incr = 2;
|
|
+
|
|
+doIncr16:
|
|
|
|
if (BX_CPU_THIS_PTR eflags.df) {
|
|
/* decrement SI, DI */
|
|
- si -= 2;
|
|
- di -= 2;
|
|
+ si -= incr;
|
|
+ di -= incr;
|
|
}
|
|
else {
|
|
/* increment SI, DI */
|
|
- si += 2;
|
|
- di += 2;
|
|
+ si += incr;
|
|
+ di += incr;
|
|
}
|
|
}
|
|
|
|
@@ -593,48 +973,142 @@
|
|
BX_CPU_C::STOSB_YbAL(BxInstruction_t *i)
|
|
{
|
|
Bit8u al;
|
|
+ Bit32u edi;
|
|
+ unsigned incr;
|
|
|
|
#if BX_CPU_LEVEL >= 3
|
|
if (i->as_32) {
|
|
- Bit32u edi;
|
|
-
|
|
edi = EDI;
|
|
-
|
|
- al = AL;
|
|
- write_virtual_byte(BX_SEG_REG_ES, edi, &al);
|
|
-
|
|
- if (BX_CPU_THIS_PTR eflags.df) {
|
|
- /* decrement EDI */
|
|
- edi--;
|
|
- }
|
|
- else {
|
|
- /* increment EDI */
|
|
- edi++;
|
|
- }
|
|
-
|
|
- EDI = edi;
|
|
}
|
|
else
|
|
#endif /* BX_CPU_LEVEL >= 3 */
|
|
{ /* 16bit address size */
|
|
- Bit16u di;
|
|
+ edi = DI;
|
|
+ }
|
|
+ al = AL;
|
|
|
|
- di = DI;
|
|
|
|
- al = AL;
|
|
- write_virtual_byte(BX_SEG_REG_ES, di, &al);
|
|
+#if (BX_DEBUGGER == 0)
|
|
+ /* If conditions are right, we can store bytes to guest physical memory
|
|
+ * in a batch, rather than one instruction at a time.
|
|
+ */
|
|
+ if (i->rep_used && !BX_CPU_THIS_PTR async_event) {
|
|
+ Bit32u byteCount;
|
|
+ bx_segment_reg_t *dstSegPtr;
|
|
+ Bit32u laddrDst, paddrDst;
|
|
+
|
|
+ if (i->as_32)
|
|
+ byteCount = ECX;
|
|
+ else
|
|
+ byteCount = CX;
|
|
+ dstSegPtr = &BX_CPU_THIS_PTR sregs[BX_SREG_ES];
|
|
+
|
|
+ // Do segment checks for the 1st byte. We do not want to
|
|
+ // trip an exception beyond this, because the address would
|
|
+ // be incorrect. After we know how many bytes we will directly
|
|
+ // transfer, we can do the full segment limit check ourselves
|
|
+ // without generating an exception.
|
|
+ write_virtual_checks(dstSegPtr, edi, 1);
|
|
+ laddrDst = dstSegPtr->cache.u.segment.base + edi;
|
|
+ if (BX_CPU_THIS_PTR cr0.pg) {
|
|
+ paddrDst = dtranslate_linear(laddrDst, CPL==3, BX_WRITE);
|
|
+ }
|
|
+ else {
|
|
+ paddrDst = laddrDst;
|
|
+ }
|
|
+ // If we want to write directly into the physical memory array,
|
|
+ // we need the A20 address.
|
|
+ paddrDst = A20ADDR(paddrDst);
|
|
+
|
|
+ if (byteCount) {
|
|
+ Bit32u bytesCanFitDst;
|
|
+ Bit8u *hostAddrDst;
|
|
+
|
|
+ hostAddrDst = BX_CPU_THIS_PTR mem->getHostMemAddr(paddrDst, BX_WRITE);
|
|
+
|
|
+ if ( hostAddrDst ) {
|
|
+ // See how many bytes can fit in the rest of this page.
|
|
+ bytesCanFitDst = (0x1000 - (paddrDst & 0xfff));
|
|
+ // Restrict count to the number that will fit in the
|
|
+ // dest page (there is no source page for a store).
|
|
+ if (byteCount > bytesCanFitDst)
|
|
+ byteCount = bytesCanFitDst;
|
|
+ if (byteCount > bx_pc_system.num_cpu_ticks_left)
|
|
+ byteCount = bx_pc_system.num_cpu_ticks_left;
|
|
+
|
|
+ // If after all the restrictions, there is anything left to do...
|
|
+ if (byteCount) {
|
|
+ unsigned transferLen;
|
|
+ Bit32u roomDst;
|
|
+ unsigned j;
|
|
+ unsigned pointerDelta;
|
|
+
|
|
+ transferLen = byteCount; // Number bytes to transfer.
|
|
+
|
|
+ // Before we copy memory, we need to make sure that the segments
|
|
+ // allow the accesses up to the given source and dest offset. If
|
|
+ // the cache.valid bits have SegAccessWOK and ROK, we know that
|
|
+ // the cache is valid for those operations, and that the segments
|
|
+ // are non-expand down (thus we can make a simple limit check).
|
|
+ if ( !(dstSegPtr->cache.valid & SegAccessWOK) ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+ roomDst = (dstSegPtr->cache.u.segment.limit_scaled - edi) + 1;
|
|
+ if ( roomDst < transferLen ) {
|
|
+ goto noAcceleration16;
|
|
+ }
|
|
+
|
|
+ // Transfer data directly using host addresses.
|
|
+ if (BX_CPU_THIS_PTR eflags.df)
|
|
+ pointerDelta = (unsigned) -1;
|
|
+ else
|
|
+ pointerDelta = 1;
|
|
+ for (j=0; j<byteCount; j++) {
|
|
+ * (Bit8u *) hostAddrDst = al;
|
|
+ hostAddrDst += pointerDelta;
|
|
+ }
|
|
+ // Decrement the ticks count by the number of iterations, minus
|
|
+ // one, since the main cpu loop will decrement one. Also,
|
|
+ // the count is predecremented before examined, so definitely
|
|
+ // don't roll it under zero.
|
|
+ bx_pc_system.num_cpu_ticks_left -= (byteCount-1);
|
|
+
|
|
+ // Decrement eCX. Note, the main loop will decrement 1 also, so
|
|
+ // decrement by one less than expected, like the case above.
|
|
+ if (i->as_32)
|
|
+ ECX -= (byteCount-1);
|
|
+ else
|
|
+ CX -= (byteCount-1);
|
|
+ incr = byteCount;
|
|
+ goto doIncr16;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+
|
|
+noAcceleration16:
|
|
+
|
|
+ write_virtual_byte(BX_SEG_REG_ES, edi, &al);
|
|
+ incr = 1;
|
|
+
|
|
+doIncr16:
|
|
|
|
if (BX_CPU_THIS_PTR eflags.df) {
|
|
/* decrement EDI */
|
|
- di--;
|
|
+ edi -= incr;
|
|
}
|
|
else {
|
|
/* increment EDI */
|
|
- di++;
|
|
+ edi += incr;
|
|
}
|
|
|
|
- DI = di;
|
|
- }
|
|
+#if BX_CPU_LEVEL >= 3
|
|
+ if (i->as_32)
|
|
+ EDI = edi;
|
|
+ else
|
|
+#endif
|
|
+ DI = edi;
|
|
}
|
|
|
|
void
|
|
Index: memory/memory.h
|
|
===================================================================
|
|
RCS file: /cvsroot/bochs/bochs/memory/memory.h,v
|
|
retrieving revision 1.6
|
|
diff -u -r1.6 memory.h
|
|
--- memory/memory.h 3 Oct 2001 13:10:38 -0000 1.6
|
|
+++ memory/memory.h 29 Aug 2002 16:20:38 -0000
|
|
@@ -65,6 +65,7 @@
|
|
BX_MEM_SMF Boolean dbg_crc32(
|
|
unsigned long (*f)(unsigned char *buf, int len),
|
|
Bit32u addr1, Bit32u addr2, Bit32u *crc);
|
|
+ BX_MEM_SMF Bit8u * getHostMemAddr(Bit32u a20Addr, unsigned op);
|
|
};
|
|
|
|
#if BX_PROVIDE_CPU_MEMORY==1
|
|
Index: memory/misc_mem.cc
|
|
===================================================================
|
|
RCS file: /cvsroot/bochs/bochs/memory/misc_mem.cc,v
|
|
retrieving revision 1.23
|
|
diff -u -r1.23 misc_mem.cc
|
|
--- memory/misc_mem.cc 18 Aug 2002 08:53:26 -0000 1.23
|
|
+++ memory/misc_mem.cc 29 Aug 2002 16:20:39 -0000
|
|
@@ -282,3 +282,32 @@
|
|
|
|
return(1);
|
|
}
|
|
+
|
|
+ Bit8u *
|
|
+BX_MEM_C::getHostMemAddr(Bit32u a20Addr, unsigned op)
|
|
+ // Return a host address corresponding to the guest physical memory
|
|
+ // address (with A20 already applied), given that the calling
|
|
+ // code will perform an 'op' operation. This address will be
|
|
+ // used for direct access to guest memory as an acceleration by
|
|
+ // a few instructions, like REP {MOV, INS, OUTS, etc}.
|
|
+ // Values of 'op' are { BX_READ, BX_WRITE, BX_RW }.
|
|
+
|
|
+ // The other assumption is that the calling code _only_ accesses memory
|
|
+ // directly within the page that encompasses the address requested.
|
|
+{
|
|
+#if BX_PCI_SUPPORT
|
|
+#error "Fix getHostMemAddr for PCI support."
|
|
+#endif
|
|
+ if ( a20Addr >= BX_MEM_THIS len )
|
|
+ return(NULL); // Error, requested addr is out of bounds.
|
|
+ if (op == BX_READ) {
|
|
+ if ( (a20Addr < 0xa0000) || (a20Addr > 0xbffff) )
|
|
+ return( (Bit8u *) & vector[a20Addr] );
|
|
+ return(NULL); // Vetoed! Mem mapped IO (VGA)
|
|
+ }
|
|
+ else { // op == {BX_WRITE, BX_RW}
|
|
+ if ( (a20Addr < 0xa0000) || (a20Addr > 0xfffff) )
|
|
+ return( (Bit8u *) & vector[a20Addr] );
|
|
+ return(NULL); // Vetoed! Mem mapped IO (VGA) and ROMs
|
|
+ }
|
|
+}
|