motorola_powerpc/bootloader/exception.S

0001 /*
0002  *  exception.S -- Exception handlers for early boot.
0003  *
0004  *  Copyright (C) 1998, 1999 Gabriel Paubert, paubert@iram.es
0005  *
0006  *  Modified to compile in RTEMS development environment
0007  *  by Eric Valette
0008  *
0009  *  Copyright (C) 1999 Eric Valette. valette@crf.canon.fr
0010  *
0011  *  The license and distribution terms for this file may be
0012  *  found in the file LICENSE in this distribution or at
0013  *  http://www.rtems.org/license/LICENSE.
0014  */
0015
0016 /* This is an improved version of the TLB interrupt handling code from
0017  * the 603e users manual (603eUM.pdf) downloaded from the WWW. All the
0018  * visible bugs have been removed. Note that many have survived in the errata
0019  * to the 603 user manual (603UMer.pdf).
0020  *
0021  *  This code also pays particular attention to optimization, takes into
0022  * account the differences between 603 and 603e, single/multiple processor
0023  * systems and tries to order instructions for dual dispatch in many places.
0024  *
0025  *  The optimization has been performed along two lines:
0026  * 1) to minimize the number of instruction cache lines needed for the most
0027  *    common execution paths (the ones that do not result in an exception).
0028  * 2) then to order the code to maximize the number of dual issue and
0029  *    completion opportunities without increasing the number of cache lines
0030  *    used in the same cases.
0031  *
0032  *  The last goal of this code is to fit inside the address range
0033  * assigned to the interrupt vectors: 192 instructions with fixed
0034  * entry points every 64 instructions.
0035  *
0036  *  Some typos have also been corrected and the Power l (lowercase L)
0037  * instructions replaced by lwz without comment.
0038  *
0039  *  I have attempted to describe the reasons of the order and of the choice
0040  * of the instructions but the comments may be hard to understand without
0041  * the processor manual.
0042  *
0043  *  Note that the fact that the TLBs are reloaded by software in theory
0044  * allows tremendous flexibility; for example we could avoid setting the
0045  * reference bit of a PTE which could actually not be accessed because
0046  * of a protection violation by changing a few lines of code. However,
0047  * this would significantly slow down most TLB reload operations, and
0048  * this is the reason for which we try never to make checks which would be
0049  * redundant with hardware and usually indicate a bug in a program.
0050  *
0051  *  There are some inconsistencies in the documentation concerning the
0052  * settings of SRR1 bit 15. All recent documentations say now that it is set
0053  * for stores and cleared for loads. Anyway this handler never uses this bit.
0054  *
0055  *  A final remark: the rfi instruction seems to implicitly clear the
0056  * MSR<14> (tgpr) bit. The documentation claims that this bit is restored
0057  * from SRR1 by rfi, but the corresponding bit in SRR1 is the LRU way bit.
0058  * Anyway, the only exception which can occur while TGPR is set is a machine
0059  * check which would indicate an unrecoverable problem. Recent documentation
0060  * now says in some places that rfi clears MSR<14>.
0061  *
0062  *  TLB software load for 602/603/603e/603ev:
0063  *    Specific Instructions:
0064  *      tlbld - write the dtlb with the pte in rpa reg
0065  *      tlbli - write the itlb with the pte in rpa reg
0066  *    Specific SPRs:
0067  *      dmiss - address of dstream miss
0068  *      imiss - address of istream miss
0069  *      hash1 - returns primary hash PTEG address
0070  *      hash2 - returns secondary hash PTEG address
0071  *      iCmp - returns the primary istream compare value
0072  *      dCmp - returns the primary dstream compare value
0073  *      rpa - the second word of pte used by tlblx
0074  *    Other specific resources:
0075  *      cr0 saved in 4 high order bits of SRR1,
0076  *      SRR1 bit 14 [WAY] selects TLB set to load from LRU algorithm
0077  *      gprs r0..r3 shadowed by the setting of MSR bit 14 [TGPR]
0078  *      other bits in SRR1 (unused by this handler but see earlier comments)
0079  *
0080  *    There are three basic flows corresponding to three vectors:
0081  *      0x1000: Instruction TLB miss,
0082  *      0x1100: Data TLB miss on load,
0083  *      0x1200: Data TLB miss on store or not dirty page
0084  */
0085
0086 /* Define if the code need not run on a basic 603 (tests the SRR1 key bit). */
0087 /* #define USE_KEY_BIT */
0088
0089 /* Define for safe multiprocessing (R bit is written back with a byte store). */
0090 /* #define MULTIPROCESSING */
0091
0092 /* Define for mixed endian (DAR is xor-ed with 7 when the saved LE bit is set). */
0093 /* #define CHECK_MIXED_ENDIAN */
0094
0095 /* Define if PTEs always have R set (ITLB/DTLB-load handlers skip the R update). */
0096 #define ASSUME_REF_SET
0097
0098 /* Some OS kernels may want to keep a single copy of the dirty bit in a
0099  * per-page table. In this case writable pages are always write-protected as
0100  * long as they are clean, and the dirty bit set actually means that the page
0101  * is writable. When defined, a store miss on a PTE with C==0 raises a DSI.
0102  */
0103 #define DIRTY_MEANS_WRITABLE
0104
0105 #include <rtems/asm.h>
0106 #include <rtems/score/cpu.h>
0107 #include "bootldr.h"
0108
0109 /*
0110  * Instruction TLB miss flow
0111  *   Entry at 0x1000 with the following:
0112  *     srr0 -> address of instruction that missed
0113  *     srr1 -> 0:3=cr0, 13=1 (instruction), 14=lru way, 16:31=saved MSR
0114  *     msr<tgpr> -> 1
0115  *     iMiss -> ea that missed
0116  *     iCmp -> the compare value for the va that missed
0117  *     hash1 -> pointer to first hash pteg
0118  *     hash2 -> pointer to second hash pteg
0119  *
0120  *   Register usage:
0121  *     r0 is limit address during search / scratch after
0122  *     r1 is pte data / error code for ISI exception when search fails
0123  *     r2 is pointer to pte
0124  *     r3 is compare value during search / scratch after
0125  */
0126 /* Binutils or assembler bug ? Declaring the section executable and writable
0127  * generates an error message on the @fixup entries.
0128  */
0129     .section .exception,"aw"
0130 #   .org    0x1000        # instruction TLB miss entry point
0131     .globl  tlb_handlers
0132 tlb_handlers:
0133     .type   tlb_handlers,@function
0134 #define ISIVec tlb_handlers-0x1000+0x400
0135 #define DSIVec tlb_handlers-0x1000+0x300
0136     mfspr   r2,HASH1      # r2 = pointer to primary hash PTEG
0137     lwz     r1,0(r2)      # Start memory access as soon as possible
0138     mfspr   r3,ICMP       # to load the cache.
0139 0:  la      r0,48(r2)     # Use explicit loop to avoid using ctr
0140 1:  cmpw    r1,r3         # In theory the loop is somewhat slower
0141     beq-    2f            # than documentation example
0142     cmpw    r0,r2         # but we gain from starting cache load
0143     lwzu    r1,8(r2)      # earlier and using slots between load
0144     bne+    1b            # and comparison for other purposes.
0145     cmpw    r1,r3         # Loop done: test the eighth (last) PTE
0146     bne-    4f            # Secondary hash check
0147 2:  lwz     r1,4(r2)      # Found:  load second word of PTE
0148     mfspr   r0,IMISS      # get miss address during load delay
0149 #ifdef ASSUME_REF_SET
0150     andi.   r3,r1,8       # check for guarded memory
0151     bne-    5f            # guarded: synthesize an ISI instead
0152     mtspr   PPC_RPA,r1    # set the pte
0153     mfsrr1  r3            # get the saved cr0 bits
0154     tlbli   r0            # load the itlb
0155 #else
0156 /* This is basically the original code from the manual. */
0157 #   andi.   r3,r1,8       # check for guarded memory
0158 #   bne-    5f
0159 #   andi.   r3,r1,0x100   # check R bit ahead to help folding
0160 /* However there is a better solution: these last three instructions can be
0161 replaced by the following which should cause less pipeline stalls because
0162 both tests are combined and there is a single CR rename buffer */
0163     extlwi  r3,r1,6,23    # Keep only RCWIMG in 6 most significant bits.
0164     rlwinm. r3,r3,5,0,27  # Keep only G (in sign) and R and test.
0165     blt-    5f            # Negative means guarded, zero R not set.
0166     mfsrr1  r3            # get saved cr0 bits now to dual issue
0167     ori     r1,r1,0x100   # set R in the copy handed to the TLB
0168     mtspr   PPC_RPA,r1    # set the pte
0169     tlbli   r0            # load the itlb
0170 /* Do not update PTE if R bit already set, this will save one cache line
0171 writeback at a later time, and avoid even more bus traffic in
0172 multiprocessing systems, when several processors access the same PTEGs.
0173 We also hope that the reference bit will be already set. */
0174     bne+    3f            # R was already set: skip the store
0175 #ifdef MULTIPROCESSING
0176     srwi    r1,r1,8       # get the pte byte at offset 6 (holds R)
0177     stb     r1,+6(r2)     # update page table
0178 #else
0179     sth     r1,+6(r2)     # update page table
0180 #endif
0181 #endif
0182 3:  mtcrf   0x80,r3       # restore CR0
0183     rfi                   # return to executing program
0184
0185 /* The preceding code is 20 to 25 instructions long, which occupies
0186 3 or 4 cache lines. */
0187 4:  andi.   r0,r3,0x0040  # see if we have done second hash
0188     lis     r1,0x4000     # set up error code in case next branch taken
0189     bne-    6f            # speculatively issue the following
0190     mfspr   r2,HASH2      # get the second pointer
0191     ori     r3,r3,0x0040  # change the compare value
0192     lwz     r1,0(r2)      # load first entry
0193     b       0b            # and go back to main loop
0194 /* We are now at 27 to 32 instructions, using 3 or 4 cache lines for all
0195 cases in which the TLB is successfully loaded. */
0196
0197 /* Guarded memory protection violation: synthesize an ISI exception. */
0198 5:  lis     r1,0x1000     # set srr1<3>=1 to flag guard violation
0199 /* Entry Not Found branches here with r1 correctly set. */
0200 6:  mfsrr1  r3            # r3 = saved cr0 (0:3) and saved MSR (16:31)
0201     mfmsr   r0            # r0 = current MSR, tgpr still set
0202     insrwi  r1,r3,16,16   # build srr1 for ISI exception
0203     mtsrr1  r1            # set srr1
0204 /* It seems few people have realized rlwinm can be used to clear a bit or
0205 a field of contiguous bits in a register by setting mask_begin>mask_end. */
0206     rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
0207     mtcrf   0x80, r3      # restore CR0
0208     mtmsr   r0            # flip back to the native gprs
0209     isync                 # Required from 602 doc!
0210     b       ISIVec        # go to instruction access exception
0211 /* Up to now there are 37 to 42 instructions so at least 20 could be
0212 inserted for complex cases or for statistics recording. */
0213
0214
0215 /*
0216   Data TLB miss on load flow
0217     Entry at 0x1100 with the following:
0218       srr0 -> address of instruction that caused the miss
0219       srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=0, 16:31=saved MSR
0220       msr<tgpr> -> 1
0221       dMiss -> ea that missed
0222       dCmp -> the compare value for the va that missed
0223       hash1 -> pointer to first hash pteg
0224       hash2 -> pointer to second hash pteg
0225
0226     Register usage:
0227       r0 is limit address during search / scratch after
0228       r1 is pte data / error code for DSI exception when search fails
0229       r2 is pointer to pte
0230       r3 is compare value during search / scratch after
0231 */
0232     .org    tlb_handlers+0x100
0233     mfspr   r2,HASH1      # r2 = pointer to primary hash PTEG
0234     lwz     r1,0(r2)      # Start memory access as soon as possible
0235     mfspr   r3,DCMP       # to load the cache.
0236 0:  la      r0,48(r2)     # Use explicit loop to avoid using ctr
0237 1:  cmpw    r1,r3         # In theory the loop is somewhat slower
0238     beq-    2f            # than documentation example
0239     cmpw    r0,r2         # but we gain from starting cache load
0240     lwzu    r1,8(r2)      # earlier and using slots between load
0241     bne+    1b            # and comparison for other purposes.
0242     cmpw    r1,r3         # Loop done: test the eighth (last) PTE
0243     bne-    4f            # Secondary hash check
0244 2:  lwz     r1,4(r2)      # Found:  load second word of PTE
0245     mfspr   r0,DMISS      # get miss address during load delay
0246 #ifdef ASSUME_REF_SET
0247     mtspr   PPC_RPA,r1    # set the pte
0248     mfsrr1  r3            # get the saved cr0 bits
0249     tlbld   r0            # load the dtlb
0250 #else
0251     andi.   r3,r1,0x100   # check R bit ahead to help folding
0252     mfsrr1  r3            # get saved cr0 bits now to dual issue
0253     ori     r1,r1,0x100   # set R in the copy handed to the TLB
0254     mtspr   PPC_RPA,r1    # set the pte
0255     tlbld   r0            # load the dtlb
0256 /* Do not update PTE if R bit already set, this will save one cache line
0257 writeback at a later time, and avoid even more bus traffic in
0258 multiprocessing systems, when several processors access the same PTEGs.
0259 We also hope that the reference bit will be already set. */
0260     bne+    3f            # R was already set: skip the store
0261 #ifdef MULTIPROCESSING
0262     srwi    r1,r1,8       # get the pte byte at offset 6 (holds R)
0263     stb     r1,+6(r2)     # update page table
0264 #else
0265     sth     r1,+6(r2)     # update page table
0266 #endif
0267 #endif
0268 3:  mtcrf   0x80,r3       # restore CR0
0269     rfi                   # return to executing program
0270
0271 /* The preceding code is 18 to 23 instructions long, which occupies
0272 3 cache lines. */
0273 4:  andi.   r0,r3,0x0040  # see if we have done second hash
0274     lis     r1,0x4000     # set up error code in case next branch taken
0275     bne-    9f            # not found: shared DSI exit in the store handler
0276     mfspr   r2,HASH2      # get the second pointer
0277     ori     r3,r3,0x0040  # change the compare value
0278     lwz     r1,0(r2)      # load first entry asap
0279     b       0b            # and go back to main loop
0280 /* We are now at 25 to 30 instructions, using 3 or 4 cache lines for all
0281 cases in which the TLB is successfully loaded. */
0282
0283
0284 /*
0285   Data TLB miss on store or not dirty page flow
0286     Entry at 0x1200 with the following:
0287       srr0 -> address of instruction that caused the miss
0288       srr1 -> 0:3=cr0, 13=0 (data), 14=lru way, 15=1, 16:31=saved MSR
0289       msr<tgpr> -> 1
0290       dMiss -> ea that missed
0291       dCmp -> the compare value for the va that missed
0292       hash1 -> pointer to first hash pteg
0293       hash2 -> pointer to second hash pteg
0294
0295     Register usage:
0296       r0 is limit address during search / scratch after
0297       r1 is pte data / error code for DSI exception when search fails
0298       r2 is pointer to pte
0299       r3 is compare value during search / scratch after
0300 */
0301     .org    tlb_handlers+0x200
0302     mfspr   r2,HASH1      # r2 = pointer to primary hash PTEG
0303     lwz     r1,0(r2)      # Start memory access as soon as possible
0304     mfspr   r3,DCMP       # to load the cache.
0305 0:  la      r0,48(r2)     # Use explicit loop to avoid using ctr
0306 1:  cmpw    r1,r3         # In theory the loop is somewhat slower
0307     beq-    2f            # than documentation example
0308     cmpw    r0,r2         # but we gain from starting cache load
0309     lwzu    r1,8(r2)      # earlier and using slots between load
0310     bne+    1b            # and comparison for other purposes.
0311     cmpw    r1,r3         # Loop done: test the eighth (last) PTE
0312     bne-    4f            # Secondary hash check
0313 2:  lwz     r1,4(r2)      # Found:  load second word of PTE
0314     mfspr   r0,DMISS      # get miss address during load delay
0315 /* We could simply set the C bit and then rely on hardware to flag protection
0316 violations. This raises the problem that a page which actually has not been
0317 modified may be marked as dirty and violates the OEA model for guaranteed
0318 bit settings (table 5-8 of 603eUM.pdf). This can have harmful consequences
0319 on operating system memory management routines, and play havoc with copy on
0320 write schemes. So the protection check is ABSOLUTELY necessary. */
0321     andi.   r3,r1,0x80    # check C bit
0322     beq-    5f            # if (C==0) go to check protection
0323 3:  mfsrr1  r3            # get the saved cr0 bits
0324     mtspr   PPC_RPA,r1        # set the pte
0325     tlbld   r0            # load the dtlb
0326     mtcrf   0x80,r3       # restore CR0
0327     rfi                   # return to executing program
0328 /* The preceding code is 20 instructions long, which occupies
0329 3 cache lines. */
0330 4:  andi.   r0,r3,0x0040  # see if we have done second hash
0331     lis     r1,0x4200     # set up error code in case next branch taken
0332     bne-    9f            # speculatively issue the following
0333     mfspr   r2,HASH2      # get the second pointer
0334     ori     r3,r3,0x0040  # change the compare value
0335     lwz     r1,0(r2)      # load first entry asap
0336     b       0b            # and go back to main loop
0337 /* We are now at 27 instructions, using 3 or 4 cache lines for all
0338 cases in which the TLB C bit is already set. */
0339
0340 #ifdef DIRTY_MEANS_WRITABLE
0341 5:  lis     r1,0x0A00     # protection violation on store
0342 #else
0343 /*
0344   Entry found and C==0: check protection before setting C:
0345     Register usage:
0346       r0 is dMiss register
0347       r1 is PTE entry (to be copied to RPA if success)
0348       r2 is pointer to pte
0349       r3 is trashed
0350
0351     For the 603e, the key bit in SRR1 helps to decide whether there is a
0352   protection violation. However the way the check is done in the manual is
0353   not very efficient. The code shown here works as well for 603 and 603e and
0354   is much more efficient for the 603 and comparable to the manual example
0355   for 603e. This code however has quite a bad structure due to the fact it
0356   has been reordered to speed up the most common cases.
0357 */
0358 /* The first of the following two instructions could be replaced by
0359 andi. r3,r1,3 but it would compete with cmplwi for cr0 resource. */
0360 5:  clrlwi  r3,r1,30      # Extract two low order bits
0361     cmplwi  r3,2          # Test for PP=10
0362     bne-    7f            # assume fallthrough is more frequent
0363 6:  ori     r1,r1,0x180   # set referenced and changed bit
0364     sth     r1,6(r2)      # update page table
0365     b       3b            # and finish loading TLB
0366 /* We are now at 33 instructions, using 5 cache lines. */
0367 7:  bgt-    8f            # if PP=11 then DSI protection exception
0368 /* This code only works if key bit is present (602/603e/603ev) */
0369 #ifdef USE_KEY_BIT
0370     mfsrr1  r3            # get the KEY bit and test it
0371     andis.  r3,r3,0x0008
0372     beq     6b            # default prediction taken, truly better ?
0373 #else
0374 /* This code is for all 602 and 603 family models: */
0375     mfsrr1  r3            # Here the trick is to use the MSR PR bit as a
0376     mfsrin  r0,r0         # shift count for an rlwnm. instruction which
0377     extrwi  r3,r3,1,17    # extracts and tests the correct key bit from
0378     rlwnm.  r3,r0,r3,1,1  # the segment register. RISC they said...
0379     mfspr   r0,DMISS      # Restore fault address to r0
0380     beq     6b            # if 0 load tlb else protection fault
0381 #endif
0382 /* We are now at 40 instructions, (37 if using key bit), using 5 cache
0383 lines in all cases in which the C bit is successfully set */
0384 8:  lis     r1,0x0A00     # protection violation on store
0385 #endif /* DIRTY_MEANS_WRITABLE */
0386 /* PTE entry not found branch here with DSISR code in r1 */
0387 9:  mfsrr1  r3            # r3 = saved cr0 (0:3) and saved MSR (16:31)
0388     mtdsisr r1            # DSISR = error code set up by the caller
0389     clrlwi  r2,r3,16      # set up srr1 for DSI exception
0390     mfmsr   r0            # r0 = current MSR, tgpr still set
0391 /* I have some doubts about the usefulness of the xori instruction in
0392 mixed or pure little-endian environment. The address is in the same
0393 doubleword, hence in the same protection domain and performing an exclusive
0394 or with 7 is only valid for byte accesses. */
0395 #ifdef CHECK_MIXED_ENDIAN
0396     andi.   r1,r2,1       # test LE bit ahead to help folding
0397 #endif
0398     mtsrr1  r2            # srr1 = low 16 bits of the old srr1 only
0399     rlwinm  r0,r0,0,15,13 # clear the msr<tgpr> bit
0400     mfspr   r1,DMISS      # get miss address
0401 #ifdef CHECK_MIXED_ENDIAN
0402     beq     1f            # if little endian then:
0403     xori    r1,r1,0x07    # de-mung the data address
0404 1:
0405 #endif
0406     mtdar   r1            # put in dar
0407     mtcrf   0x80,r3       # restore CR0
0408     mtmsr   r0            # flip back to the native gprs
0409     isync                 # required from 602 manual
0410     b       DSIVec        # branch to DSI exception
0411 /* We are now between 50 and 56 instructions. Close to the limit
0412 but should be sufficient in case bugs are found. */
0413 /* Altogether the three handlers occupy 128 instructions in the worst
0414 case, 64 instructions could still be added (non contiguously). */
0415     .org    tlb_handlers+0x300
0416     .globl  _handler_glue
0417 _handler_glue:
0418 /* Entry code for exceptions: DSI (0x300), ISI(0x400), alignment(0x600) and
0419  * traps(0x700). In theory it is not necessary to save and restore r13 and all
0420  * higher numbered registers, but it is done because it allowed calling the
0421  * firmware (PPCBug) for debugging in the very first stages when writing the
0422  * bootloader.
0423  */
0424     stwu    r1,-160(r1)     /* push a 160-byte frame, old r1 stored at 0(r1) */
0425     stw r0,save_r(0)        /* save_* names are not defined in this file */
0426     mflr    r0              /* keep the interrupted lr before bl destroys it */
0427     stmw    r2,save_r(2)    /* save r2 and all higher numbered registers */
0428     bl  0f                  /* only used to get the address of label 0 in lr */
0429 0:  mfctr   r4
0430     stw r0,save_lr          /* interrupted lr fetched above */
0431     mflr    r9      /* Interrupt vector + few instructions */
0432     la  r10,160(r1)         /* r10 = value r1 had before the stwu */
0433     stw r4,save_ctr
0434     mfcr    r5
0435     lwz r8,2f-0b(r9)        /* r8 = word stored at label 2 (_handler) */
0436     mfxer   r6
0437     stw r5,save_cr
0438     mtctr   r8              /* ctr = call target for bctrl below */
0439     stw r6,save_xer
0440     mfsrr0  r7
0441     stw r10,save_r(1)       /* save the pre-exception r1 */
0442     mfsrr1  r8
0443     stw r7,save_nip         /* srr0 saved as nip */
0444     la  r4,8(r1)            /* r4 = r1+8, presumably the save area passed to the handler - confirm */
0445     lwz r13,1f-0b(r9)       /* r13 = word stored at label 1 (__bd) */
0446     rlwinm  r3,r9,24,0x3f   /* Interrupt vector >> 8 */
0447     stw r8,save_msr         /* srr1 saved as msr */
0448     bctrl                   /* call the routine whose address came from label 2 */
0449
0450     lwz r7,save_msr         /* reload the saved state, loads interleaved with moves */
0451     lwz r6,save_nip
0452     mtsrr1  r7
0453     lwz r5,save_xer
0454     mtsrr0  r6
0455     lwz r4,save_ctr
0456     mtxer   r5
0457     lwz r3,save_lr
0458     mtctr   r4
0459     lwz r0,save_cr
0460     mtlr    r3
0461     lmw r2,save_r(2)        /* restore r2 and all higher numbered registers */
0462     mtcr    r0
0463     lwz r0,save_r(0)
0464     la  r1,160(r1)          /* pop the 160-byte frame */
0465     rfi                     /* resume at srr0 with msr taken from srr1 */
0466 1:  .long   (__bd)@fixup
0467 2:  .long   (_handler)@fixup
0468     .section .fixup,"aw"
0469     .align  2
0470     .long 1b, 2b            /* addresses of the two @fixup words above */
0471     .previous