#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management.  */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *     Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *     under Contract DE-AC03-76SF00515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif

    .set   v0,   0
    .set   v8,   8
    .set   v16, 16
    .set   v20, 20
    .set   v24, 24
    .set   v28, 28

    .set   r0,   0
    .set   r3,   3
    .set   r4,   4
    /* Do not use r5, since this is used by _CPU_Context_switch() */
    .set   r6,   6
    .set   r7,   7
    .set   r8,   8
    .set   r9,   9
    .set   r10, 10
    .set   r11, 11
    /* Do not use r12, since this is used by _CPU_Context_switch() */

    .set   cr5,  5

    .set   VECSIZE,    16

    .set   VRSAVE_INIT_VAL, 0
    .set   VSCR_INIT_VAL,   0

    .set   VRSAVE_OFF, 16
    .set   VSCR_OFF,   16+12
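
    /* Layout sketch of the first cache line of a save area, as
     * implied by the offsets above and by PREP_FOR_SAVE /
     * S_VSCR_VRSAVE below (bytes 0..15 and 20..27 are merely
     * zeroed, not otherwise used):
     *
     *    offset  0..15 : (zeroed)
     *    offset 16..19 : saved vrsave  (VRSAVE_OFF = 16)
     *    offset 20..27 : (zeroed)
     *    offset 28..31 : saved vscr    (VSCR_OFF = 16+12 = 28)
     *    offset 32..   : vector registers, 16 bytes each
     */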

    .set   ds0,  0

    /* Block size for dst -- in units of 16 bytes */
    .set   BSIZE,   2       /* = 32 bytes */
    .set   BCNT,    12/2+1  /* 12 non-volatile registers + area for vscr/vrsave */
    .set   BSTRIDE, 32      /*      bytes */
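
    /* Worked example (a sketch): the dst control word built from
     * these constants in _CPU_Context_switch_altivec below is
     *
     *    (BSIZE<<24) | (BCNT<<16) | BSTRIDE
     *      = (2<<24) | (7<<16) | 32 = 0x02070020
     *
     * i.e., 7 blocks of 2*16 = 32 bytes, 32 bytes apart: the six
     * cache lines holding v20..v31 plus the line holding
     * vrsave/vscr. The code there loads it as
     *
     *    lis   r6, 0x0207
     *    ori   r6, r6, 0x0020
     */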

    .data

    .global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
    .long   0

    .global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
    .long   0

    .text

    .extern _CPU_altivec_psim_cpu
    .extern _CPU_altivec_ctxt_off

    .macro  CMPOFF _B0
    lis \_B0, _CPU_altivec_ctxt_off@ha
    lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
    .endm
    /* Conditionally load or store a vector _VR to
     *  EA(_R1|0 + _R2)
     * If the bit corresponding to _VR is set in CRC
     * then the load/store is performed; otherwise
     * it is skipped.
     * If compiled with IGNORE_VRSAVE defined then
     * the load/store is done unconditionally.
     *
     * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
     * _VR    : target vector register
     * _R1    : base register (NOTE: _R1=r0 uses an
     *          implicit ZERO constant, not the contents
     *          of r0) for address computation.
     * _R2    : 'offset' register for address computation.
     *
     * MODIFIES:      _VR on output if a load operation is performed.
     * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
     *                defined).
     */
    .macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
    bc       4, \_VR, 111f
#endif
    \_OPCODE \_VR, \_R1, \_R2
111:
    .endm
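
    /* Example expansion (a sketch): without IGNORE_VRSAVE,
     *
     *    LDST _OPCODE=stvxl _VR=v20 _R1=r3 _R2=r10
     *
     * expands to
     *
     *    bc    4, 20, 111f    # skip if CR bit 20 is clear
     *    stvxl 20, 3, 10      # i.e., stvxl v20, r3, r10
     *  111:
     *
     * Since PREP_FOR_SAVE (below) copies 'vrsave' into CR, bit 20
     * is the vrsave bit for v20, and unused registers are skipped.
     */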

    /*
     * Load or store four 'adjacent' vector registers.
     *
     * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
     * _VR    : first target vector register
     * _B0    : base register 0 (NOTE: a base register of
     *          r0 uses an implicit ZERO constant, not the
     *          contents of r0, for address computation)
     * _B1    : base register 1
     * _B2    : base register 2
     * _B3    : base register 3
     * _RO    : offset register
     *
     * memory addresses for _VR, _VR+1, _VR+2, _VR+3
     * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
     *
     * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
     *                operation is performed.
     * IMPLICIT USE:  see LDST
     */
    .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
    .endm
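
    /* Example expansion (a sketch): with the registers that
     * _CPU_load_altivec_volatile (below) passes to L_V0TOV19,
     *
     *    LDST4 _OPCODE=lvxl _VR=v16 _B0=r3 _B1=r4 _B2=r8 _B3=r6 _RO=r10
     *
     * performs four vrsave-gated loads:
     *
     *    lvxl v16, r3, r10    # from (r3|0) + r10
     *    lvxl v17, r4, r10    # from  r4    + r10
     *    lvxl v18, r8, r10    # from  r8    + r10
     *    lvxl v19, r6, r10    # from  r6    + r10
     */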

    /*
     * Preload/zero two cache lines and save 4 vector registers
     * to memory.
     * Note that the cache operation targets memory *past* the
     * current storage area, which should hopefully hit when
     * this same code is executed on the next two cache lines...
     *
     * This code effectively does
     *   dcbz (_B0 + 64)
     *   dcbz (_B0 + 64 + 32)
     *   stvx _VR+0, (_B0+ 0)
     *   stvx _VR+1, (_B0+16)
     *   stvx _VR+2, (_B0+32)
     *   stvx _VR+3, (_B0+48)
     *
     * _LRU:  may be 'l' or empty. The former variant should be
     *        used when it is conceivable that the memory area is
     *        unlikely to be used in the near future, thus making
     *        it a candidate for early eviction from the caches.
     *
     *        If it is likely that the memory area is reused soon
     *        (e.g., save/restore across ISR execution) then the
     *        'stvx' opcode (w/o 'l' suffix) should be used.
     *
     * _VR:   first of four target vector registers; _VR+0,
     *        _VR+1, _VR+2, _VR+3 are saved.
     *
     * _B0:   base address of memory area.
     * _B1:   should contain _B0+16 on entry
     * _B2:   should contain _B0+32 on entry
     * _B3:   should contain _B0+48 on entry
     *
     * _O1:   contains the offset where the four vectors are
     *        stored.
     *          _VR  -> (_B0 + _O1) = (_B0 + _O1 +  0 )
     *          _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 )
     *          _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 )
     *          _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 )
     * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
     *        used to address the two cache lines past the
     *        current memory area.
     *
     * MODIFIES: _O2; contains _O1 + 64 after execution of this
     *        code.
     *
     * NOTES: a different set of four vectors can be addressed
     *        simply by changing the one offset register _O1.
     *
     *        Saving more than 4 registers can simply be
     *        achieved by expanding this macro multiple
     *        times with _O1 and _O2 swapped (the new _O1
     *        becomes _O2 = old _O1 + 64), thus stepping
     *        through the memory area.
     *
     */
    .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    dcbz  \_B0, \_O2
    dcbz  \_B2, \_O2
    LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm
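
    /* Effective expansion (a sketch, vrsave gating omitted):
     *
     *    S4VEC_P l _VR=v20 _B0=r3 _B1=r8 _B2=r6 _B3=r7 _O1=r10 _O2=r11
     *
     * becomes
     *
     *    addi  r11, r10, 64   # _O2 = _O1 + 64
     *    dcbz  r3, r11        # zero line at _B0 + _O1 + 64
     *    dcbz  r6, r11        # zero line at _B0 + _O1 + 96 (_B2 = _B0 + 32)
     *    stvxl v20, r3, r10   # store at _B0 + _O1 +  0
     *    stvxl v21, r8, r10   # store at _B0 + _O1 + 16
     *    stvxl v22, r6, r10   # store at _B0 + _O1 + 32
     *    stvxl v23, r7, r10   # store at _B0 + _O1 + 48
     */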

    /*
     * Save eight vector registers by expanding S4VEC_P twice.
     * See notes for S4VEC_P above.
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
     *
     * MODIFIES: After execution,
     *           _O2 contains original _O1 +  64,
     *           _O1 contains original _O1 + 128
     *
     * NOTES:    Expanding this macro multiple times lets you save
     *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
     */
    .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    /* Note that the roles of _O1 and _O2 are swapped here */
    S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
    .endm
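
    /* Offset trace (a sketch, assuming _O1 = 0 on entry):
     *
     *    1st S4VEC_P: _O2 := 64;  stores _VR+0.._VR+3 at  0..63
     *    2nd S4VEC_P: _O1 := 128; stores _VR+4.._VR+7 at 64..127
     *
     * leaving _O1 = 128 and _O2 = 64, so a further expansion
     * continues at offset 128 without reloading _Bx / _Ox.
     */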

    /*
     * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
     *
     * See notes above (for S4VEC_P).
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
     * MODIFIES: _O1 contains original _O1 + 256
     *           _O2 contains original _O1 + 256 - 64
     */
    .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P   \_LRU _VR=v0  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    S8VEC_P   \_LRU _VR=v8  _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
    .endm
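
    /* Resulting layout (a sketch, assuming _O1 = 0 on entry),
     * offsets relative to _B0:
     *
     *    v0 ..v7  at   0..127   (1st S8VEC_P)
     *    v8 ..v15 at 128..255   (2nd S8VEC_P)
     *    v16..v19 at 256..319   (final LDST4 at offset 256)
     *
     * 20 vectors * 16 bytes = 320 bytes; on exit _O1 = 256 and
     * _O2 = 192, matching the MODIFIES note above.
     */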

    /*
     * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
     *
     * See notes above (for S4VEC_P, S_V0TOV19).
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
     * MODIFIES: _O1 contains original _O1 + 128
     *           _O2 contains original _O1 + 128 - 64
     */
    .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P   \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
    .endm

    /*
     * Save all registers to memory area
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
     * MODIFIES: _O1 contains original _O1 + 384
     *           _O2 contains original _O1 + 448
     *           (three S8VEC_P leave _O1 at +384; the final S4VEC_P
     *           sets _O2 to +448, where the last LDST4 stores v28..v31)
     */
    .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P   l  v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P   l  v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P   l v16  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S4VEC_P   l v24  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    LDST4 stvxl v28  \_B0 \_B1 \_B2 \_B3 \_O2
    .endm


    /*
     * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
     * We can pass either of them as arguments to another macro which
     * allows us to decide if the main macro uses dcbt or not when
     * we expand it...
     */
    .macro DO_DCBT _RA, _RB
    dcbt \_RA, \_RB
    .endm

    .macro NO_DCBT _RA, _RB
    .endm

    /*
     * NOTE REGARDING dcbt VS dst
     *
     * Preloading the cache with memory areas that we soon need
     * can be done either using 'dcbt' or 'dst' instructions
     * "ahead of time".
     * When experimenting (on a mpc7457) I found that the 'dst'
     * stream instruction was very efficient if there is enough
     * time to read ahead. It works well when we do a context
     * switch:
     *
     *   1) start DST on new context to be loaded
     *   2) save old context to memory
     *   3) load new context from memory
     *
     * Because of the interleaved step 2) dst works nicely and
     * 3) finds what it needs in the cache.
     *
     * However, in a situation when there is not much time
     * to start the DST, e.g., because we want to restore
     * a context out of the blue (e.g., after returning
     * from an ISR):
     *
     *   1) save volatile registers to memory/stack
     *   2) execute ISR
     *   3) might do a task context switch
     *   4) when returned to old task context then
     *      reload volatile registers from memory/stack.
     *
     * In this situation, preloading the target memory before
     * or after step 1) obviously makes no sense because after
     * 1) the register area is most likely in the cache already.
     *
     * Starting the preload after 2) doesn't make much sense either.
     * If the ISR doesn't lead to a context switch then it is quite
     * likely that the register area is still in the cache.
     * OTOH, if a context switch happens then the preload after 2)
     * might be useless.
     *
     * This leaves us at step 4) where we want to load immediately.
     * In this case, I found that 'dcbt' works more efficiently
     * so that's what we use when restoring volatile registers.
     *
     * When restoring the non-volatile VRs during a 'normal'
     * context switch then we shall use DST (and no dcbt).
     */

    /*
     * Symmetric to S4VEC_P above but addresses loading four
     * vector registers from memory.
     *
     * Touches two cache lines past the current memory area
     * and loads four vectors from the current area.
     *
     * Optionally, the DCBT operation may be omitted
     * (when expanding with _DCBT=NO_DCBT).
     * This is useful if the cache was already preloaded
     * by another means (dst instruction).
     *
     * NOTE: We always use the 'LRU' form of lvx: lvxl,
     *       because we deem it unlikely that the context
     *       that was just loaded has to be saved again
     *       to memory in the immediate future.
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
     *           as explained above.
     *
     * MODIFIES: _O2 contains original _O1 + 64.
     *           _VR.._VR+3 loaded from memory.
     */
    .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi        \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    /* preload/touch 2 lines at offset 64 from _B0 */
    \_DCBT   \_B0, \_O2
    \_DCBT   \_B2, \_O2
    /* load four vectors at offset 0 from _B0      */
    LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

    /*
     * Symmetric to S8VEC_P; loads 8 vector registers
     * from memory -- see comments above...
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
     *           as explained above.
     *
     * MODIFIES: _O1 contains original _O1 + 128.
     *           _O2 contains original _O1 +  64.
     *           _VR.._VR+7 loaded from memory.
     */
    .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
    .endm

    /*
     * Load volatile vector registers v0..v19 employing
     * the DCBT to preload the cache. The rationale for
     * using DCBT here but not when restoring non-volatile
     * registers is explained above, see
     *
     *    "NOTE REGARDING dcbt VS dst"
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
     *           as explained above.
     *
     * MODIFIES: _O1 contains original _O1 + 256.
     *           _O2 contains original _O1 + 256 - 64.
     *           VR0..VR19 loaded from memory.
     */
    .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4    lvxl,    v16, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

    /*
     * Load non-volatile vector registers v20..v31.
     * Note that no DCBT is performed since we use
     * DST for preloading the cache during a context
     * switch, see
     *
     *    "NOTE REGARDING dcbt VS dst"
     *
     * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
     *           as explained above.
     *
     * MODIFIES: _O1 contains original _O1 + 128.
     *           _O2 contains original _O1 + 128 - 64.
     *           VR20..VR31 loaded from memory.
     */
    .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A  NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O1
    .endm

    /*
     * Load all registers from memory area.
     */
    .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A  DO_DCBT,  v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A  DO_DCBT,  v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A  DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A  DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4    lvxl,    v28, \_B0, \_B1, \_B2, \_B3, \_O2
    .endm

    /*
     * Compute
     *     _B1 = _B0 + 16
     *     _B2 = _B0 + 32
     *     _B3 = _B0 + 48
     * and load
     *     _RO = 0
     *
     * convenience macro to be expanded before
     * any of the load/store macros that use
     * four base addresses etc.
     *
     * INPUT: _B0 = cache-aligned start of memory area
     *
     * MODIFIES: _B1, _B2, _B3, _RO as described above.
     */
    .macro CMP_BASES _B0, _B1, _B2, _B3, _RO
    addi       \_B1, \_B0, 1*VECSIZE
    addi       \_B2, \_B0, 2*VECSIZE
    addi       \_B3, \_B0, 3*VECSIZE
    li         \_RO, 0
    .endm

    /*
     * Prepare for saving general vector registers.
     *
     * If not built with #define IGNORE_VRSAVE then
     *
     *  1) copy vrsave to CRC
     *
     * endif
     *
     *  2) copy vrsave to _VRSAVE_REG
     *  3) preload/zero cache line where vrsave and vscr are stored.
     *  4) compute base addresses from _B0
     *  5) preload/zero first two cache lines (remember that the
     *     first S8VEC_P starts preloading/zeroing at offset 64).
     *
     * INPUT:    'vrsave' register, _B0 (base address of memory area)
     * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
     *           _B0 = original _B0 + 32
     *           _B1 = original _B0 + 32 + 16,
     *           _B2 = original _B0 + 32 + 32,
     *           _B3 = original _B0 + 32 + 48,
     *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
     */
    .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
    mfvrsave   \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
    mtcr       \_VRSAVE_REG
#endif
    dcbz       0, \_B0
    addi       \_B0, \_B0, PPC_CACHE_ALIGNMENT
    dcbz       0, \_B0
    CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
    dcbz       0, \_B2
    .endm

    /*
     * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
     * must have been loaded from 'vrsave' and 'vscr', respectively,
     * prior to expanding this macro.
     *
     * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
     *           _VSCR_VREG  VR  holding 'vscr'   contents
     *           _B0 cache-aligned (base) address of memory area.
     * MODIFIES: _SCRATCH_REG
     */
    .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
    stw       \_VRSAVE_REG,   - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    li        \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VSCR_OFF
    stvewx    \_VSCR_VREG,    \_B0, \_SCRATCH_REG
    .endm
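
    /* Address sketch: when this macro is expanded after
     * PREP_FOR_SAVE, _B0 has already been advanced by
     * PPC_CACHE_ALIGNMENT (32), so the stores land in the
     * first cache line of the area:
     *
     *    vrsave -> _B0 - 32 + 16  (= original _B0 + VRSAVE_OFF)
     *    vscr   -> _B0 - 32 + 28  (= original _B0 + VSCR_OFF)
     *
     * stvewx stores the word element selected by the low
     * address bits (EA mod 16 = 12, i.e., the low-order word
     * of _VSCR_VREG, which is where mfvscr deposits vscr).
     */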

    /*
     * Load 'vrsave' and 'vscr' from memory.
     *
     * INPUTS:   _B0 cache-aligned (base) address of memory area.
     * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr)
     *           'vscr', 'vrsave'.
     *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
     *           with IGNORE_VRSAVE undefined).
     */
    .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
    lwz       \_SCRATCH_REG,  - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    mtvrsave  \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
    mtcr      \_SCRATCH_REG
#endif
    li        \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    lvewx     \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
    mtvscr    \_SCRATCH_VREG
    .endm

    /*
     * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
     *
     * INPUT:    _B0
     * MODIFIES: _B0 (as stated above)
     */
    .macro CACHE_DOWNALGN _B0
    rlwinm    \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
    .endm
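
    /* Example (a sketch): with LD_PPC_CACHE_ALIGNMENT = 5 this
     * expands to
     *
     *    rlwinm  _B0, _B0, 0, 0, 26   # keep bits 0..26 (BE), clear low 5
     *
     * e.g., _B0 = 0x1234 becomes 0x1220 -- down-aligned to a
     * 32-byte cache-line boundary.
     */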

    .text

    .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr      r9
#endif

    PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV19     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
    mfvscr        v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr      r9
#endif
    blr

    .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr      r9
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt      0, r3
    /* Point to start of general vector-register area             */
    addi      r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are)    */
    dcbt      0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES     r3, r4, r8, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are)      */
    dcbt      0, r8
    L_V0TOV19 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr      r9
#endif
    blr

    .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

    /* fetch offset of altivec area in context                   */
    CMPOFF    r8
    /* down-align 'to' area to cache-line boundary               */
    add       r4, r4, r8
    CACHE_DOWNALGN r4

    /* Check for PSIM                                            */
    lis       r6, _CPU_altivec_psim_cpu@ha
    lwz       r6, _CPU_altivec_psim_cpu@l(r6)
    cmpli     0, r6, 0
    bne       1f
    /* Skip data-stream instructions on PSIM (not implemented)   */
    dssall
    /* Pre-load new context into cache                           */
    lis       r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
    ori       r6, r6, BSTRIDE
    dstt      r4, r6, ds0
1:

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr      r9
#endif

    /* Is 'from' context == NULL ? (then we just do a 'restore') */
    cmpli     0, r3, 0
    beq       1f           /* yes: skip saving 'from' context    */

    /* SAVE NON-VOLATILE REGISTERS                               */

    /* Compute aligned destination pointer (r8 still holds offset
     * to 'altivec' area in context)
     */
    add       r3, r3, r8
    CACHE_DOWNALGN r3

    PREP_FOR_SAVE r0, r3, r8, r6, r7, r10
    /* The manual says reading vscr can take some time, so we
     * read it here (into a volatile vector register) while
     * we wait for cache blocks to be allocated
     */
    mfvscr    v0
    S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11
    /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
    S_VSCR_VRSAVE r0, v0, r3, r8

1:

    /* LOAD NON-VOLATILE REGISTERS                               */

    /* Advance past vrsave/vscr area                             */
    addi      r4, r4, PPC_CACHE_ALIGNMENT
    L_VSCR_VRSAVE r4, r0, v0
    CMP_BASES r4, r8, r6, r7, r10
    L_V20TOV31 r4, r8, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr      r9
#endif
    blr

    .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
    CMPOFF    r8
    add       r3, r3, r8
    CACHE_DOWNALGN r3
    lis       r8, _CPU_altivec_vrsave_initval@ha
    lwz       r8, _CPU_altivec_vrsave_initval@l(r8)
    stw       r8, VRSAVE_OFF(r3)
    lis       r6, _CPU_altivec_vscr_initval@ha
    lwz       r6, _CPU_altivec_vscr_initval@l(r6)
    stw       r6, VSCR_OFF(r3)
    blr

    /*
     * Change the initial value of VRSAVE.
     * Can be used by initialization code if
     * it is determined that code was compiled
     * with -mvrsave=no. In this case, VRSAVE
     * must be set to all-ones, which causes this
     * support code to save/restore *all* registers
     * (this only has an effect if IGNORE_VRSAVE is
     * not defined -- otherwise all registers are
     * saved/restored anyway).
     */
    .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
    lis       r8, _CPU_altivec_vrsave_initval@ha
    stw       r3, _CPU_altivec_vrsave_initval@l(r8)
    mtvrsave  r3
    blr

#ifdef ALTIVEC_TESTING
    .global msr_VE_on
msr_VE_on:
    mfmsr r3
    oris  r3, r3, 1<<(31-6-16)
    mtmsr r3
    blr

    .global msr_VE_off
msr_VE_off:
    mfmsr r3
    lis   r4,  1<<(31-6-16)
    andc  r3, r3, r4
    mtmsr r3
    blr


    .global mfvrsave
mfvrsave:
    mfvrsave r3
    blr

    .global mtvrsave
mtvrsave:
    mtvrsave r3
    blr

    /* Load all vector registers from memory area.
     * NOTE: This routine is not strictly ABI compliant --
     *       it guarantees that volatile vector registers
     *       have certain values on exit!
     */
    .global _CPU_altivec_load_all
_CPU_altivec_load_all:
    /* Align address up to next cache-line boundary */
    addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr      r9
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt      0, r3
    /* Point to start of general vector-register area             */
    addi      r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are)    */
    dcbt      0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES     r3, r4, r8, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are)      */
    dcbt      0, r8
    L_V0TOV31 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr      r9
#endif
    blr

    .global _CPU_altivec_save_all
_CPU_altivec_save_all:
    /* Align address up to next cache-line boundary */
    addi      r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr      r9
#endif

    PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV31     _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
    mfvscr        v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr      r9
#endif
    blr


#if 0
    .gnu_attribute 4,1
    .gnu_attribute 8,1
#endif

#endif
#endif