#ifdef __ALTIVEC__

/* Altivec support for RTEMS; vector register context management. */

/*
 * Authorship
 * ----------
 * This software was created by
 *     Till Straumann <strauman@slac.stanford.edu>, 2009,
 *     Stanford Linear Accelerator Center, Stanford University.
 *
 * Acknowledgement of sponsorship
 * ------------------------------
 * This software was produced by
 *     the Stanford Linear Accelerator Center, Stanford University,
 *     under Contract DE-AC03-76SFO0515 with the Department of Energy.
 *
 * Government disclaimer of liability
 * ----------------------------------
 * Neither the United States nor the United States Department of Energy,
 * nor any of their employees, makes any warranty, express or implied, or
 * assumes any legal liability or responsibility for the accuracy,
 * completeness, or usefulness of any data, apparatus, product, or process
 * disclosed, or represents that its use would not infringe privately owned
 * rights.
 *
 * Stanford disclaimer of liability
 * --------------------------------
 * Stanford University makes no representations or warranties, express or
 * implied, nor assumes any liability for the use of this software.
 *
 * Stanford disclaimer of copyright
 * --------------------------------
 * Stanford University, owner of the copyright, hereby disclaims its
 * copyright and all other rights in this software.  Hence, anyone may
 * freely use it for any purpose without restriction.
 *
 * Maintenance of notices
 * ----------------------
 * In the interest of clarity regarding the origin and status of this
 * SLAC software, this and all the preceding Stanford University notices
 * are to remain affixed to any copy or derivative of this software made
 * or distributed by the recipient and are to be affixed to any copy of
 * software made or distributed by the recipient that contains a copy or
 * derivative of this software.
 *
 * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
 */


#include <rtems/powerpc/powerpc.h>

#ifndef PPC_CACHE_ALIGNMENT
#error "Missing header; PPC_CACHE_ALIGNMENT is not defined"
#endif

#define ALTIVEC_TESTING

#if PPC_CACHE_ALIGNMENT != 32
#error "Altivec support assumes cache-line size is 32 bytes!"
#else
#undef  LD_PPC_CACHE_ALIGNMENT
#define LD_PPC_CACHE_ALIGNMENT 5
#endif
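/*
 * Illustration: LD_PPC_CACHE_ALIGNMENT is log2(PPC_CACHE_ALIGNMENT); the
 * CACHE_DOWNALGN macro further below uses it to clear that many low-order
 * address bits, i.e., it computes
 *
 *     addr &= ~(PPC_CACHE_ALIGNMENT - 1)
 *
 * With 32-byte cache lines, e.g., an address of 0x1234 is down-aligned
 * to 0x1220.
 */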
.set    v0,      0
.set    v8,      8
.set    v16,    16
.set    v20,    20
.set    v24,    24
.set    v28,    28

.set    r0,      0
.set    r3,      3
.set    r4,      4
/* Do not use r5, since this is used by _CPU_Context_switch() */
.set    r6,      6
.set    r7,      7
.set    r8,      8
.set    r9,      9
.set    r10,    10
.set    r11,    11
/* Do not use r12, since this is used by _CPU_Context_switch() */

.set    cr5,     5

.set    VECSIZE,    16

.set    VRSAVE_INIT_VAL,    0
.set    VSCR_INIT_VAL,      0

.set    VRSAVE_OFF, 16
.set    VSCR_OFF,   16+12

.set    ds0,    0

/* Block size for dst -- in units of 16 bytes */
.set    BSIZE,   2          /* = 32 bytes */
.set    BCNT,    12/2+1     /* 12 non-volatile registers + area for vscr/vrsave */
.set    BSTRIDE, 32         /* bytes */

.data

.global _CPU_altivec_vrsave_initval
_CPU_altivec_vrsave_initval:
.long   0

.global _CPU_altivec_vscr_initval
_CPU_altivec_vscr_initval:
.long   0

.text

.extern _CPU_altivec_psim_cpu
.extern _CPU_altivec_ctxt_off

.macro CMPOFF _B0
    lis \_B0, _CPU_altivec_ctxt_off@ha
    lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0)
.endm

/* Conditionally load or store a vector _VR to
 * EA(_R1|0 + _R2).
 * If the bit corresponding to _VR is set in CRC
 * then the load/store is performed; otherwise
 * it is skipped.
 * If compiled with IGNORE_VRSAVE defined then
 * the load/store is done unconditionally.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _R1    : base register (NOTE: _R1=r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0) for address computation.
 * _R2    : 'offset' register for address computation.
 *
 * MODIFIES:      _VR on output if a load operation is performed.
 * IMPLICIT USE:  CRC (unless compiled with IGNORE_VRSAVE
 *                defined).
 */
.macro LDST _OPCODE, _VR, _R1, _R2
#ifndef IGNORE_VRSAVE
    bc       4, \_VR, 111f
#endif
    \_OPCODE \_VR, \_R1, \_R2
111:
.endm

/*
 * Load or store four 'adjacent' vector registers.
 *
 * _OPCODE: intended to be lvx, lvxl, stvx or stvxl
 * _VR    : target vector register
 * _B0    : base register 0 (NOTE: passing r0 uses an
 *          implicit ZERO constant, not the contents
 *          of r0, for address computation)
 * _B1    : base register 1
 * _B2    : base register 2
 * _B3    : base register 3
 * _RO    : offset register
 *
 * memory addresses for _VR, _VR+1, _VR+2, _VR+3
 * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively.
 *
 * MODIFIES:      _VR, _VR+1, _VR+2, _VR+3 if a load
 *                operation is performed.
 * IMPLICIT USE:  see LDST
 */
.macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO
    LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO
.endm
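/*
 * Example expansion (illustrative, assuming IGNORE_VRSAVE is not defined):
 *
 *     LDST _OPCODE=stvx _VR=v3 _R1=r4 _R2=r10
 *
 * yields
 *
 *     bc   4, 3, 111f    ; skip if CR bit 3 (copied from VRSAVE) is clear
 *     stvx 3, 4, 10      ; store v3 to EA = (r4) + (r10)
 * 111:
 *
 * i.e., only vector registers whose VRSAVE bit is set are actually
 * transferred (CRC is loaded from VRSAVE by PREP_FOR_SAVE / L_VSCR_VRSAVE
 * below).
 */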
/*
 * Preload/zero two cache lines and save 4 vector registers
 * to memory.
 * Note that the cache operation targets memory *past* the
 * current storage area which should hopefully hit when
 * this same code is executed on the next two cache lines...
 *
 * This code effectively does
 *   dcbz (_B0 + 64)
 *   dcbz (_B0 + 64 + 32)
 *   stvx _VR+0, (_B0+ 0)
 *   stvx _VR+1, (_B0+16)
 *   stvx _VR+2, (_B0+32)
 *   stvx _VR+3, (_B0+48)
 *
 * _LRU:  may be 'l' or empty. The former variant should be
 *        used when it is conceivable that the memory area is
 *        unlikely to be used in the near future, thus making
 *        it a candidate for early eviction from the caches.
 *
 *        If it is likely that the memory area is reused soon
 *        (e.g., save/restore across ISR execution) then the
 *        'stvx' opcode (w/o 'l' suffix) should be used.
 *
 * _VR:   first of four target vector registers; _VR+0,
 *        _VR+1, _VR+2, _VR+3 are saved.
 *
 * _B0:   base address of memory area.
 * _B1:   should contain _B0+16 on entry
 * _B2:   should contain _B0+32 on entry
 * _B3:   should contain _B0+48 on entry
 *
 * _O1:   contains the offset where the four vectors are
 *        stored.
 *          _VR   -> (_B0 + _O1) = (_B0 + _O1 +  0 )
 *          _VR+1 -> (_B1 + _O1) = (_B0 + _O1 + 16 )
 *          _VR+2 -> (_B2 + _O1) = (_B0 + _O1 + 32 )
 *          _VR+3 -> (_B3 + _O1) = (_B0 + _O1 + 48 )
 * _O2:   is set to _O1 + 64 by this macro. Hence _O2 is
 *        used to address the two cache lines past the
 *        current memory area.
 *
 * MODIFIES: _O2; contains _O1 + 64 after execution of this
 *        code.
 *
 * NOTES: a different set of four vectors can be addressed
 *        simply by changing the one offset register _O1.
 *
 *        Saving more than 4 registers can simply be
 *        achieved by expanding this macro multiple
 *        times with _O1 and _O2 swapped (new _O1
 *        becomes _O2 = old _O1 + 64), thus stepping
 *        through the memory area.
 *
 */
.macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi  \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    dcbz  \_B0, \_O2
    dcbz  \_B2, \_O2
    LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
.endm

/*
 * Save eight vector registers by expanding S4VEC_P twice.
 * See notes for S4VEC_P above.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 *
 * MODIFIES: After execution,
 *           _O2 contains original _O1 + 64,
 *           _O1 contains original _O1 + 128
 *
 * NOTES:    Expanding this macro multiple times lets you save
 *           multiple blocks of 8 registers (no reload of _Bx / _Ox is needed).
 */
.macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    /* Note that the roles of _O1 and _O2 are swapped here */
    S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1
.endm
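/*
 * Worked example (illustrative): a single expansion
 *
 *     S8VEC_P l _VR=v0 _B0=r3 _B1=r4 _B2=r8 _B3=r6 _O1=r10 _O2=r11
 *
 * with r10 = 0 on entry stores
 *
 *     v0..v3 at (r3 +  0), (r3 + 16), (r3 + 32), (r3 + 48)
 *     v4..v7 at (r3 + 64), (r3 + 80), (r3 + 96), (r3 +112)
 *
 * while zeroing the cache lines at r3+64/r3+96 and then r3+128/r3+160
 * ahead of the stores. On exit r11 = 64 and r10 = 128, so the next
 * S8VEC_P expansion continues seamlessly at offset 128.
 */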
/*
 * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 256
 *           _O2 contains original _O1 + 256 - 64
 */
.macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1
.endm

/*
 * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1)
 *
 * See notes above (for S4VEC_P, S_V0TOV19).
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 128
 *           _O2 contains original _O1 + 128 - 64
 */
.macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2
    LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1
.endm

/*
 * Save all registers to memory area
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be preloaded (see above)
 * MODIFIES: _O1 contains original _O1 + 512
 *           _O2 contains original _O1 + 512 - 64
 */
.macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    S8VEC_P l v0  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v8  \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2
    LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2
.endm


/*
 * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively.
 * We can pass either of them as arguments to another macro which
 * allows us to decide if the main macro uses dcbt or not when
 * we expand it...
 */
.macro DO_DCBT _RA, _RB
    dcbt \_RA, \_RB
.endm

.macro NO_DCBT _RA, _RB
.endm

/*
 * NOTE REGARDING dcbt VS dst
 *
 * Preloading the cache with memory areas that we soon need
 * can be done either using 'dcbt' or 'dst' instructions
 * "ahead of time".
 * When experimenting (on a mpc7457) I found that the 'dst'
 * stream instruction was very efficient if there is enough
 * time to read ahead. It works well when we do a context
 * switch:
 *
 *   1) start DST on new context to be loaded
 *   2) save old context to memory
 *   3) load new context from memory
 *
 * Because of the interleaved step 2) dst works nicely and
 * 3) finds what it needs in the cache.
 *
 * However, in a situation when there is not much time
 * to start the DST, e.g., because we want to restore
 * a context out of the blue (e.g., after returning
 * from an ISR):
 *
 *   1) save volatile registers to memory/stack
 *   2) execute ISR
 *   3) might do a task context switch
 *   4) when returned to old task context then
 *      reload volatile registers from memory/stack.
 *
 * In this situation, preloading the target memory before
 * or after step 1) obviously makes no sense because after
 * 1) the register area is most likely in the cache already.
 *
 * Starting preload after 2) doesn't make much sense either.
 * If the ISR doesn't lead to a context switch then it is quite
 * likely that the register area is still in the cache.
 * OTOH, if a context switch happens then the preload after 2)
 * might be useless.
 *
 * This leaves us at step 4) where we want to load immediately.
 * In this case, I found that 'dcbt' works more efficiently
 * so that's what we use when restoring volatile registers.
 *
 * When restoring the non-volatile VRs during a 'normal'
 * context switch then we shall use DST (and no dcbt).
 */
/*
 * Symmetric to S4VEC_P above but addresses loading four
 * vector registers from memory.
 *
 * Touches two cache lines past the current memory area
 * and loads four vectors from the current area.
 *
 * Optionally, the DCBT operation may be omitted
 * (when expanding with _DCBT=NO_DCBT).
 * This is useful if the cache was already preloaded
 * by another means (dst instruction).
 *
 * NOTE: We always use the 'LRU' form of lvx: lvxl,
 *       because we deem it unlikely that the context
 *       that was just loaded has to be saved again
 *       to memory in the immediate future.
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O2 contains original _O1 + 64.
 *           _VR.._VR+3 loaded from memory.
 */
.macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT
    /* preload/touch 2 lines at offset 64 from _B0 */
    \_DCBT \_B0, \_O2
    \_DCBT \_B2, \_O2
    /* load four vectors at offset 0 from _B0 */
    LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1
.endm

/*
 * Symmetric to S8VEC_P; loads 8 vector registers
 * from memory -- see comments above...
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 64.
 *           _VR.._VR+7 loaded from memory.
 */
.macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2
    L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1
.endm

/*
 * Load volatile vector registers v0..v19 employing
 * the DCBT to preload the cache. The rationale for
 * using DCBT here but not when restoring non-volatile
 * registers is explained above, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 256.
 *           _O2 contains original _O1 + 256 - 64.
 *           VR0..VR19 loaded from memory.
 */
.macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1
.endm

/*
 * Load non-volatile vector registers v20..v31.
 * Note that no DCBT is performed since we use
 * DST for preloading the cache during a context
 * switch, see
 *
 *     "NOTE REGARDING dcbt VS dst"
 *
 * INPUTS:   _B0, _B1, _B2, _B3, _O1 must be loaded
 *           as explained above.
 *
 * MODIFIES: _O1 contains original _O1 + 128.
 *           _O2 contains original _O1 + 128 - 64.
 *           VR20..VR31 loaded from memory.
 */
.macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1
.endm
/*
 * Load all registers from memory area.
 */
.macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2
    L8VEC_A DO_DCBT, v0,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v8,  \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2
    LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2
.endm

/*
 * Compute
 *    _B1 = _B0 + 16
 *    _B2 = _B0 + 32
 *    _B3 = _B0 + 48
 * and load
 *    _RO = 0
 *
 * Convenience macro to be expanded before
 * any of the load/store macros that use
 * four base addresses etc.
 *
 * INPUT:    _B0 = cache-aligned start of memory area
 *
 * MODIFIES: _B1, _B2, _B3, _RO as described above.
 */
.macro CMP_BASES _B0, _B1, _B2, _B3, _RO
    addi \_B1, \_B0, 1*VECSIZE
    addi \_B2, \_B0, 2*VECSIZE
    addi \_B3, \_B0, 3*VECSIZE
    li   \_RO, 0
.endm

/*
 * Prepare for saving general vector registers.
 *
 * If not built with #define IGNORE_VRSAVE then
 *
 *    1) copy vrsave to CRC
 *
 * endif
 *
 *    2) copy vrsave to _VRSAVE_REG
 *    3) preload/zero cache line where vrsave and vscr are stored.
 *    4) compute base addresses from _B0
 *    5) preload/zero first two cache lines (remember that the
 *       first S8VEC_P starts preloading/zeroing at offset 64).
 *
 * INPUT:    'vrsave' register, _B0 (base address of memory area)
 * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave')
 *           _B0 = original _B0 + 32
 *           _B1 = original _B0 + 32 + 16,
 *           _B2 = original _B0 + 32 + 32,
 *           _B3 = original _B0 + 32 + 48,
 *           CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined)
 */
.macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO
    mfvrsave \_VRSAVE_REG
#ifndef IGNORE_VRSAVE
    mtcr     \_VRSAVE_REG
#endif
    dcbz     0, \_B0
    addi     \_B0, \_B0, PPC_CACHE_ALIGNMENT
    dcbz     0, \_B0
    CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO
    dcbz     0, \_B2
.endm

/*
 * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers
 * must have been loaded from 'vrsave' and 'vscr', respectively,
 * prior to expanding this macro.
 *
 * INPUTS:   _VRSAVE_REG GPR holding 'vrsave' contents
 *           _VSCR_VREG  VR holding 'vscr' contents
 *           _B0         cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG
 */
.macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG
    stw    \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    li     \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG
.endm
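/*
 * Illustration of the resulting layout (derived from PREP_FOR_SAVE and
 * S_VSCR_VRSAVE above; offsets relative to the cache-aligned base address
 * *before* PREP_FOR_SAVE advances _B0 by one cache line):
 *
 *   + 0 .. 15 : unused
 *   +16       : vrsave  (word, VRSAVE_OFF)
 *   +28       : vscr    (word, VSCR_OFF)
 *   +32 ...   : vector registers, 16 bytes each (v0.. or v20..,
 *               depending on which save macro is expanded)
 *
 * Note that with VSCR_OFF = 28 the EA passed to stvewx selects word
 * element 3 of _VSCR_VREG, which is where mfvscr places the VSCR.
 */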
/*
 * Load 'vrsave' and 'vscr' from memory.
 *
 * INPUTS:   _B0 cache-aligned (base) address of memory area.
 * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr),
 *           'vscr', 'vrsave'.
 *           CRC (holds contents of 'vrsave') (ONLY IF COMPILED
 *           with IGNORE_VRSAVE undefined).
 */
.macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG
    lwz      \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0)
    mtvrsave \_SCRATCH_REG
#ifndef IGNORE_VRSAVE
    mtcr     \_SCRATCH_REG
#endif
    li       \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF
    lvewx    \_SCRATCH_VREG, \_B0, \_SCRATCH_REG
    mtvscr   \_SCRATCH_VREG
.endm

/*
 * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1)
 *
 * INPUT:    _B0
 * MODIFIES: _B0 (as stated above)
 */
.macro CACHE_DOWNALGN _B0
    rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT
.endm

    .text

    .global _CPU_save_altivec_volatile
_CPU_save_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r9
#endif

    PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV19 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r9
#endif
    blr

    .global _CPU_load_altivec_volatile
_CPU_load_altivec_volatile:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r9
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r8, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r8
    L_V0TOV19 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r9
#endif
    blr

    .global _CPU_Context_switch_altivec
_CPU_Context_switch_altivec:

    /* fetch offset of altivec area in context */
    CMPOFF r8
    /* down-align 'to' area to cache-line boundary */
    add r4, r4, r8
    CACHE_DOWNALGN r4

    /* Check for PSIM */
    lis   r6, _CPU_altivec_psim_cpu@ha
    lwz   r6, _CPU_altivec_psim_cpu@l(r6)
    cmpli 0, r6, 0
    bne   1f
    /* Skip data-stream instructions on PSIM (not implemented) */
    dssall
    /* Pre-load new context into cache */
    lis   r6, (BSIZE<<(24-16)) | (BCNT<<(16-16))
    ori   r6, r6, BSTRIDE
    dstt  r4, r6, ds0
1:
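/*
 * Illustration (computed from the BSIZE/BCNT/BSTRIDE settings at the top
 * of this file): the dstt control word assembled in r6 above is
 *
 *     (BSIZE << 24) | (BCNT << 16) | BSTRIDE
 *   = (2 << 24) | (7 << 16) | 32  =  0x02070020
 *
 * i.e., prefetch 7 blocks of 2 vectors (32 bytes) each, 32 bytes apart:
 * 224 bytes total, covering the vrsave/vscr line plus the 12 non-volatile
 * vector registers of the 'to' context.
 */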
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r9
#endif

    /* Is 'from' context == NULL ? (then we just do a 'restore') */
    cmpli 0, r3, 0
    beq   1f          /* yes: skip saving 'from' context */

    /* SAVE NON-VOLATILE REGISTERS */

    /* Compute aligned destination pointer (r8 still holds offset
     * to 'altivec' area in context)
     */
    add r3, r3, r8
    CACHE_DOWNALGN r3

    PREP_FOR_SAVE r0, r3, r8, r6, r7, r10
    /* The manual says reading vscr can take some time - do
     * read it here (into a volatile vector register) while
     * we wait for cache blocks to be allocated
     */
    mfvscr v0
    S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11
    /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */
    S_VSCR_VRSAVE r0, v0, r3, r8

1:

    /* LOAD NON-VOLATILE REGISTERS */

    /* Advance past vrsave/vscr area */
    addi r4, r4, PPC_CACHE_ALIGNMENT
    L_VSCR_VRSAVE r4, r0, v0
    CMP_BASES r4, r8, r6, r7, r10
    L_V20TOV31 r4, r8, r6, r7, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r9
#endif
    blr

    .global _CPU_Context_initialize_altivec
_CPU_Context_initialize_altivec:
    CMPOFF r8
    add r3, r3, r8
    CACHE_DOWNALGN r3
    lis r8, _CPU_altivec_vrsave_initval@ha
    lwz r8, _CPU_altivec_vrsave_initval@l(r8)
    stw r8, VRSAVE_OFF(r3)
    lis r6, _CPU_altivec_vscr_initval@ha
    lwz r6, _CPU_altivec_vscr_initval@l(r6)
    stw r6, VSCR_OFF(r3)
    blr

/*
 * Change the initial value of VRSAVE.
 * Can be used by initialization code if
 * it is determined that code was compiled
 * with -mvrsave=no. In this case, VRSAVE
 * must be set to all-ones which causes this
 * support code to save/restore *all* registers
 * (only has an effect if IGNORE_VRSAVE is
 * not defined -- otherwise all registers are
 * saved/restored anyway).
 */
    .global _CPU_altivec_set_vrsave_initval
_CPU_altivec_set_vrsave_initval:
    lis r8, _CPU_altivec_vrsave_initval@ha
    stw r3, _CPU_altivec_vrsave_initval@l(r8)
    mtvrsave r3
    blr

#ifdef ALTIVEC_TESTING
    .global msr_VE_on
msr_VE_on:
    mfmsr r3
    oris  r3, r3, 1<<(31-6-16)
    mtmsr r3
    blr

    .global msr_VE_off
msr_VE_off:
    mfmsr r3
    lis   r4, 1<<(31-6-16)
    andc  r3, r3, r4
    mtmsr r3
    blr
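/*
 * Illustration: MSR bit 6 (big-endian bit numbering) is the AltiVec enable
 * bit MSR[VEC], i.e., mask 1<<(31-6) = 0x02000000. Since lis/oris operate
 * on the upper halfword, the immediate used above is that mask shifted
 * right by 16: 1<<(31-6-16) = 0x0200.
 */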
    .global mfvrsave
mfvrsave:
    mfvrsave r3
    blr

    .global mtvrsave
mtvrsave:
    mtvrsave r3
    blr

/* Load all vector registers from memory area.
 * NOTE: This routine is not strictly ABI compliant --
 *       it guarantees that volatile vector registers
 *       have certain values on exit!
 */
    .global _CPU_altivec_load_all
_CPU_altivec_load_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3
#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r9
#endif

    /* Try to preload 1st line (where vscr and vrsave are stored) */
    dcbt 0, r3
    /* Point to start of general vector-register area */
    addi r3, r3, PPC_CACHE_ALIGNMENT
    /* Start preloading 2nd line (where first two vectors are) */
    dcbt 0, r3
    L_VSCR_VRSAVE r3, r0, v0
    CMP_BASES r3, r4, r8, r6, r10
    /* Start preloading 3rd line (where vectors 3 and 4 are) */
    dcbt 0, r8
    L_V0TOV31 r3, r4, r8, r6, r10, r11

#ifndef IGNORE_VRSAVE
    mtcr r9
#endif
    blr

    .global _CPU_altivec_save_all
_CPU_altivec_save_all:
    /* Align address up to next cache-line boundary */
    addi r3, r3, PPC_CACHE_ALIGNMENT - 1
    CACHE_DOWNALGN r3

#ifndef IGNORE_VRSAVE
    /* Save CRC -- it is used implicitly by all the LOAD/STORE macros
     * when testing if we really should do the load/store operation.
     */
    mfcr r9
#endif

    PREP_FOR_SAVE r0, r3, r4, r8, r6, r10
    /* r0 now contains VRSAVE, r3 still the aligned memory area
     * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3,
     * respectively. r10 holds zero
     */
    S_V0TOV31 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11
    mfvscr v0
    /* Store vrsave (still in r0) and vscr (in v0) to memory area */
    S_VSCR_VRSAVE r0, v0, r3, r11

#ifndef IGNORE_VRSAVE
    /* Restore CRC */
    mtcr r9
#endif
    blr


#if 0
    .gnu_attribute 4,1
    .gnu_attribute 8,1
#endif

#endif
#endif