File indexing completed on 2025-05-11 08:24:28
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196 #define DP
0197 #define ROLL
0198
0199 #ifdef SP
0200 #define REAL float
0201 #define ZERO 0.0
0202 #define ONE 1.0
0203 #define PREC "Single "
0204 #endif
0205
0206 #ifdef DP
0207 #define REAL double
0208 #define ZERO 0.0e0
0209 #define ONE 1.0e0
0210 #define PREC "Double "
0211 #endif
0212
0213 #ifdef ROLL
0214 #define ROLLING "Rolled "
0215 #endif
0216 #ifdef UNROLL
0217 #define ROLLING "Unrolled "
0218 #endif
0219
0220
0221 #define NTIMES atoi(argv[1])
0222
0223 #include <stdio.h>
0224 #include <math.h>
0225 #include <stdlib.h>
0226 #ifdef __rtems__
0227 #include <rtems/test-printer.h>
0228 #undef print_time
0229 #define fprintf(f, ...) rtems_printf(&rtems_test_printer, __VA_ARGS__)
0230 #endif
0231
0232
0233 static REAL atime[9][15];
0234
0235 void print_time (int row);
0236 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
0237 void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
0238 void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
0239 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
0240 void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
0241 REAL epslon (REAL x);
0242 int idamax (int n, REAL dx[], int incx);
0243 void dscal (int n, REAL da, REAL dx[], int incx);
0244 REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
0245
0246
0247 #include <sys/time.h> /* for following time functions only */
0248 static REAL second(void)
0249 {
0250 struct timeval tv;
0251
0252 gettimeofday(&tv, NULL);
0253 return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
0254 }
0255
0256 int main (int argc, char **argv)
0257 {
0258 static REAL aa[200*200],a[200*201],b[200],x[200];
0259 REAL cray,ops,total,norma,normx;
0260 REAL resid,residn,eps,t1,tm2,epsn,x1,x2;
0261 REAL mflops;
0262 static int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
0263 int pass, loop;
0264 REAL overhead1, overhead2, time1, time2;
0265 char *compiler, *options;
0266
0267
0268
0269
0270
0271 compiler = "INSERT COMPILER NAME HERE";
0272 options = "INSERT OPTIMISATION OPTIONS HERE";
0273
0274
0275 lda = 201;
0276 ldaa = 200;
0277 cray = .056;
0278 n = 100;
0279
0280 fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0281 fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n");
0282 fprintf(stdout,"Compiler %s\n",compiler);
0283 fprintf(stdout,"Optimisation %s\n\n",options);
0284
0285 ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
0286
0287 matgen(a,lda,n,b,&norma);
0288 t1 = second();
0289 dgefa(a,lda,n,ipvt,&info);
0290 atime[0][0] = second() - t1;
0291 t1 = second();
0292 dgesl(a,lda,n,ipvt,b,0);
0293 atime[1][0] = second() - t1;
0294 total = atime[0][0] + atime[1][0];
0295
0296
0297
0298 for (i = 0; i < n; i++) {
0299 x[i] = b[i];
0300 }
0301 matgen(a,lda,n,b,&norma);
0302 for (i = 0; i < n; i++) {
0303 b[i] = -b[i];
0304 }
0305 dmxpy(n,b,n,lda,x,a);
0306 resid = 0.0;
0307 normx = 0.0;
0308 for (i = 0; i < n; i++) {
0309 resid = (resid > fabs((double)b[i]))
0310 ? resid : fabs((double)b[i]);
0311 normx = (normx > fabs((double)x[i]))
0312 ? normx : fabs((double)x[i]);
0313 }
0314 eps = epslon(ONE);
0315 residn = resid/( n*norma*normx*eps );
0316 epsn = eps;
0317 x1 = x[0] - 1;
0318 x2 = x[n-1] - 1;
0319
0320 printf("norm resid resid machep");
0321 printf(" x[0]-1 x[n-1]-1\n");
0322 printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
0323 (double)residn, (double)resid, (double)epsn,
0324 (double)x1, (double)x2);
0325
0326 fprintf(stdout,"Times are reported for matrices of order %5d\n",n);
0327 fprintf(stdout,"1 pass times for array with leading dimension of%5d\n\n",lda);
0328 fprintf(stdout," dgefa dgesl total Mflops unit");
0329 fprintf(stdout," ratio\n");
0330
0331 atime[2][0] = total;
0332 if (total > 0.0)
0333 {
0334 atime[3][0] = ops/(1.0e6*total);
0335 atime[4][0] = 2.0/atime[3][0];
0336 }
0337 else
0338 {
0339 atime[3][0] = 0.0;
0340 atime[4][0] = 0.0;
0341 }
0342 atime[5][0] = total/cray;
0343
0344 print_time(0);
0345
0346
0347
0348
0349
0350 fprintf (stdout,"\nCalculating matgen overhead\n");
0351 pass = -20;
0352 loop = NTIMES;
0353 do
0354 {
0355 time1 = second();
0356 pass = pass + 1;
0357 for ( i = 0 ; i < loop ; i++)
0358 {
0359 matgen(a,lda,n,b,&norma);
0360 }
0361 time2 = second();
0362 overhead1 = (time2 - time1);
0363 fprintf (stdout,"%10d times %6.2f seconds\n", loop, overhead1);
0364 if (overhead1 > 5.0)
0365 {
0366 pass = 0;
0367 }
0368 if (pass < 0)
0369 {
0370 if (overhead1 < 0.1)
0371 {
0372 loop = loop * 10;
0373 }
0374 else
0375 {
0376 loop = loop * 2;
0377 }
0378 }
0379 }
0380 while (pass < 0);
0381
0382 overhead1 = overhead1 / (double)loop;
0383
0384 fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
0385
0386
0387
0388
0389
0390 fprintf (stdout,"Calculating matgen/dgefa passes for 5 seconds\n");
0391 pass = -20;
0392 ntimes = NTIMES;
0393 do
0394 {
0395 time1 = second();
0396 pass = pass + 1;
0397 for ( i = 0 ; i < ntimes ; i++)
0398 {
0399 matgen(a,lda,n,b,&norma);
0400 dgefa(a,lda,n,ipvt,&info );
0401 }
0402 time2 = second() - time1;
0403 fprintf (stdout,"%10d times %6.2f seconds\n", ntimes, time2);
0404 if (time2 > 5.0)
0405 {
0406 pass = 0;
0407 }
0408 if (pass < 0)
0409 {
0410 if (time2 < 0.1)
0411 {
0412 ntimes = ntimes * 10;
0413 }
0414 else
0415 {
0416 ntimes = ntimes * 2;
0417 }
0418 }
0419 }
0420 while (pass < 0);
0421
0422 ntimes = 5.0 * (double)ntimes / time2;
0423 if (ntimes == 0) ntimes = 1;
0424
0425 fprintf (stdout,"Passes used %10d \n\n", ntimes);
0426 fprintf(stdout,"Times for array with leading dimension of%4d\n\n",lda);
0427 fprintf(stdout," dgefa dgesl total Mflops unit");
0428 fprintf(stdout," ratio\n");
0429
0430
0431
0432
0433
0434 tm2 = ntimes * overhead1;
0435 atime[3][6] = 0;
0436
0437 for (j=1 ; j<6 ; j++)
0438 {
0439
0440 t1 = second();
0441
0442 for (i = 0; i < ntimes; i++)
0443 {
0444 matgen(a,lda,n,b,&norma);
0445 dgefa(a,lda,n,ipvt,&info );
0446 }
0447
0448 atime[0][j] = (second() - t1 - tm2)/ntimes;
0449
0450 t1 = second();
0451
0452 for (i = 0; i < ntimes; i++)
0453 {
0454 dgesl(a,lda,n,ipvt,b,0);
0455 }
0456
0457 atime[1][j] = (second() - t1)/ntimes;
0458 total = atime[0][j] + atime[1][j];
0459 atime[2][j] = total;
0460 atime[3][j] = ops/(1.0e6*total);
0461 atime[4][j] = 2.0/atime[3][j];
0462 atime[5][j] = total/cray;
0463 atime[3][6] = atime[3][6] + atime[3][j];
0464
0465 print_time(j);
0466 }
0467 atime[3][6] = atime[3][6] / 5.0;
0468 fprintf (stdout,"Average %11.2f\n",
0469 (double)atime[3][6]);
0470
0471 fprintf (stdout,"\nCalculating matgen2 overhead\n");
0472
0473
0474
0475
0476
0477 time1 = second();
0478 for ( i = 0 ; i < loop ; i++)
0479 {
0480 matgen(aa,ldaa,n,b,&norma);
0481 }
0482 time2 = second();
0483 overhead2 = (time2 - time1);
0484 overhead2 = overhead2 / (double)loop;
0485
0486 fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
0487 fprintf(stdout,"Times for array with leading dimension of%4d\n\n",ldaa);
0488 fprintf(stdout," dgefa dgesl total Mflops unit");
0489 fprintf(stdout," ratio\n");
0490
0491
0492
0493
0494
0495 tm2 = ntimes * overhead2;
0496 atime[3][12] = 0;
0497
0498 for (j=7 ; j<12 ; j++)
0499 {
0500
0501 t1 = second();
0502
0503 for (i = 0; i < ntimes; i++)
0504 {
0505 matgen(aa,ldaa,n,b,&norma);
0506 dgefa(aa,ldaa,n,ipvt,&info );
0507 }
0508
0509 atime[0][j] = (second() - t1 - tm2)/ntimes;
0510
0511 t1 = second();
0512
0513 for (i = 0; i < ntimes; i++)
0514 {
0515 dgesl(aa,ldaa,n,ipvt,b,0);
0516 }
0517
0518 atime[1][j] = (second() - t1)/ntimes;
0519 total = atime[0][j] + atime[1][j];
0520 atime[2][j] = total;
0521 atime[3][j] = ops/(1.0e6*total);
0522 atime[4][j] = 2.0/atime[3][j];
0523 atime[5][j] = total/cray;
0524 atime[3][12] = atime[3][12] + atime[3][j];
0525
0526 print_time(j);
0527 }
0528 atime[3][12] = atime[3][12] / 5.0;
0529 fprintf (stdout,"Average %11.2f\n",
0530 (double)atime[3][12]);
0531
0532
0533
0534
0535
0536 mflops = atime[3][6];
0537 if (atime[3][12] < mflops) mflops = atime[3][12];
0538
0539 fprintf(stdout,"\n");
0540 fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0541 fprintf(stdout," Precision %11.2f Mflops \n\n",mflops);
0542 #ifdef __rtems__
0543 return 0;
0544 #endif
0545 }
0546
0547
0548 void print_time (int row)
0549
0550 {
0551 fprintf(stdout,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row],
0552 (double)atime[1][row], (double)atime[2][row], (double)atime[3][row],
0553 (double)atime[4][row], (double)atime[5][row]);
0554 return;
0555 }
0556
0557
0558
0559 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
0560
0561
0562
0563
0564
0565 {
0566 int init, i, j;
0567
0568 init = 1325;
0569 *norma = 0.0;
0570 for (j = 0; j < n; j++) {
0571 for (i = 0; i < n; i++) {
0572 init = 3125*init % 65536;
0573 a[lda*j+i] = (init - 32768.0)/16384.0;
0574 *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
0575
0576
0577
0578
0579 }
0580 }
0581 for (i = 0; i < n; i++) {
0582 b[i] = 0.0;
0583 }
0584 for (j = 0; j < n; j++) {
0585 for (i = 0; i < n; i++) {
0586 b[i] = b[i] + a[lda*j+i];
0587 }
0588 }
0589 return;
0590 }
0591
0592
0593 void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
0594
0595
0596
0597
0598
0599
0600
0601
0602
0603
0604
0605
0606
0607
0608
0609
0610
0611
0612
0613
0614
0615
0616
0617
0618
0619
0620
0621
0622
0623
0624
0625
0626
0627
0628
0629
0630
0631
0632
0633
0634
0635
0636
0637
0638
0639
0640
0641
0642
0643 {
0644
0645
0646 REAL t;
0647 int j,k,kp1,l,nm1;
0648
0649
0650
0651
0652 *info = 0;
0653 nm1 = n - 1;
0654 if (nm1 >= 0) {
0655 for (k = 0; k < nm1; k++) {
0656 kp1 = k + 1;
0657
0658
0659
0660 l = idamax(n-k,&a[lda*k+k],1) + k;
0661 ipvt[k] = l;
0662
0663
0664
0665
0666 if (a[lda*k+l] != ZERO) {
0667
0668
0669
0670 if (l != k) {
0671 t = a[lda*k+l];
0672 a[lda*k+l] = a[lda*k+k];
0673 a[lda*k+k] = t;
0674 }
0675
0676
0677
0678 t = -ONE/a[lda*k+k];
0679 dscal(n-(k+1),t,&a[lda*k+k+1],1);
0680
0681
0682
0683 for (j = kp1; j < n; j++) {
0684 t = a[lda*j+l];
0685 if (l != k) {
0686 a[lda*j+l] = a[lda*j+k];
0687 a[lda*j+k] = t;
0688 }
0689 daxpy(n-(k+1),t,&a[lda*k+k+1],1,
0690 &a[lda*j+k+1],1);
0691 }
0692 }
0693 else {
0694 *info = k;
0695 }
0696 }
0697 }
0698 ipvt[n-1] = n-1;
0699 if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
0700 return;
0701 }
0702
0703
0704
0705 void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
0706
0707
0708
0709
0710
0711
0712
0713
0714
0715
0716
0717
0718
0719
0720
0721
0722
0723
0724
0725
0726
0727
0728
0729
0730
0731
0732
0733
0734
0735
0736
0737
0738
0739
0740
0741
0742
0743
0744
0745
0746
0747
0748
0749
0750
0751
0752
0753
0754
0755
0756
0757
0758
0759
0760
0761
0762
0763
0764
0765
0766 {
0767
0768
0769 REAL t;
0770 int k,kb,l,nm1;
0771
0772 nm1 = n - 1;
0773 if (job == 0) {
0774
0775
0776
0777
0778 if (nm1 >= 1) {
0779 for (k = 0; k < nm1; k++) {
0780 l = ipvt[k];
0781 t = b[l];
0782 if (l != k){
0783 b[l] = b[k];
0784 b[k] = t;
0785 }
0786 daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
0787 }
0788 }
0789
0790
0791
0792 for (kb = 0; kb < n; kb++) {
0793 k = n - (kb + 1);
0794 b[k] = b[k]/a[lda*k+k];
0795 t = -b[k];
0796 daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
0797 }
0798 }
0799 else {
0800
0801
0802
0803
0804 for (k = 0; k < n; k++) {
0805 t = ddot(k,&a[lda*k+0],1,&b[0],1);
0806 b[k] = (b[k] - t)/a[lda*k+k];
0807 }
0808
0809
0810
0811 if (nm1 >= 1) {
0812 for (kb = 1; kb < nm1; kb++) {
0813 k = n - (kb+1);
0814 b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
0815 l = ipvt[k];
0816 if (l != k) {
0817 t = b[l];
0818 b[l] = b[k];
0819 b[k] = t;
0820 }
0821 }
0822 }
0823 }
0824 return;
0825 }
0826
0827
0828
0829 void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
0830
0831
0832
0833
0834
0835 {
0836 int i,ix,iy;
0837
0838 if(n <= 0) return;
0839 if (da == ZERO) return;
0840
0841 if(incx != 1 || incy != 1) {
0842
0843
0844
0845
0846 ix = 0;
0847 iy = 0;
0848 if(incx < 0) ix = (-n+1)*incx;
0849 if(incy < 0)iy = (-n+1)*incy;
0850 for (i = 0;i < n; i++) {
0851 dy[iy] = dy[iy] + da*dx[ix];
0852 ix = ix + incx;
0853 iy = iy + incy;
0854
0855 }
0856 return;
0857 }
0858
0859
0860
0861
0862 #ifdef ROLL
0863
0864 for (i = 0;i < n; i++) {
0865 dy[i] = dy[i] + da*dx[i];
0866 }
0867
0868
0869 #endif
0870
0871 #ifdef UNROLL
0872
0873 m = n % 4;
0874 if ( m != 0) {
0875 for (i = 0; i < m; i++)
0876 dy[i] = dy[i] + da*dx[i];
0877
0878 if (n < 4) return;
0879 }
0880 for (i = m; i < n; i = i + 4) {
0881 dy[i] = dy[i] + da*dx[i];
0882 dy[i+1] = dy[i+1] + da*dx[i+1];
0883 dy[i+2] = dy[i+2] + da*dx[i+2];
0884 dy[i+3] = dy[i+3] + da*dx[i+3];
0885
0886 }
0887
0888 #endif
0889 return;
0890 }
0891
0892
0893
0894 REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
0895
0896
0897
0898
0899
0900 {
0901 REAL dtemp;
0902 int i,ix,iy;
0903
0904 dtemp = ZERO;
0905
0906 if(n <= 0) return(ZERO);
0907
0908 if(incx != 1 || incy != 1) {
0909
0910
0911
0912
0913 ix = 0;
0914 iy = 0;
0915 if (incx < 0) ix = (-n+1)*incx;
0916 if (incy < 0) iy = (-n+1)*incy;
0917 for (i = 0;i < n; i++) {
0918 dtemp = dtemp + dx[ix]*dy[iy];
0919 ix = ix + incx;
0920 iy = iy + incy;
0921
0922 }
0923 return(dtemp);
0924 }
0925
0926
0927
0928
0929 #ifdef ROLL
0930
0931 for (i=0;i < n; i++)
0932 dtemp = dtemp + dx[i]*dy[i];
0933
0934 return(dtemp);
0935
0936 #endif
0937
0938 #ifdef UNROLL
0939
0940
0941 m = n % 5;
0942 if (m != 0) {
0943 for (i = 0; i < m; i++)
0944 dtemp = dtemp + dx[i]*dy[i];
0945 if (n < 5) return(dtemp);
0946 }
0947 for (i = m; i < n; i = i + 5) {
0948 dtemp = dtemp + dx[i]*dy[i] +
0949 dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
0950 dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
0951 }
0952 return(dtemp);
0953
0954 #endif
0955
0956 }
0957
0958
0959 void dscal(int n, REAL da, REAL dx[], int incx)
0960
0961
0962
0963
0964
0965 {
0966 int i,nincx;
0967
0968 if(n <= 0)return;
0969 if(incx != 1) {
0970
0971
0972
0973 nincx = n*incx;
0974 for (i = 0; i < nincx; i = i + incx)
0975 dx[i] = da*dx[i];
0976
0977 return;
0978 }
0979
0980
0981
0982
0983 #ifdef ROLL
0984
0985 for (i = 0; i < n; i++)
0986 dx[i] = da*dx[i];
0987
0988
0989 #endif
0990
0991 #ifdef UNROLL
0992
0993
0994 m = n % 5;
0995 if (m != 0) {
0996 for (i = 0; i < m; i++)
0997 dx[i] = da*dx[i];
0998 if (n < 5) return;
0999 }
1000 for (i = m; i < n; i = i + 5){
1001 dx[i] = da*dx[i];
1002 dx[i+1] = da*dx[i+1];
1003 dx[i+2] = da*dx[i+2];
1004 dx[i+3] = da*dx[i+3];
1005 dx[i+4] = da*dx[i+4];
1006 }
1007
1008 #endif
1009
1010 }
1011
1012
1013 int idamax(int n, REAL dx[], int incx)
1014
1015
1016
1017
1018
1019
1020
1021 {
1022 REAL dmax;
1023 int i, ix, itemp;
1024
1025 if( n < 1 ) return(-1);
1026 if(n ==1 ) return(0);
1027 itemp = -1;
1028 if(incx != 1) {
1029
1030
1031
1032 ix = 1;
1033 dmax = fabs((double)dx[0]);
1034 ix = ix + incx;
1035 for (i = 1; i < n; i++) {
1036 if(fabs((double)dx[ix]) > dmax) {
1037 itemp = i;
1038 dmax = fabs((double)dx[ix]);
1039 }
1040 ix = ix + incx;
1041 }
1042 }
1043 else {
1044
1045
1046
1047 itemp = 0;
1048 dmax = fabs((double)dx[0]);
1049 for (i = 1; i < n; i++) {
1050 if(fabs((double)dx[i]) > dmax) {
1051 itemp = i;
1052 dmax = fabs((double)dx[i]);
1053 }
1054 }
1055 }
1056 return (itemp);
1057 }
1058
1059
1060 REAL epslon (REAL x)
1061
1062
1063
1064
1065
1066 {
1067 REAL a,b,c,eps;
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096 a = 4.0e0/3.0e0;
1097 eps = ZERO;
1098 while (eps == ZERO) {
1099 b = a - ONE;
1100 c = b + b + b;
1101 eps = fabs((double)(c-ONE));
1102 }
1103 return(eps*fabs((double)x));
1104 }
1105
1106
1107 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136 {
1137 int j,i,jmin;
1138
1139
1140 j = n2 % 2;
1141 if (j >= 1) {
1142 j = j - 1;
1143 for (i = 0; i < n1; i++)
1144 y[i] = (y[i]) + x[j]*m[ldm*j+i];
1145 }
1146
1147
1148
1149 j = n2 % 4;
1150 if (j >= 2) {
1151 j = j - 1;
1152 for (i = 0; i < n1; i++)
1153 y[i] = ( (y[i])
1154 + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1155 }
1156
1157
1158
1159 j = n2 % 8;
1160 if (j >= 4) {
1161 j = j - 1;
1162 for (i = 0; i < n1; i++)
1163 y[i] = ((( (y[i])
1164 + x[j-3]*m[ldm*(j-3)+i])
1165 + x[j-2]*m[ldm*(j-2)+i])
1166 + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1167 }
1168
1169
1170
1171 j = n2 % 16;
1172 if (j >= 8) {
1173 j = j - 1;
1174 for (i = 0; i < n1; i++)
1175 y[i] = ((((((( (y[i])
1176 + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
1177 + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
1178 + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
1179 + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i];
1180 }
1181
1182
1183
1184 jmin = (n2%16)+16;
1185 for (j = jmin-1; j < n2; j = j + 16) {
1186 for (i = 0; i < n1; i++)
1187 y[i] = ((((((((((((((( (y[i])
1188 + x[j-15]*m[ldm*(j-15)+i])
1189 + x[j-14]*m[ldm*(j-14)+i])
1190 + x[j-13]*m[ldm*(j-13)+i])
1191 + x[j-12]*m[ldm*(j-12)+i])
1192 + x[j-11]*m[ldm*(j-11)+i])
1193 + x[j-10]*m[ldm*(j-10)+i])
1194 + x[j- 9]*m[ldm*(j- 9)+i])
1195 + x[j- 8]*m[ldm*(j- 8)+i])
1196 + x[j- 7]*m[ldm*(j- 7)+i])
1197 + x[j- 6]*m[ldm*(j- 6)+i])
1198 + x[j- 5]*m[ldm*(j- 5)+i])
1199 + x[j- 4]*m[ldm*(j- 4)+i])
1200 + x[j- 3]*m[ldm*(j- 3)+i])
1201 + x[j- 2]*m[ldm*(j- 2)+i])
1202 + x[j- 1]*m[ldm*(j- 1)+i])
1203 + x[j] *m[ldm*j+i];
1204 }
1205 return;
1206 }
1207
1208