benchmarks/linpack/linpack-pc.c

0001 /*
0002  *          Linpack 100x100 Benchmark In C/C++ For PCs
0003  *
0004  ********************************************************************
0005  *
0006  *                 Original Source from NETLIB
0007  *
0008  *  Translated to C by Bonnie Toy 5/88 (modified on 2/25/94  to fix
0009  *  a problem with daxpy for unequal increments or equal increments
0010  *  not equal to 1. Jack Dongarra)
0011  *
0012  *  To obtain rolled source BLAS, add -DROLL to the command lines.
0013  *  To obtain unrolled source BLAS, add -DUNROLL to the command lines.
0014  *
0015  *  You must specify one of -DSP or -DDP to compile correctly.
0016  *
0017  *  You must specify one of -DROLL or -DUNROLL to compile correctly.
0018  *
0019  ********************************************************************
0020  *
0021  *                   Changes in this version
0022  *
0023  *  1. Function prototypes are declared and function headers have
0024  *     embedded parameter types to produce code for C and C++
0025  *
0026  *  2. Arrays aa and a are declared as [200*200] and [200*201] to
0027  *     allow compilation with prototypes.
0028  *
0029  *  3. Function second changed (compiler dependent).
0030  *
0031  *  4. Timing method changed due to inaccuracy of PC clock (see below).
0032  *
0033  *  5. Additional date function included (compiler dependent).
0034  *
0035  *  6. Additional code used as a standard for a series of benchmarks:-
0036  *       Automatic run time calibration rather than fixed parameters
0037  *       Initial calibration with display to show linearity
0038  *       Results displayed at reasonable rate for viewing (5 seconds)
0039  *       Facilities for typing in details of system used etc.
0040  *       Compiler details in code in case .exe files used elsewhere
0041  *       Results appended to a text file (Linpack.txt)
0042  *
0043  *  Roy Longbottom  101323.2241@compuserve.com    14 September 1996
0044  *
0045  ************************************************************************
0046  *
0047  *                             Timing
0048  *
0049  *  The PC timer is updated at about 18 times per second or resolution of
0050  *  0.05 to 0.06 seconds which is similar to the time taken by the main
0051  *  time consuming function dgefa on a 100 MHz Pentium. Thus there is no
0052  *  point in running the dgefa/dges1 combination three times as in the
0053  *  original version. Main timing for the latter, in the loop run NTIMES,
0054  *  executes matgen/dgefa, summing the time taken by matgen within the
0055  *  loop for later deduction from the total time. On a modern PC this sum
0056  *  can be based on a random selection of 0 or 0.05/0.06. This version
0057  *  executes the single pass once and the main timing loop five times,
0058  *  calculating the matgen overhead separately.
0059  *
0060  *************************************************************************
0061  *
0062  *                    Example of Output
0063  *
0064  * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++'
0065  *
0066  * Compiler     Watcom C/C++ 10.5 Win 386
0067  * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL
0068  *
0069  *
0070  * norm resid      resid           machep         x[0]-1          x[n-1]-1
0071  *  0.4   7.41628980e-014  1.00000000e-015 -1.49880108e-014 -1.89848137e-014
0072  *
0073  *
0074  * Times are reported for matrices of order          100
0075  * 1 pass times for array with leading dimension of  201
0076  *
0077  *     dgefa      dgesl      total     Mflops       unit      ratio
0078  *   0.06000    0.00000    0.06000      11.44     0.1748     1.0714
0079  *
0080  *
0081  * Calculating matgen overhead
0082  *
0083  *       10 times   0.11 seconds
0084  *       20 times   0.22 seconds
0085  *       40 times   0.44 seconds
0086  *       80 times   0.87 seconds
0087  *      160 times   1.76 seconds
0088  *      320 times   3.52 seconds
0089  *      640 times   7.03 seconds
0090  *
0091  * Overhead for 1 matgen      0.01098 seconds
0092  *
0093  *
0094  * Calculating matgen/dgefa passes for 5 seconds
0095  *
0096  *       10 times   0.71 seconds
0097  *       20 times   1.38 seconds
0098  *       40 times   2.80 seconds
0099  *       80 times   5.66 seconds
0100  *
0101  *      Passes used         70
0102  *
0103  *  This is followed by output of the normal data for dgefa, dges1,
0104  *  total, Mflops, unit and ratio with five sets of results for each.
0105  *
0106  ************************************************************************
0107  *
0108  *                Example from output file Linpack.txt
0109  *
0110  * LINPACK BENCHMARK FOR PCs 'C/C++'    n @ 100
0111  *
0112  * Month run         9/1996
0113  * PC model          Escom
0114  * CPU               Pentium
0115  * Clock MHz         100
0116  * Cache             256K
0117  * Options           Neptune chipset
0118  * OS/DOS            Windows 95
0119  * Compiler          Watcom C/C++ 10.5 Win 386
0120  * OptLevel          -zp4 -otexan -fp5 -5r -dDP -dROLL
0121  * Run by            Roy Longbottom
0122  * From              UK
0123  * Mail              101323.2241@compuserve.com
0124  *
0125  * Rolling            Rolled
0126  * Precision          Double
0127  * norm. resid                     0.4
0128  * resid               7.41628980e-014
0129  * machep              1.00000000e-015             (8.88178420e-016 NON OPT)
0130  * x[0]-1             -1.49880108e-014
0131  * x[n-1]-1           -1.89848137e-014
0132  * matgen 1 seconds            0.01051
0133  * matgen 2 seconds            0.01050
0134  * Repetitions                      70
0135  * Leading dimension               201
0136  *                               dgefa     dgesl     total    Mflops
0137  * 1 pass seconds              0.06000   0.00000   0.06000
0138  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0139  * Repeat seconds              0.06077   0.00157   0.06234     11.01
0140  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0141  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0142  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0143  * Average                                                     10.99
0144  * Leading dimension               200
0145  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0146  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0147  * Repeat seconds              0.05864   0.00157   0.06021     11.40
0148  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0149  * Repeat seconds              0.05864   0.00157   0.06021     11.40
0150  * Average                                                     11.32
0151  *
0152  ************************************************************************
0153  *
0154  *                     Examples of Results
0155  *
0156  *  Precompiled codes were produced via a Watcom C/C++ 10.5 compiler.
0157  *  Versions are available for DOS, Windows 3/95 and NT/Win 95. Both
0158  *  non-optimised and optimised programs are available. The latter has
0159  *  options as in the above example. Although these options can place
0160  *  functions in-line, in this case, daxpy is not in-lined. Optimisation
0161  *  reduces 18 instructions in the loop in this function to the following:
0162  *
0163  *               L85         fld     st(0)
0164  *                           fmul    qword ptr [edx]
0165  *                           add     eax,00000008H
0166  *                           add     edx,00000008H
0167  *                           fadd    qword ptr -8H[eax]
0168  *                           inc     ebx
0169  *                           fstp    qword ptr -8H[eax]
0170  *                           cmp     ebx,esi
0171  *                           jl      L85
0172  *
0173  *  Results produced are not consistent between runs but produce similar
0174  *  speeds when executing at a particular dimension (see above). An example
0175  *  of other results is 11.4/10.5 Mflops. Most typical double precision
0176  *  rolled results are:
0177  *
0178  *                               Opt   No Opt                        Version/
0179  *               MHz    Cache  Mflops  Mflops  Make/Options            Via
0180  *
0181  *   AM80386DX    40     128K    0.53    0.36  Clone                  Win/W95
0182  *   80486DX2     66     128K    2.5     1.9   Escom SIS chipset      Win/W95
0183  *   80486DX2     66     128K    2.3     1.9   Escom SIS chipset       NT/W95
0184  *   80486DX2     66     128K    2.8     2.0   Escom SIS chipset      Dos/Dos
0185  *   Pentium     100     256K    11      4.2   Escom Neptune chipset  Win/W95
0186  *   Pentium     100     256K    11      5.5   Escom Neptune chipset   NT/W95
0187  *   Pentium     100     256K    12      4.4   Escom Neptune chipset  Dos/Dos
0188  *   Pentium Pro 200     256K    48     19     Dell XPS Pro200n        NT/NT
0189  *
0190  *  The results are as produced when compiled as Linpack.cpp. Compiling as
0191  *  Linpack.c gives similar speeds but the code is a little different.
0192  *
0193  ***************************************************************************
0194 */
0195
0196 #define DP
0197 #define ROLL
0198
0199 #ifdef SP
0200 #define REAL float
0201 #define ZERO 0.0
0202 #define ONE 1.0
0203 #define PREC "Single "
0204 #endif
0205
0206 #ifdef DP
0207 #define REAL double
0208 #define ZERO 0.0e0
0209 #define ONE 1.0e0
0210 #define PREC "Double "
0211 #endif
0212
0213 #ifdef ROLL
0214 #define ROLLING "Rolled "
0215 #endif
0216 #ifdef UNROLL
0217 #define ROLLING "Unrolled "
0218 #endif
0219
0220
0221 #define NTIMES atoi(argv[1])
0222
0223 #include <stdio.h>
0224 #include <math.h>
0225 #include <stdlib.h>
0226 #ifdef __rtems__
0227 #include <rtems/test-printer.h>
0228 #undef print_time
0229 #define fprintf(f, ...) rtems_printf(&rtems_test_printer, __VA_ARGS__)
0230 #endif /* __rtems__ */
0231
0232
0233 static REAL atime[9][15];
0234
0235 void print_time (int row);
0236 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
0237 void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
0238 void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
0239 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
0240 void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
0241 REAL epslon (REAL x);
0242 int idamax (int n, REAL dx[], int incx);
0243 void dscal (int n, REAL da, REAL dx[], int incx);
0244 REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
0245
0246 /* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */
0247    #include <sys/time.h>  /* for following time functions only */
0248    static REAL second(void)
0249      {
0250         struct timeval tv;
0251
0252         gettimeofday(&tv, NULL);
0253         return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
0254      }
0255
0256 int main (int argc, char **argv)
0257 {
0258         static REAL aa[200*200],a[200*201],b[200],x[200];
0259         REAL cray,ops,total,norma,normx;
0260         REAL resid,residn,eps,t1,tm2,epsn,x1,x2;
0261         REAL mflops;
0262         static int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
0263         int pass, loop;
0264         REAL overhead1, overhead2, time1, time2;
0265         char *compiler, *options;
0266
0267 /************************************************************************
0268  *           Enter details of compiler and options used                 *
0269  ************************************************************************/
0270                   /*----------------- --------- --------- ---------*/
0271         compiler = "INSERT COMPILER NAME HERE";
0272         options  = "INSERT OPTIMISATION OPTIONS HERE";
0273                   /* Include -dDP or -dSP and -dROLL or -dUNROLL */
0274
0275         lda = 201;
0276         ldaa = 200;
0277         cray = .056;
0278         n = 100;
0279
0280         fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0281         fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n");
0282         fprintf(stdout,"Compiler     %s\n",compiler);
0283         fprintf(stdout,"Optimisation %s\n\n",options);
0284
0285         ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
0286
0287         matgen(a,lda,n,b,&norma);
0288         t1 = second();
0289         dgefa(a,lda,n,ipvt,&info);
0290         atime[0][0] = second() - t1;
0291         t1 = second();
0292         dgesl(a,lda,n,ipvt,b,0);
0293         atime[1][0] = second() - t1;
0294         total = atime[0][0] + atime[1][0];
0295
0296 /*     compute a residual to verify results.  */
0297
0298         for (i = 0; i < n; i++) {
0299                 x[i] = b[i];
0300         }
0301         matgen(a,lda,n,b,&norma);
0302         for (i = 0; i < n; i++) {
0303                 b[i] = -b[i];
0304         }
0305         dmxpy(n,b,n,lda,x,a);
0306         resid = 0.0;
0307         normx = 0.0;
0308         for (i = 0; i < n; i++) {
0309                 resid = (resid > fabs((double)b[i]))
0310                         ? resid : fabs((double)b[i]);
0311                 normx = (normx > fabs((double)x[i]))
0312                         ? normx : fabs((double)x[i]);
0313         }
0314         eps = epslon(ONE);
0315         residn = resid/( n*norma*normx*eps );
0316         epsn = eps;
0317         x1 = x[0] - 1;
0318         x2 = x[n-1] - 1;
0319
0320         printf("norm resid      resid           machep");
0321         printf("         x[0]-1          x[n-1]-1\n");
0322         printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
0323                (double)residn, (double)resid, (double)epsn,
0324                (double)x1, (double)x2);
0325
0326         fprintf(stdout,"Times are reported for matrices of order        %5d\n",n);
0327         fprintf(stdout,"1 pass times for array with leading dimension of%5d\n\n",lda);
0328         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0329         fprintf(stdout,"      ratio\n");
0330
0331         atime[2][0] = total;
0332         if (total > 0.0)
0333         {
0334             atime[3][0] = ops/(1.0e6*total);
0335             atime[4][0] = 2.0/atime[3][0];
0336         }
0337         else
0338         {
0339             atime[3][0] = 0.0;
0340             atime[4][0] = 0.0;
0341         }
0342         atime[5][0] = total/cray;
0343
0344         print_time(0);
0345
0346 /************************************************************************
0347  *       Calculate overhead of executing matgen procedure              *
0348  ************************************************************************/
0349
0350         fprintf (stdout,"\nCalculating matgen overhead\n");
0351         pass = -20;
0352         loop = NTIMES;
0353         do
0354         {
0355             time1 = second();
0356             pass = pass + 1;
0357             for ( i = 0 ; i < loop ; i++)
0358             {
0359                  matgen(a,lda,n,b,&norma);
0360             }
0361             time2 = second();
0362             overhead1 = (time2 - time1);
0363             fprintf (stdout,"%10d times %6.2f seconds\n", loop, overhead1);
0364             if (overhead1 > 5.0)
0365             {
0366                 pass = 0;
0367             }
0368             if (pass < 0)
0369             {
0370                 if (overhead1 < 0.1)
0371                 {
0372                     loop = loop * 10;
0373                 }
0374                 else
0375                 {
0376                     loop = loop * 2;
0377                 }
0378             }
0379         }
0380         while (pass < 0);
0381
0382         overhead1 = overhead1 / (double)loop;
0383
0384         fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
0385
0386 /************************************************************************
0387  *           Calculate matgen/dgefa passes for 5 seconds                *
0388  ************************************************************************/
0389
0390         fprintf (stdout,"Calculating matgen/dgefa passes for 5 seconds\n");
0391         pass = -20;
0392         ntimes = NTIMES;
0393         do
0394         {
0395             time1 = second();
0396             pass = pass + 1;
0397             for ( i = 0 ; i < ntimes ; i++)
0398             {
0399                 matgen(a,lda,n,b,&norma);
0400                 dgefa(a,lda,n,ipvt,&info );
0401             }
0402             time2 = second() - time1;
0403             fprintf (stdout,"%10d times %6.2f seconds\n", ntimes, time2);
0404             if (time2 > 5.0)
0405             {
0406                 pass = 0;
0407             }
0408             if (pass < 0)
0409             {
0410                 if (time2 < 0.1)
0411                 {
0412                     ntimes = ntimes * 10;
0413                 }
0414                 else
0415                 {
0416                     ntimes = ntimes * 2;
0417                 }
0418             }
0419         }
0420         while (pass < 0);
0421
0422         ntimes =  5.0 * (double)ntimes / time2;
0423         if (ntimes == 0) ntimes = 1;
0424
0425         fprintf (stdout,"Passes used %10d \n\n", ntimes);
0426         fprintf(stdout,"Times for array with leading dimension of%4d\n\n",lda);
0427         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0428         fprintf(stdout,"      ratio\n");
0429
0430 /************************************************************************
0431  *                              Execute 5 passes                        *
0432  ************************************************************************/
0433
0434         tm2 = ntimes * overhead1;
0435         atime[3][6] = 0;
0436
0437         for (j=1 ; j<6 ; j++)
0438         {
0439
0440             t1 = second();
0441
0442             for (i = 0; i < ntimes; i++)
0443             {
0444                 matgen(a,lda,n,b,&norma);
0445                 dgefa(a,lda,n,ipvt,&info );
0446             }
0447
0448             atime[0][j] = (second() - t1 - tm2)/ntimes;
0449
0450             t1 = second();
0451
0452             for (i = 0; i < ntimes; i++)
0453             {
0454                 dgesl(a,lda,n,ipvt,b,0);
0455             }
0456
0457             atime[1][j] = (second() - t1)/ntimes;
0458             total       = atime[0][j] + atime[1][j];
0459             atime[2][j] = total;
0460             atime[3][j] = ops/(1.0e6*total);
0461             atime[4][j] = 2.0/atime[3][j];
0462             atime[5][j] = total/cray;
0463             atime[3][6] = atime[3][6] + atime[3][j];
0464
0465             print_time(j);
0466         }
0467         atime[3][6] = atime[3][6] / 5.0;
0468         fprintf (stdout,"Average                          %11.2f\n",
0469                                                (double)atime[3][6]);
0470
0471         fprintf (stdout,"\nCalculating matgen2 overhead\n");
0472
0473 /************************************************************************
0474  *             Calculate overhead of executing matgen procedure         *
0475  ************************************************************************/
0476
0477         time1 = second();
0478         for ( i = 0 ; i < loop ; i++)
0479         {
0480             matgen(aa,ldaa,n,b,&norma);
0481         }
0482         time2 = second();
0483         overhead2 = (time2 - time1);
0484         overhead2 = overhead2 / (double)loop;
0485
0486         fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
0487         fprintf(stdout,"Times for array with leading dimension of%4d\n\n",ldaa);
0488         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0489         fprintf(stdout,"      ratio\n");
0490
0491 /************************************************************************
0492  *                              Execute 5 passes                        *
0493  ************************************************************************/
0494
0495         tm2 = ntimes * overhead2;
0496         atime[3][12] = 0;
0497
0498         for (j=7 ; j<12 ; j++)
0499         {
0500
0501             t1 = second();
0502
0503             for (i = 0; i < ntimes; i++)
0504             {
0505                 matgen(aa,ldaa,n,b,&norma);
0506                 dgefa(aa,ldaa,n,ipvt,&info  );
0507             }
0508
0509             atime[0][j] = (second() - t1 - tm2)/ntimes;
0510
0511             t1 = second();
0512
0513             for (i = 0; i < ntimes; i++)
0514             {
0515                 dgesl(aa,ldaa,n,ipvt,b,0);
0516             }
0517
0518             atime[1][j] = (second() - t1)/ntimes;
0519             total       = atime[0][j] + atime[1][j];
0520             atime[2][j] = total;
0521             atime[3][j] = ops/(1.0e6*total);
0522             atime[4][j] = 2.0/atime[3][j];
0523             atime[5][j] = total/cray;
0524             atime[3][12] = atime[3][12] + atime[3][j];
0525
0526             print_time(j);
0527         }
0528         atime[3][12] = atime[3][12] / 5.0;
0529         fprintf (stdout,"Average                          %11.2f\n",
0530                                               (double)atime[3][12]);
0531
0532 /************************************************************************
0533  *           Use minimum average as overall Mflops rating               *
0534  ************************************************************************/
0535
0536         mflops = atime[3][6];
0537         if (atime[3][12] < mflops) mflops = atime[3][12];
0538
0539         fprintf(stdout,"\n");
0540         fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0541         fprintf(stdout," Precision %11.2f Mflops \n\n",mflops);
0542 #ifdef __rtems__
0543     return 0;
0544 #endif
0545 }
0546
0547 /*----------------------*/
0548 void print_time (int row)
0549
0550 {
0551 fprintf(stdout,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n",   (double)atime[0][row],
0552        (double)atime[1][row], (double)atime[2][row], (double)atime[3][row],
0553        (double)atime[4][row], (double)atime[5][row]);
0554        return;
0555 }
0556
0557 /*----------------------*/
0558
0559 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
0560
0561
0562 /* We would like to declare a[][lda], but c does not allow it.  In this
0563 function, references to a[i][j] are written a[lda*i+j].  */
0564
0565 {
0566         int init, i, j;
0567
0568         init = 1325;
0569         *norma = 0.0;
0570         for (j = 0; j < n; j++) {
0571                 for (i = 0; i < n; i++) {
0572                         init = 3125*init % 65536;
0573                         a[lda*j+i] = (init - 32768.0)/16384.0;
0574                         *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
0575
0576                         /* alternative for some compilers
0577                         if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]);
0578                         */
0579                 }
0580         }
0581         for (i = 0; i < n; i++) {
0582           b[i] = 0.0;
0583         }
0584         for (j = 0; j < n; j++) {
0585                 for (i = 0; i < n; i++) {
0586                         b[i] = b[i] + a[lda*j+i];
0587                 }
0588         }
0589         return;
0590 }
0591
0592 /*----------------------*/
0593 void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
0594
0595
0596 /* We would like to declare a[][lda], but c does not allow it.  In this
0597 function, references to a[i][j] are written a[lda*i+j].  */
0598 /*
0599      dgefa factors a double precision matrix by gaussian elimination.
0600
0601      dgefa is usually called by dgeco, but it can be called
0602      directly with a saving in time if  rcond  is not needed.
0603      (time for dgeco) = (1 + 9/n)*(time for dgefa) .
0604
0605      on entry
0606
0607         a       REAL precision[n][lda]
0608                 the matrix to be factored.
0609
0610         lda     integer
0611                 the leading dimension of the array  a .
0612
0613         n       integer
0614                 the order of the matrix  a .
0615
0616      on return
0617
0618         a       an upper triangular matrix and the multipliers
0619                 which were used to obtain it.
0620                 the factorization can be written  a = l*u  where
0621                 l  is a product of permutation and unit lower
0622                 triangular matrices and  u  is upper triangular.
0623
0624         ipvt    integer[n]
0625                 an integer vector of pivot indices.
0626
0627         info    integer
0628                 = 0  normal value.
0629                 = k  if  u[k][k] .eq. 0.0 .  this is not an error
0630                      condition for this subroutine, but it does
0631                      indicate that dgesl or dgedi will divide by zero
0632                      if called.  use  rcond  in dgeco for a reliable
0633                      indication of singularity.
0634
0635      linpack. this version dated 08/14/78 .
0636      cleve moler, university of new mexico, argonne national lab.
0637
0638      functions
0639
0640      blas daxpy,dscal,idamax
0641 */
0642
0643 {
0644 /*     internal variables       */
0645
0646 REAL t;
0647 int j,k,kp1,l,nm1;
0648
0649
0650 /*     gaussian elimination with partial pivoting       */
0651
0652         *info = 0;
0653         nm1 = n - 1;
0654         if (nm1 >=  0) {
0655                 for (k = 0; k < nm1; k++) {
0656                         kp1 = k + 1;
0657
0658                         /* find l = pivot index */
0659
0660                         l = idamax(n-k,&a[lda*k+k],1) + k;
0661                         ipvt[k] = l;
0662
0663                         /* zero pivot implies this column already
0664                            triangularized */
0665
0666                         if (a[lda*k+l] != ZERO) {
0667
0668                                 /* interchange if necessary */
0669
0670                                 if (l != k) {
0671                                         t = a[lda*k+l];
0672                                         a[lda*k+l] = a[lda*k+k];
0673                                         a[lda*k+k] = t;
0674                                 }
0675
0676                                 /* compute multipliers */
0677
0678                                 t = -ONE/a[lda*k+k];
0679                                 dscal(n-(k+1),t,&a[lda*k+k+1],1);
0680
0681                                 /* row elimination with column indexing */
0682
0683                                 for (j = kp1; j < n; j++) {
0684                                         t = a[lda*j+l];
0685                                         if (l != k) {
0686                                                 a[lda*j+l] = a[lda*j+k];
0687                                                 a[lda*j+k] = t;
0688                                         }
0689                                         daxpy(n-(k+1),t,&a[lda*k+k+1],1,
0690                                               &a[lda*j+k+1],1);
0691                                 }
0692                         }
0693                         else {
0694                                 *info = k;
0695                         }
0696                 }
0697         }
0698         ipvt[n-1] = n-1;
0699         if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
0700         return;
0701 }
0702
0703 /*----------------------*/
0704
0705 void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
0706
0707
0708 /* We would like to declare a[][lda], but c does not allow it.  In this
0709 function, references to a[i][j] are written a[lda*i+j].  */
0710
0711 /*
0712      dgesl solves the double precision system
0713      a * x = b  or  trans(a) * x = b
0714      using the factors computed by dgeco or dgefa.
0715
0716      on entry
0717
0718         a       double precision[n][lda]
0719                 the output from dgeco or dgefa.
0720
0721         lda     integer
0722                 the leading dimension of the array  a .
0723
0724         n       integer
0725                 the order of the matrix  a .
0726
0727         ipvt    integer[n]
0728                 the pivot vector from dgeco or dgefa.
0729
0730         b       double precision[n]
0731                 the right hand side vector.
0732
0733         job     integer
0734                 = 0         to solve  a*x = b ,
0735                 = nonzero   to solve  trans(a)*x = b  where
0736                             trans(a)  is the transpose.
0737
0738     on return
0739
0740         b       the solution vector  x .
0741
0742      error condition
0743
0744         a division by zero will occur if the input factor contains a
0745         zero on the diagonal.  technically this indicates singularity
0746         but it is often caused by improper arguments or improper
0747         setting of lda .  it will not occur if the subroutines are
0748         called correctly and if dgeco has set rcond .gt. 0.0
0749         or dgefa has set info .eq. 0 .
0750
0751      to compute  inverse(a) * c  where  c  is a matrix
0752      with  p  columns
0753            dgeco(a,lda,n,ipvt,rcond,z)
0754            if (!rcond is too small){
0755                 for (j=0,j<p,j++)
0756                         dgesl(a,lda,n,ipvt,c[j][0],0);
0757            }
0758
0759      linpack. this version dated 08/14/78 .
0760      cleve moler, university of new mexico, argonne national lab.
0761
0762      functions
0763
0764      blas daxpy,ddot
0765 */
0766 {
0767 /*     internal variables       */
0768
0769         REAL t;
0770         int k,kb,l,nm1;
0771
0772         nm1 = n - 1;
0773         if (job == 0) {
0774
0775                 /* job = 0 , solve  a * x = b
0776                    first solve  l*y = b         */
0777
0778                 if (nm1 >= 1) {
0779                         for (k = 0; k < nm1; k++) {
0780                                 l = ipvt[k];
0781                                 t = b[l];
0782                                 if (l != k){
0783                                         b[l] = b[k];
0784                                         b[k] = t;
0785                                 }
0786                                 daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
0787                         }
0788                 }
0789
0790                 /* now solve  u*x = y */
0791
0792                 for (kb = 0; kb < n; kb++) {
0793                     k = n - (kb + 1);
0794                     b[k] = b[k]/a[lda*k+k];
0795                     t = -b[k];
0796                     daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
0797                 }
0798         }
0799         else {
0800
0801                 /* job = nonzero, solve  trans(a) * x = b
0802                    first solve  trans(u)*y = b                  */
0803
0804                 for (k = 0; k < n; k++) {
0805                         t = ddot(k,&a[lda*k+0],1,&b[0],1);
0806                         b[k] = (b[k] - t)/a[lda*k+k];
0807                 }
0808
0809                 /* now solve trans(l)*x = y     */
0810
0811                 if (nm1 >= 1) {
0812                         for (kb = 1; kb < nm1; kb++) {
0813                                 k = n - (kb+1);
0814                                 b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
0815                                 l = ipvt[k];
0816                                 if (l != k) {
0817                                         t = b[l];
0818                                         b[l] = b[k];
0819                                         b[k] = t;
0820                                 }
0821                         }
0822                 }
0823         }
0824         return;
0825 }
0826
0827 /*----------------------*/
0828
0829 void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
0830 /*
0831      constant times a vector plus a vector.
0832      jack dongarra, linpack, 3/11/78.
0833 */
0834
0835 {
0836         int i,ix,iy;
0837
0838         if(n <= 0) return;
0839         if (da == ZERO) return;
0840
0841         if(incx != 1 || incy != 1) {
0842
0843                 /* code for unequal increments or equal increments
0844                    not equal to 1                                       */
0845
0846                 ix = 0;
0847                 iy = 0;
0848                 if(incx < 0) ix = (-n+1)*incx;
0849                 if(incy < 0)iy = (-n+1)*incy;
0850                 for (i = 0;i < n; i++) {
0851                         dy[iy] = dy[iy] + da*dx[ix];
0852                         ix = ix + incx;
0853                         iy = iy + incy;
0854
0855                 }
0856                 return;
0857         }
0858
0859         /* code for both increments equal to 1 */
0860
0861
0862 #ifdef ROLL
0863
0864         for (i = 0;i < n; i++) {
0865                 dy[i] = dy[i] + da*dx[i];
0866         }
0867
0868
0869 #endif
0870
0871 #ifdef UNROLL
0872
0873         m = n % 4;
0874         if ( m != 0) {
0875                 for (i = 0; i < m; i++)
0876                         dy[i] = dy[i] + da*dx[i];
0877
0878                 if (n < 4) return;
0879         }
0880         for (i = m; i < n; i = i + 4) {
0881                 dy[i] = dy[i] + da*dx[i];
0882                 dy[i+1] = dy[i+1] + da*dx[i+1];
0883                 dy[i+2] = dy[i+2] + da*dx[i+2];
0884                 dy[i+3] = dy[i+3] + da*dx[i+3];
0885
0886         }
0887
0888 #endif
0889 return;
0890 }
0891
0892 /*----------------------*/
0893
0894 REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
0895 /*
0896      forms the dot product of two vectors.
0897      jack dongarra, linpack, 3/11/78.
0898 */
0899
0900 {
0901         REAL dtemp;
0902         int i,ix,iy;
0903
0904         dtemp = ZERO;
0905
0906         if(n <= 0) return(ZERO);
0907
0908         if(incx != 1 || incy != 1) {
0909
0910                 /* code for unequal increments or equal increments
0911                    not equal to 1                                       */
0912
0913                 ix = 0;
0914                 iy = 0;
0915                 if (incx < 0) ix = (-n+1)*incx;
0916                 if (incy < 0) iy = (-n+1)*incy;
0917                 for (i = 0;i < n; i++) {
0918                         dtemp = dtemp + dx[ix]*dy[iy];
0919                         ix = ix + incx;
0920                         iy = iy + incy;
0921
0922                 }
0923                 return(dtemp);
0924         }
0925
0926         /* code for both increments equal to 1 */
0927
0928
0929 #ifdef ROLL
0930
0931         for (i=0;i < n; i++)
0932                 dtemp = dtemp + dx[i]*dy[i];
0933
0934         return(dtemp);
0935
0936 #endif
0937
0938 #ifdef UNROLL
0939
0940
0941         m = n % 5;
0942         if (m != 0) {
0943                 for (i = 0; i < m; i++)
0944                         dtemp = dtemp + dx[i]*dy[i];
0945                 if (n < 5) return(dtemp);
0946         }
0947         for (i = m; i < n; i = i + 5) {
0948                 dtemp = dtemp + dx[i]*dy[i] +
0949                 dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
0950                 dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
0951         }
0952         return(dtemp);
0953
0954 #endif
0955
0956 }
0957
0958 /*----------------------*/
0959 void dscal(int n, REAL da, REAL dx[], int incx)
0960
0961 /*     scales a vector by a constant.
0962       jack dongarra, linpack, 3/11/78.
0963 */
0964
0965 {
0966         int i,nincx;
0967
0968         if(n <= 0)return;
0969         if(incx != 1) {
0970
0971                 /* code for increment not equal to 1 */
0972
0973                 nincx = n*incx;
0974                 for (i = 0; i < nincx; i = i + incx)
0975                         dx[i] = da*dx[i];
0976
0977                 return;
0978         }
0979
0980         /* code for increment equal to 1 */
0981
0982
0983 #ifdef ROLL
0984
0985         for (i = 0; i < n; i++)
0986                 dx[i] = da*dx[i];
0987
0988
0989 #endif
0990
0991 #ifdef UNROLL
0992
0993
0994         m = n % 5;
0995         if (m != 0) {
0996                 for (i = 0; i < m; i++)
0997                         dx[i] = da*dx[i];
0998                 if (n < 5) return;
0999         }
1000         for (i = m; i < n; i = i + 5){
1001                 dx[i] = da*dx[i];
1002                 dx[i+1] = da*dx[i+1];
1003                 dx[i+2] = da*dx[i+2];
1004                 dx[i+3] = da*dx[i+3];
1005                 dx[i+4] = da*dx[i+4];
1006         }
1007
1008 #endif
1009
1010 }
1011
1012 /*----------------------*/
1013 int idamax(int n, REAL dx[], int incx)
1014
1015 /*
1016      finds the index of element having max. absolute value.
1017      jack dongarra, linpack, 3/11/78.
1018 */
1019
1020
1021 {
1022         REAL dmax;
1023         int i, ix, itemp;
1024
1025         if( n < 1 ) return(-1);
1026         if(n ==1 ) return(0);
1027         itemp = -1;
1028         if(incx != 1) {
1029
1030                 /* code for increment not equal to 1 */
1031
1032                 ix = 1;
1033                 dmax = fabs((double)dx[0]);
1034                 ix = ix + incx;
1035                 for (i = 1; i < n; i++) {
1036                         if(fabs((double)dx[ix]) > dmax)  {
1037                                 itemp = i;
1038                                 dmax = fabs((double)dx[ix]);
1039                         }
1040                         ix = ix + incx;
1041                 }
1042         }
1043         else {
1044
1045                 /* code for increment equal to 1 */
1046
1047                 itemp = 0;
1048                 dmax = fabs((double)dx[0]);
1049                 for (i = 1; i < n; i++) {
1050                         if(fabs((double)dx[i]) > dmax) {
1051                                 itemp = i;
1052                                 dmax = fabs((double)dx[i]);
1053                         }
1054                 }
1055         }
1056         return (itemp);
1057 }
1058
1059 /*----------------------*/
1060 REAL epslon (REAL x)
1061
1062 /*
1063      estimate unit roundoff in quantities of size x.
1064 */
1065
1066 {
1067         REAL a,b,c,eps;
1068 /*
1069      this program should function properly on all systems
1070      satisfying the following two assumptions,
1071         1.  the base used in representing dfloating point
1072             numbers is not a power of three.
1073         2.  the quantity  a  in statement 10 is represented to
1074             the accuracy used in dfloating point variables
1075             that are stored in memory.
1076      the statement number 10 and the go to 10 are intended to
1077      force optimizing compilers to generate code satisfying
1078      assumption 2.
1079      under these assumptions, it should be true that,
1080             a  is not exactly equal to four-thirds,
1081             b  has a zero for its last bit or digit,
1082             c  is not exactly equal to one,
1083             eps  measures the separation of 1.0 from
1084                  the next larger dfloating point number.
1085      the developers of eispack would appreciate being informed
1086      about any systems where these assumptions do not hold.
1087
1088      *****************************************************************
1089      this routine is one of the auxiliary routines used by eispack iii
1090      to avoid machine dependencies.
1091      *****************************************************************
1092
1093      this version dated 4/6/83.
1094 */
1095
1096         a = 4.0e0/3.0e0;
1097         eps = ZERO;
1098         while (eps == ZERO) {
1099                 b = a - ONE;
1100                 c = b + b + b;
1101                 eps = fabs((double)(c-ONE));
1102         }
1103         return(eps*fabs((double)x));
1104 }
1105
1106 /*----------------------*/
1107 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
1108
1109
1110 /* We would like to declare m[][ldm], but c does not allow it.  In this
1111 function, references to m[i][j] are written m[ldm*i+j].  */
1112
1113 /*
1114    purpose:
1115      multiply matrix m times vector x and add the result to vector y.
1116
1117    parameters:
1118
1119      n1 integer, number of elements in vector y, and number of rows in
1120          matrix m
1121
1122      y double [n1], vector of length n1 to which is added
1123          the product m*x
1124
1125      n2 integer, number of elements in vector x, and number of columns
1126          in matrix m
1127
1128      ldm integer, leading dimension of array m
1129
1130      x double [n2], vector of length n2
1131
1132      m double [ldm][n2], matrix of n1 rows and n2 columns
1133
1134  ----------------------------------------------------------------------
1135 */
1136 {
1137         int j,i,jmin;
1138         /* cleanup odd vector */
1139
1140         j = n2 % 2;
1141         if (j >= 1) {
1142                 j = j - 1;
1143                 for (i = 0; i < n1; i++)
1144                         y[i] = (y[i]) + x[j]*m[ldm*j+i];
1145         }
1146
1147         /* cleanup odd group of two vectors */
1148
1149         j = n2 % 4;
1150         if (j >= 2) {
1151                 j = j - 1;
1152                 for (i = 0; i < n1; i++)
1153                         y[i] = ( (y[i])
1154                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1155         }
1156
1157         /* cleanup odd group of four vectors */
1158
1159         j = n2 % 8;
1160         if (j >= 4) {
1161                 j = j - 1;
1162                 for (i = 0; i < n1; i++)
1163                         y[i] = ((( (y[i])
1164                                + x[j-3]*m[ldm*(j-3)+i])
1165                                + x[j-2]*m[ldm*(j-2)+i])
1166                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1167         }
1168
1169         /* cleanup odd group of eight vectors */
1170
1171         j = n2 % 16;
1172         if (j >= 8) {
1173                 j = j - 1;
1174                 for (i = 0; i < n1; i++)
1175                         y[i] = ((((((( (y[i])
1176                                + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
1177                                + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
1178                                + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
1179                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
1180         }
1181
1182         /* main loop - groups of sixteen vectors */
1183
1184         jmin = (n2%16)+16;
1185         for (j = jmin-1; j < n2; j = j + 16) {
1186                 for (i = 0; i < n1; i++)
1187                         y[i] = ((((((((((((((( (y[i])
1188                                 + x[j-15]*m[ldm*(j-15)+i])
1189                                 + x[j-14]*m[ldm*(j-14)+i])
1190                                 + x[j-13]*m[ldm*(j-13)+i])
1191                                 + x[j-12]*m[ldm*(j-12)+i])
1192                                 + x[j-11]*m[ldm*(j-11)+i])
1193                                 + x[j-10]*m[ldm*(j-10)+i])
1194                                 + x[j- 9]*m[ldm*(j- 9)+i])
1195                                 + x[j- 8]*m[ldm*(j- 8)+i])
1196                                 + x[j- 7]*m[ldm*(j- 7)+i])
1197                                 + x[j- 6]*m[ldm*(j- 6)+i])
1198                                 + x[j- 5]*m[ldm*(j- 5)+i])
1199                                 + x[j- 4]*m[ldm*(j- 4)+i])
1200                                 + x[j- 3]*m[ldm*(j- 3)+i])
1201                                 + x[j- 2]*m[ldm*(j- 2)+i])
1202                                 + x[j- 1]*m[ldm*(j- 1)+i])
1203                                 + x[j]   *m[ldm*j+i];
1204         }
1205         return;
1206 }
1207
1208 /*----------------------*/