Back to home page

LXR

 
 

    


File indexing completed on 2025-05-11 08:24:28

0001 /*
0002  *          Linpack 100x100 Benchmark In C/C++ For PCs
0003  *
0004  ********************************************************************
0005  *
0006  *                 Original Source from NETLIB
0007  *
0008  *  Translated to C by Bonnie Toy 5/88 (modified on 2/25/94  to fix
0009  *  a problem with daxpy for unequal increments or equal increments
0010  *  not equal to 1. Jack Dongarra)
0011  *
0012  *  To obtain rolled source BLAS, add -DROLL to the command lines.
0013  *  To obtain unrolled source BLAS, add -DUNROLL to the command lines.
0014  *
0015  *  You must specify one of -DSP or -DDP to compile correctly.
0016  *
0017  *  You must specify one of -DROLL or -DUNROLL to compile correctly.
0018  *
0019  ********************************************************************
0020  *
0021  *                   Changes in this version
0022  *
0023  *  1. Function prototypes are declared and function headers have
0024  *     embedded parameter types to produce code for C and C++
0025  *
0026  *  2. Arrays aa and a are declared as [200*200] and [200*201] to
0027  *     allow compilation with prototypes.
0028  *
0029  *  3. Function second changed (compiler dependent).
0030  *
0031  *  4. Timing method changed due to inaccuracy of PC clock (see below).
0032  *
0033  *  5. Additional date function included (compiler dependent).
0034  *
0035  *  6. Additional code used as a standard for a series of benchmarks:-
0036  *       Automatic run time calibration rather than fixed parameters
0037  *       Initial calibration with display to show linearity
0038  *       Results displayed at reasonable rate for viewing (5 seconds)
0039  *       Facilities for typing in details of system used etc.
0040  *       Compiler details in code in case .exe files used elsewhere
0041  *       Results appended to a text file (Linpack.txt)
0042  *
0043  *  Roy Longbottom  101323.2241@compuserve.com    14 September 1996
0044  * 
0045  ************************************************************************
0046  *
0047  *                             Timing
0048  *
0049  *  The PC timer is updated at about 18 times per second or resolution of
0050  *  0.05 to 0.06 seconds which is similar to the time taken by the main
0051  *  time consuming function dgefa on a 100 MHz Pentium. Thus there is no
0052  *  point in running the dgefa/dges1 combination three times as in the
0053  *  original version. Main timing for the latter, in the loop run NTIMES,
0054  *  executes matgen/dgefa, summing the time taken by matgen within the
0055  *  loop for later deduction from the total time. On a modern PC this sum
0056  *  can be based on a random selection of 0 or 0.05/0.06. This version
0057  *  executes the single pass once and the main timing loop five times,
0058  *  calculating the matgen overhead separately.
0059  *
0060  *************************************************************************
0061  *
0062  *                    Example of Output
0063  *
0064  * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++'
0065  *
0066  * Compiler     Watcom C/C++ 10.5 Win 386
0067  * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL
0068  *
0069  *
0070  * norm resid      resid           machep         x[0]-1          x[n-1]-1
0071  *  0.4   7.41628980e-014  1.00000000e-015 -1.49880108e-014 -1.89848137e-014
0072  *
0073  *
0074  * Times are reported for matrices of order          100
0075  * 1 pass times for array with leading dimension of  201
0076  *
0077  *     dgefa      dgesl      total     Mflops       unit      ratio
0078  *   0.06000    0.00000    0.06000      11.44     0.1748     1.0714
0079  *
0080  *
0081  * Calculating matgen overhead
0082  *
0083  *       10 times   0.11 seconds
0084  *       20 times   0.22 seconds
0085  *       40 times   0.44 seconds
0086  *       80 times   0.87 seconds
0087  *      160 times   1.76 seconds
0088  *      320 times   3.52 seconds
0089  *      640 times   7.03 seconds
0090  *
0091  * Overhead for 1 matgen      0.01098 seconds
0092  *
0093  *
0094  * Calculating matgen/dgefa passes for 5 seconds
0095  *
0096  *       10 times   0.71 seconds
0097  *       20 times   1.38 seconds
0098  *       40 times   2.80 seconds
0099  *       80 times   5.66 seconds      
0100  *
0101  *      Passes used         70 
0102  *
0103  *  This is followed by output of the normal data for dgefa, dges1,
0104  *  total, Mflops, unit and ratio with five sets of results for each.
0105  *
0106  ************************************************************************
0107  *
0108  *                Example from output file Linpack.txt
0109  *
0110  * LINPACK BENCHMARK FOR PCs 'C/C++'    n @ 100
0111  *
0112  * Month run         9/1996
0113  * PC model          Escom
0114  * CPU               Pentium
0115  * Clock MHz         100
0116  * Cache             256K
0117  * Options           Neptune chipset
0118  * OS/DOS            Windows 95
0119  * Compiler          Watcom C/C++ 10.5 Win 386
0120  * OptLevel          -zp4 -otexan -fp5 -5r -dDP -dROLL
0121  * Run by            Roy Longbottom
0122  * From              UK
0123  * Mail              101323.2241@compuserve.com 
0124  *
0125  * Rolling            Rolled 
0126  * Precision          Double 
0127  * norm. resid                     0.4
0128  * resid               7.41628980e-014
0129  * machep              1.00000000e-015             (8.88178420e-016 NON OPT)
0130  * x[0]-1             -1.49880108e-014
0131  * x[n-1]-1           -1.89848137e-014
0132  * matgen 1 seconds            0.01051
0133  * matgen 2 seconds            0.01050
0134  * Repetitions                      70
0135  * Leading dimension               201
0136  *                               dgefa     dgesl     total    Mflops
0137  * 1 pass seconds              0.06000   0.00000   0.06000
0138  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0139  * Repeat seconds              0.06077   0.00157   0.06234     11.01
0140  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0141  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0142  * Repeat seconds              0.06092   0.00157   0.06249     10.99
0143  * Average                                                     10.99
0144  * Leading dimension               200
0145  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0146  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0147  * Repeat seconds              0.05864   0.00157   0.06021     11.40
0148  * Repeat seconds              0.05936   0.00157   0.06093     11.27
0149  * Repeat seconds              0.05864   0.00157   0.06021     11.40
0150  * Average                                                     11.32
0151  *
0152  ************************************************************************
0153  *
0154  *                     Examples of Results
0155  *
0156  *  Precompiled codes were produced via a Watcom C/C++ 10.5 compiler. 
0157  *  Versions are available for DOS, Windows 3/95 and NT/Win 95. Both
0158  *  non-optimised and optimised programs are available. The latter has
0159  *  options as in the above example. Although these options can place
0160  *  functions in-line, in this case, daxpy is not in-lined. Optimisation
0161  *  reduces 18 instructions in the loop in this function to the following:
0162  *
0163  *               L85         fld     st(0)
0164  *                           fmul    qword ptr [edx]
0165  *                           add     eax,00000008H
0166  *                           add     edx,00000008H
0167  *                           fadd    qword ptr -8H[eax]
0168  *                           inc     ebx
0169  *                           fstp    qword ptr -8H[eax]
0170  *                           cmp     ebx,esi
0171  *                           jl      L85
0172  *
0173  *  Results produced are not consistent between runs but produce similar
0174  *  speeds when executing at a particular dimension (see above). An example
0175  *  of other results is 11.4/10.5 Mflops. Most typical double precision
0176  *  rolled results are:
0177  *
0178  *                               Opt   No Opt                        Version/
0179  *               MHz    Cache  Mflops  Mflops  Make/Options            Via
0180  *
0181  *   AM80386DX    40     128K    0.53    0.36  Clone                  Win/W95
0182  *   80486DX2     66     128K    2.5     1.9   Escom SIS chipset      Win/W95
0183  *   80486DX2     66     128K    2.3     1.9   Escom SIS chipset       NT/W95
0184  *   80486DX2     66     128K    2.8     2.0   Escom SIS chipset      Dos/Dos
0185  *   Pentium     100     256K    11      4.2   Escom Neptune chipset  Win/W95
0186  *   Pentium     100     256K    11      5.5   Escom Neptune chipset   NT/W95 
0187  *   Pentium     100     256K    12      4.4   Escom Neptune chipset  Dos/Dos
0188  *   Pentium Pro 200     256K    48     19     Dell XPS Pro200n        NT/NT
0189  *
0190  *  The results are as produced when compiled as Linpack.cpp. Compiling as
0191  *  Linpack.c gives similar speeds but the code is a little different.
0192  * 
0193  ***************************************************************************
0194 */
0195 
0196 #define DP
0197 #define ROLL
0198 
0199 #ifdef SP
0200 #define REAL float
0201 #define ZERO 0.0
0202 #define ONE 1.0
0203 #define PREC "Single "
0204 #endif
0205 
0206 #ifdef DP
0207 #define REAL double
0208 #define ZERO 0.0e0
0209 #define ONE 1.0e0
0210 #define PREC "Double "
0211 #endif
0212 
0213 #ifdef ROLL
0214 #define ROLLING "Rolled "
0215 #endif
0216 #ifdef UNROLL
0217 #define ROLLING "Unrolled "
0218 #endif
0219 
0220 
0221 #define NTIMES atoi(argv[1])
0222 
0223 #include <stdio.h>
0224 #include <math.h>
0225 #include <stdlib.h>
0226 #ifdef __rtems__
0227 #include <rtems/test-printer.h>
0228 #undef print_time
0229 #define fprintf(f, ...) rtems_printf(&rtems_test_printer, __VA_ARGS__)
0230 #endif /* __rtems__ */
0231 
0232 
0233 static REAL atime[9][15];
0234 
0235 void print_time (int row);
0236 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
0237 void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
0238 void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
0239 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
0240 void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
0241 REAL epslon (REAL x);
0242 int idamax (int n, REAL dx[], int incx);
0243 void dscal (int n, REAL da, REAL dx[], int incx);
0244 REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
0245 
0246 /* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */
0247    #include <sys/time.h>  /* for following time functions only */
0248    static REAL second(void)
0249      {        
0250         struct timeval tv;
0251 
0252         gettimeofday(&tv, NULL);
0253         return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
0254      }
0255 
0256 int main (int argc, char **argv)
0257 {
0258         static REAL aa[200*200],a[200*201],b[200],x[200];       
0259         REAL cray,ops,total,norma,normx;
0260         REAL resid,residn,eps,t1,tm2,epsn,x1,x2;
0261         REAL mflops;
0262         static int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
0263         int pass, loop;
0264         REAL overhead1, overhead2, time1, time2;
0265         char *compiler, *options;
0266 
0267 /************************************************************************
0268  *           Enter details of compiler and options used                 *
0269  ************************************************************************/
0270                   /*----------------- --------- --------- ---------*/
0271         compiler = "INSERT COMPILER NAME HERE";
0272         options  = "INSERT OPTIMISATION OPTIONS HERE";
0273                   /* Include -dDP or -dSP and -dROLL or -dUNROLL */
0274     
0275         lda = 201;
0276         ldaa = 200;
0277         cray = .056; 
0278         n = 100;
0279 
0280         fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0281         fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n");
0282         fprintf(stdout,"Compiler     %s\n",compiler);
0283         fprintf(stdout,"Optimisation %s\n\n",options);
0284 
0285         ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
0286 
0287         matgen(a,lda,n,b,&norma);
0288         t1 = second();
0289         dgefa(a,lda,n,ipvt,&info);
0290         atime[0][0] = second() - t1;
0291         t1 = second();
0292         dgesl(a,lda,n,ipvt,b,0);
0293         atime[1][0] = second() - t1;
0294         total = atime[0][0] + atime[1][0];
0295 
0296 /*     compute a residual to verify results.  */ 
0297 
0298         for (i = 0; i < n; i++) {
0299                 x[i] = b[i];
0300         }
0301         matgen(a,lda,n,b,&norma);
0302         for (i = 0; i < n; i++) {
0303                 b[i] = -b[i];
0304         }
0305         dmxpy(n,b,n,lda,x,a);
0306         resid = 0.0;
0307         normx = 0.0;
0308         for (i = 0; i < n; i++) {
0309                 resid = (resid > fabs((double)b[i])) 
0310                         ? resid : fabs((double)b[i]);
0311                 normx = (normx > fabs((double)x[i])) 
0312                         ? normx : fabs((double)x[i]);
0313         }
0314         eps = epslon(ONE);
0315         residn = resid/( n*norma*normx*eps );
0316         epsn = eps;
0317         x1 = x[0] - 1;
0318         x2 = x[n-1] - 1;
0319         
0320         printf("norm resid      resid           machep");
0321         printf("         x[0]-1          x[n-1]-1\n");
0322         printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
0323                (double)residn, (double)resid, (double)epsn, 
0324                (double)x1, (double)x2);
0325 
0326         fprintf(stdout,"Times are reported for matrices of order        %5d\n",n);
0327         fprintf(stdout,"1 pass times for array with leading dimension of%5d\n\n",lda);
0328         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0329         fprintf(stdout,"      ratio\n");
0330 
0331         atime[2][0] = total;
0332         if (total > 0.0)
0333         {
0334             atime[3][0] = ops/(1.0e6*total);
0335             atime[4][0] = 2.0/atime[3][0];
0336         }
0337         else
0338         {
0339             atime[3][0] = 0.0;
0340             atime[4][0] = 0.0;
0341         }
0342         atime[5][0] = total/cray;
0343        
0344         print_time(0);
0345 
0346 /************************************************************************
0347  *       Calculate overhead of executing matgen procedure              *
0348  ************************************************************************/
0349        
0350         fprintf (stdout,"\nCalculating matgen overhead\n");
0351         pass = -20;
0352         loop = NTIMES;
0353         do
0354         {
0355             time1 = second();
0356             pass = pass + 1;        
0357             for ( i = 0 ; i < loop ; i++)
0358             {
0359                  matgen(a,lda,n,b,&norma);
0360             }
0361             time2 = second();
0362             overhead1 = (time2 - time1);
0363             fprintf (stdout,"%10d times %6.2f seconds\n", loop, overhead1);
0364             if (overhead1 > 5.0)
0365             {
0366                 pass = 0;
0367             }
0368             if (pass < 0)
0369             {
0370                 if (overhead1 < 0.1)
0371                 {
0372                     loop = loop * 10;
0373                 }
0374                 else
0375                 {
0376                     loop = loop * 2;
0377                 }
0378             }
0379         }
0380         while (pass < 0);
0381         
0382         overhead1 = overhead1 / (double)loop;
0383 
0384         fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
0385 
0386 /************************************************************************
0387  *           Calculate matgen/dgefa passes for 5 seconds                *
0388  ************************************************************************/
0389        
0390         fprintf (stdout,"Calculating matgen/dgefa passes for 5 seconds\n");
0391         pass = -20;
0392         ntimes = NTIMES;
0393         do
0394         {
0395             time1 = second();
0396             pass = pass + 1;        
0397             for ( i = 0 ; i < ntimes ; i++)
0398             {
0399                 matgen(a,lda,n,b,&norma);
0400                 dgefa(a,lda,n,ipvt,&info );
0401             }
0402             time2 = second() - time1;
0403             fprintf (stdout,"%10d times %6.2f seconds\n", ntimes, time2);
0404             if (time2 > 5.0)
0405             {
0406                 pass = 0;
0407             }
0408             if (pass < 0)
0409             {
0410                 if (time2 < 0.1)
0411                 {
0412                     ntimes = ntimes * 10;
0413                 }
0414                 else
0415                 {
0416                     ntimes = ntimes * 2;
0417                 }
0418             }
0419         }
0420         while (pass < 0);
0421         
0422         ntimes =  5.0 * (double)ntimes / time2;
0423         if (ntimes == 0) ntimes = 1;
0424 
0425         fprintf (stdout,"Passes used %10d \n\n", ntimes);
0426         fprintf(stdout,"Times for array with leading dimension of%4d\n\n",lda);
0427         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0428         fprintf(stdout,"      ratio\n");        
0429 
0430 /************************************************************************
0431  *                              Execute 5 passes                        *
0432  ************************************************************************/
0433       
0434         tm2 = ntimes * overhead1;
0435         atime[3][6] = 0;
0436 
0437         for (j=1 ; j<6 ; j++)
0438         {
0439         
0440             t1 = second();
0441 
0442             for (i = 0; i < ntimes; i++)
0443             {
0444                 matgen(a,lda,n,b,&norma);
0445                 dgefa(a,lda,n,ipvt,&info );
0446             }
0447 
0448             atime[0][j] = (second() - t1 - tm2)/ntimes;
0449 
0450             t1 = second();      
0451         
0452             for (i = 0; i < ntimes; i++)
0453             {
0454                 dgesl(a,lda,n,ipvt,b,0);
0455             }
0456 
0457             atime[1][j] = (second() - t1)/ntimes;
0458             total       = atime[0][j] + atime[1][j];
0459             atime[2][j] = total;
0460             atime[3][j] = ops/(1.0e6*total);
0461             atime[4][j] = 2.0/atime[3][j];
0462             atime[5][j] = total/cray;
0463             atime[3][6] = atime[3][6] + atime[3][j];
0464             
0465             print_time(j);
0466         }
0467         atime[3][6] = atime[3][6] / 5.0;
0468         fprintf (stdout,"Average                          %11.2f\n",
0469                                                (double)atime[3][6]);        
0470         
0471         fprintf (stdout,"\nCalculating matgen2 overhead\n");
0472 
0473 /************************************************************************
0474  *             Calculate overhead of executing matgen procedure         *
0475  ************************************************************************/
0476 
0477         time1 = second();        
0478         for ( i = 0 ; i < loop ; i++)
0479         {
0480             matgen(aa,ldaa,n,b,&norma);    
0481         }
0482         time2 = second();
0483         overhead2 = (time2 - time1);
0484         overhead2 = overhead2 / (double)loop;
0485         
0486         fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
0487         fprintf(stdout,"Times for array with leading dimension of%4d\n\n",ldaa);
0488         fprintf(stdout,"      dgefa      dgesl      total     Mflops       unit");
0489         fprintf(stdout,"      ratio\n");
0490 
0491 /************************************************************************
0492  *                              Execute 5 passes                        *
0493  ************************************************************************/
0494               
0495         tm2 = ntimes * overhead2;
0496         atime[3][12] = 0;
0497 
0498         for (j=7 ; j<12 ; j++)
0499         {
0500         
0501             t1 = second();
0502 
0503             for (i = 0; i < ntimes; i++)
0504             {
0505                 matgen(aa,ldaa,n,b,&norma);
0506                 dgefa(aa,ldaa,n,ipvt,&info  );
0507             }
0508 
0509             atime[0][j] = (second() - t1 - tm2)/ntimes;
0510 
0511             t1 = second();      
0512         
0513             for (i = 0; i < ntimes; i++)
0514             {
0515                 dgesl(aa,ldaa,n,ipvt,b,0);
0516             }
0517 
0518             atime[1][j] = (second() - t1)/ntimes;
0519             total       = atime[0][j] + atime[1][j];
0520             atime[2][j] = total;
0521             atime[3][j] = ops/(1.0e6*total);
0522             atime[4][j] = 2.0/atime[3][j];
0523             atime[5][j] = total/cray;
0524             atime[3][12] = atime[3][12] + atime[3][j];
0525 
0526             print_time(j);
0527         }
0528         atime[3][12] = atime[3][12] / 5.0; 
0529         fprintf (stdout,"Average                          %11.2f\n",
0530                                               (double)atime[3][12]);  
0531 
0532 /************************************************************************
0533  *           Use minimum average as overall Mflops rating               *
0534  ************************************************************************/
0535       
0536         mflops = atime[3][6];
0537         if (atime[3][12] < mflops) mflops = atime[3][12];
0538        
0539         fprintf(stdout,"\n");
0540         fprintf(stdout,ROLLING);fprintf(stdout,PREC);
0541         fprintf(stdout," Precision %11.2f Mflops \n\n",mflops);
0542 #ifdef __rtems__
0543     return 0;
0544 #endif
0545 }
0546      
0547 /*----------------------*/ 
0548 void print_time (int row)
0549 
0550 {
0551 fprintf(stdout,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n",   (double)atime[0][row],
0552        (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], 
0553        (double)atime[4][row], (double)atime[5][row]);
0554        return;
0555 }
0556       
0557 /*----------------------*/ 
0558 
0559 void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
0560 
0561 
0562 /* We would like to declare a[][lda], but c does not allow it.  In this
0563 function, references to a[i][j] are written a[lda*i+j].  */
0564 
0565 {
0566         int init, i, j;
0567 
0568         init = 1325;
0569         *norma = 0.0;
0570         for (j = 0; j < n; j++) {
0571                 for (i = 0; i < n; i++) {
0572                         init = 3125*init % 65536;
0573                         a[lda*j+i] = (init - 32768.0)/16384.0;                        
0574                         *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
0575                         
0576                         /* alternative for some compilers
0577                         if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]);
0578                         */
0579                 }
0580         }
0581         for (i = 0; i < n; i++) {
0582           b[i] = 0.0;
0583         }
0584         for (j = 0; j < n; j++) {
0585                 for (i = 0; i < n; i++) {
0586                         b[i] = b[i] + a[lda*j+i];
0587                 }
0588         }
0589         return;
0590 }
0591 
0592 /*----------------------*/ 
0593 void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
0594 
0595 
0596 /* We would like to declare a[][lda], but c does not allow it.  In this
0597 function, references to a[i][j] are written a[lda*i+j].  */
0598 /*
0599      dgefa factors a double precision matrix by gaussian elimination.
0600 
0601      dgefa is usually called by dgeco, but it can be called
0602      directly with a saving in time if  rcond  is not needed.
0603      (time for dgeco) = (1 + 9/n)*(time for dgefa) .
0604 
0605      on entry
0606 
0607         a       REAL precision[n][lda]
0608                 the matrix to be factored.
0609 
0610         lda     integer
0611                 the leading dimension of the array  a .
0612 
0613         n       integer
0614                 the order of the matrix  a .
0615 
0616      on return
0617 
0618         a       an upper triangular matrix and the multipliers
0619                 which were used to obtain it.
0620                 the factorization can be written  a = l*u  where
0621                 l  is a product of permutation and unit lower
0622                 triangular matrices and  u  is upper triangular.
0623 
0624         ipvt    integer[n]
0625                 an integer vector of pivot indices.
0626 
0627         info    integer
0628                 = 0  normal value.
0629                 = k  if  u[k][k] .eq. 0.0 .  this is not an error
0630                      condition for this subroutine, but it does
0631                      indicate that dgesl or dgedi will divide by zero
0632                      if called.  use  rcond  in dgeco for a reliable
0633                      indication of singularity.
0634 
0635      linpack. this version dated 08/14/78 .
0636      cleve moler, university of new mexico, argonne national lab.
0637 
0638      functions
0639 
0640      blas daxpy,dscal,idamax
0641 */
0642 
0643 {
0644 /*     internal variables       */
0645 
0646 REAL t;
0647 int j,k,kp1,l,nm1;
0648 
0649 
0650 /*     gaussian elimination with partial pivoting       */
0651 
0652         *info = 0;
0653         nm1 = n - 1;
0654         if (nm1 >=  0) {
0655                 for (k = 0; k < nm1; k++) {
0656                         kp1 = k + 1;
0657 
0658                         /* find l = pivot index */
0659 
0660                         l = idamax(n-k,&a[lda*k+k],1) + k;
0661                         ipvt[k] = l;
0662 
0663                         /* zero pivot implies this column already 
0664                            triangularized */
0665 
0666                         if (a[lda*k+l] != ZERO) {
0667 
0668                                 /* interchange if necessary */
0669 
0670                                 if (l != k) {
0671                                         t = a[lda*k+l];
0672                                         a[lda*k+l] = a[lda*k+k];
0673                                         a[lda*k+k] = t; 
0674                                 }
0675 
0676                                 /* compute multipliers */
0677 
0678                                 t = -ONE/a[lda*k+k];
0679                                 dscal(n-(k+1),t,&a[lda*k+k+1],1);
0680 
0681                                 /* row elimination with column indexing */
0682 
0683                                 for (j = kp1; j < n; j++) {
0684                                         t = a[lda*j+l];
0685                                         if (l != k) {
0686                                                 a[lda*j+l] = a[lda*j+k];
0687                                                 a[lda*j+k] = t;
0688                                         }
0689                                         daxpy(n-(k+1),t,&a[lda*k+k+1],1,
0690                                               &a[lda*j+k+1],1);
0691                                 } 
0692                         }
0693                         else { 
0694                                 *info = k;
0695                         }
0696                 } 
0697         }
0698         ipvt[n-1] = n-1;
0699         if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
0700         return;
0701 }
0702 
0703 /*----------------------*/ 
0704 
0705 void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
0706 
0707 
0708 /* We would like to declare a[][lda], but c does not allow it.  In this
0709 function, references to a[i][j] are written a[lda*i+j].  */
0710 
0711 /*
0712      dgesl solves the double precision system
0713      a * x = b  or  trans(a) * x = b
0714      using the factors computed by dgeco or dgefa.
0715 
0716      on entry
0717 
0718         a       double precision[n][lda]
0719                 the output from dgeco or dgefa.
0720 
0721         lda     integer
0722                 the leading dimension of the array  a .
0723 
0724         n       integer
0725                 the order of the matrix  a .
0726 
0727         ipvt    integer[n]
0728                 the pivot vector from dgeco or dgefa.
0729 
0730         b       double precision[n]
0731                 the right hand side vector.
0732 
0733         job     integer
0734                 = 0         to solve  a*x = b ,
0735                 = nonzero   to solve  trans(a)*x = b  where
0736                             trans(a)  is the transpose.
0737 
0738     on return
0739 
0740         b       the solution vector  x .
0741 
0742      error condition
0743 
0744         a division by zero will occur if the input factor contains a
0745         zero on the diagonal.  technically this indicates singularity
0746         but it is often caused by improper arguments or improper
0747         setting of lda .  it will not occur if the subroutines are
0748         called correctly and if dgeco has set rcond .gt. 0.0
0749         or dgefa has set info .eq. 0 .
0750 
0751      to compute  inverse(a) * c  where  c  is a matrix
0752      with  p  columns
0753            dgeco(a,lda,n,ipvt,rcond,z)
0754            if (!rcond is too small){
0755                 for (j=0,j<p,j++)
0756                         dgesl(a,lda,n,ipvt,c[j][0],0);
0757            }
0758 
0759      linpack. this version dated 08/14/78 .
0760      cleve moler, university of new mexico, argonne national lab.
0761 
0762      functions
0763 
0764      blas daxpy,ddot
0765 */
0766 {
0767 /*     internal variables       */
0768 
0769         REAL t;
0770         int k,kb,l,nm1;
0771 
0772         nm1 = n - 1;
0773         if (job == 0) {
0774 
0775                 /* job = 0 , solve  a * x = b
0776                    first solve  l*y = b         */
0777 
0778                 if (nm1 >= 1) {
0779                         for (k = 0; k < nm1; k++) {
0780                                 l = ipvt[k];
0781                                 t = b[l];
0782                                 if (l != k){ 
0783                                         b[l] = b[k];
0784                                         b[k] = t;
0785                                 }       
0786                                 daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
0787                         }
0788                 } 
0789 
0790                 /* now solve  u*x = y */
0791 
0792                 for (kb = 0; kb < n; kb++) {
0793                     k = n - (kb + 1);
0794                     b[k] = b[k]/a[lda*k+k];
0795                     t = -b[k];
0796                     daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
0797                 }
0798         }
0799         else { 
0800 
0801                 /* job = nonzero, solve  trans(a) * x = b
0802                    first solve  trans(u)*y = b                  */
0803 
0804                 for (k = 0; k < n; k++) {
0805                         t = ddot(k,&a[lda*k+0],1,&b[0],1);
0806                         b[k] = (b[k] - t)/a[lda*k+k];
0807                 }
0808 
0809                 /* now solve trans(l)*x = y     */
0810 
0811                 if (nm1 >= 1) {
0812                         for (kb = 1; kb < nm1; kb++) {
0813                                 k = n - (kb+1);
0814                                 b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
0815                                 l = ipvt[k];
0816                                 if (l != k) {
0817                                         t = b[l];
0818                                         b[l] = b[k];
0819                                         b[k] = t;
0820                                 }
0821                         }
0822                 }
0823         }
0824         return;
0825 }
0826 
0827 /*----------------------*/ 
0828 
0829 void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
0830 /*
0831      constant times a vector plus a vector.
0832      jack dongarra, linpack, 3/11/78.
0833 */
0834 
0835 {
0836         int i,ix,iy;
0837 
0838         if(n <= 0) return;
0839         if (da == ZERO) return;
0840 
0841         if(incx != 1 || incy != 1) {
0842 
0843                 /* code for unequal increments or equal increments
0844                    not equal to 1                                       */
0845 
0846                 ix = 0;
0847                 iy = 0;
0848                 if(incx < 0) ix = (-n+1)*incx;
0849                 if(incy < 0)iy = (-n+1)*incy;
0850                 for (i = 0;i < n; i++) {
0851                         dy[iy] = dy[iy] + da*dx[ix];
0852                         ix = ix + incx;
0853                         iy = iy + incy;
0854                      
0855                 }
0856                 return;
0857         }
0858         
0859         /* code for both increments equal to 1 */
0860         
0861 
0862 #ifdef ROLL
0863 
0864         for (i = 0;i < n; i++) {
0865                 dy[i] = dy[i] + da*dx[i];
0866         }
0867 
0868 
0869 #endif
0870 
0871 #ifdef UNROLL
0872 
0873         m = n % 4;
0874         if ( m != 0) {
0875                 for (i = 0; i < m; i++) 
0876                         dy[i] = dy[i] + da*dx[i];
0877                         
0878                 if (n < 4) return;
0879         }
0880         for (i = m; i < n; i = i + 4) {
0881                 dy[i] = dy[i] + da*dx[i];
0882                 dy[i+1] = dy[i+1] + da*dx[i+1];
0883                 dy[i+2] = dy[i+2] + da*dx[i+2];
0884                 dy[i+3] = dy[i+3] + da*dx[i+3];
0885                 
0886         }
0887 
0888 #endif
0889 return;
0890 }
0891    
0892 /*----------------------*/ 
0893 
0894 REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
0895 /*
0896      forms the dot product of two vectors.
0897      jack dongarra, linpack, 3/11/78.
0898 */
0899 
0900 {
0901         REAL dtemp;
0902         int i,ix,iy;
0903 
0904         dtemp = ZERO;
0905 
0906         if(n <= 0) return(ZERO);
0907 
0908         if(incx != 1 || incy != 1) {
0909 
0910                 /* code for unequal increments or equal increments
0911                    not equal to 1                                       */
0912 
0913                 ix = 0;
0914                 iy = 0;
0915                 if (incx < 0) ix = (-n+1)*incx;
0916                 if (incy < 0) iy = (-n+1)*incy;
0917                 for (i = 0;i < n; i++) {
0918                         dtemp = dtemp + dx[ix]*dy[iy];
0919                         ix = ix + incx;
0920                         iy = iy + incy;
0921                        
0922                 }
0923                 return(dtemp);
0924         }
0925 
0926         /* code for both increments equal to 1 */
0927 
0928 
0929 #ifdef ROLL
0930 
0931         for (i=0;i < n; i++)
0932                 dtemp = dtemp + dx[i]*dy[i];
0933                
0934         return(dtemp);
0935 
0936 #endif
0937 
0938 #ifdef UNROLL
0939 
0940 
0941         m = n % 5;
0942         if (m != 0) {
0943                 for (i = 0; i < m; i++)
0944                         dtemp = dtemp + dx[i]*dy[i];
0945                 if (n < 5) return(dtemp);
0946         }
0947         for (i = m; i < n; i = i + 5) {
0948                 dtemp = dtemp + dx[i]*dy[i] +
0949                 dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
0950                 dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
0951         }
0952         return(dtemp);
0953 
0954 #endif
0955 
0956 }
0957 
0958 /*----------------------*/ 
0959 void dscal(int n, REAL da, REAL dx[], int incx)
0960 
0961 /*     scales a vector by a constant.
0962       jack dongarra, linpack, 3/11/78.
0963 */
0964 
0965 {
0966         int i,nincx;
0967 
0968         if(n <= 0)return;
0969         if(incx != 1) {
0970 
0971                 /* code for increment not equal to 1 */
0972 
0973                 nincx = n*incx;
0974                 for (i = 0; i < nincx; i = i + incx)
0975                         dx[i] = da*dx[i];
0976                         
0977                 return;
0978         }
0979 
0980         /* code for increment equal to 1 */
0981 
0982 
0983 #ifdef ROLL
0984 
0985         for (i = 0; i < n; i++)
0986                 dx[i] = da*dx[i];
0987                 
0988 
0989 #endif
0990 
0991 #ifdef UNROLL
0992 
0993 
0994         m = n % 5;
0995         if (m != 0) {
0996                 for (i = 0; i < m; i++)
0997                         dx[i] = da*dx[i];
0998                 if (n < 5) return;
0999         }
1000         for (i = m; i < n; i = i + 5){
1001                 dx[i] = da*dx[i];
1002                 dx[i+1] = da*dx[i+1];
1003                 dx[i+2] = da*dx[i+2];
1004                 dx[i+3] = da*dx[i+3];
1005                 dx[i+4] = da*dx[i+4];
1006         }
1007 
1008 #endif
1009 
1010 }
1011 
1012 /*----------------------*/ 
1013 int idamax(int n, REAL dx[], int incx)
1014 
1015 /*
1016      finds the index of element having max. absolute value.
1017      jack dongarra, linpack, 3/11/78.
1018 */
1019 
1020 
1021 {
1022         REAL dmax;
1023         int i, ix, itemp;
1024 
1025         if( n < 1 ) return(-1);
1026         if(n ==1 ) return(0);
1027         itemp = -1;
1028         if(incx != 1) {
1029 
1030                 /* code for increment not equal to 1 */
1031 
1032                 ix = 1;
1033                 dmax = fabs((double)dx[0]);
1034                 ix = ix + incx;
1035                 for (i = 1; i < n; i++) {
1036                         if(fabs((double)dx[ix]) > dmax)  {
1037                                 itemp = i;
1038                                 dmax = fabs((double)dx[ix]);
1039                         }
1040                         ix = ix + incx;
1041                 }
1042         }
1043         else {
1044 
1045                 /* code for increment equal to 1 */
1046 
1047                 itemp = 0;
1048                 dmax = fabs((double)dx[0]);
1049                 for (i = 1; i < n; i++) {
1050                         if(fabs((double)dx[i]) > dmax) {
1051                                 itemp = i;
1052                                 dmax = fabs((double)dx[i]);
1053                         }
1054                 }
1055         }
1056         return (itemp);
1057 }
1058 
1059 /*----------------------*/ 
1060 REAL epslon (REAL x)
1061 
1062 /*
1063      estimate unit roundoff in quantities of size x.
1064 */
1065 
1066 {
1067         REAL a,b,c,eps;
1068 /*
1069      this program should function properly on all systems
1070      satisfying the following two assumptions,
1071         1.  the base used in representing dfloating point
1072             numbers is not a power of three.
1073         2.  the quantity  a  in statement 10 is represented to 
1074             the accuracy used in dfloating point variables
1075             that are stored in memory.
1076      the statement number 10 and the go to 10 are intended to
1077      force optimizing compilers to generate code satisfying 
1078      assumption 2.
1079      under these assumptions, it should be true that,
1080             a  is not exactly equal to four-thirds,
1081             b  has a zero for its last bit or digit,
1082             c  is not exactly equal to one,
1083             eps  measures the separation of 1.0 from
1084                  the next larger dfloating point number.
1085      the developers of eispack would appreciate being informed
1086      about any systems where these assumptions do not hold.
1087 
1088      *****************************************************************
1089      this routine is one of the auxiliary routines used by eispack iii
1090      to avoid machine dependencies.
1091      *****************************************************************
1092 
1093      this version dated 4/6/83.
1094 */
1095 
1096         a = 4.0e0/3.0e0;
1097         eps = ZERO;
1098         while (eps == ZERO) {
1099                 b = a - ONE;
1100                 c = b + b + b;
1101                 eps = fabs((double)(c-ONE));
1102         }
1103         return(eps*fabs((double)x));
1104 }
1105  
1106 /*----------------------*/ 
1107 void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
1108 
1109 
1110 /* We would like to declare m[][ldm], but c does not allow it.  In this
1111 function, references to m[i][j] are written m[ldm*i+j].  */
1112 
1113 /*
1114    purpose:
1115      multiply matrix m times vector x and add the result to vector y.
1116 
1117    parameters:
1118 
1119      n1 integer, number of elements in vector y, and number of rows in
1120          matrix m
1121 
1122      y double [n1], vector of length n1 to which is added 
1123          the product m*x
1124 
1125      n2 integer, number of elements in vector x, and number of columns
1126          in matrix m
1127 
1128      ldm integer, leading dimension of array m
1129 
1130      x double [n2], vector of length n2
1131 
1132      m double [ldm][n2], matrix of n1 rows and n2 columns
1133 
1134  ----------------------------------------------------------------------
1135 */
1136 {
1137         int j,i,jmin;
1138         /* cleanup odd vector */
1139 
1140         j = n2 % 2;
1141         if (j >= 1) {
1142                 j = j - 1;
1143                 for (i = 0; i < n1; i++) 
1144                         y[i] = (y[i]) + x[j]*m[ldm*j+i];
1145         } 
1146 
1147         /* cleanup odd group of two vectors */
1148 
1149         j = n2 % 4;
1150         if (j >= 2) {
1151                 j = j - 1;
1152                 for (i = 0; i < n1; i++)
1153                         y[i] = ( (y[i])
1154                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1155         } 
1156 
1157         /* cleanup odd group of four vectors */
1158 
1159         j = n2 % 8;
1160         if (j >= 4) {
1161                 j = j - 1;
1162                 for (i = 0; i < n1; i++)
1163                         y[i] = ((( (y[i])
1164                                + x[j-3]*m[ldm*(j-3)+i]) 
1165                                + x[j-2]*m[ldm*(j-2)+i])
1166                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
1167         } 
1168 
1169         /* cleanup odd group of eight vectors */
1170 
1171         j = n2 % 16;
1172         if (j >= 8) {
1173                 j = j - 1;
1174                 for (i = 0; i < n1; i++)
1175                         y[i] = ((((((( (y[i])
1176                                + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
1177                                + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
1178                                + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
1179                                + x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
1180         } 
1181         
1182         /* main loop - groups of sixteen vectors */
1183 
1184         jmin = (n2%16)+16;
1185         for (j = jmin-1; j < n2; j = j + 16) {
1186                 for (i = 0; i < n1; i++) 
1187                         y[i] = ((((((((((((((( (y[i])
1188                                 + x[j-15]*m[ldm*(j-15)+i]) 
1189                                 + x[j-14]*m[ldm*(j-14)+i])
1190                                 + x[j-13]*m[ldm*(j-13)+i]) 
1191                                 + x[j-12]*m[ldm*(j-12)+i])
1192                                 + x[j-11]*m[ldm*(j-11)+i]) 
1193                                 + x[j-10]*m[ldm*(j-10)+i])
1194                                 + x[j- 9]*m[ldm*(j- 9)+i]) 
1195                                 + x[j- 8]*m[ldm*(j- 8)+i])
1196                                 + x[j- 7]*m[ldm*(j- 7)+i]) 
1197                                 + x[j- 6]*m[ldm*(j- 6)+i])
1198                                 + x[j- 5]*m[ldm*(j- 5)+i]) 
1199                                 + x[j- 4]*m[ldm*(j- 4)+i])
1200                                 + x[j- 3]*m[ldm*(j- 3)+i]) 
1201                                 + x[j- 2]*m[ldm*(j- 2)+i])
1202                                 + x[j- 1]*m[ldm*(j- 1)+i]) 
1203                                 + x[j]   *m[ldm*j+i];
1204         }
1205         return;
1206 } 
1207 
1208 /*----------------------*/