diff -Nur hpl/HISTORY hpl-patched/HISTORY
--- hpl/HISTORY	2004-01-22 00:13:09.000000000 -0500
+++ hpl-patched/HISTORY	2006-12-14 10:19:19.000000000 -0500
@@ -20,4 +20,7 @@
  Various building problems on the T3E have been fixed;  Thanks
  to Edward Anderson.
 
+ - 06/26/06 applied the "native MPI_Bcast() patch for HPL"
+ - 06/30/06 applied the "non-blocking collective operations patch to HPL"
+
 ==============================================================
diff -Nur hpl/include/hpl_comm.h hpl-patched/include/hpl_comm.h
--- hpl/include/hpl_comm.h	2004-01-22 00:13:17.000000000 -0500
+++ hpl-patched/include/hpl_comm.h	2006-12-14 10:06:53.000000000 -0500
@@ -64,7 +64,9 @@
    HPL_2RING         = 403,                      /* Increasing 2-ring */
    HPL_2RING_M       = 404,           /* Increasing 2-ring (modified) */
    HPL_BLONG         = 405,                         /* long broadcast */
-   HPL_BLONG_M       = 406               /* long broadcast (modified) */
+   HPL_BLONG_M       = 406,              /* long broadcast (modified) */
+   HPL_NATIVE        = 407,                      /* native MPI_Bcast() */
+   HPL_NBC           = 408            /* non-blocking NBC_Ibcast() */
 } HPL_T_TOP;
 /*
  * ---------------------------------------------------------------------
@@ -155,6 +157,14 @@
 int HPL_bcast_blonM STDC_ARGS( ( HPL_T_panel *, int * ) );
 int HPL_bwait_blonM STDC_ARGS( ( HPL_T_panel *        ) );
 
+int HPL_binit_naMPI STDC_ARGS( ( HPL_T_panel *        ) );
+int HPL_bcast_naMPI STDC_ARGS( ( HPL_T_panel *, int * ) );
+int HPL_bwait_naMPI STDC_ARGS( ( HPL_T_panel *        ) );
+
+int HPL_binit_nbc STDC_ARGS( ( HPL_T_panel *        ) );
+int HPL_bcast_nbc STDC_ARGS( ( HPL_T_panel *, int * ) );
+int HPL_bwait_nbc STDC_ARGS( ( HPL_T_panel *        ) );
+
 #endif
 /*
  * End of hpl_comm.h
diff -Nur hpl/makes/Make.comm hpl-patched/makes/Make.comm
--- hpl/makes/Make.comm	2004-01-22 00:13:14.000000000 -0500
+++ hpl-patched/makes/Make.comm	2006-12-14 10:05:37.000000000 -0500
@@ -59,7 +59,9 @@
    HPL_2rinM.o            HPL_blong.o            HPL_blonM.o            \
    HPL_packL.o            HPL_copyL.o            HPL_binit.o            \
    HPL_bcast.o            HPL_bwait.o            HPL_send.o             \
-   HPL_recv.o             HPL_sdrv.o
+   HPL_recv.o             HPL_sdrv.o             HPL_naMPI.o            \
+   HPL_nbc.o
+   
 #
 ## Targets #############################################################
 #
@@ -86,6 +88,10 @@
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_blong.c
 HPL_blonM.o            : ../HPL_blonM.c            $(INCdep)
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_blonM.c
+HPL_naMPI.o            : ../HPL_naMPI.c            $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_naMPI.c
+HPL_nbc.o            : ../HPL_nbc.c            $(INCdep)
+	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_nbc.c
 HPL_packL.o            : ../HPL_packL.c            $(INCdep)
 	$(CC) -o $@ -c $(CCFLAGS)  ../HPL_packL.c
 HPL_copyL.o            : ../HPL_copyL.c            $(INCdep)
diff -Nur hpl/src/comm/HPL_bcast.c hpl-patched/src/comm/HPL_bcast.c
--- hpl/src/comm/HPL_bcast.c	2004-01-22 00:13:43.000000000 -0500
+++ hpl-patched/src/comm/HPL_bcast.c	2006-12-14 10:06:26.000000000 -0500
@@ -108,6 +108,8 @@
       case HPL_2RING   : ierr = HPL_bcast_2ring( PANEL, IFLAG ); break;
       case HPL_BLONG_M : ierr = HPL_bcast_blonM( PANEL, IFLAG ); break;
       case HPL_BLONG   : ierr = HPL_bcast_blong( PANEL, IFLAG ); break;
+      case HPL_NATIVE  : ierr = HPL_bcast_naMPI( PANEL, IFLAG ); break;
+      case HPL_NBC     : ierr = HPL_bcast_nbc( PANEL, IFLAG ); break;
       default          : ierr = HPL_SUCCESS;
    }
  
diff -Nur hpl/src/comm/HPL_binit.c hpl-patched/src/comm/HPL_binit.c
--- hpl/src/comm/HPL_binit.c	2004-01-22 00:13:43.000000000 -0500
+++ hpl-patched/src/comm/HPL_binit.c	2006-12-14 10:07:23.000000000 -0500
@@ -98,6 +98,8 @@
       case HPL_2RING   : ierr = HPL_binit_2ring( PANEL ); break;
       case HPL_BLONG_M : ierr = HPL_binit_blonM( PANEL ); break;
       case HPL_BLONG   : ierr = HPL_binit_blong( PANEL ); break;
+      case HPL_NATIVE  : ierr = HPL_binit_naMPI( PANEL ); break;
+      case HPL_NBC     : ierr = HPL_binit_nbc( PANEL ); break;
       default          : ierr = HPL_SUCCESS;
    }
  
diff -Nur hpl/src/comm/HPL_bwait.c hpl-patched/src/comm/HPL_bwait.c
--- hpl/src/comm/HPL_bwait.c	2004-01-22 00:13:43.000000000 -0500
+++ hpl-patched/src/comm/HPL_bwait.c	2006-12-14 10:07:57.000000000 -0500
@@ -99,6 +99,8 @@
       case HPL_2RING   : ierr = HPL_bwait_2ring( PANEL ); break;
       case HPL_BLONG_M : ierr = HPL_bwait_blonM( PANEL ); break;
       case HPL_BLONG   : ierr = HPL_bwait_blong( PANEL ); break;
+      case HPL_NATIVE  : ierr = HPL_bwait_naMPI( PANEL ); break;
+      case HPL_NBC     : ierr = HPL_bwait_nbc( PANEL ); break;
       default          : ierr = HPL_SUCCESS;
    }
  
diff -Nur hpl/src/comm/HPL_naMPI.c hpl-patched/src/comm/HPL_naMPI.c
--- hpl/src/comm/HPL_naMPI.c	1969-12-31 19:00:00.000000000 -0500
+++ hpl-patched/src/comm/HPL_naMPI.c	2006-12-01 05:37:06.000000000 -0500
@@ -0,0 +1,190 @@
+/*
+ * derived from HPL_1ring.c; 7th algorithm which uses native MPI_Bcast()
+ */
+/* 
+ * -- High Performance Computing Linpack Benchmark (HPL)                
+ *    HPL - 1.0a - January 20, 2004                          
+ *    Antoine P. Petitet                                                
+ *    University of Tennessee, Knoxville                                
+ *    Innovative Computing Laboratories                                 
+ *    (C) Copyright 2000-2004 All Rights Reserved                       
+ *                                                                      
+ * -- Copyright notice and Licensing terms:                             
+ *                                                                      
+ * Redistribution  and  use in  source and binary forms, with or without
+ * modification, are  permitted provided  that the following  conditions
+ * are met:                                                             
+ *                                                                      
+ * 1. Redistributions  of  source  code  must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.        
+ *                                                                      
+ * 2. Redistributions in binary form must reproduce  the above copyright
+ * notice, this list of conditions,  and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution. 
+ *                                                                      
+ * 3. All  advertising  materials  mentioning  features  or  use of this
+ * software must display the following acknowledgement:                 
+ * This  product  includes  software  developed  at  the  University  of
+ * Tennessee, Knoxville, Innovative Computing Laboratories.             
+ *                                                                      
+ * 4. The name of the  University,  the name of the  Laboratory,  or the
+ * names  of  its  contributors  may  not  be used to endorse or promote
+ * products  derived   from   this  software  without  specific  written
+ * permission.                                                          
+ *                                                                      
+ * -- Disclaimer:                                                       
+ *                                                                      
+ * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ---------------------------------------------------------------------
+ */ 
+/*
+ * Include files
+ */
+#include "hpl.h"
+
+#ifdef HPL_NO_MPI_DATATYPE  /* The user insists to not use MPI types */
+#ifndef HPL_COPY_L       /* and also want to avoid the copy of L ... */
+#define HPL_COPY_L   /* well, sorry, can not do that: force the copy */
+#endif
+#endif
+
+/*
+ * HPL_binit_naMPI prepares PANEL for the native MPI_Bcast() broadcast.
+ *
+ * PANEL is the panel data structure to be broadcast;  it may be  NULL.
+ * Returns HPL_SUCCESS if there is nothing to do  (NULL panel or single
+ * process column) or the set-up succeeded,  HPL_FAILURE if HPL_packL()
+ * did not return MPI_SUCCESS.
+ */
+#ifdef STDC_HEADERS
+int HPL_binit_naMPI
+(
+   HPL_T_panel *              PANEL
+)
+#else
+int HPL_binit_naMPI( PANEL )
+   HPL_T_panel *              PANEL;
+#endif
+{
+/*
+ * .. Local Variables ..
+ */
+   int                        status = HPL_SUCCESS;
+/* ..
+ * .. Executable Statements ..
+ */
+   if( ( PANEL == NULL ) || ( PANEL->grid->npcol <= 1 ) )
+      return( status );
+/*
+ * Copy the panel into a contiguous buffer: on request (HPL_COPY_L) when
+ * MPI data types are used, unconditionally when they are not.
+ */
+#if defined( HPL_COPY_L ) || !defined( HPL_USE_MPI_DATATYPE )
+   HPL_copyL( PANEL );
+#endif
+#ifdef HPL_USE_MPI_DATATYPE
+/*
+ * Create the MPI user-defined data type
+ */
+   if( HPL_packL( PANEL, 0, PANEL->len, 0 ) != MPI_SUCCESS )
+      status = HPL_FAILURE;
+#endif
+   return( status );
+}
+
+#ifdef HPL_USE_MPI_DATATYPE
+/* MPI_Bcast() arguments when a user-defined MPI data type is in use */
+#define   _M_BUFF     PANEL->buffers[0]
+#define   _M_COUNT    PANEL->counts[0]
+#define   _M_TYPE     PANEL->dtypes[0]
+
+#else
+/* MPI_Bcast() arguments otherwise: PANEL->len doubles starting at L2 */
+#define   _M_BUFF     (void *)(PANEL->L2)
+#define   _M_COUNT    PANEL->len
+#define   _M_TYPE     MPI_DOUBLE
+
+#endif
+
+/*
+ * HPL_bcast_naMPI  broadcasts  the panel over  PANEL->grid->row_comm
+ * with one blocking MPI_Bcast() rooted at rank PANEL->pcol.
+ * On return *IFLAG holds the function result: HPL_SUCCESS  (also for a
+ * NULL panel or a single process column) or HPL_FAILURE if MPI_Bcast()
+ * failed; HPL_KEEP_TESTING is never produced since the call blocks.
+ */
+#ifdef STDC_HEADERS
+int HPL_bcast_naMPI
+(
+   HPL_T_panel                * PANEL,
+   int                        * IFLAG
+)
+#else
+int HPL_bcast_naMPI( PANEL, IFLAG )
+   HPL_T_panel                * PANEL;
+   int                        * IFLAG;
+#endif
+{
+/*
+ * .. Local Variables ..
+ */
+   MPI_Comm                   comm;
+   int                        ierr, root;
+/* ..
+ * .. Executable Statements ..
+ */
+   if( PANEL == NULL ) { *IFLAG = HPL_SUCCESS; return( HPL_SUCCESS ); }
+   if( PANEL->grid->npcol <= 1 )
+   {                     *IFLAG = HPL_SUCCESS; return( HPL_SUCCESS ); }
+/*
+ * MPI_Bcast() is collective: root and non-root processes alike call it
+ */
+   comm = PANEL->grid->row_comm;
+   root = PANEL->pcol;
+   ierr = MPI_Bcast( _M_BUFF, _M_COUNT, _M_TYPE, root, comm );
+
+   *IFLAG = ( ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE );
+   return( *IFLAG );
+}
+
+/*
+ * HPL_bwait_naMPI frees the MPI data type PANEL->dtypes[0] when MPI data
+ * types are used (presumably the one HPL_packL() created at init time -
+ * to be confirmed). Nothing is done for a NULL panel or a single process
+ * column. Returns HPL_FAILURE only if MPI_Type_free() fails.
+ */
+#ifdef STDC_HEADERS
+int HPL_bwait_naMPI
+(
+   HPL_T_panel *              PANEL
+)
+#else
+int HPL_bwait_naMPI( PANEL )
+   HPL_T_panel *              PANEL;
+#endif
+{
+/*
+ * .. Local Variables ..
+ */
+   int                        status = HPL_SUCCESS;
+/* ..
+ * .. Executable Statements ..
+ */
+   if( ( PANEL == NULL ) || ( PANEL->grid->npcol <= 1 ) )
+      return( status );
+#ifdef HPL_USE_MPI_DATATYPE
+   if( MPI_Type_free( &PANEL->dtypes[0] ) != MPI_SUCCESS )
+      status = HPL_FAILURE;
+#endif
+   return( status );
+}
diff -Nur hpl/src/comm/HPL_nbc.c hpl-patched/src/comm/HPL_nbc.c
--- hpl/src/comm/HPL_nbc.c	1969-12-31 19:00:00.000000000 -0500
+++ hpl-patched/src/comm/HPL_nbc.c	2006-12-14 10:09:45.000000000 -0500
@@ -0,0 +1,150 @@
+/*
+ * Include files
+ */
+#include "hpl.h"
+#include "nbc.h"
+
+#ifdef HPL_NO_MPI_DATATYPE  /* The user insists to not use MPI types */
+#ifndef HPL_COPY_L       /* and also want to avoid the copy of L ... */
+#define HPL_COPY_L   /* well, sorry, can not do that: force the copy */
+#endif
+#endif
+
+/*
+ * HPL_binit_nbc prepares PANEL for the non-blocking LibNBC broadcast.
+ *
+ * PANEL is the panel data structure to be broadcast;  it may be  NULL.
+ * Returns HPL_SUCCESS if there is nothing to do  (NULL panel or single
+ * process column) or the set-up succeeded,  HPL_FAILURE if HPL_packL()
+ * did not return MPI_SUCCESS.
+ */
+#ifdef STDC_HEADERS
+int HPL_binit_nbc
+(
+   HPL_T_panel *              PANEL
+)
+#else
+int HPL_binit_nbc( PANEL )
+   HPL_T_panel *              PANEL;
+#endif
+{
+/*
+ * .. Local Variables ..
+ */
+   int                        status = HPL_SUCCESS;
+/* ..
+ * .. Executable Statements ..
+ */
+   if( ( PANEL == NULL ) || ( PANEL->grid->npcol <= 1 ) )
+      return( status );
+/*
+ * Copy the panel into a contiguous buffer: on request (HPL_COPY_L) when
+ * MPI data types are used, unconditionally when they are not.
+ */
+#if defined( HPL_COPY_L ) || !defined( HPL_USE_MPI_DATATYPE )
+   HPL_copyL( PANEL );
+#endif
+#ifdef HPL_USE_MPI_DATATYPE
+/*
+ * Create the MPI user-defined data type
+ */
+   if( HPL_packL( PANEL, 0, PANEL->len, 0 ) != MPI_SUCCESS )
+      status = HPL_FAILURE;
+#endif
+   return( status );
+}
+
+#ifdef HPL_USE_MPI_DATATYPE
+/* NBC_Ibcast() arguments when a user-defined MPI data type is in use */
+#define   _M_BUFF     PANEL->buffers[0]
+#define   _M_COUNT    PANEL->counts[0]
+#define   _M_TYPE     PANEL->dtypes[0]
+
+#else
+/* NBC_Ibcast() arguments otherwise: PANEL->len doubles starting at L2 */
+#define   _M_BUFF     (void *)(PANEL->L2)
+#define   _M_COUNT    PANEL->len
+#define   _M_TYPE     MPI_DOUBLE
+
+#endif
+
+/* State of the one broadcast in flight (file scope: one at a time) */
+static NBC_Handle             nbc_handle;
+static int                    nbc_posted = 0;
+/*
+ * HPL_bcast_nbc posts the broadcast on first call and then polls it.
+ * *IFLAG and the return value are HPL_SUCCESS once it has completed
+ * (or if there is nothing to do), HPL_KEEP_TESTING while it is still
+ * in progress, and HPL_FAILURE if LibNBC reports an error.
+ */
+#ifdef STDC_HEADERS
+int HPL_bcast_nbc
+(
+   HPL_T_panel                * PANEL,
+   int                        * IFLAG
+)
+#else
+int HPL_bcast_nbc( PANEL, IFLAG )
+   HPL_T_panel                * PANEL;
+   int                        * IFLAG;
+#endif
+{
+   MPI_Comm                   comm;
+   int                        ierr, root;
+
+   if( PANEL == NULL ) { *IFLAG = HPL_SUCCESS; return( HPL_SUCCESS ); }
+   if( ( PANEL->grid->npcol ) <= 1 )
+   {                     *IFLAG = HPL_SUCCESS; return( HPL_SUCCESS ); }
+
+   if( nbc_posted == 0 )
+   {
+      comm = PANEL->grid->row_comm;
+      root = PANEL->pcol;
+      ierr = NBC_Ibcast( _M_BUFF, _M_COUNT, _M_TYPE, root, comm,
+                         &nbc_handle );
+      /* no valid handle to test if the post failed: report it now */
+      if( ierr != NBC_OK ) { *IFLAG = HPL_FAILURE; return( *IFLAG ); }
+      nbc_posted = 1;
+   }
+
+   ierr = NBC_Test( &nbc_handle );
+   if( ierr == NBC_CONTINUE ) { *IFLAG = HPL_KEEP_TESTING; }
+   else
+   {                    /* finished, successfully or not: allow re-post */
+      nbc_posted = 0;
+      *IFLAG = ( ierr == NBC_OK ? HPL_SUCCESS : HPL_FAILURE );
+   }
+   return( *IFLAG );
+}
+
+/*
+ * HPL_bwait_nbc frees the MPI data type  PANEL->dtypes[0]  when MPI data
+ * types are used (presumably the one HPL_packL() created at init time -
+ * to be confirmed). Nothing is done for a NULL panel or a single process
+ * column. Returns HPL_FAILURE only if MPI_Type_free() fails.
+ */
+#ifdef STDC_HEADERS
+int HPL_bwait_nbc
+(
+   HPL_T_panel *              PANEL
+)
+#else
+int HPL_bwait_nbc( PANEL )
+   HPL_T_panel *              PANEL;
+#endif
+{
+/*
+ * .. Local Variables ..
+ */
+   int                        status = HPL_SUCCESS;
+/* ..
+ * .. Executable Statements ..
+ */
+   if( ( PANEL == NULL ) || ( PANEL->grid->npcol <= 1 ) )
+      return( status );
+#ifdef HPL_USE_MPI_DATATYPE
+   if( MPI_Type_free( &PANEL->dtypes[0] ) != MPI_SUCCESS )
+      status = HPL_FAILURE;
+#endif
+   return( status );
+}
diff -Nur hpl/testing/ptest/HPL.dat hpl-patched/testing/ptest/HPL.dat
--- hpl/testing/ptest/HPL.dat	2004-01-22 00:13:29.000000000 -0500
+++ hpl-patched/testing/ptest/HPL.dat	2006-12-14 10:10:19.000000000 -0500
@@ -20,7 +20,7 @@
 3            # of recursive panel fact.
 0 1 2        RFACTs (0=left, 1=Crout, 2=Right)
 1            # of broadcast
-0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
 1            # of lookahead depth
 0            DEPTHs (>=0)
 2            SWAP (0=bin-exch,1=long,2=mix)
diff -Nur hpl/testing/ptest/HPL_pddriver.c hpl-patched/testing/ptest/HPL_pddriver.c
--- hpl/testing/ptest/HPL_pddriver.c	2004-01-22 00:13:21.000000000 -0500
+++ hpl-patched/testing/ptest/HPL_pddriver.c	2006-12-14 10:10:34.000000000 -0500
@@ -137,7 +137,7 @@
  * 3            # of recursive panel fact.
  * 0 1 2        RFACTs (0=left, 1=Crout, 2=Right)
  * 1            # of broadcast
- * 0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+ * 0            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
  * 1            # of lookahead depth
  * 0            DEPTHs (>=0)
  * 2            SWAP (0=bin-exch,1=long,2=mix)
diff -Nur hpl/testing/ptest/HPL_pdinfo.c hpl-patched/testing/ptest/HPL_pdinfo.c
--- hpl/testing/ptest/HPL_pdinfo.c	2004-01-22 00:13:25.000000000 -0500
+++ hpl-patched/testing/ptest/HPL_pdinfo.c	2006-12-14 10:13:19.000000000 -0500
@@ -516,7 +516,7 @@
          else              RF[ i ] = HPL_RIGHT_LOOKING;
       }
 /*
- * Broadcast topology (TP) (0=rg, 1=2rg, 2=rgM, 3=2rgM, 4=L)
+ * Broadcast topology (TP) (0=1rg, 1=1rM, 2=2rg, 3=2rM, 4=Lng, 5=LnM, 6=MPI, 7=NBC)
  */
       (void) fgets( line, HPL_LINE_MAX - 2, infp );
       (void) sscanf( line, "%s", num ); *NTPS = atoi( num );
@@ -538,6 +538,8 @@
          else if( j == 3 ) TP[ i ] = HPL_2RING_M;
          else if( j == 4 ) TP[ i ] = HPL_BLONG;
          else if( j == 5 ) TP[ i ] = HPL_BLONG_M;
+         else if( j == 6 ) TP[ i ] = HPL_NATIVE;
+         else if( j == 7 ) TP[ i ] = HPL_NBC;
          else              TP[ i ] = HPL_1RING_M;
       }
 /*
@@ -696,6 +698,8 @@
          else if( TP[i] == HPL_2RING_M ) iwork[j] = 3;
          else if( TP[i] == HPL_BLONG   ) iwork[j] = 4;
          else if( TP[i] == HPL_BLONG_M ) iwork[j] = 5;
+         else if( TP[i] == HPL_NATIVE  ) iwork[j] = 6;
+         else if( TP[i] == HPL_NBC  )    iwork[j] = 7;
          j++;
       }
       for( i = 0; i < *NDHS; i++ ) { iwork[j] = DH[i]; j++; }
@@ -739,6 +743,8 @@
          else if( iwork[j] == 3 ) TP[i] = HPL_2RING_M;
          else if( iwork[j] == 4 ) TP[i] = HPL_BLONG;
          else if( iwork[j] == 5 ) TP[i] = HPL_BLONG_M;
+         else if( iwork[j] == 6 ) TP[i] = HPL_NATIVE;
+         else if( iwork[j] == 7 ) TP[i] = HPL_NBC;
          j++;
       }
       for( i = 0; i < *NDHS; i++ ) { DH[i] = iwork[j]; j++; }
@@ -994,6 +1000,10 @@
             HPL_fprintf( TEST->outfp,       "   Blong " );
          else if( TP[i] == HPL_BLONG_M )
             HPL_fprintf( TEST->outfp,       "  BlongM " );
+         else if( TP[i] == HPL_NATIVE )
+            HPL_fprintf( TEST->outfp,       "  natMPI " );
+         else if( TP[i] == HPL_NBC )
+            HPL_fprintf( TEST->outfp,       "  NBC " );
       }
       if( *NTPS > 8 )
       {
@@ -1012,6 +1022,10 @@
                HPL_fprintf( TEST->outfp,       "   Blong " );
             else if( TP[i] == HPL_BLONG_M )
                HPL_fprintf( TEST->outfp,       "  BlongM " );
+            else if( TP[i] == HPL_NATIVE )
+               HPL_fprintf( TEST->outfp,       "  natMPI " );
+            else if( TP[i] == HPL_NBC )
+               HPL_fprintf( TEST->outfp,       "  NBC " );
          }
          if( *NTPS > 16 )
          {
@@ -1030,6 +1044,10 @@
                   HPL_fprintf( TEST->outfp,       "   Blong " );
                else if( TP[i] == HPL_BLONG_M )
                   HPL_fprintf( TEST->outfp,       "  BlongM " );
+               else if( TP[i] == HPL_NATIVE )
+                  HPL_fprintf( TEST->outfp,       "  natMPI " );
+               else if( TP[i] == HPL_NBC )
+                  HPL_fprintf( TEST->outfp,       "  NBC " );
             }
          }
       }
diff -Nur hpl/testing/ptest/HPL_pdtest.c hpl-patched/testing/ptest/HPL_pdtest.c
--- hpl/testing/ptest/HPL_pdtest.c	2004-01-22 00:13:29.000000000 -0500
+++ hpl-patched/testing/ptest/HPL_pdtest.c	2006-12-14 10:13:58.000000000 -0500
@@ -235,7 +235,10 @@
       else if( ALGO->btopo == HPL_2RING   ) ctop = '2';
       else if( ALGO->btopo == HPL_2RING_M ) ctop = '3';
       else if( ALGO->btopo == HPL_BLONG   ) ctop = '4';
-      else /* if( ALGO->btopo == HPL_BLONG_M ) */ ctop = '5';
+      else if( ALGO->btopo == HPL_BLONG_M ) ctop = '5';
+      else if( ALGO->btopo == HPL_NATIVE  ) ctop = '6';
+      else if( ALGO->btopo == HPL_NBC  )    ctop = '7';
+      else ctop = '?';
 
       if( wtime[0] > HPL_rzero )
          HPL_fprintf( TEST->outfp,
diff -Nur hpl/TUNING hpl-patched/TUNING
--- hpl/TUNING	2004-01-22 00:13:09.000000000 -0500
+++ hpl-patched/TUNING	2006-12-14 10:02:11.000000000 -0500
@@ -217,13 +217,13 @@
  a good choice. Lines 22-23: (Example 1):
 
  1       # of broadcast
- 1       BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+ 1       BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
  
  This will cause HPL  to broadcast the current panel using the
  increasing ring modified topology. Lines 22-23: (Example 2):
  
  2       # of broadcast
- 0 4     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+ 0 4     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
  
  This will cause  HPL to broadcast the current panel using the
  increasing ring virtual topology and the long message algori-
@@ -359,12 +359,17 @@
  wing for the lines 22-23:
  
  2       # of broadcast
- 1 3     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+ 1 3     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
 
  The best broadcast  depends  on your problem size and harware
  performance. My take is that 4 or 5  may be  competitive  for
  machines  featuring  very  fast nodes  comparatively  to  the 
  network.
+ Two broadcast variants have been added to the original HPL code. Option
+ 6 uses the native MPI_Bcast() to perform the broadcast operation. Option
+ 7 uses the non-blocking broadcast implemented in LibNBC
+ (http://www.unixer.de/NBC) to benefit from overlapping computation and
+ communication.
 
  6) Look-ahead depth: as mentioned above  0 or 1 are likely to 
  be the best choices.  This also  depends  on the problem size
diff -Nur hpl/www/tuning.html hpl-patched/www/tuning.html
--- hpl/www/tuning.html	2004-01-22 00:13:08.000000000 -0500
+++ hpl-patched/www/tuning.html	2006-12-14 10:15:34.000000000 -0500
@@ -252,14 +252,14 @@
 <STRONG>Lines 22-23: (Example 1)</STRONG>
 <TT><PRE>
 1       # of broadcast
-1       BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+1       BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
 </PRE></TT>
 This will cause HPL  to broadcast the current panel using the
 increasing ring modified topology.<BR><BR>
 <STRONG>Lines 22-23: (Example 2)</STRONG>
 <TT><PRE>
 2       # of broadcast
-0 4     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+0 4     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
 </PRE></TT>
 This will cause  HPL to broadcast the current panel using the
 increasing   ring  virtual  topology  and  the  long  message
@@ -394,12 +394,18 @@
 following for the lines 22-23:
 <TT><PRE>
 2       # of broadcast
-1 3     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
+1 3     BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=MPI,7=NBC)
 </PRE></TT>
 The best broadcast  depends  on your problem size and harware
 performance. My take is that 4 or 5  may be  competitive  for
 machines  featuring  very  fast nodes  comparatively  to  the 
-network.<BR><BR>
+network.<BR>
+Two broadcast variants have been added to the original HPL code. Option
+6 uses the native MPI_Bcast() to perform the broadcast operation. Option
+7 uses the non-blocking broadcast implemented in LibNBC
+(http://www.unixer.de/NBC) to benefit from overlapping computation and
+communication.
+<BR><BR>
 
 <LI>Look-ahead depth: as mentioned above 0 or 1 are likely to 
 be the best choices.  This also  depends  on the problem size
