From e0e66b73d16846dd5b4f8b9278a9b2f5474a7456 Mon Sep 17 00:00:00 2001 From: Roland Fehrenbacher Date: Fri, 29 Dec 2017 02:47:37 -0800 Subject: Import infinipath-psm_3.3+20.604758e7.orig.tar.xz [dgit import orig infinipath-psm_3.3+20.604758e7.orig.tar.xz] --- .gitignore | 6 + COPYING | 378 ++++ Makefile | 285 +++ README | 155 ++ buildflags.mak | 98 + doc/Makefile | 40 + include/ipath_byteorder.h | 257 +++ include/ipath_common.h | 892 ++++++++ include/ipath_debug.h | 86 + include/ipath_intf.h | 95 + include/ipath_queue.h | 512 +++++ include/ipath_service.h | 160 ++ include/ipath_udebug.h | 130 ++ include/ipath_user.h | 529 +++++ include/linux-i386/bit_ops.h | 76 + include/linux-i386/sysdep.h | 135 ++ include/linux-ppc/bit_ops.h | 145 ++ include/linux-ppc/sysdep.h | 104 + include/valgrind/memcheck.h | 279 +++ include/valgrind/valgrind.h | 3914 +++++++++++++++++++++++++++++++++ infinipath-psm.spec.in | 163 ++ intel-mic-psm-card.spec.in | 112 + intel-mic-psm.spec.in | 207 ++ ipath-psm-devel.srclist.in | 4 + ipath-psm.srclist.in | 4 + ipath/Makefile | 98 + ipath/ipath_debug.c | 256 +++ ipath/ipath_dwordcpy-generic.c | 78 + ipath/ipath_dwordcpy-i386.S | 62 + ipath/ipath_dwordcpy-ppc64.c | 78 + ipath/ipath_dwordcpy-x86_64-fast.S | 55 + ipath/ipath_dwordcpy-x86_64.c | 78 + ipath/ipath_i2cflash.c | 67 + ipath/ipath_proto.c | 547 +++++ ipath/ipath_protomic.c | 616 ++++++ ipath/ipath_service.c | 1377 ++++++++++++ ipath/ipath_sysfs.c | 752 +++++++ ipath/ipath_syslog.c | 92 + ipath/ipath_time.c | 300 +++ ipath/ipath_utils.c | 597 +++++ ipath/ipath_write_pio-i386.c | 276 +++ ipath/ipath_write_pio-ppc.c | 279 +++ ipath/ipath_write_pio-ppc64.c | 283 +++ ipath/ipath_write_pio-x86_64.c | 325 +++ libuuid/COPYING | 25 + libuuid/ChangeLog | 556 +++++ libuuid/Makefile | 45 + libuuid/clear.c | 44 + libuuid/compare.c | 56 + libuuid/copy.c | 46 + libuuid/gen_uuid.c | 322 +++ libuuid/isnull.c | 49 + libuuid/pack.c | 70 + libuuid/parse.c | 80 + libuuid/psm_uuid.c | 214 ++ libuuid/psm_uuid.h | 39 + libuuid/tst_uuid.c | 168 ++ libuuid/unpack.c | 64 + libuuid/unparse.c | 79 + libuuid/uuid.h | 108 + libuuid/uuidP.h | 77 + libuuid/uuid_time.c | 161 ++ mic-psm-card-devel.srclist.in | 2 + mic-psm-card.srclist.in | 6 + mic-psm-devel.srclist.in | 4 + mic-psm.srclist.in | 5 + mic/etc/sysconfig/mic/conf.d/psm.conf | 2 + mic/opt/intel/mic/psm/psm.filelist.in | 7 + mpspawn/mpspawn_stats.h | 115 + psm.c | 522 +++++ psm.h | 1045 +++++++++ psm.supp | 58 + psm_am.c | 170 ++ psm_am.h | 290 +++ psm_am_internal.h | 66 + psm_context.c | 686 ++++++ psm_context.h | 91 + psm_diags.c | 325 +++ psm_ep.c | 1423 ++++++++++++ psm_ep.h | 273 +++ psm_ep_connect.c | 292 +++ psm_error.c | 316 +++ psm_error.h | 54 + psm_help.h | 143 ++ psm_lock.h | 94 + psm_memcpy.c | 340 +++ psm_mpool.c | 469 ++++ psm_mpool.h | 72 + psm_mq.c | 729 ++++++ psm_mq.h | 600 +++++ psm_mq_internal.h | 484 ++++ psm_mq_recv.c | 546 +++++ psm_mq_utils.c | 402 ++++ psm_noship.h | 57 + psm_stats.c | 649 ++++++ psm_stats.h | 101 + psm_timer.c | 193 ++ psm_timer.h | 133 ++ psm_user.h | 214 ++ psm_utils.c | 1278 +++++++++++ psm_utils.h | 292 +++ psmd/Makefile | 82 + psmd/psmd.c | 758 +++++++ ptl.h | 182 ++ ptl_am/Makefile | 45 + ptl_am/am_reqrep.c | 96 + ptl_am/am_reqrep_shmem.c | 3513 +++++++++++++++++++++++++++++ ptl_am/kcopyrw.h | 50 + ptl_am/kcopyrwu.c | 105 + ptl_am/knemrw.h | 58 + ptl_am/knemrwu.c | 154 ++ ptl_am/psm_am_internal.h | 524 +++++ ptl_am/ptl.c | 375 ++++ ptl_am/ptl_fwd.h | 58 + ptl_am/scifrw.h | 50 + ptl_am/scifrwu.c | 97 + ptl_ips/Makefile | 55 + 
ptl_ips/ips_crc32.c | 91 + ptl_ips/ips_epstate.c | 137 ++ ptl_ips/ips_epstate.h | 83 + ptl_ips/ips_expected_proto.h | 280 +++ ptl_ips/ips_opp_path_rec.c | 444 ++++ ptl_ips/ips_path_rec.c | 660 ++++++ ptl_ips/ips_path_rec.h | 149 ++ ptl_ips/ips_proto.c | 2061 +++++++++++++++++ ptl_ips/ips_proto.h | 701 ++++++ ptl_ips/ips_proto_am.c | 355 +++ ptl_ips/ips_proto_am.h | 71 + ptl_ips/ips_proto_connect.c | 1639 ++++++++++++++ ptl_ips/ips_proto_dump.c | 259 +++ ptl_ips/ips_proto_expected.c | 2489 +++++++++++++++++++++ ptl_ips/ips_proto_header.h | 174 ++ ptl_ips/ips_proto_help.h | 759 +++++++ ptl_ips/ips_proto_internal.h | 70 + ptl_ips/ips_proto_mq.c | 964 ++++++++ ptl_ips/ips_proto_params.h | 204 ++ ptl_ips/ips_proto_recv.c | 1547 +++++++++++++ ptl_ips/ips_recvhdrq.c | 717 ++++++ ptl_ips/ips_recvhdrq.h | 206 ++ ptl_ips/ips_recvq.c | 74 + ptl_ips/ips_recvq.h | 97 + ptl_ips/ips_scb.c | 314 +++ ptl_ips/ips_scb.h | 169 ++ ptl_ips/ips_spio.c | 504 +++++ ptl_ips/ips_spio.h | 85 + ptl_ips/ips_stats.h | 62 + ptl_ips/ips_subcontext.c | 72 + ptl_ips/ips_subcontext.h | 58 + ptl_ips/ips_tid.c | 116 + ptl_ips/ips_tid.h | 99 + ptl_ips/ips_tidflow.c | 184 ++ ptl_ips/ips_tidflow.h | 127 ++ ptl_ips/ips_writehdrq.c | 86 + ptl_ips/ips_writehdrq.h | 236 ++ ptl_ips/ipserror.c | 175 ++ ptl_ips/ipserror.h | 100 + ptl_ips/ptl.c | 860 ++++++++ ptl_ips/ptl_fwd.h | 42 + ptl_ips/ptl_ips.h | 166 ++ ptl_ips/ptl_rcvthread.c | 444 ++++ ptl_self/Makefile | 45 + ptl_self/ptl.c | 299 +++ ptl_self/ptl_fwd.h | 41 + 163 files changed, 54450 insertions(+) create mode 100644 .gitignore create mode 100644 COPYING create mode 100644 Makefile create mode 100644 README create mode 100644 buildflags.mak create mode 100644 doc/Makefile create mode 100644 include/ipath_byteorder.h create mode 100644 include/ipath_common.h create mode 100644 include/ipath_debug.h create mode 100644 include/ipath_intf.h create mode 100644 include/ipath_queue.h create mode 100644 include/ipath_service.h create mode 100644 include/ipath_udebug.h create mode 100644 include/ipath_user.h create mode 100644 include/linux-i386/bit_ops.h create mode 100644 include/linux-i386/sysdep.h create mode 100644 include/linux-ppc/bit_ops.h create mode 100644 include/linux-ppc/sysdep.h create mode 100644 include/valgrind/memcheck.h create mode 100644 include/valgrind/valgrind.h create mode 100644 infinipath-psm.spec.in create mode 100644 intel-mic-psm-card.spec.in create mode 100644 intel-mic-psm.spec.in create mode 100644 ipath-psm-devel.srclist.in create mode 100644 ipath-psm.srclist.in create mode 100644 ipath/Makefile create mode 100644 ipath/ipath_debug.c create mode 100644 ipath/ipath_dwordcpy-generic.c create mode 100644 ipath/ipath_dwordcpy-i386.S create mode 100644 ipath/ipath_dwordcpy-ppc64.c create mode 100644 ipath/ipath_dwordcpy-x86_64-fast.S create mode 100644 ipath/ipath_dwordcpy-x86_64.c create mode 100644 ipath/ipath_i2cflash.c create mode 100644 ipath/ipath_proto.c create mode 100644 ipath/ipath_protomic.c create mode 100644 ipath/ipath_service.c create mode 100644 ipath/ipath_sysfs.c create mode 100644 ipath/ipath_syslog.c create mode 100644 ipath/ipath_time.c create mode 100644 ipath/ipath_utils.c create mode 100644 ipath/ipath_write_pio-i386.c create mode 100644 ipath/ipath_write_pio-ppc.c create mode 100644 ipath/ipath_write_pio-ppc64.c create mode 100644 ipath/ipath_write_pio-x86_64.c create mode 100644 libuuid/COPYING create mode 100644 libuuid/ChangeLog create mode 100644 libuuid/Makefile create mode 100644 libuuid/clear.c create mode 100644 
libuuid/compare.c create mode 100644 libuuid/copy.c create mode 100644 libuuid/gen_uuid.c create mode 100644 libuuid/isnull.c create mode 100644 libuuid/pack.c create mode 100644 libuuid/parse.c create mode 100644 libuuid/psm_uuid.c create mode 100644 libuuid/psm_uuid.h create mode 100644 libuuid/tst_uuid.c create mode 100644 libuuid/unpack.c create mode 100644 libuuid/unparse.c create mode 100644 libuuid/uuid.h create mode 100644 libuuid/uuidP.h create mode 100644 libuuid/uuid_time.c create mode 100644 mic-psm-card-devel.srclist.in create mode 100644 mic-psm-card.srclist.in create mode 100644 mic-psm-devel.srclist.in create mode 100644 mic-psm.srclist.in create mode 100644 mic/etc/sysconfig/mic/conf.d/psm.conf create mode 100644 mic/opt/intel/mic/psm/psm.filelist.in create mode 100644 mpspawn/mpspawn_stats.h create mode 100644 psm.c create mode 100644 psm.h create mode 100644 psm.supp create mode 100644 psm_am.c create mode 100644 psm_am.h create mode 100644 psm_am_internal.h create mode 100644 psm_context.c create mode 100644 psm_context.h create mode 100644 psm_diags.c create mode 100644 psm_ep.c create mode 100644 psm_ep.h create mode 100644 psm_ep_connect.c create mode 100644 psm_error.c create mode 100644 psm_error.h create mode 100644 psm_help.h create mode 100644 psm_lock.h create mode 100644 psm_memcpy.c create mode 100644 psm_mpool.c create mode 100644 psm_mpool.h create mode 100644 psm_mq.c create mode 100644 psm_mq.h create mode 100644 psm_mq_internal.h create mode 100644 psm_mq_recv.c create mode 100644 psm_mq_utils.c create mode 100644 psm_noship.h create mode 100644 psm_stats.c create mode 100644 psm_stats.h create mode 100644 psm_timer.c create mode 100644 psm_timer.h create mode 100644 psm_user.h create mode 100644 psm_utils.c create mode 100644 psm_utils.h create mode 100644 psmd/Makefile create mode 100644 psmd/psmd.c create mode 100644 ptl.h create mode 100644 ptl_am/Makefile create mode 100644 ptl_am/am_reqrep.c create mode 100644 ptl_am/am_reqrep_shmem.c create mode 100644 ptl_am/kcopyrw.h create mode 100644 ptl_am/kcopyrwu.c create mode 100644 ptl_am/knemrw.h create mode 100644 ptl_am/knemrwu.c create mode 100644 ptl_am/psm_am_internal.h create mode 100644 ptl_am/ptl.c create mode 100644 ptl_am/ptl_fwd.h create mode 100644 ptl_am/scifrw.h create mode 100644 ptl_am/scifrwu.c create mode 100644 ptl_ips/Makefile create mode 100644 ptl_ips/ips_crc32.c create mode 100644 ptl_ips/ips_epstate.c create mode 100644 ptl_ips/ips_epstate.h create mode 100644 ptl_ips/ips_expected_proto.h create mode 100644 ptl_ips/ips_opp_path_rec.c create mode 100644 ptl_ips/ips_path_rec.c create mode 100644 ptl_ips/ips_path_rec.h create mode 100644 ptl_ips/ips_proto.c create mode 100644 ptl_ips/ips_proto.h create mode 100644 ptl_ips/ips_proto_am.c create mode 100644 ptl_ips/ips_proto_am.h create mode 100644 ptl_ips/ips_proto_connect.c create mode 100644 ptl_ips/ips_proto_dump.c create mode 100644 ptl_ips/ips_proto_expected.c create mode 100644 ptl_ips/ips_proto_header.h create mode 100644 ptl_ips/ips_proto_help.h create mode 100644 ptl_ips/ips_proto_internal.h create mode 100644 ptl_ips/ips_proto_mq.c create mode 100644 ptl_ips/ips_proto_params.h create mode 100644 ptl_ips/ips_proto_recv.c create mode 100644 ptl_ips/ips_recvhdrq.c create mode 100644 ptl_ips/ips_recvhdrq.h create mode 100644 ptl_ips/ips_recvq.c create mode 100644 ptl_ips/ips_recvq.h create mode 100644 ptl_ips/ips_scb.c create mode 100644 ptl_ips/ips_scb.h create mode 100644 ptl_ips/ips_spio.c create mode 100644 
ptl_ips/ips_spio.h create mode 100644 ptl_ips/ips_stats.h create mode 100644 ptl_ips/ips_subcontext.c create mode 100644 ptl_ips/ips_subcontext.h create mode 100644 ptl_ips/ips_tid.c create mode 100644 ptl_ips/ips_tid.h create mode 100644 ptl_ips/ips_tidflow.c create mode 100644 ptl_ips/ips_tidflow.h create mode 100644 ptl_ips/ips_writehdrq.c create mode 100644 ptl_ips/ips_writehdrq.h create mode 100644 ptl_ips/ipserror.c create mode 100644 ptl_ips/ipserror.h create mode 100644 ptl_ips/ptl.c create mode 100644 ptl_ips/ptl_fwd.h create mode 100644 ptl_ips/ptl_ips.h create mode 100644 ptl_ips/ptl_rcvthread.c create mode 100644 ptl_self/Makefile create mode 100644 ptl_self/ptl.c create mode 100644 ptl_self/ptl_fwd.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5f61dda --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +infinipath-psm.spec +infinipath-psm-*.tar.gz +*.o +*.d +*.so* +_revision.c diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..560cf3a --- /dev/null +++ b/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2007 Cisco, Inc. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary.
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d79c4bd --- /dev/null +++ b/Makefile @@ -0,0 +1,285 @@ +# Copyright (c) 2013 Intel Corporation. All rights reserved. +# Copyright (c) 2006-2011. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE.
+# + +top_srcdir := $(shell pwd) +build_dir ?= $(top_srcdir) +include $(top_srcdir)/buildflags.mak +lib_build_dir := $(build_dir) +ifdef LOCAL_PREFIX + INSTALL_PREFIX := $(LOCAL_PREFIX) +else + INSTALL_PREFIX := /usr +endif +libdir ?= $(INSTALL_PREFIX)/lib64 +sbindir ?= $(INSTALL_PREFIX)/sbin + +INSTALL_LIB_TARG = $(libdir) +INSTALL_SBIN_TARG = $(sbindir) +RPM_BUILD_DIR=${top_srcdir}/rpmbuild +TARG_DIR ?= $(top_srcdir) + +TMI_NAME := tmi-2009-11-20 +TMI_DIR := $(top_srcdir)/contrib/$(TMI_NAME) +TARGLIB := libpsm_infinipath + +SUBDIRS:= ptl_self ptl_ips ptl_am libuuid ipath + +LDLIBS := -linfinipath $(SCIF_LINK_FLAGS) -lrt -lpthread -ldl ${EXTRA_LIBS} + +# Library version information +PSM_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h) +PSM_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h) +PSM_LIB_MAJOR := $(shell printf "%d" ${PSM_VERNO_MAJOR}) +PSM_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h`) +IPATH_LIB_MAJOR := 4 +IPATH_LIB_MINOR := 0 +MAJOR := $(PSM_LIB_MAJOR) +MINOR := $(PSM_LIB_MINOR) + +# The desired version number comes from the most recent tag starting with "v" +VERSION := $(shell if [ -d .git ] ; then git describe --tags --abbrev=0 --match='v*' | sed -e 's/^v//' -e 's/-/_/'; else echo "version" ; fi) + +# The desired release number comes from the git describe following the version, which +# is the number of commits since the version tag was planted, suffixed by the g-prefixed commit hash +RELEASE := $(shell if [ -d .git ] ; then git describe --tags --long --match='v*' | sed -e 's/v[0-9.]*-\(.*\)/\1_open/' -e 's/-/_/'; else echo "release" ; fi) + +VERSION_RELEASE := $(VERSION)-$(RELEASE) + +# Try to figure out which libuuid to use. This needs to be +# done before we include buildflags.mak +PSM_USE_SYS_UUID=0 +ifneq (1,${USE_PSM_UUID}) + # Check whether the uuid header file is present. The header file is + # installed by the -devel package, which should have a dependency + # on the package which installs the library.
+ PSM_HAVE_UUID_H=$(shell if [ -f /usr/include/uuid/uuid.h ]; then echo 1; else echo 0; fi) + ifeq (1,${PSM_HAVE_UUID_H}) + SYS_UUID_RPM_NAME=$(shell rpm -qf --qf "%{NAME} = %{VERSION}-%{RELEASE}" /usr/include/uuid/uuid.h) + PSM_USE_SYS_UUID=1 + endif +endif + +# Build the daemon only if SCIF headers are found and we are building for the host +SUBDIRS += $(and $(MIC:1=),$(PSM_HAVE_SCIF:0=),psmd) + +ifneq (x86_64,$(arch)) + ifneq (i386,$(arch)) + $(error Unsupported architecture $(arch)) + endif +endif + +export top_srcdir build_srcdir TMI_NAME TMI_DIR PSM_VERNO_MAJOR PSM_LIB_MAJOR \ + PSM_VERNO_MINOR PSM_LIB_MINOR IPATH_LIB_MAJOR IPATH_LIB_MINOR PSM_USE_SYS_UUID \ + DESTDIR INSTALL_SBIN_TARG INSTALL_LIB_TARG PSM_HAVE_SCIF + +${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \ + ptl_am/am_reqrep.o \ + ptl_am/ptl.o \ + ptl_am/kcopyrwu.o \ + ptl_am/knemrwu.o \ + ptl_am/scifrwu.o \ + psm_context.o \ + psm_ep.o \ + psm_ep_connect.o \ + psm_error.o \ + psm_utils.o \ + psm_timer.o \ + psm_am.o \ + psm_mq.o \ + psm_mq_utils.o \ + psm_mq_recv.o \ + psm_mpool.o \ + psm_stats.o \ + psm_memcpy.o \ + psm.o \ + libuuid/psm_uuid.o \ + ptl_ips/ptl.o \ + ptl_ips/ptl_rcvthread.o \ + ptl_ips/ipserror.o \ + ptl_ips/ips_scb.o \ + ptl_ips/ips_epstate.o \ + ptl_ips/ips_recvq.o \ + ptl_ips/ips_recvhdrq.o \ + ptl_ips/ips_spio.o \ + ptl_ips/ips_proto.o \ + ptl_ips/ips_proto_recv.o \ + ptl_ips/ips_proto_connect.o \ + ptl_ips/ips_proto_expected.o \ + ptl_ips/ips_tid.o \ + ptl_ips/ips_crc32.o \ + ptl_ips/ips_tidflow.o \ + ptl_ips/ips_proto_dump.o \ + ptl_ips/ips_proto_mq.o \ + ptl_ips/ips_proto_am.o \ + ptl_ips/ips_subcontext.o \ + ptl_ips/ips_path_rec.o \ + ptl_ips/ips_opp_path_rec.o \ + ptl_ips/ips_writehdrq.o \ + ptl_self/ptl.o \ + psm_diags.o + +all: libs + +libs: symlinks + for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir ;\ + done + $(MAKE) ${TARGLIB}.so + +clean: + rm -f _revision.c + for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@ ;\ + done + rm -f *.o ${TARGLIB}.* + +distclean: cleanlinks clean + rm -f *.spec *.srclist + rm -f *.tar.gz + +.PHONY: symlinks +symlinks: + @[[ -L $(build_dir)/include/linux-ppc64 ]] || \ + ln -sf linux-ppc $(build_dir)/include/linux-ppc64 + @[[ -L $(build_dir)/include/linux-x86_64 ]] || \ + ln -sf linux-i386 $(build_dir)/include/linux-x86_64 + +cleanlinks: + rm -f $(build_dir)/include/linux-ppc64 + rm -f $(build_dir)/include/linux-x86_64 + +install: all + for subdir in $(SUBDIRS); do \ + $(MAKE) -i -C $$subdir $@ ;\ + done + install -D ${TARGLIB}.so.${MAJOR}.${MINOR} \ + $(DESTDIR)${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR} + (cd $(DESTDIR)${INSTALL_LIB_TARG} ; \ + ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \ + ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so) ; \ + if [ X$(MIC) != X1 ]; then \ + install -D psm.h ${DESTDIR}/usr/include/psm.h ; \ + install -D psm_mq.h ${DESTDIR}/usr/include/psm_mq.h ; \ + else \ + filelist=/opt/intel/mic/psm/psm.filelist ; \ + sed -e 's!%IPATHMAJOR%!$(IPATH_LIB_MAJOR)!g' \ + -e 's!%IPATHMINOR%!$(IPATH_LIB_MINOR)!g' \ + -e 's!%PSMMAJOR%!$(MAJOR)!g' \ + -e 's!%PSMMINOR%!$(MINOR)!g' \ + mic$$filelist.in > mic$$filelist ; \ + install -D mic/$$filelist ${DESTDIR}$$filelist ; \ + rm -f mic$$filelist ; \ + fi + +tmi: libs + $(MAKE) -C contrib/$(TMI_NAME) verbs=PSM +tmiclean: + $(MAKE) -C contrib/$(TMI_NAME) verbs=PSM clean + + +.PHONY: infinipath-psm.spec +infinipath-psm.spec: infinipath-psm.spec.in + sed -e 's/@VERSION@/'${VERSION}'/g' -e 's/@RELEASE@/'${RELEASE}'/g' $< > $@ + if [ X$(MIC) != X1 ]; then \ + if [ 
X$(PSM_USE_SYS_UUID) = X1 ]; then \ + REQUIRES="Requires: $(shell echo $(SYS_UUID_RPM_NAME) | sed -e 's/-devel//')" ; \ + REQUIRESDEVEL="Requires: $(SYS_UUID_RPM_NAME)" ; \ + fi ; \ + [ -n "$${REQUIRES}" ] && \ + sed -i -e 's%@REQUIRES@%'"$${REQUIRES}"'%g' -e 's/@PSM_UUID@//g' $@ || \ + sed -i -e '/@REQUIRES@/d' -e 's/@PSM_UUID@/USE_PSM_UUID=1/g' $@ ; \ + [ -n "$${REQUIRESDEVEL}" ] && \ + sed -i -e 's%@REQUIRES-DEVEL@%'"$$REQUIRESDEVEL"'%g' $@ || \ + sed -i -e '/@REQUIRES-DEVEL@/d' $@ ; \ + else \ + sed -i -e '/@REQUIRES@/d' \ + -e '/@REQUIRES-DEVEL@/d' \ + -e 's/@PSM_UUID@/USE_PSM_UUID=1/g' $@ ; \ + fi +dist: distclean infinipath-psm.spec + rm -rf $(RPM_BUILD_DIR) + mkdir -p infinipath-psm-${VERSION_RELEASE} + for x in $$(/usr/bin/find . -name ".git" -prune -o \ + -name "cscope*" -prune -o \ + -name "*.spec.in" -prune -o \ + -name "infinipath-psm-${VERSION_RELEASE}" -prune -o \ + -name "*.orig" -prune -o \ + -name "*~" -prune -o \ + -name "#*" -prune -o \ + -name "*.rpm" -prune -o \ + -name "build" -prune -o \ + -name ".gitignore" -prune -o \ + -print); do \ + dir=$$(dirname $$x); \ + mkdir -p infinipath-psm-${VERSION_RELEASE}/$$dir; \ + [ ! -d $$x ] && cp $$x infinipath-psm-${VERSION_RELEASE}/$$dir; \ + done ; \ + if [ -d .git ] ; then git log -n1 --pretty=format:%H > \ + infinipath-psm-${VERSION_RELEASE}/COMMIT ; fi + tar czvf infinipath-psm-${VERSION_RELEASE}.tar.gz infinipath-psm-${VERSION_RELEASE} + rm -rf infinipath-psm-${VERSION_RELEASE} + +ofeddist: + USE_PSM_UUID=1 $(MAKE) dist + + +# rebuild the cscope database, skipping sccs files, done once for +# top level +cscope: + find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \ + -iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i - + +${TARGLIB}.so: ${TARGLIB}.so.${MAJOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +${TARGLIB}.so.${MAJOR}: ${TARGLIB}.so.${MAJOR}.${MINOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +# when we build the shared library, generate a revision and date +# string in it, for easier id'ing when people may have copied the +# file around. Generate it such that the ident command can find it +# and strings -a | grep InfiniPath does a reasonable job as well. +${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} + date +'char psmi_infinipath_revision[] ="$$""Date: %F %R ${rpm_extra_description}InfiniPath $$";' > ${lib_build_dir}/_revision.c + $(CC) -c $(BASECFLAGS) $(INCLUDES) _revision.c -o _revision.o + $(CC) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared -Wl,--unique='*fastpath*' \ + ${${TARGLIB}-objs} _revision.o -L$(build_dir)/ipath $(LDLIBS) + @leaks=`nm $@ | grep ' [DT] ' | \ + grep -v -e ' [DT] \(_edata\|_fini\|_init\|infinipath_\|ips_\|psmi\|__psm_\|__psmi_\|_rest.pr\|_save.pr\|kcopy\|knem\|scif\)'`; \ + if test -n "$$leaks"; then echo "Build failed, leaking symbols:"; echo "$$leaks"; exit 1; fi + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +.PHONY: $(SUBDIRS) + diff --git a/README b/README new file mode 100644 index 0000000..505a973 --- /dev/null +++ b/README @@ -0,0 +1,155 @@ + + Copyright (c) 2013-2014, Intel Corporation. All rights reserved. + Copyright (c) 2006-2011. QLogic Corporation. All rights reserved. + Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + + This software is available to you under a choice of one of two + licenses. 
You may choose to be licensed under the terms of the GNU + General Public License (GPL) Version 2, available from the file + COPYING in the main directory of this source tree, or the + OpenIB.org BSD license below: + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the following + conditions are met: + + - Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + - Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +================================================================================ + +OFED Support +------------ +OFED 3.5 or above should be installed on the node. Prior versions of OFED +have an older QLogic IB driver (ib_qib) and do not fully support all the PSM +features in this release. + +Building PSM +------------ +Building PSM requires that the GNU GCC compiler is installed on the machine +doing the build. If compiling code for MIC, SCIF must be present. Root privileges +are required to install the runtime libraries and development header files +into the standard system location. + +Building from Makefile +---------------------- +1. Untar the tarball: + $ tar zxvf infinipath-psm-$PRODUCT-$RELEASE.tar.gz +2. Change directory into the untarred location: + $ cd infinipath-psm-$PRODUCT-$RELEASE +3. Run make on the command line. This will build the libraries. By default, + the Makefile will auto-detect whether libuuid and the uuid.h header file + are installed. If so, it will use the system's libuuid. Otherwise, PSM + will be compiled with the libuuid included with PSM. + $ make + + The Makefile will attempt to detect if SCIF is present, and if found, it will + build the SCIF-enabled variant by default. Auto-detection of SCIF can be + overridden by setting the PSM_HAVE_SCIF variable explicitly. + + To specify the SCIF-enabled version, set the PSM_HAVE_SCIF variable: + $ make PSM_HAVE_SCIF=1 + To specify the non-SCIF version, even if SCIF is present, clear the variable: + $ make PSM_HAVE_SCIF=0 + + To force compiling with the included libuuid, use the USE_PSM_UUID variable: + $ make USE_PSM_UUID=1 + + (PSM_HAVE_SCIF and USE_PSM_UUID may be used in conjunction) + +4. Install the libraries and header files on the system (as root): + $ make install + +The libraries will be installed in either /usr/lib or /usr/lib64, depending on +the architecture of the machine, and the header files will be installed in +/usr/include. +This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on +the "make install" command line. "DESTDIR" will add a leading path component +to the overall install path and "LIBDIR" will change the path where libraries +will be installed.
For example, "make DESTDIR=/tmp/psm-install install" will +install all files (libraries and headers) into "/tmp/psm-install/usr/...", +"make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the +libraries in "/tmp/psm-install/libraries" and the headers in +"/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will +install the libraries in "/tmp/libs" and the headers in "/usr/include". + +MPI Libraries supported +----------------------- +A large number of open source (OpenMPI, MVAPICH, MVAPICH2) and Vendor MPI +implementations support PSM for optimized communication on QLogic Truescale +Infiniband HCAs. Vendor MPI implementations (HP-MPI, Intel MPI 4.0 with PMI, +Platform/Scali MPI) require that the PSM runtime libraries be installed and +available on each node. Usually a configuration file or a command line switch +to mpirun needs to be specified to utilize the PSM transport. + +OpenMPI support +--------------- +It is recommended to use the OpenMPI v1.5 development branch. Prior versions +of OpenMPI have an issue with support PSM network transports mixed with standard +Verbs transport (BTL openib). This prevents an OpenMPI installation with +network modules available for PSM and Verbs to work correctly on nodes with +no QLogic IB hardware. This has been fixed in the latest development branch +allowing a single OpenMPI installation to target IB hardware via PSM or Verbs +as well as alternate transports seamlessly. + +PSM header and runtime files need to be installed on a node where the OpenMPI +build is performed. All compute nodes additionally should have the PSM runtime +libraries available on them. OpenMPI provides a standard configure, make and +make install mechanism which will detect and build the relevant PSM network +modules for OpenMPI once the header and runtime files are detected. Further +information on compiling and running MPI applications with OpenMPI on PSM is +available in the QLogic OFED User guide available at: + +http://driverdownloads.qlogic.com/QLogicDriverDownloads_UI/SearchByProduct.aspx?ProductCategory=301&Product=1116&Os=65 + +MVAPICH and MVAPICH2 support +---------------------------- +Both MVAPICH and MVAPICH2 support PSM transport for optimized communication on +QLogic Truescale IB hardware. MVAPICH2 1.4 and MVAPICH 1.2 versions are +recommended. PSM header and runtime files need to be installed on a node where +MVAPICH builds are performed. All compute nodes additionally should have the +PSM runtime libraries available on them. + +MVAPICH provides a shell script in it's top level directory called +make.mvapich.psm to configur, make and install MVAPICH with PSM support. + +MVAPICH2 provides a standard configure and make infrastructure. In order to +MVAPICH2 for PSM the following should be performed from the top level directory: + + - ./configure --prefix= --with-device=ch3:psm + - make + - make install + +Further information on compiling and running MPI applications with MVAPICH on +PSM is available in the QLogic OFED User guide available at: + +http://driverdownloads.qlogic.com/QLogicDriverDownloads_UI/SearchByProduct.aspx?ProductCategory=301&Product=1116&Os=65 + +Submitting comments, bugs, questions +------------------------------------ +The best way to report bugs, send comments or ask questions is to sign up to the +developers mailing list: psm-devel@qlogic.com. Because of spam only subscribers +are allowed to post to the list. 
Please ensure that you subscribe with and post +from the same email address, or else posts will be blocked as spam. To subscribe, +send the following in the BODY of an email to majordomo@qlogic.com: + + subscribe psm-devel + +Majordomo will reply with instructions on how to confirm your subscription. +The mailing list can be used to report bugs, send comments and ask questions. + diff --git a/buildflags.mak b/buildflags.mak new file mode 100644 index 0000000..34fdf1c --- /dev/null +++ b/buildflags.mak @@ -0,0 +1,98 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. +# Copyright (c) 2003, 2004, 2005 PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +# set top_srcdir and include this file + +ifeq (,$(top_srcdir)) +$(error top_srcdir must be set to include makefile fragment) +endif + +export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') +export arch := $(shell uname -p | sed -e 's,\(i[456]86\|athlon$$\),i386,') + +CC ?= gcc + +SCIF_LINK_FLAGS := +SCIF_INCLUDE_FLAGS := + +compiler_arch := $(shell $(CC) -dumpmachine || echo "none") +ifeq ($(compiler_arch),none) +$(error Could not determine compiler arch for $(CC)) +endif +MIC := $(if $(findstring k1om,$(compiler_arch)),1,0) + +# If SCIF_ROOT_DIR is set, we should assume using SCIF +# If SCIF_INCLUDE_FLAGS is set, we should assume using SCIF +# If /usr/include/scif.h exists, we should assume using SCIF + +ifdef SCIF_ROOT_DIR + SCIF_LINK_FLAGS := -L$(SCIF_ROOT_DIR)/source-root/k1om-hybrid/$(if $(MIC:0=),card,host)/scif_lib #-lscif + SCIF_INCLUDE_FLAGS := -I$(SCIF_ROOT_DIR)/source-root/k1om-hybrid/include +endif + +PSM_HAVE_SCIF ?= $(shell printf '\#include <scif.h>\nint main(void){return(0);}\n' | \ + $(CC) $(CFLAGS) $(LDFLAGS) -x c - -o /dev/null &> /dev/null && echo 1 || echo 0) + +ifeq (1,$(PSM_HAVE_SCIF)) + SCIF_INCLUDE_FLAGS += -DPSM_HAVE_SCIF=1 + SCIF_LINK_FLAGS += -lscif +endif + +WERROR := -Werror +INCLUDES := -I.
-I$(top_srcdir)/include -I$(top_srcdir)/mpspawn \ + -I$(top_srcdir)/include/$(os)-$(arch) $(SCIF_INCLUDE_FLAGS) +BASECFLAGS += $(BASE_FLAGS) $(if $(MIC:0=),$(if $(filter $(CC),icc),-mmic,-D__MIC__)) \ + -Wall $(WERROR) $(if $(MIC:0=),-Wno-unused) -fpic -fPIC -D_GNU_SOURCE \ + $(if $(filter $(CC),icc),,-funwind-tables) $(if $(PSM_PROFILE:0=),-DPSM_PROFILE) \ + ${IPATH_CFLAGS} +ASFLAGS += $(BASE_FLAGS) $(if $(MIC:0=),$(if $(filter $(CC),icc),-mmic,-D__MIC__)) -g3 -fpic + +LDFLAGS += $(SCIF_LINK_FLAGS) + +# If linker flags are needed, uncomment the line below and set flags +#LDFLAGS += + +ifneq (,${PSM_DEBUG}) + BASECFLAGS += -O -g3 -DPSM_DEBUG $(if $(filter $(CC),icc),,-funit-at-a-time) \ + -Wp,-D_FORTIFY_SOURCE=2 +else + BASECFLAGS += -O3 -g3 +endif +ifeq (1,${PSM_USE_SYS_UUID}) + BASECFLAGS += -DPSM_USE_SYS_UUID + EXTRA_LIBS = -luuid +endif + +CFLAGS += $(BASECFLAGS) $(if $(filter $(CC),gcc),-Wno-strict-aliasing) \ + $(if $(PSM_VALGRIND:0=),-DPSM_VALGRIND,-DNVALGRIND) + diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..dba53ee --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +ifeq (,$(build_dir)) +$(error build_dir must be set) +endif + +top_srcdir := .. + + + diff --git a/include/ipath_byteorder.h b/include/ipath_byteorder.h new file mode 100644 index 0000000..d5cd40d --- /dev/null +++ b/include/ipath_byteorder.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ipath_byteorder_h +#define ipath_byteorder_h + +#ifdef __cplusplus + extern "C" { +#endif + +#include <endian.h> +#include <linux/types.h> + +#ifndef __BYTE_ORDER +# error "BYTE_ORDER undefined" +#endif + +typedef __u16 __le16; +typedef __u16 __be16; +typedef __u32 __le32; +typedef __u32 __be32; +typedef __u64 __le64; +typedef __u64 __be64; + +static __inline__ __u16 __ipath_fswab16(__u16) __attribute__ ((always_inline)); +static __inline__ __u32 __ipath_fswab32(__u32) __attribute__ ((always_inline)); +static __inline__ __u64 __ipath_fswab64(__u64) __attribute__ ((always_inline)); + +static __inline__ __u16 __ipath_fswab16(__u16 x) +{ + return ((x & (__u16)0x00ffU) << 8) + | ((x & (__u16)0xff00U) >> 8); +} + +static __inline__ __u32 __ipath_fswab32(__u32 x) +{ + return ((x & (__u32)0x000000ffUL) << 24) + | ((x & (__u32)0x0000ff00UL) << 8) + | ((x & (__u32)0x00ff0000UL) >> 8) + | ((x & (__u32)0xff000000UL) >> 24); +} + +static __inline__ __u64 __ipath_fswab64(__u64 x) +{ + return ((x & (__u64)0x00000000000000ffULL) << 56) + | ((x & (__u64)0x000000000000ff00ULL) << 40) + | ((x & (__u64)0x0000000000ff0000ULL) << 24) + | ((x & (__u64)0x00000000ff000000ULL) << 8) + | ((x & (__u64)0x000000ff00000000ULL) >> 8) + | ((x & (__u64)0x0000ff0000000000ULL) >> 24) + | ((x & (__u64)0x00ff000000000000ULL) >> 40) + | ((x & (__u64)0xff00000000000000ULL) >> 56); +} + +static __inline__ __u16 __cpu_to_le16(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le32(__le32) __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_le64(__le64) __attribute__ ((always_inline)); + +static __inline__ __u16 __le16_to_cpu(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __le32_to_cpu(__le32) __attribute__ ((always_inline)); +static __inline__ __u64 __le64_to_cpu(__le64) __attribute__ ((always_inline)); + +static __inline__ __u16 __cpu_to_be16(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be32(__be32) __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_be64(__be64) __attribute__ ((always_inline)); + +static __inline__ __u16 __be16_to_cpu(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __be32_to_cpu(__be32) __attribute__ ((always_inline)); +static __inline__ __u64 __be64_to_cpu(__be64) __attribute__
((always_inline)); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) +{ + return x; +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) +{ + return x; +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) +{ + return x; +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) +{ + return x; +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) +{ + return x; +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) +{ + return x; +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) +{ + return __ipath_fswab64(x); +} + +#elif __BYTE_ORDER == __BIG_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) +{ + return x; +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) +{ + return x; +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) +{ + return x; +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) +{ + return x; +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) +{ + return x; +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) +{ + return x; +} + +#else +# error "unsupported BYTE_ORDER: " #BYTE_ORDER +#endif + +#ifdef __cplusplus + } // extern "C" +#endif + +#endif // ipath_byteorder_h diff --git a/include/ipath_common.h b/include/ipath_common.h new file mode 100644 index 0000000..8bf9986 --- /dev/null +++ b/include/ipath_common.h @@ -0,0 +1,892 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+#include <linux/types.h>
+#ifndef __KERNEL__
+// Pointer annotations used by the "sparse" checker tool.
+#define __iomem
+#include "ipath_byteorder.h"
+#endif
+/* END_NOSHIP_TO_OPENIB */
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN 0
+#define IPATH_IB_LINKARM 1
+#define IPATH_IB_LINKACTIVE 2
+#define IPATH_IB_LINKINIT 3
+#define IPATH_IB_LINKDOWN_SLEEP 4
+#define IPATH_IB_LINKDOWN_DISABLE 5
+#define IPATH_IB_LINK_LOOPBACK 6    /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL 7    /* normal, disable local loopback */
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED 0x1    /* basic initialization done */
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY 0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF 0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE 0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR 0x200
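+
+/*
+ * Example (illustrative): user code that has read the 64-bit status
+ * qword at spi_status can test these bits with plain mask operations:
+ *
+ *   __u64 ready = IPATH_STATUS_INITTED | IPATH_STATUS_IB_READY;
+ *   if ((status & ready) == ready)
+ *       ... chip is initialized and the IB link is usable ...
+ */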
+
+/*
+ * The list of usermode accessible registers. Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+    /* (RO) DMA RcvHdr to be used next. */
+    ur_rcvhdrtail = 0,
+    /* (RW) RcvHdr entry to be processed next by host. */
+    ur_rcvhdrhead = 1,
+    /* (RO) Index of next Eager index to use. */
+    ur_rcvegrindextail = 2,
+    /* (RW) Eager TID to be processed next */
+    ur_rcvegrindexhead = 3,
+    /* For internal use only; max register number (Shared contexts). */
+    _IPATH_UregMax = 4,
+    /* (RW) RcvTIDFlow table for expected sends in QLE73XX */
+    ur_rcvtidflow = 512
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_PCIE 0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4
+#define IPATH_RUNTIME_RCVHDR_COPY 0x8
+#define IPATH_RUNTIME_MASTER 0x10
+#define IPATH_RUNTIME_RCHK 0x20
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SPECIAL_TRIGGER 0x100
+#define IPATH_RUNTIME_SDMA 0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+/*
+ * MEA: below means chip expects 7322-style context/qp mapping,
+ * not 7220-style. This needs work, because we actually care what
+ * the remote chip uses, not what the local chip uses, other
+ * than to somehow tell the remote endpoint.
+ */
+#define IPATH_RUNTIME_CTXT_MSB_IN_QP 0x1000
+#define IPATH_RUNTIME_CTXT_REDIRECT 0x2000
+#define IPATH_RUNTIME_HDRSUPP 0x4000
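+
+/*
+ * Example (illustrative): these bits are tested against the
+ * spi_runtime_flags word of struct ipath_base_info, defined below
+ * (binfo is an assumed pointer to the filled-in structure):
+ *
+ *   if (binfo->spi_runtime_flags & IPATH_RUNTIME_SDMA)
+ *       ... this context supports send DMA ...
+ */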
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+    /* version of hardware, for feature checking. */
+    __u32 spi_hw_version;
+    /* version of software, for feature checking. */
+    __u32 spi_sw_version;
+    /* InfiniPath context assigned, goes into sent packets */
+    __u16 spi_context;
+    __u16 spi_subcontext;
+    /*
+     * IB MTU, packets IB data must be less than this.
+     * The MTU is in bytes, and will be a multiple of 4 bytes.
+     */
+    __u32 spi_mtu;
+    /*
+     * Size of a PIO buffer in bytes. Any given packet's total size must
+     * be less than this. Included is the starting control word, so
+     * if 2052 is returned, then total pkt size is 2048 bytes or less.
+     */
+    __u32 spi_piosize;
+    /* size of the TID cache in infinipath, in entries */
+    __u32 spi_tidcnt;
+    /* size of the TID Eager list in infinipath, in entries */
+    __u32 spi_tidegrcnt;
+    /* size of a single receive header queue entry in words. */
+    __u32 spi_rcvhdrent_size;
+    /*
+     * Count of receive header queue entries allocated.
+     * This may be less than the spu_rcvhdrcnt passed in!
+     */
+    __u32 spi_rcvhdr_cnt;
+
+    /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+    __u32 spi_runtime_flags;
+
+    /* address where receive buffer queue is mapped into */
+    __u64 spi_rcvhdr_base;
+
+    /* user program. */
+
+    /* base address of eager TID receive buffers. */
+    __u64 spi_rcv_egrbufs;
+
+    /* Allocated by initialization code, not by protocol. */
+
+    /*
+     * Size of each TID buffer in host memory, starting at
+     * spi_rcv_egrbufs. The buffers are virtually contiguous.
+     */
+    __u32 spi_rcv_egrbufsize;
+    /*
+     * The special QP (queue pair) value that identifies an infinipath
+     * protocol packet from standard IB packets. More, probably much
+     * more, to be added.
+     */
+    __u32 spi_qpair;
+
+    /*
+     * User register base for init code, not to be used directly by
+     * protocol or applications. Always maps real chip register space.
+     */
+    __u64 spi_uregbase;
+
+    /*
+     * Maximum buffer size in bytes that can be used in a single TID
+     * entry (assuming the buffer is aligned to this boundary). This is
+     * the minimum of what the hardware and software support. Guaranteed
+     * to be a power of 2.
+     */
+    __u32 spi_tid_maxsize;
+    /*
+     * alignment of each pio send buffer (byte count
+     * to add to spi_piobufbase to get to second buffer)
+     */
+    __u32 spi_pioalign;
+    /*
+     * The index of the first pio buffer available to this process;
+     * needed to do lookup in spi_pioavailaddr; not added to
+     * spi_piobufbase.
+     */
+    __u32 spi_pioindex;
+    /* number of buffers mapped for this process */
+    __u32 spi_piocnt;
+
+    /*
+     * Base address of writeonly pio buffers for this process.
+     * Each buffer has spi_piosize bytes, and is aligned on spi_pioalign
+     * boundaries. spi_piocnt buffers are mapped from this address
+     */
+    __u64 spi_piobufbase;
+
+    /*
+     * Base address of readonly memory copy of the pioavail registers.
+     * There are 2 bits for each buffer.
+     */
+    __u64 spi_pioavailaddr;
+
+    /*
+     * Address where driver updates a copy of the interface and driver
+     * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a
+     * link status qword (formerly combined with driver status), then a
+     * string indicating hardware error, if there was one.
+     */
+    __u64 spi_status;
+
+    /* number of chip contexts available to user processes */
+    __u32 spi_ncontexts;
+    __u16 spi_unit;    /* unit number of chip we are using; */
+    __u16 spi_port;    /* IB port number we are using for send */
+    /* num bufs in each contiguous set */
+    __u32 spi_rcv_egrperchunk;
+    /* size in bytes of each contiguous set */
+    __u32 spi_rcv_egrchunksize;
+    /* total size of mmap to cover full rcvegrbuffers */
+    __u32 spi_rcv_egrbuftotlen;
+    __u32 spi_rhf_offset;    /* dword offset in hdrqent for rcvhdr flags */
+    /* address of readonly memory copy of the rcvhdrq tail register. */
+    __u64 spi_rcvhdr_tailaddr;
+
+    /*
+     * shared memory pages for subctxts if ctxt is shared; these cover
+     * all the processes in the group sharing a single context.
+     * all have enough space for the num_subcontexts value on this job.
+     */
+    __u64 spi_subctxt_uregbase;
+    __u64 spi_subctxt_rcvegrbuf;
+    __u64 spi_subctxt_rcvhdr_base;
+
+    /* shared memory page for send buffer disarm status */
+    __u64 spi_sendbuf_status;
+} __attribute__ ((aligned(8)));
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way. The driver must be the same or higher
+ * for initialization to succeed. In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version; however, if the user software is newer
+ * than the driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 13
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
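+
+/*
+ * Example (illustrative): with the values above, IPATH_USER_SWVERSION is
+ * 0x1000d -- major in the high 16 bits, minor in the low 16 bits:
+ *
+ *   major = IPATH_USER_SWVERSION >> 16;      == 1
+ *   minor = IPATH_USER_SWVERSION & 0xffff;   == 13
+ */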
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+#ifndef IPATH_KERN_TYPE
+/* END_NOSHIP_TO_OPENIB */
+#define IPATH_KERN_TYPE 0
+/* BEGIN_NOSHIP_TO_OPENIB */
+#endif
+/* END_NOSHIP_TO_OPENIB */
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or is the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+ */
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
+
+/*
+ * If the unit is specified via open, HCA choice is fixed. If port is
+ * specified, it's also fixed. Otherwise we try to spread contexts
+ * across ports and HCAs, using different algorithms. WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define IPATH_PORT_ALG_ACROSS 0    /* round robin contexts across HCAs, then
+                                    * ports; this is the default */
+#define IPATH_PORT_ALG_WITHIN 1    /* use all contexts on an HCA (round robin
+                                    * active ports within), then next HCA */
+#define IPATH_PORT_ALG_COUNT 2    /* number of algorithm choices */
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc. The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility. It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+    /*
+     * version of user software, to detect compatibility issues.
+     * Should be set to IPATH_USER_SWVERSION.
+     */
+    __u32 spu_userversion;
+
+    __u32 _spu_scif_nodeid;    /* used for mic processes */
+
+    /* size of struct base_info to write to */
+    __u32 spu_base_info_size;
+
+    __u32 spu_port_alg;    /* which IPATH_PORT_ALG_*; unused user minor < 11 */
+
+    /*
+     * If two or more processes wish to share a context, each process
+     * must set the spu_subcontext_cnt and spu_subcontext_id to the same
+     * values. The only restriction on the spu_subcontext_id is that
+     * it be unique for a given node.
+     */
+    __u16 spu_subcontext_cnt;
+    __u16 spu_subcontext_id;
+
+    __u32 spu_port;    /* IB port requested by user if > 0 */
+
+    /*
+     * address of struct base_info to write to
+     */
+    __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
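+
+/*
+ * Sketch (illustrative, not from the driver sources): two processes that
+ * want to share one context would each initialize the struct the same way:
+ *
+ *   struct ipath_user_info ui = { 0 };
+ *   ui.spu_userversion = IPATH_USER_SWVERSION;
+ *   ui.spu_subcontext_cnt = 2;    same value in both processes
+ *   ui.spu_subcontext_id = 7;     same id, unique on this node
+ */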
+
+/* User commands. */
+
+#define __IPATH_CMD_USER_INIT 16    /* old set up userspace */
+#define IPATH_CMD_CTXT_INFO 17    /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL 18    /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE 19    /* update expected TID entries */
+#define IPATH_CMD_TID_FREE 20    /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21    /* add partition key */
+#define __IPATH_CMD_SLAVE_INFO 22    /* return info on slave processes */
+#define IPATH_CMD_ASSIGN_CONTEXT 23    /* allocate HCA and context (or port, historically) */
+#define IPATH_CMD_USER_INIT 24    /* set up userspace */
+#define IPATH_CMD_PIOAVAILCHK 25    /* check if pio send stuck */
+#define IPATH_CMD_TIDCHKFIX 26    /* check expected tid, and fixup */
+#define IPATH_CMD_PIOAVAILUPD 27    /* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE 28    /* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL 29    /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31    /* latest sdma inflight count */
+#define IPATH_CMD_SDMA_COMPLETE 32    /* try to complete pending sdma */
+/* CMD 33 is available (used to be to enable backpressure). Removed in IFS 5.1 */
+#define IPATH_CMD_DISARM_BUFS 34    /* disarm send buffers w/ errors */
+#define IPATH_CMD_ACK_EVENT 35    /* ack & clear bits *spi_sendbuf_status */
+/* MIC to set up memory with mic driver */
+#define IPATH_CMD_MIC_MEM_INFO 41    /* mic memory setup operation */
+
+/*
+ * IPATH_CMD_ACK_EVENT obsoletes IPATH_CMD_DISARM_BUFS, but we keep it for
+ * compatibility with libraries from previous release. The ACK_EVENT
+ * will take appropriate driver action (if any, just DISARM for now),
+ * then clear the bits passed in as part of the mask. These bits are
+ * in the first 64bit word at spi_sendbuf_status, and are passed to
+ * the driver in the event_mask member of struct ipath_cmd below.
+ */
+#define IPATH_EVENT_DISARM_BUFS (1ULL << 0)
+#define IPATH_EVENT_LINKDOWN (1ULL << 1)
+#define IPATH_EVENT_LID_CHANGE (1ULL << 2)
+#define IPATH_EVENT_LMC_CHANGE (1ULL << 3)
+#define IPATH_EVENT_SL2VL_CHANGE (1ULL << 4)
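+
+/*
+ * Example (illustrative): acknowledging a disarm event through the
+ * struct ipath_cmd defined later in this file (written to the device,
+ * e.g. with ipath_cmd_write() from ipath_service.h):
+ *
+ *   struct ipath_cmd c;
+ *   c.type = IPATH_CMD_ACK_EVENT;
+ *   c.cmd.event_mask = IPATH_EVENT_DISARM_BUFS;
+ */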
+
+/*
+ * The following ipath commands are only used by the mic system to send
+ * commands to the host daemon. All commands above are also used by mic.
+ */
+#define IPATH_CMD_CONTEXT_OPEN 51    /* open a context */
+#define IPATH_CMD_CONTEXT_CLOSE 52    /* close a context */
+
+#define IPATH_CMD_GET_NUM_UNITS 61    /* number of hca units */
+#define IPATH_CMD_GET_NUM_CTXTS 62    /* number of contexts */
+#define IPATH_CMD_GET_PORT_LID 63    /* port lid */
+#define IPATH_CMD_GET_PORT_GID 64    /* port gid */
+#define IPATH_CMD_GET_PORT_LMC 65    /* port lmc */
+#define IPATH_CMD_GET_PORT_RATE 66    /* port rate */
+#define IPATH_CMD_GET_PORT_S2V 67    /* port sl2vl */
+
+#define IPATH_CMD_GET_STATS_NAMES 68    /* stats names */
+#define IPATH_CMD_GET_STATS 69    /* stats */
+#define IPATH_CMD_GET_CTRS_UNAMES 70    /* counters unit names */
+#define IPATH_CMD_GET_CTRS_UNIT 71    /* counters unit */
+#define IPATH_CMD_GET_CTRS_PNAMES 72    /* counters port names */
+#define IPATH_CMD_GET_CTRS_PORT 73    /* counters port */
+
+#define IPATH_CMD_GET_CC_SETTINGS 74    /* get cc settings */
+#define IPATH_CMD_GET_CC_TABLE 75    /* get cc table */
+
+/* cmd for diag code */
+#define IPATH_CMD_WAIT_FOR_PACKET 76
+#define IPATH_CMD_GET_UNIT_FLASH 77
+#define IPATH_CMD_PUT_UNIT_FLASH 78
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_ANYRCV 0
+#define IPATH_POLL_TYPE_URGENT 0x01
+
+struct ipath_ctxt_info {
+    __u16 num_active;    /* number of active units */
+    __u16 unit;    /* unit (chip) assigned to caller */
+    __u16 port;    /* IB port assigned to caller */
+    __u16 context;    /* context on unit assigned to caller */
+    __u16 subcontext;    /* subcontext on unit assigned to caller */
+    __u16 num_contexts;    /* number of contexts available on unit */
+    __u16 num_subcontexts;    /* number of subcontexts opened on context */
+    __u16 rec_cpu;    /* cpu # for affinity (ffff if none) */
+};
+
+struct ipath_tid_info {
+    __u32 tidcnt;
+    /* make structure same size in 32 and 64 bit */
+    __u32 tid__unused;
+    /* virtual address of first page in transfer */
+    __u64 tidvaddr;
+    /* pointer (same size 32/64 bit) to __u16 tid array */
+    __u64 tidlist;
+
+    /*
+     * pointer (same size 32/64 bit) to bitmap of TIDs used
+     * for this call; checked for being large enough at open
+     */
+    __u64 tidmap;
+};
+
+/*
+ * To send general info between PSM on mic and psmd on host;
+ * this structure should be no larger than struct ipath_user_info.
+ */
+struct ipath_mic_info {
+    int unit;    /* unit number */
+    int port;    /* port number */
+    int data1;    /* return data or -1 */
+    int data2;    /* errno if data1=-1 */
+    __u64 data3;    /* other data */
+    __u64 data4;    /* other data */
+} __attribute__ ((aligned(8)));
+
+/*
+ * PSM tells the mic driver how to operate on memory. flags:
+ * 0x1: map remote host buffer, offset is the SCIF offset
+ * 0x2: allocate knx memory in kernel.
+ * 0x4: allocate physically contiguous knx memory in kernel.
+ * 0x8: SCIF register knx memory, and copy offset to first 8 bytes.
+ */
+struct ipath_mem_info {
+    uint32_t key;    /* key to match mmap offset */
+    uint32_t flags;    /* flags indicate what to do */
+    size_t length;    /* buffer length in bytes */
+    off_t offset;    /* remotely registered offset */
+};
+
+struct ipath_cmd {
+    __u32 type;    /* command type */
+    union {
+        struct ipath_mem_info mem_info;    /* mic memory */
+        struct ipath_mic_info mic_info;
+        struct ipath_tid_info tid_info;
+        struct ipath_user_info user_info;
+        /* send dma inflight/completion counter */
+        __u64 sdma_cntr;
+        /* address in userspace of struct ipath_ctxt_info to
+           write result to */
+        __u64 ctxt_info;
+        /* enable/disable receipt of packets */
+        __u32 recv_ctrl;
+        /* enable/disable armlaunch errors (non-zero to enable) */
+        __u32 armlaunch_ctrl;
+        /* partition key to set */
+        __u16 part_key;
+        /* user address of __u32 bitmask of active slaves */
+        __u64 slave_mask_addr;
+        /* type of polling we want */
+        __u16 poll_type;
+        /* back pressure enable bit for one particular context */
+        __u8 ctxt_bp;
+        /* ipath_event_ack(), IPATH_EVENT_* bits */
+        __u64 event_mask;
+    } cmd;
+};
+
+struct ipath_iovec {
+    /* Pointer to data, but same size 32 and 64 bit */
+    __u64 iov_base;
+
+    /*
+     * Length of data; don't need 64 bits, but want
+     * ipath_sendpkt to remain same size as before 32 bit changes, so...
+     */
+    __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send. Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+    __u32 sps_flags;    /* flags for packet (TBD) */
+    __u32 sps_cnt;    /* number of entries to use in sps_iov */
+    /* array of iov's describing packet. TEMPORARY */
+    struct ipath_iovec sps_iov[4];
+};
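+
+/*
+ * Example (illustrative): describing a header + payload send with two
+ * iovec entries (hdr, hdrlen, payload and paylen are assumed caller
+ * variables):
+ *
+ *   struct __ipath_sendpkt pkt;
+ *   pkt.sps_flags = 0;
+ *   pkt.sps_cnt = 2;
+ *   pkt.sps_iov[0].iov_base = (__u64)(uintptr_t)hdr;
+ *   pkt.sps_iov[0].iov_len = hdrlen;
+ *   pkt.sps_iov[1].iov_base = (__u64)(uintptr_t)payload;
+ *   pkt.sps_iov[1].iov_len = paylen;
+ */
+
+/* Passed into diag data special file's ->write method. */
+struct ipath_diag_pkt {
+    __u32 unit;
+    __u64 data;
+    __u32 len;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)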
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+    /* flash layout version (IPATH_FLASH_VERSION) */
+    __u8 if_fversion;
+    /* checksum protecting if_length bytes */
+    __u8 if_csum;
+    /*
+     * valid length (in use, protected by if_csum), including
+     * if_fversion and if_csum themselves
+     */
+    __u8 if_length;
+    /* the GUID, in network order */
+    __u8 if_guid[8];
+    /* number of GUIDs to use, starting from if_guid */
+    __u8 if_numguid;
+    /* the (last 10 characters of) board serial number, in ASCII */
+    char if_serial[12];
+    /* board mfg date (YYYYMMDD ASCII) */
+    char if_mfgdate[8];
+    /* last board rework/test date (YYYYMMDD ASCII) */
+    char if_testdate[8];
+    /* logging of error counts, TBD */
+    __u8 if_errcntp[4];
+    /* powered on hours, updated at driver unload */
+    __u8 if_powerhour[2];
+    /* ASCII free-form comment field */
+    char if_comment[32];
+    /* Backwards compatible prefix for longer QLogic Serial Numbers */
+    char if_sprefix[4];
+    /* 82 bytes used, min flash size is 128 bytes */
+    __u8 if_future[46];
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR 0x80000000
+#define INFINIPATH_RHF_H_VCRCERR 0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR 0x10000000
+#define INFINIPATH_RHF_H_MTUERR 0x08000000
+#define INFINIPATH_RHF_H_IHDRERR 0x04000000
+#define INFINIPATH_RHF_H_TIDERR 0x02000000
+#define INFINIPATH_RHF_H_MKERR 0x01000000
+#define INFINIPATH_RHF_H_IBERR 0x00800000
+#define INFINIPATH_RHF_H_TFGENERR 0x00400000
+#define INFINIPATH_RHF_H_TFSEQERR 0x00200000
+#define INFINIPATH_RHF_H_ERR_MASK 0xFFE00000
+#define INFINIPATH_RHF_L_USE_EGR 0x80000000
+#define INFINIPATH_RHF_L_SWA 0x00008000
+#define INFINIPATH_RHF_L_SWB 0x00004000
+
+/* TidFlow related bits */
+#define INFINIPATH_TF_SEQNUM_SHIFT 0
+#define INFINIPATH_TF_SEQNUM_MASK 0x7ff
+#define INFINIPATH_TF_GENVAL_SHIFT 11
+#define INFINIPATH_TF_GENVAL_MASK 0xff
+#define INFINIPATH_TF_ISVALID_SHIFT 19
+#define INFINIPATH_TF_ISVALID_MASK 0x1
+#define INFINIPATH_TF_ENABLED_SHIFT 20
+#define INFINIPATH_TF_ENABLED_MASK 0x1
+#define INFINIPATH_TF_KEEP_AFTER_SEQERR_SHIFT 21
+#define INFINIPATH_TF_KEEP_AFTER_SEQERR_MASK 0x1
+#define INFINIPATH_TF_KEEP_AFTER_GENERR_SHIFT 22
+#define INFINIPATH_TF_KEEP_AFTER_GENERR_MASK 0x1
+#define INFINIPATH_TF_STATUS_SHIFT 27
+#define INFINIPATH_TF_STATUS_MASK 0x3
+#define INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT 27
+#define INFINIPATH_TF_STATUS_SEQMISMATCH_MASK 0x1
+#define INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT 28
+#define INFINIPATH_TF_STATUS_GENMISMATCH_MASK 0x1
+
+#define INFINIPATH_TF_FLOWID_SHIFT 19
+#define INFINIPATH_TF_NFLOWS 32
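+
+/*
+ * Example (illustrative): extracting the generation value from a TidFlow
+ * table entry tf:
+ *
+ *   gen = (tf >> INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK;
+ */
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_CONTEXT_MASK 0xF
+#define INFINIPATH_I_CONTEXT_SHIFT 24
+#define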
INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_HDRSUPP 0x2 +#define INFINIPATH_KPF_INTR_HDRSUPP_MASK 0x3 +#define INFINIPATH_KPF_COMMIDX_MASK 0x003C +#define INFINIPATH_KPF_COMMIDX_SHIFT 2 +#define INFINIPATH_KPF_RESERVED_BITS(pktflags) \ + ((__le16_to_cpu(pktflags) & INFINIPATH_KPF_COMMIDX_MASK) \ + << IPS_EPSTATE_COMMIDX_SHIFT) \ + +#define INFINIPATH_MAX_SUBCONTEXT 4 + +#define IPATH_MAX_UNIT 4 /* max units supported */ +#define IPATH_MAX_PORT 2 /* no boards have more than 2 IB ports */ + +/* SendPIO per-buffer control */ +/* BEGIN_NOSHIP_TO_OPENIB */ +// #define INFINIPATH_SP_LENGTHP1_MASK 0x3FF /* unused currently */ +// #define INFINIPATH_SP_LENGTHP1_SHIFT 0 /* unused currently */ +// #define INFINIPATH_SP_INTR 0x80 /* unused currently */ +/* END_NOSHIP_TO_OPENIB */ +#define INFINIPATH_SP_TEST 0x40 +#define INFINIPATH_SP_TESTEBP 0x20 + +/* these are currently used only on 7322 chips; they should be referenced + * only at the lowest level pio send buffer fill routines; they go into + * the pbcflags field. OLSON: need to clean this up. */ +#define __PBC_IBPORT (1U << 26) +#define __PBC_VLSHIFT (27) + +/* this portion only defines what we currently use */ +union ipath_pbc { + __u64 qword; + __u32 dword; + struct { + __u16 length; + __u16 fill1; + __u32 pbcflags; + }; +}; + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Context (or port, historically) - 4 bits, + * TID - 10 bits and Offset. + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 4, TID 11, offset 13. + */ + __le32 ver_context_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + +/* BEGIN_NOSHIP_TO_OPENIB */ +/* + * The PIO buffer used for sending infinipath messages must only be written + * in 32-bit words, all the data must be written, and no writes can occur + * after the last word is written (which transfers "ownership" of the buffer + * to the chip and triggers the message to be sent). + * Since the Linux sk_buff structure can be recursive, non-aligned, and + * any number of bytes in each segment, we use the following structure + * to keep information about the overall state of the copy operation. + * This is used to save the information needed to store the checksum + * in the right place before sending the last word to the hardware and + * to buffer the last 0-3 bytes of non-word sized segments. 
+ */
+struct copy_data_s {
+    struct ether_header *hdr;
+    /* addr of PIO buf to write csum to */
+    __u32 __iomem *csum_pio;
+    __u32 __iomem *to;    /* addr of PIO buf to write data to */
+    __u32 device;    /* which device to allocate PIO bufs from */
+    __s32 error;    /* set if there is an error. */
+    __s32 extra;    /* amount of data saved in u.buf below */
+    __u32 len;    /* total length to send in bytes */
+    __u32 flen;    /* fragment length in words */
+    __u32 csum;    /* partial IP checksum */
+    __u32 pos;    /* position for partial checksum */
+    __u32 offset;    /* offset to where data currently starts */
+    __s32 checksum_calc;    /* set to 1 when csum has been calculated */
+    struct sk_buff *skb;
+    union {
+        __u32 w;
+        __u8 buf[4];
+    } u;
+};
+/* END_NOSHIP_TO_OPENIB */
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003    /* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002    /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_SERVICE_ID 0x1000117500000000ULL
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+/* BEGIN_NOSHIP_TO_OPENIB */
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+/* END_NOSHIP_TO_OPENIB */
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+/* OpCodes */
+#define IPATH_OPCODE_USER1 0xC0
+#define IPATH_OPCODE_ITH4X 0xC1
+
+/* OpCode 30 is used by stand-alone test programs */
+#define IPATH_OPCODE_RAW_DATA 0xDE
+/* last OpCode (31) is reserved for test */
+#define IPATH_OPCODE_TEST 0xDF
+/* END_NOSHIP_TO_OPENIB */
+
+/* sub OpCodes - ith4x */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+/* Value set in ips_common.h for IPS_HEADER_QUEUE_WORDS */
+#define IPATH_HEADER_QUEUE_WORDS 9
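+
+/*
+ * Example (illustrative): the receive type of an rcvhdrq entry, extracted
+ * with ipath_hdrget_rcv_type() below, selects the handling path:
+ *
+ *   if (ipath_hdrget_rcv_type(rbuf) == RCVHQ_RCV_TYPE_EAGER)
+ *       ... payload is in the eager buffer ipath_hdrget_index(rbuf) ...
+ */
+
+/* functions for extracting fields from rcvhdrq entries for the driver.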
+ */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_seq(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* _IPATH_COMMON_H */ diff --git a/include/ipath_debug.h b/include/ipath_debug.h new file mode 100644 index 0000000..41ba098 --- /dev/null +++ b/include/ipath_debug.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. 
This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages and important env vars */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +/* low-level environment variables */ +#define __IPATH_ENVDBG 0x400 +#define __IPATH_EPKTDBG 0x800 /* print error packet data */ +#define __IPATH_CCADBG 0x1000 /* print CCA related events */ +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* print process startup (init)/exit messages */ +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_CCADBG 0x0 /* print CCA related events */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/include/ipath_intf.h b/include/ipath_intf.h new file mode 100644 index 0000000..66506e9 --- /dev/null +++ b/include/ipath_intf.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef _IPATH_INTF_H
+#define _IPATH_INTF_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sched.h>
+
+#ifdef __inline__
+#undef __inline__
+#endif
+#define __inline__ inline __attribute__((always_inline,unused))
+#ifdef __unused__
+#undef __unused__
+#endif
+#define __unused__ __attribute__((unused))
+
+#include "sysdep.h"
+#include "bit_ops.h"
+
+/* these aren't implemented for user mode, which is OK until we multi-thread */
+typedef struct _atomic {
+    uint32_t counter;
+} atomic_t;    /* no atomic_t type in user-land */
+#define atomic_set(a,v) ((a)->counter = (v))
+#define atomic_inc_return(a) (++(a)->counter)
+
+#if defined(__PATHCC__) && __PATHCC__ < 3
+  #define likely(x) (x)
+  #define unlikely(x) (x)
+  #define if_pt(cond) if (cond)
+  #define if_pf(cond) if (cond)
+  #define _Pragma_unlikely _Pragma("mips_frequency_hint never")
+  #define _Pragma_likely _Pragma("mips_frequency_hint frequent")
+#elif defined(__GNUC__) || (defined(__PATHCC__) && __PATHCC__ >= 3)
+  #define likely(x) __builtin_expect(!!(x), 1L)
+  #define unlikely(x) __builtin_expect(!!(x), 0L)
+  #define if_pt(cond) if (likely(cond))
+  #define if_pf(cond) if (unlikely(cond))
+  #define _Pragma_unlikely
+  #define _Pragma_likely
+#else
+  #error "Unsupported compiler"
+#endif
+
+#define yield() sched_yield()
+
+/*
+ * __fastpath is used to group routines in the fastpath, to reduce cache
+ * misses and conflicts
+ */
+#define __fastpath __attribute__((section(".text.fastpath")))
+
+/*
+ * Move from using __fastpath to split __recvpath and __sendpath
+ */
+//#define __sendpath __attribute__((section(".text.sendpath")))
+//#define __recvpath __attribute__((section(".text.recvpath")))
+#define __sendpath __fastpath
+#define __recvpath __fastpath
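+
+/*
+ * Example (illustrative): annotating an error path as unlikely so the
+ * compiler lays out the hot path first (handle_err is an assumed helper):
+ *
+ *   if_pf (ret < 0)
+ *       return handle_err(ret);
+ */
+
+#endif /* _IPATH_INTF_H */
diff --git a/include/ipath_queue.h b/include/ipath_queue.h
new file mode 100644
index 0000000..d96610e
--- /dev/null
+++ b/include/ipath_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.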
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * singly-linked tail queues, lists, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. 
+ * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ CIRCLEQ + * _HEAD + + + + + + * _HEAD_INITIALIZER + + + + + + * _ENTRY + + + + + + * _INIT + + + + + + * _EMPTY + + + + + + * _FIRST + + + + + + * _NEXT + + + + + + * _PREV - - - + + + * _LAST - - + + + + * _FOREACH + + + + + + * _FOREACH_REVERSE - - - + + + * _INSERT_HEAD + + + + + + * _INSERT_BEFORE - + - + + + * _INSERT_AFTER + + + + + + * _INSERT_TAIL - - + + + + * _REMOVE_HEAD + - + - - + * _REMOVE + + + + + + * + */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. 
+ */ +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY(head) ? \ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD(head, field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ +} while (0) + +/* + * Circular queue declarations. + */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) + +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = CIRCLEQ_FIRST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_NEXT((var), field)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = CIRCLEQ_LAST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_PREV((var), field)) + +#define CIRCLEQ_INIT(head) do { \ + CIRCLEQ_FIRST((head)) = (void *)(head); \ + CIRCLEQ_LAST((head)) = (void *)(head); \ +} while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ + CIRCLEQ_PREV((elm), field) = (listelm); \ + if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\ + CIRCLEQ_NEXT((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (listelm); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ + if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\ + CIRCLEQ_PREV((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ + CIRCLEQ_PREV((elm), field) = (void *)(head); \ + if (CIRCLEQ_LAST((head)) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ + CIRCLEQ_FIRST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (void *)(head); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ + if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ + CIRCLEQ_LAST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_LAST(head) ((head)->cqh_last) + +#define CIRCLEQ_NEXT(elm,field) ((elm)->field.cqe_next) + +#define CIRCLEQ_PREV(elm,field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ + CIRCLEQ_PREV((elm), field); \ + if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ + CIRCLEQ_NEXT((elm), field); \ +} while (0) + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/include/ipath_service.h b/include/ipath_service.h new file mode 100644 index 0000000..72ac29e --- /dev/null +++ b/include/ipath_service.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_SERVICE_H +#define _IPATH_SERVICE_H + +// This file contains all the lowest level routines calling into sysfs +// and qib driver. All other calls are based on these routines. + +#include "ipath_intf.h" +#include "ipath_common.h" +#include "ipath_udebug.h" + +// any unit id to match. +#define IPATH_UNIT_ID_ANY ((long)-1) + +// Given the unit number and port, return an error, or the corresponding LID +// Returns an int, so -1 indicates an error. 0 indicates that +// the unit is valid, but no LID has been assigned. +int ipath_get_port_lid(int, int); + +// Given the unit number and port, return an error, or the corresponding GID +// Returns an int, so -1 indicates an error. +int ipath_get_port_gid(int, int, uint64_t *hi, uint64_t *lo); + +// Given the unit number, return an error, or the corresponding LMC value +// for the port +// Returns an int, so -1 indicates an error. 0 +int ipath_get_port_lmc(int unit, int port); + +// Given the unit number, return an error, or the corresponding link rate +// for the port +// Returns an int, so -1 indicates an error. +int ipath_get_port_rate(int unit, int port); + +// Given a unit, port and SL, return an error, or the corresponding VL for the +// SL as programmed by the SM +// Returns an int, so -1 indicates an error. +int ipath_get_port_sl2vl(int unit, int port, int sl); + +// get the number of units supported by the driver. Does not guarantee +// that a working chip has been found for each possible unit #. Returns +// -1 with errno set, or number of units >=0 (0 means none found). +int ipath_get_num_units(void); + +// get the number of contexts from the unit id. +// Returns 0 if no unit or no match. +int ipath_get_num_contexts(int unit); + +// Open ipath device file, return -1 on error. 
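+// For example (an illustrative sketch, not from the original source; the
+// unit/port/timeout values here are placeholders):
+//
+//	int fd = ipath_context_open(0, 0, 0);
+//	if (fd == -1)
+//		return -1;
+//	... issue ipath_cmd_assign_context()/ipath_cmd_write() on fd ...
+//	ipath_context_close(fd);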
+int ipath_context_open(int unit, int port, uint64_t open_timeout);
+void ipath_context_close(int fd);
+int ipath_cmd_write(int fd, struct ipath_cmd *, size_t count);
+int ipath_cmd_writev(int fd, const struct iovec *iov, int iovcnt);
+int ipath_cmd_assign_context(int fd, void *buf, size_t count);
+int ipath_cmd_user_init(int fd, void *buf, size_t count);
+
+int ipath_get_cc_settings_bin(int unit, int port, char *ccabuf);
+int ipath_get_cc_table_bin(int unit, int port, uint16_t **cctp);
+
+// we use mmap64() because we compile in both 32 and 64 bit mode,
+// and we have to map physical addresses that are > 32 bits long.
+// While Linux implements mmap64, it doesn't have a man page,
+// and isn't declared in any header file, so we declare it here ourselves.
+
+// We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+// redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+// it doesn't work when the address being mapped is > 32 bits. It chips
+// off bits 32 and above. So we stay with mmap64.
+extern void *mmap64(void *, size_t, int, int, int, __off64_t);
+void *ipath_mmap64(void *, size_t, int, int, int, __off64_t);
+
+// Statistics maintained by the driver
+int infinipath_get_stats(uint64_t *, int);
+int infinipath_get_stats_names(char **namep);
+// Counters maintained in the chip, globally, and per-port
+int infinipath_get_ctrs_unit(int unitno, uint64_t *, int);
+int infinipath_get_ctrs_unit_names(int unitno, char **namep);
+int infinipath_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int infinipath_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* base name of path (without unit #) for qib driver */
+#define QIB_CLASS_PATH "/sys/class/infiniband/qib"
+
+/* read a signed 64-bit quantity, in some arbitrary base */
+int ipath_sysfs_read_s64(const char *attr, int64_t *valp, int base);
+
+/* read a string value */
+int ipath_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+	char **datap);
+
+/* open attribute in unit's sysfs directory via open(2) */
+int ipath_sysfs_unit_open(uint32_t unit, const char *attr, int flags);
+/* print to attribute in {unit,port} sysfs directory */
+int ipath_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
+	const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
+int ipath_sysfs_unit_printf(uint32_t unit, const char *attr,
+	const char *fmt, ...)
+	__attribute__((format(printf, 3, 4)));
+
+int ipath_ipathfs_unit_write(uint32_t unit, const char *attr, const void *data,
+	size_t len);
+/* read up to one page of malloc'ed data (caller must free), returning
+   number of bytes read or -1 */
+int ipath_ipathfs_read(const char *attr, char **datap);
+int ipath_ipathfs_unit_read(uint32_t unit, const char *attr, char **data);
+/* read a signed 64-bit quantity, in some arbitrary base */
+int ipath_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+	int64_t *valp, int base);
+int ipath_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+	int64_t *valp, int base);
+/* these read directly into supplied buffer and take a count */
+int ipath_ipathfs_rd(const char *, void *, int);
+int ipath_ipathfs_unit_rd(uint32_t unit, const char *, void *, int);
+
+int ipath_ipathfs_open(const char *relname, int flags);
+
+/* wait for device special file to show up. timeout is in
+ * milliseconds, 0 is "callee knows best", < 0 is infinite.
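 *
 * For instance (an illustrative sketch; the device path and the
 * -1-on-error convention are assumptions in line with the rest of
 * this header):
 *
 *	if (ipath_wait_for_device("/dev/ipath", 5000) == -1)
 *		... the device did not appear within ~5 seconds ...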
*/ +int ipath_wait_for_device(const char *path, long timeout); + +int ipath_cmd_wait_for_packet(int fd); +int infinipath_get_unit_flash(int unit, char **datap); +int infinipath_put_unit_flash(int unit, char *data, int len); + +#endif // _IPATH_SERVICE_H diff --git a/include/ipath_udebug.h b/include/ipath_udebug.h new file mode 100644 index 0000000..bce2233 --- /dev/null +++ b/include/ipath_udebug.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_UDEBUG_H +#define _IPATH_UDEBUG_H + +#include +#include "ipath_debug.h" + +extern unsigned infinipath_debug; +const char *ipath_get_unit_name(int unit); +extern char *__progname; + +#if _IPATH_DEBUGGING + +extern char *__ipath_mylabel; +void ipath_set_mylabel(char *); +char *ipath_get_mylabel(); +extern FILE *__ipath_dbgout; + +#define _IPATH_UNIT_ERROR(unit,fmt,...) \ + do { \ + _Pragma_unlikely \ + printf("%s%s: " fmt, __ipath_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_ERROR(fmt,...) \ + do { \ + _Pragma_unlikely \ + printf("%s%s: " fmt, __ipath_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_INFO(fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&__IPATH_INFO)) \ + printf("%s%s: " fmt, __ipath_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define __IPATH_PKTDBG_ON unlikely(infinipath_debug & __IPATH_PKTDBG) + +#define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&(which))) \ + fprintf(__ipath_dbgout, "%s%s: " fmt, __ipath_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define __IPATH_DBG_WHICH_NOFUNC(which,fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&(which))) \ + fprintf(__ipath_dbgout, "%s" fmt, __ipath_mylabel, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_DBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +#define _IPATH_VDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_VERBDBG,fmt,##__VA_ARGS__) +#define _IPATH_PDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_PKTDBG,fmt,##__VA_ARGS__) +#define _IPATH_EPDBG(fmt,...) 
__IPATH_DBG_WHICH(__IPATH_EPKTDBG,fmt,##__VA_ARGS__)
+#define _IPATH_PRDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_PROCDBG,fmt,##__VA_ARGS__)
+#define _IPATH_ENVDBG(lev,fmt,...) \
+	__IPATH_DBG_WHICH_NOFUNC( \
+		(lev==0) ? __IPATH_INFO : \
+		(lev>1?__IPATH_ENVDBG:(__IPATH_PROCDBG|__IPATH_ENVDBG)),\
+		"env " fmt,##__VA_ARGS__)
+#define _IPATH_MMDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_MMDBG,fmt,##__VA_ARGS__)
+#define _IPATH_CCADBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_CCADBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+#define _IPATH_UNIT_ERROR(unit,fmt,...) \
+	do { \
+		printf ("%s" fmt, "", ##__VA_ARGS__); \
+	} while(0)
+
+#define _IPATH_ERROR(fmt,...) \
+	do { \
+		printf ("%s" fmt, "", ##__VA_ARGS__); \
+	} while(0)
+
+#define _IPATH_INFO(fmt,...)
+
+#define __IPATH_PKTDBG_ON 0
+
+#define _IPATH_DBG(fmt,...)
+#define _IPATH_PDBG(fmt,...)
+#define _IPATH_EPDBG(fmt,...)
+#define _IPATH_PRDBG(fmt,...)
+#define _IPATH_VDBG(fmt,...)
+#define _IPATH_MMDBG(fmt,...)
+#define _IPATH_CCADBG(fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+#endif /* _IPATH_UDEBUG_H */
diff --git a/include/ipath_user.h b/include/ipath_user.h
new file mode 100644
index 0000000..3d120f0
--- /dev/null
+++ b/include/ipath_user.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_USER_H
+#define _IPATH_USER_H
+
+// This file contains all of the data structures and routines that are
+// publicly visible and usable (to low level infrastructure code; it is
+// not expected that any application, or even a normal application-level
+// library, will ever need to use any of this).
+
+// Additional entry points and data structures that are used by these routines
+// may be referenced in this file, but they should not be generally available;
+// they are visible here only to allow use in inlined functions. Any variable,
+// data structure, or function that starts with a leading "_" is in this
+// category.
+
+// Include header files we need that are unlikely to otherwise be needed by
+// programs.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ipath_intf.h"
+#include "ipath_common.h"
+#include "ipath_byteorder.h"
+#include "ipath_udebug.h"
+#include "ipath_service.h"
+
+// interval timing routines
+// Convert a count of cycles to elapsed nanoseconds
+// this is only accurate for reasonably large numbers of cycles (at least tens)
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+	__attribute__ ((always_inline));
+// convert elapsed nanoseconds to elapsed cycles
+// this is only accurate for reasonably large numbers of nsecs (at least tens)
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+	__attribute__ ((always_inline));
+// get current count of nanoseconds from unspecified base value (only useful for
+// intervals)
+static __inline__ uint64_t get_nanoseconds() __attribute__ ((always_inline));
+
+// This block will eventually move to a separate file, but for now we'll leave
+// it here.
+typedef struct _ipath_dev {
+	int32_t spd_fd;
+	int32_t spd_type;	// ipath_type
+	volatile uint64_t *spd_uregbase; // mmap'ed to chip or virtual user regs
+	volatile uint64_t *spd_piobase; // mmap'ed access to chip PIO buffers
+	uint64_t __pad[8]; // placeholder for future binary compat expansion
+} ipath_dev;
+
+struct _ipath_ctrl {
+	ipath_dev spc_dev; // for use by "driver" code only; other code treats it as an opaque cookie.
+
+// some local storage used in certain conditions:
+// as storage for __ipath_rcvtidflow in ipath_userinit().
+	__le32 regs[INFINIPATH_TF_NFLOWS << 1];
+// as storage for __ipath_tidflow_wmb in ipath_userinit().
+	__le32 tidflow_wmb_location;
+// as storage for spi_sendbuf_status in ipath_userinit().
+	uint64_t sendbuf_status;
+// for ipath_check_unit_status(), ipath_proto.c
+	int lasterr;
+
+// location to which InfiniPath writes the rcvhdrtail
+// register whenever it changes, so that no chip registers are read in
+// the performance path.
+	volatile __le32 *__ipath_rcvtail;
+// address where ur_rcvhdrhead is written
+	volatile __le32 *__ipath_rcvhdrhead;
+// address where ur_rcvegrindexhead is written
+	volatile __le32 *__ipath_rcvegrhead;
+// address where ur_rcvegrindextail is read
+	volatile __le32 *__ipath_rcvegrtail;
+// number of eager buffers
+	uint32_t __ipath_tidegrcnt;
+// address where ur_rcvtidflow is written
+	volatile __le32 *__ipath_rcvtidflow;
+// Serialize writes to tidflow on QLE73XX
+	volatile __le32 *__ipath_tidflow_wmb;
+
+// save away spi_status for use in ipath_check_unit_status()
+	volatile __u64 *__ipath_spi_status;
+};
+
+// PIO write routines assume that the message header is always 56 bytes.
+#define IPATH_MESSAGE_HDR_SIZE 56
+// Usable bytes in header (hdrsize - lrh - bth)
+#define IPATH_MESSAGE_HDR_SIZE_IPATH (IPATH_MESSAGE_HDR_SIZE-20)
+// Must be same as PSM_CRC_SIZE_IN_BYTES in ips_proto_params.h
+#define IPATH_CRC_SIZE_IN_BYTES 8
+
+// After the device is opened, ipath_userinit() is called to give the driver the
+// parameters the user code wants to use, and to get the implementation values,
+// etc. back. 0 is returned on success, a positive value is a standard errno,
+// and a negative value is reserved for future use. The first argument is
+// the file descriptor returned by the device open.
+//
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this won't be fully
+// implemented initially. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned by this routine.
+struct _ipath_ctrl *ipath_userinit(int32_t, struct ipath_user_info *,
+	struct ipath_base_info *b);
+
+// don't inline these; it's all init code, and not inlining makes the
+// overall code shorter and easier to debug
+void ipath_touch_mmap(void *, size_t) __attribute__ ((noinline));
+
+int32_t ipath_update_tid_err(void);	// handle update tid errors out of line
+int32_t ipath_free_tid_err(void);	// handle free tid errors out of line
+
+// set the BTH pkey to check for this process.
+// This is for receive checks, not for sends. It isn't necessary
+// to set the default key, that's always allowed by the hardware.
+// If too many pkeys are in use for the hardware to support, this
+// will return EAGAIN, and the caller should then fail and exit,
+// or use the default key and check the pkey during received-packet
+// checking.
+int32_t ipath_set_pkey(struct _ipath_ctrl *, uint16_t);
+
+// flush the eager buffers by setting the
+// eager index head register == eager index tail, if queue is full
+void ipath_flush_egr_bufs(struct _ipath_ctrl *ctrl);
+
+int ipath_wait_for_packet(struct _ipath_ctrl *);
+
+// stop_start == 0 disables receive on the context, for use in queue overflow
+// conditions. stop_start == 1 re-enables, and returns the value of the tail
+// register, to be used to re-init the software copy of the head register
+int ipath_manage_rcvq(struct _ipath_ctrl *ctrl, uint32_t stop_start);
+
+// ctxt_bp == 0 disables fabric back pressure on the context.
+// ctxt_bp == 1 enables fabric back pressure on the context.
+int ipath_manage_bp(struct _ipath_ctrl *ctrl, uint8_t ctxt_bp);
+
+// enable == 1 enables armlaunch (normal), 0 disables (only used by
+// ipath_pkt_test -B at the moment, needed for linda).
+int ipath_armlaunch_ctrl(struct _ipath_ctrl *ctrl, uint32_t enable);
+
+// force an update of the PIOAvail register to memory
+int ipath_force_pio_avail_update(struct _ipath_ctrl *ctrl);
+
+// Disarm any send buffers which need disarming.
+int ipath_disarm_bufs(struct _ipath_ctrl *ctrl);
+
+// New user event mechanism, using spi_sendbuf_status IPATH_EVENT_* bits;
+// obsoletes ipath_disarm_bufs(), and extends it, although the old mechanism
+// remains for binary compatibility.
+int ipath_event_ack(struct _ipath_ctrl *ctrl, __u64 ackbits);
+
+// Return send dma's current "in flight" counter
+int ipath_sdma_inflight(struct _ipath_ctrl *ctrl, uint32_t *counter);
+
+// Return send dma's current "completion" counter
+int ipath_sdma_complete(struct _ipath_ctrl *ctrl, uint32_t *counter);
+
+// set whether we want an interrupt on all packets, or just urgent ones
+int ipath_poll_type(struct _ipath_ctrl *ctrl, uint16_t poll_type);
+
+static int32_t __inline__ ipath_free_tid(struct _ipath_ctrl *,
+	uint32_t, uint64_t)
+	__attribute__ ((always_inline));
+
+// check the unit status, and return an IPS_RC_* code if it is not in a
+// usable state. It will also print a message in that case.
+int ipath_check_unit_status(struct _ipath_ctrl *ctrl);
+
+// Statistics maintained by the driver
+const char * infinipath_get_next_name(char **names);
+uint64_t infinipath_get_single_stat(const char *attr, uint64_t *s);
+int infinipath_get_stats_names_count(void);
+// Counters maintained in the chip, globally, and per-port
+int infinipath_get_ctrs_unit_names_count(int unitno);
+int infinipath_get_ctrs_port_names_count(int unitno);
+
+uint64_t infinipath_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int infinipath_get_single_portctr(int unit, int port, const char *attr,
+	uint64_t *c);
+void infinipath_release_names(char *namep);
+
+// Syslog wrapper
+//
+// level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+// LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+//
+// prefix should be a short string describing which part of the software stack
+// is using syslog, e.g. "PSM", "mpi", "mpirun".
+//
+void ipath_syslog(const char *prefix, int to_console, int level,
+	const char *format, ...)
+	__attribute__((format(printf, 4, 5)));
+
+void ipath_vsyslog(const char *prefix, int to_console, int level,
+	const char *format, va_list ap);
+
+/* parameters for the PBC for the pio write routines; to avoid passing lots
+ * of args, we pass the structure pointer instead. */
+struct ipath_pio_params {
+	uint16_t length;
+	uint8_t vl;
+	uint8_t port;
+	uint32_t cksum_is_valid;
+	uint32_t cksum;
+	uint32_t rate;
+};
+
+// write pio buffers. The ipath_write_pio_force_order() version assumes
+// that the processor does not write store buffers to i/o devices in the
+// order in which they are written, and that when flushing partially
+// filled store buffers, the words are not ordered either. The ipath_write_pio()
+// form is used when the processor writes store buffers to i/o in the order
+// in which they are filled, and writes partially filled buffers in increasing
+// address order (assuming they are filled that way).
+// The arguments are pio buffer address, payload length, header, and payload.
+void ipath_write_pio_vector(volatile uint32_t *, const struct ipath_pio_params *,
+	void *, void *);
+void ipath_write_pio(volatile uint32_t *, const struct ipath_pio_params *,
+	void *, void *);
+void ipath_write_pio_force_order(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+
+#define IPATH_SPECIAL_TRIGGER_MAGIC 0xaebecede
+// IBA7220 can use a "Special" trigger. We write to the last dword
+// in the mapped SendBuf to trigger the launch.
+void ipath_write_pio_special_trigger2k(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+void ipath_write_pio_special_trigger4k(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+
+/*
+ * Copy routine that may copy a byte multiple times, but is optimized for
+ * throughput. This is not safe to use for PIO routines where we want a
+ * guarantee that a byte is only copied/moved across the bus once.
+ */
+void ipath_dwordcpy(volatile uint32_t *dest, const uint32_t * src, uint32_t ndwords);
+
+/*
+ * Safe version of ipath_dwordcpy that is guaranteed to copy each byte only once.
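 *
 * Illustrative contrast between the two (a sketch, not from the original
 * source; "piobuf", "dst", "src" and "ndwords" are placeholders):
 *
 *	uint32_t hdr[IPATH_MESSAGE_HDR_SIZE >> 2];
 *	ipath_dwordcpy_safe(piobuf, hdr, IPATH_MESSAGE_HDR_SIZE >> 2);
 *		-- PIO path: each byte must cross the bus exactly once
 *	ipath_dwordcpy(dst, src, ndwords);
 *		-- plain memory copy: bytes may be re-copied for throughput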
+*/ +#if defined(__x86_64__) +void ipath_dwordcpy_safe(volatile uint32_t *dest, const uint32_t * src, uint32_t ndwords); +#else +#define ipath_dwordcpy_safe ipath_dwordcpy +#endif + +// From here to the end of the file are implementation details that should not +// be used outside this file (other than to call the function), except in the +// one infrastructure file in which they are defined. + +// NOTE: doing paired 32 bit writes to the chip to store 64 bit values (as from +// 32 bit programs) will not work correctly, because there is no sub-qword address +// decode. Therefore 32 bit programs use only a single 32 bit store; the head +// register values are all less than 32 bits, anyway. Given that, we use +// only 32 bits even for 64 bit programs, for simplicity. These functions must +// not be called until after ipath_userinit() is called. +// The ctrl argument is currently unused, but remains useful for adding +// debug code. + +static __inline__ void ipath_put_rcvegrindexhead(struct _ipath_ctrl *ctrl, + uint32_t val) +{ + *ctrl->__ipath_rcvegrhead = __cpu_to_le32(val); +} + +static __inline__ void ipath_put_rcvhdrhead(struct _ipath_ctrl *ctrl, + uint32_t val) +{ + *ctrl->__ipath_rcvhdrhead = __cpu_to_le32(val); +} + +static __inline__ uint32_t ipath_get_rcvhdrtail(struct _ipath_ctrl *ctrl) +{ + uint32_t res = __le32_to_cpu(*ctrl->__ipath_rcvtail); + ips_rmb(); + return res; +} + +static __inline__ void ipath_tidflow_set_entry(struct _ipath_ctrl *ctrl, + uint32_t flowid, uint8_t genval, uint16_t seqnum) +{ + ctrl->__ipath_rcvtidflow[flowid << 1] = __cpu_to_le32( + (1 << INFINIPATH_TF_ISVALID_SHIFT) | + (1 << INFINIPATH_TF_ENABLED_SHIFT) | + (1 << INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) | + (1 << INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT) | + (genval << INFINIPATH_TF_GENVAL_SHIFT) | + ((seqnum & INFINIPATH_TF_SEQNUM_MASK) << INFINIPATH_TF_SEQNUM_SHIFT)); + /* Write a read-only register to act as a delay between tidflow writes */ + *ctrl->__ipath_tidflow_wmb = 0; +} + +static __inline__ void ipath_tidflow_reset(struct _ipath_ctrl *ctrl, + uint32_t flowid) +{ + ctrl->__ipath_rcvtidflow[flowid << 1] = __cpu_to_le32( + (1 << INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) | + (1 << INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT)); + /* Write a read-only register to act as a delay between tidflow writes */ + *ctrl->__ipath_tidflow_wmb = 0; +} + +/* + * This should only be used for debugging. + * Normally, we shouldn't read the chip. 
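 *
 * Debug-only decoding sketch (illustrative, not from the original source):
 *
 *	uint32_t v = ipath_tidflow_get(ctrl, flowid);
 *	if (ipath_tidflow_get_isvalid(v))
 *		printf("flow %u: seq %u gen %u\n", flowid,
 *		       ipath_tidflow_get_seqnum(v),
 *		       ipath_tidflow_get_genval(v));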
+ */ +static __inline__ uint32_t ipath_tidflow_get(struct _ipath_ctrl *ctrl, + uint32_t flowid) +{ + return __le32_to_cpu(ctrl->__ipath_rcvtidflow[flowid << 1]); +} + +static __inline__ uint32_t ipath_tidflow_get_seqmismatch(uint32_t val) +{ + return (val >> INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) & + INFINIPATH_TF_STATUS_SEQMISMATCH_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_genmismatch(uint32_t val) +{ + return (val >> INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT) & + INFINIPATH_TF_STATUS_GENMISMATCH_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_isvalid(uint32_t val) +{ + return (val >> INFINIPATH_TF_ISVALID_SHIFT) & INFINIPATH_TF_ISVALID_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_seqnum(uint32_t val) +{ + return (val >> INFINIPATH_TF_SEQNUM_SHIFT) & INFINIPATH_TF_SEQNUM_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_genval(uint32_t val) +{ + return (val >> INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_enabled(uint32_t val) +{ + return (val >> INFINIPATH_TF_ENABLED_SHIFT) & INFINIPATH_TF_ENABLED_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_keep_after_seqerr(uint32_t val) +{ + return (val >> INFINIPATH_TF_KEEP_AFTER_SEQERR_SHIFT) & + INFINIPATH_TF_KEEP_AFTER_SEQERR_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_keep_after_generr(uint32_t val) +{ + return (val >> INFINIPATH_TF_KEEP_AFTER_GENERR_SHIFT) & + INFINIPATH_TF_KEEP_AFTER_GENERR_MASK; +} + +/* + * This should only be used by a process to write the eager index into + * a subcontext's eager header entry. + */ +static __inline__ void ipath_hdrset_index(__le32 *rbuf, uint32_t val) +{ + rbuf[0] = + (rbuf[0] & + __cpu_to_le32(~(INFINIPATH_RHF_EGRINDEX_MASK << + INFINIPATH_RHF_EGRINDEX_SHIFT))) | + __cpu_to_le32((val & INFINIPATH_RHF_EGRINDEX_MASK) << + INFINIPATH_RHF_EGRINDEX_SHIFT); +} + +/* + * This should only be used by a process to update the receive header + * error flags. + */ +static __inline__ void ipath_hdrset_err_flags(__le32 *rbuf, uint32_t val) +{ + rbuf[1] |= __cpu_to_le32(val); +} + +/* + * This should only be used by a process to write the rhf seq number into + * a subcontext's eager header entry. + */ +static __inline__ void ipath_hdrset_seq(__le32 *rbuf, uint32_t val) +{ + rbuf[1] = + (rbuf[1] & + __cpu_to_le32(~(INFINIPATH_RHF_SEQ_MASK << + INFINIPATH_RHF_SEQ_SHIFT))) | + __cpu_to_le32((val & INFINIPATH_RHF_SEQ_MASK) << + INFINIPATH_RHF_SEQ_SHIFT); +} + +// Manage TID entries. It is possible that not all entries +// requested may be allocated. A matching ipath_free_tid() must be +// done for each ipath_update_tid(), because currently no caching or +// reuse of expected tid entries is allowed, to work around malloc/free +// and mmap/munmap issues. The driver decides which TID entries to allocate. +// If ipath_free_tid is called to free entries in use by a different +// send by the same process, data corruption will probably occur, +// but only within that process, not for other processes. + +// update tidcnt expected TID entries from the array pointed to by tidinfo. +// Returns 0 on success, else an errno. 
See full description at declaration +static int32_t __inline__ ipath_update_tid(struct _ipath_ctrl *ctrl, + uint32_t tidcnt, uint64_t tidlist, + uint64_t vaddr, uint64_t tidmap) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_TID_UPDATE; + + cmd.cmd.tid_info.tidcnt = tidcnt; // number of tid entries to do + cmd.cmd.tid_info.tidlist = tidlist; // driver copies tids back directly to this + cmd.cmd.tid_info.tidvaddr = vaddr; // base address for this send to map + cmd.cmd.tid_info.tidmap = tidmap; // driver copies directly to this + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) + return ipath_update_tid_err(); + return 0; +} + +static int32_t __inline__ ipath_free_tid(struct _ipath_ctrl *ctrl, + uint32_t tidcnt, uint64_t tidmap) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_TID_FREE; + + cmd.cmd.tid_info.tidcnt = tidcnt; + cmd.cmd.tid_info.tidmap = tidmap; // driver copies from this + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) + return ipath_free_tid_err(); + return 0; +} + +extern uint32_t __ipath_pico_per_cycle; // only for use in these functions + +// this is only accurate for reasonably large numbers of cycles (at least tens) +static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs) +{ + return (__ipath_pico_per_cycle * cycs) / 1000ULL; +} + +// this is only accurate for reasonably large numbers of nsecs (at least tens) +static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) +{ + return (ns * 1000ULL) / __ipath_pico_per_cycle; +} + +static __inline__ uint64_t get_nanoseconds() +{ + return cycles_to_nanosecs(get_cycles()); +} + +// open the diags device, if supported by driver. Returns 0 on +// success, errno on failure. Also tells driver that diags +// is active, which changes some driver behavior +int ipath_diag_open(unsigned); // unit +int ipath_diag_close(void); + +// diags chip read and write routines + +int ipathd_read32(uint64_t reg_offset, uint32_t * read_valp); +int ipathd_write32(uint64_t reg_offset, uint32_t write_val); + +int ipathd_readmult(uint64_t, unsigned, uint64_t *); // chip: offset, cnt, ptr +int ipathd_write(uint64_t, uint64_t); // chip: offset, value + +#define IPATH_READ_EEPROM 31337 +#define IPATH_WRITE_EEPROM 101 + +struct ipath_eeprom_req { + void *addr; + uint16_t len; + uint16_t offset; +}; + +int ipathd_send_pkt(const void *, unsigned); // send a packet for diags +int ipathd_read_i2c(struct ipath_eeprom_req *); // diags read i2c flash + +__u8 ipath_flash_csum(struct ipath_flash *, int); + +int ipathd_reset_hardware(uint32_t); + +int ipath_hideous_ioctl_emulator(int unit, int reqtype, + struct ipath_eeprom_req *req); + +#endif // _IPATH_USER_H diff --git a/include/linux-i386/bit_ops.h b/include/linux-i386/bit_ops.h new file mode 100644 index 0000000..ca8b80f --- /dev/null +++ b/include/linux-i386/bit_ops.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_i386_BIT_OPS_H +#define _IPATH_i386_BIT_OPS_H + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m" (*addr) : "dIr" (nr) : "memory"); + return oldbit; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips___test_and_set_bit(int nr, + volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m" (*addr) : "dIr" (nr) : "memory"); + return oldbit; +} + +#endif /* _IPATH_i386_BIT_OPS_H */ diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h new file mode 100644 index 0000000..ef99d1d --- /dev/null +++ b/include/linux-i386/sysdep.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_i386_SYSDEP_H +#define _IPATH_i386_SYSDEP_H + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + uint32_t a,d; + + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + v = ((uint64_t)a) | (((uint64_t)d)<<32); + + return v; +} + +#ifndef LOCK_PREFIX +#define LOCK_PREFIX "lock " +#endif + +static __inline__ void ips_mb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("mfence" : : : "memory"); +#endif +} + +/* gcc-3.4 has a bug with this function body at -O0 */ +static +#if defined(__GNUC__) && !defined(__PATHCC__) && __GNUC__==3 && __GNUC_MINOR__==4 +#else +__inline__ +#endif +void ips_rmb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("" : : : "memory"); +#endif +} + +static __inline__ void ips_wmb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("sfence" : : : "memory"); +#endif +} + +static __inline__ void ips_sync_writes() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("sfence" : : : "memory"); +#endif +} + +static __inline__ void ips_sync_reads() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("lfence" : : : "memory"); +#endif +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr, + uint32_t old, uint32_t new) +{ + uint32_t prev; + struct xchg_dummy { uint32_t a[100]; }; + + asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old) + : "memory"); + + return prev; +} + +typedef struct { volatile int32_t counter; } ips_atomic_t; + +#define ips_atomic_set(v,i) (((v)->counter) = (i)) +#define ips_atomic_cmpxchg(p,oval,nval) \ + ips_cmpxchg((volatile uint32_t *) &((p)->counter),oval,nval) + +#if 0 +static __inline__ int32_t +ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a" (old_value) : + "r" (new_value) : + "memory"); + return old_value; +} +#endif + +#endif /* _IPATH_i386_SYSDEP_H */ diff --git a/include/linux-ppc/bit_ops.h b/include/linux-ppc/bit_ops.h new file mode 100644 index 0000000..0326bb4 --- /dev/null +++ b/include/linux-ppc/bit_ops.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_ppc64_BIT_OPS_H +#define _IPATH_ppc64_BIT_OPS_H + +#if defined(__powerpc64__) +# define _NRMASK 63 +# define _NRSHIFT 6 +# define _NRSWIZZ 0 +# define _LLARX "ldarx " +# define _STLCX "stdcx. " +#else +# define _NRMASK 31 +# define _NRSHIFT 5 +# define _NRSWIZZ 1 +# define _LLARX "lwarx " +# define _STLCX "stwcx. " +#endif + +static __inline__ unsigned long ips___nrmask(int nr) +{ + return 1UL << (nr & _NRMASK); +} + +static __inline__ int ips___nroffset(int nr) +{ + return (nr >> _NRSHIFT) ^ _NRSWIZZ; +} + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( +"1:" _LLARX "%0,0,%3 \n" + "andc %0,%0,%2 \n" + _STLCX "%0,0,%3 \n" + "bne- 1b" + : "=&r" (old), "=m" (*p) + : "r" (mask), "r" (p), "m" (*p) + : "cc"); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( +"1:" _LLARX "%0,0,%3 \n" + "xor %0,%0,%2 \n" + _STLCX "%0,0,%3 \n" + "bne- 1b" + : "=&r" (old), "=m" (*p) + : "r" (mask), "r" (p), "m" (*p) + : "cc"); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old, t; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( + "eieio \n" +"1:" _LLARX "%0,0,%3 \n" + "or %1,%0,%2 \n" + _STLCX "%1,0,%3 \n" + "bne- 1b \n" + "sync" + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + + return (old & mask) != 0; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + *p &= ~mask; +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + *p ^= mask; +} + +static __inline__ int ips___test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr 
+ ips___nroffset(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +#undef _NRMASK +#undef _NRSHIFT +#undef _NRSWIZZ +#undef _LLARX +#undef _STLCX + +#endif /* _IPATH_ppc64_BIT_OPS_H */ diff --git a/include/linux-ppc/sysdep.h b/include/linux-ppc/sysdep.h new file mode 100644 index 0000000..604096e --- /dev/null +++ b/include/linux-ppc/sysdep.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_ppc64_SYSDEP_H +#define _IPATH_ppc64_SYSDEP_H + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + +#if __WORDSIZE == 64 + asm volatile("mftb %0" : "=r" (v) : ); +#else + uint32_t vu0, vu1, vl; + do { + asm volatile("mftbu %0" : "=r" (vu0) : ); + asm volatile("mftb %0" : "=r" (vl) : ); + asm volatile("mftbu %0" : "=r" (vu1) : ); + } while ( vu0 != vu1 ); + + v = vu1; + v <<= 32; + v |= vl; +#endif + + return v; +} + +static __inline__ void ips_mb() +{ + asm volatile ("sync" : : : "memory"); +} + +static __inline__ void ips_rmb() +{ + asm volatile ("lwsync" : : : "memory"); +} + +static __inline__ void ips_wmb() +{ + asm volatile ("eieio" : : : "memory"); +} + +static __inline__ void ips_sync_writes() +{ + asm volatile("lwsync" : : : "memory"); +} + +static __inline__ void ips_sync_reads() +{ + asm volatile("isync" : : : "memory"); +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *p, uint32_t old, + uint32_t new) +{ + uint32_t prev; + + __asm__ __volatile__ ("\n\ +1: lwarx %0,0,%2 \n\ + cmpw 0,%0,%3 \n\ + bne 2f \n\ + stwcx. %4,0,%2 \n\ + bne- 1b\n\ + sync\n\ +2:" + : "=&r" (prev), "=m" (*p) + : "r" (p), "r" (old), "r" (new), "m" (*p) + : "cc", "memory"); + + return prev; +} + +#endif /* _IPATH_ppc64_SYSDEP_H */ diff --git a/include/valgrind/memcheck.h b/include/valgrind/memcheck.h new file mode 100644 index 0000000..2cbb460 --- /dev/null +++ b/include/valgrind/memcheck.h @@ -0,0 +1,279 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (memcheck.h) only. 
The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of MemCheck, a heavyweight Valgrind tool for + detecting memory errors. + + Copyright (C) 2000-2007 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (memcheck.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __MEMCHECK_H +#define __MEMCHECK_H + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query memory permissions + inside your own programs. + + See comment near the top of valgrind.h on how to use them. +*/ + +#include "valgrind.h" + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/ +typedef + enum { + VG_USERREQ__MAKE_MEM_NOACCESS = VG_USERREQ_TOOL_BASE('M','C'), + VG_USERREQ__MAKE_MEM_UNDEFINED, + VG_USERREQ__MAKE_MEM_DEFINED, + VG_USERREQ__DISCARD, + VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE, + VG_USERREQ__CHECK_MEM_IS_DEFINED, + VG_USERREQ__DO_LEAK_CHECK, + VG_USERREQ__COUNT_LEAKS, + + VG_USERREQ__GET_VBITS, + VG_USERREQ__SET_VBITS, + + VG_USERREQ__CREATE_BLOCK, + + VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, + + /* This is just for memcheck's internal use - don't use it */ + _VG_USERREQ__MEMCHECK_RECORD_OVERLAP_ERROR + = VG_USERREQ_TOOL_BASE('M','C') + 256 + } Vg_MemCheckClientRequest; + + +/* Client-code macros to manipulate the state of memory. */ + +/* Mark memory at _qzz_addr as unaddressable for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_NOACCESS, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similarly, mark memory at _qzz_addr as addressable but undefined + for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_UNDEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similarly, mark memory at _qzz_addr as addressable and defined + for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_DEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similar to VALGRIND_MAKE_MEM_DEFINED except that addressability is + not altered: bytes which are addressable are marked as defined, + but those which are not addressable are left unchanged. */ +#define VALGRIND_MAKE_MEM_DEFINED_IF_ADDRESSABLE(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Create a block-description handle. The description is an ascii + string which is included in any messages pertaining to addresses + within the specified memory range. Has no other effect on the + properties of the memory range. */ +#define VALGRIND_CREATE_BLOCK(_qzz_addr,_qzz_len, _qzz_desc) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__CREATE_BLOCK, \ + _qzz_addr, _qzz_len, _qzz_desc, \ + 0, 0); \ + _qzz_res; \ + })) + +/* Discard a block-description-handle. Returns 1 for an + invalid handle, 0 for a valid handle. */ +#define VALGRIND_DISCARD(_qzz_blkindex) \ + (__extension__ ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__DISCARD, \ + 0, _qzz_blkindex, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Client-code macros to check the state of memory. */ + +/* Check that memory at _qzz_addr is addressable for _qzz_len bytes. + If suitable addressibility is not established, Valgrind prints an + error message and returns the address of the first offending byte. + Otherwise it returns zero. 
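
   For example (an illustrative sketch, not part of the original header):

      char *p = malloc(64);
      VALGRIND_MAKE_MEM_NOACCESS(p + 32, 32);
      if (VALGRIND_CHECK_MEM_IS_ADDRESSABLE(p, 64) != 0)
         ... an error was reported; the first offending byte is p + 32 ...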
*/ +#define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,\ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Check that memory at _qzz_addr is addressable and defined for + _qzz_len bytes. If suitable addressibility and definedness are not + established, Valgrind prints an error message and returns the + address of the first offending byte. Otherwise it returns zero. */ +#define VALGRIND_CHECK_MEM_IS_DEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CHECK_MEM_IS_DEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Use this macro to force the definedness and addressibility of an + lvalue to be checked. If suitable addressibility and definedness + are not established, Valgrind prints an error message and returns + the address of the first offending byte. Otherwise it returns + zero. */ +#define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue) \ + VALGRIND_CHECK_MEM_IS_DEFINED( \ + (volatile unsigned char *)&(__lvalue), \ + (unsigned int)(sizeof (__lvalue))) + +/* Do a memory leak check mid-execution. */ +#define VALGRIND_DO_LEAK_CHECK \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DO_LEAK_CHECK, \ + 0, 0, 0, 0, 0); \ + } + +/* Just display summaries of leaked memory, rather than all the + details */ +#define VALGRIND_DO_QUICK_LEAK_CHECK \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DO_LEAK_CHECK, \ + 1, 0, 0, 0, 0); \ + } + +/* Return number of leaked, dubious, reachable and suppressed bytes found by + all previous leak checks. They must be lvalues. */ +#define VALGRIND_COUNT_LEAKS(leaked, dubious, reachable, suppressed) \ + /* For safety on 64-bit platforms we assign the results to private + unsigned long variables, then assign these to the lvalues the user + specified, which works no matter what type 'leaked', 'dubious', etc + are. We also initialise '_qzz_leaked', etc because + VG_USERREQ__COUNT_LEAKS doesn't mark the values returned as + initialised. */ \ + {unsigned int _qzz_res; \ + unsigned long _qzz_leaked = 0, _qzz_dubious = 0; \ + unsigned long _qzz_reachable = 0, _qzz_suppressed = 0; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__COUNT_LEAKS, \ + &_qzz_leaked, &_qzz_dubious, \ + &_qzz_reachable, &_qzz_suppressed, 0); \ + leaked = _qzz_leaked; \ + dubious = _qzz_dubious; \ + reachable = _qzz_reachable; \ + suppressed = _qzz_suppressed; \ + } + +/* Get the validity data for addresses [zza..zza+zznbytes-1] and copy it + into the provided zzvbits array. Return values: + 0 if not running on valgrind + 1 success + 2 [previously indicated unaligned arrays; these are now allowed] + 3 if any parts of zzsrc/zzvbits are not addressable. + The metadata is not copied in cases 0, 2 or 3 so it should be + impossible to segfault your system by using this call. +*/ +#define VALGRIND_GET_VBITS(zza,zzvbits,zznbytes) \ + (__extension__({unsigned int _qzz_res; \ + char* czza = (char*)zza; \ + char* czzvbits = (char*)zzvbits; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__GET_VBITS, \ + czza, czzvbits, zznbytes, 0, 0 ); \ + _qzz_res; \ + })) + +/* Set the validity data for addresses [zza..zza+zznbytes-1], copying it + from the provided zzvbits array. 
Return values: + 0 if not running on valgrind + 1 success + 2 [previously indicated unaligned arrays; these are now allowed] + 3 if any parts of zza/zzvbits are not addressable. + The metadata is not copied in cases 0, 2 or 3 so it should be + impossible to segfault your system by using this call. +*/ +#define VALGRIND_SET_VBITS(zza,zzvbits,zznbytes) \ + (__extension__({unsigned int _qzz_res; \ + char* czza = (char*)zza; \ + char* czzvbits = (char*)zzvbits; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__SET_VBITS, \ + czza, czzvbits, zznbytes, 0, 0 ); \ + _qzz_res; \ + })) + +#endif + diff --git a/include/valgrind/valgrind.h b/include/valgrind/valgrind.h new file mode 100644 index 0000000..7b82f83 --- /dev/null +++ b/include/valgrind/valgrind.h @@ -0,0 +1,3914 @@ +/* -*- c -*- + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (valgrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2000-2007 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (valgrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query Valgrind's + execution inside your own programs. 
+
+ The resulting executables will still run without Valgrind, just a
+ little bit more slowly than they otherwise would, but otherwise
+ unchanged. When not running on valgrind, each client request
+ consumes very few (e.g. 7) instructions, so the resulting performance
+ loss is negligible unless you plan to execute client requests
+ millions of times per second. Nevertheless, if that is still a
+ problem, you can compile with the NVALGRIND symbol defined (gcc
+ -DNVALGRIND) so that client requests are not even compiled in. */
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi. So
+ we can't use C++ style "//" comments nor the "asm" keyword (instead
+ use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is. Note
+ that in this file we're using the compiler's CPP symbols for
+ identifying architectures, which are different to the ones we use
+ within the rest of Valgrind. Note, __powerpc__ is active for both
+ 32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+ latter (on Linux, that is). */
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#if !defined(_AIX) && defined(__i386__)
+# define PLAT_x86_linux 1
+#elif !defined(_AIX) && defined(__x86_64__)
+# define PLAT_amd64_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__)
+# define PLAT_ppc32_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__)
+# define PLAT_ppc64_linux 1
+#elif defined(_AIX) && defined(__64BIT__)
+# define PLAT_ppc64_aix5 1
+#elif defined(_AIX) && !defined(__64BIT__)
+# define PLAT_ppc32_aix5 1
+#endif
+
+/* If we're not compiling for our target platform, don't generate
+ any inline asms. */
+#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \
+ && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \
+ && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5)
+# if !defined(NVALGRIND)
+# define NVALGRIND 1
+# endif
+#endif
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */
+/* in here of use to end-users -- skip to the next section. */
+/* ------------------------------------------------------------------ */
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+ from the compiled code (analogous to NDEBUG's effects on
+ assert()) */
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ { \
+ (_zzq_rlval) = (_zzq_default); \
+ }
+
+#else /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+ spots and handles magically. Don't look too closely at them as
+ they will rot your brain.
+
+ The assembly code sequences for all architectures are in this one
+ file. This is because this file must be stand-alone, and we don't
+ want to have multiple files.
+
+ For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+ value gets put in the return slot, so that everything works when
+ this is executed not under Valgrind. Args are passed in a memory
+ block, and so there's no intrinsic limit to the number that could
+ be passed, but it's currently five.
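+
+   As a concrete sketch, this is in essence what the RUNNING_ON_VALGRIND
+   macro defined further down expands to:
+
+      unsigned int _qzz_res;
+      VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* returned on a real CPU */,
+                                 VG_USERREQ__RUNNING_ON_VALGRIND,
+                                 0, 0, 0, 0, 0);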
+ + The macro args are: + _zzq_rlval result lvalue + _zzq_default default value (result returned when running on real CPU) + _zzq_request request code + _zzq_arg1..5 request params + + The other two macros are used to support function wrapping, and are + a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the + guest's NRADDR pseudo-register and whatever other information is + needed to safely run the call original from the wrapper: on + ppc64-linux, the R2 value at the divert point is also needed. This + information is abstracted into a user-visible type, OrigFn. + + VALGRIND_CALL_NOREDIR_* behaves the same as the following on the + guest, but guarantees that the branch instruction will not be + redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: + branch-and-link-to-r11. VALGRIND_CALL_NOREDIR is just text, not a + complete inline asm, since it needs to be combined with more magic + inline asm stuff to be useful. +*/ + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "roll $3, %%edi ; roll $13, %%edi\n\t" \ + "roll $29, %%edi ; roll $19, %%edi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EDX = client_request ( %EAX ) */ \ + "xchgl %%ebx,%%ebx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + "xchgl %%ecx,%%ecx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%EAX */ \ + "xchgl %%edx,%%edx\n\t" +#endif /* PLAT_x86_linux */ + +/* ------------------------ amd64-linux ------------------------ */ + +#if defined(PLAT_amd64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ + "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned long long int _zzq_args[6]; \ + volatile unsigned long long int _zzq_result; \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RDX = client_request ( %RAX ) */ \ + "xchgq %%rbx,%%rbx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RAX = guest_NRADDR */ \ + "xchgq %%rcx,%%rcx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_RAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%RAX */ \ + "xchgq %%rdx,%%rdx\n\t" +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[6]; \ + unsigned int _zzq_result; \ + unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[6]; \ + register unsigned long long int _zzq_result __asm__("r3"); \ + register unsigned long long int* _zzq_ptr __asm__("r4"); \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1" \ + : "=r" (_zzq_result) \ + : "0" (_zzq_default), "r" (_zzq_ptr) \ + : "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr __asm__("r3"); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + unsigned int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[7]; \ + register unsigned int _zzq_result; \ + register unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "lwz 3, 24(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \
+ "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned long long int _zzq_args[7]; \
+ register unsigned long long int _zzq_result; \
+ register unsigned long long int* _zzq_ptr; \
+ _zzq_args[0] = (unsigned long long int)(_zzq_request); \
+ _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \
+ _zzq_args[6] = (unsigned long long int)(_zzq_default); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile("mr 4,%1\n\t" \
+ "ld 3, 48(4)\n\t" \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1\n\t" \
+ "mr %0,3" \
+ : "=b" (_zzq_result) \
+ : "b" (_zzq_ptr) \
+ : "r3", "r4", "cc", "memory"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ register unsigned long long int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR_GPR2 */ \
+ "or 4,4,4\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->r2 = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */
+/* ugly. It's the least-worst tradeoff I can think of. */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a. appalling-hack) macros for doing
+ guaranteed-no-redirection calls, so as to get from function
+ wrappers to the functions they are wrapping. The whole point is to
+ construct standard call sequences, but to do the call itself with a
+ special no-redirect call pseudo-instruction that the JIT
+ understands and handles specially. This section is long and
+ repetitious, and I can't see a way to make it shorter.
+
+ The naming scheme is as follows:
+
+ CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+ 'W' stands for "word" and 'v' for "void". Hence there are
+ different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+ and for each, the possibility of returning a word-typed result, or
+ no result.
+*/
+
+/* Use these to write the name of your wrapper. NOTE: duplicates
+ VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \
+ _vgwZU_##soname##_##fnname
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \
+ _vgwZZ_##soname##_##fnname
+
+/* Use this macro from within a wrapper function to collect the
+ context (address and possibly other info) of the original function.
+ Once you have that you can then use it in one of the CALL_FN_
+ macros. The type of the argument _lval is OrigFn.
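+
+   As an illustrative sketch (libfoo.so.0 and foo are hypothetical;
+   "libfooZdsoZd0" is the Z-encoded soname, Zd standing for '.'), a
+   wrapper for int foo(int) could look like:
+
+      int I_WRAP_SONAME_FNNAME_ZU(libfooZdsoZd0, foo) ( int x )
+      {
+         int    r;
+         OrigFn fn;
+         VALGRIND_GET_ORIG_FN(fn);   /* collect the context */
+         CALL_FN_W_W(r, fn, x);      /* call foo without redirection */
+         return r;
+      }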
*/ +#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) + +/* Derivatives of the main macros below, for calling functions + returning void. */ + +#define CALL_FN_v_v(fnptr) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_v(_junk,fnptr); } while (0) + +#define CALL_FN_v_W(fnptr, arg1) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_W(_junk,fnptr,arg1); } while (0) + +#define CALL_FN_v_WW(fnptr, arg1,arg2) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) + +#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +/* These regs are trashed by the hidden call. No need to mention eax + as gcc can already see that, plus causes gcc to bomb. */ +#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" + +/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $4, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $8, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $12, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + 
volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $16, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $20, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $24, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $28, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ 
+ volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $32, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $36, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $40, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ 
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ __asm__ volatile( \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $44, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \
+ arg6,arg7,arg8,arg9,arg10, \
+ arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[13]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ _argvec[12] = (unsigned long)(arg12); \
+ __asm__ volatile( \
+ "pushl 48(%%eax)\n\t" \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $48, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \
+ "rdi", "r8", "r9", "r10", "r11"
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+ long) == 8. */
+
+/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_
+ macros. In order not to trash the stack redzone, we need to drop
+ %rsp by 128 before the hidden call, and restore afterwards. The
+ nastiness is that it is only by luck that the stack still appears
+ to be unwindable during the hidden call - since then the behaviour
+ of any routine using this macro does not match what the CFI data
+ says. Sigh.
+
+ Why is this important? Imagine that a wrapper has a stack
+ allocated local, and passes a pointer to it to the hidden call.
+ Because gcc does not know about the hidden call, it may allocate
+ that local in the redzone. Unfortunately the hidden call may then
+ trash it before it comes to use it.
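+
+   A sketch of the hazard (wrapper and types hypothetical, for
+   illustration only):
+
+      unsigned long r;
+      char buf[8];        /* gcc may well put this in the redzone */
+      OrigFn fn;
+      VALGRIND_GET_ORIG_FN(fn);
+      /* without the %rsp adjustment, the hidden call below could
+         scribble over buf */
+      CALL_FN_W_W(r, fn, (unsigned long)buf);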
So we must step clear of the + redzone, for the duration of the hidden call, to make it safe. + + Probably the same problem afflicts the other redzone-style ABIs too + (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is + self describing (none of this CFI nonsense) so at least messing + with the stack pointer doesn't give a danger of non-unwindable + stack. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX 
\
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[6]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[7]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[8]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $8, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[9]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned
long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $16, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $24, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $32, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + 
_argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $40, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 96(%%rax)\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $48, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +/* This is useful for finding out about the on-stack stuff: + + extern int f9 ( int,int,int,int,int,int,int,int,int ); + extern int f10 ( int,int,int,int,int,int,int,int,int,int ); + extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); + extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); + + int g9 ( void ) { + return f9(11,22,33,44,55,66,77,88,99); + } + int g10 ( void ) { + return f10(11,22,33,44,55,66,77,88,99,110); + } + int g11 ( void ) { + return f11(11,22,33,44,55,66,77,88,99,110,121); + } + int g12 ( void ) { + return f12(11,22,33,44,55,66,77,88,99,110,121,132); + } +*/ + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc32-linux, + sizeof(unsigned long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } 
while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 
8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned 
long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,20(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. 
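+
+ Note on the _argvec layout used below: slot [0] is scratch space in
+ which the asm saves the caller's r2 (the TOC pointer), slot [1]
+ holds the callee's r2 and slot [2] its entry address, with the
+ arguments following. The asm is handed &_argvec[2], so the two TOC
+ slots are reached with negative offsets (-16 and -8) and the
+ arguments with positive ones.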
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define 
CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, 
arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* 
use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand 
stack frame */ \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. 
*/ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "lwz 3," #_n_fr "(1)\n\t" \ + "stw 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + 
"stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; 
\ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" 
(_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : 
/*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,68(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 
24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "ld 3," #_n_fr "(1)\n\t" \ + "std 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned + long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) 
\ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 
24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned 
long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + 
_argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = 
(unsigned long)arg4;                      \
+      _argvec[2+5]  = (unsigned long)arg5;                       \
+      _argvec[2+6]  = (unsigned long)arg6;                       \
+      _argvec[2+7]  = (unsigned long)arg7;                       \
+      _argvec[2+8]  = (unsigned long)arg8;                       \
+      _argvec[2+9]  = (unsigned long)arg9;                       \
+      _argvec[2+10] = (unsigned long)arg10;                      \
+      _argvec[2+11] = (unsigned long)arg11;                      \
+      _argvec[2+12] = (unsigned long)arg12;                      \
+      __asm__ volatile(                                          \
+         "mr 11,%1\n\t"                                          \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                      \
+         "std 2,-16(11)\n\t"  /* save tocptr */                  \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */          \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                      \
+         /* arg12 */                                             \
+         "ld  3,96(11)\n\t"                                      \
+         "std 3,136(1)\n\t"                                      \
+         /* arg11 */                                             \
+         "ld  3,88(11)\n\t"                                      \
+         "std 3,128(1)\n\t"                                      \
+         /* arg10 */                                             \
+         "ld  3,80(11)\n\t"                                      \
+         "std 3,120(1)\n\t"                                      \
+         /* arg9 */                                              \
+         "ld  3,72(11)\n\t"                                      \
+         "std 3,112(1)\n\t"                                      \
+         /* args1-8 */                                           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                     \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                     \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                     \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                     \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                     \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                     \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                     \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                    \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                  \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                 \
+         "mr 11,%1\n\t"                                          \
+         "mr %0,3\n\t"                                           \
+         "ld 2,-16(11)\n\t"   /* restore tocptr */               \
+         VG_CONTRACT_FRAME_BY(144)                               \
+         VG_CONTRACT_FRAME_BY(512)                               \
+         : /*out*/   "=r" (_res)                                 \
+         : /*in*/    "r" (&_argvec[2])                           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS         \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.                */
+/*                                                                      */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* These macros are used by tools -- they must be public, but don't
+   embed them into other programs. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+   ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16))
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
+   This enum comprises an ABI exported by Valgrind to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1 arg
+             function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- eg. can
+             send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support. */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printfs to valgrind log. */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+
+          /* Stack support. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503
+   } Vg_ClientRequest;
+
+#if !defined(__GNUC__)
+#  define __extension__ /* */
+#endif
+
+/* Returns the number of Valgrinds this code is running under.  That
+   is, 0 if running natively, 1 if running under Valgrind, 2 if
+   running under Valgrind which is running under another Valgrind,
+   etc. */
+#define RUNNING_ON_VALGRIND  __extension__                        \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */,          \
+                               VG_USERREQ__RUNNING_ON_VALGRIND,   \
+                               0, 0, 0, 0, 0);                    \
+    _qzz_res;                                                     \
+   })
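[Editor's aside: RUNNING_ON_VALGRIND costs only a handful of instructions when running natively, so it is cheap enough to consult at run time. A minimal usage sketch, assuming the header is reachable as "valgrind/valgrind.h" as it is in this tree; the helper name and the slowdown factor are hypothetical.]

#include "valgrind/valgrind.h"

/* Hypothetical helper: stretch a timeout when the process is being
   simulated, since execution under Valgrind is many times slower.
   The factor of 20 is an arbitrary illustration, not a measurement. */
static unsigned scaled_timeout_ms(unsigned base_ms)
{
   /* RUNNING_ON_VALGRIND is 0 natively, >= 1 under Valgrind. */
   return RUNNING_ON_VALGRIND ? base_ms * 20 : base_ms;
}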
+/* Discard translation of code in the range [_qzz_addr .. _qzz_addr +
+   _qzz_len - 1].  Useful if you are debugging a JITter or some such,
+   since it provides a way to make sure valgrind will retranslate the
+   invalidated area.  Returns no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__DISCARD_TRANSLATIONS,  \
+                               _qzz_addr, _qzz_len, 0, 0, 0);     \
+   }
+
+/* These requests are for getting Valgrind itself to print something.
+   Possibly with a backtrace.  This is a really ugly hack. */
+
+#if defined(NVALGRIND)
+
+#  define VALGRIND_PRINTF(...)
+#  define VALGRIND_PRINTF_BACKTRACE(...)
+
+#else /* NVALGRIND */
+
+/* Modern GCC will optimize the static routine out if unused,
+   and the `unused' attribute will suppress warnings about it. */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF,
+                              (unsigned long)format, (unsigned long)vargs,
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE,
+                              (unsigned long)format, (unsigned long)vargs,
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+#endif /* NVALGRIND */
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For
+   example, if you call a function with them that subsequently calls
+   printf(), there's a high chance Valgrind will crash.  Generally,
+   your prospects of these working are better if the called function
+   does not refer to any global variables, and does not refer to any
+   libc or other functions (printf et al).  Any kind of entanglement
+   with libc or dynamic linking is likely to have a bad outcome, for
+   tricky reasons which we've grappled with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn)                          \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL0,          \
+                               _qyy_fn,                           \
+                               0, 0, 0, 0);                       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1)               \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL1,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, 0, 0, 0);               \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2)    \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL2,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2, 0, 0);       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL3,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2,              \
+                               _qyy_arg3, 0);                     \
+    _qyy_res;                                                     \
+   })
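[Editor's aside: a minimal sketch of the signature contract just described. The callee name is hypothetical and `long' stands in for the word-sized "Word"; per the caveats above, the callee touches no libc functions and no globals.]

/* Runs on the real CPU; receives the simulated thread's ThreadId
   as an implicit argument 0. */
static long add_on_real_cpu(long tid, long a, long b)
{
   (void)tid;        /* unused in this sketch */
   return a + b;     /* no libc, no globals -- see caveats above */
}

static long demo(void)
{
   /* Two user arguments, so the callee takes three parameters. */
   return VALGRIND_NON_SIMD_CALL2(add_on_real_cpu, 40, 2);   /* 42 */
}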
Any kind of entanglement with libc or dynamic linking is + likely to have a bad outcome, for tricky reasons which we've grappled + with a lot in the past. +*/ +#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL0, \ + _qyy_fn, \ + 0, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL1, \ + _qyy_fn, \ + _qyy_arg1, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL2, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL3, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, \ + _qyy_arg3, 0); \ + _qyy_res; \ + }) + +/* Counts the number of errors that have been recorded by a tool. Nb: + the tool must record the errors with VG_(maybe_record_error)() or + VG_(unique_error)() for them to be counted. */ +#define VALGRIND_COUNT_ERRORS \ + __extension__ \ + ({unsigned int _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__COUNT_ERRORS, \ + 0, 0, 0, 0, 0); \ + _qyy_res; \ + }) + +/* Mark a block of memory as having been allocated by a malloc()-like + function. `addr' is the start of the usable block (ie. after any + redzone) `rzB' is redzone size if the allocator can apply redzones; + use '0' if not. Adding redzones makes it more likely Valgrind will spot + block overruns. `is_zeroed' indicates if the memory is zeroed, as it is + for calloc(). Put it immediately after the point where a block is + allocated. + + If you're using Memcheck: If you're allocating memory via superblocks, + and then handing out small chunks of each superblock, if you don't have + redzones on your small blocks, it's worth marking the superblock with + VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are + detected. But if you can put redzones on, it's probably better to not do + this, so that messages for small overruns are described in terms of the + small block rather than the superblock (but if you have a big overrun + that skips over a redzone, you could miss an error this way). See + memcheck/tests/custom_alloc.c for an example. + + WARNING: if your allocator uses malloc() or 'new' to allocate + superblocks, rather than mmap() or brk(), this will not work properly -- + you'll likely get assertion failures during leak detection. This is + because Valgrind doesn't like seeing overlapping heap blocks. Sorry. + + Nb: block must be freed via a free()-like function specified + with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */ +#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MALLOCLIKE_BLOCK, \ + addr, sizeB, rzB, is_zeroed, 0); \ + } + +/* Mark a block of memory as having been freed by a free()-like function. + `rzB' is redzone size; it must match that given to + VALGRIND_MALLOCLIKE_BLOCK. Memory not freed will be detected by the leak + checker. 
Put it immediately after the point where the block is freed. */ +#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__FREELIKE_BLOCK, \ + addr, rzB, 0, 0, 0); \ + } + +/* Create a memory pool. */ +#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, 0, 0); \ + } + +/* Destroy a memory pool. */ +#define VALGRIND_DESTROY_MEMPOOL(pool) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DESTROY_MEMPOOL, \ + pool, 0, 0, 0, 0); \ + } + +/* Associate a piece of memory with a memory pool. */ +#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_ALLOC, \ + pool, addr, size, 0, 0); \ + } + +/* Disassociate a piece of memory from a memory pool. */ +#define VALGRIND_MEMPOOL_FREE(pool, addr) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_FREE, \ + pool, addr, 0, 0, 0); \ + } + +/* Disassociate any pieces outside a particular range. */ +#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_TRIM, \ + pool, addr, size, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MOVE_MEMPOOL, \ + poolA, poolB, 0, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_CHANGE, \ + pool, addrA, addrB, size, 0); \ + } + +/* Return 1 if a mempool exists, else 0. */ +#define VALGRIND_MEMPOOL_EXISTS(pool) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_EXISTS, \ + pool, 0, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Mark a piece of memory as being a stack. Returns a stack id. */ +#define VALGRIND_STACK_REGISTER(start, end) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_REGISTER, \ + start, end, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Unmark the piece of memory associated with a stack id as being a + stack. */ +#define VALGRIND_STACK_DEREGISTER(id) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_DEREGISTER, \ + id, 0, 0, 0, 0); \ + } + +/* Change the start and end address of the stack id. */ +#define VALGRIND_STACK_CHANGE(id, start, end) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_CHANGE, \ + id, start, end, 0, 0); \ + } + +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_ppc32_aix5 +#undef PLAT_ppc64_aix5 + +#endif /* __VALGRIND_H */ diff --git a/infinipath-psm.spec.in b/infinipath-psm.spec.in new file mode 100644 index 0000000..a84f81f --- /dev/null +++ b/infinipath-psm.spec.in @@ -0,0 +1,163 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2010. QLogic Corporation. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +Summary: Intel PSM Libraries +Name: infinipath-psm +Version: @VERSION@ +Release: @RELEASE@ +Epoch: 4 +License: GPL +Group: System Environment/Libraries +URL: http://www.intel.com/ +Source0: %{name}-%{version}-%{release}.tar.gz +Prefix: /usr +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root +Provides: infinipath-psm = %{version} +%if "%{PSM_HAVE_SCIF}" == "1" +Provides: intel-mic-psm = %{version} +%endif +# MIC package +Obsoletes: intel-mic-psm +# OFED package +Obsoletes: infinipath-libs <= %{version}-%{release} +Conflicts: infinipath-libs <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm <= %{version}-%{release} +Conflicts: mpss-psm <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package -n infinipath-psm-devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: infinipath-psm = %{version}-%{release} +Provides: infinipath-psm-devel = %{version} +%if "%{PSM_HAVE_SCIF}" == "1" +Provides: intel-mic-psm-devel = %{version} +%endif +# MIC package +Obsoletes: intel-mic-psm-devel +# OFED package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + +# %package card-devel +# Summary: Development files for Intel Xeon Phi +# Group: System Environment/Development +# Requires: %{name} = %{version}-%{release} +# Requires(post): /sbin/ldconfig +# Requires(postun): /sbin/ldconfig + + +%global debug_package %{nil} + +#PSM_HAVE_SCIF is one of: 0 1 +%{!?PSM_HAVE_SCIF: %global PSM_HAVE_SCIF 0} + +%define INFINIPATH_MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%define INTEL_MAKEARG PSM_HAVE_SCIF=1 MIC=0 +%define INTEL_CARD_MAKEARG PSM_HAVE_SCIF=1 MIC=1 LOCAL_PREFIX=/opt/intel/mic/psm +%define card_prefix /opt/intel/mic/psm + +%if "%{PSM_HAVE_SCIF}" == "0" + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%else + %if "%{PSM_HAVE_SCIF}" == "1" + %define MAKEARG PSM_HAVE_SCIF=1 MIC=0 + %else + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 + %define PSM_HAVE_SCIF "1" + %endif +%endif + +%description +The PSM Messaging API, or PSM API, is Intel's low-level +user-level 
communications interface for the True Scale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. + +%description -n infinipath-psm-devel +Development files for the libpsm_infinipath library + +%prep +%setup -q -n %{name}-%{version}-%{release} + +%build +%{__make} @PSM_UUID@ %{MAKEARG} + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT +export DESTDIR=$RPM_BUILD_ROOT +%{__make} install %{MAKEARG} + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + +%files +%defattr(-,root,root,-) +/usr/lib64/libpsm_infinipath.so.* +/usr/lib64/libinfinipath.so.* +%if "%{PSM_HAVE_SCIF}" == "1" +/usr/sbin/psmd +%endif + +%files -n infinipath-psm-devel +%defattr(-,root,root,-) +/usr/lib64/libpsm_infinipath.so +/usr/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h + + + +%changelog +* Fri Sep 25 2015 Henry Estela - @VERSION@-1 +- Always build infinipath-psm with different Provides names. +* Tue Nov 6 2012 Mitko Haralanov - @VERSION@-1 +- Add Intel Xeon Phi related changes +* Tue May 11 2010 Mitko Haralanov - @VERSION@-1 +- Initial build. + diff --git a/intel-mic-psm-card.spec.in b/intel-mic-psm-card.spec.in new file mode 100644 index 0000000..44a3123 --- /dev/null +++ b/intel-mic-psm-card.spec.in @@ -0,0 +1,112 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+
+%define debug_package %{nil}
+%{!?install_prefix:%define install_prefix /usr}
+
+Summary: Intel PSM Libraries for Intel Xeon Phi
+Name: intel-mic-psm-card
+Version: @VERSION@
+Release: @RELEASE@
+License: GPL
+Group: System Environment/Daemon
+URL: http://www.intel.com/
+Source0: %{name}-%{version}-%{release}.tar.gz
+Prefix: /usr
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+@REQUIRES@
+
+%package devel
+Summary: Development files for Intel Xeon Phi
+Group: System Environment/Development
+Requires: %{name} = %{version}-%{release}
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+
+%description
+The PSM Messaging API, or PSM API, is Intel's low-level
+user-level communications interface for the True Scale
+family of products. PSM users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+
+%description devel
+Development files for the libpsm_infinipath library
+
+%prep
+%setup -q -n %{name}-%{version}-%{release}
+
+%build
+%{__make}
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT
+%{make_install}
+%if %(test "%{install_prefix}" = "/usr" && echo 0 || echo 1)
+  cp -a mic/* $RPM_BUILD_ROOT
+  find $RPM_BUILD_ROOT/ -name "*.in" -exec rm -f {} \;
+%endif
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+%post devel -p /sbin/ldconfig
+%postun devel -p /sbin/ldconfig
+
+%files
+%defattr(-,root,root,-)
+%{install_prefix}/lib64/libpsm_infinipath.so.*
+%{install_prefix}/lib64/libinfinipath.so.*
+%if %(test "%{install_prefix}" = "/usr" && echo 0 || echo 1)
+  %{install_prefix}/psm.filelist
+  %{_sysconfdir}/sysconfig/mic/conf.d/psm.conf
+%endif
+
+%files devel
+%defattr(-,root,root,-)
+%{install_prefix}/lib64/libpsm_infinipath.so
+%{install_prefix}/lib64/libinfinipath.so
+
+%changelog
+* Thu Apr 11 2013 Mitko Haralanov
+- Remove any unwanted files before packaging
+* Wed Nov 28 2012 Mitko Haralanov
+- Add Xeon Phi devel package
+* Thu Nov 9 2012 Mitko Haralanov
+- Add TMI to package
+* Mon Nov 5 2012 Mitko Haralanov
+- Initial build.
+
diff --git a/intel-mic-psm.spec.in b/intel-mic-psm.spec.in
new file mode 100644
index 0000000..71d9021
--- /dev/null
+++ b/intel-mic-psm.spec.in
@@ -0,0 +1,207 @@
+# Copyright (c) 2012. Intel Corporation. All rights reserved.
+# Copyright (c) 2010. QLogic Corporation. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#  - Redistributions of source code must retain the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer.
+#
+#  - Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +Summary: Intel PSM Libraries +Name: intel-mic-psm +Version: @VERSION@ +Release: @RELEASE@ +License: GPL +Group: System Environment/Libraries +URL: http://www.intel.com/ +Source0: %{name}-%{version}-%{release}.tar.gz +Prefix: /usr +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root +Provides: %{name} = %{version} +# ifs package +Obsoletes: infinipath-libs <= %{version}-%{release} +Conflicts: infinipath-libs <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm <= %{version}-%{release} +Conflicts: mpss-psm <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: %{name} = %{version}-%{release} +Provides: %{name}-devel = %{version} +# ifs package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + + +%package -n infinipath-psm +Summary: QLogic PSM Libraries +Epoch: 4 +License: GPL +Group: System Environment/Libraries +URL: http://www.qlogic.com/ +Prefix: /usr +Provides: infinipath-psm = %{version} +Conflicts: infinipath-libs intel-mic-psm +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package -n infinipath-psm-devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: infinipath-psm = %{version}-%{release} +Provides: infinipath-psm-devel = %{version} +# ifs package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + +# %package card-devel +# Summary: Development files for Intel Xeon Phi +# Group: System Environment/Development +# Requires: %{name} = %{version}-%{release} +# Requires(post): /sbin/ldconfig +# Requires(postun): /sbin/ldconfig + + +%global debug_package %{nil} + +#%{!?install_prefix:%define install_prefix /usr} +#PSM_HAVE_SCIF is one of: 0 1 +%{!?PSM_HAVE_SCIF: %global PSM_HAVE_SCIF 0} + +%define INFINIPATH_MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%define INTEL_MAKEARG PSM_HAVE_SCIF=1 MIC=0 +%define INTEL_CARD_MAKEARG PSM_HAVE_SCIF=1 MIC=1 LOCAL_PREFIX=/opt/intel/mic/psm +%define card_prefix /opt/intel/mic/psm + +%if "%{PSM_HAVE_SCIF}" == "0" + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%else + %if "%{PSM_HAVE_SCIF}" == "1" + %define MAKEARG PSM_HAVE_SCIF=1 MIC=0 + %else + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 + %define PSM_HAVE_SCIF "1" + %endif +%endif + +%description +The PSM Messaging API, or PSM API, is Intel's low-level +user-level communications interface for the True Scale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. 
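+# For readers coming to this tree cold: the "mechanisms" mentioned above are
+# the C entry points declared in psm.h and psm_mq.h. A minimal bring-up looks
+# roughly like the sketch below; this is an illustration against the headers
+# shipped in this tree, not a reference (error handling elided, and in a real
+# job the uuid key is generated once and distributed to every rank rather
+# than generated per process):
+#
+#   int ver_major = PSM_VERNO_MAJOR, ver_minor = PSM_VERNO_MINOR;
+#   psm_init(&ver_major, &ver_minor);         /* negotiate library version */
+#
+#   psm_uuid_t job_key;
+#   psm_uuid_generate(job_key);               /* job-wide key, shared by peers */
+#
+#   struct psm_ep_open_opts opts;
+#   psm_ep_open_opts_get_defaults(&opts);
+#   psm_ep_t ep; psm_epid_t epid;
+#   psm_ep_open(job_key, &opts, &ep, &epid);  /* open the local endpoint */
+#
+#   psm_mq_t mq;
+#   psm_mq_init(ep, PSM_MQ_ORDERMASK_ALL, NULL, 0, &mq); /* matched queues */
+#
+# Peers are then connected with psm_ep_connect() and exchange messages
+# through the psm_mq_* calls.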
+ +%description devel +Development files for the libpsm_infinipath library + +%description -n infinipath-psm +The PSM Messaging API, or PSM API, is QLogic's low-level +user-level communications interface for the Truescale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. + +%description -n infinipath-psm-devel +Development files for the libpsm_infinipath library + +%prep +%setup -q -n %{name}-%{version}-%{release} + +%build +%{__make} @PSM_UUID@ %{MAKEARG} + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT +export DESTDIR=$RPM_BUILD_ROOT +%{__make} install %{MAKEARG} + + + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + +%if "%{PSM_HAVE_SCIF}" == "1" +%files +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so.* +%{install_prefix}/lib64/libinfinipath.so.* +/usr/sbin/psmd + +%files devel +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so +%{install_prefix}/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h +%endif + + +%if "%{PSM_HAVE_SCIF}" == "0" +%files -n infinipath-psm +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so.* +%{install_prefix}/lib64/libinfinipath.so.* + +%files -n infinipath-psm-devel +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so +%{install_prefix}/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h +%endif + + + +%changelog +* Tue Nov 6 2012 Mitko Haralanov - @VERSION@-1 +- Add Intel Xeon Phi related changes +* Tue May 11 2010 Mitko Haralanov - @VERSION@-1 +- Initial build. + diff --git a/ipath-psm-devel.srclist.in b/ipath-psm-devel.srclist.in new file mode 100644 index 0000000..a1dc132 --- /dev/null +++ b/ipath-psm-devel.srclist.in @@ -0,0 +1,4 @@ +/usr/include/psm.h +/usr/include/psm_mq.h +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/ipath-psm.srclist.in b/ipath-psm.srclist.in new file mode 100644 index 0000000..97a45ff --- /dev/null +++ b/ipath-psm.srclist.in @@ -0,0 +1,4 @@ +%LIBPREFIX%/libinfinipath.so.4 +%LIBPREFIX%/libinfinipath.so.4.0 +%LIBPREFIX%/libpsm_infinipath.so.1 +%LIBPREFIX%/libpsm_infinipath.so.1.15 diff --git a/ipath/Makefile b/ipath/Makefile new file mode 100644 index 0000000..8c2cc6e --- /dev/null +++ b/ipath/Makefile @@ -0,0 +1,98 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +TARGLIB := libinfinipath +MAJOR := $(IPATH_LIB_MAJOR) +MINOR := $(IPATH_LIB_MINOR) + +include $(top_srcdir)/buildflags.mak +BASECFLAGS += -D_GNU_SOURCE +INCLUDES += -I$(top_srcdir)/ptl_ips + +ifeq (${arch},x86_64) + PLATFORM_OBJ=ipath_dwordcpy-x86_64-fast.o +else + PLATFORM_OBJ= +endif + +${TARGLIB}-objs := ipath_debug.o ipath_time.o ipath_proto.o \ + ipath_utils.o ipath_service.o ipath_protomic.o \ + ipath_dwordcpy-$(arch).o ipath_i2cflash.o ipath_sysfs.o ipath_syslog.o \ + ipath_write_pio-$(arch).o $(PLATFORM_OBJ) + +all .DEFAULT: ${TARGLIB}.so + +install: all + install -D ${TARGLIB}.so.${MAJOR}.${MINOR} \ + ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR} + (cd ${DESTDIR}${INSTALL_LIB_TARG} ; \ + ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \ + ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so) + +${TARGLIB}.so: ${TARGLIB}.so.${MAJOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +${TARGLIB}.so.${MAJOR}: ${TARGLIB}.so.${MAJOR}.${MINOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +# when we build the shared library, generate a revision and date +# string in it, for easier id'ing when people may have copied the +# file around. Generate it such that the ident command can find it +# and strings -a | grep InfiniPath does a reasonable job as well. +${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} + date +'static __attribute__ ((unused)) char __psc_infinipath_revision[] ="$$""Date: %F %R ${rpm_extra_description}InfiniPath $$";' > _revision.c + $(CC) -c $(BASECFLAGS) $(INCLUDES) _revision.c -o _revision.o + $(CC) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \ + -Wl,--unique='*fastpath*' \ + ${${TARGLIB}-objs} _revision.o $(LDFLAGS) $(if $(MIC:0=),$(SCIF_LINK_FLAGS)) + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) $(if $(MIC:0=),$(SCIF_INCLUDE_FLAGS)) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + +ipath_debug.o: WERROR := +# This is temporarily necessary in order to get backtrace to work. Bug 3536 +ipath_debug.o: ipath_debug.c + $(CC) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_write_pio-ppc.o: ipath_write_pio-ppc.c + $(CC) $(CFLAGS) -maltivec $(INCLUDES) -c $< -o $@ + +ipath_write_pio-ppc64.o: ipath_write_pio-ppc64.c + $(CC) $(CFLAGS) -maltivec $(INCLUDES) -c $< -o $@ + +clean: + rm -f _revision.c + rm -f *.o ${TARGLIB}.* diff --git a/ipath/ipath_debug.c b/ipath/ipath_debug.c new file mode 100644 index 0000000..b89502f --- /dev/null +++ b/ipath/ipath_debug.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ipath_user.h" + +unsigned infinipath_debug = 1; +char* __ipath_mylabel = NULL; +FILE *__ipath_dbgout; +static void init_ipath_mylabel(void) __attribute__ ((constructor)); +static void init_ipath_backtrace(void) __attribute__ ((constructor)); +static void init_ipath_dbgfile(void) __attribute__ ((constructor)); +static void fini_ipath_backtrace(void) __attribute__ ((destructor)); + +static void init_ipath_mylabel(void) +{ + char lbl[1024]; + char hostname[80]; + char *e; + /* By default, try to come up with a decent default label, it will be + * overriden later. Try getting rank, if that's not available revert to + * pid. */ + gethostname(hostname, 80); + lbl[0] = '\0'; + hostname[sizeof hostname - 1] = '\0'; + if ((((e = getenv("PSC_MPI_RANK")) && *e)) || + (((e = getenv("MPI_RANKID")) && *e)) || + (((e = getenv("MPIRUN_RANK")) && *e))) + { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + snprintf(lbl, 1024, "%s.%lu", hostname, val); + } + if (lbl[0] == '\0') + snprintf(lbl, 1024, "%s.%u", hostname, getpid()); + __ipath_mylabel = strdup(lbl); +} + +static void +ipath_sighdlr(int sig, siginfo_t *p1, void *ucv) +{ + // we make these static to try and avoid issues caused + // by stack overflow that might have gotten us here. + static void *backaddr[128]; // avoid stack usage + static char buf[150], hname[64], fname[128]; + static int i, j, fd, id; + static int write_result __unused__; + extern char *__progname; + + // If this is a SIGINT do not display backtrace. 
Just invoke exit handlers
+    if ((sig == SIGINT) || (sig == SIGTERM))
+        exit(1);
+
+    id = snprintf(buf, sizeof buf,
+        "\n%.60s:%u terminated with signal %d", __progname, getpid(), sig);
+    if(ucv) {
+        static ucontext_t *uc;
+        uc = (ucontext_t*)ucv;
+        id += snprintf(buf+id, sizeof buf-id, " at PC=%lx SP=%lx",
+#if defined(__x86_64__)
+            (unsigned long)uc->uc_mcontext.gregs[REG_RIP],
+            (unsigned long)uc->uc_mcontext.gregs[REG_RSP]);
+#elif defined(__i386__)
+            (unsigned long)uc->uc_mcontext.gregs[REG_EIP],
+            (unsigned long)uc->uc_mcontext.gregs[REG_ESP]);
+#else
+            0ul, 0ul);
+#warning No stack pointer or instruction pointer for this arch
+#endif
+    }
+    id += snprintf(buf+id, sizeof buf-id, ". Backtrace:\n");
+    write_result = write(2, buf, id);
+
+    i = backtrace(backaddr, sizeof(backaddr)/sizeof(backaddr[0]));
+    if(i>2) // skip ourselves and backtrace
+        j=2,i-=j;
+    else
+        j=0;
+    backtrace_symbols_fd(backaddr+j, i, 2);
+    (void)fsync(2);
+
+    // try to write it to a file as well, in case the rest doesn't make it out.
+    // Do it second, in case we get a second failure (more likely).
+    // We might eventually want to print some more of the registers to the
+    // btr file, to aid debugging, but not for now.
+    // Truncate the program name if overly long, so we always get pid and (at least part of)
+    // hostname.
+    (void)gethostname(hname, sizeof hname);
+    hname[sizeof(hname) - 1] = '\0';
+    snprintf(fname, sizeof fname, "%.80s-%u,%.32s.btr", __progname, getpid(), hname);
+    if((fd=open(fname, O_CREAT|O_WRONLY, 0644))>=0) {
+        write_result = write(fd, buf, id);
+        backtrace_symbols_fd(backaddr+j, i, fd);
+        (void)fsync(fd);
+        (void)close(fd);
+    }
+    exit(1); // not _exit(), want atexit handlers to get run
+}
+
+static struct sigaction sigsegv_act;
+static struct sigaction sigbus_act;
+static struct sigaction sigill_act;
+static struct sigaction sigabrt_act;
+static struct sigaction sigint_act;
+static struct sigaction sigterm_act;
+
+// we do this as a constructor so any user program that sets signal
+// handlers for these will override our settings, but we still
+// get backtraces if they don't
+static void init_ipath_backtrace(void)
+{
+    // we need to track memory corruption
+    static struct sigaction act; // easier than memset
+    act.sa_sigaction = ipath_sighdlr;
+    act.sa_flags = SA_SIGINFO;
+
+    if(!getenv("IPATH_NO_BACKTRACE")) {// permanent, although probably
+        // undocumented way to disable backtraces.
+        (void)sigaction(SIGSEGV, &act, &sigsegv_act);
+        (void)sigaction(SIGBUS, &act, &sigbus_act);
+        (void)sigaction(SIGILL, &act, &sigill_act);
+        (void)sigaction(SIGABRT, &act, &sigabrt_act);
+        (void)sigaction(SIGINT, &act, &sigint_act);
+        (void)sigaction(SIGTERM, &act, &sigterm_act);
+    }
+}
+
+static void fini_ipath_backtrace(void)
+{
+    if(!getenv("IPATH_NO_BACKTRACE")) {
+        (void)sigaction(SIGSEGV, &sigsegv_act, NULL);
+        (void)sigaction(SIGBUS, &sigbus_act, NULL);
+        (void)sigaction(SIGILL, &sigill_act, NULL);
+        (void)sigaction(SIGABRT, &sigabrt_act, NULL);
+        (void)sigaction(SIGINT, &sigint_act, NULL);
+        (void)sigaction(SIGTERM, &sigterm_act, NULL);
+    }
+}
+
+// if IPATH_DEBUG_FILENAME is set in the environment, then all the
+// debug prints (not info and error) will go to that file.
+// %h is expanded to the hostname, and %p to the pid, if present.
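+// For example (hypothetical values): with IPATH_DEBUG_FILENAME set to
+// "/tmp/psm.%h.%p", a process with pid 4321 on host "node01" appends its
+// debug output to "/tmp/psm.node01.4321"; if the file can't be opened,
+// the code below falls back to stdout.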
+static void init_ipath_dbgfile(void) +{ + char *fname = getenv("IPATH_DEBUG_FILENAME"); + char *exph, *expp, tbuf[1024]; + FILE *newf; + + if(!fname) { + __ipath_dbgout = stdout; + return; + } + exph = strstr(fname, "%h"); // hostname + expp = strstr(fname, "%p"); // pid + if(exph || expp) { + int baselen; + char hname[256], pid[12]; + if(exph) { + *hname = hname[sizeof(hname)-1] = 0; + gethostname(hname, sizeof(hname)-1); + if(!*hname) + strcpy(hname, "[unknown]"); + } + if(expp) + snprintf(pid, sizeof pid, "%d", getpid()); + if(exph && expp) { + if(exph < expp) { + baselen = exph - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%.*s%s%s", + baselen, fname, hname, + (int)(expp - (exph+2)), exph+2, pid, expp+2); + } + else { + baselen = expp - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%.*s%s%s", + baselen, fname, pid, + (int)(exph - (expp+2)), expp+2, hname, exph+2); + } + } + else if(exph) { + baselen = exph - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%s", + baselen, fname, hname, exph+2); + } + else { + baselen = expp - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%s", + baselen, fname, pid, expp+2); + } + fname = tbuf; + } + newf = fopen(fname, "a"); + if(!newf) { + _IPATH_ERROR("Unable to open \"%s\" for debug output, using stdout: %s\n", + fname, strerror(errno)); + __ipath_dbgout = stdout; + } + else { + __ipath_dbgout = newf; + setlinebuf(__ipath_dbgout); + } +} + +void ipath_set_mylabel(char* label) +{ + __ipath_mylabel = label; +} + +char *ipath_get_mylabel() +{ + return __ipath_mylabel; +} diff --git a/ipath/ipath_dwordcpy-generic.c b/ipath/ipath_dwordcpy-generic.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-generic.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_dwordcpy-i386.S b/ipath/ipath_dwordcpy-i386.S new file mode 100644 index 0000000..970651c --- /dev/null +++ b/ipath/ipath_dwordcpy-i386.S @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + .globl ipath_dwordcpy + .file "ipath_dword32cpy.S" + .text + .p2align 4,,15 +ipath_dwordcpy: + // standard C calling convention, args on stack + // does not return any value + .type ipath_dwordcpy, @function + // save caller-saved regs + mov %edi,%eax + mov %esi,%edx + + // setup regs + mov 0xc(%esp,1),%ecx + mov 0x4(%esp,1),%edi + mov 0x8(%esp,1),%esi + // and do it + cld + rep + movsd + + // restore + mov %eax,%edi + mov %edx,%esi + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/ipath/ipath_dwordcpy-ppc64.c b/ipath/ipath_dwordcpy-ppc64.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-ppc64.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_dwordcpy-x86_64-fast.S b/ipath/ipath_dwordcpy-x86_64-fast.S new file mode 100644 index 0000000..6465aae --- /dev/null +++ b/ipath/ipath_dwordcpy-x86_64-fast.S @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + .globl ipath_dwordcpy + .file "ipath_dwordcpy-x86_64-fast.S" + .text + .p2align 4,,15 + // standard C calling convention, rdi is dest, rsi is source, rdx is count + // does not return any value +ipath_dwordcpy: + .type ipath_dwordcpy, @function + movl %edx,%ecx + shrl $1,%ecx + andl $1,%edx + cld + rep + movsq + movl %edx,%ecx + rep + movsd + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/ipath/ipath_dwordcpy-x86_64.c b/ipath/ipath_dwordcpy-x86_64.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-x86_64.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_i2cflash.c b/ipath/ipath_i2cflash.c new file mode 100644 index 0000000..e906895 --- /dev/null +++ b/ipath/ipath_i2cflash.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +uint8_t +ipath_flash_csum(struct ipath_flash *ifp, int adjust) +{ + uint8_t *ip = (uint8_t*)ifp; + uint8_t csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if(adjust) + ifp->if_csum = csum; + return csum; +} + diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c new file mode 100644 index 0000000..5f9365f --- /dev/null +++ b/ipath/ipath_proto.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. 
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MIC__ +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) + +// don't inline these; it's all init code, and not inlining makes the +// overall code shorter and easier to debug. +static void ipath_setaffinity(int) __attribute__ ((noinline)); + +// set the processor affinity based upon the assigned context. +// We want to do this early, before much memory is allocated +// (by user or kernel code) so that we get memory allocated on +// the node upon which we will be running. This was done in the +// MPI init code, but that's way too late... +// +// We need to know both the context, and the unit (chip) that we are +// using. If we have more than 2 cpus, and we have more than one +// chip, we use the unit number as part of the algorithm, so that +// we try to stay on a cpu close to the chip that we are using. +// +// This will need more work; it isn't really right yet for dual core, +// dual cpu. We may change the command to just return the cpu that +// should be used for affinity, eventually. +// Since user contextss start at 1, we subtract one. 
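+// (The recommendation applied below comes from the driver, via the rec_cpu
+// field of struct ipath_ctxt_info.)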
+// The "same" code is done as part of MPI_Init, if the job is only
+// using shared memory, no infinipath
+static void ipath_setaffinity(int fd)
+{
+    struct ipath_ctxt_info info;
+    struct ipath_cmd cmd;
+    cpu_set_t cpuset;
+
+    if(getenv("IPATH_NO_CPUAFFINITY")) {
+        _IPATH_PRDBG("Skipping processor affinity, $IPATH_NO_CPUAFFINITY set\n");
+        return;
+    }
+
+    memset(&cmd, 0, sizeof(struct ipath_cmd));
+    memset(&info, 0, sizeof(struct ipath_ctxt_info));
+    cmd.type = IPATH_CMD_CTXT_INFO;
+    cmd.cmd.ctxt_info = (uintptr_t) &info;
+    if(ipath_cmd_write(fd, &cmd, sizeof(cmd)) == -1) {
+        _IPATH_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+        return;
+    }
+    if(!info.num_active || !info.context) {
+        _IPATH_INFO("CTXT_INFO: %u active contexts unit %u:%u %u/%u, skip cpu affinity\n",
+            info.num_active, info.unit, info.port, info.context, info.subcontext);
+        return;
+    }
+
+    if(info.rec_cpu == (__u16)-1) {
+        _IPATH_PRDBG("Skipping processor affinity, set already or no "
+            "unallocated cpu\n");
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(info.rec_cpu, &cpuset);
+    if(sched_setaffinity(0,sizeof cpuset, &cpuset))
+        _IPATH_INFO("Couldn't set runon processor %u (unit:context %u:%u) (%u active chips): %s\n",
+            info.rec_cpu, info.unit, info.context, info.num_active, strerror(errno));
+    else
+        _IPATH_PRDBG("Set CPU affinity to %u, context %u:%u:%u (%u active chips)\n",
+            info.rec_cpu, info.unit, info.context, info.subcontext, info.num_active);
+}
+
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this (still! Oct 07) isn't
+// implemented. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned as part of ipath_base_info.
+struct _ipath_ctrl *ipath_userinit(int fd, struct ipath_user_info *u,
+    struct ipath_base_info *b)
+{
+    struct _ipath_ctrl *spctrl = NULL;
+    void *tmp;
+    uint64_t *tmp64;
+    struct stat st;
+    struct ipath_cmd c;
+    size_t usize;
+    uintptr_t pg_mask;
+    __u64 pioavailaddr;
+    uint64_t uregbase;
+    int __ipath_pg_sz;
+
+    /* First get the page size */
+    __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+    pg_mask = ~ (intptr_t) (__ipath_pg_sz - 1);
+
+    u->spu_base_info_size = sizeof(*b);
+    u->spu_base_info = (uint64_t)(uintptr_t) b;
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_ASSIGN_CONTEXT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_assign_context(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("assign_context command failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    ipath_setaffinity(fd); // prior to memory allocation in driver, etc.
+
+    c.type = IPATH_CMD_USER_INIT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_user_init(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("userinit command failed: %s\n", strerror(errno));
+        goto err;
+    }
+    /*
+     * If header redirection is enabled, there will be a shared subcontext
+     * with the kernel that we have to examine.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_CTXT_REDIRECT)
+        u->spu_subcontext_cnt = 1;
+
+    _IPATH_PRDBG("Driver is %sQLogic-built\n",
+        ((1<<31)&b->spi_sw_version) ?
"" : "not "); + if((0x7fff&(b->spi_sw_version >> 16)) != IPATH_USER_SWMAJOR) { + _IPATH_INFO + ("User major version 0x%x not same as driver major 0x%x\n", + IPATH_USER_SWMAJOR, b->spi_sw_version >> 16); + if((b->spi_sw_version >> 16) < IPATH_USER_SWMAJOR) + goto err; // else assume driver knows how to be compatible + } + else if ((b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_PRDBG("User minor version 0x%x not same as driver minor 0x%x\n", + IPATH_USER_SWMINOR, b->spi_sw_version & 0xffff); + if ((b->spi_sw_version & 0xffff) < IPATH_USER_SWMINOR) + b->spi_sendbuf_status = 0; + } + + if (u->spu_subcontext_cnt && + (b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_INFO("Mismatched user minor version (%d) and driver " + "minor version (%d) while context sharing. Ensure " + "that driver and library are from the same " + "release.\n", + IPATH_USER_SWMINOR, + (int) (b->spi_sw_version & 0xffff)); + } + +#ifdef PSM_DEBUG + _IPATH_PRDBG("spi_subcontext = %d\n", (int) b->spi_subcontext); + _IPATH_PRDBG("spi_subctxt_uregbase = 0x%llx\n", (unsigned long long) b->spi_subctxt_uregbase); + _IPATH_PRDBG("spi_subctxt_rcvegrbuf = 0x%llx\n", (unsigned long long) b->spi_subctxt_rcvegrbuf); + _IPATH_PRDBG("spi_subctxt_rcvhdr_base = 0x%llx\n", (unsigned long long) b->spi_subctxt_rcvhdr_base); + _IPATH_PRDBG("spu_subcontext_cnt = %d\n", (int) u->spu_subcontext_cnt); + _IPATH_PRDBG("spu_subcontext_id = %d\n", (int) u->spu_subcontext_id); +#endif + + if(!(spctrl = calloc(1, sizeof(struct _ipath_ctrl)))) { + _IPATH_INFO("can't allocate memory for ipath_ctrl: %s\n", + strerror(errno)); + goto err; + } + + /* Check if we need to turn off header suppression in hardware and + * emulate it in software. Since the driver disables all TID flow + * entries we don't need to do anything just fake it that this + * looks like Linda. + * Note: This will break the hardware detection heuristics where we + * determine that a card is QLE73XX by looking at the capability to + * support header suppression! Need the driver to provide the requisite + * information so we can move away from heuristics based on flags. + */ + { + const char *env; + + if ((env = getenv("IPATH_HW_HEADER_SUPPRESSION")) && (*env != '\0')) { + int hwsupp = (int) strtol(env, NULL, 0); + + if (!hwsupp && (b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) { + _IPATH_INFO("Disabling hardware suppresion!\n"); + b->spi_runtime_flags &= ~IPATH_RUNTIME_HDRSUPP; + } + } /* Env */ + + } + + + usize = b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP ? + 2 * __ipath_pg_sz : __ipath_pg_sz; + _IPATH_DBG("uregbase=%llx usize=%u context=%d\n", + (unsigned long long) b->spi_uregbase, + (unsigned) usize, (int) b->spi_context); + + // now mmap in the rcvhdrq, egr bufs, PIO buffers and user regs + // _ipath_uregbase is the user regs; not offset as it is in the kernel + uregbase = b->spi_uregbase; + if((tmp=ipath_mmap64(0, usize, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t)b->spi_uregbase)) == MAP_FAILED) { + _IPATH_INFO("mmap of user registers at %llx failed: %s\n", + (long long unsigned)b->spi_uregbase, + strerror(errno)); + goto err; + } + + _IPATH_MMDBG("mmap user regs from kernel %llx to %p (0x%lx bytes)\n", + (long long unsigned) b->spi_uregbase, tmp, + (unsigned long)usize); + + // we don't try to fault these in, no need + tmp64 = (uint64_t *)tmp; + b->spi_uregbase = (uint64_t)(uintptr_t)tmp; + spctrl->spc_dev.spd_uregbase = (volatile uint64_t*) tmp; + + /* + * Set up addresses for optimized register writeback routines. 
+     * This is for the real onchip registers, shared context or not
+     */
+    spctrl->__ipath_rcvhdrhead = (uint32_t*)&tmp64[ur_rcvhdrhead];
+    spctrl->__ipath_rcvegrhead = (uint32_t*)&tmp64[ur_rcvegrindexhead];
+    spctrl->__ipath_rcvegrtail = (uint32_t*)&tmp64[ur_rcvegrindextail];
+
+    if (!(b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) {
+        _IPATH_DBG("HdrSupp not available. Using virt tidflow table.\n");
+        spctrl->__ipath_rcvtidflow = spctrl->regs;
+        spctrl->__ipath_tidflow_wmb = &spctrl->tidflow_wmb_location;
+    }
+    else {
+        spctrl->__ipath_rcvtidflow = (uint32_t*)&tmp64[ur_rcvtidflow];
+        spctrl->__ipath_tidflow_wmb = (__le32*)spctrl->__ipath_rcvegrtail;
+    }
+
+    /* map the receive tidflow table in QLE73XX */
+    _IPATH_DBG("rcvtidflow=%p offset=0x%lx\n",
+        spctrl->__ipath_rcvtidflow,
+        (long) ((uintptr_t) spctrl->__ipath_rcvtidflow - (uintptr_t) tmp64));
+
+    { char *maxpio; uint32_t numpio;
+    maxpio = getenv("IPATH_MAXPIO");
+    if(maxpio && (numpio=strtoul(maxpio, NULL, 0))>0 &&
+        numpio < b->spi_piocnt) {
+        _IPATH_INFO("$IPATH_MAXPIO is %u, reducing PIO buffer count from %u\n",
+            numpio, b->spi_piocnt);
+        b->spi_piocnt = numpio;
+    }
+    }
+
+    // map in the PIO buffers, much like ureg, since it's
+    // in the chip address space
+    if((tmp=ipath_mmap64(0, b->spi_pioalign*b->spi_piocnt,
+            PROT_WRITE, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)b->spi_piobufbase)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of pio buffers at %llx failed: %s\n",
+            (long long unsigned)b->spi_piobufbase,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap PIO buffers from kernel %llx, %u pages to %p\n",
+            (unsigned long long)b->spi_piobufbase, b->spi_piocnt, tmp);
+        // Do not try to read the PIO buffers; they are mapped write
+        // only. We'll fault them in as we write to them.
+        b->spi_piobufbase = (uintptr_t)tmp;
+    }
+
+    if (b->spi_sendbuf_status) {
+        if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+                (__off64_t)b->spi_sendbuf_status)) == MAP_FAILED) {
+            _IPATH_INFO("mmap of send buffer status page at %llx failed: %s\n",
+                (long long unsigned)b->spi_sendbuf_status,
+                strerror(errno));
+            goto err;
+        }
+        else {
+            _IPATH_MMDBG("mmap send buffer status page from kernel %llx to %p\n",
+                (long long unsigned)b->spi_sendbuf_status, tmp);
+            // we don't try to fault these in; no need
+            b->spi_sendbuf_status = (uint64_t)(uintptr_t)tmp;
+        }
+    }
+    else {
+        b->spi_sendbuf_status = (uint64_t)(uintptr_t) &spctrl->sendbuf_status;
+    }
+
+    /*
+     * Removed reference to waldo.
+     * Also needs to be read/write when context sharing so process can update the TID.
+     */
+    if((tmp=ipath_mmap64(0, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t),
+            u->spu_subcontext_cnt ? PROT_READ | PROT_WRITE : PROT_READ,
+            MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)b->spi_rcvhdr_base)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of rcvhdrq failed: %s\n", strerror(errno));
+        goto err;
+    }
+    else {
+        // for use in protocol code
+        _IPATH_MMDBG("mmap rcvhdrq from kernel %llx, %lx bytes to %p\n",
+            (unsigned long long)b->spi_rcvhdr_base,
+            (unsigned long)(b->spi_rcvhdrent_size *
+                b->spi_rcvhdr_cnt*sizeof(uint32_t)), tmp);
+        ipath_touch_mmap(tmp, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t));
+        b->spi_rcvhdr_base = (uintptr_t)tmp; // set to mapped address
+    }
+
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        /* Don't mmap tail pointer if not using it.
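+         * (Illustrative note, not in the original: with NODMA_RTAIL the
+         * driver never DMAs the tail index to host memory, so the tail is
+         * read straight out of the ur_rcvhdrtail slot of the ureg page
+         * mapped above.  The fallback branches below share one page-offset
+         * idiom; as a hedged example with 4KB pages, pg_mask == ~0xfff,
+         * so a kernel tail address of 0x12345678 keeps its in-page offset
+         * 0x12345678 - 0x12345000 == 0x678 in whatever mapping already
+         * covers that page.)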
*/ + /* make tail address for false-eager-full recovery, CQ, Jul 15, 2013 */ + spctrl->__ipath_rcvtail = (volatile uint32_t*) + &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8]; + _IPATH_MMDBG("mmap rcvhdrq tail %p\n", spctrl->__ipath_rcvtail); + b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t)spctrl->__ipath_rcvtail; + } + else if ((b->spi_rcvhdr_tailaddr & pg_mask) == (uregbase & pg_mask)) { + uintptr_t s; + s = b->spi_rcvhdr_tailaddr - (b->spi_rcvhdr_tailaddr & pg_mask); + b->spi_rcvhdr_tailaddr = b->spi_uregbase + s; + spctrl->__ipath_rcvtail = (volatile uint32_t*)(uintptr_t)b->spi_rcvhdr_tailaddr; + } + else if (!b->spi_rcvhdr_tailaddr) { + /* If tailaddr is NULL, use the ureg page (for context sharing) */ + spctrl->__ipath_rcvtail = (volatile uint32_t*) + &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8]; + _IPATH_MMDBG("mmap rcvhdrq tail %p\n", spctrl->__ipath_rcvtail); + } + else if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_rcvhdr_tailaddr)) == MAP_FAILED) { + _IPATH_INFO("mmap of rcvhdrq tail failed: %s\n", strerror(errno)); + goto err; + } + else { + ipath_touch_mmap(tmp, __ipath_pg_sz); + spctrl->__ipath_rcvtail = (volatile uint32_t*)tmp; // for use in protocol code + _IPATH_MMDBG("mmap rcvhdrq tail from kernel %llx to %p\n", + (unsigned long long)b->spi_rcvhdr_tailaddr, tmp); + /* Update baseinfo with new value of tail address */ + b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t) tmp; + } + + spctrl->__ipath_tidegrcnt = b->spi_tidegrcnt; + if(!b->spi_rcv_egrbuftotlen) { + _IPATH_ERROR("new protocol against older driver, fall back to old\n"); + b->spi_rcv_egrbuftotlen = b->spi_rcv_egrbufsize*b->spi_tidegrcnt; + } + + if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen, + PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_rcv_egrbufs)) == MAP_FAILED) { + _IPATH_INFO("mmap of egr bufs from %llx failed: %s\n", + (long long)b->spi_rcv_egrbufs, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG("mmap egr bufs of 0x%x bytes (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbufsize, b->spi_rcv_egrbuftotlen, + (long long)b->spi_rcv_egrbufs, tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen); + b->spi_rcv_egrbufs = (uint64_t)(uintptr_t)tmp; + } + + pioavailaddr = b->spi_pioavailaddr; + if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_pioavailaddr)) == MAP_FAILED) { + _IPATH_INFO("mmap of pioavail registers (%llx) failed: %s\n", + (long long)b->spi_pioavailaddr, strerror(errno)); + goto err; + } + else { + volatile __le64 *pio; + _IPATH_MMDBG("mmap pioavail from kernel 0x%llx to %p\n", + (long long)b->spi_pioavailaddr, tmp); + b->spi_pioavailaddr = (uintptr_t)tmp; + pio = (volatile __le64 *)(uintptr_t)b->spi_pioavailaddr; + _IPATH_DBG("pioindex=0x%x, piocnt=0x%x " + "pioavailregs 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + b->spi_pioindex, b->spi_piocnt, + (unsigned long long)__le64_to_cpu(pio[0]), + (unsigned long long)__le64_to_cpu(pio[1]), + (unsigned long long)__le64_to_cpu(pio[2]), + (unsigned long long)__le64_to_cpu(pio[3])); + } + + if ((b->spi_status & pg_mask) == (pioavailaddr & pg_mask)) { + /* spi_status and spi_pioavailaddr are in the same page */ + uintptr_t s; + s = b->spi_status - pioavailaddr; + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + else if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)(b->spi_status & pg_mask))) == MAP_FAILED) { + 
_IPATH_INFO("mmap of spi_status (%llx) failed: %s\n", + (long long)b->spi_status, strerror(errno)); + goto err; + } + else { + /* spi_status and spi_pioavailaddr are in different pages */ + uintptr_t s; + _IPATH_MMDBG("mmap spi_status from kernel 0x%llx to %p\n", + (long long)b->spi_status, tmp); + s = b->spi_status - (b->spi_status & pg_mask); + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + _IPATH_DBG("chipstatus=0x%llx\n", + (unsigned long long)*spctrl->__ipath_spi_status); + + if(u->spu_subcontext_cnt > 0) { + unsigned num_subcontexts = u->spu_subcontext_cnt; + size_t size; + int i; + + size = __ipath_pg_sz * num_subcontexts; + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_uregbase)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext uregbase array (%llx) failed: %s\n", + (long long)b->spi_subctxt_uregbase, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext uregbase array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_uregbase, tmp); + ipath_touch_mmap(tmp, size); + + b->spi_subctxt_uregbase = (uint64_t)(uintptr_t)tmp; + + for (i = 0; i < num_subcontexts; i++) { + volatile uint64_t *uregp = (volatile uint64_t *)tmp; + if (i == u->spu_subcontext_id) { + * (volatile uint32_t *) &uregp[ur_rcvhdrtail * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvhdrhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindexhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindextail * 8] = 0; + } + tmp = (void *)((char*)tmp + __ipath_pg_sz); + } + } + size = ALIGN(b->spi_rcvhdr_cnt * b->spi_rcvhdrent_size * + sizeof(uint32_t), __ipath_pg_sz) * num_subcontexts; + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_rcvhdr_base)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvhdr_base, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvhdr_base array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_rcvhdr_base, tmp); + ipath_touch_mmap(tmp, size); + b->spi_subctxt_rcvhdr_base = (uint64_t)(uintptr_t)tmp; + } + if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen * num_subcontexts, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_rcvegrbuf)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvegrbuf, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvegrbuf array (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbuftotlen, (long long)b->spi_subctxt_rcvegrbuf, + tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen * num_subcontexts); + b->spi_subctxt_rcvegrbuf = (uint64_t)(uintptr_t)tmp; + } + } + + spctrl->spc_dev.spd_fd = fd; + if(fstat(fd, &st)) { + _IPATH_INFO("can't stat infinipath device to determine type: %s\n", + strerror(errno)); + goto err; + } + else if(!S_ISCHR(st.st_mode)) { + // shouldn't ever happen, since the commands worked, but... 
+ _IPATH_INFO("file descriptor is not for a real device, failing\n"); + goto err; + } + spctrl->spc_dev.spd_type = minor(st.st_rdev); + return spctrl; +err: + if(spctrl) + free(spctrl); + return NULL; +} + +#endif //__MIC__ diff --git a/ipath/ipath_protomic.c b/ipath/ipath_protomic.c new file mode 100644 index 0000000..2c3afa3 --- /dev/null +++ b/ipath/ipath_protomic.c @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef __MIC__ +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +#include + +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) + +/* + * unit : bit 1-3 + * context : bit 4-8 + * subcontext : bit 9-11 + * type : bit 12-16 + */ +#define MAKE_KEY(unit, context, subcontext, type, subctxtcnt) \ + (((unit)&0x7) | (((context)&0x1F)<<3) | \ + (((subcontext)&0x7)<<8) | (((type)&0x1F)<<11) | \ + (((subctxtcnt)&0x7)<<16)) + +#define GET_UNIT_FROM_KEY(key) \ + ((key)&0x7) + +#define GET_CONTEXT_FROM_KEY(key) \ + (((key)>>3)&0x1F) + +/* +flags in above structure has the following bits: +0x1: map remote host buffer, offset is the SCIF offset +0x2: allocate knx memory in kernel. +0x4: allocate physically contiguous knx memory in kernel. +0x8: SCIF register knx memory, and copy offset to first 8 bytes. +*/ +#define MIC_HOSTMEM_MAP 0x1 +#define MIC_KNXMEM_ALLOC 0x2 +#define MIC_KNXMEM_ALLOC_CONTG 0x4 +#define MIC_KNXMEM_REGISTER 0x8 + +/* + * Memory name to map into PSM process. 
+ */
+#define SPI_SENDBUF_STATUS 1
+#define SPI_RCVHDR_BASE 2
+#define SPI_RCVHDR_TAILADDR 3
+#define SPI_RCV_EGRBUFS 4
+#define SPI_UREGBASE 5
+#define SPI_PIOBUFBASE 6
+#define SPI_PIOAVAILADDR 7
+#define SPI_STATUS 8
+#define SPI_SUBCTXT_UREGBASE 9
+#define SPI_SUBCTXT_RCVHDR_BASE 10
+#define SPI_SUBCTXT_RCVEGRBUF 11
+
+static void ipath_setaffinity(int fd)
+{
+    cpu_set_t cpuset;
+    char *env;
+
+    if(getenv("IPATH_NO_CPUAFFINITY")) {
+        _IPATH_PRDBG("Skipping processor affinity, $IPATH_NO_CPUAFFINITY set\n");
+        return;
+    }
+
+    env = getenv("IPATH_SET_CPUAFFINITY");
+    if (!env) return;
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(atoi(env), &cpuset);
+    if(sched_setaffinity(0, sizeof cpuset, &cpuset)) {
+        _IPATH_INFO("sched_setaffinity() failed, cpu %d\n", atoi(env));
+    }
+
+    return;
+}
+
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this is (still! Oct 07)
+// not implemented. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned as part of ipath_base_info.
+struct _ipath_ctrl *ipath_userinit(int fd, struct ipath_user_info *u,
+                   struct ipath_base_info *b)
+{
+    struct _ipath_ctrl *spctrl = NULL;
+    void *tmp;
+    uint64_t *tmp64;
+    struct stat st;
+    struct ipath_cmd c;
+    size_t usize;
+    uintptr_t pg_mask;
+    __u64 pioavailaddr;
+    __u64 sendbuf_status, rcvhdr_base, rcv_egrbufs;
+    int __ipath_pg_sz;
+
+    /* First get the page size */
+    __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+    pg_mask = ~ (intptr_t) (__ipath_pg_sz - 1);
+
+    u->spu_base_info_size = sizeof(*b);
+    u->spu_base_info = (uint64_t)(uintptr_t) b;
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_ASSIGN_CONTEXT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_assign_context(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("assign_context command failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    ipath_setaffinity(fd); // prior to memory allocation in driver, etc.
+
+    /*
+     * Allocate b->spi_sendbuf_status, one page size.
+     */
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SENDBUF_STATUS, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = __ipath_pg_sz;
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+            (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of send buffer status page at %llx failed: %s\n",
+            (long long unsigned)b->spi_sendbuf_status,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap send buffer status page from kernel %llx to %p\n",
+            (long long unsigned)b->spi_sendbuf_status, tmp);
+        // we don't try to fault these in; no need
+        sendbuf_status = (uint64_t)(uintptr_t)tmp;
+        if (b->spi_subcontext == 0) {
+            b->spi_sendbuf_status = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    /*
+     * Allocate b->spi_rcvhdr_base.
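+     * (Pattern note added here, not in the original: each of these
+     * IPATH_CMD_MIC_MEM_INFO requests asks the MIC driver to allocate
+     * and SCIF-register card memory under the given key, the region is
+     * then mmapped at offset key<<12, and because MIC_KNXMEM_REGISTER
+     * is set the first 8 bytes of the mapping carry the SCIF offset
+     * that subcontext 0 copies back into the base info, as done for
+     * the send buffer status page above.)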
+     */
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_RCVHDR_BASE, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC_CONTG|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t);
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t),
+            u->spu_subcontext_cnt ? PROT_READ | PROT_WRITE : PROT_READ,
+            MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of rcvhdrq failed: %s\n", strerror(errno));
+        goto err;
+    }
+    else {
+        // for use in protocol code
+        _IPATH_MMDBG("mmap rcvhdrq from kernel %llx, %lx bytes to %p\n",
+            (unsigned long long)b->spi_rcvhdr_base,
+            (unsigned long)(b->spi_rcvhdrent_size *
+                b->spi_rcvhdr_cnt*sizeof(uint32_t)), tmp);
+        ipath_touch_mmap(tmp, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t));
+        rcvhdr_base = (uintptr_t)tmp; // set to mapped address
+        if (b->spi_subcontext == 0) {
+            b->spi_rcvhdr_base = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    /*
+     * Skip b->spi_rcvhdr_tailaddr.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL)
+        ; /* Don't mmap tail pointer if not using it. */
+    else {
+        _IPATH_INFO("IPATH_RUNTIME_NODMA_RTAIL not set by driver; "
+            "DMA'ed rcvhdrq tail is not supported on MIC\n");
+        goto err;
+    }
+
+    /*
+     * Allocate b->spi_rcv_egrbufs.
+     */
+    if(!b->spi_rcv_egrbuftotlen) {
+        _IPATH_ERROR("new protocol against older driver, cannot fall back on MIC\n");
+        goto err;
+    }
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_RCV_EGRBUFS, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = b->spi_rcv_egrbuftotlen;
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen,
+            PROT_READ, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of egr bufs from %llx failed: %s\n",
+            (long long)b->spi_rcv_egrbufs, strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap egr bufs of 0x%x bytes (0x%x) from kernel %llx to %p\n",
+            b->spi_rcv_egrbufsize, b->spi_rcv_egrbuftotlen,
+            (long long)b->spi_rcv_egrbufs, tmp);
+        ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen);
+        rcv_egrbufs = (uint64_t)(uintptr_t)tmp;
+        if (b->spi_subcontext == 0) {
+            b->spi_rcv_egrbufs = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_USER_INIT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_user_init(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("userinit command failed: %s\n", strerror(errno));
+        goto err;
+    }
+    /*
+     * If header redirection is enabled, there will be a shared subcontext
+     * with the kernel that we have to examine.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_CTXT_REDIRECT)
+        u->spu_subcontext_cnt = 1;
+
+    _IPATH_PRDBG("Driver is %sQLogic-built\n",
+        ((1<<31)&b->spi_sw_version) ?
"" : "not "); + if((0x7fff&(b->spi_sw_version >> 16)) != IPATH_USER_SWMAJOR) { + _IPATH_INFO + ("User major version 0x%x not same as driver major 0x%x\n", + IPATH_USER_SWMAJOR, b->spi_sw_version >> 16); + if((b->spi_sw_version >> 16) < IPATH_USER_SWMAJOR) + goto err; // else assume driver knows how to be compatible + } + else if ((b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_PRDBG("User minor version 0x%x not same as driver minor 0x%x\n", + IPATH_USER_SWMINOR, b->spi_sw_version & 0xffff); + if ((b->spi_sw_version & 0xffff) < IPATH_USER_SWMINOR) + b->spi_sendbuf_status = 0; + } + + if (u->spu_subcontext_cnt && + (b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_INFO("Mismatched user minor version (%d) and driver " + "minor version (%d) while context sharing. Ensure " + "that driver and library are from the same " + "release.\n", + IPATH_USER_SWMINOR, + (int) (b->spi_sw_version & 0xffff)); + } + + if(!(spctrl = calloc(1, sizeof(struct _ipath_ctrl)))) { + _IPATH_INFO("can't allocate memory for ipath_ctrl: %s\n", + strerror(errno)); + goto err; + } + + /* + * Setup KNC buffers mapped to host. + */ + b->spi_sendbuf_status = sendbuf_status; + b->spi_rcvhdr_base = rcvhdr_base; + b->spi_rcv_egrbufs = rcv_egrbufs; + + /* Check if we need to turn off header suppression in hardware and + * emulate it in software. Since the driver disables all TID flow + * entries we don't need to do anything just fake it that this + * looks like Linda. + * Note: This will break the hardware detection heuristics where we + * determine that a card is QLE73XX by looking at the capability to + * support header suppression! Need the driver to provide the requisite + * information so we can move away from heuristics based on flags. + */ + { + const char *env; + + if ((env = getenv("IPATH_HW_HEADER_SUPPRESSION")) && (*env != '\0')) { + int hwsupp = (int) strtol(env, NULL, 0); + + if (!hwsupp && (b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) { + _IPATH_INFO("Disabling hardware suppresion!\n"); + b->spi_runtime_flags &= ~IPATH_RUNTIME_HDRSUPP; + } + } /* Env */ + + } + + + usize = b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP ? + 2 * __ipath_pg_sz : __ipath_pg_sz; + _IPATH_DBG("uregbase=%llx usize=%u context=%d\n", + (unsigned long long) b->spi_uregbase, + (unsigned) usize, (int) b->spi_context); + + // now mmap in the rcvhdrq, egr bufs, PIO buffers and user regs + // _ipath_uregbase is the user regs; not offset as it is in the kernel + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_UREGBASE, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_HOSTMEM_MAP; + c.cmd.mem_info.length = usize; + c.cmd.mem_info.offset = b->spi_uregbase; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, usize, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of user registers at %llx failed: %s\n", + (long long unsigned)b->spi_uregbase, + strerror(errno)); + goto err; + } + + _IPATH_MMDBG("mmap user regs from kernel %llx to %p (0x%lx bytes)\n", + (long long unsigned) b->spi_uregbase, tmp, + (unsigned long)usize); + + // we don't try to fault these in, no need + tmp64 = (uint64_t *)tmp; + b->spi_uregbase = (uint64_t)(uintptr_t)tmp; + spctrl->spc_dev.spd_uregbase = (volatile uint64_t*) tmp; + + /* + * Set up addresses for optimized register writeback routines. 
+     * This is for the real onchip registers, shared context or not
+     */
+    spctrl->__ipath_rcvhdrhead = (uint32_t*)&tmp64[ur_rcvhdrhead];
+    spctrl->__ipath_rcvegrhead = (uint32_t*)&tmp64[ur_rcvegrindexhead];
+    spctrl->__ipath_rcvegrtail = (uint32_t*)&tmp64[ur_rcvegrindextail];
+
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        spctrl->__ipath_rcvtail = (volatile uint32_t*)
+            &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8];
+        b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t)spctrl->__ipath_rcvtail;
+    } else {
+        _IPATH_INFO("IPATH_RUNTIME_NODMA_RTAIL not set by driver; "
+            "DMA'ed rcvhdrq tail is not supported on MIC\n");
+        goto err;
+    }
+
+    if (!(b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) {
+        static __le32 regs[INFINIPATH_TF_NFLOWS << 1];
+        _IPATH_DBG("HdrSupp not available. Using virt tidflow table.\n");
+        spctrl->__ipath_rcvtidflow = regs;
+        spctrl->__ipath_tidflow_wmb = &spctrl->tidflow_wmb_location;
+    }
+    else {
+        spctrl->__ipath_rcvtidflow = (uint32_t*)&tmp64[ur_rcvtidflow];
+        spctrl->__ipath_tidflow_wmb = (__le32*)spctrl->__ipath_rcvegrtail;
+    }
+
+    /* map the receive tidflow table in QLE73XX */
+    _IPATH_DBG("rcvtidflow=%p offset=0x%lx\n",
+        spctrl->__ipath_rcvtidflow,
+        (long) ((uintptr_t) spctrl->__ipath_rcvtidflow - (uintptr_t) tmp64));
+
+    { char *maxpio; uint32_t numpio;
+    maxpio = getenv("IPATH_MAXPIO");
+    if(maxpio && (numpio=strtoul(maxpio, NULL, 0))>0 &&
+        numpio < b->spi_piocnt) {
+        _IPATH_INFO("$IPATH_MAXPIO is %u, reducing PIO buffer count from %u\n",
+            numpio, b->spi_piocnt);
+        b->spi_piocnt = numpio;
+    }
+    }
+
+    // map in the PIO buffers, much like ureg, since it's
+    // in the chip address space
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, b->spi_subcontext, SPI_PIOBUFBASE, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_HOSTMEM_MAP;
+    c.cmd.mem_info.length = b->spi_pioalign*b->spi_piocnt;
+    c.cmd.mem_info.offset = b->spi_piobufbase;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    if((tmp=ipath_mmap64(0, b->spi_pioalign*b->spi_piocnt,
+            PROT_WRITE, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of pio buffers at %llx failed: %s\n",
+            (long long unsigned)b->spi_piobufbase,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap PIO buffers from kernel %llx, %u pages to %p\n",
+            (unsigned long long)b->spi_piobufbase, b->spi_piocnt, tmp);
+        // Do not try to read the PIO buffers; they are mapped write
+        // only. We'll fault them in as we write to them.
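+        // (Hedged sketch, not from the original: the mapping length of
+        // spi_pioalign*spi_piocnt suggests the buffers sit at a fixed
+        // stride, i.e. PIO buffer i would start at
+        //     (char *)b->spi_piobufbase + (size_t)i * b->spi_pioalign;
+        // the first store into a buffer is what faults its page in.)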
+ b->spi_piobufbase = (uintptr_t)tmp; + } + + pioavailaddr = b->spi_pioavailaddr; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_PIOAVAILADDR, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_HOSTMEM_MAP; + c.cmd.mem_info.length = __ipath_pg_sz; + c.cmd.mem_info.offset = b->spi_pioavailaddr; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of pioavail registers (%llx) failed: %s\n", + (long long)b->spi_pioavailaddr, strerror(errno)); + goto err; + } + else { + volatile __le64 *pio; + _IPATH_MMDBG("mmap pioavail from kernel 0x%llx to %p\n", + (long long)b->spi_pioavailaddr, tmp); + b->spi_pioavailaddr = (uintptr_t)tmp; + pio = (volatile __le64 *)(uintptr_t)b->spi_pioavailaddr; + _IPATH_DBG("pioindex=0x%x, piocnt=0x%x " + "pioavailregs 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + b->spi_pioindex, b->spi_piocnt, + (unsigned long long)__le64_to_cpu(pio[0]), + (unsigned long long)__le64_to_cpu(pio[1]), + (unsigned long long)__le64_to_cpu(pio[2]), + (unsigned long long)__le64_to_cpu(pio[3])); + } + + if ((b->spi_status & pg_mask) == (pioavailaddr & pg_mask)) { + /* spi_status and spi_pioavailaddr are in the same page */ + uintptr_t s; + s = b->spi_status - pioavailaddr; + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + else { + _IPATH_INFO("mmap of spi_status (%llx) failed: %s\n", + (long long)b->spi_status, strerror(errno)); + goto err; + } + _IPATH_DBG("chipstatus=0x%llx\n", + (unsigned long long)*spctrl->__ipath_spi_status); + + if(u->spu_subcontext_cnt) { + unsigned num_subcontexts = u->spu_subcontext_cnt; + size_t size; + int i; + + size = __ipath_pg_sz * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_UREGBASE, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext uregbase array (%llx) failed: %s\n", + (long long)b->spi_subctxt_uregbase, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext uregbase array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_uregbase, tmp); + ipath_touch_mmap(tmp, size); + + b->spi_subctxt_uregbase = (uint64_t)(uintptr_t)tmp; + + for (i = 0; i < num_subcontexts; i++) { + volatile uint64_t *uregp = (volatile uint64_t *)tmp; + if (i == u->spu_subcontext_id) { + * (volatile uint32_t *) &uregp[ur_rcvhdrtail * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvhdrhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindexhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindextail * 8] = 0; + } + tmp = (void *)((char *)tmp + __ipath_pg_sz); + } + } + size = ALIGN(b->spi_rcvhdr_cnt * b->spi_rcvhdrent_size * + sizeof(uint32_t), __ipath_pg_sz) * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_RCVHDR_BASE, 
u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvhdr_base, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvhdr_base array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_rcvhdr_base, tmp); + ipath_touch_mmap(tmp, size); + b->spi_subctxt_rcvhdr_base = (uint64_t)(uintptr_t)tmp; + } + + size = b->spi_rcv_egrbuftotlen * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_RCVEGRBUF, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvegrbuf, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvegrbuf array (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbuftotlen, (long long)b->spi_subctxt_rcvegrbuf, + tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen * num_subcontexts); + b->spi_subctxt_rcvegrbuf = (uint64_t)(uintptr_t)tmp; + } + } + + spctrl->spc_dev.spd_fd = fd; + return spctrl; +err: + if(spctrl) + free(spctrl); + return NULL; +} + +#endif //__MIC__ diff --git a/ipath/ipath_service.c b/ipath/ipath_service.c new file mode 100644 index 0000000..f25b09b --- /dev/null +++ b/ipath/ipath_service.c @@ -0,0 +1,1377 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains ipath service routine interface used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +/* + * This function is necessary in a udev-based world. There can be an + * arbitrarily long (but typically less than one second) delay between + * a driver getting loaded and any dynamic special files turning up. + * + * The timeout is in milliseconds. A value of zero means "callee + * decides timeout". Negative is infinite. + * + * Returns 0 on success, -1 on error or timeout. Check errno to see + * whether there was a timeout (ETIMEDOUT) or an error (any other + * non-zero value). + */ +int +ipath_wait_for_device(const char *path, long timeout) +{ + int saved_errno; + struct stat st; + long elapsed; + int ret; + + if (timeout == 0) + timeout = 15000; + + elapsed = 0; + + while (1) { + static const long default_ms = 250; + struct timespec req = { 0 }; + long ms; + + ret = stat(path, &st); + saved_errno = errno; + + if (ret == 0 || (ret == -1 && errno != ENOENT)) + break; + + if (timeout - elapsed == 0) { + saved_errno = ETIMEDOUT; + break; + } + + if (elapsed == 0) { + if (timeout == -1) + _IPATH_DBG("Device file %s not present on first check; " + "waiting indefinitely...\n", path); + else + _IPATH_DBG("Device file %s not present on first check; " + "waiting up to %.1f seconds...\n", + path, timeout / 1e3); + } + + if (timeout < 0 || timeout - elapsed >= default_ms) + ms = default_ms; + else + ms = timeout; + + elapsed += ms; + req.tv_nsec = ms * 1000000; + + ret = nanosleep(&req, NULL); + saved_errno = errno; + + if (ret == -1) + break; + } + + if (ret == 0) + _IPATH_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3); + else + _IPATH_INFO("The %s device failed to appear after %.1f seconds: %s\n", + path, elapsed / 1e3, strerror(saved_errno)); + + errno = saved_errno; + return ret; +} + +#ifdef __MIC__ +#include +#define PSM_HOST_PORT SCIF_OFED_PORT_7 /* predefined port */ +#define PSM_HOST_NODE 0 /* host node is always 0 */ +scif_epd_t psmd_epd = -1; +int qibp_fd = -1; + +static scif_epd_t +ipath_psmd_connect(uint16_t node, uint16_t port) +{ + int conn_port, tries = 20; + struct scif_portID portID; + scif_epd_t epd; + uid_t uid; + gid_t gid; + + epd = scif_open(); + if (epd < 0) { + fprintf(stderr, "scif_open failed with error %d\n", errno); + return (scif_epd_t)-1; + } + + if ((conn_port = scif_bind(epd, 0)) < 0) { + fprintf(stderr, "scif_bind failed with error %d\n", errno); + scif_close(epd); + return (scif_epd_t)-1; + } + + portID.port = port; + portID.node = node; +retry: + if (scif_connect(epd, &portID) < 0) { + if ((errno == ECONNREFUSED) && (tries > 0)) { + tries--; + sleep(1); + goto retry; + } + fprintf(stderr, "scif_connect failed with error %d(%s)\n", errno, strerror(errno)); + fprintf(stderr, "Please check if /usr/sbin/psmd is running on host.\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + + uid = geteuid(); + if (scif_send(epd, &uid, sizeof(uid), SCIF_SEND_BLOCK) != sizeof(uid)) { + fprintf(stderr, "cannot send uid to psmd service\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + gid = getegid(); + 
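+	// (Clarifying note, not in the original: psmd authenticates this
+	// MIC-side client by the effective uid/gid pair sent first on the
+	// connection; the gid fetched above follows the uid just sent.)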
if (scif_send(epd, &gid, sizeof(gid), SCIF_SEND_BLOCK) != sizeof(gid)) { + fprintf(stderr, "cannot send gid to psmd service\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + + return epd; +} + +static int +ipath_scif_send(void *buf, size_t len) +{ + int ret; + + if (psmd_epd == -1) { + psmd_epd = ipath_psmd_connect(PSM_HOST_NODE, PSM_HOST_PORT); + if (psmd_epd == -1) return -1; + } + + while (len) { + ret = scif_send(psmd_epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +ipath_scif_recv(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(psmd_epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +ipath_qibp_open(void) +{ + char dev_name[MAXPATHLEN]; + int fd; + + snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath"); + + if (ipath_wait_for_device(dev_name, 0) == -1) { + fprintf(stderr, "Could not find an InfiniPath qibp device %s\n", dev_name); + return -1; + } + + if ((fd = open(dev_name, O_RDWR)) == -1) { + fprintf(stderr, "mic:Can't open %s for reading and writing\n", dev_name); + return -1; + } + + if(fcntl(fd, F_SETFD, FD_CLOEXEC)) + fprintf(stdout, "Failed to set close on exec for device: %s\n", + strerror(errno)); + + return fd; +} + +#endif //__MIC + +int +ipath_context_open(int unit, int port, uint64_t open_timeout) +{ + int fd; + +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + /* + * Re-direct context open request to psmd on host. + */ + cmd.type = IPATH_CMD_CONTEXT_OPEN; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data3 = open_timeout; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + fd = cmd.cmd.mic_info.data1; + if (fd == -1) { + errno = cmd.cmd.mic_info.data2; + return -1; + } + + /* + * Open MIC side qibp before context is assigned. 
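+	 * (Clarifying sketch, not part of the original comment: the MIC
+	 * build splits the work across two channels.  Control operations
+	 * are proxied over SCIF to psmd on the host, while the local qibp
+	 * character device opened here is what later backs ipath_mmap64()
+	 * and the fast commands that ipath_cmd_write() routes to qibp_fd.)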
+ */ + if (qibp_fd != -1) { + fprintf(stderr, "ipath_context_open(): qibp already opened\n"); + return -1; + } + qibp_fd = ipath_qibp_open(); + if (qibp_fd == -1) return -1; + +#else + char dev_name[MAXPATHLEN]; + + if (unit != IPATH_UNIT_ID_ANY && unit >= 0) + snprintf(dev_name, sizeof(dev_name), "%s%u", "/dev/ipath", unit); + else + snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath"); + + if (ipath_wait_for_device(dev_name, (long)open_timeout) == -1) { + _IPATH_DBG("Could not find an InfiniPath Unit on device " + "%s (%lds elapsed)", dev_name, (long)open_timeout / 1000); + return -1; + } + + if ((fd = open(dev_name, O_RDWR)) == -1) { + _IPATH_DBG("(host:Can't open %s for reading and writing", + dev_name); + return -1; + } + + if(fcntl(fd, F_SETFD, FD_CLOEXEC)) + _IPATH_INFO("Failed to set close on exec for device: %s\n", + strerror(errno)); +#endif + + return fd; +} + +void +ipath_context_close(int fd) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_CONTEXT_CLOSE; + cmd.cmd.mic_info.data1 = fd; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return; + + if (qibp_fd >= 0) { + close(qibp_fd); + qibp_fd = -1; + } + if (psmd_epd >= 0) { + scif_close(psmd_epd); + psmd_epd = -1; + } +#else + (void) close(fd); +#endif +} + +int +ipath_cmd_writev(int fd, const struct iovec *iov, int iovcnt) +{ +#ifdef __MIC__ + return writev(qibp_fd, iov, iovcnt); +#else + return writev(fd, iov, iovcnt); +#endif +} + +int +ipath_cmd_assign_context(int fd, void *buf, size_t count) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd, *pcmd; + + ret = ipath_scif_send(buf, count); + if (ret) return ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + pcmd = (struct ipath_cmd *)buf; + ret = ipath_scif_recv( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + return ret; +#else + return write(fd, buf, count); +#endif +} + +int +ipath_cmd_user_init(int fd, void *buf, size_t count) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd, *pcmd; + + ret = ipath_scif_send(buf, count); + if (ret) return ret; + + pcmd = (struct ipath_cmd *)buf; + ret = ipath_scif_send( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + if (ret) return ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + ret = ipath_scif_recv( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + return ret; +#else + return write(fd, buf, count); +#endif +} + +int +ipath_cmd_write(int fd, struct ipath_cmd *cmd, size_t count) +{ +#ifdef __MIC__ +/* +following cmd are processed by mic driver: +IPATH_CMD_SDMA_COMPLETE +IPATH_CMD_SDMA_INFLIGHT +IPATH_CMD_TID_UPDATE +IPATH_CMD_TID_FREE +IPATH_CMD_MEM_INFO +*/ + int ret; + + if (cmd->type == IPATH_CMD_MIC_MEM_INFO || + cmd->type == IPATH_CMD_SDMA_COMPLETE || + cmd->type == IPATH_CMD_SDMA_INFLIGHT || + cmd->type == IPATH_CMD_TID_UPDATE || + cmd->type == IPATH_CMD_TID_FREE) { + return write(qibp_fd, cmd, count); + } + + ret = ipath_scif_send(cmd, count); + if (ret) return 
ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(cmd, count); + if (ret) return ret; + + ret = cmd->cmd.mic_info.data1; + if (ret) errno = cmd->cmd.mic_info.data2; + return ret; +#else + return write(fd, cmd, count); +#endif +} + +// we use mmap64() because we compile in both 32 and 64 bit mode, +// and we have to map physical addresses that are > 32 bits long. +// While linux implements mmap64, it doesn't have a man page, +// and isn't declared in any header file, so we declare it here ourselves. + +// We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and +// redirects mmap to mmap64 for us, but at least through suse10 and fc4, +// it doesn't work when the address being mapped is > 32 bits. It chips +// off bits 32 and above. So we stay with mmap64. +void * +ipath_mmap64(void *addr, size_t length, int prot, int flags, int fd, __off64_t offset) +{ +#ifdef __MIC__ + if (qibp_fd == -1) { + fprintf(stderr, "ipath_mmap64(): qibp not opened, qibp_fd=-1\n"); + return MAP_FAILED; + } + fd = qibp_fd; +#endif + return mmap64(addr, length, prot, flags, fd, offset); +} + +// get the number of units supported by the driver. Does not guarantee +// that a working chip has been found for each possible unit #. +// number of units >=0 (0 means none found). +// formerly used sysfs file "num_units" +int +ipath_get_num_units(void) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_NUM_UNITS; + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + char pathname[128]; + struct stat st; + int i; + + ret = 0; + for(i=0; i 0) { + int64_t val; + if (unit_id == IPATH_UNIT_ID_ANY) { + uint32_t u, p; + for (u = 0; u < units; u++) { + for (p = 1; p <= IPATH_MAX_PORT; p++) + if (ipath_get_port_lid(u, p) != -1) + break; + if (p <= IPATH_MAX_PORT && + !ipath_sysfs_unit_read_s64(u, "nctxts", &val, 0)) + n += (uint32_t) val; + } + } + else { + uint32_t p; + for (p = 1; p <= IPATH_MAX_PORT; p++) + if (ipath_get_port_lid(unit_id, p) != -1) + break; + if (p <= IPATH_MAX_PORT && + !ipath_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) + n += (uint32_t) val; + } + } +#endif + + return n; +} + +// Given the unit number, return an error, or the corresponding LID +// For now, it's used only so the MPI code can determine it's own +// LID, and which other LIDs (if any) are also assigned to this node +// Returns an int, so -1 indicates an error. 0 may indicate that +// the unit is valid, but no LID has been assigned. 
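+// (Illustrative usage, not from the original: callers typically probe
+// both potential ports, e.g.
+//     int p, lid = -1;
+//     for (p = 1; p <= IPATH_MAX_PORT; p++)
+//         if ((lid = ipath_get_port_lid(unit, p)) != -1)
+//             break;
+// as the context-counting code above does.)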
+// No error print because we call this for both potential +// ports without knowing if both ports exist (or are connected) +int +ipath_get_port_lid(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_LID; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + char *state; + + ret = ipath_sysfs_port_read(unit, port, "phys_state", &state); + if (ret == -1) { + if(errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + if (strncmp(state, "5: LinkUp", 9)) { + _IPATH_DBG("!LinkUp for unit %u:%u\n", unit, port); + ret = -1; + } + free(state); + } + if (ret == -1) return ret; + + ret = ipath_sysfs_port_read_s64(unit, port, "lid", &val, 0); + + if (ret == -1) { + if(errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else { + ret = val; + +// disable this feature since we don't have a way to provide +// file descriptor in multiple context case. +#if 0 + if(getenv("IPATH_DIAG_LID_LOOP")) { + // provides diagnostic ability to run MPI, etc. even + // on loopback, by claiming a different LID for each context + struct ipath_ctxt_info info; + struct ipath_cmd cmd; + cmd.type = IPATH_CMD_CTXT_INFO; + cmd.cmd.ctxt_info = (uintptr_t) &info; + if(__ipath_lastfd == -1) + _IPATH_INFO("Can't run CONTEXT_INFO for lid_loop, fd not set\n"); + else if(write(__ipath_lastfd, &cmd, sizeof(cmd)) == -1) + _IPATH_INFO("CONTEXT_INFO command failed: %s\n", strerror(errno)); + else if(!info.context) + _IPATH_INFO("CONTEXT_INFO returned context 0!\n"); + else { + _IPATH_PRDBG("Using lid 0x%x, base %x, context %x\n", + ret + info.context, ret, info.context); + ret += info.context; + } + } +#endif + } +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding GID +// For now, it's used only so the MPI code can determine its fabric ID. +// Returns an int, so -1 indicates an error. 
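+// (Worked example with hypothetical values: a sysfs gids/0 entry of
+// fe80:0000:0000:0000:0011:7500:005a:6eec parses into
+// hi = 0xfe80000000000000 and lo = 0x00117500005a6eec, i.e. the subnet
+// prefix half and the port GUID half of the 128-bit GID.)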
+// No error print because we call this for both potential +// ports without knowing if both ports exist (or are connected) +int +ipath_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_GID; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; + else { + *hi = cmd.cmd.mic_info.data3; + *lo = cmd.cmd.mic_info.data4; + } +#else + char *gid_str = NULL; + + ret = ipath_sysfs_port_read(unit, port, "gids/0", &gid_str); + + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else { + unsigned int gid[8]; + if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", + &gid[0], &gid[1], &gid[2], &gid[3], + &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { + _IPATH_DBG("Failed to parse GID for unit %u:%u: %s\n", + unit, port, gid_str); + ret = -1; + } + else { + *hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1]) << 32) | + (((uint64_t) gid[2]) << 16) | (((uint64_t) gid[3]) << 0); + *lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5]) << 32) | + (((uint64_t) gid[6]) << 16) | (((uint64_t) gid[7]) << 0); + } + free(gid_str); + } +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding LMC value +// for the port +// Returns an int, so -1 indicates an error. 0 +int +ipath_get_port_lmc(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_LMC; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + + ret = ipath_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0); + + if (ret == -1) { + _IPATH_INFO("Failed to get LMC for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else + ret = val; +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding link rate +// for the port +// Returns an int, so -1 indicates an error. 
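+// (Arithmetic note added for clarity: the sysfs "rate" file holds text
+// such as "40 Gb/sec (4X QDR)" (the exact string is driver-dependent);
+// strtod() stops at the first non-numeric character, so that yields 40.
+// The ((int) (rate * 2) >> 1) conversion below truncates half-steps: a
+// 2.5 Gb/sec 1X SDR link reports 2.)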
+int +ipath_get_port_rate(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_RATE; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + double rate; + char *data_rate = NULL, *newptr; + + ret = ipath_sysfs_port_read(unit, port, "rate", &data_rate); + if (ret == -1) + goto get_port_rate_error; + else { + rate = strtod(data_rate, &newptr); + if ((rate == 0) && (data_rate == newptr)) + goto get_port_rate_error; + } + + free(data_rate); + return ((int) (rate * 2) >> 1); + + get_port_rate_error: + _IPATH_INFO("Failed to get link rate for unit %u:%u: %s\n", + unit, port, strerror(errno)); +#endif + + return ret; +} + +// Given a unit, port and SL, return an error, or the corresponding VL for the +// SL as programmed by the SM +// Returns an int, so -1 indicates an error. 0 +int +ipath_get_port_sl2vl(int unit, int port, int sl) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_S2V; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data1 = sl; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + char sl2vlpath[16]; + + snprintf(sl2vlpath, sizeof(sl2vlpath), "sl2vl/%d", sl); + ret = ipath_sysfs_port_read_s64(unit, port, sl2vlpath, &val, 0); + + if (ret == -1) { + _IPATH_DBG("Failed to get SL2VL mapping for SL %d unit %u:%u: %s\n", + sl, unit, port, strerror(errno)); + } + else + ret = val; +#endif + + return ret; +} + +/* These have been fixed to read the values, but they are not + * compatible with the ipath driver, they return new info with + * the qib driver + */ +static int infinipath_count_names(const char *namep) +{ + int n = 0; + while (*namep != '\0') { + if (*namep == '\n') + n++; + namep++; + } + return n; +} + +int infinipath_get_stats_names(char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_STATS_NAMES; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_read("driver_stats_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_stats(uint64_t *s, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_STATS; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(s, n*sizeof(*s)); + if (ret) { + return ret; + } + return n; +#else + 
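+	// (Illustrative pairing, not in the original: a caller would size
+	// the stats array from the names list first, e.g.
+	//     char *names; uint64_t *vals;
+	//     int n = infinipath_get_stats_names(&names);
+	//     if (n > 0 && (vals = calloc(n, sizeof(*vals))) != NULL)
+	//         n = infinipath_get_stats(vals, n);
+	// where names then holds n newline-separated entries.)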
int i; + i = ipath_ipathfs_rd("driver_stats", s, nelem * sizeof(*s)); + if(i < 0) + return -1; + else + return i / sizeof(*s); +#endif +} + +int infinipath_get_ctrs_unit_names(int unitno, char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_UNAMES; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "counter_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_ctrs_unit(int unitno, uint64_t *c, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_UNIT; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(c, n*sizeof(*c)); + if (ret) { + return ret; + } + return n; +#else + int i; + i = ipath_ipathfs_unit_rd(unitno, "counters", c, + nelem * sizeof(*c)); + if(i < 0) + return -1; + else + return i / sizeof(*c); +#endif +} + +int infinipath_get_ctrs_port_names(int unitno, char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_PNAMES; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "portcounter_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_PORT; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(c, n*sizeof(*c)); + if (ret) { + return ret; + } + return n; +#else + int i; + char buf[32]; + snprintf(buf, sizeof buf, "port%dcounters", port); + i = ipath_ipathfs_unit_rd(unitno, buf, c, + nelem * sizeof(*c)); + if(i < 0) + return -1; + else + return i / sizeof(*c); +#endif +} + +int +ipath_get_cc_settings_bin(int unit, int port, char *ccabuf) +{ +#ifdef __MIC__ 
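+	// (Size note added for clarity: as the host path below spells out,
+	// the CCA settings blob is (16 + 16 + 640) bits = 84 bytes; a data1
+	// reply of 1 from psmd means a valid 84-byte blob follows, and any
+	// other value is passed through, mirroring the sysfs path's
+	// "use static CCA" return of 0.)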
+ int ret;
+ struct ipath_cmd cmd;
+
+ cmd.type = IPATH_CMD_GET_CC_SETTINGS;
+ cmd.cmd.mic_info.unit = unit;
+ cmd.cmd.mic_info.port = port;
+
+ ret = ipath_scif_send(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = ipath_scif_recv(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = cmd.cmd.mic_info.data1;
+ if (ret != 1) return ret;
+
+ ret = ipath_scif_recv(ccabuf, 84);
+ if (ret) return ret;
+#else
+ int fd;
+
+/*
+ * Check the qib driver CCA setting and try to use it if available.
+ * Fall back to the static CCA settings on error.
+ */
+ sprintf(ccabuf,
+ "/sys/class/infiniband/qib%d/ports/%d/CCMgtA/cc_settings_bin",
+ unit, port);
+ fd = open(ccabuf, O_RDONLY);
+ if (fd < 0) {
+ return 0;
+ }
+ /* (16+16+640)/8=84 */
+ if (read(fd, ccabuf, 84) != 84) {
+ _IPATH_CCADBG("Read cc_settings_bin failed, using static CCA\n");
+ close(fd);
+ return 0;
+ }
+
+ close(fd);
+#endif
+
+ return 1;
+}
+
+int
+ipath_get_cc_table_bin(int unit, int port, uint16_t **cctp)
+{
+ int i, ccti_limit;
+ uint16_t *cct;
+
+#ifdef __MIC__
+ int ret;
+ struct ipath_cmd cmd;
+
+ cmd.type = IPATH_CMD_GET_CC_TABLE;
+ cmd.cmd.mic_info.unit = unit;
+ cmd.cmd.mic_info.port = port;
+
+ ret = ipath_scif_send(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = ipath_scif_recv(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = cmd.cmd.mic_info.data1;
+ if (ret <= 0) return ret;
+
+ ccti_limit = ret;
+ i = (ccti_limit+1)*sizeof(uint16_t);
+ cct = malloc(i);
+ if (!cct) {
+ return -1;
+ }
+
+ ret = ipath_scif_recv(cct, i);
+ if (ret) {
+ free(cct);
+ return ret;
+ }
+#else
+ int fd;
+ char pathname[256];
+
+ *cctp = NULL;
+ sprintf(pathname,
+ "/sys/class/infiniband/qib%d/ports/%d/CCMgtA/cc_table_bin",
+ unit, port);
+ fd = open(pathname, O_RDONLY);
+ if (fd < 0) {
+ _IPATH_CCADBG("Open cc_table_bin failed, using static CCA\n");
+ return 0;
+ }
+ if (read(fd, &ccti_limit, 2) != 2) {
+ _IPATH_CCADBG("Read ccti_limit failed,
using static CCA\n"); + close(fd); + return 0; + } + if (ccti_limit < 63 || ccti_limit > 65535) { + _IPATH_CCADBG("Read ccti_limit %d not in range [63, 65535], " + "using static CCA.\n", ccti_limit); + close(fd); + return 0; + } + + i = (ccti_limit+1)*sizeof(uint16_t); + cct = malloc(i); + if (!cct) { + close(fd); + return -1; + } + if (read(fd, cct, i) != i) { + _IPATH_CCADBG("Read ccti_entry_list, using static CCA\n"); + free(cct); + close(fd); + return 0; + } + + close(fd); +#endif + + *cctp = cct; + return ccti_limit; +} + +/* + * This is for diag function ipath_wait_for_packet() only + */ +int +ipath_cmd_wait_for_packet(int fd) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_WAIT_FOR_PACKET; + cmd.cmd.mic_info.data1 = fd; + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) errno = cmd.cmd.mic_info.data2; +#else + struct pollfd pfd; + + pfd.fd = fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, 500 /* ms */); +#endif + + return ret; +} + +/* + * This is for diag function ipath_hideous_ioctl_emulator() only + */ +int infinipath_get_unit_flash(int unitno, char **datap) +{ +#ifdef __MIC__ + int ret, size; + char *data; + struct ipath_cmd cmd; + + *datap = NULL; + cmd.type = IPATH_CMD_GET_UNIT_FLASH; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + data = malloc(size); + if (!data) return -1; + + ret = ipath_scif_recv(data, size); + if (ret) { + free(data); + return ret; + } + + *datap = data; + return 0; +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "flash", datap); + if (i < 0) + return -1; + else + return 0; +#endif +} + +/* + * This is for diag function ipath_hideous_ioctl_emulator() only + */ +int infinipath_put_unit_flash(int unitno, char *data, int len) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_PUT_UNIT_FLASH; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.data1 = len; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_send(data, len); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) errno = cmd.cmd.mic_info.data2; + return ret; +#else + int i; + i = ipath_ipathfs_unit_write(unitno, "flash", data, len); + if (i < 0) + return -1; + else + return 0; +#endif +} diff --git a/ipath/ipath_sysfs.c b/ipath/ipath_sysfs.c new file mode 100644 index 0000000..9065f8b --- /dev/null +++ b/ipath/ipath_sysfs.c @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MIC__ +// This file contains a simple sysfs interface used by the low level +// infinipath protocol code. It also implements the interface to ipathfs. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +static char *sysfs_path; +static size_t sysfs_path_len; +static char *ipathfs_path; +static long sysfs_page_size; + +static void __attribute__((constructor)) sysfs_init(void) +{ + struct stat s; + if (sysfs_path == NULL) + sysfs_path = getenv("IPATH_SYSFS_PATH"); + if (sysfs_path == NULL) { + static char syspath[64]; + snprintf(syspath, sizeof(syspath), + "%s%d", QIB_CLASS_PATH, 0); + sysfs_path = syspath; + } + if(stat(sysfs_path, &s) || !S_ISDIR(s.st_mode)) + _IPATH_DBG("Did not find sysfs directory %s, using anyway\n", + sysfs_path); + sysfs_path_len = strlen(sysfs_path); + + if (ipathfs_path == NULL) + ipathfs_path = getenv("IPATH_IPATHFS_PATH"); + if (ipathfs_path == NULL) + ipathfs_path = "/ipathfs"; + + if (!sysfs_page_size) + sysfs_page_size = sysconf(_SC_PAGESIZE); +} + +const char *ipath_sysfs_path(void) +{ + return sysfs_path; +} + +size_t ipath_sysfs_path_len(void) +{ + return sysfs_path_len; +} + +const char *ipath_ipathfs_path(void) +{ + return ipathfs_path; +} + +int ipath_sysfs_open(const char *attr, int flags) +{ + char buf[1024]; + int saved_errno; + int fd; + + snprintf(buf, sizeof(buf), "%s/%s", ipath_sysfs_path(), attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open driver attribute '%s': %s\n", attr, + strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_ipathfs_open(const char *attr, int flags) +{ + char buf[1024]; + int saved_errno; + int fd; + + snprintf(buf, sizeof(buf), "%s/%s", ipath_ipathfs_path(), attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open driver attribute '%s': %s\n", attr, + strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +static int sysfs_vprintf(int fd, const char *fmt, va_list ap) +{ + char *buf; + int len, ret; + int saved_errno; + + buf = alloca(sysfs_page_size); + len = 
vsnprintf(buf, sysfs_page_size, fmt, ap); + + if (len > sysfs_page_size) { + _IPATH_DBG("Attempt to write more (%d) than %ld bytes\n", len, + sysfs_page_size); + saved_errno = EINVAL; + ret = -1; + goto bail; + } + + ret = write(fd, buf, len); + saved_errno = errno; + + if (ret != -1 && ret < len) { + _IPATH_DBG("Write ran short (%d < %d)\n", ret, len); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + errno = saved_errno; + return ret; +} + +int ipath_sysfs_printf(const char *attr, const char *fmt, ...) +{ + int fd = -1; + va_list ap; + int ret = -1; + int saved_errno; + + fd = ipath_sysfs_open(attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to driver attribute '%s': %s\n", attr, + strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + int len, l; + + snprintf(buf, sizeof(buf), "%s", ipath_sysfs_path()); + len = l = strlen(buf) - 1; + while(l > 0 && isdigit(buf[l])) + l--; + if(l) + buf[++l] = 0; + else + l = len; /* assume they know what they are doing */ + snprintf(buf+l, sizeof(buf)-l, "%u/%s", unit, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + int len, l; + + snprintf(buf, sizeof(buf), "%s", ipath_sysfs_path()); + len = l = strlen(buf) - 1; + while(l > 0 && isdigit(buf[l])) + l--; + if(l) + buf[++l] = 0; + else + l = len; /* assume they know what they are doing */ + snprintf(buf+l, sizeof(buf)-l, "%u/ports/%u/%s", unit, port, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n", attr, + unit, port, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_ipathfs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + + snprintf(buf, sizeof(buf), "%s/%u/%s", ipath_ipathfs_path(), unit, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr, + const char *fmt, ...) +{ + va_list ap; + int ret = -1; + int saved_errno; + int fd; + + fd = ipath_sysfs_port_open(unit, port, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_printf(uint32_t unit, const char *attr, + const char *fmt, ...) 
+{ + va_list ap; + int ret = -1; + int saved_errno; + int fd; + + fd = ipath_sysfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +static int read_page(int fd, char **datap) +{ + char *data = NULL; + int saved_errno; + int ret = -1; + + data = malloc(sysfs_page_size); + saved_errno = errno; + + if (!data) { + _IPATH_DBG("Could not allocate memory: %s\n", strerror(errno)); + goto bail; + } + + ret = read(fd, data, sysfs_page_size); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Read of attribute failed: %s\n", strerror(errno)); + goto bail; + } + +bail: + if (ret == -1) { + free(data); + } else { + *datap = data; + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_read(const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_port_open(unit, port, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_write(uint32_t unit, const char *attr, const void *data, + size_t len) +{ + int fd = -1, ret = -1; + int saved_errno; + + if (len > sysfs_page_size) { + _IPATH_DBG("Attempt to write more (%ld) than %ld bytes\n", (long) len, + sysfs_page_size); + saved_errno = EINVAL; + goto bail; + } + + fd = ipath_sysfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = write(fd, data, len); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Attempt to write %ld bytes failed: %s\n", + (long) len, strerror(errno)); + goto bail; + } + + if (ret < len) { // sysfs routines can routine count including null byte + // so don't return an error if it's > len + _IPATH_DBG("Attempt to write %ld bytes came up short (%ld bytes)\n", + (long) len, (long) ret); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. 
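+ *
+ * A minimal call sequence (illustrative; the attribute name is made up
+ * for the example):
+ *
+ *   char *data = NULL;
+ *   int n = ipath_ipathfs_read("some_attr", &data);
+ *   if (n >= 0) {
+ *       // n is the byte count read; at most one page is returned
+ *       free(data);
+ *   }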
+ */ +int ipath_ipathfs_read(const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_ipathfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * The _rd routines jread directly into a supplied buffer, + * unlike the _read routines. + */ +int ipath_ipathfs_rd(const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_ipathfs_unit_rd(uint32_t unit, const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_ipathfs_unit_write(uint32_t unit, const char *attr, const void *data, + size_t len) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = write(fd, data, len); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Attempt to write %ld bytes failed: %s\n", + (long) len, strerror(errno)); + goto bail; + } + + if (ret != len) { + _IPATH_DBG("Attempt to write %ld bytes came up short (%ld bytes)\n", + (long) len, (long) ret); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_read_s64(const char *attr, int64_t *valp, int base) +{ + char *data, *end; + int ret; + int saved_errno; + long long val; + + ret = ipath_sysfs_read(attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = ipath_sysfs_unit_read(unit, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +int ipath_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = ipath_sysfs_port_read(unit, port, 
attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +#endif //__MIC__ diff --git a/ipath/ipath_syslog.c b/ipath/ipath_syslog.c new file mode 100644 index 0000000..b4ea8c4 --- /dev/null +++ b/ipath/ipath_syslog.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define __USE_GNU +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +#define SYSLOG_MAXLEN 512 + +extern char *__ipath_mylabel; + +void +ipath_vsyslog(const char *prefix, int to_console, int level, + const char *format, va_list ap) +{ + char logprefix[SYSLOG_MAXLEN]; + + if (to_console) { + char hostname[80]; + va_list ap_cons; + va_copy(ap_cons, ap); + size_t len = strlen(format); + gethostname(hostname, sizeof hostname); + hostname[sizeof hostname - 1] = '\0'; + + if (__ipath_mylabel) + fprintf(stderr, "%s", __ipath_mylabel); + else + fprintf(stderr, "%s: ", hostname); + + vfprintf(stderr, format, ap_cons); + if (format[len] != '\n') + fprintf(stderr, "\n"); + fflush(stderr); + va_end(ap_cons); + } + + (void)snprintf(logprefix, sizeof(logprefix), + "(ipath/%s)[%d]: %s", prefix ? prefix : "ipath", (int) getpid(), + format); + + vsyslog(level | LOG_USER, logprefix, ap); + + return; +} + +void +ipath_syslog(const char *prefix, int to_console, int level, + const char *format, ...) +{ + va_list ap; + va_start(ap, format); + ipath_vsyslog(prefix, to_console, level, format, ap); + va_end(ap); +} + diff --git a/ipath/ipath_time.c b/ipath/ipath_time.c new file mode 100644 index 0000000..ca3faa8 --- /dev/null +++ b/ipath/ipath_time.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define __USE_GNU +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +// init the cycle counter to picosecs/cycle conversion automatically +// at program startup, if it's using timing functions. +static void init_picos_per_cycle(void) __attribute__ ((constructor)); +static int ipath_timebase_isvalid(uint32_t pico_per_cycle); +static uint32_t ipath_timebase_from_cpuinfo(uint32_t old_pico_per_cycle); + +// in case two of our mechanisms fail +#ifdef __powerpc__ +#define SAFEDEFAULT_PICOS_PER_CYCLE 69000 +#else +#define SAFEDEFAULT_PICOS_PER_CYCLE 500 +#endif + +uint32_t __ipath_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + +// This isn't perfect, but it's close enough for rough timing. We want this +// to work on systems where the cycle counter isn't the same as the clock +// frequency. +// __ipath_pico_per_cycle isn't going to lead to completely accurate +// conversions from timestamps to nanoseconds, but it's close enough for +// our purposes, which is mainly to allow people to show events with nsecs +// or usecs if desired, rather than cycles. We use it in some performance +// analysis, but it has to be done with care, since cpuspeed can change, +// different cpu's can have different speeds, etc. +// +// Some architectures don't have their TSC-equivalent running at anything +// related to the the processor speed (e.g. G5 Power systems use a fixed +// 33 MHz frequency). + +#define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */ + +static int timebase_debug = 0; /* off by default */ + +#define timebase_warn_always(fmt,...) \ + ipath_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__) +#define timebase_warn(fmt,...) if (timebase_debug) \ + timebase_warn_always(fmt, ##__VA_ARGS__) + +static int ipath_timebase_isvalid(uint32_t pico_per_cycle) +{ +#if defined(__x86_64__) || defined(__i386__) + /* If pico-per-cycle is less than 200, the clock speed would be greater + * than 5 GHz. Similarly, we minimally support a 1GHz clock. 
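+ * (Worked example: a 2.5 GHz TSC ticks once every 10^12 / 2.5e9 = 400
+ * picoseconds, comfortably inside the accepted window.)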
+ * Allow some slop, because newer kernels with HPET can be a few + * units off, and we don't want to spend the startup time needlessly */ + if (pico_per_cycle >= 198 && pico_per_cycle <= 1005) + return 1; +#elif defined(__powerpc__) + /* If pico-per-cycle is not between 1MHz and 1GHz, complain */ + if (pico_per_cycle >= 9950 && pico_per_cycle <= 1005000) + return 1; +#endif + else + return 0; +} + +/* + * Method #1: + * + * Derive the pico-per-cycle by trying to correlate the difference between two + * reads of the tsc counter to gettimeofday. + */ +static void init_picos_per_cycle() +{ + struct timeval tvs, tve; + int64_t usec = 0; + uint64_t ts, te; + int64_t delta; + uint32_t picos = 0; + int trials = 0; + int retry = 0; + cpu_set_t cpuset, cpuset_saved; + int have_cpuset = 1; + + /* + * Make sure we try to calculate the cycle time without being migrated. + */ + CPU_ZERO(&cpuset_saved); + if (sched_getaffinity(0, sizeof cpuset, &cpuset_saved)) + have_cpuset = 0; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if(have_cpuset && sched_setaffinity(0,sizeof cpuset, &cpuset)) + have_cpuset = 0; + + /* + * If we set affinity correctly, give the scheduler another change to put + * us on processor 0 + */ + if (have_cpuset) + sched_yield(); + +retry_pico_test: + if (++retry == 10) { + __ipath_pico_per_cycle = + ipath_timebase_from_cpuinfo(picos); + goto reset_cpu_mask; /* Reset CPU mask before exiting */ + } + + usec = 0; + gettimeofday(&tvs, NULL); + ts = get_cycles(); + while (usec < MIN_TEST_TIME_IN_PICOS) { /* wait for at least 100 millisecs */ + trials++; + usleep(125); + gettimeofday(&tve, NULL); + usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) + + 1000000000000LL * (tve.tv_sec - tvs.tv_sec); + if (usec < 0) { + timebase_warn("RTC timebase, gettimeofday is negative (!) %lld\n", + (long long) usec); + goto retry_pico_test; + } + } + te = get_cycles(); + delta = te - ts; + picos = (uint32_t)(usec / delta); + + if (!ipath_timebase_isvalid(picos)) { + cpu_set_t cpuget; + int affinity_valid = !sched_getaffinity(0, sizeof cpuget, &cpuget); + if (affinity_valid && !CPU_ISSET(0, &cpuget)) + affinity_valid = 0; + timebase_warn("Failed to get valid RTC timebase, gettimeofday delta=%lld, " + "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n", + (long long) usec, (long long) delta, picos, + affinity_valid ? "YES" : "NO", retry); + goto retry_pico_test; + } + + /* If we've had to retry even once, let that be known */ + if (retry > 1) + timebase_warn("Clock is %d picos/cycle found in %d trials and " + "%.3f seconds (retry=%d)\n", picos, trials, + (double) usec / 1.0e12, retry); + + __ipath_pico_per_cycle = picos; + + reset_cpu_mask: + /* Restore affinity */ + if (have_cpuset) { + sched_setaffinity(0, sizeof cpuset, &cpuset_saved); + /* + * Give a chance to other processes that also set affinity to 0 for + * doing this test. + */ + sched_yield(); + } +} + +/* + * Method #2: + * + * Derive the pico-per-cycle from /proc instead of using sleep trick + * that relies on scheduler. 
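+ *
+ * On x86 this amounts to inverting the "cpu MHz" line of /proc/cpuinfo
+ * (sketch with an assumed sample value):
+ *
+ *   cpu MHz : 2500.000   =>   1000000 / 2500.0 = 400 picos/cycle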
+ */ +static uint32_t +ipath_timebase_from_cpuinfo(uint32_t old_pico_per_cycle) +{ + /* we only validate once */ + uint32_t new_pico_per_cycle = old_pico_per_cycle; + + char hostname[80]; + gethostname(hostname, 80); + hostname[sizeof hostname - 1] = '\0'; + + if (getenv("IPATH_DEBUG_TIMEBASE")) + timebase_debug = 1; + + /* If the old one is valid, don't bother with this mechanism */ + if (ipath_timebase_isvalid(old_pico_per_cycle)) + return old_pico_per_cycle; + +#if defined(__x86_64__) || defined(__i386__) + { + FILE *fp = fopen("/proc/cpuinfo","r"); + char input[255]; + char *p = NULL; + + if (!fp) + goto fail; + + while (!feof(fp) && fgets(input, 255, fp)) { + if (strstr(input,"cpu MHz")) { + p = strchr(input,':'); + double MHz = 0.0; + if (p) MHz = atof(p+1); + new_pico_per_cycle = (uint32_t)(1000000. / MHz); + break; + } + } + fclose(fp); + if (!p) + goto fail; + } +#elif defined(__powerpc__) + #include + #include + { + DIR *dp = opendir("/proc/device-tree/cpus"); + uint32_t freq; + FILE *fp = NULL; + char buf[256]; + struct dirent *de = NULL; + int found = 0; + if (!dp) + goto fail; + do { + de = readdir(dp); + if (de && (de->d_name == strstr(de->d_name, "PowerPC,"))) { + found = 1; + break; + } + } while (de != NULL); + if (!found) + goto fail; + + snprintf(buf, sizeof buf, + "/proc/device-tree/cpus/%s/timebase-frequency", de->d_name); + if ((fp = fopen(buf, "r"))) { + if (fread((void *) &freq, sizeof(uint32_t), 1, fp) != 1) + goto fail; + /* freq is in Hz */ + new_pico_per_cycle = 1e6 / (freq / 1e6); + fclose(fp); + } + else + goto fail; + } +#endif + + /* If there's no change (within a small range), just return the old one */ + if (abs(new_pico_per_cycle - old_pico_per_cycle) < 5) + return old_pico_per_cycle; + + if (ipath_timebase_isvalid(new_pico_per_cycle)) { + timebase_warn_always("RTC timebase, using %d picos/cycle from /proc " + "instead of the detected %d picos/cycle\n", + new_pico_per_cycle, old_pico_per_cycle); + return new_pico_per_cycle; + } + +fail: + new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + timebase_warn_always( + "Problem obtaining CPU time base, detected to be %d " + "pico/cycle, adjusted to safe default %d picos/cycle", + old_pico_per_cycle, new_pico_per_cycle); + return new_pico_per_cycle; +} + diff --git a/ipath/ipath_utils.c b/ipath/ipath_utils.c new file mode 100644 index 0000000..4df8189 --- /dev/null +++ b/ipath/ipath_utils.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// This file contains the ipath service routine interface used by the
+// low-level infinipath protocol code.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "ipserror.h"
+#include "ipath_user.h"
+
+int __ipath_malloc_no_mmap = 0; // keep track of whether we disabled mmap in malloc
+
+// This exists as a separate routine called on (very rare)
+// ipath_update_tid() errors, so as to avoid pulling unnecessary code
+// into the instruction cache, keeping the fast path code as fast as possible.
+int ipath_update_tid_err(void)
+{
+ int ret = errno; // preserve errno for return
+
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return ret;
+}
+
+// This exists as a separate routine called on (very rare)
+// ipath_free_tid() errors, so as to avoid pulling unnecessary code
+// into the instruction cache, keeping the fast path code as fast as possible.
+int ipath_free_tid_err(void)
+{
+ int ret = errno; // preserve errno for return
+
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return ret;
+}
+
+// touch the pages, with a 32 bit read
+void ipath_touch_mmap(void *m, size_t bytes)
+{
+ volatile uint32_t *b = (volatile uint32_t *)m, c;
+ size_t i; // m is always page aligned, so pgcnt exact
+ int __ipath_pg_sz;
+
+ /* First get the page size */
+ __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+
+ _IPATH_VDBG("Touch %lu mmap'ed pages starting at %p\n", (unsigned long) bytes/__ipath_pg_sz, m);
+ bytes /= sizeof c;
+ for(i=0; ispc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL)
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+// flush the eager buffers, by setting the eager index head to eager index tail
+// if the eager buffer queue is full.
+//
+// Called when we had eager buffer overflows (ERR_TID/INFINIPATH_RHF_H_TIDERR
+// was set in RHF errors), and no good eager packets were received, so
+// that the eager head wasn't advanced.
+//
+
+void ipath_flush_egr_bufs(struct _ipath_ctrl *ctrl)
+{
+ uint32_t head = __le32_to_cpu(*ctrl->__ipath_rcvegrhead);
+ uint32_t tail = __le32_to_cpu(*ctrl->__ipath_rcvegrtail);
+
+ if((head%ctrl->__ipath_tidegrcnt) == ((tail+1)%ctrl->__ipath_tidegrcnt)) {
+ _IPATH_DBG("eager array full after overflow, flushing (head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ *ctrl->__ipath_rcvegrhead = __cpu_to_le32(tail);
+ }
+}
+
+// stop_start == 0 disables receive on the context, for use in queue
+// overflow conditions.
stop_start==1 re-enables, to be used to +// re-init the software copy of the head register +int ipath_manage_rcvq(struct _ipath_ctrl *ctrl, uint32_t stop_start) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_RECV_CTRL; + cmd.cmd.recv_ctrl = stop_start; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// enable == 1 enables armlaunch (normal), 0 disables (only used +// ipath_pkt_test -B at the moment, needed for linda). +int ipath_armlaunch_ctrl(struct _ipath_ctrl *ctrl, uint32_t enable) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_ARMLAUNCH_CTRL; + cmd.cmd.armlaunch_ctrl = enable; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// force PIOAvail register to be updated to memory +int ipath_force_pio_avail_update(struct _ipath_ctrl *ctrl) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_PIOAVAILUPD; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// ack event bits, and clear them. Usage is check *spi_sendbuf_status, +// pass bits you are prepared to handle to ipath_event_ack(), perform the +// appropriate actions for bits that were set, and then (if appropriate) +// check the bits again. +int ipath_event_ack(struct _ipath_ctrl *ctrl, __u64 ackbits) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_ACK_EVENT; + cmd.cmd.event_mask = ackbits; + + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver. */ + _IPATH_DBG("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// Disarm any send buffers which need disarming. +int ipath_disarm_bufs(struct _ipath_ctrl *ctrl) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_DISARM_BUFS; + + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver. */ + _IPATH_DBG("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// Wait until send dma completion reaches at least 'completion_counter' +int ipath_sdma_complete(struct _ipath_ctrl *ctrl, uint32_t *counter) +{ + struct ipath_cmd cmd; + int ret; + + cmd.type = IPATH_CMD_SDMA_COMPLETE; + cmd.cmd.sdma_cntr = (uintptr_t) counter; + VALGRIND_MAKE_MEM_DEFINED(&cmd, sizeof(struct ipath_cmd)); + + *counter = 0; + if ((ret = ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd))) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + return 1; +} + +// Return send dma's current "in flight counter " +int ipath_sdma_inflight(struct _ipath_ctrl *ctrl, uint32_t *counter) +{ + struct ipath_cmd cmd; + int ret; + + cmd.type = IPATH_CMD_SDMA_INFLIGHT; + cmd.cmd.sdma_cntr = (uintptr_t) counter; + VALGRIND_MAKE_MEM_DEFINED(&cmd, sizeof(struct ipath_cmd)); + + *counter = 0; + if ((ret = ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd))) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + return 1; +} + +// Tell the driver to change the way packets can generate interrupts. 
+// +// IPATH_POLL_TYPE_URGENT: Generate interrupt only when packet sets +// INFINIPATH_KPF_INTR +// IPATH_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on). +// +// PSM: Uses TYPE_URGENT in ips protocol +// +int ipath_poll_type(struct _ipath_ctrl *ctrl, uint16_t poll_type) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_POLL_TYPE; + cmd.cmd.poll_type = poll_type; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// wait for a received packet for our context +// This allows us to not busy wait, if nothing has happened for a +// while, which allows better measurements of cpu utilization, and +// in some cases, slightly better performance. Called where we would +// otherwise call sched_yield(). It is not guaranteed that a packet +// has arrived, so the normal checking loop(s) should be done. +// +// PSM: not used as is, PSM has it's own use of polling for interrupt-only +// packets (sets ipath_poll_type to TYPE_URGENT) +int ipath_wait_for_packet(struct _ipath_ctrl *ctrl) +{ + return ipath_cmd_wait_for_packet(ctrl->spc_dev.spd_fd); +} + +int ipath_hideous_ioctl_emulator(int unit, int reqtype, struct ipath_eeprom_req *req) +{ + switch (reqtype) { + case IPATH_READ_EEPROM: + { + // Emulate a read of a byte range by doing a full read, then + // getting the bits we want. + char *data; + + if (infinipath_get_unit_flash(unit, &data) == -1) { + if (data) free(data); + return -1; + } + + memcpy((char *) (unsigned long) req->addr, data + req->offset, + req->len); + + free(data); + + break; + } + case IPATH_WRITE_EEPROM: + { + // Emulate a write to a byte range by doing a full read, + // modifying the bits we want, then a full write. + char *data; + int len; + + len = infinipath_get_unit_flash(unit, &data); + + if (len == -1) { + if (data) free(data); + return -1; + } + + memcpy(data + req->offset, (char *) (unsigned long) req->addr, + req->len); + + if (infinipath_put_unit_flash(unit, data, len) == -1) { + free(data); + return -1; + } + + free(data); + + break; + } + default: + fprintf(stderr, "invalid hideous emulated ioctl: %d\n", reqtype); + exit(1); + } + return 0; +} + +// check if the chip/board are in an OK state. If not, +// print a message and return an error code. Used at +// places where we are going to be in slow mode anyway, +// such as open, close, and out of pio buffers +// +// PSM: implemented in context abstraction psmi_context_check_status() +// As of 7322-ready driver, need to check port-specific qword for IB +// as well as older unit-only. For now, we don't have the port interface +// defined, so just check port 0 qword for spi_status +// Hard-code spmsg as 3rd qword until we have IB port +int ipath_check_unit_status(struct _ipath_ctrl *ctrl) +{ + char *spmsg = NULL, *msg = NULL, buf[80]; + int rc = IPS_RC_OK; + _Pragma_unlikely + + if(!ctrl->__ipath_spi_status) + return rc; + + if( !(ctrl->__ipath_spi_status[0] & IPATH_STATUS_CHIP_PRESENT) || + (ctrl->__ipath_spi_status[0] & (IPATH_STATUS_HWERROR))) { + rc = IPS_RC_DEVICE_ERROR; + if(ctrl->lasterr != rc) { // only report once + spmsg = (char*)&ctrl->__ipath_spi_status[2]; // string for hardware error, if any + if(!*spmsg) { + msg = buf; + snprintf(buf, sizeof buf, "%s\n", + (ctrl->__ipath_spi_status[0] & IPATH_STATUS_HWERROR) ? 
+ "Hardware error" : "Hardware not found"); + } + } + } + else if (!(ctrl->__ipath_spi_status[0] & IPATH_STATUS_IB_CONF) && + !(ctrl->__ipath_spi_status[1] & IPATH_STATUS_IB_CONF)) { + rc = IPS_RC_NETWORK_DOWN; + if(ctrl->lasterr != rc) // only report once + spmsg = (char*)&ctrl->__ipath_spi_status[2]; // string for hardware error, if any + } + else if (!(ctrl->__ipath_spi_status[0] & IPATH_STATUS_IB_READY) && + !(ctrl->__ipath_spi_status[1] & IPATH_STATUS_IB_READY)) { + // if only this error, probably cable pulled, switch rebooted, etc. + // report it the first time, and then treat it same as BUSY, since + // it could be recovered from within the quiescence period + rc = IPS_RC_BUSY; + if(ctrl->lasterr != rc) // only report once + msg = "IB Link is down"; + } + if(spmsg && *spmsg) { + _IPATH_ERROR("Hardware problem: %s\n", spmsg); + // and try to get it out to user before returning error so mpirun shows + // since mpi interface code will normally exit immediately on errors + fflush(stdout); + sleep(1); + } + else if(msg) + _IPATH_DBG("%s\n", msg); + if(ctrl->lasterr && rc==IPS_RC_OK) + ctrl->lasterr = 0; // cleared up, report if it happens again + else if(rc != IPS_RC_OK) + ctrl->lasterr = rc; + return rc; +} + +/* These have been fixed to read the values, but they are not + * compatible with the ipath driver, they return new info with + * the qib driver + */ +static int infinipath_count_names(const char *namep) +{ + int n = 0; + while (*namep != '\0') { + if (*namep == '\n') + n++; + namep++; + } + return n; +} + +const char * infinipath_get_next_name(char **names) +{ + char *p, *start; + + p = start = *names; + while (*p != '\0' && *p != '\n') { + p++; + } + if (*p == '\n') { + *p = '\0'; + p++; + *names = p; + return start; + } else + return NULL; +} + +void infinipath_release_names(char *namep) +{ + /* TODO: names were initialised in the data section before. Now + * they are allocated when ipath_ipathfs_read() is called. Allocation + * for names is done only once at init time. Should we eventually + * have an "stats_type_unregister" type of routine to explicitely + * deallocate memory and free resources ? 
+ */ +#if 0 + if (namep != NULL) + free(namep); +#endif +} + +int infinipath_get_stats_names_count() +{ + char *namep; + int c; + + c = infinipath_get_stats_names(&namep); + free(namep); + return c; +} + +int infinipath_get_ctrs_unit_names_count(int unitno) +{ + char *namep; + int c; + + c = infinipath_get_ctrs_unit_names(unitno, &namep); + free(namep); + return c; +} + +int infinipath_get_ctrs_port_names_count(int unitno) +{ + char *namep; + int c; + + c = infinipath_get_ctrs_port_names(unitno, &namep); + free(namep); + return c; +} + +int infinipath_lookup_stat(const char *attr, char *namep, uint64_t *stats, + uint64_t *s) +{ + const char *p; + int i, ret = -1, len = strlen(attr); + int nelem = infinipath_count_names(namep); + + for (i = 0; i < nelem; i++) { + p = infinipath_get_next_name(&namep); + if (p == NULL) break; + if (strncasecmp(p, attr, len+1) == 0) { + ret = i; + *s = stats[i]; + } + } + return ret; +} + +uint64_t infinipath_get_single_stat(const char *attr, uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_stats_names(&namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_stats(stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +uint64_t infinipath_get_single_unitctr(int unit, const char *attr, uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_ctrs_unit_names(unit, &namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_ctrs_unit(unit, stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +int infinipath_get_single_portctr(int unit, int port, const char *attr, + uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_ctrs_port_names(unit, &namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_ctrs_port(unit, port, stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +/* + * Add a constructor function to disable mmap if asked to do so by the user + */ +static void init_mallopt_disable_mmap(void) __attribute__ ((constructor)); + +static void init_mallopt_disable_mmap(void) +{ + char *env = getenv("IPATH_DISABLE_MMAP_MALLOC"); + + if (env && *env) { + if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { + __ipath_malloc_no_mmap = 1; + } + } + + return; +} diff --git a/ipath/ipath_write_pio-i386.c b/ipath/ipath_write_pio-i386.c new file mode 100644 index 0000000..603edc1 --- /dev/null +++ b/ipath/ipath_write_pio-i386.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs require fence after first 32 bits of pbc write + // Can't do as uint64_t store, or compiler could reorder + ips_wmb(); + *piob++ = buf.pbcflags; + + if(!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (IPATH_MESSAGE_HDR_SIZE >> 2)-1; + ipath_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if(len>16) { + uint32_t pay_words = 16*((len-1)/16); + ipath_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + // now write the final chunk a word at a time, fence before trigger + for(j=0;j<(len-1);j++) + *piob++ = *pay2++; + ips_wmb(); // flush the buffer out now, so + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs needs compiler fence to prevent compiler reordering + // the two 32 bit stores in a uint64_t, but on inorder wc systems, does not + // need a memory fence. 
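+ // (Note: an empty asm with a "memory" clobber, as below, is a
+ // compiler-only barrier: it blocks compiler reordering but emits no
+ // fence instruction, which is the cheaper guarantee this in-order
+ // write-combining path relies on.)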
+ asm volatile("" : : : "memory"); + *piob++ = buf.pbcflags; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +static inline void ipath_write_pio_special_trigger(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata, + unsigned offset) +{ + union ipath_pbc buf = {0}; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs needs compiler fence to prevent compiler reordering + // the two 32 bit stores in a uint64_t, but on inorder wc systems, does not + // need a memory fence. + asm volatile("" : : : "memory"); + *piob++ = buf.pbcflags; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if (pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); + *(piobs + offset) = IPATH_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} + diff --git a/ipath/ipath_write_pio-ppc.c b/ipath/ipath_write_pio-ppc.c new file mode 100644 index 0000000..f6bda57 --- /dev/null +++ b/ipath/ipath_write_pio-ppc.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +union piovec { + vector unsigned int vec; + uint32_t dw[4]; +}; + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint32_t *dpiob = (volatile uint32_t *)piob; + uint32_t *dhdr = hdr; + uint32_t *ddata = bdata; + uint32_t dlen = pioparm->length >> 2; + union piovec vec; + volatile vector unsigned int *vpiob; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + vpiob = (volatile vector unsigned int *)dpiob; + + vec.dw[0] = buf.dword; + vec.dw[1] = 0; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + ips_wmb(); + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr; + + if ( !dlen ) { + ips_wmb(); + *vpiob++ = vec.vec; + dpiob = (volatile uint32_t *) vpiob; + } else { + *vpiob++ = vec.vec; + + while ( dlen > 4 ) { + vec.dw[0] = *ddata++; + vec.dw[1] = *ddata++; + vec.dw[2] = *ddata++; + vec.dw[3] = *ddata++; + *vpiob++ = vec.vec; + dlen -= 4; + } + + switch ( dlen ) { + + case 4: { + vec.dw[0] = *ddata++; + vec.dw[1] = *ddata++; + vec.dw[2] = *ddata++; + vec.dw[3] = *ddata; + ips_wmb(); + *vpiob++ = vec.vec; + dpiob = (volatile uint32_t *) vpiob; + } break; + + case 3: { + dpiob = (volatile uint32_t *)vpiob; + *dpiob++ = *ddata++; + *dpiob++ = *ddata++; + ips_wmb(); + *dpiob++ = *ddata; + } break; + + case 2: { + dpiob = (volatile uint32_t *)vpiob; + *dpiob++ = *ddata++; + ips_wmb(); + *dpiob++ = *ddata; + } break; + + case 1: { + dpiob = (volatile uint32_t *)vpiob; + ips_wmb(); + *dpiob++ = *ddata; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *dpiob = pioparm->cksum; + dpiob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *dpiob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint32_t *dpiob = piob; + uint32_t *dhdr = hdr; + uint32_t *ddata = bdata; + uint32_t dlen = pioparm->length >> 2; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *dpiob++ = buf.dword; + asm volatile("" : : : "memory"); + *dpiob++ = 0; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + if ( !dlen ) { + asm volatile("" : : : "memory"); + *dpiob++ = *dhdr; + } else { + *dpiob++ = *dhdr; + + while ( dlen > 1 ) { + *dpiob++ = *ddata++; + dlen -= 1; + } + + asm volatile("" : : : "memory"); + *dpiob++ = *ddata; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *dpiob = pioparm->cksum; + dpiob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *dpiob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 2k support for ppc\n"); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 4k support for ppc\n"); +} diff --git a/ipath/ipath_write_pio-ppc64.c b/ipath/ipath_write_pio-ppc64.c new file mode 100644 index 0000000..c7f8764 --- /dev/null +++ b/ipath/ipath_write_pio-ppc64.c @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. 
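/*
 * Editorial sketch, not part of the original tree: the Altivec routines in
 * this file batch scalar values into a 16-byte "piovec" union and emit each
 * chunk as a single vector store, so the write-combining PIO buffer is
 * always filled in whole, address-ordered 16-byte units. This assumes the
 * Altivec vector types and fixed-width integer types that the includes
 * below provide.
 */
static inline void piovec_store16_sketch(volatile vector unsigned int *dst,
                                         uint64_t q0, uint64_t q1)
{
	union {
		vector unsigned int vec;
		uint64_t qw[2];
	} chunk;

	chunk.qw[0] = q0;	/* assemble the 16-byte chunk first... */
	chunk.qw[1] = q1;
	*dst = chunk.vec;	/* ...then issue one vector store to WC space */
}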
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +union piovec { + vector unsigned int vec; + uint64_t qw[2]; +}; + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint64_t *qpiob = (volatile uint64_t *)piob; + uint64_t *qhdr = hdr; + uint64_t *qdata = bdata; + uint64_t dlen = pioparm->length >> 2; + union piovec vec; + volatile vector unsigned int *vpiob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + vpiob = (volatile vector unsigned int *)qpiob; + + vec.qw[0] = buf.qword; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + ips_wmb(); + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr; + + if ( !dlen ) { + ips_wmb(); + *vpiob++ = vec.vec; + piob = (volatile uint32_t*) qpiob; + } else { + *vpiob++ = vec.vec; + + while ( dlen > 4 ) { + vec.qw[0] = *qdata++; + vec.qw[1] = *qdata++; + *vpiob++ = vec.vec; + dlen -= 4; + } + + switch ( dlen ) { + + case 4: { + vec.qw[0] = *qdata++; + vec.qw[1] = *qdata; + ips_wmb(); + *vpiob++ = vec.vec; + piob = (volatile uint32_t*) qpiob; + } break; + + case 3: { + volatile uint32_t *dpiob; + uint32_t *ddata; + qpiob = (volatile uint64_t *)vpiob; + *qpiob++ = *qdata++; + dpiob = (volatile uint32_t *)qpiob; + ddata = (uint32_t *)qdata; + ips_wmb(); + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + + case 2: { + qpiob = (volatile uint64_t *)vpiob; + ips_wmb(); + *qpiob++ = *qdata; + piob = (volatile uint32_t*) qpiob; + } break; + + case 1: { + volatile uint32_t *dpiob = (volatile uint32_t *)vpiob; + uint32_t *ddata = (uint32_t *)qdata; + ips_wmb(); + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. 
+ */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint64_t *qpiob = (volatile uint64_t *)piob; + uint64_t *qhdr = hdr; + uint64_t *qdata = bdata; + uint64_t dlen = pioparm->length >> 2; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *qpiob++ = buf.qword; + asm volatile("" : : : "memory"); // prevent compiler reordering + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + if ( !dlen ) { + asm volatile("" : : : "memory"); // prevent compiler reordering + *qpiob++ = *qhdr; + piob = (volatile uint32_t*) qpiob; + } else { + *qpiob++ = *qhdr; + + while ( dlen > 2 ) { + *qpiob++ = *qdata++; + dlen -= 2; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + + switch ( dlen ) { + + case 2: { + *qpiob++ = *qdata; + piob = (volatile uint32_t*) qpiob; + } break; + + case 1: { + volatile uint32_t *dpiob = (volatile uint32_t *)qpiob; + uint32_t *ddata = (uint32_t *)qdata; + + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); + + return; +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 2k support for ppc64\n"); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 4k support for ppc64\n"); +} diff --git a/ipath/ipath_write_pio-x86_64.c b/ipath/ipath_write_pio-x86_64.c new file mode 100644 index 0000000..a5d47d7 --- /dev/null +++ b/ipath/ipath_write_pio-x86_64.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + ips_wmb(); // pbc must be forced to be first write to chip buffer + piob += 2; + + if(!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (IPATH_MESSAGE_HDR_SIZE >> 2)-1; + ipath_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if(len>16) { + uint32_t pay_words = 16*((len-1)/16); + ipath_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + // now write the final chunk a word at a time, fence before trigger + for(j=0;j<(len-1);j++) + *piob++ = *pay2++; + ips_wmb(); // flush the buffer out now, so + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. 
+ */ +#ifdef __MIC__ +void ipath_write_pio_vector(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc *pbc; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + pbc = (union ipath_pbc *)((char *)hdr - 8); + pbc->qword = 0; + pbc->length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + pbc->pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + pbc->pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + +#ifdef PSM_DEBUG + if (((uint64_t)piob) & 63) { + _IPATH_ERROR("ipath_write_pio_vector(): piob not 64byte aligned\n"); + return; + } + if (((uint64_t)pbc) & 63) { + _IPATH_ERROR("ipath_write_pio_vector(): pbc not 64byte aligned\n"); + return; + } +#endif + memcpy((uint32_t *)piob, pbc, IPATH_MESSAGE_HDR_SIZE+8); + piob += (IPATH_MESSAGE_HDR_SIZE >> 2) + 2; + + if(pioparm->length) + memcpy((uint32_t *)piob, (uint32_t*)bdata, pioparm->length); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reorder + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + //ips_wmb(); +} +#endif //__MIC__ + +void ipath_write_pio(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc buf = {0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + ipath_dwordcpy_safe(piob, hdr, IPATH_MESSAGE_HDR_SIZE >> 2); + + asm volatile("" : : : "memory"); // prevent compiler reordering + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reorder + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * here we trigger on a "special" address, so just bang it out + * as fast as possible... + */ +static inline void +ipath_write_pio_special_trigger(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, + void *hdr, void *bdata, unsigned offset) +{ + union ipath_pbc buf = {0}; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT| + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* + * flush then write "special" then flush... + */ + ips_wmb(); + *(piobs + offset) = IPATH_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} diff --git a/libuuid/COPYING b/libuuid/COPYING new file mode 100644 index 0000000..2f17068 --- /dev/null +++ b/libuuid/COPYING @@ -0,0 +1,25 @@ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, and the entire permission notice in its entirety, + including the disclaimer of warranties. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. The name of the author may not be used to endorse or promote + products derived from this software without specific prior + written permission. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF +WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. 
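All of the architecture variants above implement the same contract, which is easy to lose in the per-CPU detail: the two PBC dwords must reach the chip buffer before anything else, the header and payload are streamed in address order, and in most variants the final dword is held back behind a fence because it can trigger the send. The condensation below is an editorial sketch; pio_order_sketch and its parameters are hypothetical names, while ips_wmb() and the buffer layout come from the code above.

/*
 * Sketch of the shared PIO ordering discipline (in-order variant).
 * Assumes n >= 1 payload dwords and a buffer laid out as two PBC dwords,
 * then the header, then the payload.
 */
static void pio_order_sketch(volatile uint32_t *piob, const uint32_t *pbc2,
			     const uint32_t *hdr, unsigned hdr_dw,
			     const uint32_t *payload, unsigned n)
{
	unsigned i;

	*piob++ = pbc2[0];			/* PBC reaches the chip first */
	asm volatile("" : : : "memory");	/* compiler fence only: in-order WC */
	*piob++ = pbc2[1];

	for (i = 0; i < hdr_dw; i++)		/* header, in address order */
		*piob++ = hdr[i];

	for (i = 0; i + 1 < n; i++)		/* all payload but the last dword */
		*piob++ = payload[i];

	asm volatile("" : : : "memory");	/* fence before the dword that
						 * may trigger the send */
	*piob++ = payload[n - 1];

	ips_wmb();				/* flush the WC buffer now */
}

The force-order variants replace the first compiler fence with a full ips_wmb(), and the special-trigger variants additionally write IPATH_SPECIAL_TRIGGER_MAGIC at a fixed buffer offset between two ips_wmb() calls after the packet body.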
diff --git a/libuuid/ChangeLog b/libuuid/ChangeLog new file mode 100644 index 0000000..b90e063 --- /dev/null +++ b/libuuid/ChangeLog @@ -0,0 +1,556 @@ +2006-06-30 Theodore Ts'o + + * Release of E2fsprogs 1.38 + +2005-03-21 Theodore Ts'o + + * Release of E2fsprogs 1.37 + +2006-02-05 Theodore Ts'o + + * Release of E2fsprogs 1.36 + +2005-02-05 Theodore Ts'o + + * Makefile.in: Remove uuid.pc on a "make distclean" + +2005-01-26 Theodore Ts'o + + * uuid.pc.in: Add pkg-config files. + +2005-01-18 Theodore Ts'o + + * Makefile.in: Fix the kernel compile-time echo commands to be + consistent and portable + +2005-01-17 Theodore Ts'o + + * uuidP.h: Use inttypes.h in preference to stdint.h for + compatibility with older FreeBSD and Solaris systems. + +2004-12-14 Theodore Ts'o + + * Makefile.in: Use Linux-kernel-style makefile output for "make + install" + + * Makefile.in (installdirs): Use $(MKINSTALLDIRS) macro. + Update dependencies. + +2004-11-30 Theodore Ts'o + + * Makefile.in: Use Linux-kernel-style makefile output to make it + easier to see errors/warnings. + +2004-09-17 Theodore Ts'o + + * gen_uuid.c (get_node_id): glibc always defines AF_LINK, so only + try to use struct sockaddr_dl if HAVE_NET_IF_DL_H is + defined. (Addresses Debian Bug #256669) + +2004-05-27 Theodore Ts'o + + * uuid.h (UUID_DEFINE): Make the UUID defined as a static + variable, with __attribute__ ((unused)) if we are using GCC. + +2004-05-04 Theodore Ts'o + + * Update and clean up uuid man pages + + * gen_uuid.c (uuid_generate_time): Mask off the timestamp to avoid + a Y8.8888K problem. + +2004-04-03 Theodore Ts'o + + * Makefile.in: Update the modtime even if subst doesn't need to + update the libuuid man pages, to avoid always re-running + subst, especially since there are no dependencies on the + man page. + +2004-04-03 Theodore Ts'o + + * libuuid.3.in, uuid_clear.3.in, uuid_compare.3.in, uuid_copy.3.in, + uuid_generate.3.in, uuid_is_null.3.in, uuid_parse.3.in, + uuid_time.3.in, uuid_unparse.3.in: Change licensing of man + pages from GPL to 3-clause BSD-style. + + * uuid_parse.3.in, uuid_unparse.3.in: Change the use of the term + "internal format" to "binary representation". + + * gen_uuid.c, pack.c, unpack.c, uuid_time.c, uuidP.h, + uuid_types.h.in: Use ANSI C99 types if stdint.h exists. + +2004-03-30 Theodore Ts'o + + * gen_uuid.c (get_node_id): Clean up AF_LINK #ifdef's for Darwin. + +2004-03-22 Theodore Ts'o + + * unparse.c (uuid_unparse_lower, uuid_unparse_upper), + uuid_unparse.3.in, uuid.h: Add new functions. + +2004-03-19 Theodore Ts'o + + * Change the license to be the 3-clause BSD-style license + + * uuid.h (UUID_DEFINE): Add UUID type #define's, and add an CPP + macro to define UUID constants. + + * gen_uuid.c (get_clock): Use 14 bits for the clock sequence, + instead of just 13 bits. + + * gen_uuid.c (get_node_id): Fix so that Darwin will actually get + the ethernet address correctly. + +2004-02-29 Brian Bergstrand + + * Makefile.in: Use $(BSDLIB_PIC_FLAG) to determine whether to use + -fpic or -fPIC + +2004-02-28 Theodore Ts'o + + * Release of E2fsprogs 1.35 + +2004-01-30 Theodore Ts'o + + * gen_uuid.c (uuid_generate_time): Fix bug pointed out by Ralf + S. Engelshall; when generating a random ethernet address + because one is not available, set the least significant + bit of the first byte of the MAC address, since it is the + first bit to be transmitted, and is therefore the + multicast bit. 
+ +2003-07-25 Theodore Ts'o + + * Release of E2fsprogs 1.34 + +2003-04-21 Theodore Ts'o + + * Release of E2fsprogs 1.33 + +2003-04-21 Theodore Ts'o + + * Makefile.in: Use DYLD_LIBRAY_PATH so that "make check" works on + Darwin systems when building with shared libraries. + +2003-04-12 Theodore Ts'o + + * gen_uuid.c: Add #ifdef checks around #include and + . + +2003-04-03 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Always xor in a stream of bytes + from the system PRNG (i.e., random/srandom, seeded from + the time, pid, and uid) in case /dev/random isn't doing + the right thing on a particular system. It doesn't hurt, + and it can help, in the case of a buggy /dev/random. + +2003-03-14 Theodore Ts'o + + * Makefile.in: Add support for Apple Darwin + +2003-03-06 Theodore Tso + + * uuid_types.h.in: Don't redefine types if other e2fsprogs + *_types.h files have been included already. + + * Makefile.in (tst_uuid): Link against the static library instead + of all of the object files, so that we automatically pick + up -lsocket under Solaris. + +2003-03-02 Theodore Ts'o + + * Makefile.in, uuidP.h, uuid_types.h.in: Use uuid_types.h instead + of ext2_types.h + +2002-11-09 Theodore Ts'o + + * Release of E2fsprogs 1.32 + +2002-11-08 Theodore Ts'o + + * Release of E2fsprogs 1.31 + +2002-10-31 Theodore Ts'o + + * Release of E2fsprogs 1.30 + +2002-10-31 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Don't spin forever if read() + returns EINTR or EAGAIN, so that when /dev/random is + opened O_NONBLOCK, we don't end up spinning forever. + +2001-09-24 Theodore Tso + + * Release of E2fsprogs 1.29 + +2001-08-31 Theodore Tso + + * Release of E2fsprogs 1.28 + +2002-07-15 Theodore Ts'o + + * parse.c (uuid_parse): Fix uuid parsing bug which didn't complain + for certain types of invalid input text. (Addresses + Debian bug #152891). + + * tst_uuid.c: Add test cases for invalid text strings passed to + uuid_parse. + +2002-03-08 Theodore Tso + + * Release of E2fsprogs 1.27 + +2002-02-24 Theodore Tso + + * Makefile.in (install): Install hard links to man pages for + uuid_generate_random and uuid_generate_time. Remove + any compressed man pages before installing the man pages. + +2002-02-03 Theodore Tso + + * Release of E2fsprogs 1.26 + +2001-09-20 Theodore Tso + + * Release of E2fsprogs 1.25 + +2001-09-10 Theodore Tso + + * compare.c (uuid_compare), copy.c (uuid_copy), + isnull.c (uuid_is_null), pack.c (uuid_pack), + parse.c (uuid_parse), unpack.c (uuid_unpack), + unparse.c (uuid_unparse), uuid.h, uuidP.h, + uuid_time.c (uuid_time, uuid_type, uuid_variant): + Use const for pointer variables that we don't modify. Add + the appropriate ifdef's in uuid.h to make it be C++ friendly. + +2001-09-02 Theodore Tso + + * Release of E2fsprogs 1.24a + +2001-08-30 Theodore Tso + + * Release of E2fsprogs 1.24 + +2001-08-15 Theodore Tso + + * Release of E2fsprogs 1.23 + +2001-06-23 Theodore Tso + + * Release of E2fsprogs 1.22 + +2001-06-21 Theodore Tso + + * uuid.h: Add protection against multiple inclusion + +2001-06-15 Theodore Tso + + * Release of E2fsprogs 1.21 + +2001-06-01 Theodore Tso + + * Makefile.in, uuidP.h: Move include/asm/types.h.in to + lib/ext2fs/ext2_types.h.in. + +2001-06-01 Theodore Tso + + * unpack.c, unparse.c, uuid_time.c: Update files to be under the + LGPL (that somehow were missed when libuuid was converted + to use the LGPL). Whoops. 
+ +2001-05-25 Theodore Tso + + * Release of E2fsprogs 1.20 + +2001-05-14 Theodore Tso + + * tst_uuid.c, uuid_time.c: Remove unneeded #include of ext2_fs.h + +2001-05-12 Theodore Tso + + * libuuid.3.in, uuid_clear.3.in, uuid_compare.3.in, uuid_copy.3.in, + uuid_generate.3.in, uuid_is_null.3.in, uuid_parse.3.in, + uuid_time.3.in, uuid_unparse.3.in: Update URL location of + e2fsprogs package. + +2001-05-01 Theodore Tso + + * parse.c, compare.c: Include string.h to fix gcc -Wall + complaints. + + * gen_uuid.c: Define _SVID_SOURCE to avoid gcc -Wall errors + because some required structures wouldn't be otherwise + defined. Fix a minor gcc -Wall nit in the declaration of + get_random_fd(). + +2001-01-12 Theodore Ts'o + + * uuid_time.c (main), tst_uuid.c (main): Fix gcc -Wall complaints. + + * uuid.h, copy.c (uuid_copy): Change arguments to make it + clear which argument is the source and which is the + destination. + + * gen_uuid.c (get_random_fd): Use gettimeofday to seed the PRNG, + so we can take advantage of tv_usec to do (slightly) + better at seeding it. + +2000-07-13 + + * Release of E2fsprogs 1.19 + +2000-07-07 Theodore Ts'o + + * Makefile.in (uuid_time): Fix compilation rule so that + uuid_time.o doesn't get bashed in order to build the + command-line version of uuid_time. + +2000-07-04 Theodore Ts'o + + * Makefile.in: Remove explicit link of -lc in the shared library. + (It shouldn't be necessary, and is harmful in some cases). + +2000-06-12 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Use O_NONBLOCK when trying to + open /dev/random. Break out the /dev/random + initialization code into a get_random_fd() function, and + use that function in uuid_generate() to determine whether + to use uuid_generate_random() or uuid_generate_time(). + +2000-05-25 + + * Makefile: Add hack dependency rule so that parallel makes work + correctly. + +2000-04-07 Theodore Ts'o + + * clear.c, compare.c, copy.c, gen_uuid.c, isnull.c, pack.c, + parse.c, uuid.h, uuidP.h: Changed copyright to be the + LGPL. + +Thu Apr 6 17:38:58 2000 Theodore Y. Ts'o + + * Makefile.in (uuid_time): Compile uuid_time in two steps (first + create .o, then link it against the libraries) to work + around bug in a.out linker. + + * dll/jump.funcs, dll/jump.import, dll/jump.params: Update a.out + shared library control files to reflect new added files. + +2000-04-03 Theodore Ts'o + + * gen_uuid.c (get_clock): Fix bug where the last timeval wasn't + getting set, causing potentially duplicate UUID's to be + generated. + +2000-03-12 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Make more paranoid about + misbehaving /dev/urandom. If we get a return of zero + without an error more than 8 times in a row, we break out + and return an error. Also, if /dev/urandom doesn't exist, + try /dev/random. + +2000-01-18 Theodore Ts'o + + * Makefile.in: Since LIBUUID can sometimes include + "-lsocket" we need a separate DEPLIBUUID that can be used + in Makefile's dependency rules. + +1999-11-19 + + * Makefile.in (distclean): Remove TAGS and Makefile.in.old from + the source directory. + +1999-11-10 + + * Release of E2fsprogs 1.18 + +1999-10-26 + + * Release of E2fsprogs 1.17 + +1999-10-26 + + * uuid_time.c (variant_string): Declare to be static to avoid gcc + warnings. + + * uuid.h: Add function prototypes for uuid_generate_random() and + uuid_generate_time(). + +1999-10-25 + + * gen_uuid_nt.c (uuid_generate): W2K strikes again! An + incompatible interface change means we need to detect + whether the code is running on an NT4 or NT5 system. 
+ +1999-10-22 + + * Release of E2fsprogs 1.16 + +1999-10-21 + + * uuid_generate.8.in: Update man page to use a more standard + format (bold option flags and italicized variables), as + suggested by Andreas Dilger (adilger@enel.ucalgary.ca) + +1999-09-24 + + * gen_uuid_nt.c: New file which creates a UUID under Windows NT. + +1999-07-18 Theodore Ts'o + + * Release of E2fsprogs 1.15 + +1999-05-17 + + * gen_uuid.c (get_random_bytes): Use a while loop when reading + from /dev/urandom so that if we get interrupted while + reading the right thing happens. + (uuid_generate_random): Add new function which uses the + new UUID format which uses 122 random bits to form the + 128-bit UUID. + (uuid_generate): Rename the old uuid_generate to be + uuid_generate_time, and create a new uuid_generate + function which calls either uuid_generate_random or + uuid_genereate_time depending on whether /dev/urandom is + present. + + * uuid_generate.3.in: Update to reflect changesin uuid_generate + and its two new variants. + + * tst_uuid.c: Updated to test new uuid_generate functions, and to + reflect new semantics of uuid_compare. Added tests to + make sure the UUID type and variant created by UUID + generate is correct. + + * uuid_time.c (uuid_variant, uuid_type): Added new functions to + return the UUID variant and type information. The + debugging program now prints the UUID variant and type, + and warns if the unparsed time information is likely to be + incorrect. + + * uuid_parse.3.in, libuuid.3.in: Miscellaneous text cleanups. + +1999-05-03 + + * compare.c (uuid_compare): Change sense of uuid_compare so that + its return values match that of memcpy and the + uuid_compare() found in Paul Leach's internet-draft. + +1999-03-11 Andreas Dilger + + * Created man pages for libuuid functions. + +1999-01-09 Theodore Ts'o + + * Release of E2fsprogs 1.14 + +1998-12-15 Theodore Ts'o + + * Release of E2fsprogs 1.13 + +1998-12-04 Theodore Ts'o + + * Makefile.in: Update version numbers of the UUID shared library, + since we've added a new function (uuid_time()). + + * uuid_time.c: New file which returns the time field of a UUID. + (Good for debugging purposes) + +1998-07-09 Theodore Ts'o + + * Release of E2fsprogs 1.12 + +1998-06-25 Theodore Ts'o + + * tst_uuid.c (main): Fixed bogus declaration of the main's argv + parameter. + +1998-04-26 Theodore Ts'o + + * uuidP.h: Use asm/types.h instead of linux/types.h to avoid a + problem caused by glibc hack to prevent linux/types.h from + being included. + +1998-03-30 Theodore Ts'o + + * Makefile.in: Change to use new installation directory variables + convention. Fix uninstall rules to take $(DESTDIR) into + account. + +Sun Mar 8 22:17:59 1998 Theodore Ts'o + + * gen_uuid.c (get_node_id): Use char * instead of caddr_t, which + doesn't always exist for glibc. + +Tue Oct 14 21:48:16 1997 Theodore Ts'o + + * gen_uuid.c: Use clock_reg instead of clock, since clock + conflicts with a header file declaration. + +Tue Jun 17 01:33:20 1997 Theodore Ts'o + + * Release of E2fsprogs 1.11 + +Thu Apr 24 12:16:42 1997 Theodre Ts'o + + * Release of E2fsprogs version 1.10 + +Thu Apr 17 12:23:38 1997 Theodore Ts'o + + * Release of E2fsprogs version 1.09 + +Fri Apr 11 18:56:26 1997 Theodore Ts'o + + * Release of E2fsprogs version 1.08 + +Wed Mar 12 13:32:05 1997 Theodore Y. 
Ts'o
+
+	* Release of E2fsprogs version 1.07
+
+Sun Mar 2 16:45:36 1997 Theodore Ts'o
+
+	* Makefile.in (ELF_VERSION): Change version to be 1.1
+
+Thu Feb 6 23:08:07 1997 Theodore Ts'o
+
+	* gen_uuid.c (uuid_generate): Set Multicast bit when picking a
+		random node_id, to prevent conflicts with IEEE 802
+		addresses obtained from network cards.
+
+Wed Jan 1 23:51:09 1997 Theodore Ts'o
+
+	* unpack.c, pack.c: Include string.h, since we use memcpy().
+
+Tue Dec 3 13:05:11 1996 Theodore Ts'o
+
+	* parse.c: Add #include of ctype.h and stdlib.h, to pull in the
+		required prototypes.
+
+Fri Oct 11 17:15:10 1996 Theodore Ts'o
+
+	* Makefile.in (DLL_ADDRESS): Updated DLL address for libuuid.
+
+Tue Oct 8 02:02:03 1996 Theodore Ts'o
+
+	* Release of E2fsprogs version 1.06
+
+Thu Sep 12 15:23:07 1996 Theodore Ts'o
+
+	* Release of E2fsprogs version 1.05
+
+Tue Aug 27 16:50:43 1996 Miles Bader
+
+	* uuid/gen_uuid.c [HAVE_NET_IF_H]: Include guarded.
+	  [HAVE_NETINET_IN_H]: Include guarded.
+	  (get_node_id): Surround bulk of function with #ifdef HAVE_NET_IF_H.
+
+Tue Aug 27 16:50:16 1996 Theodore Ts'o
+
+	* gen_uuid.c (get_node_id): Add a specific ifdef for the HURD,
+		since it is broken w.r.t. getting hardware addresses.
diff --git a/libuuid/Makefile b/libuuid/Makefile
new file mode 100644
index 0000000..ebe3643
--- /dev/null
+++ b/libuuid/Makefile
@@ -0,0 +1,45 @@
+# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved.
+# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#   - Redistributions of source code must retain the above
+#     copyright notice, this list of conditions and the following
+#     disclaimer.
+#
+#   - Redistributions in binary form must reproduce the above
+#     copyright notice, this list of conditions and the following
+#     disclaimer in the documentation and/or other materials
+#     provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+include $(top_srcdir)/buildflags.mak
+CFLAGS += -DPSM_UUID=1 -Wno-unused-function
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/libuuid
+
+${TARGLIB}-objs := psm_uuid.o
+
+all: ${${TARGLIB}-objs}
+
+%.o: %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	rm -f *.o
diff --git a/libuuid/clear.c b/libuuid/clear.c
new file mode 100644
index 0000000..bb52682
--- /dev/null
+++ b/libuuid/clear.c
@@ -0,0 +1,44 @@
+/*
+ * clear.c -- Clear a UUID
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * %Begin-Header%
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "string.h" + +#include "uuidP.h" + +UUID_STATIC +void uuid_clear(uuid_t uu) +{ + memset(uu, 0, 16); +} + diff --git a/libuuid/compare.c b/libuuid/compare.c new file mode 100644 index 0000000..0a7dc9c --- /dev/null +++ b/libuuid/compare.c @@ -0,0 +1,56 @@ +/* + * compare.c --- compare whether or not two UUID's are the same + * + * Returns 0 if the two UUID's are different, and 1 if they are the same. + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" +#include + +#define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? 
-1 : 1); + +UUID_STATIC +int uuid_compare(const uuid_t uu1, const uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); + return memcmp(uuid1.node, uuid2.node, 6); +} + diff --git a/libuuid/copy.c b/libuuid/copy.c new file mode 100644 index 0000000..37b03b2 --- /dev/null +++ b/libuuid/copy.c @@ -0,0 +1,46 @@ +/* + * copy.c --- copy UUIDs + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" + +UUID_STATIC +void uuid_copy(uuid_t dst, const uuid_t src) +{ + unsigned char *cp1; + const unsigned char *cp2; + int i; + + for (i=0, cp1 = dst, cp2 = src; i < 16; i++) + *cp1++ = *cp2++; +} diff --git a/libuuid/gen_uuid.c b/libuuid/gen_uuid.c new file mode 100644 index 0000000..a946f79 --- /dev/null +++ b/libuuid/gen_uuid.c @@ -0,0 +1,322 @@ +/* + * gen_uuid.c --- generate a DCE-compatible uuid + * + * Copyright (C) 1996, 1997, 1998, 1999 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +/* + * Force inclusion of SVID stuff since we need it if we're compiling in + * gcc-wall wall mode + */ +#ifndef _SVID_SOURCE +# define _SVID_SOURCE +#endif + +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_STDLIB_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_IOCTL_H +#include +#endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_SYS_SOCKIO_H +#include +#endif +#ifdef HAVE_NET_IF_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_NET_IF_DL_H +#include +#endif + +#include "psm.h" + +#include "uuidP.h" + +#ifdef HAVE_SRANDOM +#define srand(x) srandom(x) +#define rand() random() +#endif + +static int get_random_fd(void) +{ + struct timeval tv; + static int fd = -2; + int i; + + if (fd == -2) { + gettimeofday(&tv, 0); + fd = open("/dev/urandom", O_RDONLY); + if (fd == -1) + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + srand((getpid() << 16) ^ getuid() ^ tv.tv_sec ^ tv.tv_usec); + } + /* Crank the random number generator a few times */ + gettimeofday(&tv, 0); + for (i = (tv.tv_sec ^ tv.tv_usec) & 0x1F; i > 0; i--) + rand(); + return fd; +} + +/* + * Generate a series of random bytes. Use /dev/urandom if possible, + * and if not, use srandom/random. + */ +static void get_random_bytes(void *buf, int nbytes) +{ + int i, n = nbytes, fd = get_random_fd(); + int lose_counter = 0; + unsigned char *cp = (unsigned char *) buf; + + if (fd >= 0) { + while (n > 0) { + i = read(fd, cp, n); + if (i <= 0) { + if (lose_counter++ > 16) + break; + continue; + } + n -= i; + cp += i; + lose_counter = 0; + } + } + + /* + * We do this all the time, but this is the only source of + * randomness if /dev/random/urandom is out to lunch. + */ + for (cp = buf, i = 0; i < nbytes; i++) + *cp++ ^= (rand() >> 7) & 0xFF; + return; +} + +/* + * Get the ethernet hardware address, if we can find it... + */ +static int get_node_id(unsigned char *node_id) +{ +#ifdef HAVE_NET_IF_H + int sd; + struct ifreq ifr, *ifrp; + struct ifconf ifc; + char buf[1024]; + int n, i; + unsigned char *a; +#ifdef HAVE_NET_IF_DL_H + struct sockaddr_dl *sdlp; +#endif + +/* + * BSD 4.4 defines the size of an ifreq to be + * max(sizeof(ifreq), sizeof(ifreq.ifr_name)+ifreq.ifr_addr.sa_len + * However, under earlier systems, sa_len isn't present, so the size is + * just sizeof(struct ifreq) + */ +#ifdef HAVE_SA_LEN +#ifndef max +#define max(a,b) ((a) > (b) ? 
(a) : (b)) +#endif +#define ifreq_size(i) max(sizeof(struct ifreq),\ + sizeof((i).ifr_name)+(i).ifr_addr.sa_len) +#else +#define ifreq_size(i) sizeof(struct ifreq) +#endif /* HAVE_SA_LEN*/ + + sd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sd < 0) { + return -1; + } + memset(buf, 0, sizeof(buf)); + ifc.ifc_len = sizeof(buf); + ifc.ifc_buf = buf; + if (ioctl (sd, SIOCGIFCONF, (char *)&ifc) < 0) { + close(sd); + return -1; + } + n = ifc.ifc_len; + for (i = 0; i < n; i+= ifreq_size(*ifrp) ) { + ifrp = (struct ifreq *)((char *) ifc.ifc_buf+i); + strncpy(ifr.ifr_name, ifrp->ifr_name, IFNAMSIZ); +#ifdef SIOCGIFHWADDR + if (ioctl(sd, SIOCGIFHWADDR, &ifr) < 0) + continue; + a = (unsigned char *) &ifr.ifr_hwaddr.sa_data; +#else +#ifdef SIOCGENADDR + if (ioctl(sd, SIOCGENADDR, &ifr) < 0) + continue; + a = (unsigned char *) ifr.ifr_enaddr; +#else +#ifdef HAVE_NET_IF_DL_H + sdlp = (struct sockaddr_dl *) &ifrp->ifr_addr; + if ((sdlp->sdl_family != AF_LINK) || (sdlp->sdl_alen != 6)) + continue; + a = (unsigned char *) &sdlp->sdl_data[sdlp->sdl_nlen]; +#else + /* + * XXX we don't have a way of getting the hardware + * address + */ + close(sd); + return 0; +#endif /* HAVE_NET_IF_DL_H */ +#endif /* SIOCGENADDR */ +#endif /* SIOCGIFHWADDR */ + if (!a[0] && !a[1] && !a[2] && !a[3] && !a[4] && !a[5]) + continue; + if (node_id) { + memcpy(node_id, a, 6); + close(sd); + return 1; + } + } + close(sd); +#endif + return 0; +} + +/* Assume that the gettimeofday() has microsecond granularity */ +#define MAX_ADJUSTMENT 10 + +static int get_clock(uint32_t *clock_high, uint32_t *clock_low, uint16_t *ret_clock_seq) +{ + static int adjustment = 0; + static struct timeval last = {0, 0}; + static uint16_t clock_seq; + struct timeval tv; + unsigned long long clock_reg; + +try_again: + gettimeofday(&tv, 0); + if ((last.tv_sec == 0) && (last.tv_usec == 0)) { + get_random_bytes(&clock_seq, sizeof(clock_seq)); + clock_seq &= 0x3FFF; + last = tv; + last.tv_sec--; + } + if ((tv.tv_sec < last.tv_sec) || + ((tv.tv_sec == last.tv_sec) && + (tv.tv_usec < last.tv_usec))) { + clock_seq = (clock_seq+1) & 0x3FFF; + adjustment = 0; + last = tv; + } else if ((tv.tv_sec == last.tv_sec) && + (tv.tv_usec == last.tv_usec)) { + if (adjustment >= MAX_ADJUSTMENT) + goto try_again; + adjustment++; + } else { + adjustment = 0; + last = tv; + } + + clock_reg = tv.tv_usec*10 + adjustment; + clock_reg += ((unsigned long long) tv.tv_sec)*10000000; + clock_reg += (((unsigned long long) 0x01B21DD2) << 32) + 0x13814000; + + *clock_high = clock_reg >> 32; + *clock_low = clock_reg; + *ret_clock_seq = clock_seq; + return 0; +} + +UUID_STATIC +void uuid_generate_time(uuid_t out) +{ + static unsigned char node_id[6]; + static int has_init = 0; + struct uuid uu; + uint32_t clock_mid; + + if (!has_init) { + if (get_node_id(node_id) <= 0) { + get_random_bytes(node_id, 6); + /* + * Set multicast bit, to prevent conflicts + * with IEEE 802 addresses obtained from + * network cards + */ + node_id[0] |= 0x01; + } + has_init = 1; + } + get_clock(&clock_mid, &uu.time_low, &uu.clock_seq); + uu.clock_seq |= 0x8000; + uu.time_mid = (uint16_t) clock_mid; + uu.time_hi_and_version = ((clock_mid >> 16) & 0x0FFF) | 0x1000; + memcpy(uu.node, node_id, 6); + uuid_pack(&uu, out); +} + +UUID_STATIC +void uuid_generate_random(uuid_t out) +{ + uuid_t buf; + struct uuid uu; + + get_random_bytes(buf, sizeof(buf)); + uuid_unpack(buf, &uu); + + uu.clock_seq = (uu.clock_seq & 0x3FFF) | 0x8000; + uu.time_hi_and_version = (uu.time_hi_and_version & 0x0FFF) | 0x4000; + uuid_pack(&uu, 
out); +} + +/* + * This is the generic front-end to uuid_generate_random and + * uuid_generate_time. It uses uuid_generate_random only if + * /dev/urandom is available, since otherwise we won't have + * high-quality randomness. + */ +UUID_STATIC +void uuid_generate(uuid_t out) +{ + if (get_random_fd() >= 0) + uuid_generate_random(out); + else + uuid_generate_time(out); +} diff --git a/libuuid/isnull.c b/libuuid/isnull.c new file mode 100644 index 0000000..fb7fa3d --- /dev/null +++ b/libuuid/isnull.c @@ -0,0 +1,49 @@ +/* + * isnull.c --- Check whether or not the UUID is null + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" + +/* Returns 1 if the uuid is the NULL uuid */ +UUID_STATIC +int uuid_is_null(const uuid_t uu) +{ + const unsigned char *cp; + int i; + + for (i=0, cp = uu; i < 16; i++) + if (*cp++) + return 0; + return 1; +} + diff --git a/libuuid/pack.c b/libuuid/pack.c new file mode 100644 index 0000000..51c47ee --- /dev/null +++ b/libuuid/pack.c @@ -0,0 +1,70 @@ +/* + * Internal routine for packing UUID's + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include "uuidP.h" + +UUID_STATIC +void uuid_pack(const struct uuid *uu, uuid_t ptr) +{ + uint32_t tmp; + unsigned char *out = ptr; + + tmp = uu->time_low; + out[3] = (unsigned char) tmp; + tmp >>= 8; + out[2] = (unsigned char) tmp; + tmp >>= 8; + out[1] = (unsigned char) tmp; + tmp >>= 8; + out[0] = (unsigned char) tmp; + + tmp = uu->time_mid; + out[5] = (unsigned char) tmp; + tmp >>= 8; + out[4] = (unsigned char) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (unsigned char) tmp; + tmp >>= 8; + out[6] = (unsigned char) tmp; + + tmp = uu->clock_seq; + out[9] = (unsigned char) tmp; + tmp >>= 8; + out[8] = (unsigned char) tmp; + + memcpy(out+10, uu->node, 6); +} + diff --git a/libuuid/parse.c b/libuuid/parse.c new file mode 100644 index 0000000..0773447 --- /dev/null +++ b/libuuid/parse.c @@ -0,0 +1,80 @@ +/* + * parse.c --- UUID parsing + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
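[Editorial example. uuid_pack() above serializes each multi-byte field most-significant byte first (network order), so the packed 16 bytes are identical on little- and big-endian hosts. A self-contained sketch of the same big-endian packing for one 32-bit field; pack_be32 and the test value are illustrative, not part of this tree:

#include <assert.h>
#include <stdint.h>

/* Big-endian packing of a 32-bit field, as uuid_pack() does for time_low. */
static void pack_be32(uint32_t v, unsigned char out[4])
{
    out[0] = (unsigned char)(v >> 24);
    out[1] = (unsigned char)(v >> 16);
    out[2] = (unsigned char)(v >> 8);
    out[3] = (unsigned char)v;
}

int main(void)
{
    unsigned char b[4];

    pack_be32(0x12345678, b);
    assert(b[0] == 0x12 && b[1] == 0x34 && b[2] == 0x56 && b[3] == 0x78);
    return 0;
}
]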
+ * %End-Header% + */ + +#include +#include +#include +#include + +#include "uuidP.h" + +UUID_STATIC +int uuid_parse(const char *in, uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + for (i=0, cp = in; i <= 36; i++,cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i== 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + for (i=0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} diff --git a/libuuid/psm_uuid.c b/libuuid/psm_uuid.c new file mode 100644 index 0000000..fcfa94c --- /dev/null +++ b/libuuid/psm_uuid.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
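[Editorial example. uuid_parse() above accepts only the canonical 36-character 8-4-4-4-12 form: hyphens must sit exactly at offsets 8, 13, 18 and 23, and every other position must be a hex digit. A short driver showing both outcomes (the UUID strings are arbitrary sample values):

#include <stdio.h>
#include "uuid.h"

int main(void)
{
    uuid_t uu;

    /* Canonical form parses successfully... */
    if (uuid_parse("84949cc5-4701-4a84-895b-354c584a981b", uu) == 0)
        printf("parsed ok\n");

    /* ...a misplaced hyphen (offset 9 instead of 8) is rejected. */
    if (uuid_parse("84949cc54-701-4a84-895b-354c584a981b", uu) != 0)
        printf("rejected as expected\n");
    return 0;
}
]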
+ */ + +#if !defined(PSM_USE_SYS_UUID) + +#define STDC_HEADERS 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_UNISTD_H 1 +#define ENABLE_HTREE 1 +#define ENABLE_SWAPFS 1 +#define HAVE_LONG_LONG 1 +#define HAVE_LONG_DOUBLE 1 +#define HAVE_WCHAR_T 1 +#define HAVE_WINT_T 1 +#define HAVE_INTTYPES_H_WITH_UINTMAX 1 +#define HAVE_STDINT_H_WITH_UINTMAX 1 +#define HAVE_INTMAX_T 1 +#define HAVE_POSIX_PRINTF 1 +#define HAVE_ALLOCA_H 1 +#define HAVE_ALLOCA 1 +#define HAVE_STDLIB_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_GETPAGESIZE 1 +#define HAVE_MMAP 1 +#define INTDIV0_RAISES_SIGFPE 1 +#define HAVE_UNSIGNED_LONG_LONG 1 +#define HAVE_UINTMAX_T 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_ARGZ_H 1 +#define HAVE_LIMITS_H 1 +#define HAVE_LOCALE_H 1 +#define HAVE_NL_TYPES_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_STDDEF_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_SYS_PARAM_H 1 +#define HAVE_ASPRINTF 1 +#define HAVE_FWPRINTF 1 +#define HAVE_GETCWD 1 +#define HAVE_GETEGID 1 +#define HAVE_GETEUID 1 +#define HAVE_GETGID 1 +#define HAVE_GETUID 1 +#define HAVE_MEMPCPY 1 +#define HAVE_MUNMAP 1 +#define HAVE_PUTENV 1 +#define HAVE_SETENV 1 +#define HAVE_SETLOCALE 1 +#define HAVE_SNPRINTF 1 +#define HAVE_STPCPY 1 +#define HAVE_STRCASECMP 1 +#define HAVE_STRDUP 1 +#define HAVE_STRTOUL 1 +#define HAVE_TSEARCH 1 +#define HAVE_WCSLEN 1 +#define HAVE___ARGZ_COUNT 1 +#define HAVE___ARGZ_STRINGIFY 1 +#define HAVE___ARGZ_NEXT 1 +#define HAVE___FSETLOCKING 1 +#define HAVE_DECL__SNPRINTF 0 +#define HAVE_DECL__SNWPRINTF 0 +#define HAVE_DECL_FEOF_UNLOCKED 1 +#define HAVE_DECL_FGETS_UNLOCKED 0 +#define HAVE_DECL_GETC_UNLOCKED 1 +#define HAVE_ICONV 1 +#define ICONV_CONST +#define HAVE_LANGINFO_CODESET 1 +#define HAVE_LC_MESSAGES 1 +#define ENABLE_NLS 1 +#define HAVE_GETTEXT 1 +#define HAVE_DCGETTEXT 1 +#define HAVE_STDLIB_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_STDARG_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_ERRNO_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_MNTENT_H 1 +#define HAVE_PATHS_H 1 +#define HAVE_DIRENT_H 1 +#define HAVE_GETOPT_H 1 +#define HAVE_SETJMP_H 1 +#define HAVE_SIGNAL_H 1 +#define HAVE_TERMIOS_H 1 +#define HAVE_LINUX_FD_H 1 +#define HAVE_LINUX_MAJOR_H 1 +#define HAVE_SYS_IOCTL_H 1 +#define HAVE_SYS_PRCTL_H 1 +#define HAVE_SYS_QUEUE_H 1 +#define HAVE_SYS_SOCKET_H 1 +#define HAVE_SYS_SYSMACROS_H 1 +#define HAVE_SYS_TIME_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_WAIT_H 1 +#define HAVE_SYS_RESOURCE_H 1 +#define HAVE_NETINET_IN_H 1 +#define HAVE_SYS_MOUNT_H 1 +#define HAVE_NET_IF_H 1 +#define HAVE_VPRINTF 1 +#define HAVE_RECLEN_DIRENT 1 +#define HAVE_TYPE_SSIZE_T 1 +#define HAVE_LSEEK64_PROTOTYPE 1 +#define SIZEOF_SHORT 2 +#define SIZEOF_INT 4 +#define SIZEOF_LONG 8 +#define SIZEOF_LONG_LONG 8 +#define HAVE_INTTYPES_H 1 +#define HAVE_INTPTR_T 1 +#define HAVE_GETRUSAGE 1 +#define HAVE_LLSEEK 1 +#define HAVE_LSEEK64 1 +#define HAVE_OPEN64 1 +#define HAVE_STRTOULL 1 +#define HAVE_STRCASECMP 1 +#define HAVE_SRANDOM 1 +#define HAVE_FCHOWN 1 +#define HAVE_MALLINFO 1 +#define HAVE_FDATASYNC 1 +#define HAVE_STRNLEN 1 +#define HAVE_STRPTIME 1 +#define HAVE_SYSCONF 1 +#define HAVE_PATHCONF 1 +#define HAVE_POSIX_MEMALIGN 1 +#define HAVE_MEMALIGN 1 +#define HAVE_VALLOC 1 +#define HAVE___SECURE_GETENV 1 +#define 
HAVE_PRCTL 1 +#define HAVE_DLOPEN 1 +#define HAVE_EXT2_IOCTLS 1 + +#include "pack.c" +#include "unpack.c" +#include "clear.c" +#include "compare.c" +#include "copy.c" +#include "gen_uuid.c" +#include "isnull.c" +#include "parse.c" +#include "unparse.c" +#include "psm_help.h" + +#else /* PSM_USE_SYS_UUID */ +#include +#include "psm_user.h" +#endif + +void +__psm_uuid_generate(psm_uuid_t uuid_out) +{ + uuid_generate(uuid_out); + return; +} +PSMI_API_DECL(psm_uuid_generate) + +int +psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB) +{ + return uuid_compare(uuA, uuB); +} + +void +psmi_uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_lower(uu, out); +} + +int +psmi_uuid_parse(const char *in, uuid_t uu) +{ + return uuid_parse(in, uu); +} + diff --git a/libuuid/psm_uuid.h b/libuuid/psm_uuid.h new file mode 100644 index 0000000..5c2011f --- /dev/null +++ b/libuuid/psm_uuid.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSM_UUID_H +#define _PSM_UUID_H +int psmi_uuid_parse(const char *in, psm_uuid_t uu); +void psmi_uuid_unparse(const psm_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB); +#endif diff --git a/libuuid/tst_uuid.c b/libuuid/tst_uuid.c new file mode 100644 index 0000000..47ff06c --- /dev/null +++ b/libuuid/tst_uuid.c @@ -0,0 +1,168 @@ +/* + * tst_uuid.c --- test program from the UUID library + * + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include + +#include "uuid.h" + +static int test_uuid(const char * uuid, int isValid) +{ + static const char * validStr[2] = {"invalid", "valid"}; + uuid_t uuidBits; + int parsedOk; + + parsedOk = uuid_parse(uuid, uuidBits) == 0; + + printf("%s is %s", uuid, validStr[isValid]); + if (parsedOk != isValid) { + printf(" but uuid_parse says %s\n", validStr[parsedOk]); + return 1; + } + printf(", OK\n"); + return 0; +} + +int +main(int argc, char **argv) +{ + uuid_t buf, tst; + char str[100]; + struct timeval tv; + time_t time_reg; + unsigned char *cp; + int i; + int failed = 0; + int type, variant; + + uuid_generate(buf); + uuid_unparse(buf, str); + printf("UUID generate = %s\n", str); + printf("UUID: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + printf("\n"); + + uuid_generate_random(buf); + uuid_unparse(buf, str); + printf("UUID random string = %s\n", str); + printf("UUID: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + if (type != 4) { + printf("Incorrect UUID type; was expecting " + "4 (random type)!\n"); + failed++; + } + printf("\n"); + + uuid_generate_time(buf); + uuid_unparse(buf, str); + printf("UUID string = %s\n", str); + printf("UUID time: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + if (type != 1) { + printf("Incorrect UUID type; was expecting " + "1 (time-based type)!\\n"); + failed++; + } + tv.tv_sec = 0; + tv.tv_usec = 0; + time_reg = uuid_time(buf, &tv); + printf("UUID time is: (%ld, %ld): %s\n", tv.tv_sec, tv.tv_usec, + ctime(&time_reg)); + uuid_parse(str, tst); + if (!uuid_compare(buf, tst)) + printf("UUID parse and compare succeeded.\n"); + else { + printf("UUID parse and compare failed!\n"); + failed++; + } + uuid_clear(tst); + if (uuid_is_null(tst)) + printf("UUID clear and is null 
succeeded.\n"); + else { + printf("UUID clear and is null failed!\n"); + failed++; + } + uuid_copy(buf, tst); + if (!uuid_compare(buf, tst)) + printf("UUID copy and compare succeeded.\n"); + else { + printf("UUID copy and compare failed!\n"); + failed++; + } + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981b", 1); + failed += test_uuid("84949CC5-4701-4A84-895B-354C584A981B", 1); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981bc", 0); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981", 0); + failed += test_uuid("84949cc5x4701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc504701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-470104a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a840895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a84-895b0354c584a981b", 0); + failed += test_uuid("g4949cc5-4701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981g", 0); + + if (failed) { + printf("%d failures.\n", failed); + exit(1); + } + return 0; +} diff --git a/libuuid/unpack.c b/libuuid/unpack.c new file mode 100644 index 0000000..a05d664 --- /dev/null +++ b/libuuid/unpack.c @@ -0,0 +1,64 @@ +/* + * Internal routine for unpacking UUID + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * %End-Header% + */ + +#include +#include "uuidP.h" + +UUID_STATIC +void uuid_unpack(const uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + diff --git a/libuuid/unparse.c b/libuuid/unparse.c new file mode 100644 index 0000000..0857f50 --- /dev/null +++ b/libuuid/unparse.c @@ -0,0 +1,79 @@ +/* + * unparse.c -- convert a UUID to string + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include + +#include "uuidP.h" + +static const char *fmt_lower = + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"; + +static const char *fmt_upper = + "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X"; + +#ifdef UUID_UNPARSE_DEFAULT_UPPER +#define FMT_DEFAULT fmt_upper +#else +#define FMT_DEFAULT fmt_lower +#endif + +static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + sprintf(out, fmt, + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} + +UUID_STATIC +void uuid_unparse_lower(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_lower); +} + +UUID_STATIC +void uuid_unparse_upper(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_upper); +} + +UUID_STATIC +void uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, FMT_DEFAULT); +} diff --git a/libuuid/uuid.h b/libuuid/uuid.h new file mode 100644 index 0000000..54a9e96 --- /dev/null +++ b/libuuid/uuid.h @@ -0,0 +1,108 @@ + +/* + * Public include file for the UUID library + * + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. 
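[Editorial note. uuid_unparse_x() above writes through sprintf() with a fixed-width format, so every unparse variant emits exactly 36 characters plus the terminating NUL; callers must always supply at least 37 bytes. A defensive wrapper one might layer on top (uuid_to_buf and UUID_STR_LEN are hypothetical, not part of this tree):

#include <string.h>
#include "uuid.h"

#define UUID_STR_LEN 37  /* 8-4-4-4-12 digits, 4 hyphens, 1 NUL */

/* Copy the unparsed form into a caller-sized buffer, truncating safely. */
static void uuid_to_buf(const uuid_t uu, char *dst, size_t dstlen)
{
    char tmp[UUID_STR_LEN];

    uuid_unparse_lower(uu, tmp);
    strncpy(dst, tmp, dstlen);
    if (dstlen > 0)
        dst[dstlen - 1] = '\0';
}
]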
+ * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#ifndef _UUID_UUID_H +#define _UUID_UUID_H + +#include +#include +#include + +typedef unsigned char uuid_t[16]; + +/* UUID Variant definitions */ +#define UUID_VARIANT_NCS 0 +#define UUID_VARIANT_DCE 1 +#define UUID_VARIANT_MICROSOFT 2 +#define UUID_VARIANT_OTHER 3 + +/* UUID Type definitions */ +#define UUID_TYPE_DCE_TIME 1 +#define UUID_TYPE_DCE_RANDOM 4 + +/* Allow UUID constants to be defined */ +#ifdef __GNUC__ +#define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const uuid_t name __attribute__ ((unused)) = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#else +#define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const uuid_t name = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#endif + +#ifdef PSM_UUID +#define UUID_STATIC static +#else +#define UUID_STATIC +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* gen_uuid.c */ +UUID_STATIC void uuid_generate(uuid_t out); +UUID_STATIC void uuid_generate_random(uuid_t out); +UUID_STATIC void uuid_generate_time(uuid_t out); + +/* clear.c */ +UUID_STATIC void uuid_clear(uuid_t uu); + +/* compare.c */ +UUID_STATIC int uuid_compare(const uuid_t uu1, const uuid_t uu2); + +/* copy.c */ +UUID_STATIC void uuid_copy(uuid_t dst, const uuid_t src); + +/* isnull.c */ +UUID_STATIC int uuid_is_null(const uuid_t uu); + +/* parse.c */ +UUID_STATIC int uuid_parse(const char *in, uuid_t uu); + +/* unparse.c */ +UUID_STATIC void uuid_unparse(const uuid_t uu, char *out); +UUID_STATIC void uuid_unparse_lower(const uuid_t uu, char *out); +UUID_STATIC void uuid_unparse_upper(const uuid_t uu, char *out); + +/* uuid_time.c */ +UUID_STATIC time_t uuid_time(const uuid_t uu, struct timeval *ret_tv); +UUID_STATIC int uuid_type(const uuid_t uu); +UUID_STATIC int uuid_variant(const uuid_t uu); + +#ifdef __cplusplus +} +#endif + +#endif /* _UUID_UUID_H */ diff --git a/libuuid/uuidP.h b/libuuid/uuidP.h new file mode 100644 index 0000000..fa7e91b --- /dev/null +++ b/libuuid/uuidP.h 
@@ -0,0 +1,77 @@ +/* + * uuid.h -- private header file for uuids + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#ifndef _UUID_UUIDP_H +#define _UUID_UUIDP_H + +#ifndef UUID_STATIC +# ifdef PSM_UUID +# define UUID_STATIC static +# else +# define UUID_STATIC +# endif +#endif + +#ifdef HAVE_INTTYPES_H +#include +#else +#include +#endif +#include + +#include "uuid.h" + +/* + * Offset between 15-Oct-1582 and 1-Jan-70 + */ +#define TIME_OFFSET_HIGH 0x01B21DD2 +#define TIME_OFFSET_LOW 0x13814000 + +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +/* + * prototypes + */ +UUID_STATIC +void uuid_pack(const struct uuid *uu, uuid_t ptr); +UUID_STATIC +void uuid_unpack(const uuid_t in, struct uuid *uu); + +#endif /* _UUID_UUIDP_H */ diff --git a/libuuid/uuid_time.c b/libuuid/uuid_time.c new file mode 100644 index 0000000..d5f992b --- /dev/null +++ b/libuuid/uuid_time.c @@ -0,0 +1,161 @@ +/* + * uuid_time.c --- Interpret the time field from a uuid. This program + * violates the UUID abstraction barrier by reaching into the guts + * of a UUID and interpreting it. + * + * Copyright (C) 1998, 1999 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
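[Editorial note. TIME_OFFSET_HIGH and TIME_OFFSET_LOW in uuidP.h above encode the count of 100-nanosecond intervals between the UUID epoch (15 Oct 1582, the Gregorian calendar reform) and the Unix epoch (1 Jan 1970): (0x01B21DD2 << 32) + 0x13814000 = 122192928000000000, i.e. 12219292800 seconds, exactly 141427 days. The uuid_time() implementation that follows subtracts this constant to recover a Unix timestamp. The arithmetic can be checked in isolation:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* 100 ns intervals between 1582-10-15 and 1970-01-01 */
    unsigned long long off = ((unsigned long long)0x01B21DD2 << 32)
                           | 0x13814000ULL;

    assert(off == 122192928000000000ULL);
    printf("%llu days\n", off / 10000000ULL / 86400ULL);  /* prints 141427 */
    return 0;
}
]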
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include +#include +#include +#include +#include + +#include "uuidP.h" + +time_t uuid_time(const uuid_t uu, struct timeval *ret_tv) +{ + struct uuid uuid; + uint32_t high; + struct timeval tv; + unsigned long long clock_reg; + + uuid_unpack(uu, &uuid); + + high = uuid.time_mid | ((uuid.time_hi_and_version & 0xFFF) << 16); + clock_reg = uuid.time_low | ((unsigned long long) high << 32); + + clock_reg -= (((unsigned long long) 0x01B21DD2) << 32) + 0x13814000; + tv.tv_sec = clock_reg / 10000000; + tv.tv_usec = (clock_reg % 10000000) / 10; + + if (ret_tv) + *ret_tv = tv; + + return tv.tv_sec; +} + +int uuid_type(const uuid_t uu) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + return ((uuid.time_hi_and_version >> 12) & 0xF); +} + +int uuid_variant(const uuid_t uu) +{ + struct uuid uuid; + int var; + + uuid_unpack(uu, &uuid); + var = uuid.clock_seq; + + if ((var & 0x8000) == 0) + return UUID_VARIANT_NCS; + if ((var & 0x4000) == 0) + return UUID_VARIANT_DCE; + if ((var & 0x2000) == 0) + return UUID_VARIANT_MICROSOFT; + return UUID_VARIANT_OTHER; +} + +#ifdef DEBUG +static const char *variant_string(int variant) +{ + switch (variant) { + case UUID_VARIANT_NCS: + return "NCS"; + case UUID_VARIANT_DCE: + return "DCE"; + case UUID_VARIANT_MICROSOFT: + return "Microsoft"; + default: + return "Other"; + } +} + + +int +main(int argc, char **argv) +{ + uuid_t buf; + time_t time_reg; + struct timeval tv; + int type, variant; + + if (argc != 2) { + fprintf(stderr, "Usage: %s uuid\n", argv[0]); + exit(1); + } + if (uuid_parse(argv[1], buf)) { + fprintf(stderr, "Invalid UUID: %s\n", argv[1]); + exit(1); + } + variant = uuid_variant(buf); + type = uuid_type(buf); + time_reg = uuid_time(buf, &tv); + + printf("UUID variant is %d (%s)\n", variant, variant_string(variant)); + if (variant != UUID_VARIANT_DCE) { + printf("Warning: This program only knows how to interpret " + "DCE UUIDs.\n\tThe rest of the output is likely " + "to be incorrect!!\n"); + } + printf("UUID type is %d", type); + switch (type) { + case 1: + printf(" (time based)\n"); + break; + case 2: + printf(" (DCE)\n"); + break; + case 3: + printf(" (name-based)\n"); + break; + case 4: + printf(" (random)\n"); + break; + default: + printf("\n"); + } + if (type != 1) { + printf("Warning: not a time-based UUID, so UUID time " + "decoding will likely not work!\n"); + } + printf("UUID time is: (%ld, %ld): %s\n", tv.tv_sec, tv.tv_usec, + ctime(&time_reg)); + + return 0; +} +#endif diff --git a/mic-psm-card-devel.srclist.in b/mic-psm-card-devel.srclist.in new file mode 100644 index 0000000..7d6fd6c --- /dev/null +++ b/mic-psm-card-devel.srclist.in @@ -0,0 +1,2 @@ +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/mic-psm-card.srclist.in 
b/mic-psm-card.srclist.in new file mode 100644 index 0000000..beea15e --- /dev/null +++ b/mic-psm-card.srclist.in @@ -0,0 +1,6 @@ +/etc/sysconfig/mic/conf.d/psm.conf +%PREFIX%/psm.filelist +%LIBPREFIX%/libinfinipath.so.%IPATHMAJOR% +%LIBPREFIX%/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% +%LIBPREFIX%/libpsm_infinipath.so.%PSMMAJOR% +%LIBPREFIX%/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% diff --git a/mic-psm-devel.srclist.in b/mic-psm-devel.srclist.in new file mode 100644 index 0000000..a1dc132 --- /dev/null +++ b/mic-psm-devel.srclist.in @@ -0,0 +1,4 @@ +/usr/include/psm.h +/usr/include/psm_mq.h +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/mic-psm.srclist.in b/mic-psm.srclist.in new file mode 100644 index 0000000..d80350d --- /dev/null +++ b/mic-psm.srclist.in @@ -0,0 +1,5 @@ +%SBINPREFIX%/psmd +%LIBPREFIX%/libinfinipath.so.4 +%LIBPREFIX%/libinfinipath.so.4.0 +%LIBPREFIX%/libpsm_infinipath.so.1 +%LIBPREFIX%/libpsm_infinipath.so.1.15 diff --git a/mic/etc/sysconfig/mic/conf.d/psm.conf b/mic/etc/sysconfig/mic/conf.d/psm.conf new file mode 100644 index 0000000..deba040 --- /dev/null +++ b/mic/etc/sysconfig/mic/conf.d/psm.conf @@ -0,0 +1,2 @@ +# PSM download files +Overlay Filelist /opt/intel/mic/psm /opt/intel/mic/psm/psm.filelist on diff --git a/mic/opt/intel/mic/psm/psm.filelist.in b/mic/opt/intel/mic/psm/psm.filelist.in new file mode 100644 index 0000000..38c6add --- /dev/null +++ b/mic/opt/intel/mic/psm/psm.filelist.in @@ -0,0 +1,7 @@ +dir /lib64 755 0 0 +file /lib64/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% lib64/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 755 0 0 +slink /lib64/libinfinipath.so.%IPATHMAJOR% libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 777 0 0 +slink /lib64/libinfinipath.so libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 777 0 0 +file /lib64/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% lib64/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 755 0 0 +slink /lib64/libpsm_infinipath.so.%PSMMAJOR% libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 777 0 0 +slink /lib64/libpsm_infinipath.so libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 777 0 0 diff --git a/mpspawn/mpspawn_stats.h b/mpspawn/mpspawn_stats.h new file mode 100644 index 0000000..3cc8bc7 --- /dev/null +++ b/mpspawn/mpspawn_stats.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MPSPAWN_STATS_H +#define _MPSPAWN_STATS_H + +#include + +#define MPSPAWN_STATS_VERSION 1 + +typedef enum +{ + MPSPAWN_STATS_TYPE_DOUBLE = 0x1, +#define MPSPAWN_STATS_TYPE_DOUBLE 0x1 + MPSPAWN_STATS_TYPE_HEADER = 0x2, +#define MPSPAWN_STATS_TYPE_HEADER 0x2 + MPSPAWN_STATS_REDUCTION_MAX = 0x1000, +#define MPSPAWN_STATS_REDUCTION_MAX 0x1000 + MPSPAWN_STATS_REDUCTION_MIN = 0x2000, +#define MPSPAWN_STATS_REDUCTION_MIN 0x2000 + MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000, +#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000 + MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000 +#define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000 +} +mpspawn_stats_flags; + +#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \ + MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN) + +#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg))) +#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL) +#define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64) + +#define MPSPAWN_NAN ((uint64_t) ~0ULL) //NAN) +#define MPSPAWN_ISNAN(x) (isnan(x)) + +struct mpspawn_stats_add_args; /* client->mpspawn stats registration */ +struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */ +struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */ + +/* Clients implement this function to fill in mpspawn request for stats */ +typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *); +/* mpspawn implements this function to allow clients to register new stats */ +typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *); +/* mpspawn implements this function to map rank indexes into epaddr structs */ +struct psm_epaddr; +typedef struct psm_epaddr * (*mpspawn_map_epaddr_fn) (int rank); + +typedef struct mpspawn_stats_req_args { + int version; + int num; + uint64_t *stats; + uint16_t *flags; + void *context; +} +mpspawn_stats_req_args_t; + +typedef +struct mpspawn_stats_add_args { + int version; + int num; + char *header; + char **desc; + uint16_t *flags; + mpspawn_stats_req_fn req_fn; + void *context; +} mpspawn_stats_add_args_t; + +typedef +struct mpspawn_stats_init_args { + int version; + psm_mq_t mq; /* initialized mq endpoint */ + int num_epaddr; /* number of endpoints in job */ + mpspawn_stats_add_fn add_fn; /* function for client to add stats */ + mpspawn_map_epaddr_fn epaddr_map_fn; + const char *stats_types; /* stats type string mpirun -M */ +} +mpspawn_stats_init_args_t; + +/* Function in psm exposed to register stats */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args); + +#endif diff --git a/psm.c b/psm.c new file mode 100644 index 0000000..f8fa3d8 --- /dev/null +++ b/psm.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
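[Editorial example. The mpspawn_stats.h contract above is a two-way handshake: the client fills an mpspawn_stats_add_args_t describing a group of counters and hands it to mpspawn's add_fn, and mpspawn later invokes the registered req_fn with a stats array to harvest current values. A minimal sketch of a client registration, assuming the declarations from mpspawn/mpspawn_stats.h are in scope; my_bytes_sent, my_req_fn and register_my_stats are hypothetical names:

#include <string.h>
#include "mpspawn_stats.h"   /* mpspawn/mpspawn_stats.h from this tree */

static uint64_t my_bytes_sent;   /* hypothetical counter maintained elsewhere */

static void my_req_fn(struct mpspawn_stats_req_args *req)
{
    /* mpspawn asks for 'num' stats; report them in registration order */
    req->stats[0] = my_bytes_sent;
}

void register_my_stats(mpspawn_stats_add_fn add_fn)
{
    static char *descs[] = { "Bytes sent" };
    static uint16_t flags[] = { MPSPAWN_STATS_REDUCTION_ALL };
    mpspawn_stats_add_args_t args;

    memset(&args, 0, sizeof(args));
    args.version = MPSPAWN_STATS_VERSION;
    args.num = 1;
    args.header = "My transport statistics";
    args.desc = descs;
    args.flags = flags;
    args.req_fn = my_req_fn;
    add_fn(&args);
}
]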
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "psm_user.h" + +static int psmi_verno_major = PSM_VERNO_MAJOR; +static int psmi_verno_minor = PSM_VERNO_MINOR; +static int psmi_verno = PSMI_VERNO_MAKE(PSM_VERNO_MAJOR, PSM_VERNO_MINOR); +static int psmi_verno_client_val = 0; + +#define PSMI_NOT_INITIALIZED 0 +#define PSMI_INITIALIZED 1 +#define PSMI_FINALIZED -1 /* Prevent the user from calling psm_init + * once psm_finalize has been called. */ +static int psmi_isinit = PSMI_NOT_INITIALIZED; + +int +psmi_verno_client() +{ + return psmi_verno_client_val; +} + +#ifdef PSMI_PLOCK_IS_SPINLOCK +psmi_spinlock_t psmi_progress_lock; +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK) +pthread_mutex_t psmi_progress_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK_DEBUG) +pthread_mutex_t psmi_progress_lock = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP; +pthread_t psmi_progress_lock_owner = PSMI_PLOCK_NO_OWNER; +#endif + +/* This function is used to determine whether the current library build can + * successfully communicate with another library that claims to be version + * 'verno'. + * + * PSM 1.x is always ABI compatible, but this checks to see if two different + * versions of the library can coexist. + */ +int +psmi_verno_isinteroperable(uint16_t verno) +{ + /* + * Up and including 1.03, all peers require to be 1.03 (or later). + */ + if (PSMI_VERNO_GET_MAJOR(verno) != PSM_VERNO_MAJOR) + return 0; + + /* This -1 tries to make sure that we always update this function for each + * new release of the library. There's an internal check to make sure that + * verno_iscompatible is always updated. Each new version should have an + * entry in the switch statement below. */ + int iscompat = -1; + + switch (psmi_verno) { + case 0x0110: + case 0x010f: + /* Multi-rail is supported in this version, since the packet header + * sequence number is shrunk from 24bits to 16bits, old version + * can not process such packet. The freed 8bits and another 8bits + * are used to form the message sequence number to keep message order + * in multi-rail case. + */ + iscompat = (verno >= 0x010f); + break; + case 0x010e: + /* Allow specification of send buffer descriptors in addition to send + * network buffers for IPS. 
Having a large number of send descriptors + * can be beneficial on large scale clusters with bursty network IO. + */ + case 0x010d: + /* Wire protocol is the same as QOFED 1.4.2. Added support to specify + * path record resolution mechanism as well as service ID to use + * for endpoint. Required to implement support for alternate + * network topologies. + */ + case 0x010c: + /* Added support for generic psm_set|getopt methods. Also exposed + * "some" internal implementation details via components that these + * methods operate on. Wire protocol remains the same but we need + * to bump the version number as the API changes so ULPs can detect + * if these methods are available. + */ + case 0x010b: + /* Removed VL specification per endpoint however it is wire level + * compatible with the 0x010a version. Use SL2VL mapping table coupled + * with the SL for endpoint to select VL. + */ + case 0x010a: + /* 0x010a updates wire protocol with support for AM requests with + * no replies (OPCODE_AM_REQUEST_NOREPLY). + */ + iscompat = (verno >= 0x010a); + break; + case 0x0109: + /* 0x0109 updates the wire protocol to pad writes up to cache line size + * to mitigate overhead of partial cache line writes on some processor + * architectures. Only MQ sends up to 2K bytes are padded. + */ + iscompat = (verno >= 0x0109); + break; + case 0x0108: + /* 0x0108 moved subcontext bits out of KPFlags and into ips header. + * This is incompatible with previous versions. */ + iscompat = (verno >= 0x0108); + break; + case 0x0107: + case 0x0106: + case 0x0105: + /* 0x0105 coincides with release 2.1 which introduced a new + * expected send protocol. Anything before that is incompatible */ + iscompat = (verno >= 0x0105); + break; + case 0x0104: + case 0x0103: + /* Nothing below 1.03 is supported by 1.03 */ + iscompat = (verno >= 0x0103); + break; + default: + iscompat = -1; + } + return iscompat; +} + +int +psmi_isinitialized() +{ + return (psmi_isinit == PSMI_INITIALIZED); +} + +extern char psmi_infinipath_revision[]; + +psm_error_t +__psm_init(int *major, int *minor) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val env_tmask; + + if (psmi_isinit == PSMI_INITIALIZED) + goto update; + + if (psmi_isinit == PSMI_FINALIZED) { + err = PSM_IS_FINALIZED; + goto fail; + } + + if (major == NULL || minor == NULL) { + err = PSM_PARAM_ERR; + goto fail; + } + +#ifdef PSM_DEBUG + if (!getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n"); +#endif + +#ifdef PSM_PROFILE + if (!getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); +#endif + + /* Make sure we complain if fault injection is enabled */ + if (getenv("PSM_FI") && !getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! 
You are running with fault injection enabled!\n"); + + /* Make sure, as an internal check, that this version knows how to detect + * compatibility with other library versions it may communicate with */ + if (psmi_verno_isinteroperable(psmi_verno) != 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "psmi_verno_isinteroperable() not updated for current version!"); + goto fail; + } + + /* The only way to not support a client is if the major number doesn't + * match */ + if (*major != PSM_VERNO_MAJOR) { + err = psmi_handle_error(NULL, PSM_INIT_BAD_API_VERSION, + "This library does not implement version %d.%d", + *major, *minor); + goto fail; + } + + /* Make sure we don't keep track of a client that claims a higher version + * number than we do */ + psmi_verno_client_val = min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno); + + psmi_isinit = PSMI_INITIALIZED; + /* infinipath_debug lives in libinfinipath.so */ + psmi_getenv("PSM_TRACEMASK", + "Mask flags for tracing", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, + (union psmi_envvar_val) infinipath_debug, + &env_tmask); + infinipath_debug = (long) env_tmask.e_ulong; + + /* The "real thing" is done in ipath_proto.c as a constructor function, but + * we getenv it here to report what we're doing with the setting */ + { + extern int __ipath_malloc_no_mmap; + union psmi_envvar_val env_mmap; + char *env = getenv("IPATH_DISABLE_MMAP_MALLOC"); + int broken = (env && *env && !__ipath_malloc_no_mmap); + psmi_getenv("IPATH_DISABLE_MMAP_MALLOC", + broken ? "Skipping mmap disable for malloc()" : + "Disable mmap for malloc()", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val) 0, + &env_mmap); + if (broken) + _IPATH_ERROR("Couldn't successfully disable mmap in mallocs " + "with mallopt()\n"); + } + + if (getenv("PSM_IDENTIFY")) { + Dl_info info_psm, info_ipath; + _IPATH_INFO("%s from %s:%s\n", psmi_infinipath_revision, + dladdr(psm_init, &info_psm) ? info_psm.dli_fname : + "libpsm not available", + dladdr(ipath_userinit, &info_ipath) ? 
info_ipath.dli_fname : + "libinfinipath not available"); + } + +#ifdef PSMI_PLOCK_IS_SPINLOCK + psmi_spin_init(&psmi_progress_lock); +#endif + + if (getenv("PSM_DIAGS")) { + _IPATH_INFO("Running diags...\n"); + psmi_diags(); + } + + psmi_faultinj_init(); + + psmi_epid_init(); + +update: + *major = (int) psmi_verno_major; + *minor = (int) psmi_verno_minor; +fail: + return err; +} +PSMI_API_DECL(psm_init) + +psm_error_t +__psm_finalize(void) +{ + struct psmi_eptab_iterator itor; + char *hostname; + psm_ep_t ep; + extern psm_ep_t psmi_opened_endpoint; /* in psm_endpoint.c */ + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + ep = psmi_opened_endpoint; + while (ep != NULL) { + psmi_opened_endpoint = ep->user_ep_next; + psm_ep_close(ep, PSM_EP_CLOSE_GRACEFUL, + 2*PSMI_MIN_EP_CLOSE_TIMEOUT); + ep = psmi_opened_endpoint; + } + + psmi_epid_fini(); + + psmi_faultinj_fini(); + + /* De-allocate any memory allocated to store hostnames */ + psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); + while ((hostname = psmi_epid_itor_next(&itor))) + psmi_free(hostname); + psmi_epid_itor_fini(&itor); + + psmi_isinit = PSMI_FINALIZED; + return PSM_OK; +} +PSMI_API_DECL(psm_finalize) + +/* + * Function exposed in >= 1.05 + */ +psm_error_t +__psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) +{ + int i; + psm_error_t err = PSM_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + PSMI_PLOCK(); + + if (nids == NULL || hostnames == NULL) { + err = PSM_PARAM_ERR; + goto fail; + } + + for (i = 0; i < num; i++) { + if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1))) + break; + } + +fail: + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_map_nid_hostname) + +void +__psm_epaddr_setlabel(psm_epaddr_t epaddr, char const *epaddr_label) +{ + return; /* ignore this function */ +} +PSMI_API_DECL(psm_epaddr_setlabel) + +void +__psm_epaddr_setctxt(psm_epaddr_t epaddr, void *ctxt) +{ + + /* Eventually deprecate this API to use set/get opt as this is unsafe. */ + psm_setopt(PSM_COMPONENT_CORE, (const void*) epaddr, + PSM_CORE_OPT_EP_CTXT, (const void*) ctxt, sizeof(void*)); + +} +PSMI_API_DECL(psm_epaddr_setctxt) + +void * +__psm_epaddr_getctxt(psm_epaddr_t epaddr) +{ + psm_error_t err; + uint64_t optlen = sizeof(void*); + void *result = NULL; + + /* Eventually deprecate this API to use set/get opt as this is unsafe. 
*/ + err = psm_getopt(PSM_COMPONENT_CORE, (const void*) epaddr, + PSM_CORE_OPT_EP_CTXT, (void*) &result, &optlen); + + if (err == PSM_OK) + return result; + else + return NULL; +} +PSMI_API_DECL(psm_epaddr_getctxt) + +psm_error_t +__psm_setopt(psm_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen) +{ + switch(component) { + case PSM_COMPONENT_CORE: + return psmi_core_setopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + return psm_mq_setopt((psm_mq_t) component_obj, optname, optval); + break; + case PSM_COMPONENT_AM: + /* Hand off to active messages */ + return psmi_am_setopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + return psmi_ptl_ips.setopt(component_obj, optname, optval, optlen); + break; + } + + /* Unrecognized/unknown component */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown component %u", component); + +} + +PSMI_API_DECL(psm_setopt); + +psm_error_t +__psm_getopt(psm_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen) +{ + switch(component) { + case PSM_COMPONENT_CORE: + return psmi_core_getopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + return psm_mq_getopt((psm_mq_t) component_obj, optname, optval); + break; + case PSM_COMPONENT_AM: + /* Hand off to active messages */ + return psmi_am_getopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + return psmi_ptl_ips.getopt(component_obj, optname, optval, optlen); + break; + } + + /* Unrecognized/unknown component */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown component %u", component); +} +PSMI_API_DECL(psm_getopt); + +psm_error_t __recvpath +__psmi_poll_noop(ptl_t *ptl, int replyonly) +{ + return PSM_OK_NO_PROGRESS; +} +PSMI_API_DECL(psmi_poll_noop) + +psm_error_t __recvpath +__psm_poll(psm_ep_t ep) +{ + psm_error_t err1 = PSM_OK, err2 = PSM_OK; + psm_ep_t tmp; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + tmp = ep; + do { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_PUNLOCK(); + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_PUNLOCK(); + return err2; + } + ep = ep->mctxt_next; + } while (ep != tmp); + + /* This is valid because.. 
+ * PSM_OK & PSM_OK_NO_PROGRESS => PSM_OK + * PSM_OK & PSM_OK => PSM_OK + * PSM_OK_NO_PROGRESS & PSM_OK => PSM_OK + * PSM_OK_NO_PROGRESS & PSM_OK_NO_PROGRESS => PSM_OK_NO_PROGRESS */ + PSMI_PUNLOCK(); + return (err1 & err2); +} +PSMI_API_DECL(psm_poll) + +psm_error_t __recvpath +__psmi_poll_internal(psm_ep_t ep, int poll_amsh) +{ + psm_error_t err1 = PSM_OK_NO_PROGRESS; + psm_error_t err2; + psm_ep_t tmp; + + PSMI_PLOCK_ASSERT(); + + tmp = ep; + do { + if (poll_amsh) { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM_OK_NO_PROGRESS) /* some error unrelated to polling */ + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM_OK_NO_PROGRESS) /* some error unrelated to polling */ + return err2; + + ep = ep->mctxt_next; + } while (ep != tmp); + + return (err1 & err2); +} +PSMI_API_DECL(psmi_poll_internal) + +#ifdef PSM_PROFILE +/* These functions each have weak symbols */ +void +psmi_profile_block() +{ + ; // empty for profiler +} + +void +psmi_profile_unblock() +{ + ; // empty for profiler +} + +void +psmi_profile_reblock(int did_no_progress) +{ + ; // empty for profiler +} +#endif + diff --git a/psm.h b/psm.h new file mode 100644 index 0000000..ca1200d --- /dev/null +++ b/psm.h @@ -0,0 +1,1045 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PSM_H +#define PSM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + + + + +/* Local endpoint handle (opaque) + * + * + * Handle returned to the user when a new local endpoint is created. The + * handle is a local handle to be used in all communication functions and is + * not intended to globally identify the opened endpoint in any way. + * + * All open endpoint handles can be globally identified using the endpoint id + * integral type (psm_epid_t) and all communication must use an endpoint + * address (psm_epaddr_t) that can be obtained by connecting a local + * endpoint to one or more endpoint identifiers. 
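[Editorial note. The bitwise AND returned at the end of __psm_poll() and __psmi_poll_internal() in psm.c above works because PSM_OK is 0 and PSM_OK_NO_PROGRESS is 1 (see the enum in psm.h): the combined result stays "no progress" only when both the shared-memory and IPS transports reported no progress, and any error value above PSM_OK_NO_PROGRESS has already caused an early return. The truth table can be checked directly:

#include <assert.h>

enum { OK = 0, OK_NO_PROGRESS = 1 };  /* values from enum psm_error */

int main(void)
{
    assert((OK & OK) == OK);
    assert((OK & OK_NO_PROGRESS) == OK);
    assert((OK_NO_PROGRESS & OK) == OK);
    assert((OK_NO_PROGRESS & OK_NO_PROGRESS) == OK_NO_PROGRESS);
    return 0;
}
]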
+ * + * @remark The local endpoint handle is opaque to the user. */ +typedef struct psm_ep *psm_ep_t; + +/* MQ handle (opaque) + * + * + * Handle returned to the user when a new Matched queue is created (@ref + * psm_mq_init). */ +typedef struct psm_mq *psm_mq_t; + +#define PSM_VERNO 0x0110 +#define PSM_VERNO_MAJOR 0x01 +#define PSM_VERNO_MINOR 0x10 + +enum psm_error { + + PSM_OK = 0, + + PSM_OK_NO_PROGRESS = 1, + + PSM_PARAM_ERR = 3, + + PSM_NO_MEMORY = 4, + + PSM_INIT_NOT_INIT = 5, + + PSM_INIT_BAD_API_VERSION = 6, + + PSM_NO_AFFINITY = 7, + + PSM_INTERNAL_ERR = 8, + + PSM_SHMEM_SEGMENT_ERR = 9, + + PSM_OPT_READONLY = 10, + + PSM_TIMEOUT = 11, + + PSM_TOO_MANY_ENDPOINTS = 12, + + + PSM_IS_FINALIZED = 13, + + + PSM_EP_WAS_CLOSED = 20, + + PSM_EP_NO_DEVICE = 21, + + PSM_EP_UNIT_NOT_FOUND = 22, + + PSM_EP_DEVICE_FAILURE = 23, + + PSM_EP_CLOSE_TIMEOUT = 24, + + PSM_EP_NO_PORTS_AVAIL = 25, + + PSM_EP_NO_NETWORK = 26, + + PSM_EP_INVALID_UUID_KEY = 27, + + PSM_EP_NO_RESOURCES = 28, + + + PSM_EPID_UNKNOWN = 40, + + PSM_EPID_UNREACHABLE = 41, + + PSM_EPID_INVALID_NODE = 43, + + PSM_EPID_INVALID_MTU = 44, + + PSM_EPID_INVALID_UUID_KEY = 45, + + PSM_EPID_INVALID_VERSION = 46, + + PSM_EPID_INVALID_CONNECT = 47, + + PSM_EPID_ALREADY_CONNECTED = 48, + + PSM_EPID_NETWORK_ERROR = 49, + + PSM_EPID_INVALID_PKEY = 50, + + PSM_EPID_PATH_RESOLUTION = 51, + + + PSM_MQ_NO_COMPLETIONS = 60, + + PSM_MQ_TRUNCATION = 61, + + + PSM_AM_INVALID_REPLY = 70, + + PSM_ERROR_LAST = 80 +}; + +/* Backwards header compatibility for a confusing error return name */ +#define PSM_MQ_INCOMPLETE PSM_MQ_NO_COMPLETIONS + +typedef enum psm_error psm_error_t; + +enum psm_component { + + PSM_COMPONENT_CORE = 0, + + PSM_COMPONENT_MQ = 1, + + PSM_COMPONENT_AM = 2, + + PSM_COMPONENT_IB = 3 +}; + +typedef enum psm_component psm_component_t; + +enum psm_path_res { + + PSM_PATH_RES_NONE = 0, + + PSM_PATH_RES_OPP = 1, + + PSM_PATH_RES_UMAD = 2 +}; + +typedef enum psm_path_res psm_path_res_t; + +/* Initialize PSM interface + * + * Call to initialize the PSM library for a desired API revision number. + * + * [in,out] api_verno_major As input, a pointer to an integer that holds + * PSM_VERNO_MAJOR. As output, the pointer + * is updated with the major revision number of + * the loaded library. + * [in,out] api_verno_minor As input, a pointer to an integer that holds + * PSM_VERNO_MINOR. As output, the pointer + * is updated with the minor revision number of + * the loaded library. + * + * [pre] The user has not called any other PSM library call except @ref + * psm_error_register_handler to register a global error handler. + * + * [warning] PSM initialization is a precondition for all functions used in the + * PSM library. + * + * [returns] PSM_OK The PSM interface could be opened and the desired API + * revision can be provided. + * [returns] PSM_INIT_BAD_API_VERSION The PSM library cannot provide compatibility for + * the desired API version. + * + * @verbatim + * // In this example, we want to handle our own errors before doing init, + * // since we don't want a fatal error if InfiniPath is not found. 
+ * // Note that psm_error_register_handler (and psm_uuid_generate)
+ * // are the only functions that can be called before psm_init
+ *
+ * int try_to_initialize_psm() {
+ * int verno_major = PSM_VERNO_MAJOR;
+ * int verno_minor = PSM_VERNO_MINOR;
+ *
+ * int err = psm_error_register_handler(NULL, // Global handler
+ * PSM_ERRHANDLER_NO_HANDLER); // return errors
+ * if (err) {
+ * fprintf(stderr, "Couldn't register global handler: %s\n",
+ * psm_error_get_string(err));
+ * return -1;
+ * }
+ *
+ * err = psm_init(&verno_major, &verno_minor);
+ * if (err || verno_major > PSM_VERNO_MAJOR) {
+ * if (err)
+ * fprintf(stderr, "PSM initialization failure: %s\n",
+ * psm_error_get_string(err));
+ * else
+ * fprintf(stderr, "PSM loaded an unexpected/unsupported "
+ * "version (%d.%d)\n", verno_major, verno_minor);
+ * return -1;
+ * }
+ *
+ * // We were able to initialize PSM but will defer all further error
+ * // handling since most of the errors beyond this point will be fatal.
+ * err = psm_error_register_handler(NULL, // Global handler
+ * PSM_ERRHANDLER_PSM_HANDLER); // use the PSM-internal handler
+ * if (err) {
+ * fprintf(stderr, "Couldn't register global errhandler: %s\n",
+ * psm_error_get_string(err));
+ * return -1;
+ * }
+ * return 1;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_init(int *api_verno_major, int *api_verno_minor);
+
+/* Finalize PSM interface
+ *
+ * Single call to finalize PSM and close all unclosed endpoints.
+ *
+ * [post] The user guarantees not to make any further PSM calls, including @ref
+ * psm_init.
+ *
+ * [returns] PSM_OK Always returns PSM_OK */
+psm_error_t
+psm_finalize(void);
+
+/* Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM. */
+typedef struct psm_error_token *psm_error_token_t;
+
+/* Error handling function
+ *
+ * Users can handle errors explicitly instead of relying on PSM's own error
+ * handler. There is one global error handler and error handlers that can be
+ * individually set for each opened endpoint. By default, endpoints will
+ * inherit the global handler registered at the time of open.
+ *
+ * [in] ep Handle associated to the endpoint over which the error occurred
+ * or NULL if the error is being handled by the global error
+ * handler.
+ * [in] error PSM error identifier
+ * [in] error_string A descriptive error string of maximum length @ref
+ * PSM_ERRSTRING_MAXLEN.
+ * [in] token Opaque PSM token associated with the particular event that
+ * generated the error. The token can be used to extract the
+ * error string and can be passed to psm_error_defer to
+ * defer any remaining or unhandled error handling to PSM.
+ *
+ * [post] If the error handler returns, the error returned is propagated to the
+ * caller. */
+typedef psm_error_t (*psm_ep_errhandler_t)(psm_ep_t ep,
+ const psm_error_t error,
+ const char *error_string,
+ psm_error_token_t token);
+
+/* Obsolete names, only here for backwards compatibility */
+#define PSM_ERRHANDLER_DEFAULT ((psm_ep_errhandler_t)-1)
+#define PSM_ERRHANDLER_NOP ((psm_ep_errhandler_t)-2)
+
+#define PSM_ERRHANDLER_PSM_HANDLER ((psm_ep_errhandler_t)-1)
+/* PSM error handler as explained in error_handling */
+
+#define PSM_ERRHANDLER_NO_HANDLER ((psm_ep_errhandler_t)-2)
+/* Bypasses the default PSM error handler and returns all errors to the user
+ * (this is the default) */
+
+#define PSM_ERRSTRING_MAXLEN 512 /* Maximum error string length.
*/ + +/* PSM error handler registration + * + * Function to register error handlers on a global basis and on a per-endpoint + * basis. PSM_ERRHANDLER_PSM_HANDLER and PSM_ERRHANDLER_NO_HANDLER are special + * pre-defined handlers to respectively enable use of the default PSM-internal + * handler or the no-handler that disables registered error handling and + * returns all errors to the caller (both are documented in error_handling). + * + * [in] ep Handle of the endpoint over which the error handler should be + * registered. With ep set to NULL, the behavior of the + * global error handler can be controlled. + * [in] errhandler Handler to register. Can be a user-specific error + * handling function or PSM_ERRHANDLER_PSM_HANDLER or + * PSM_ERRHANDLER_NO_HANDLER. + * + * @remark When ep is set to NULL, this is the only function that can be + * called before psm_init + */ +psm_error_t +psm_error_register_handler(psm_ep_t ep, const psm_ep_errhandler_t errhandler); + +/* PSM deferred error handler + * + * Function to handle fatal PSM errors if no error handler is installed or if + * the user wishes to defer further error handling to PSM. Depending on the + * type of error, PSM may or may not return from the function call. + * + * [in] err_token Error token initially passed to error handler + * + * [pre] The user is calling into the function because it has decided that PSM + * should handle an error case. + * + * [post] The function may or may not return depending on the error + */ +psm_error_t +psm_error_defer(psm_error_token_t err_token); + +/* Get generic error string from error + * + * Function to return the default error string associated to a PSM error. + * + * While a more detailed and precise error string is usually available within + * error handlers, this function is available to obtain an error string out of + * an error handler context or when a no-op error handler is registered. + * + * [in] error PSM error + */ +const char * +psm_error_get_string(psm_error_t error); + +/* Option key/pair structure + * + * Currently only used in MQ. + */ +struct psm_optkey +{ + uint32_t key; /* Option key */ + void *value; /* Option value */ +}; + + + +/* Endpoint ID + * + * Integral type of size 8 bytes that can be used by the user to globally + * identify a successfully opened endpoint. Although the contents of the + * endpoint id integral type remains opaque to the user, unique network id and + * InfiniPath port number can be extracted using psm_epid_nid and @ref + * psm_epid_context. + */ +typedef uint64_t psm_epid_t; + +/* Endpoint Address (opaque) + * + * Remote endpoint addresses are created when the user binds an endpoint ID + * to a particular endpoint handle using psm_ep_connect. A given endpoint + * address is only guaranteed to be valid over a single endpoint. + */ +typedef struct psm_epaddr *psm_epaddr_t; + +/* PSM Unique UID + * + * PSM type equivalent to the DCE-1 uuid_t, used to uniquely identify an + * endpoint within a particular job. Since PSM does not participate in job + * allocation and management, users are expected to generate a unique ID to + * associate endpoints to a particular parallel or collective job. 
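+ *
+ * As an illustrative sketch, a launcher process might generate the key
+ * once and publish it to every rank through the environment; the string
+ * conversion below uses a libuuid-style uuid_unparse() helper, which is
+ * not part of the PSM API:
+ *
+ * @verbatim
+ * psm_uuid_t job_key;
+ * char str[37]; // 36 characters plus NUL terminator
+ *
+ * psm_uuid_generate(job_key);
+ * uuid_unparse(job_key, str);
+ * setenv("ENDPOINT_UUID", str, 1); // the spawner propagates this
+ * @endverbatim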
+ * [see] psm_uuid_generate + */ +typedef uint8_t psm_uuid_t[16]; + +/* Get Endpoint identifier's Unique Network ID */ +uint64_t +psm_epid_nid(psm_epid_t epid); + +/* Get Endpoint identifier's InfiniPath context number */ +uint64_t +psm_epid_context(psm_epid_t epid); + +/* Get Endpoint identifier's InfiniPath port (deprecated, use + * psm_epid_context instead) */ +uint64_t +psm_epid_port(psm_epid_t epid); + +/* List the number of available InfiniPath units + * + * Function used to determine the amount of locally available InfiniPath units. + * For N units, valid unit numbers in psm_ep_open are 0 to N-1. + * + * [returns] PSM_OK unless the user has not called psm_init + */ +psm_error_t +psm_ep_num_devunits(uint32_t *num_units); + +/* Utility to generate UUIDs for psm_ep_open + * + * This function is available as a utility for generating unique job-wide ids. + * See discussion in psm_ep_open for further information. + * + * @remark This function does not require PSM to be initialized. + */ +void +psm_uuid_generate(psm_uuid_t uuid_out); + +/* Affinity modes for the affinity member of struct psm_ep_open_opts */ +#define PSM_EP_OPEN_AFFINITY_SKIP 0 /* Disable setting affinity */ +#define PSM_EP_OPEN_AFFINITY_SET 1 /* Enable setting affinity unless + already set */ +#define PSM_EP_OPEN_AFFINITY_FORCE 2 /* Enable setting affinity regardless + of current affinity setting */ + +/* Default values for some constants */ +#define PSM_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL + /* Default protection key */ + +/* Endpoint Open Options + * + * These options are available for opening a PSM endpoint. Each is + * individually documented and setting each option to -1 or passing NULL as the + * options parameter in psm_ep_open instructs PSM to use + * implementation-defined defaults. + * + * Each option is documented in psm_ep_open */ +struct psm_ep_open_opts { + int64_t timeout; /* timeout in nanoseconds to open device */ + int unit; /* InfiniPath Unit ID to open on */ + int affinity; /* How PSM should set affinity */ + int shm_mbytes; /* Megabytes used for intra-node communication */ + int sendbufs_num; /* Preallocated send buffers */ +#if PSM_VERNO >= 0x0101 + uint64_t network_pkey; /* Network Protection Key (v1.01) */ +#endif +#if PSM_VERNO >= 0x0107 + int port; /* IB port to use (1 to N) */ +#if PSM_VERNO <= 0x010a + int outvl; /* IB VL to use when sending pkts */ +#endif + int outsl; /* IB SL to use when sending pkts */ +#endif +#if PSM_VERNO >= 0x010d + uint64_t service_id; /* IB Service ID to use for endpoint */ + psm_path_res_t path_res_type; /* Path resolution type */ +#endif +#if PSM_VERNO >= 0x010e + int senddesc_num; /* Preallocated send descriptors */ + int imm_size; /* Immediate data size for endpoint */ +#endif + +}; + +/* InfiniPath endpoint creation + * + * Function used to create a new local communication endpoint on an InfiniPath + * adapter. The returned endpoint handle is required in all PSM communication + * operations, as PSM can manage communication over multiple endpoints. An + * opened endpoint has no global context until the user connects the endpoint + * to other global endpoints by way of psm_ep_connect. All local endpoint + * handles are globally identified by endpoint IDs (psm_epid_t) which are + * also returned when an endpoint is opened. It is assumed that the user can + * provide an out-of-band mechanism to distribute the endpoint IDs in order to + * establish connections between endpoints (psm_ep_connect for more + * information). 
+ *
+ * [in] unique_job_key Endpoint key, to uniquely identify the endpoint in
+ * a parallel job. It is up to the user to ensure
+ * that the key is globally unique over a period long
+ * enough to prevent duplicate keys over the same set
+ * of endpoints (see comments below).
+ *
+ * [in] opts Open options of type psm_ep_open_opts
+ * (see psm_ep_open_opts_get_defaults).
+ *
+ * [out] ep User-supplied storage to return a pointer to the newly
+ * created endpoint. The returned pointer of type psm_ep_t
+ * is a local handle and cannot be used to globally identify the
+ * endpoint.
+ * [out] epid User-supplied storage to return the endpoint ID associated
+ * to the newly created local endpoint returned in the ep
+ * handle. The endpoint ID is an integral type suitable for
+ * uniquely identifying the local endpoint.
+ *
+ * PSM does not internally verify the consistency of the uuid; it is up to the
+ * user to ensure that the uuid is unique enough not to collide with other
+ * currently-running jobs. Users can employ three mechanisms to obtain a uuid.
+ *
+ * 1. Use the supplied psm_uuid_generate utility
+ *
+ * 2. Use an OS or library-specific uuid generation utility that complies with
+ * OSF DCE 1.1, such as uuid_generate on Linux or uuid_create on FreeBSD.
+ * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 3. Manually pack a 16-byte string using a utility such as /dev/random or
+ * other source with enough entropy and proper seeding to prevent two nodes
+ * from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ * * timeout establishes the number of nanoseconds to wait before
+ * failing to open a port (with -1, defaults to 15 secs).
+ * * unit sets the InfiniPath unit number to use to open a port (with
+ * -1, PSM determines the best unit to open the port). If @c
+ * IPATH_UNIT is set in the environment, this setting is ignored.
+ * * affinity enables or disables PSM setting processor affinity. The
+ * option can be controlled to either disable (@ref
+ * PSM_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ * only if it is already unset (@ref
+ * PSM_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ * set or not (PSM_EP_OPEN_AFFINITY_FORCE).
+ * If IPATH_NO_CPUAFFINITY is set in the environment, this
+ * setting is ignored.
+ * * shm_mbytes sets a maximum amount of megabytes that can be allocated
+ * to each local endpoint ID connected through this
+ * endpoint (with -1, defaults to 10 MB).
+ * * sendbufs_num sets the number of send buffers that can be
+ * pre-allocated for communication (with -1, defaults to
+ * 512 buffers of MTU size).
+ * * network_pkey sets the protection key to employ for point-to-point
+ * PSM communication. Unless a specific value is used,
+ * this parameter should be set to
+ * PSM_EP_OPEN_PKEY_DEFAULT.
+ *
+ * [warning] Currently, PSM limits the user to calling psm_ep_open only once
+ * per process and subsequent calls will fail. Multiple endpoints per process
+ * will be enabled in a future release.
+ *
+ * @verbatim
+ * // In order to open an endpoint and participate in a job, each endpoint has
+ * // to be distributed a unique 16-byte UUID key from an out-of-band source.
+ * // Presumably this can come from the parallel spawning utility either
+ * // indirectly through an implementor's own spawning interface or as in this
+ * // example, the UUID is set as a string in an environment variable
+ * // propagated to all endpoints in the job.
+ *
+ * int try_to_open_psm_endpoint(psm_ep_t *ep, // output endpoint handle
+ * psm_epid_t *epid, // output endpoint identifier
+ * int unit, // unit of our choice
+ * int port) // port of our choice
+ * {
+ * struct psm_ep_open_opts epopts;
+ * psm_uuid_t job_uuid;
+ * char *c;
+ *
+ * // Let PSM assign its default values to the endpoint options.
+ * psm_ep_open_opts_get_defaults(&epopts);
+ *
+ * // We want a stricter timeout and a specific unit
+ * epopts.timeout = 15*1e9; // 15 second timeout
+ * epopts.unit = unit; // We want a specific unit, -1 would let PSM
+ * // choose the unit for us.
+ * epopts.port = port; // We want a specific port, <= 0 would let PSM
+ * // choose the port for us.
+ * // We've already set affinity, don't let PSM do so if it wants to.
+ * if (epopts.affinity == PSM_EP_OPEN_AFFINITY_SET)
+ * epopts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
+ *
+ * // ENDPOINT_UUID is set to the same value in the environment of all the
+ * // processes that wish to communicate over PSM and was generated by
+ * // the process spawning utility
+ * c = getenv("ENDPOINT_UUID");
+ * if (c && *c)
+ * implementor_string_to_16byte_packing(c, job_uuid);
+ * else {
+ * fprintf(stderr, "Can't find UUID for endpoint\n");
+ * return -1;
+ * }
+ *
+ * // Assume we don't want to handle errors here.
+ * psm_ep_open(job_uuid, &epopts, ep, epid);
+ * return 1;
+ * }
+ * @endverbatim */
+psm_error_t
+psm_ep_open(const psm_uuid_t unique_job_key, const struct psm_ep_open_opts *opts,
+ psm_ep_t *ep, psm_epid_t *epid);
+
+/* Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in psm_ep_open.
+ *
+ * [out] opts Endpoint Open options.
+ *
+ * [warning] For portable operation, users should always call this function
+ * prior to calling psm_ep_open.
+ *
+ * [return] PSM_OK If result could be updated
+ * [return] PSM_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm_error_t
+psm_ep_open_opts_get_defaults(struct psm_ep_open_opts *opts);
+
+/* Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * [in] ep Endpoint handle
+ * [in] epid Endpoint ID
+ *
+ * [out] result Result is non-zero if the remote endpoint shares memory with the local
+ * endpoint ep, or zero otherwise.
+ *
+ * [return] PSM_OK If result could be updated
+ * [return] PSM_EPID_UNKNOWN If the epid is not recognized
+ */
+psm_error_t
+psm_ep_epid_share_memory(psm_ep_t ep, psm_epid_t epid, int *result);
+
+/* Close endpoint
+ * [in] ep PSM endpoint handle
+ * [in] mode One of PSM_EP_CLOSE_GRACEFUL or PSM_EP_CLOSE_FORCE
+ * [in] timeout How long to wait in nanoseconds if mode is
+ * PSM_EP_CLOSE_GRACEFUL, 0 waits forever. If mode is
+ * PSM_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * [return] PSM_OK Endpoint was successfully closed without force or
+ * successfully closed with force within the supplied timeout.
+ * [return] PSM_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ * within timeout.
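+ *
+ * As an illustrative sketch, a caller might first attempt a graceful
+ * close and fall back to a forced close (the function name and the
+ * 5-second budget are this example's choice; error handling elided):
+ *
+ * @verbatim
+ * void shutdown_endpoint(psm_ep_t ep)
+ * {
+ *     // Try a graceful close for up to 5 seconds...
+ *     if (psm_ep_close(ep, PSM_EP_CLOSE_GRACEFUL, 5*1e9)
+ *             == PSM_EP_CLOSE_TIMEOUT)
+ *         // ...then force the close; timeout is ignored in this mode.
+ *         psm_ep_close(ep, PSM_EP_CLOSE_FORCE, 0);
+ * }
+ * @endverbatim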
+ */
+psm_error_t
+psm_ep_close(psm_ep_t ep, int mode, int64_t timeout);
+
+#define PSM_EP_CLOSE_GRACEFUL 0 /* Graceful close mode in psm_ep_close */
+#define PSM_EP_CLOSE_FORCE 1 /* Forceful close mode in psm_ep_close */
+
+/* Provide mappings for network id to hostname
+ *
+ * Since PSM does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings. The psm_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * [in] num Number of elements in the nids and hostnames arrays
+ * [in] nids User-provided array of network ids (i.e. InfiniBand LIDs),
+ * should be obtained by calling psm_epid_nid on each
+ * epid.
+ * [in] hostnames User-provided array of hostnames (array of
+ * NUL-terminated strings) where each hostname index
+ * maps to the provided nid hostname.
+ *
+ * [warning] Duplicate nids may be provided in the input nids array; only
+ * the first corresponding hostname will be remembered.
+ *
+ * [pre] The user may or may not have already provided hostname mappings.
+ * [post] The user may free any dynamically allocated memory passed to the
+ * function.
+ *
+ */
+psm_error_t
+psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
+
+/* Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses. Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * [in] ep PSM endpoint handle
+ *
+ * [in] num_of_epid The number of endpoints to connect to, which
+ * also establishes the number of elements contained in
+ * all of the function's array-based parameters.
+ *
+ * [in] array_of_epid User-allocated array that contains num_of_epid
+ * valid endpoint identifiers. Each endpoint id (or
+ * epid) has been obtained through an out-of-band
+ * mechanism and each endpoint must have been opened
+ * with the same uuid key.
+ *
+ * [in] array_of_epid_mask User-allocated array that contains num_of_epid
+ * integers. This array of masks allows users to
+ * select which of the epids in array_of_epid
+ * should be connected. If the integer at index i is
+ * zero, psm does not attempt to connect to the epid
+ * at index i in array_of_epid. If this parameter
+ * is NULL, psm will try to connect to each epid.
+ *
+ * [out] array_of_errors User-allocated array of at least num_of_epid
+ * elements. If the function does not return
+ * PSM_OK, this array can be consulted for each
+ * endpoint not masked off by array_of_epid_mask
+ * to know why the endpoint could not be connected.
+ * Endpoints that could not be connected because of
+ * an unrelated failure will be marked as @ref
+ * PSM_EPID_UNKNOWN. If the function returns
+ * PSM_OK, the errors for all endpoints will also
+ * contain PSM_OK.
+ *
+ * [out] array_of_epaddr User-allocated array of at least num_of_epid
+ * elements of type psm_epaddr_t.
+ * Each successfully connected endpoint is updated with
+ * an endpoint address handle that corresponds to
+ * the endpoint id at the same index in @c
+ * array_of_epid. Handles are only updated if the
+ * endpoint could be connected and if its error in
+ * array_of_errors is PSM_OK.
+ *
+ * [in] timeout Timeout in nanoseconds after which connection attempts will
+ * be abandoned. Setting this value to 0 disables timeout
+ * and waits until all endpoints have been successfully
+ * connected or until an error is detected.
+ *
+ * [pre] The user has opened a local endpoint and obtained a list of endpoint
+ * IDs to connect to a given endpoint handle using an out-of-band
+ * mechanism not provided by PSM.
+ *
+ * [post] If the connect is successful, array_of_epaddr is updated with valid
+ * endpoint addresses.
+ *
+ * [post] If unsuccessful, the user can query the return status of each
+ * individual remote endpoint in array_of_errors.
+ *
+ * [post] The user can call into psm_ep_connect many times with the same
+ * endpoint ID and the function is guaranteed to return the same output
+ * parameters.
+ *
+ * [post] PSM does not keep any reference to the arrays passed into the
+ * function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed. Users should
+ * always refer to individual errors in array_of_errors whenever the
+ * function cannot return PSM_OK.
+ *
+ * [returns] PSM_OK The entire set of endpoint IDs were successfully connected
+ * and endpoint addresses are available for all endpoint IDs.
+ *
+ * @verbatim
+ * int connect_endpoints(psm_ep_t ep, int numep, const psm_epid_t *array_of_epid,
+ * psm_epaddr_t **array_of_epaddr_out)
+ * {
+ * psm_error_t *errors = (psm_error_t *)
+ * calloc(numep, sizeof(psm_error_t));
+ * if (errors == NULL)
+ * return -1;
+ *
+ * psm_epaddr_t *all_epaddrs =
+ * (psm_epaddr_t *) calloc(numep, sizeof(psm_epaddr_t));
+ *
+ * if (all_epaddrs == NULL) {
+ * free(errors); // don't leak the error array on failure
+ * return -1;
+ * }
+ *
+ * psm_ep_connect(ep, numep, array_of_epid,
+ * NULL, // We want to connect all epids, no mask needed
+ * errors,
+ * all_epaddrs,
+ * 30*1e9); // 30 second timeout, <1 ns is forever
+ * *array_of_epaddr_out = all_epaddrs;
+ * free(errors);
+ * return 1;
+ * }
+ * @endverbatim */
+psm_error_t
+psm_ep_connect(psm_ep_t ep, int num_of_epid, const psm_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm_error_t *array_of_errors,
+ psm_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/* Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM components instantiated on an
+ * endpoint (currently, this only includes the MQ component). The function
+ * never blocks and is typically required in two cases:
+ *
+ * * Allowing all PSM components instantiated over a given endpoint to make
+ * communication progress. Refer to mq_progress for a detailed
+ * discussion on MQ-level progress issues.
+ *
+ * * Cases where users write their own synchronization primitives that
+ * depend on remote communication (such as spinning on a memory location
+ * whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
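+ *
+ * As an illustrative sketch, a caller might poll until progress occurs,
+ * yielding the processor between attempts (the sched_yield() policy is
+ * this example's choice, not mandated by PSM):
+ *
+ * @verbatim
+ * #include <sched.h>
+ *
+ * void wait_for_progress(psm_ep_t ep)
+ * {
+ *     // Spin until PSM reports that some communication progressed.
+ *     while (psm_poll(ep) == PSM_OK_NO_PROGRESS)
+ *         sched_yield();
+ * }
+ * @endverbatim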
+ * + * [returns] PSM_OK Some communication events were progressed + * [returns] PSM_OK_NO_PROGRESS Polling did not yield any communication progress + * + */ +psm_error_t +psm_poll(psm_ep_t ep); + +/* Set a user-determined ep address label. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect + * [in] epaddr_label_string User-allocated string to print when + * identifying endpoint in error handling or other verbose + * printing. The NULL-terminated string must be allocated by + * the user since PSM only keeps a pointer to the label. If + * users do not explicitly set a label for each endpoint, + * endpoints will identify themselves as hostname:port. + */ +void +psm_epaddr_setlabel(psm_epaddr_t epaddr, const char *epaddr_label_string); + +/* Set a user-determined ep address context. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect + * [in] ctxt Opaque user defined state to associate with an endpoint + * address. This state can be retrieved via + * psm_epaddr_getctxt. + */ +void +psm_epaddr_setctxt(psm_epaddr_t epaddr, void *ctxt); + +/* Get the user-determined ep address context. Users can associate an + * opaque context with each endpoint via psm_epaddr_setctxt. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect. + */ +void * +psm_epaddr_getctxt(psm_epaddr_t epaddr); + +/* Below are all component specific options. The component object for each of + * the options is also specified. + */ + +/* PSM_COMPONENT_CORE options */ +/* PSM debug level */ +#define PSM_CORE_OPT_DEBUG 0x101 + /* [uint32_t ] Set/Get the PSM debug level. This option can be set + * before initializing the PSM library. + * + * component object: (null) + * option value: PSM Debug mask to set or currently active debug level. + */ + +/* PSM endpoint address context */ +#define PSM_CORE_OPT_EP_CTXT 0x102 + /* [uint32_t ] Set/Get the context associated with a PSM endpoint + * address (psm_epaddr_t). + * + * component object: PSM endpoint (psm_epaddr_t) address. + * option value: Context associated with PSM endpoint address. + */ + +/* PSM_COMPONENT_IB options */ +/* Default service level to use to communicate with remote endpoints */ +#define PSM_IB_OPT_DF_SL 0x201 + /* [uint32_t ] Default Infiniband SL to use for all remote communication. + * If unset defaults to Service Level 0. + * + * component object: Opened PSM endpoint id (psm_ep_t). + * option value: Default IB SL to use for endpoint. (0 <= SL < 15) + */ + +/* Set IB service level to use for communication to an endpoint */ +#define PSM_IB_OPT_EP_SL 0x202 + /* [uint32_t ] Infiniband SL to use for communication to specified + * remote endpoint. + * + * component object: PSM endpoint (@ ref psm_epaddr_t) address. + * option value: SL used to communicate with remote endpoint. (0 <= SL < 15) + */ + +/* PSM_COMPONENT_MQ options (deprecates psm_mq_set|getopt) */ +/* MQ options that can be set in psm_mq_init and psm_{set,get}_opt */ +#define PSM_MQ_OPT_RNDV_IB_SZ 0x301 +#define PSM_MQ_RNDV_IPATH_SZ PSM_MQ_OPT_RNDV_IB_SZ + /* [uint32_t ] Size at which to start enabling rendezvous + * messaging for InfiniPath messages (if unset, defaults to values + * between 56000 and 72000 depending on the system configuration) + * + * component object: PSM Matched Queue (psm_mq_t). + * option value: Size at which to switch to rendezvous protocol. 
+ */
+
+#define PSM_MQ_OPT_RNDV_SHM_SZ 0x302
+#define PSM_MQ_RNDV_SHM_SZ PSM_MQ_OPT_RNDV_SHM_SZ
+ /* [uint32_t ] Size at which to start enabling
+ * rendezvous messaging for shared memory (intra-node) messages (if
+ * unset, defaults to 64000 bytes).
+ *
+ * component object: PSM Matched Queue (psm_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+
+#define PSM_MQ_OPT_SYSBUF_MYBYTES 0x303
+#define PSM_MQ_MAX_SYSBUF_MBYTES PSM_MQ_OPT_SYSBUF_MYBYTES
+ /* [uint32_t ] Maximum number of bytes to allocate for unexpected
+ * messages.
+ *
+ * component object: PSM Matched Queue (psm_mq_t).
+ * option value: Maximum number of bytes to allocate for unexpected messages.
+ * Messages that would cause memory allocation to exceed this amount will be
+ * dropped.
+ */
+
+
+/* PSM_COMPONENT_AM options */
+#define PSM_AM_OPT_FRAG_SZ 0x401
+
+
+/* Set an option for a PSM component
+ *
+ * Function to set the value of a PSM component option
+ *
+ * [in] component Type of PSM component for which to set the option
+ * [in] component_obj Opaque component-specific object to apply the set
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * [in] optname Name of component option to set. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * [in] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size and format.
+ * [in] optlen Size of the memory region pointed to by optval.
+ *
+ * [returns] PSM_OK if option could be set.
+ * [returns] PSM_PARAM_ERR if the component or optname are not valid.
+ * [returns] PSM_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm_error_t
+psm_setopt(psm_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen);
+
+/* Get an option for a PSM component
+ *
+ * Function to get the value of a PSM component option
+ *
+ * [in] component Type of PSM component for which to get the option
+ * [in] component_obj Opaque component-specific object to apply the get
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * [in] optname Name of component option to get. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * [out] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a valid memory region.
+ * [in,out] optlen This is a value-result parameter initially containing
+ * the size of the memory region pointed to by optval and
+ * modified to return the actual size of optval.
+ *
+ * [returns] PSM_OK if option value could be retrieved successfully.
+ * [returns] PSM_PARAM_ERR if the component or optname are not valid.
+ * [returns] PSM_NO_MEMORY if the memory region optval is of insufficient size.
+ * optlen contains the required memory region size for
+ * optname value.
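+ *
+ * As an illustrative sketch (mq is an MQ handle assumed to come from
+ * psm_mq_init; the threshold value is arbitrary and error handling is
+ * elided):
+ *
+ * @verbatim
+ * uint32_t rndv = 0;
+ * uint64_t len = sizeof(rndv);
+ *
+ * // Read the current shared-memory rendezvous threshold; len is a
+ * // value-result parameter and returns the actual value size.
+ * psm_getopt(PSM_COMPONENT_MQ, mq, PSM_MQ_OPT_RNDV_SHM_SZ, &rndv, &len);
+ *
+ * // Raise the threshold; for psm_setopt, optlen is passed by value.
+ * rndv = 128000;
+ * psm_setopt(PSM_COMPONENT_MQ, mq, PSM_MQ_OPT_RNDV_SHM_SZ,
+ *            &rndv, sizeof(rndv));
+ * @endverbatim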
+ *
+ */
+psm_error_t
+psm_getopt(psm_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen);
+
+/* Datatype for end-point information */
+typedef struct psm_epinfo {
+ psm_ep_t ep; /* The ep for this end-point */
+ psm_epid_t epid; /* The epid for this end-point */
+ psm_uuid_t uuid; /* The UUID for this end-point */
+ char uuid_str[64]; /* String representation of the UUID for this end-point */
+} psm_epinfo_t;
+
+/* Datatype for end-point connection */
+typedef struct psm_epconn {
+ psm_epaddr_t addr; /* The epaddr for this connection */
+ psm_ep_t ep; /* The ep for this connection */
+ psm_mq_t mq; /* The mq for this connection */
+} psm_epconn_t;
+
+/* Query PSM for end-point information.
+ *
+ * Function to query PSM for end-point information. This allows retrieval of end-point
+ * information in cases where the caller does not have access to the results of psm_ep_open().
+ * In single-rail mode PSM will use a single end-point. In multi-rail mode, PSM will use an
+ * end-point per rail.
+ *
+ * [in,out] num_of_epinfo On input, sizes the available number of entries in array_of_epinfo.
+ * On output, specifies the returned number of entries in array_of_epinfo.
+ * [out] array_of_epinfo Returns end-point information structures.
+ *
+ * [pre] PSM is initialized and the end-point has been opened.
+ *
+ * [returns] PSM_OK indicates success.
+ * [returns] PSM_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * [returns] PSM_EP_WAS_CLOSED if PSM end-point is closed or does not exist.
+ */
+psm_error_t
+psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo);
+
+/* Query PSM for end-point connections.
+ *
+ * Function to query PSM for end-point connections. This allows retrieval of end-point
+ * connections in cases where the caller does not have access to the results of psm_ep_connect().
+ * The epid values can be found using psm_ep_query() so that each PSM process can determine
+ * its own epid. These values can then be distributed across the PSM processes so that each PSM
+ * process knows the epid for all other PSM processes.
+ *
+ * [in] epid The epid of a PSM process.
+ * [out] epconn The connection information for that PSM process.
+ *
+ * [pre] PSM is initialized and the end-point has been connected to this epid.
+ *
+ * [returns] PSM_OK indicates success.
+ * [returns] PSM_EP_WAS_CLOSED if PSM end-point is closed or does not exist.
+ * [returns] PSM_EPID_UNKNOWN if the epid value is not known to PSM.
+ */
+psm_error_t
+psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm.supp b/psm.supp
new file mode 100644
index 0000000..3113ad0
--- /dev/null
+++ b/psm.supp
@@ -0,0 +1,58 @@
+
+# userinit
+{
+ syscall_ipath_userinit
+ Memcheck:Param
+ write(buf)
+ fun:__write_nocancel
+ fun:ipath_userinit
+}
+
+# syscall poll type
+{
+ syscall_poll_type
+ Memcheck:Param
+ write(buf)
+ obj:/lib64/libc*.so
+ fun:ipath_poll_type
+}
+
+# Tids de-allocation.
+{
+ syscall_tid_free
+ Memcheck:Param
+ write(buf)
+ obj:/lib64/libc*.so
+ fun:ips_tid_release
+}
+
+# Tids allocation.
+{ + syscall_tid_alloc + Memcheck:Param + write(buf) + obj:/lib64/libc*so + fun:ips_tid_acquire +} + +# really in QLogic MPI +{ + mpspawn_socket + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:psc_skt_sendN +} + +# gethostbyname on sles +{ + gethostbyname + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:get_mapping + fun:__nscd_get_map_ref + fun:nscd_gethst_r + fun:__nscd_gethostbyname_r + fun:gethostbyname_r@@GLIBC_2.2.5 +} diff --git a/psm_am.c b/psm_am.c new file mode 100644 index 0000000..d5db5c7 --- /dev/null +++ b/psm_am.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_am_internal.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +static int _ignore_handler(PSMI_AM_ARGS_DEFAULT) +{ + return 0; +} + +int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT) +{ + abort(); + return 0; +} + +psm_error_t +psmi_am_init_internal(psm_ep_t ep) +{ + int i; + psm_am_handler_fn_t *am_htable; + + ep->am_htable = + psmi_malloc(ep, UNDEFINED, + sizeof(psm_am_handler_fn_t) * PSMI_AM_NUM_HANDLERS); + if (ep->am_htable == NULL) + return PSM_NO_MEMORY; + + am_htable = (psm_am_handler_fn_t *) ep->am_htable; + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) + am_htable[i] = _ignore_handler; + + return PSM_OK; +} + +psm_error_t +__psm_am_register_handlers(psm_ep_t ep, + const psm_am_handler_fn_t *handlers, + int num_handlers, int *handlers_idx) +{ + int i, j; + + /* For now just assign any free one */ + for (i = 0, j = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + if (ep->am_htable[i] == _ignore_handler) { + ep->am_htable[i] = handlers[j]; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) + ep->am_htable[handlers_idx[i]] = _ignore_handler; + + return psmi_handle_error(ep, PSM_EP_NO_RESOURCES, "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else + return PSM_OK; +} +PSMI_API_DECL(psm_am_register_handlers) + +psm_error_t +__psm_am_request_short(psm_epaddr_t epaddr, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_error_t err; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + err = ptlc->am_short_request(epaddr, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_am_request_short) + +psm_error_t +__psm_am_reply_short(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_error_t err; + struct psmi_am_token *tok = (struct psmi_am_token *)token; + psm_epaddr_t epaddr = tok->epaddr_from; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + psmi_assert_always(token != NULL); + + /* No locking here since we are already within handler context and already + * locked */ + + PSMI_ASSERT_INITIALIZED(); + + err = ptlc->am_short_reply(token, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + return err; +} +PSMI_API_DECL(psm_am_reply_short) + +psm_error_t +__psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out) +{ + struct psm_am_parameters params; + size_t s; + uint32_t frag_sz; + /* This is the same calculation as PSM_AM_OPT_FRAG_SZ in psm_utils.c */ + frag_sz = (ep && psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ? 
+ (ep->context.base_info.spi_piosize -
+ IPATH_MESSAGE_HDR_SIZE) : 2048;
+ params.max_handlers = PSMI_AM_NUM_HANDLERS;
+ params.max_nargs = PSMI_AM_MAX_ARGS;
+ params.max_request_short = frag_sz;
+ params.max_reply_short = frag_sz;
+ memset(parameters, 0, sizeof_parameters_in);
+ s = min(sizeof(params), sizeof_parameters_in);
+ memcpy(parameters, &params, s);
+ *sizeof_parameters_out = s;
+ return PSM_OK;
+}
+PSMI_API_DECL(psm_am_get_parameters)
diff --git a/psm_am.h b/psm_am.h
new file mode 100644
index 0000000..c91c66e
--- /dev/null
+++ b/psm_am.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PSM_AM_H
+#define PSM_AM_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Datatype for an index number representing an active message handler */
+typedef uint32_t psm_handler_t;
+
+/* Datatype for a token for an active message handler. */
+typedef void *psm_am_token_t;
+
+/* PSM AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM_AM_FLAG_NONE 0 /* This flag should be used when no other PSM AM flags are needed. */
+#define PSM_AM_FLAG_ASYNC 1 /* This flag indicates no need to copy source data. */
+#define PSM_AM_FLAG_NOREPLY 2 /* This flag indicates that the handler for this AM request is guaranteed not to generate a reply. */
+
+/* The psm_amarg type represents the type of an AM argument. This is
+ * a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ * fields or one 64-bit field for the convenience of code using the PSM AM
+ * interface.
+ */
+typedef
+struct psm_amarg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ };
+}
+psm_amarg_t;
+
+/* The AM handler function type
+ *
+ * psm_am_handler_fn_t is the datatype for an AM handler. PSM AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * [in] token This is an opaque token value passed into a handler.
+ * A request handler may send at most one reply back to the original
+ * requestor, and must pass this value as the token parameter
+ * to the psm_am_reply_short() function. A reply handler is also
+ * passed a token value, but must not attempt to reply.
+ * [in] epaddr The end-point address of the other party in this AM transaction.
+ * [in] args A pointer to the arguments provided to this handler.
+ * [in] nargs The number of arguments.
+ * [in] src A pointer to the data payload provided to this handler.
+ * [in] len The length of the data payload in bytes.
+ *
+ * [returns] 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm_am_handler_fn_t)(psm_am_token_t token, psm_epaddr_t epaddr,
+ psm_amarg_t *args, int nargs,
+ void *src, uint32_t len);
+
+/* Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The call-back
+ * has a context pointer which is provided along with the call-back function
+ * pointer when the initiator generates the request or reply. This approach will
+ * typically give higher performance than using an AM request or reply to achieve
+ * the same effect, though note that no additional information can be passed
+ * from the target side back to the initiator side with the completion handler
+ * approach.
+ *
+ * [in] context A context pointer.
+ * [returns] void This handler has no return result.
+ */
+typedef
+void (*psm_am_completion_fn_t)(void *context);
+
+/* Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of handlers
+ * that can be registered is limited to the max_handlers value returned by
+ * psm_am_get_parameters(). Handlers are associated with a PSM end-point. The
+ * handlers are allocated index numbers in the handler table for that end-point.
+ * The allocated index for the handler function in handlers[i] is returned in
+ * handlers_idx[i] for i in [0, num_handlers). These handler index values are
+ * used in the psm_am_request_short() and psm_am_reply_short() functions.
+ *
+ * [in] ep End-point value
+ * [in] handlers Array of handler functions
+ * [in] num_handlers Number of handlers (sizes the handlers and handlers_idx arrays)
+ * [out] handlers_idx Used to return handler index mapping table
+ *
+ * [returns] PSM_OK Indicates success
+ * [returns] PSM_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm_error_t psm_am_register_handlers(psm_ep_t ep,
+ const psm_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx);
+
+/* Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in bytes
+ * to max_request_short returned by the psm_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the argument
+ * pointer will not be dereferenced. If payload is not required, set the payload size
+ * to 0 and the payload pointer will not be dereferenced.
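+ *
+ * As an illustrative sketch of the request side (epaddr comes from
+ * psm_ep_connect and hidx from psm_am_register_handlers; both names are
+ * this example's, and error handling is elided):
+ *
+ * @verbatim
+ * psm_amarg_t arg;
+ * char payload[] = "hello";
+ *
+ * arg.u64w0 = 42; // a single 64-bit argument
+ *
+ * // Run remote handler hidx; without PSM_AM_FLAG_ASYNC the payload
+ * // may be copied internally, so it can be reused on return.
+ * psm_am_request_short(epaddr, hidx, &arg, 1, payload, sizeof(payload),
+ *                      PSM_AM_FLAG_NONE, NULL, NULL);
+ * @endverbatim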
+ *
+ * Optionally a completion function and completion context pointer can be provided,
+ * and a local call-back will be made to that function passing in that context
+ * pointer once remote execution of the handler has completed. If the completion
+ * call-back is not required, the handler should be specified as NULL and the
+ * pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with bitwise-or:
+ * PSM_AM_FLAG_NONE - No flags
+ * PSM_AM_FLAG_ASYNC - Indicates no need to copy source data
+ * PSM_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to generate a reply
+ *
+ * The PSM AM implementation will not dereference the args pointer after return from
+ * this function. If PSM_AM_FLAG_ASYNC is not provided, the PSM AM implementation will
+ * not dereference the src pointer after return from this function. This may require the
+ * implementation to take a copy of the payload if the request cannot be issued immediately.
+ * However, if PSM_AM_FLAG_ASYNC is provided then a copy will not be taken and the PSM AM
+ * implementation retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler call-back, or
+ * through an AM handler associated with an AM reply.
+ *
+ * The PSM_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that a reply will
+ * not be generated. Use of this flag is optional, but it may enable a performance optimization
+ * in this case by indicating that reply state is not required.
+ *
+ * [in] epaddr End-point address to run handler on
+ * [in] handler Index of handler to run
+ * [in] args Array of arguments to be provided to the handler
+ * [in] nargs Number of arguments to be provided to the handler
+ * [in] src Pointer to the payload to be delivered to the handler
+ * [in] len Length of the payload in bytes
+ * [in] flags These are PSM AM flags and may be combined together with bitwise-or
+ * [in] completion_fn The completion function to be called locally when the remote handler is complete
+ * [in] completion_ctxt User-provided context pointer to be passed to the completion handler
+ *
+ * [returns] PSM_OK indicates success.
+ */
+psm_error_t
+psm_am_request_short(psm_epaddr_t epaddr, psm_handler_t handler,
+ psm_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM request.
+ * If the AM request uses the PSM_AM_FLAG_NOREPLY flag, the AM handler must not
+ * call this function. Otherwise, the AM request handler may call psm_am_reply_short()
+ * at most once, and must pass in the token value that it received in its own handler
+ * call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in bytes
+ * to max_reply_short returned by the psm_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the argument
+ * pointer will not be dereferenced. If payload is not required, set the payload size
+ * to 0 and the payload pointer will not be dereferenced.
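+ *
+ * For instance, a request handler might reply as follows (sketch only;
+ * reply_hidx is a hypothetical handler index registered on the
+ * requestor's side):
+ *
+ * @verbatim
+ * int my_request_handler(psm_am_token_t token, psm_epaddr_t epaddr,
+ *                        psm_amarg_t *args, int nargs,
+ *                        void *src, uint32_t len)
+ * {
+ *     psm_amarg_t status;
+ *     status.u32w0 = 0; // report success back to the requestor
+ *     psm_am_reply_short(token, reply_hidx, &status, 1, NULL, 0,
+ *                        PSM_AM_FLAG_NONE, NULL, NULL);
+ *     return 0;
+ * }
+ * @endverbatim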
+ *
+ * Optionally a completion function and completion context pointer can be provided,
+ * and a local call-back will be made to that function passing in that context
+ * pointer once remote execution of the handler has completed. If the completion
+ * call-back is not required, the handler should be specified as NULL and the
+ * pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with bitwise-or:
+ * PSM_AM_FLAG_NONE - No flags
+ * PSM_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM AM implementation will not dereference the args pointer after return from
+ * this function. If PSM_AM_FLAG_ASYNC is not provided, the PSM AM implementation will
+ * not dereference the src pointer after return from this function. This may require the
+ * implementation to take a copy of the payload if the reply cannot be issued immediately.
+ * However, if PSM_AM_FLAG_ASYNC is provided then a copy will not be taken and the PSM AM
+ * implementation retains ownership of the payload src memory until the reply is locally
+ * complete. Local completion can be determined using the completion handler call-back.
+ *
+ * [in] token Token value provided to the AM handler that is generating the reply.
+ * [in] handler Index of handler to run
+ * [in] args Array of arguments to be provided to the handler
+ * [in] nargs Number of arguments to be provided to the handler
+ * [in] src Pointer to the payload to be delivered to the handler
+ * [in] len Length of the payload in bytes
+ * [in] flags These are PSM AM flags and may be combined together with bitwise-or
+ * [in] completion_fn The completion function to be called locally when the remote handler is complete
+ * [in] completion_ctxt User-provided context pointer to be passed to the completion handler
+ *
+ * [returns] PSM_OK indicates success.
+ */
+psm_error_t
+psm_am_reply_short(psm_am_token_t token, psm_handler_t handler,
+ psm_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* AM parameters
+ *
+ * This structure is used to return PSM AM implementation-specific parameter
+ * values back to the caller of the psm_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ * max_handlers >= 64,
+ * max_nargs >= 2,
+ * max_request_short >= 256 and
+ * max_reply_short >= 256.
+ */
+struct psm_am_parameters {
+ uint32_t max_handlers; /* Maximum number of handlers that can be registered. */
+ uint32_t max_nargs; /* Maximum number of arguments to an AM handler. */
+ uint32_t max_request_short; /* Maximum number of bytes in a request payload. */
+ uint32_t max_reply_short; /* Maximum number of bytes in a reply payload. */
+};
+
+/* Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * [in] ep The end-point value returned by psm_ep_open().
+ * [out] parameters Pointer to the struct where the parameters will be returned.
+ * [in] sizeof_parameters_in The size in bytes of the struct provided by the caller.
+ * [out] sizeof_parameters_out The size in bytes of the struct returned by PSM.
+ *
+ * [returns] PSM_OK indicates success.
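+ *
+ * As an illustrative sketch (my_payload_len and the fallback routine
+ * are hypothetical; error handling elided):
+ *
+ * @verbatim
+ * struct psm_am_parameters params;
+ * size_t out_size;
+ *
+ * psm_am_get_parameters(ep, &params, sizeof(params), &out_size);
+ *
+ * // Size request payloads against the implementation's limit.
+ * if (my_payload_len > params.max_request_short)
+ *     fall_back_to_mq(); // e.g. send the data over the MQ instead
+ * @endverbatim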
+ */ +psm_error_t +psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/psm_am_internal.h b/psm_am_internal.h new file mode 100644 index 0000000..dbe1bbb --- /dev/null +++ b/psm_am_internal.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSM_AM_INTERNAL_H +#define _PSM_AM_INTERNAL_H + +#define PSMI_AM_MAX_ARGS 8 +#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */ + +#define PSMI_AM_ARGS_DEFAULT psm_am_token_t token, psm_epaddr_t epaddr, \ + psm_amarg_t *args, int nargs, \ + void *src, uint32_t len + +struct psmi_am_token { + psm_epaddr_t epaddr_from; + uint32_t flags; + /* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */ + uint32_t can_reply; + + /* PTLs may add other stuff here */ +}; + +PSMI_ALWAYS_INLINE( +psm_am_handler_fn_t +psm_am_get_handler_function(psm_ep_t ep, psm_handler_t handler_idx)) +{ + int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS-1); + psm_am_handler_fn_t fn = (psm_am_handler_fn_t) ep->am_htable[hidx]; + psmi_assert_always(fn != NULL); + return fn; +} + +/* PSM internal initialization */ +psm_error_t psmi_am_init_internal(psm_ep_t ep); + +#endif diff --git a/psm_context.c b/psm_context.c new file mode 100644 index 0000000..390b49a --- /dev/null +++ b/psm_context.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+
+#include "psm_user.h"
+
+#ifdef __MIC__
+#include
+#endif
+
+#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1
+static int psmi_get_hca_selection_algorithm(void);
+static psm_error_t psmi_init_userinfo_params(psm_ep_t ep,
+ int unit_id, int port,
+ psm_uuid_t const unique_job_key,
+ struct ipath_user_info *user_info);
+
+psm_error_t
+psmi_context_interrupt_set(psmi_context_t *context, int enable)
+{
+ int poll_type;
+ int ret;
+
+ if (( enable && (context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)) ||
+ (!enable && !(context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)))
+ return PSM_OK;
+
+ if (enable)
+ poll_type = IPATH_POLL_TYPE_URGENT;
+ else
+ poll_type = 0;
+
+ ret = ipath_poll_type(context->ctrl, poll_type);
+
+ if (ret != 0)
+ return PSM_EP_NO_RESOURCES;
+ else {
+ if (enable)
+ context->runtime_flags |= PSMI_RUNTIME_INTR_ENABLED;
+ else
+ context->runtime_flags &= ~PSMI_RUNTIME_INTR_ENABLED;
+
+ return PSM_OK;
+ }
+}
+
+int
+psmi_context_interrupt_isenabled(psmi_context_t *context)
+{
+ return context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED;
+}
+
+static
+char *
+runtime_flags_string(char *buf, size_t len, uint32_t runtime_flags)
+{
+ size_t off = 0;
+ int flag = 0;
+ char *s;
+
+ psmi_assert(len > 0 && buf != NULL);
+ buf[0] = '\0';
+
+ for (flag = 0; off < len && flag < 32; flag++) {
+ /* Decode each set bit into a short name; only the flags referenced
+ * elsewhere in this file are decoded here */
+ switch ((1 << flag) & runtime_flags) {
+ case IPATH_RUNTIME_FORCE_PIOAVAIL: s = "force_pioavail"; break;
+ case IPATH_RUNTIME_PIO_REGSWAPPED: s = "pio_regswapped"; break;
+ default: s = NULL; break;
+ }
+ if (s != NULL)
+ off += snprintf(buf + off, len - off, "%s,", s);
+ }
+ /* Trim the trailing comma, if any */
+ if (off > 1) {
+ size_t c = strlen(buf);
+ buf[c - 1] = '\0';
+ }
+ return buf;
+}
+
+psm_error_t
+psmi_context_open(const psm_ep_t ep, long unit_id, long port,
+ psm_uuid_t const job_key,
+ int64_t timeout_ns, psmi_context_t *context)
+{
+ long open_timeout = 0;
+ int lid;
+ uint64_t gid_hi, gid_lo;
+ char dev_name[MAXPATHLEN];
+ psm_error_t err = PSM_OK;
+ uint32_t driver_verno, hca_type;
+ int retry_delay = 0;
+
+ /*
+ * If shared contexts are enabled, try our best to schedule processes
+ * across one or many devices
+ */
+
+ if (timeout_ns > 0)
+ open_timeout = (long)(timeout_ns/MSEC_ULL);
+ if (unit_id != IPATH_UNIT_ID_ANY && unit_id >= 0)
+ snprintf(dev_name, sizeof(dev_name), "%s%u", "/dev/ipath", (unsigned)unit_id);
+ else
+ snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath");
+
+ context->fd = ipath_context_open(unit_id, port, open_timeout);
+ if (context->fd == -1) {
+ err = psmi_handle_error(NULL,
PSM_EP_DEVICE_FAILURE, + "PSM can't open %s for reading and writing", + dev_name); + goto bail; + } + + if ((err = psmi_init_userinfo_params(ep, (int) unit_id, (int)port, job_key, + &context->user_info))) + goto bail; + +retry_open: + context->ctrl = ipath_userinit(context->fd, &context->user_info, + &context->base_info); + + if (!context->ctrl) { + + /* ipath_userinit returns EBUSY on ipath and ENODEV on qib when + * no contexts are available. Handle both drivers. + */ + if ((errno != ENETDOWN) && (errno != EBUSY) && (errno != ENODEV)) + goto fail; + + if ((open_timeout == -1L) || (errno == EBUSY) || (errno == ENODEV)) { + if(!retry_delay) { + _IPATH_PRDBG("retrying open: %s, network down\n", dev_name); + retry_delay = 1; + } + else if(retry_delay<17) + retry_delay <<= 1; + + /* If device is still busy after 3 attempts give up. No contexts + * available. + */ + if (((errno == EBUSY) || (errno == ENODEV)) && retry_delay > 4) + goto fail; + + sleep(retry_delay); + goto retry_open; + } + + err = psmi_handle_error(NULL, PSM_EP_NO_NETWORK, + "can't open %s, network down", dev_name); + goto bail; + } + + if ((lid = ipath_get_port_lid(context->base_info.spi_unit, + context->base_info.spi_port)) == -1) { + err = psmi_handle_error(NULL, + PSM_EP_DEVICE_FAILURE, + "Can't get InfiniBand LID in psm_ep_open: is SMA running?"); + goto fail; + } + if (ipath_get_port_gid(context->base_info.spi_unit, + context->base_info.spi_port, + &gid_hi, &gid_lo) == -1) { + err = psmi_handle_error(NULL, + PSM_EP_DEVICE_FAILURE, + "Can't get InfiniBand GID in psm_ep_open: is SMA running?"); + goto fail; + } + ep->unit_id = context->base_info.spi_unit; + ep->portnum = context->base_info.spi_port; + ep->gid_hi = gid_hi; + ep->gid_lo = gid_lo; + + context->ep = (psm_ep_t) ep; + context->runtime_flags = context->base_info.spi_runtime_flags; + + /* Get type of hca assigned to context */ + hca_type = psmi_get_hca_type(context); + + /* Endpoint out_sl contains the default SL to use for this endpoint. */ + context->epid = + PSMI_EPID_PACK_EXT(lid, context->base_info.spi_context, + context->base_info.spi_subcontext, + hca_type, ep->out_sl); + + /* + * With driver 1.5 (release 2.1), assume we always need the force. + * Starting with 1.6, the flag is based on chip rev. + */ + driver_verno = context->base_info.spi_sw_version; + if (driver_verno == PSMI_MAKE_DRIVER_VERSION(1, 5)) + context->runtime_flags |= IPATH_RUNTIME_FORCE_PIOAVAIL; + + /* + * We only know of register-swapped pio bufs before driver 1.6 + * Starting with 1.6, the flag is based on chip rev. 
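
(An aside on the epid encoding used just above: PSMI_EPID_PACK_EXT folds the LID, context, subcontext, HCA type and service level into the single 64-bit epid. The real bit layout is defined elsewhere in this tree and is not part of this excerpt; the sketch below only illustrates the mask-and-shift technique, with made-up field widths.)

    /* Hypothetical epid layout, for illustration only -- not PSM's real format. */
    #define EPID_PACK(lid, ctxt, subctxt, hca, sl)            \
            (((uint64_t)(lid)      & 0xffff)        |         \
             (((uint64_t)(ctxt)    & 0xff)   << 16) |         \
             (((uint64_t)(subctxt) & 0xf)    << 24) |         \
             (((uint64_t)(hca)     & 0xf)    << 28) |         \
             (((uint64_t)(sl)      & 0xf)    << 32))
    #define EPID_GET_LID(epid) ((uint64_t)(epid) & 0xffff)
    #define EPID_GET_SL(epid)  (((uint64_t)(epid) >> 32) & 0xf)

(Each field is masked before shifting, so accessors like the PSMI_EPID_GET_* macros seen elsewhere in this patch can recover it with the inverse shift and mask.)
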
+ */
+    if (driver_verno < PSMI_MAKE_DRIVER_VERSION(1, 6))
+        context->runtime_flags |= IPATH_RUNTIME_PIO_REGSWAPPED;
+
+    /* We are overloading these runtime flags for PSM options, so make sure
+     * something can never go horribly bad */
+    psmi_assert_always(context->runtime_flags < _PSMI_RUNTIME_LAST);
+    context->spi_status = (volatile uint64_t *)
+        context->ctrl->__ipath_spi_status;
+
+    {
+        char buf[192];
+        _IPATH_PRDBG("Opened context %d.%d on device %s (LID=%d,epid=%llx), "
+             "runtime_flags=0x%x (%s), driver=%d.%d\n",
+             context->base_info.spi_context,
+             context->base_info.spi_subcontext, dev_name, lid,
+             (long long) context->epid, context->runtime_flags,
+             runtime_flags_string(buf, sizeof buf, context->runtime_flags),
+             context->base_info.spi_sw_version >> 16,
+             context->base_info.spi_sw_version & 0xffff);
+    }
+    goto ret;
+
+fail:
+    switch (errno) {
+    case ENOENT:
+    case ENODEV:
+        err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE,
+            "%s not found", dev_name);
+        break;
+    case ENXIO:
+        err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
+            "%s failure", dev_name);
+        break;
+    case EBUSY:
+        err = psmi_handle_error(NULL, PSM_EP_NO_PORTS_AVAIL,
+            "No free InfiniPath contexts available on %s", dev_name);
+        break;
+    default:
+        err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
+            "Driver initialization failure on %s", dev_name);
+        break;
+    }
+bail:
+    _IPATH_PRDBG("%s open failed: %d (%s)\n", dev_name, err, strerror(errno));
+    if (context->fd != -1) {
+        ipath_context_close(context->fd);
+        context->fd = -1;
+    }
+ret:
+    return err;
+}
+
+psm_error_t
+psmi_context_close(psmi_context_t *context)
+{
+    if (context->fd >= 0) {
+        ipath_context_close(context->fd);
+        context->fd = -1;
+    }
+    return PSM_OK;
+}
+
+/*
+ * This function works whether a context is initialized or not in a psm_ep.
+ *
+ * Returns one of
+ *
+ * PSM_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM_OK_NO_PROGRESS: Cable pulled
+ * PSM_EP_NO_NETWORK: No network, no lid, ...
+ * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status
+ * As of 7322-ready driver, need to check port-specific qword for IB
+ * as well as older unit-only.
For now, we don't have the port interface + * defined, so just check port 0 qword for spi_status + */ + +#define STATUS_MASK (IPATH_STATUS_CHIP_PRESENT | \ + IPATH_STATUS_HWERROR | \ + IPATH_STATUS_IB_CONF | \ + IPATH_STATUS_IB_READY) + +#define STATUS_NO_ERROR_VAL (IPATH_STATUS_CHIP_PRESENT | \ + IPATH_STATUS_IB_CONF | \ + IPATH_STATUS_IB_READY) +psm_error_t +psmi_context_check_status(const psmi_context_t *contexti) +{ + psm_error_t err = PSM_OK; + uint64_t status, ibstatus; + char *errmsg = NULL; + psmi_context_t *context = (psmi_context_t *) contexti; + + if (context->spi_status == NULL) + goto ret; + + status = context->spi_status[0]; + ibstatus = context->spi_status[1]; + + /* Fatal chip-related errors */ + if ( !(status & IPATH_STATUS_CHIP_PRESENT) || + (status & (IPATH_STATUS_HWERROR))) { + + err = PSM_EP_DEVICE_FAILURE; + if (err != context->spi_status_lasterr) { /* report once */ + volatile char *errmsg_sp = (volatile char *)&context->spi_status[2]; + if (*errmsg_sp) + psmi_handle_error(context->ep, err, + "Hardware problem: %s", errmsg_sp); + else { + if (status & IPATH_STATUS_HWERROR) + errmsg = "Hardware error"; + else + errmsg = "Hardware not found"; + + psmi_handle_error(context->ep, err, errmsg, "%s"); + } + } + } + + /* Fatal network-related errors */ + else if (!(status & IPATH_STATUS_IB_CONF) && + !(ibstatus & IPATH_STATUS_IB_CONF)) { + err = PSM_EP_NO_NETWORK; + if (err != context->spi_status_lasterr) { /* report once */ + volatile char *errmsg_sp = (volatile char *)&context->spi_status[1]; + psmi_handle_error(context->ep, err, + "%s", *errmsg_sp ? errmsg_sp : "Network down"); + } + } + + /* These errors are not fatal, they are log only */ + else if (!(status & IPATH_STATUS_IB_READY) && + !(ibstatus & IPATH_STATUS_IB_READY)) { + err = PSM_OK_NO_PROGRESS; /* Cable pulled, switch rebooted, ... */ + if (err != context->spi_status_lasterr) { /* report once */ +#if 0 + psmi_handle_error(PSMI_EP_LOGEVENT, PSM_EP_NO_NETWORK, + "IB Link is down"); +#endif + } + } + + if (err == PSM_OK && context->spi_status_lasterr != PSM_OK) + context->spi_status_lasterr = PSM_OK; /* clear error */ + else if (err != PSM_OK) + context->spi_status_lasterr = err; /* record error */ + +ret: + return err; +} + +/* + * Prepare user_info params for driver open, used only in psmi_context_open + */ +static +psm_error_t +psmi_init_userinfo_params(psm_ep_t ep, int unit_id, int port, + psm_uuid_t const unique_job_key, + struct ipath_user_info *user_info) +{ + /* static variables, shared among rails */ + static int shcontexts_enabled = -1, rankid, nranks; + static int subcontext_id_start = -1; + + int avail_contexts = 0, max_contexts, ask_contexts, ranks_per_context = 0; + uint32_t job_key; + uint16_t *jkp; + psm_error_t err = PSM_OK; + union psmi_envvar_val env_maxctxt, env_ranks_per_context; + + memset(user_info, 0, sizeof *user_info); + user_info->spu_userversion = IPATH_USER_SWVERSION; + user_info->spu_subcontext_id = 0; + user_info->spu_subcontext_cnt = 0; + user_info->spu_port_alg = psmi_get_hca_selection_algorithm(); + + if (shcontexts_enabled == -1) { + shcontexts_enabled = psmi_sharedcontext_params(&nranks, &rankid); + } + + if (!shcontexts_enabled) + return err; + + avail_contexts = ipath_get_num_contexts(unit_id); + jkp = (uint16_t *) unique_job_key; + + /* Use a unique subcontext id based on uuid. 
This is just to optimistically + * prevent sharing a context across two unrelated jobs that would start at the + * same time */ + job_key = ((jkp[2] ^ jkp[3]) >> 8) | ((jkp[0] ^ jkp[1]) << 8); + job_key ^= ((jkp[6] ^ jkp[7]) >> 8) | ((jkp[4] ^ jkp[5]) << 8); + /* comment out, because it has more chance to generate the same job_key for + * two unrelated jobs that would start at the same time, and causes context + * allocation failure */ + //job_key &= ~0xff; /* just to make more readable */ + + if (avail_contexts == 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE, + "PSM found 0 available contexts on InfiniPath device(s)."); + goto fail; + } + + /* See if the user wants finer control over context assignments */ + if (!psmi_getenv("PSM_SHAREDCONTEXTS_MAX", + "Maximum number of contexts for this PSM job", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) avail_contexts, + &env_maxctxt)) { + max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */ + ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */ + } + else + ask_contexts = max_contexts = avail_contexts; + + if (!psmi_getenv("PSM_RANKS_PER_CONTEXT", + "Number of ranks per context", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) 1, + &env_ranks_per_context)) { + ranks_per_context = max(env_ranks_per_context.e_int, 1); + ranks_per_context = min(ranks_per_context, INFINIPATH_MAX_SUBCONTEXT); + } + + /* + * See if we could get a valid local rank. If not, pre-attach to the + * shm segment to obtain a unique shmidx. + */ + if (rankid == -1) { + if ((err = psmi_shm_attach(ep, &rankid))) + goto fail; + } + + /* + * See if we could get a valid ppn. If not, approximate it to be the + * number of cores. + */ + if (nranks == -1) { + long nproc = sysconf(_SC_NPROCESSORS_ONLN); + if (nproc < 1) + nranks = 1; + else + nranks = nproc; + } + + /* + * Make sure that our guesses are good educated guesses + */ + if (rankid >= nranks) { + _IPATH_PRDBG("PSM_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n", + rankid, nranks); + goto fail; + } + + if (ranks_per_context) { + int contexts = (nranks + ranks_per_context - 1) / ranks_per_context; + if (contexts > ask_contexts) { + err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE, + "Context required %d (nranks %d, ranks_per_context %d) " + "is less than allowed context %d which is either the " + "total avail_context %d or set by PSM_SHAREDCONTEXTS_MAX\n", + contexts, nranks, ranks_per_context, ask_contexts, avail_contexts); + goto fail; + } + ask_contexts = contexts; + } + + user_info->spu_port = port; /* requested IB port if > 0 */ + if (subcontext_id_start == -1) { +#ifdef __MIC__ + /* this query is moved from ipath_userinit() to here, + * it is also used there by ipath_cmd_assign_context() call. */ + if (scif_get_nodeIDs(NULL, 0, (uint16_t*)&user_info->_spu_scif_nodeid) < 0) { + _IPATH_INFO("scif_get_nodeIDs() call failed: %s\n", strerror(errno)); + goto fail; + } + /* + * When processes from different MICs to use the same HCA, and + * context sharing is enabled, we can't mix them, only processes + * from the same MIC node can share a context, so we need to + * generate a unique id. Here we use the queried nodeID to do it, + * avail_contexts is a constant for all MICs. 
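
(The context count computed above is the integer ceiling of nranks / ranks_per_context. As a minimal standalone illustration of the idiom, with our own names:)

    /* ceil(nranks / ranks_per_context) in integer arithmetic */
    static int contexts_needed(int nranks, int ranks_per_context)
    {
        return (nranks + ranks_per_context - 1) / ranks_per_context;
    }

(For example, 16 ranks at 3 ranks per context gives (16 + 2) / 3 = 6 contexts; adding ranks_per_context - 1 to the numerator rounds any remainder up.)
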
+ */
+        subcontext_id_start = avail_contexts * user_info->_spu_scif_nodeid;
+#else
+        subcontext_id_start = 0;
+#endif
+    }
+
+    /* "unique" id based on job key */
+    user_info->spu_subcontext_id = subcontext_id_start +
+        job_key + rankid % ask_contexts;
+    /* This is for multi-rail: when we set up a new rail, we cannot use
+     * the same subcontext ID as the previous rail, otherwise the driver
+     * will match the previous rail and fail.
+     */
+    subcontext_id_start += ask_contexts;
+
+    /* Need to compute how many *other* peers we will be sharing the
+     * context with */
+    if (nranks > ask_contexts) {
+        user_info->spu_subcontext_cnt = nranks / ask_contexts;
+        /* If ppn != multiple of contexts, some contexts get an uneven
+         * number of subcontexts */
+        if (nranks % ask_contexts > rankid % ask_contexts)
+            user_info->spu_subcontext_cnt++;
+        /* The case of 1 process "sharing" a context (giving 1 subcontext)
+         * is supported by the driver and PSM. However, there is no
+         * need to share in this case so disable context sharing. */
+        if (user_info->spu_subcontext_cnt == 1)
+            user_info->spu_subcontext_cnt = 0;
+    }
+    /* else spu_subcontext_cnt remains 0 and context sharing is disabled. */
+
+    _IPATH_PRDBG("PSM_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d,"
+         "max_contexts=%d,ask_contexts=%d,"
+         "ranks_per_context=%d,id=%u,peers=%d,port=%d\n",
+         rankid, nranks, avail_contexts, max_contexts, ask_contexts,
+         ranks_per_context,
+         (int) user_info->spu_subcontext_id,
+         (int) user_info->spu_subcontext_cnt,
+         (int) user_info->spu_port);
+fail:
+    return err;
+}
+
+int
+psmi_sharedcontext_params(int *nranks, int *rankid)
+{
+    union psmi_envvar_val enable_shcontexts;
+    char *ppn_env = NULL, *lrank_env = NULL, *c;
+
+    *rankid = -1;
+    *nranks = -1;
+
+#if 0
+    /* DEBUG: Used to selectively test possible shared context and shm-only
+     * settings */
+    unsetenv("PSC_MPI_NODE_RANK");
+    unsetenv("PSC_MPI_PPN");
+    unsetenv("MPI_LOCALRANKID");
+    unsetenv("MPI_LOCALRANKS");
+#endif
+
+    /* New name in 2.0.1, keep observing old name */
+    if (psmi_getenv("PSM_SHAREDCONTEXTS", "Enable shared contexts",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+            (union psmi_envvar_val)
+            PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
+            &enable_shcontexts))
+    {
+        psmi_getenv("PSM_SHAREDPORTS", "Enable shared contexts",
+            PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO,
+            (union psmi_envvar_val)
+            PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
+            &enable_shcontexts);
+    }
+
+    if (!enable_shcontexts.e_int)
+        return 0;
+
+    /* We support two types of syntaxes to let users give us a hint what
+     * our local rankid is.
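
(In shorthand, the environment probe this comment describes and the code below performs reduces to the following pattern; the helper name is ours, the variable names are the ones the code checks:)

    #include <stdlib.h>

    /* Return the launcher-provided local rank, or -1 if unknown. */
    static int guess_local_rankid(void)
    {
        const char *c;
        if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0')
            return atoi(c);
        if ((c = getenv("PSC_MPI_NODE_RANK")) && *c != '\0')
            return atoi(c);
        return -1;
    }
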
Moving towards MPI_, but still support PSC_ */ + if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') { + lrank_env = "MPI_LOCALRANKID"; + ppn_env = "MPI_LOCALNRANKS"; + } + else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') { + ppn_env = "PSC_MPI_PPN"; + lrank_env = "PSC_MPI_NODE_RANK"; + } + + if (ppn_env != NULL && lrank_env != NULL) { + union psmi_envvar_val env_rankid, env_nranks; + + psmi_getenv(lrank_env, "Shared context rankid", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_rankid); + + psmi_getenv(ppn_env, "Shared context numranks", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_nranks); + + *rankid = env_rankid.e_int; + *nranks = env_nranks.e_int; + } + return 1; +} + +static +int +psmi_get_hca_selection_algorithm(void) +{ + union psmi_envvar_val env_hca_alg; + int hca_alg = IPATH_PORT_ALG_ACROSS; + + /* If a specific unit is set in the environment, use that one. */ + psmi_getenv("IPATH_HCA_SELECTION_ALG", + "HCA Device Selection Algorithm to use. Round Robin (Default) " + "or Packed", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "Round Robin", + &env_hca_alg); + + if (!strcasecmp(env_hca_alg.e_str, "Round Robin")) + hca_alg = IPATH_PORT_ALG_ACROSS; + else if (!strcasecmp(env_hca_alg.e_str, "Packed")) + hca_alg = IPATH_PORT_ALG_WITHIN; + else { + _IPATH_ERROR("Unknown HCA selection algorithm %s. Defaulting to Round Robin " + "allocation of HCAs.\n", env_hca_alg.e_str); + hca_alg = IPATH_PORT_ALG_ACROSS; + } + + return hca_alg; +} diff --git a/psm_context.h b/psm_context.h new file mode 100644 index 0000000..635bb10 --- /dev/null +++ b/psm_context.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_IN_USER_H +#error psm_context.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_CONTEXT_H +#define _PSM_CONTEXT_H + +typedef +struct psmi_context { + int fd; /* driver fd */ + struct _ipath_ctrl *ctrl; /* driver opaque ipath_proto */ + psm_ep_t ep; /* psm ep handle */ + psm_epid_t epid; /* psm integral ep id */ + struct ipath_user_info user_info; + struct ipath_base_info base_info; + uint32_t runtime_flags; + uint32_t rcvthread_flags; + volatile uint64_t *spi_status; + psm_error_t spi_status_lasterr; +} +psmi_context_t; + +psm_error_t +psmi_context_open(const psm_ep_t ep, long unit_id, long port, + psm_uuid_t const job_key, + int64_t timeout_ns, psmi_context_t *context); + +psm_error_t +psmi_context_close(psmi_context_t *context); + +/* Check status of context */ +psm_error_t psmi_context_check_status(const psmi_context_t *context); + +psm_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable); +int psmi_context_interrupt_isenabled(psmi_context_t *context); + +int psmi_sharedcontext_params(int *nranks, int *rankid); +/* Runtime flags describe what features are enabled in hw/sw and which + * corresponding PSM features are being used. + * + * Hi 16 bits are PSM options + * Lo 16 bits are IPATH_RUNTIME options copied from (ipath_common.h) + */ +#define PSMI_RUNTIME_RCVTHREAD 0x80000000 +#define PSMI_RUNTIME_INTR_ENABLED 0x40000000 +#define PSMI_RUNTIME_LOCKHDRQ PSMI_RUNTIME_RCVTHREAD /* alias */ +/* Update _PSMI_RUNTIME_LAST to be the lowest runtime flag */ +#define _PSMI_RUNTIME_LAST PSMI_RUNTIME_INTR_ENABLED + +/* + * The receive thread can be initialized with optional behaviour. + * + * Note: Currently there is no optional behaviour. + */ +#define PSMI_RCVTHREAD_FLAG_ENABLED 0x1 + + +#endif /* PSM_CONTEXT_H */ diff --git a/psm_diags.c b/psm_diags.c new file mode 100644 index 0000000..4502cf1 --- /dev/null +++ b/psm_diags.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +typedef void (*memcpy_fn_t)(void *dst, const void *src, size_t n); +static int psmi_test_memcpy(memcpy_fn_t, const char *name); +static int psmi_test_epid_table(int numelems); + +int psmi_diags(void); + +#define diags_assert(x) do { \ + if (!(x)) { \ + _IPATH_ERROR("Diags assertion failure: %s\n", \ + #x); \ + goto fail; \ + } \ + } while (0) + +#define DIAGS_RETURN_PASS(str) \ + do { _IPATH_INFO("%s: PASSED %s\n", __func__, str); return 0; } \ + while (0) +#define DIAGS_RETURN_FAIL(str) \ + do { _IPATH_INFO("%s: FAILED %s\n", __func__, str); return 1; } \ + while (0) + +int +psmi_diags(void) +{ + int ret = 0; + ret |= psmi_test_epid_table(2048); + ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo"); + //ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); + + if (ret) + DIAGS_RETURN_FAIL(""); + else + DIAGS_RETURN_PASS(""); +} + +/* + * Hash table test + */ +#define NALLOC 1024 +static int +psmi_test_epid_table(int numelems) +{ + psm_epaddr_t *ep_array, epaddr, ep_alloc; + psm_epid_t *epid_array, epid_tmp; + psm_ep_t ep = (psm_ep_t) (uintptr_t) 0xabcdef00; + struct psmi_epid_table *tab; + int i, j; + + ep_alloc = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm_epaddr)); + ep_array = (psm_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm_epaddr *)); + epid_array = (psm_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(psm_epid_t)); + diags_assert(ep_alloc != NULL); + diags_assert(ep_array != NULL); + diags_assert(epid_array != NULL); + + srand(12345678); + + psmi_epid_init(); + tab = &psmi_epid_table; + + for (i = 0; i < numelems; i++) { + epid_array[i] = i; + ep_alloc[i].ep = ep; + ep_alloc[i].epid = epid_array[i]; + ep_array[i] = &ep_alloc[i]; + } + for (i = 0 ; i < numelems; i++) { + psmi_epid_add(ep, epid_array[i], ep_array[i]); + } + + /* Randomize epid_array */ + for (i = 0; i < numelems; i++) { + j = rand() % numelems; + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Lookup. */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + } + + /* Randomize epid_array again */ + for (i = 0; i < numelems; i++) { + j = rand() % numelems; + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Delete half */ + for (i = 0; i < numelems/2; i++) { + epaddr = psmi_epid_remove(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + } + /* Lookup other half -- expect non-NULL, then delete */ + for (i = numelems/2; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + epaddr = psmi_epid_remove(ep, epid_array[i]); + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + /* Lookup whole thing, expect done */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + for (i = 0; i < tab->tabsize; i++) { + diags_assert(tab->table[i].entry == NULL || + tab->table[i].entry == EPADDR_DELETED); + } + + /* Make sure we're not leaking memory somewhere... 
*/ + diags_assert(tab->tabsize > tab->tabsize_used && + tab->tabsize * PSMI_EPID_TABLOAD_FACTOR > + tab->tabsize_used); + + /* Only free on success */ + psmi_epid_fini(); + psmi_free(epid_array); + psmi_free(ep_array); + psmi_free(ep_alloc); + DIAGS_RETURN_PASS(""); + +fail: + /* Klocwork scan report memory leak. */ + psmi_epid_fini(); + if (epid_array) psmi_free(epid_array); + if (ep_array) psmi_free(ep_array); + if (ep_alloc) psmi_free(ep_alloc); + DIAGS_RETURN_FAIL(""); +} + +/* + * Memcpy correctness test + */ +static int memcpy_check_size (memcpy_fn_t fn, int *p, int *f, size_t n); +static void *memcpy_check_one (memcpy_fn_t fn, void *dst, void *src, size_t n); + +static int +psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name) +{ + const int CORNERS = 0; + const long long lo = 1; + const long long hi = 16 * 1024 * 1024; + const long long below = 32; + const long long above = 32; + long long n, m; + char buf[128]; + int ret = 0; + int memcpy_passed; + int memcpy_failed; + + memcpy_passed = 0; + memcpy_failed = 0; + + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + + for (n = lo; n <= hi; n <<= 1) { + _IPATH_INFO("%s %d align=0..16\n", memcpy_name, (int) n); + for (m = n - below; m <= n + above; m++) { + if (m == n) { + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, n); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + else if (CORNERS && m >= lo && m <= hi && m > (n >> 1) && + m < max(n, ((n << 1) - below))) + { + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, (size_t) m); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + } + } + + int total = memcpy_passed + memcpy_failed; + if (total > 0) { + _IPATH_INFO("%d memcpy tests with %d passed (%.2f%%) " + "and %d failed (%.2f%%)\n", + total, memcpy_passed, (100.0 * memcpy_passed) / total, + memcpy_failed, (100.0 * memcpy_failed) / total); + } + if (memcpy_failed) { + snprintf(buf, sizeof buf, "%s %.2f%% of tests memcpy_failed", + memcpy_name, (100.0 * memcpy_failed) / total); + DIAGS_RETURN_FAIL(buf); + } + else { + DIAGS_RETURN_PASS(memcpy_name); + } +} + +void *memcpy_check_one (memcpy_fn_t fn, void *dst, void *src, size_t n) +{ + int ok = 1; + unsigned int seed = (unsigned int) + ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n); + unsigned int state; + size_t i; + psmi_assert_always(n > 0); + memset(src, 0x55, n); + memset(dst, 0xaa, n); + srand(seed); + state = seed; + for (i = 0; i < n; i++) { + ((uint8_t *) src)[i] = (rand_r(&state) >> 16) & 0xff; + } + + fn(dst, src, n); + memset(src, 0, n); + srand(seed); + state = seed; + for (i = 0; i < n; i++) { + int value = (int) (uint8_t) (rand_r(&state) >> 16); + int v = (int) ((uint8_t *) dst)[i]; + if (v != value) { + _IPATH_ERROR("Error on index %llu : got %d instead of %d\n", + (unsigned long long) i, v, value); + ok = 0; + } + } + return ok ? 
dst : NULL; +} + +int +memcpy_check_size (memcpy_fn_t fn, int *p, int *f, size_t n) +{ +#define num_aligns 16 +#define USE_MALLOC 0 +#define DEBUG 0 + uint8_t *src; + uint8_t *dst; + size_t size = n * 2 + num_aligns; + if (USE_MALLOC) { + src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + if (src == NULL || dst == NULL) { + if (src) psmi_free(src); + if (dst) psmi_free(dst); + return -1; + } + } + else { + void *src_p = NULL, *dst_p = NULL; + if (posix_memalign(&src_p, 64, size) != 0 || + posix_memalign(&dst_p, 64, size) != 0) { + if (src_p) psmi_free(src_p); + if (dst_p) psmi_free(dst_p); + return -1; + } + else { + src = (uint8_t *) src_p; + dst = (uint8_t *) dst_p; + } + } + int src_align, dst_align; + for (src_align = 0; src_align < num_aligns; src_align++) { + for (dst_align = 0; dst_align < num_aligns; dst_align++) { + uint8_t *d = ((uint8_t *) dst) + dst_align; + uint8_t *s = ((uint8_t *) src) + src_align; + int ok = (memcpy_check_one(fn, d, s, n) != NULL); + if (DEBUG || !ok) { + _IPATH_INFO("memcpy(%p, %p, %llu) : %s\n", d, s, + (unsigned long long) n, + ok ? "passed" : "failed"); + } + if (ok) { + (*p)++; + } + else { + (*f)++; + } + } + } + psmi_free(src); + psmi_free(dst); + return 0; +} diff --git a/psm_ep.c b/psm_ep.c new file mode 100644 index 0000000..6857895 --- /dev/null +++ b/psm_ep.c @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+/* NOTE: the <...> targets of these five includes were stripped when this
+ * patch was extracted; the last two are pinned down by their trailing
+ * comments, the first three are plausible reconstructions. */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h>  // cpu_set
+#include <ctype.h>  // isalpha
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Endpoint management
+ */
+psm_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+
+static psm_error_t psmi_ep_open_device(const psm_ep_t ep,
+            const struct psm_ep_open_opts *opts,
+            const psm_uuid_t unique_job_key,
+            struct psmi_context *context,
+            psm_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * ipath.
+ *
+ * By default, PSMI_DEVICES_DEFAULT establishes the order in which each
+ * component is tested for reachability to each peer: first self, then shm
+ * and finally ipath. The order should really only affect endpoints that
+ * happen to be on the same node. PSM will correctly detect that two
+ * endpoints are on the same node even though they may be using different
+ * host interfaces.
+ */
+
+#define PSMI_DEVICES_DEFAULT "self,shm,ipath"
+static psm_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+            const char *devstr);
+static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT],
+            int devid);
+int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid);
+
+psm_error_t
+__psm_ep_num_devunits(uint32_t *num_units_o)
+{
+    static int num_units = -1;
+
+    PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+    if (num_units == -1) {
+        num_units = ipath_get_num_units();
+        if (num_units == -1)
+            num_units = 0;
+    }
+
+    *num_units_o = (uint32_t) num_units;
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_num_devunits)
+
+static int
+cmpfunc(const void *p1, const void *p2)
+{
+    uint64_t a = ((uint64_t *)p1)[0];
+    uint64_t b = ((uint64_t *)p2)[0];
+    if (a < b) return -1;
+    if (a == b) return 0;
+    return 1;
+}
+static psm_error_t
+psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
+{
+    uint32_t num_units;
+    uint64_t gid_hi, gid_lo;
+    int i, j, ret, count=0;
+    char *env;
+    psm_error_t err = PSM_OK;
+    uint64_t gidh[IPATH_MAX_UNIT][3];
+
+    env = getenv("PSM_MULTIRAIL");
+    if (!env || atoi(env) == 0) {
+        *num_rails = 0;
+        return err;
+    }
+#ifdef __MIC__
+    env = getenv("MPI_LOCALRANKID");
+    if (!env || atoi(env) == 0) {
+        _IPATH_INFO("PSM_MULTIRAIL is not supported and "
+            "ignored for this PSM mic version.\n");
+    }
+    *num_rails = 0;
+    return err;
+#endif
+
+/*
+ * map is in format: unit:port,unit:port,...
+ */
+    if ((env = getenv("PSM_MULTIRAIL_MAP"))) {
+        if (sscanf(env, "%d:%d", &i, &j) == 2) {
+            char *comma = strchr(env, ',');
+            unit[count] = i;
+            port[count] = j;
+            count++;
+            while (comma) {
+                if (sscanf(comma, ",%d:%d", &i, &j) != 2) {
+                    break;
+                }
+                unit[count] = i;
+                port[count] = j;
+                count++;
+                if (count == IPATH_MAX_UNIT) break;
+                comma = strchr(comma+1, ',');
+            }
+        }
+        *num_rails = count;
+
+/*
+ * Check whether any of the requested ports is unusable.
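
(Assuming, say, two units that each have an active port 1, a user would pin the rails explicitly along these lines; the values are illustrative:)

    PSM_MULTIRAIL=1 PSM_MULTIRAIL_MAP=0:1,1:1 ./my_mpi_app

(When PSM_MULTIRAIL_MAP is absent, the code that follows instead scans every unit/port pair for a valid LID and GID and sorts the candidates by GID.)
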
+ */ + for (i = 0; i < count; i++) { + ret = ipath_get_port_lid(unit[i], port[i]); + if (ret == -1) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get lid for unit %d:%d", + unit[i], port[i]); + return err; + } + ret = ipath_get_port_gid(unit[i], port[i], &gid_hi, &gid_lo); + if (ret == -1) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get gid for unit %d:%d", + unit[i], port[i]); + return err; + } + } + + return err; + } + + if ((err = psm_ep_num_devunits(&num_units))) { + return err; + } + if (num_units > IPATH_MAX_UNIT) { + _IPATH_INFO("Found %d units, max %d units are supported, use %d\n", + num_units, IPATH_MAX_UNIT, IPATH_MAX_UNIT); + num_units = IPATH_MAX_UNIT; + } + +/* + * Get all the ports with a valid lid and gid, one per unit. + * we don't know which number is a valid unit, we just loop + * over all supported numbers. + */ + for (i = 0; i < IPATH_MAX_UNIT; i++) { + for (j = 1; j <= IPATH_MAX_PORT; j++) { + ret = ipath_get_port_lid(i, j); + if (ret == -1) continue; + ret = ipath_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) continue; + + gidh[count][0] = gid_hi; + gidh[count][1] = i; + gidh[count][2] = j; + count++; + break; + } + if (count == num_units) break; + } + +/* + * Sort all the ports with gidh from small to big. + * This is for multiple fabrics, and we use fabric with the + * smallest gid to make the master connection. + */ + qsort(gidh, count, sizeof(uint64_t)*3, cmpfunc); + + for (i = 0; i < count; i++) { + unit[i] = (uint32_t)gidh[i][1]; + port[i] = (uint16_t)(uint32_t)gidh[i][2]; + } + *num_rails = count; + return err; +} + +static psm_error_t +psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, + uint64_t my_gid_hi, uint64_t my_gid_lo) +{ + static uint16_t *ipath_lids = NULL; + static uint32_t nlids; + uint32_t num_units; + int i; + psm_error_t err = PSM_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (ipath_lids == NULL) { + if ((err = psm_ep_num_devunits(&num_units))) + goto fail; + ipath_lids = (uint16_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_units*IPATH_MAX_PORT, + sizeof(uint16_t)); + if (ipath_lids == NULL) { + err = psmi_handle_error(NULL, PSM_NO_MEMORY, + "Couldn't allocate memory for dev_lids structure"); + goto fail; + } + + for (i = 0; i < IPATH_MAX_UNIT; i++) { + int j; + for (j = 1; j <= IPATH_MAX_PORT; j++) { + int lid = ipath_get_port_lid(i, j); + int ret; + uint64_t gid_hi = 0, gid_lo = 0; + + if (lid == -1) continue; + ret = ipath_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) + continue; + else if (my_gid_hi != gid_hi) { + _IPATH_VDBG("LID %d, unit %d, port %d, " + "mismatched GID %llx:%llx and " + "%llx:%llx\n", + lid, i, j, + (unsigned long long) gid_hi, + (unsigned long long) gid_lo, + (unsigned long long) my_gid_hi, + (unsigned long long) my_gid_lo); + continue; + } + _IPATH_VDBG("LID %d, unit %d, port %d, " + "matching GID %llx:%llx and " + "%llx:%llx\n", lid, i, j, + (unsigned long long) gid_hi, + (unsigned long long) gid_lo, + (unsigned long long) my_gid_hi, + (unsigned long long) my_gid_lo); + + ipath_lids[nlids++] = (uint16_t) lid; + } + } + if (nlids == 0) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get lid&gid from any unit/port"); + goto fail; + } + } + *lids = ipath_lids; + *num_lids_o = nlids; + +fail: + return err; +} + +uint64_t +__psm_epid_nid(psm_epid_t epid) +{ + return PSMI_EPID_GET_LID(epid); +} +PSMI_API_DECL(psm_epid_nid) + +/* Currently not exposed to users, we don't acknowledge the existence of + * subcontexts */ +uint64_t 
+psmi_epid_subcontext(psm_epid_t epid) +{ + return PSMI_EPID_GET_SUBCONTEXT(epid); +} + +/* Currently not exposed to users, we don't acknowledge the existence of + * service levels and HCA types encoding within epids. This may require + * changing to expose SLs + */ +uint64_t +psmi_epid_hca_type(psm_epid_t epid) +{ + return PSMI_EPID_GET_HCATYPE(epid); +} + +uint64_t +psmi_epid_sl(psm_epid_t epid) +{ + return PSMI_EPID_GET_SL(epid); +} + +uint64_t +__psm_epid_context(psm_epid_t epid) +{ + return PSMI_EPID_GET_CONTEXT(epid); +} +PSMI_API_DECL(psm_epid_context) + +uint64_t +__psm_epid_port(psm_epid_t epid) +{ + return __psm_epid_context(epid); +} +PSMI_API_DECL(psm_epid_port) + +psm_error_t +__psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo) +{ + psm_error_t err = PSM_OK; + int i; + psm_ep_t ep; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (*num_of_epinfo <= 0) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid psm_ep_query parameters"); + return err; + } + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + ep = psmi_opened_endpoint; + for (i = 0; i < *num_of_epinfo; i++) { + if (ep == NULL) break; + array_of_epinfo[i].ep = ep; + array_of_epinfo[i].epid = ep->epid; + memcpy(array_of_epinfo[i].uuid, + (void *) ep->key, sizeof(psm_uuid_t)); + psmi_uuid_unparse(ep->key, array_of_epinfo[i].uuid_str); + ep = ep->user_ep_next; + } + *num_of_epinfo = i; + + return err; +} +PSMI_API_DECL(psm_ep_query) + +psm_error_t +__psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn) +{ + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + psm_ep_t ep; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* Need to have an opened endpoint before we can resolve epids */ + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + ep = psmi_opened_endpoint; + while (ep) { + epaddr = psmi_epid_lookup(ep, epid); + if (!epaddr) { + /* Search over SL values for bug 122239. Note that function + * ips_get_addr_from_epid() converts a base epid to an epaddr, + * which can then be used to get the correct epid for this flow. + * However, that function is at the IPS level and not accessible + * from here without breaking the layering. */ + uint64_t lid, context, subcontext, hca_type, sl, try_sl; + psm_epid_t try_epid; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + sl = PSMI_EPID_GET_SL(epid); + for (try_sl = 0; !epaddr && try_sl < 16; try_sl++) { + if (try_sl != sl) { + try_epid = PSMI_EPID_PACK_EXT(lid, context, subcontext, + hca_type, try_sl); + epaddr = psmi_epid_lookup(psmi_opened_endpoint, try_epid); + } + } + + if (!epaddr) { + ep = ep->user_ep_next; + continue; + } + } + + /* Found connection for epid. Return info about endpoint to caller. 
*/
+        psmi_assert_always(epaddr->ep == ep);
+        epconn->addr = epaddr;
+        epconn->ep = epaddr->ep;
+        epconn->mq = epaddr->ep->mq;
+        return err;
+    }
+
+    err = psmi_handle_error(NULL, PSM_EPID_UNKNOWN,
+        "Endpoint connection status unknown");
+    return err;
+}
+PSMI_API_DECL(psm_ep_epid_lookup);
+
+psm_error_t
+__psm_ep_epid_share_memory(psm_ep_t ep, psm_epid_t epid, int *result_o)
+{
+    uint32_t num_lids = 0;
+    uint16_t *lids = NULL;
+    int i;
+    uint16_t epid_lid;
+    int result = 0;
+    psm_error_t err;
+
+    psmi_assert_always(ep != NULL);
+    PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+    epid_lid = (uint16_t) psm_epid_nid(epid);
+    /* If we're in non-ipath mode, don't bother listing lids */
+    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+        uint64_t mylid = (uint16_t) psm_epid_nid(ep->epid);
+        if (mylid == epid_lid)
+            result = 1;
+    }
+    else {
+        err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo);
+        if (err)
+            return err;
+        for (i = 0; i < num_lids; i++) {
+            if (epid_lid == lids[i]) {
+                result = 1;
+                break;
+            }
+        }
+    }
+    *result_o = result;
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_epid_share_memory)
+
+#define PSMI_EP_OPEN_SHM_MBYTES_MIN 2
+#define PSMI_EP_OPEN_PKEY_MASK 0x7fffULL
+
+psm_error_t
+__psm_ep_open_opts_get_defaults(struct psm_ep_open_opts *opts)
+{
+    union psmi_envvar_val nSendBuf;
+    union psmi_envvar_val netPKey;
+#if (PSM_VERNO >= 0x010d)
+    union psmi_envvar_val env_path_service_id;
+    union psmi_envvar_val env_path_res_type;
+#endif
+#if (PSM_VERNO >= 0x010e)
+    union psmi_envvar_val nSendDesc;
+    union psmi_envvar_val immSize;
+#endif
+
+    PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+    /* Get number of default send buffers from environment */
+    psmi_getenv("PSM_NUM_SEND_BUFFERS",
+        "Number of send buffers to allocate [1024]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) 1024,
+        &nSendBuf);
+
+    /* Get network key from environment. MVAPICH and other vendor MPIs do not
+     * specify it on ep open and we may require it for vFabrics.
+     */
+    psmi_getenv("PSM_PKEY",
+        "InfiniBand PKey to use for endpoint",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_ULONG,
+        (union psmi_envvar_val) IPATH_DEFAULT_P_KEY,
+        &netPKey);
+
+#if (PSM_VERNO >= 0x010d)
+    /* Get Service ID from environment */
+    psmi_getenv("PSM_IB_SERVICE_ID",
+        "IB Service ID for path resolution",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_ULONG_ULONG,
+        (union psmi_envvar_val) IPATH_DEFAULT_SERVICE_ID,
+        &env_path_service_id);
+
+    /* Get path resolution type from environment. Possible choices are:
+     *
+     * NONE : Default same as previous instances. Utilizes static data.
+     * OPP : Use OFED Plus Plus library to do path record queries.
+     * UMAD : Use raw libibumad interface to form and process path records.
+     * ANY : Try all available path record mechanisms.
+     */
+    psmi_getenv("PSM_PATH_REC",
+        "Mechanism to query IB path record (default is no path query)",
+        PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+        (union psmi_envvar_val) "none", &env_path_res_type);
+#endif
+
+#if (PSM_VERNO >= 0x010e)
+    /* Get number of send descriptors - by default this is 4 times the number
+     * of send buffers - mainly used for short/inlined messages.
+     */
+    psmi_getenv("PSM_NUM_SEND_DESCRIPTORS",
+        "Number of send descriptors to allocate [4096]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) (nSendBuf.e_uint << 2),
+        &nSendDesc);
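
(All of these knobs are read from the environment, so typical tuning happens at launch time, e.g. with illustrative values; the variable names are exactly the ones queried above:)

    PSM_NUM_SEND_BUFFERS=2048 PSM_NUM_SEND_DESCRIPTORS=8192 \
    PSM_PATH_REC=opp PSM_IB_SERVICE_ID=0x1000117500000000 ./my_mpi_app
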
+
+    /* Get immediate data size - transfers less than immediate data size do
+     * not consume a send buffer and require just a send descriptor.
+     */
+    psmi_getenv("PSM_SEND_IMMEDIATE_SIZE",
+        "Immediate data send size not requiring a buffer [128]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) 128,
+        &immSize);
+#endif
+
+    opts->timeout = 30000000000LL;  /* 30 sec */
+    opts->unit = IPATH_UNIT_ID_ANY;
+    opts->port = 0;
+    opts->outsl = PSMI_SL_DEFAULT;
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    opts->outvl = 0;
+#endif
+    opts->affinity = PSM_EP_OPEN_AFFINITY_SET;
+    opts->shm_mbytes = 10;
+    opts->sendbufs_num = nSendBuf.e_uint;
+    opts->network_pkey = (uint64_t) netPKey.e_ulong;
+#if (PSM_VERNO >= 0x010d)
+    opts->service_id = (uint64_t) env_path_service_id.e_ulonglong;
+
+    if (!strcasecmp(env_path_res_type.e_str, "none"))
+        opts->path_res_type = PSM_PATH_RES_NONE;
+    else if (!strcasecmp(env_path_res_type.e_str, "opp"))
+        opts->path_res_type = PSM_PATH_RES_OPP;
+    else if (!strcasecmp(env_path_res_type.e_str, "umad"))
+        opts->path_res_type = PSM_PATH_RES_UMAD;
+    else {
+        _IPATH_ERROR("Unknown path resolution type %s. Disabling use of path record query.\n", env_path_res_type.e_str);
+        opts->path_res_type = PSM_PATH_RES_NONE;
+    }
+#endif
+#if (PSM_VERNO >= 0x010e)
+    opts->senddesc_num = nSendDesc.e_uint;
+    opts->imm_size = immSize.e_uint;
+#endif
+
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_open_opts_get_defaults)
+
+psm_error_t psmi_poll_noop(ptl_t *ptl, int replyonly);
+
+psm_error_t
+__psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled,
+            struct psm_ep_open_opts const *opts_i, psm_mq_t mq,
+            psm_ep_t *epo, psm_epid_t *epido)
+{
+    psm_ep_t ep = NULL;
+    uint32_t num_units;
+    size_t len;
+    psm_error_t err;
+    psm_epaddr_t epaddr = NULL;
+    char buf[128], *p, *e;
+    char *old_cpuaff = NULL, *old_unit = NULL;
+    union psmi_envvar_val yield_cnt, no_cpuaff, env_unit_id,
+            env_port_id, env_sl;
+    size_t ptl_sizes;
+    int default_cpuaff;
+    struct psm_ep_open_opts opts;
+    ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
+    int i;
+
+    /* First get the set of default options, we overwrite with the user's
+     * desired values afterwards */
+    if ((err = psm_ep_open_opts_get_defaults(&opts)))
+        goto fail;
+
+    if (opts_i != NULL) {
+        if (opts_i->timeout != -1)
+            opts.timeout = opts_i->timeout;
+        if (opts_i->unit != -1)
+            opts.unit = opts_i->unit;
+        if (opts_i->affinity != -1)
+            opts.affinity = opts_i->affinity;
+        if (opts_i->shm_mbytes != -1)
+            opts.shm_mbytes = opts_i->shm_mbytes;
+        if (opts_i->sendbufs_num != -1)
+            opts.sendbufs_num = opts_i->sendbufs_num;
+        if (psmi_verno_client() >= PSMI_VERNO_MAKE(1,1)) {
+            if ((opts_i->network_pkey & PSMI_EP_OPEN_PKEY_MASK) !=
+                PSMI_EP_OPEN_PKEY_MASK)
+                opts.network_pkey = opts_i->network_pkey;
+        }
+        if (psmi_verno_client() >= PSMI_VERNO_MAKE(1,7)) {
+            /* these values are sanity checked below */
+            opts.port = opts_i->port;
+            opts.outsl = opts_i->outsl;
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+            opts.outvl = opts_i->outvl;
+#endif
+        }
+#if (PSM_VERNO >= 0x010d)
+        /* Note: Environment variable specification for service ID and
+         * path resolution type takes precedence over ep_open defaults.
+ */ + if (psmi_verno_client() >= 0x010d) { + if (opts_i->service_id) + opts.service_id = (uint64_t) opts_i->service_id; + if (opts.path_res_type == PSM_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; + } +#endif + +#if (PSM_VERNO >= 0x010e) + if (psmi_verno_client() >= 0x010e) { + if (opts_i->senddesc_num) + opts.senddesc_num = opts_i->senddesc_num; + if (opts_i->imm_size) + opts.imm_size = opts_i->imm_size; + } +#endif + } + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK) + goto fail; + } else num_units = 0; + + /* do some error checking */ + if (opts.timeout < -1) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid timeout value %lld", + (long long) opts.timeout); + goto fail; + } else if (num_units && (opts.unit < -1 || opts.unit >= IPATH_MAX_UNIT)) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid Device Unit ID %d (%d units found)", + opts.unit, num_units); + goto fail; + } else if (opts.affinity < 0 || opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid Affinity option: %d", opts.affinity); + goto fail; + } else if (opts.shm_mbytes < PSMI_EP_OPEN_SHM_MBYTES_MIN) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid shm_mbytes option at %d mbytes (minimum is %d)", + opts.shm_mbytes, PSMI_EP_OPEN_SHM_MBYTES_MIN); + goto fail; + } + + /* Advertise in verbose env the fact that we parse the no-affinity + * variable. */ + default_cpuaff = psmi_getenv("IPATH_NO_CPUAFFINITY", + "Prevent PSM from setting affinity", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, + &no_cpuaff); + + if (no_cpuaff.e_uint || + (default_cpuaff && opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP)) + { + old_cpuaff = getenv("IPATH_NO_CPUAFFINITY"); + setenv("IPATH_NO_CPUAFFINITY", "1", 1); + } + +#ifdef __MIC__ + /* + * On MIC, we always pick unit from /sys/class/qib/ipath/unit, + * but only do this if there is a HCA unit. + */ + if (num_units > 0) { + char pathname[128]; + struct stat st; + FILE *fp; + + snprintf(pathname, sizeof(pathname), + "/sys/class/qib/ipath/unit"); + fp = NULL; + if (stat(pathname, &st) || S_ISDIR(st.st_mode) || + !(fp = fopen(pathname, "r")) || (fscanf(fp, "%d", &opts.unit) != 1)) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't read from %s", pathname); + if (fp) fclose(fp); + goto fail; + } + fclose(fp); + psmi_assert(opts.unit != IPATH_UNIT_ID_ANY); + psmi_assert(opts.unit < IPATH_MAX_UNIT); + } +#else + /* If a specific unit is set in the environment, use that one. 
*/
+    if (!psmi_getenv("IPATH_UNIT", "Device Unit number (-1 autodetects)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val) IPATH_UNIT_ID_ANY,
+            &env_unit_id)) {
+        opts.unit = env_unit_id.e_long;
+        /* set mock UNIT *just* for setaffinity */
+        if (opts.unit != IPATH_UNIT_ID_ANY) {
+            char buf[32];
+            snprintf(buf, sizeof buf - 1, "%d", (int) opts.unit);
+            buf[sizeof buf - 1] = '\0';
+            old_unit = getenv("IPATH_UNIT");
+            setenv("IPATH_UNIT", buf, 1);
+        }
+        else
+            unsetenv("IPATH_UNIT");
+    }
+#endif
+
+    if (!psmi_getenv("IPATH_PORT", "IB Port number (<= 0 autodetects)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val)0,
+            &env_port_id)) {
+        opts.port = env_port_id.e_long;
+    }
+
+    if (!psmi_getenv("IPATH_SL", "IB outgoing ServiceLevel number (default 0)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val) PSMI_SL_DEFAULT,
+            &env_sl)) {
+        opts.outsl = env_sl.e_long;
+    }
+
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    {
+        union psmi_envvar_val env_vl;
+        if (!psmi_getenv("IPATH_VL", "IB outgoing VirtualLane (default 0)",
+                PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+                (union psmi_envvar_val)0,
+                &env_vl)) {
+            opts.outvl = env_vl.e_long;
+        }
+    }
+#endif
+
+    /* sanity check new capabilities, after both opts and env */
+    if (opts.port < 0 || opts.port > IPATH_MAX_PORT)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid Port number: %lld",
+            (unsigned long long) opts.port);
+    if (opts.outsl < 0 || opts.outsl > 15)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid SL number: %lld",
+            (unsigned long long) opts.outsl);
+
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    if (opts.outvl < 0 || opts.outvl > 7)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid VL number: %lld",
+            (unsigned long long) opts.outvl);
+#endif
+
+    ptl_sizes =
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
+         psmi_ptl_self.sizeof_ptl() : 0) +
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
+         psmi_ptl_ips.sizeof_ptl() : 0) +
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
+         psmi_ptl_amsh.sizeof_ptl() : 0);
+    if (ptl_sizes == 0) return PSM_EP_NO_DEVICE;
+
+    ep = (psm_ep_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1,
+            sizeof(struct psm_ep) + ptl_sizes);
+    epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+            1, sizeof(struct psm_epaddr));
+    if (ep == NULL || epaddr == NULL) {
+        err = psmi_handle_error(NULL, PSM_NO_MEMORY,
+            "Couldn't allocate memory for %s structure",
+            ep == NULL ? "psm_ep" : "psm_epaddr");
+        goto fail;
+    }
+
+    /* Copy PTL enabled status */
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        ep->devid_enabled[i] = devid_enabled[i];
+
+    /* Matched Queue initialization. We do this early because we have to
+     * make sure ep->mq exists and is valid before calling ips_do_work.
+ */ + ep->mq = mq; + + /* Get ready for PTL initialization */ + memcpy(&ep->key, (void *) unique_job_key, sizeof(psm_uuid_t)); + ep->epaddr = epaddr; + ep->shm_mbytes = opts.shm_mbytes; + ep->memmode = mq->memmode; + ep->ipath_num_sendbufs = opts.sendbufs_num; + ep->network_pkey = (uint16_t) opts.network_pkey & PSMI_EP_OPEN_PKEY_MASK; +#if (PSM_VERNO >= 0x010d) + ep->service_id = opts.service_id; + ep->path_res_type = opts.path_res_type; +#else + /* Select sane defaults with older PSM header */ + ep->service_id = 0x1000117500000000ULL; /* Default service ID */ + ep->path_res_type = 0; /* No path resolution */ +#endif +#if (PSM_VERNO >= 0x010e) + ep->ipath_num_descriptors = opts.senddesc_num; + ep->ipath_imm_size = opts.imm_size; +#else + /* Default is 4 times more descriptors than buffers */ + ep->ipath_num_descriptors = ep->ipath_num_sendbufs << 2; + ep->ipath_imm_size = 128; +#endif + ep->errh = psmi_errhandler_global; /* by default use the global one */ + ep->ptl_amsh.ep_poll = psmi_poll_noop; + ep->ptl_ips.ep_poll = psmi_poll_noop; + ep->connections = 0; + + /* Active message fields, used by psmi_shm_attach() */ + ep->psmi_kassist_fd = -1; + ep->psmi_kassist_mode = 0; + ep->amsh_shmbase = 0; + ep->amsh_blockbase = 0; + ep->amsh_dirpage = NULL; + ep->amsh_keyname = NULL; + ep->amsh_shmfd = -1; + ep->amsh_shmidx = -1; + ep->amsh_max_idx = -1; + + /* See how many iterations we want to spin before yielding */ + psmi_getenv("PSM_YIELD_SPIN_COUNT", + "Spin poll iterations before yield", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, + &yield_cnt); + ep->yield_spin_cnt = yield_cnt.e_uint; + + ptl_sizes = 0; + amsh_ptl = ips_ptl = self_ptl = NULL; + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_ips.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_self.sizeof_ptl(); + } + + if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, + &(ep->context), &ep->epid))) + goto fail; + + /* Restore old cpuaffinity and unit settings. + * TODO: PSM should really just implement its own affinity + * setting function */ + if (old_cpuaff != NULL) + setenv("IPATH_NO_CPUAFFINITY", old_cpuaff, 1); + if (old_unit != NULL) + setenv("IPATH_UNIT", old_unit, 1); + + psmi_assert_always(ep->epid != 0); + ep->epaddr->epid = ep->epid; + + /* Set our new label as soon as we know what it is */ + strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); + buf[sizeof(buf) - 1] = '\0'; + + p = buf + strlen(buf); + + /* If our rank is set, use it. 
If not, use context.subcontext notation */ + if (((e = getenv("MPI_RANKID")) != NULL && *e) || + ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) + len = snprintf(p, sizeof buf - strlen(buf), ":%d.", atoi(e)); + else + len = snprintf(p, sizeof buf - strlen(buf), ":%d.%d.", + (uint32_t) psm_epid_context(ep->epid), + (uint32_t) psmi_epid_subcontext(ep->epid)); + *(p + len) = '\0'; + ep->context_mylabel = psmi_strdup(ep, buf); + if (ep->context_mylabel == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + //ipath_set_mylabel(ep->context_mylabel); + + if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0))) + goto fail; + + /* + * Active Message initialization + */ + if ((err = psmi_am_init_internal(ep))) + goto fail; + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) + goto fail; + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) + goto fail; + } + /* If we're shm-only, this device is enabled above */ + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) + goto fail; + } + else { + /* We may have pre-attached as part of getting our rank for enabling + * shared contexts. */ + psmi_shm_detach(ep); + } + + /* + * Keep only IPS since only IPS support multi-rail, other devices + * are only setup once. IPS device can come to this function again. + */ + for (i = 0; i < PTL_MAX_INIT; i++) { + if (devid_enabled[i] != PTL_DEVID_IPS) { + devid_enabled[i] = -1; + } + } + + *epido = ep->epid; + *epo = ep; + + return PSM_OK; + +fail: + if (ep != NULL) { + if (ep->context.fd != -1) close(ep->context.fd); + psmi_free(ep); + } + if (epaddr != NULL) + psmi_free(epaddr); + return err; +} + +psm_error_t +__psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *opts_i, + psm_ep_t *epo, psm_epid_t *epido) +{ + psm_error_t err; + psm_mq_t mq; + psm_epid_t epid; + psm_ep_t ep, tmp; + uint32_t units[IPATH_MAX_UNIT]; + uint16_t ports[IPATH_MAX_UNIT]; + int i, num_rails = 0; + char *uname = "IPATH_UNIT"; + char *pname = "IPATH_PORT"; + char uvalue[4], pvalue[4]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + PSMI_PLOCK(); + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. 
+ */ + err = psmi_mq_malloc(&mq); + if (err != PSM_OK) goto fail; + + /* See which ptl devices we want to use for this ep to be opened */ + psmi_getenv("PSM_DEVICES", + "Ordered list of PSM-level devices", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) PSMI_DEVICES_DEFAULT, + &devs); + + if ((err = psmi_parse_devices(devid_enabled, devs.e_str))) + goto fail; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + err = psmi_ep_multirail(&num_rails, units, ports); + if (err != PSM_OK) goto fail; + + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 4, "%1d", units[0]); + snprintf(pvalue, 4, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } + } + + err = __psm_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &ep, &epid); + if (err != PSM_OK) goto fail; + + if (psmi_opened_endpoint == NULL) { + psmi_opened_endpoint = ep; + } else { + tmp = psmi_opened_endpoint; + while (tmp->user_ep_next) tmp = tmp->user_ep_next; + tmp->user_ep_next = ep; + } + psmi_opened_endpoint_count++; + ep->mctxt_prev = ep->mctxt_next = ep; + ep->mctxt_master = ep; + mq->ep = ep; + + *epo = ep; + *epido = epid; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + for (i = 1; i < num_rails; i++) { + snprintf(uvalue, 4, "%1d", units[i]); + snprintf(pvalue, 4, "%1d", ports[i]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + + /* Create slave EP */ + err = __psm_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &tmp, &epid); + if (err) goto fail; + + /* Link slave EP after master EP. */ + PSM_MCTXT_APPEND(ep, tmp); + } + } + + /* Once we've initialized all devices, we can update the MQ with its + * default values */ + if (err == PSM_OK) err = psmi_mq_initialize_defaults(mq); + +fail: + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_ep_open) + +psm_error_t +__psm_ep_close(psm_ep_t ep, int mode, int64_t timeout_in) +{ + psm_error_t err = PSM_OK; + uint64_t t_start = get_cycles(); + union psmi_envvar_val timeout_intval; + psm_ep_t tmp, mep; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + psmi_assert_always(ep->mctxt_master == ep); + + PSMI_PLOCK(); + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + tmp = psmi_opened_endpoint; + while (tmp && tmp != ep) { + tmp = tmp->user_ep_next; + } + if (!tmp) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + psmi_getenv("PSM_CLOSE_TIMEOUT", + "End-point close timeout over-ride.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &timeout_intval); + + if (getenv("PSM_CLOSE_TIMEOUT")) { + timeout_in = timeout_intval.e_uint * SEC_ULL; + } + else if (timeout_in > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100); + } + + if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT) + timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* Infinite and excessive close time-out are limited here to a max. + * The "rationale" is that there is no point waiting around forever for + * graceful termination. Normal (or forced) process termination should clean + * up the context state correctly even if termination is not graceful. 
+ */
+    if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+        timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+    _IPATH_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+                 "%d connections\n",
+                 ep, mode == PSM_EP_CLOSE_FORCE ? "YES" : "NO",
+                 (double) timeout_in / 1e9, (int) ep->connections);
+
+    /* XXX We currently cheat in the sense that we leave each PTL the allowed
+     * timeout. There's no good way to do this until we change the PTL
+     * interface to allow asynchronous finalization
+     */
+    mep = ep;
+    tmp = ep->mctxt_prev;
+    do {
+        ep = tmp;
+        tmp = ep->mctxt_prev;
+        PSM_MCTXT_REMOVE(ep);
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+            err = psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode, timeout_in);
+
+        if ((err == PSM_OK || err == PSM_TIMEOUT) &&
+            psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+            err = psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode, timeout_in);
+
+        /* Even if the disconnect requests time out, make sure that we
+         * still get to close the endpoint and mark it closed */
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+            psmi_context_close(&ep->context);
+
+        psmi_free(ep->epaddr);
+        psmi_free(ep->context_mylabel);
+        /*
+         * Before freeing the master ep itself, remove it from the global
+         * linklist. We do it here so that the atexit handler in the ptl_am
+         * directory can search the global linklist and free the shared
+         * memory file.
+         */
+        if (ep == mep) {
+            if (psmi_opened_endpoint == ep) {
+                psmi_opened_endpoint = ep->user_ep_next;
+            } else {
+                tmp = psmi_opened_endpoint;
+                while (tmp->user_ep_next != ep) {
+                    tmp = tmp->user_ep_next;
+                }
+                tmp->user_ep_next = ep->user_ep_next;
+            }
+            psmi_opened_endpoint_count--;
+        }
+        psmi_free(ep);
+
+    } while ((err == PSM_OK || err == PSM_TIMEOUT) && tmp != ep);
+
+    PSMI_PUNLOCK();
+
+    _IPATH_PRDBG("Closed endpoint in %.3f secs\n",
+                 (double) cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL);
+    return err;
+}
+PSMI_API_DECL(psm_ep_close)
+
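To make the timeout clamping above concrete, here is a minimal standalone sketch of the same arithmetic. It is an illustration added for clarity, not part of the imported sources: scale_close_timeout() is a hypothetical helper, and the local macros restate the constants that actually live in psm_ep.h and psm_help.h.

#include <stdint.h>

#define SEC_ULL   1000000000ULL        /* from psm_help.h */
#define MIN_CLOSE (2 * SEC_ULL)        /* PSMI_MIN_EP_CLOSE_TIMEOUT */
#define MAX_CLOSE (60 * SEC_ULL)       /* PSMI_MAX_EP_CLOSE_TIMEOUT */

static int64_t scale_close_timeout(int64_t timeout_in, uint32_t connections)
{
    /* One second of grace per 100 connected endpoints, at minimum. */
    if (timeout_in > 0 && timeout_in < (int64_t)((connections * SEC_ULL) / 100))
        timeout_in = (connections * SEC_ULL) / 100;
    /* Enforce the minimum close timeout. */
    if (timeout_in > 0 && timeout_in < (int64_t)MIN_CLOSE)
        timeout_in = MIN_CLOSE;
    /* Infinite (<= 0) or excessive timeouts are capped at the maximum. */
    if (timeout_in <= 0 || timeout_in > (int64_t)MAX_CLOSE)
        timeout_in = MAX_CLOSE;
    return timeout_in;  /* e.g. 5s with 1000 connections scales up to 10s */
}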
+static
+psm_error_t
+psmi_ep_open_device(const psm_ep_t ep,
+                    const struct psm_ep_open_opts *opts,
+                    const psm_uuid_t unique_job_key,
+                    struct psmi_context *context,
+                    psm_epid_t *epid)
+{
+    psm_error_t err = PSM_OK;
+
+    /* Skip affinity. No affinity if:
+     * 1. User explicitly sets no-affinity=YES in environment.
+     * 2. User doesn't set affinity in environment and PSM is opened with
+     *    option affinity skip.
+     */
+    if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+        uint32_t lid;
+
+        ep->out_sl = opts->outsl;
+
+        if ((err = psmi_context_open(ep, opts->unit, opts->port, unique_job_key,
+                                     opts->timeout, context)) != PSM_OK)
+            goto fail;
+
+        _IPATH_DBG("[%d]use unit %d port %d\n", getpid(),
+                   context->base_info.spi_unit, context->base_info.spi_port);
+
+        if ((lid = ipath_get_port_lid(context->base_info.spi_unit,
+                                      context->base_info.spi_port)) == -1) {
+            err = psmi_handle_error(NULL,
+                                    PSM_EP_DEVICE_FAILURE,
+                                    "Can't get InfiniBand LID in psm_ep_open: is SMA running?");
+            goto fail;
+        }
+
+        if (context->base_info.spi_sw_version >= (1 << 16 | 5)) {
+            uint32_t rcvthread_flags;
+            union psmi_envvar_val env_rcvthread;
+            static int norcvthread = 0; /* only for first rail */
+
+            /* See if we want to activate support for receive thread */
+            psmi_getenv("PSM_RCVTHREAD", "Recv thread flags (0 disables thread)",
+                        PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                        (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS),
+                        &env_rcvthread);
+            rcvthread_flags = env_rcvthread.e_uint;
+
+            /* If enabled, use the pollurg capability to implement a receive
+             * interrupt thread that can handle urg packets */
+            if (rcvthread_flags) {
+                context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
+#ifdef PSMI_PLOCK_IS_NOLOCK
+                psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                                  "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+                                  "with RCVTHREAD on");
+#endif
+            }
+            context->rcvthread_flags = rcvthread_flags;
+
+        }
+
+        *epid = context->epid;
+    }
+    else {
+        int rank, nranks;
+        char *e;
+        long nproc = sysconf(_SC_NPROCESSORS_ONLN);
+
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+            /* In shm-only mode, we need to derive a valid epid based on our
+             * rank. We try to get it from the environment if it's available,
+             * or resort to pre-attaching to the shared memory segment and use
+             * our shared memory rank (shmidx) as the rank.
+             */
+            union psmi_envvar_val env_rankid;
+
+            if (psmi_getenv("MPI_LOCALRANKID", "Shared context rankid",
+                            PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+                            (union psmi_envvar_val) -1,
+                            &env_rankid)) {
+                if (psmi_getenv("PSC_MPI_NODE_RANK", "Shared context rankid",
+                                PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+                                (union psmi_envvar_val) -1,
+                                &env_rankid)) {
+                    if ((err = psmi_shm_attach(ep, &rank)))
+                        goto fail;
+                }
+                else
+                    rank = env_rankid.e_int;
+            }
+            else
+                rank = env_rankid.e_int;
+            nranks = (int) nproc;
+        }
+        else {
+            /* Self-only, meaning only 1 proc max */
+            rank = 0;
+            nranks = 1;
+        }
+
+        e = getenv("IPATH_NO_CPUAFFINITY");
+
+        /* Now that we have a rank, set our affinity based on this rank */
+        if (e == NULL || *e == '\0')
+        {
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            /* First see if affinity is already set */
+            if (sched_getaffinity(0, sizeof cpuset, &cpuset)) {
+                _IPATH_PRDBG("Couldn't get processor affinity, assuming "
+                             "not set: %s\n", strerror(errno));
+            }
+            else {
+                int i, num_set = 0;
+                for (i = 0; i < CPU_SETSIZE; i++) {
+                    if (CPU_ISSET(i, &cpuset))
+                        num_set++;
+                }
+
+                if (num_set > 0 && num_set < nproc)
+                    _IPATH_PRDBG("CPU affinity already set, leaving as is\n");
+                else if (rank >= nranks || rank < 0)
+                    _IPATH_PRDBG("Skipping affinity, rank is %d and there are "
+                                 "only %d ranks.\n", rank, nranks);
+                else {
+                    CPU_ZERO(&cpuset);
+                    CPU_SET(rank, &cpuset);
+                    if (sched_setaffinity(0, sizeof cpuset, &cpuset))
+                        _IPATH_PRDBG("Couldn't set affinity to processor %d: %s\n",
+                                     rank, strerror(errno));
+                    else
+                        _IPATH_PRDBG("Set CPU affinity to %d out of %d processors\n",
+                                     rank, nranks);
+                }
+            }
+        }
+
+        /*
+         * We use an arbitrary lid 0xffff, which doesn't really matter since
+         * we're closing ourselves to the outside world by explicitly
+         * disabling the ipath device.
+         */
+        *epid = PSMI_EPID_PACK(0xffff, (rank>>2), rank);
+    }
+
+fail:
+    return err;
+}
+
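The affinity logic above reduces to "pin each local rank to its own core unless affinity has already been narrowed." A self-contained sketch under those assumptions (one process per densely numbered local rank, rank fits in the online processor count); pin_to_local_rank() is a hypothetical helper added for illustration, not part of this tree.

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

static int pin_to_local_rank(int rank)
{
    long nproc = sysconf(_SC_NPROCESSORS_ONLN);
    cpu_set_t cpuset;

    if (rank < 0 || rank >= nproc)
        return -1;              /* more ranks than processors: skip pinning */
    CPU_ZERO(&cpuset);
    CPU_SET(rank, &cpuset);     /* one core per local rank */
    return sched_setaffinity(0, sizeof cpuset, &cpuset);
}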
+/* Get a list of PTLs we want to use. The order is important, it affects
+ * whether node-local processes use shm or ips */
+static
+psm_error_t
+psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
+{
+    char *devstr = NULL;
+    char *b_new, *e, *ee, *b;
+    psm_error_t err = PSM_OK;
+    int len;
+    int i = 0;
+
+    psmi_assert_always(devstring != NULL);
+    len = strlen(devstring)+1;
+
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        devices[i] = -1;
+
+    devstr = (char *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len);
+    if (devstr == NULL)
+        goto fail;
+
+    b_new = (char *) devstr;
+    e = b_new + len;
+    strncpy(e, devstring, len-1);
+    e[len-1] = '\0';
+    ee = e + len;
+    i = 0;
+    while (e < ee && *e && i < PTL_MAX_INIT) {
+        while (*e && !isalpha(*e))
+            e++;
+        b = e;
+        while (*e && isalpha(*e))
+            e++;
+        *e = '\0';
+        if (*b) {
+            if (!strcasecmp(b, "self")) {
+                devices[i++] = PTL_DEVID_SELF;
+                b_new = strcpy(b_new, "self,");
+                b_new += 5;
+            } else if (!strcasecmp(b, "amsh")) {
+                devices[i++] = PTL_DEVID_AMSH;
+                strcpy(b_new, "amsh,");
+                b_new += 5;
+            } else if (!strcasecmp(b, "ips")) {
+                devices[i++] = PTL_DEVID_IPS;
+                strcpy(b_new, "ips,");
+                b_new += 4;
+            /* If shm or shmem is set, bind to amsh */
+            } else if (!strcasecmp(b, "shm") || !strcasecmp(b, "shmem")) {
+                devices[i++] = PTL_DEVID_AMSH;
+                strcpy(b_new, "amsh,");
+                b_new += 5;
+            /* If ipath or infinipath is set, bind to ips */
+            } else if (!strcasecmp(b, "ipath") || !(strcasecmp(b, "infinipath"))) {
+                devices[i++] = PTL_DEVID_IPS;
+                strcpy(b_new, "ips,");
+                b_new += 4;
+            } else {
+                err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+                        "%s set in environment variable PSM_DEVICES=\"%s\" "
+                        "is not one of the recognized PTL devices (%s)",
+                        b, devstring, PSMI_DEVICES_DEFAULT);
+                goto fail;
+            }
+            e++;
+        }
+    }
+    if (b_new != devstr) /* we parsed something, remove trailing comma */
+        b_new[strlen(b_new) - 1] = '\0';
+
+    _IPATH_PRDBG("PSM Device allocation order: %s\n", devstr);
+fail:
+    if (devstr != NULL)
+        psmi_free(devstr);
+    return err;
+
+}
+
+static
+int
+psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
+{
+    int i;
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        if (devid_enabled[i] == devid)
+            return 1;
+    return 0;
+}
+
+int
+psmi_ep_device_is_enabled(const psm_ep_t ep, int devid)
+{
+    return psmi_device_is_enabled(ep->devid_enabled, devid);
+}
+
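For reference, a small usage sketch of how a launcher might constrain the device list that psmi_parse_devices() will see. The alias behaviour ("shm"/"shmem" map to amsh, "ipath"/"infinipath" map to ips) follows directly from the branches above; the helper name is illustrative only.

#include <stdlib.h>

static void restrict_to_node_local(void)
{
    /* Parses to { PTL_DEVID_SELF, PTL_DEVID_AMSH, -1, ... }, so an endpoint
     * opened afterwards never touches the InfiniPath hardware device. */
    setenv("PSM_DEVICES", "self,shm", 1);
}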
diff --git a/psm_ep.h b/psm_ep.h
new file mode 100644
index 0000000..6c5723f
--- /dev/null
+++ b/psm_ep.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_EP_H
+#define _PSMI_EP_H
+
+#ifdef PSM_HAVE_SCIF
+#include <scif.h>
+#endif
+
+/*
+ * EPIDs encode the following information:
+ *
+ * LID:16 bits - LID for endpoint
+ * SUBCONTEXT:2 bits - Subcontext used for endpoint
+ * CONTEXT:6 bits - Context used for endpoint (up to 64 contexts)
+ * IBA_SL: 4 bits - Default SL to use for endpoint
+ * HCATYPE: 4 bits - QLE71XX, QLE72XX, QLE73XX ....
+ */
+
+#define PSMI_HCA_TYPE_UNKNOWN 0
+#define PSMI_HCA_TYPE_QLE71XX 1
+#define PSMI_HCA_TYPE_QLE72XX 2
+#define PSMI_HCA_TYPE_QLE73XX 3
+#define PSMI_HCA_TYPE_DEFAULT PSMI_HCA_TYPE_UNKNOWN
+
+#define PSMI_SL_DEFAULT 0
+#define PSMI_VL_DEFAULT 0
+
+#define PSMI_EPID_PACK_EXT(lid,context,subcontext,hca_type,sl) \
+    ( ((((uint64_t)lid)&0xffff)<<16) | \
+      ((((uint64_t)subcontext)&0x3)<<14) | \
+      ((((uint64_t)context)&0x3f)<<8) | \
+      ((((uint64_t)sl)&0xf)<<4) | \
+      (((uint64_t)hca_type)&0xf) )
+
+#define PSMI_EPID_PACK(lid,context,subcontext) \
+    PSMI_EPID_PACK_EXT(lid,context,subcontext,PSMI_HCA_TYPE_DEFAULT, PSMI_SL_DEFAULT)
+
+#define PSMI_EPID_GET_LID(epid) (((epid)>>16)&0xffff)
+#define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>14)&0x3)
+#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0x3f)
+#define PSMI_EPID_GET_SL(epid) (((epid)>>4)&0xf)
+#define PSMI_EPID_GET_HCATYPE(epid) (((epid)>>0)&0xf)
+
+#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MIN_EP_CLOSE_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_TIMEOUT (60 * SEC_ULL)
+
+#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (10 * SEC_ULL)
+
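A worked example of the EPID bit layout just defined, written as a runnable sanity check. It assumes it is compiled somewhere these macros are visible (e.g. via psm_user.h). Packing lid 0x10, context 3, subcontext 1 with the default SL and HCA type gives 0x10<<16 | 1<<14 | 3<<8 = 0x104300, and the GET macros recover each field.

#include <assert.h>
#include <stdint.h>

static void epid_pack_example(void)
{
    uint64_t epid = PSMI_EPID_PACK(0x10, 3, 1);
    assert(epid == 0x104300);
    assert(PSMI_EPID_GET_LID(epid) == 0x10);
    assert(PSMI_EPID_GET_CONTEXT(epid) == 3);
    assert(PSMI_EPID_GET_SUBCONTEXT(epid) == 1);
    assert(PSMI_EPID_GET_SL(epid) == PSMI_SL_DEFAULT);
}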
+struct psm_ep {
+    psm_epid_t epid;              /**> This endpoint's Endpoint ID */
+    psm_epaddr_t epaddr;          /**> This ep's ep address */
+    psm_mq_t mq;                  /**> only 1 MQ */
+    int unit_id;
+    uint16_t portnum;
+    uint8_t out_sl;
+    uint8_t pad;
+    int did_syslog;
+    psm_uuid_t key;
+    uint16_t network_pkey;        /**> InfiniBand Pkey */
+    uint64_t service_id;          /* Infiniband service ID */
+    psm_path_res_t path_res_type; /* Path resolution for endpoint */
+    psm_ep_errhandler_t errh;
+    int devid_enabled[PTL_MAX_INIT];
+    int memmode;                  /**> min, normal, large memory mode */
+
+#ifdef PSM_HAVE_SCIF
+    scif_epd_t scif_epd;          /* scif listen endpoint */
+    int scif_dma_threshold;       /* DMA message size threshold */
+    int scif_mynodeid;            /* my scif node ID */
+    int scif_nnodes;              /* Number of scif nodes on system */
+    int scif_dma_mode;
+    pthread_t scif_thread;        /* Thread listening for SCIF connects */
+#endif
+
+    uint32_t ipath_num_sendbufs;    /**> Number of allocated send buffers */
+    uint32_t ipath_num_descriptors; /** Number of allocated scb descriptors */
+    uint32_t ipath_imm_size;        /** Immediate data size */
+    uint32_t shm_mbytes;            /**> Number of shared memory pages */
+    uint32_t connections;           /**> Number of connections */
+
+    psmi_context_t context;
+    char *context_mylabel;
+    uint32_t yield_spin_cnt;
+
+    /* EP link-lists */
+    struct psm_ep *user_ep_next;
+
+    /* EP link-lists for multi-context. */
+    struct psm_ep *mctxt_prev;
+    struct psm_ep *mctxt_next;
+    struct psm_ep *mctxt_master;
+
+    /* Active Message handler table */
+    void **am_htable;
+    int psmi_kassist_fd;          /* when using kassist */
+    int psmi_kassist_mode;
+
+    struct amsh_qdirectory *amsh_qdir;
+    uintptr_t amsh_shmbase;       /* base for mmap */
+    uintptr_t amsh_blockbase;     /* base for block 0 (after ctl dirpage) */
+    struct am_ctl_dirpage *amsh_dirpage;
+    psm_uuid_t amsh_keyno;        /* context key uuid */
+    char *amsh_keyname;           /* context keyname */
+    int amsh_shmfd;               /* context shared mmap fd */
+    int amsh_shmidx;              /* last used shmidx */
+    int amsh_max_idx;             /* max directory idx seen so far */
+
+    uint64_t gid_hi;
+    uint64_t gid_lo;
+
+    ptl_ctl_t ptl_amsh;
+    ptl_ctl_t ptl_ips;
+    ptl_ctl_t ptl_self;
+
+    /* All ptl data is allocated inline below */
+    uint8_t ptl_base_data[0] __attribute__((aligned(8)));
+};
+
+struct mqq {
+    psm_mq_req_t first;
+    psm_mq_req_t *lastp;
+};
+
+struct mqsq {
+    psm_mq_req_t first;
+    psm_mq_req_t *lastp;
+};
+
+typedef
+union psmi_egrid {
+    struct {
+        uint32_t egr_flowid : 8;
+        uint32_t egr_msgno : 24;
+    };
+    uint32_t egr_data;
+}
+psmi_egrid_t;
+
+typedef
+union psmi_seqnum {
+    struct {
+        uint32_t seq:11;
+        uint32_t gen:8;
+        uint32_t flow:5;
+    };
+    struct {
+        uint32_t pkt:16;
+        uint32_t msg:8;
+    };
+    struct {
+        uint32_t psn:24;
+    };
+    uint32_t val;
+} psmi_seqnum_t;
+
+struct psm_epaddr {
+    struct ptl *ptl;              /* Which ptl owns this epaddress */
+    ptl_ctl_t *ptlctl;            /* The control structure for the ptl */
+    psm_epid_t epid;
+    psm_ep_t ep;
+
+    void *usr_ep_ctxt;            /* User context associated with endpoint */
+
+    STAILQ_HEAD(, psm_mq_req) egrlong;  /**> egrlong request queue */
+    STAILQ_HEAD(, psm_mq_req) egrdata;  /**> egrlong data queue */
+    psmi_egrid_t xmit_egrlong;
+
+    /* PTLs have a few ways to initialize the ptl address */
+    union {
+        ptl_epaddr_t *ptladdr;
+        uint32_t _ptladdr_u32[2];
+        uint64_t _ptladdr_u64;
+        uint8_t _ptladdr_data[0];
+    };
+
+    /* it makes sense only in master */
+    uint64_t mctxt_gidhi[IPATH_MAX_UNIT];
+    psm_epid_t mctxt_epid[IPATH_MAX_UNIT];
+    int mctxt_epcount;
+    int mctxt_nsconn;             /* # slave connections */
+    uint16_t mctxt_send_seqnum;
+    uint16_t mctxt_recv_seqnum;
+    struct psm_epaddr *mctxt_current;
+    struct mqsq outoforder_q;     /**> Out-of-order queue */
+    int outoforder_c;             /* OOO queue count */
+
+    /* epaddr linklist for multi-context.
*/ + struct psm_epaddr *mctxt_master; + struct psm_epaddr *mctxt_prev; + struct psm_epaddr *mctxt_next; +}; + +#define PSM_MCTXT_APPEND(head, node) \ + node->mctxt_prev = head->mctxt_prev; \ + node->mctxt_next = head; \ + head->mctxt_prev->mctxt_next = node; \ + head->mctxt_prev = node; \ + node->mctxt_master = head +#define PSM_MCTXT_REMOVE(node) \ + node->mctxt_prev->mctxt_next = node->mctxt_next; \ + node->mctxt_next->mctxt_prev = node->mctxt_prev; \ + node->mctxt_next = node->mctxt_prev = node; \ + node->mctxt_master = NULL + +#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD +# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250 +#endif + +/* + * Users of BLOCKUNTIL should check the value of err upon return + */ +#define PSMI_BLOCKUNTIL(ep,err,cond) do { \ + int spin_cnt = 0; \ + PSMI_PROFILE_BLOCK(); \ + while (!(cond)) { \ + err = psmi_poll_internal(ep, 1); \ + if (err == PSM_OK_NO_PROGRESS) { \ + PSMI_PROFILE_REBLOCK(1); \ + if (++spin_cnt == (ep)->yield_spin_cnt) { \ + spin_cnt = 0; \ + PSMI_PYIELD(); \ + } \ + } \ + else if (err == PSM_OK) { \ + PSMI_PROFILE_REBLOCK(0); \ + spin_cnt = 0; \ + } \ + else \ + break; \ + } \ + PSMI_PROFILE_UNBLOCK(); \ + } while(0) + +#endif /* _PSMI_EP_H */ diff --git a/psm_ep_connect.c b/psm_ep_connect.c new file mode 100644 index 0000000..98294e2 --- /dev/null +++ b/psm_ep_connect.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +psm_error_t +__psm_ep_connect(psm_ep_t ep, int num_of_epid, + psm_epid_t const *array_of_epid, + int const *array_of_epid_mask, /* can be NULL */ + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + int64_t timeout) +{ + psm_error_t err = PSM_OK; + ptl_ctl_t *ptlctl; + ptl_t *ptl; + int i, j, dup_idx; + int num_toconnect = 0; + int *epid_mask = NULL; + int *epid_mask_isdupof = NULL; + char *device; + uint64_t t_start = get_cycles(); + uint64_t t_left; + union psmi_envvar_val timeout_intval; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + PSMI_PLOCK(); + + /* + * Normally we would lock here, but instead each implemented ptl component + * does its own locking. This is mostly because the ptl components are + * ahead of the PSM interface in that they can disconnect their peers. + */ + if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || + num_of_epid < 1) { + err = psmi_handle_error(ep, PSM_PARAM_ERR, + "Invalid psm_ep_connect parameters"); + goto fail; + } + + /* We need two of these masks to detect duplicates */ + err = PSM_NO_MEMORY; + epid_mask = (int *) psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask == NULL) + goto fail; + epid_mask_isdupof = (int *) psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask_isdupof == NULL) + goto fail; + err = PSM_OK; + + /* Eventually handle timeouts across all connects. */ + for (j = 0; j < num_of_epid; j++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) + epid_mask[j] = 0; + else { + epid_mask[j] = 1; + array_of_errors[j] = PSM_EPID_UNKNOWN; + array_of_epaddr[j] = NULL; + num_toconnect++; + } + epid_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM_CONNECT_TIMEOUT", + "End-point connection timeout over-ride. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &timeout_intval); + + if (getenv("PSM_CONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } + else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. 
*/ + timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _IPATH_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", + num_toconnect, (double) timeout/ 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epid; i++) { + for (j = i + 1; j < num_of_epid; j++) { + if (array_of_epid[i] == array_of_epid[j] && + epid_mask[i] && epid_mask[j]) { + epid_mask[j] = 0; /* don't connect more than once */ + epid_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + device = "ips"; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + device = "amsh"; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + device = "self"; + break; + default: + device = "unknown"; + ptlctl = &ep->ptl_ips; /*no-unused*/ + ptl = ep->ptl_ips.ptl; /*no-unused*/ + device = "ips"; /*no-unused*/ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + _IPATH_VDBG("Trying to connect with device %s\n", device); + if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, + epid_mask, array_of_errors, array_of_epaddr, + cycles_to_nanosecs(t_left)))) + { + _IPATH_PRDBG("Connect failure in device %s err=%d\n", + device, err); + goto connect_fail; + } + + /* Now process what's been connected */ + for (j = 0; j < num_of_epid; j++) { + dup_idx = epid_mask_isdupof[j]; + if (!epid_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_epaddr[j] = array_of_epaddr[dup_idx]; + array_of_errors[j] = array_of_errors[dup_idx]; + epid_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM_OK) { + epid_mask[j] = 0; /* don't try on next ptl */ + ep->connections++; + } + } + } + + for (i = 0; i < num_of_epid; i++) { + ptl_ctl_t *c = NULL; + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM_EPID_UNREACHABLE) { + err = PSM_EPID_UNREACHABLE; + break; + } + + psmi_assert_always(array_of_epaddr[i] != NULL); + c = array_of_epaddr[i]->ptlctl; + psmi_assert_always(c != NULL); + _IPATH_VDBG("%-20s DEVICE %s (%p)\n", + psmi_epaddr_get_name(array_of_epid[i]), + c == &ep->ptl_ips ? "ipath" : + (c == &ep->ptl_amsh ? 
"amsh" : "self" ), + (void *) array_of_epaddr[i]->ptl); + } + +connect_fail: + /* If the error is a timeout (at worse) and the client is InfiniPath MPI, + * just return timeout to let InfiniPath MPI handle the hostnames that + * timed out */ + if (err != PSM_OK) { + char errbuf[PSM_ERRSTRING_MAXLEN]; + size_t len; + int j = 0; + + if (err == PSM_EPID_UNREACHABLE) { + char *deverr = "of an incorrect setting"; + char *eperr = " "; + char *devname = NULL; + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + deverr = "there is no shared memory PSM device (shm)"; + eperr = " shared memory "; + } + else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + deverr = "there is no InfiniPath PSM device (ipath)"; + eperr = " InfiniPath "; + } + + len = snprintf(errbuf, sizeof errbuf - 1, + "Some%sendpoints could not be connected because %s " + "in the currently enabled PSM_DEVICES (", + eperr, deverr); + for (i = 0; i < PTL_MAX_INIT && len < sizeof errbuf - 1; i++) { + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + devname = "ipath"; + break; + case PTL_DEVID_AMSH: + devname = "shm"; + break; + case PTL_DEVID_SELF: + default: + devname = "self"; + break; + } + len += snprintf(errbuf+len, sizeof errbuf - len - 1, + "%s,", devname); + } + if (len < sizeof errbuf - 1 && devname != NULL) + /* parsed something, remove trailing comma */ + errbuf[len-1] = ')'; + } + else + len = snprintf(errbuf, sizeof errbuf - 1, + "%s", err == PSM_TIMEOUT ? + "Dectected connection timeout" : + psm_error_get_string(err)); + + /* first pass, look for all nodes with the error */ + for (i = 0; i < num_of_epid && len < sizeof errbuf - 1; i++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + if (array_of_errors[i] == PSM_OK) + continue; + if (array_of_errors[i] == PSM_EPID_UNREACHABLE && + err != PSM_EPID_UNREACHABLE) + continue; + if (err == array_of_errors[i]) { + len += snprintf(errbuf+len, sizeof errbuf - len - 1, + "%c %s", j==0 ? ':' : ',', + psmi_epaddr_get_hostname(array_of_epid[i])); + j++; + } + } + errbuf[sizeof errbuf - 1] = '\0'; + err = psmi_handle_error(ep, err, errbuf, "%s"); + } + +fail: + PSMI_PUNLOCK(); + + if (epid_mask != NULL) + psmi_free(epid_mask); + if (epid_mask_isdupof != NULL) + psmi_free(epid_mask_isdupof); + + return err; +} +PSMI_API_DECL(psm_ep_connect) + diff --git a/psm_error.c b/psm_error.c new file mode 100644 index 0000000..6bcefcb --- /dev/null +++ b/psm_error.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
diff --git a/psm_error.c b/psm_error.c
new file mode 100644
index 0000000..6bcefcb
--- /dev/null
+++ b/psm_error.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG -1
+
+struct psm_error_token
+{
+    psm_ep_t ep;
+    psm_error_t error;
+    char err_string[PSM_ERRSTRING_MAXLEN];
+};
+
+static
+psm_error_t
+psmi_errhandler_noop(psm_ep_t ep, const psm_error_t err,
+                     const char *error_string, psm_error_token_t token)
+{
+    return err;
+}
+
+static
+psm_error_t
+psmi_errhandler_psm(psm_ep_t ep,
+                    const psm_error_t err,
+                    const char *error_string,
+                    psm_error_token_t token)
+{
+    /* we want the error to be seen through ssh, etc., so we flush and then
+     * sleep a bit.  Not perfect, but not doing so means it almost never
+     * gets seen. */
+    fprintf(stderr, "%s%s\n", ipath_get_mylabel(), token->err_string);
+    fflush(stdout);
+    fflush(stderr);
+
+    /* XXX Eventually, this will hook up to a connection manager, and we'll
+     * issue an upcall into the connection manager at shutdown time */
+    sleep(3);
+
+    /* We use this "special" ep internally to handle internal errors that are
+     * triggered from within code that is not expected to return to the user.
+     * Errors of this sort are not expected to be handled by users and always
+     * mean we have an internal PSM bug. */
+    if (err == PSM_INTERNAL_ERR)
+        abort();
+    else
+        exit(-1);
+}
+
+psm_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop;
+
+psm_error_t
+__psm_error_defer(psm_error_token_t token)
+{
+    return psmi_errhandler_psm(token->ep, token->error, token->err_string, token);
+}
+PSMI_API_DECL(psm_error_defer)
+
+psm_error_t
+__psm_error_register_handler(psm_ep_t ep, const psm_ep_errhandler_t errhandler)
+{
+    psm_ep_errhandler_t *errh;
+    if (ep == NULL)
+        errh = &psmi_errhandler_global;
+    else
+        errh = &ep->errh;
+
+    if (errhandler == PSM_ERRHANDLER_PSM_HANDLER)
+        *errh = psmi_errhandler_psm;
+    else if (errhandler == PSM_ERRHANDLER_NO_HANDLER)
+        *errh = psmi_errhandler_noop;
+    else
+        *errh = errhandler;
+
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_error_register_handler)
+
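A short sketch of installing a custom handler through the registration path above, added for illustration. A NULL ep installs the handler globally; the handler's return value becomes the error reported by the failing PSM call. The handler and helper names are hypothetical.

#include <psm.h>
#include <stdio.h>

static psm_error_t log_and_continue(psm_ep_t ep, psm_error_t err,
                                    const char *msg, psm_error_token_t token)
{
    (void)ep; (void)token;
    fprintf(stderr, "PSM error %d: %s\n", err, msg);
    return err;                 /* propagate instead of aborting */
}

static void install_handler(void)
{
    psm_error_register_handler(NULL, log_and_continue);
}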
+psm_error_t
+psmi_handle_error(psm_ep_t ep, psm_error_t error, const char *buf, ...)
+{
+    va_list argptr;
+    int syslog_level;
+    int console_print = 0;
+    psm_error_t newerr;
+    struct psm_error_token token;
+    char *c, fullmsg[PSM_ERRSTRING_MAXLEN];
+    token.error = error;
+    snprintf(fullmsg, PSM_ERRSTRING_MAXLEN-1, "%s", buf);
+    fullmsg[PSM_ERRSTRING_MAXLEN-1] = '\0';
+    va_start(argptr, buf);
+    vsnprintf(token.err_string, PSM_ERRSTRING_MAXLEN-1, fullmsg, argptr);
+    va_end(argptr);
+    token.err_string[PSM_ERRSTRING_MAXLEN-1] = '\0';
+
+    /* Unless the user has set PSM_NO_VERBOSE_ERRORS, always print errors to
+     * console */
+    c = getenv("PSM_NO_VERBOSE_ERRORS");
+    console_print = 0;
+    if (ep == PSMI_EP_LOGEVENT)
+        console_print = 1;
+    else if (!c || *c == '\0') { /* no desire to prevent verbose errors */
+        /* Remove the console print if we're internally handling the error */
+        if (ep == PSMI_EP_NORETURN)
+            console_print = 0;
+        else if (ep == NULL && psmi_errhandler_global != psmi_errhandler_psm)
+            console_print = 1;
+        else if (ep != NULL && ep->errh != psmi_errhandler_psm)
+            console_print = 1;
+    }
+
+    /* Before we let the user even handle the error, send to syslog */
+    syslog_level = psmi_error_syslog_level(error);
+    if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT)
+        psmi_syslog(ep, console_print,
+                    ep == PSMI_EP_LOGEVENT ? LOG_NOTICE : syslog_level,
+                    "%s (err=%d)",
+                    token.err_string, error);
+
+    if (ep == PSMI_EP_LOGEVENT) /* we're just logging */
+        newerr = PSM_OK;
+    else if (ep == PSMI_EP_NORETURN)
+        newerr = psmi_errhandler_psm(NULL, error, token.err_string, &token);
+    else if (ep == NULL)
+        newerr = psmi_errhandler_global(NULL, error, token.err_string, &token);
+    else
+        newerr = ep->errh(ep, error, token.err_string, &token);
+
+    return newerr;
+}
+
+/* Returns the "worst" error out of errA and errB */
+psm_error_t
+psmi_error_cmp(psm_error_t errA, psm_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+    /* Bad runtime or before initialization */
+    _PSMI_ERR_IS(PSM_NO_MEMORY);
+    _PSMI_ERR_IS(PSM_INTERNAL_ERR);
+    _PSMI_ERR_IS(PSM_INIT_NOT_INIT);
+    _PSMI_ERR_IS(PSM_INIT_BAD_API_VERSION);
+
+    /* Before we get an endpoint */
+    _PSMI_ERR_IS(PSM_EP_NO_DEVICE);
+    _PSMI_ERR_IS(PSM_EP_UNIT_NOT_FOUND);
+    _PSMI_ERR_IS(PSM_EP_DEVICE_FAILURE);
+    _PSMI_ERR_IS(PSM_EP_NO_PORTS_AVAIL);
+    _PSMI_ERR_IS(PSM_TOO_MANY_ENDPOINTS);
+
+    /* As we open/close the endpoint */
+    _PSMI_ERR_IS(PSM_EP_NO_NETWORK);
+    _PSMI_ERR_IS(PSM_SHMEM_SEGMENT_ERR);
+    _PSMI_ERR_IS(PSM_EP_CLOSE_TIMEOUT);
+    _PSMI_ERR_IS(PSM_EP_INVALID_UUID_KEY);
+    _PSMI_ERR_IS(PSM_EP_NO_RESOURCES);
+
+    /* In connect phase */
+    _PSMI_ERR_IS(PSM_EPID_NETWORK_ERROR);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_NODE);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_CONNECT);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_PKEY);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_VERSION);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_UUID_KEY);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_MTU);
+
+    /* Timeout if nothing else */
+    _PSMI_ERR_IS(PSM_TIMEOUT);
+
+    /* Last resort */
+    return max(errA,errB);
+}
+
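The precedence scheme above tests both candidates against a fixed list, so the first listed error wins. A couple of concrete outcomes, written as a runnable check (it assumes the internal declarations from psm_user.h are in scope):

#include <assert.h>

static void error_cmp_example(void)
{
    /* PSM_NO_MEMORY is checked before the PSM_TIMEOUT fallback. */
    assert(psmi_error_cmp(PSM_TIMEOUT, PSM_NO_MEMORY) == PSM_NO_MEMORY);
    /* So is PSM_EP_NO_NETWORK, in the open/close group. */
    assert(psmi_error_cmp(PSM_TIMEOUT, PSM_EP_NO_NETWORK) == PSM_EP_NO_NETWORK);
    /* Neither value is in the priority list: the larger enum value wins. */
    assert(psmi_error_cmp(PSM_OK, PSM_OK_NO_PROGRESS) == PSM_OK_NO_PROGRESS);
}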
incompatible" }, /* PSM_INIT_BAD_API_VERSION = 6 */ + { PSMI_NOLOG, "PSM Could not set affinity" }, /* PSM_NO_AFFINITY = 7 */ + { LOG_ALERT , "PSM Unresolved internal error" }, /* PSM_INTERNAL_ERR = 8 */ + { LOG_CRIT , "PSM could not set up shared memory segment" }, /* PSM_SHMEM_SEGMENT_ERR = 9 */ + { PSMI_NOLOG, "PSM option is a read-only option" }, /* PSM_OPT_READONLY = 10 */ + { PSMI_NOLOG, "Operation timed out" }, /* PSM_TIMEOUT = 11 */ + { LOG_INFO , "Exceeded supported amount of endpoints" }, + /* PSM_TOO_MANY_ENDPOINTS = 12 */ + { PSMI_NOLOG, "PSM is in the finalized state" }, /* PSM_IS_FINALIZED = 13 */ + { PSMI_NOLOG, "unknown 14" }, + { PSMI_NOLOG, "unknown 15" }, + { PSMI_NOLOG, "unknown 16" }, + { PSMI_NOLOG, "unknown 17" }, + { PSMI_NOLOG, "unknown 18" }, + { PSMI_NOLOG, "unknown 19" }, + { PSMI_NOLOG, "Endpoint was closed" }, /* PSM_EP_WAS_CLOSED = 20 */ + { LOG_ALERT , "PSM Could not find an InfiniPath Unit" }, /* PSM_EP_NO_DEVICE = 21 */ + { PSMI_NOLOG, "User passed a bad unit number" }, /* PSM_EP_UNIT_NOT_FOUND = 22 */ + { LOG_ALERT , "Failure in initializing endpoint" }, /* PSM_EP_DEVICE_FAILURE = 23 */ + { PSMI_NOLOG, "Error closing the endpoing error" }, /* PSM_EP_CLOSE_TIMEOUT = 24 */ + { PSMI_NOLOG, "No free contexts could be obtained" }, /* PSM_EP_NO_PORTS_AVAIL = 25 */ + { LOG_ALERT , "Could not detect network connectivity" }, /* PSM_EP_NO_NETWORK = 26 */ + { LOG_INFO , "Invalid Unique job-wide UUID Key" }, /* PSM_EP_INVALID_UUID_KEY = 27 */ + { LOG_INFO , "Out of endpoint resources" }, /* PSM_EP_NO_RESOURCES = 28 */ + { PSMI_NOLOG, "unknown 29" }, + { PSMI_NOLOG, "unknown 30" }, + { PSMI_NOLOG, "unknown 31" }, + { PSMI_NOLOG, "unknown 32" }, + { PSMI_NOLOG, "unknown 33" }, + { PSMI_NOLOG, "unknown 34" }, + { PSMI_NOLOG, "unknown 35" }, + { PSMI_NOLOG, "unknown 36" }, + { PSMI_NOLOG, "unknown 37" }, + { PSMI_NOLOG, "unknown 38" }, + { PSMI_NOLOG, "unknown 39" }, + { PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)" }, /* PSM_EPID_UNKNOWN = 40 */ + { PSMI_NOLOG, "Endpoint could not be reached" }, /* PSM_EPID_UNREACHABLE = 41 */ + { PSMI_NOLOG, "unknown 42" }, + { LOG_CRIT , "Invalid node (mismatch in bit width 32/64 or byte order)" }, /* PSM_EPID_INVALID_NODE = 43 */ + { LOG_CRIT , "Invalid MTU" }, /* PSM_EPID_INVALID_MTU = 44 */ + { PSMI_NOLOG, "UUID key mismatch" }, /* PSM_EPID_INVALID_UUID_KEY = 45 */ + { LOG_ERR , "Incompatible PSM version" }, /* PSM_EPID_INVALID_VERSION = 46 */ + { LOG_CRIT , "Connect received garbled connection information" }, /* PSM_EPID_INVALID_CONNECT = 47 */ + { PSMI_NOLOG, "Endpoint was already connected" }, /* PSM_EPID_ALREADY_CONNECTED = 48 */ + { LOG_CRIT , "Two or more endpoints have the same network id (LID)" }, /* PSM_EPID_NETWORK_ERROR = 49 */ + { LOG_CRIT, "Endpoint provided incompatible Partition Key" }, + { LOG_CRIT, "Unable to resolve network path. Is the SM running?" 
+    { PSMI_NOLOG, "unknown 52" },
+    { PSMI_NOLOG, "unknown 53" },
+    { PSMI_NOLOG, "unknown 54" },
+    { PSMI_NOLOG, "unknown 55" },
+    { PSMI_NOLOG, "unknown 56" },
+    { PSMI_NOLOG, "unknown 57" },
+    { PSMI_NOLOG, "unknown 58" },
+    { PSMI_NOLOG, "unknown 59" },
+    { PSMI_NOLOG, "MQ Non-blocking request is incomplete" }, /* PSM_MQ_NO_COMPLETIONS = 60 */
+    { PSMI_NOLOG, "MQ Message has been truncated at the receiver" }, /* PSM_MQ_TRUNCATION = 61 */
+    { PSMI_NOLOG, "unknown 62" },
+    { PSMI_NOLOG, "unknown 63" },
+    { PSMI_NOLOG, "unknown 64" },
+    { PSMI_NOLOG, "unknown 65" },
+    { PSMI_NOLOG, "unknown 66" },
+    { PSMI_NOLOG, "unknown 67" },
+    { PSMI_NOLOG, "unknown 68" },
+    { PSMI_NOLOG, "unknown 69" },
+    { PSMI_NOLOG, "Invalid AM reply" },
+    { PSMI_NOLOG, "unknown 71" },
+    { PSMI_NOLOG, "unknown 72" },
+    { PSMI_NOLOG, "unknown 73" },
+    { PSMI_NOLOG, "unknown 74" },
+    { PSMI_NOLOG, "unknown 75" },
+    { PSMI_NOLOG, "unknown 76" },
+    { PSMI_NOLOG, "unknown 77" },
+    { PSMI_NOLOG, "unknown 78" },
+    { PSMI_NOLOG, "unknown 79" },
+    { PSMI_NOLOG, "unknown 80" },
+};
+
+const char *
+__psm_error_get_string(psm_error_t error)
+{
+    if (error >= PSM_ERROR_LAST)
+        return "unknown";
+    else
+        return psmi_error_items[error].error_string;
+}
+PSMI_API_DECL(psm_error_get_string)
+
+int
+psmi_error_syslog_level(psm_error_t error)
+{
+    if (error >= PSM_ERROR_LAST)
+        return PSMI_NOLOG;
+    else
+        return psmi_error_items[error].syslog_level;
+}
+
diff --git a/psm_error.h b/psm_error.h
new file mode 100644
index 0000000..21f5745
--- /dev/null
+++ b/psm_error.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_error.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_ERROR_H
+#define _PSMI_ERROR_H
+
+#define PSMI_EP_NONE (NULL)
+#define PSMI_EP_NORETURN ((psm_ep_t) -2)
+#define PSMI_EP_LOGEVENT ((psm_ep_t) -3)
+
+psm_ep_errhandler_t psmi_errhandler_global;
+
+psm_error_t psmi_handle_error(psm_ep_t ep, psm_error_t error,
+                              const char *buf, ...)
+ __attribute__((format(printf, 3, 4))); + +psm_error_t psmi_error_cmp(psm_error_t errA, psm_error_t errB); +int psmi_error_syslog_level(psm_error_t error); + +#endif /* _PSMI_ERROR_H */ diff --git a/psm_help.h b/psm_help.h new file mode 100644 index 0000000..8efd11d --- /dev/null +++ b/psm_help.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_HELP_H +#define _PSMI_HELP_H + +/* XXX pathcc and gcc only */ +#define PSMI_INLINE(FN) \ + static inline FN + +#define PSMI_ALWAYS_INLINE(FN) \ + static __inline__ FN __attribute__((always_inline)); \ + static __inline__ FN + +#define PSMI_NEVER_INLINE(FN) \ + static FN __attribute__((noinline)); \ + static FN + +#define _PPragma(x) _Pragma(x) + +#define STRINGIFY(s) _STRINGIFY(s) +#define _STRINGIFY(s) #s +#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__) +#define psmi_assert_always_loc(x,curloc) do { \ + if_pf (!(x)) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, \ + "Assertion failure at %s: %s", curloc, \ + STRINGIFY(x)); \ + } } while (0) + +#define psmi_assert_always(x) psmi_assert_always_loc(x,PSMI_CURLOC) + +#ifdef PSM_DEBUG +# define psmi_assert(x) psmi_assert_always(x) +# define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized()) +#else +# define psmi_assert(x) +# define PSMI_ASSERT_INITIALIZED() +#endif + +#define _PSMI_API_NAME(FN) __ ## FN +#define _PSMI_API_STR(FN) _STRINGIFY(__ ## FN) +#define PSMI_API_DECL(FN) \ + typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN)))); + +#define PSMI_ERR_UNLESS_INITIALIZED(ep) do { \ + if (!psmi_isinitialized()) \ + return psmi_handle_error(ep, PSM_INIT_NOT_INIT, \ + "PSM has not been initialized"); \ + } while (0) + + +#define PSMI_CHECKMEM(err,mem) do { \ + if ((mem) == NULL) { \ + (err) = PSM_NO_MEMORY; \ + goto fail; \ + } \ + } while (0) + +#define PSMI_CACHEALIGN __attribute__((aligned(64))) + +/* Easy way to ignore the OK_NO_PROGRESS case */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_err_only(psm_error_t err)) +{ + if (err > PSM_OK_NO_PROGRESS) + return err; + else + return PSM_OK; +} + +#ifdef min +#undef min +#endif +#define min(a,b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a,b) ((a) > (b) ? (a) : (b)) + +#define SEC_ULL 1000000000ULL +#define MSEC_ULL 1000000ULL +#define USEC_ULL 1000ULL +#define NSEC_ULL 1ULL + +#define PSMI_TRUE 1 +#define PSMI_FALSE 0 + +#define PSMI_CYCLES_TO_SECSF(cycles) \ + ((double) cycles_to_nanosecs(cycles) / 1.0e9) + +#define PSMI_PAGESIZE psmi_getpagesize() +#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0) +#define PSMI_ALIGNDOWN(p,P) (((uintptr_t)(p))&~((uintptr_t)((P)-1))) +#define PSMI_ALIGNUP(p,P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)),(P))) + +#define PSMI_MAKE_DRIVER_VERSION(major,minor) ((major)<<16 | ((minor) & 0xffff)) + +#define PSMI_STRICT_SIZE_DECL(member,sz) static const size_t __psm_ss_ ## member = sz +#define PSMI_STRICT_SIZE_VERIFY(member,sz) do { \ + if (__psm_ss_ ## member != (sz)) { \ + char errmsg[64]; \ + snprintf(errmsg,32, "Internal error: %s " \ + "size doesn't match expected %d bytes", \ + STRINGIFY(member), (int) __psm_ss_ ## member); \ + exit(-1); \ + } \ + } while (0) + + +#endif /* _PSMI_HELP_H */ diff --git a/psm_lock.h b/psm_lock.h new file mode 100644 index 0000000..9ad3df6 --- /dev/null +++ b/psm_lock.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_lock.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_LOCK_H +#define _PSMI_LOCK_H + +#ifndef PSMI_USE_PTHREAD_SPINLOCKS + #if defined(__powerpc__) + #define PSMI_USE_PTHREAD_SPINLOCKS 1 + #else + #define PSMI_USE_PTHREAD_SPINLOCKS 0 + #endif +#endif + +#if PSMI_USE_PTHREAD_SPINLOCKS + typedef pthread_spinlock_t psmi_spinlock_t; + + #define psmi_spin_init(lock) pthread_spin_init(lock,0) + #define psmi_spin_lock(lock) pthread_spin_lock(lock) + #define psmi_spin_trylock(lock) pthread_spin_trylock(lock) + #define psmi_spin_unlock(lock) pthread_spin_unlock(lock) +#else + typedef ips_atomic_t psmi_spinlock_t; + #define PSMI_SPIN_LOCKED 1 + #define PSMI_SPIN_UNLOCKED 0 + + PSMI_ALWAYS_INLINE( + int + psmi_spin_init(psmi_spinlock_t *lock)) { + ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_trylock(psmi_spinlock_t *lock)) { + if (ips_atomic_cmpxchg(lock,PSMI_SPIN_UNLOCKED,PSMI_SPIN_LOCKED) + == PSMI_SPIN_UNLOCKED) + return 0; + else + return EBUSY; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_lock(psmi_spinlock_t *lock)) { + while (psmi_spin_trylock(lock) == EBUSY) + {} + return 0; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_unlock(psmi_spinlock_t *lock)) { + atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; + } +#endif /* PSMI_USE_PTHREAD_SPINLOCKS */ + +#endif /* _PSMI_LOCK_H */ diff --git a/psm_memcpy.c b/psm_memcpy.c new file mode 100644 index 0000000..cee165f --- /dev/null +++ b/psm_memcpy.c @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <assert.h>
+
+/* Bug in 2.4 compiler that prevents this file from compiling.
+ * Hardcode memcpyo to psmi_mq_mtucpy (uses ipath_dwordcpy).
+ */
+#if (WORDSIZE != 64) || defined(__powerpc__) || \
+    (defined(__PATHCC__) && __PATHCC__ == 2 && __PATHCC_MINOR__ == 4)
+extern void psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars);
+
+void *psmi_memcpyo(void *dst, const void *src, size_t n)
+{
+    psmi_mq_mtucpy(dst,src,n);
+    return dst;
+}
+#else
+#error "psmi_memcpyo() does not use psmi_mq_mtucpy()"
+#include <emmintrin.h>
+
+#define OPTERON_L1_CACHE_BYTES 65536
+#define OPTERON_L2_CACHE_BYTES 1048576
+
+static inline size_t __memcpy_pathscale_opteron_sse2
+    (uint8_t *d, const uint8_t *s, size_t n) __attribute__ ((always_inline));
+
+static inline size_t __memcpy_pathscale_opteron_sse2
+    (uint8_t *d, const uint8_t *s, size_t n)
+{
+    assert(n >= 16);
+    /* align destination up to 16 bytes */
+    size_t i;
+    size_t align = (16 - (((uintptr_t) d) & 0xf)) & 0xf;
+    if (align != 0) {
+        for (i = 0; i < align; i++) {
+            d[i] = s[i];
+        }
+        d += align;
+        s += align;
+        n -= align;
+    }
+
+    __m128i *dp = (__m128i *) d;
+    __m128i const *sp = (__m128i const *) s;
+
+    if ((((uintptr_t) sp) & 0xf) == 0x0) {
+        /* source and destination are both 16 byte aligned */
+        if (n < (OPTERON_L2_CACHE_BYTES >> 2)) {
+            size_t count = n >> 7;
+            for (i = 0; i < count; i++) {
+                _mm_prefetch(((const char *) sp) + 512, _MM_HINT_NTA);
+                _mm_prefetch(((const char *) sp) + 576, _MM_HINT_NTA);
+                __m128i tmp0 = _mm_load_si128(sp);
+                __m128i tmp1 = _mm_load_si128(sp + 1);
+                __m128i tmp2 = _mm_load_si128(sp + 2);
+                __m128i tmp3 = _mm_load_si128(sp + 3);
+                __m128i tmp4 = _mm_load_si128(sp + 4);
+                __m128i tmp5 = _mm_load_si128(sp + 5);
+                __m128i tmp6 = _mm_load_si128(sp + 6);
+                __m128i tmp7 = _mm_load_si128(sp + 7);
+                _mm_store_si128(dp, tmp0);
+                _mm_store_si128(dp + 1, tmp1);
+                _mm_store_si128(dp + 2, tmp2);
+                _mm_store_si128(dp + 3, tmp3);
+                _mm_store_si128(dp + 4, tmp4);
+                _mm_store_si128(dp + 5, tmp5);
+                _mm_store_si128(dp + 6, tmp6);
+                _mm_store_si128(dp + 7, tmp7);
+                sp += 8;
+                dp += 8;
+            }
+            return align + (count << 7);
+        }
+        else {
+            size_t count = n >> 7;
+            for (i = 0; i < count; i++) {
+                _mm_prefetch(((const char *) sp) + 768, _MM_HINT_NTA);
+                _mm_prefetch(((const char *) sp) +
832, _MM_HINT_NTA); + __m128i tmp0 = _mm_load_si128(sp); + __m128i tmp1 = _mm_load_si128(sp + 1); + __m128i tmp2 = _mm_load_si128(sp + 2); + __m128i tmp3 = _mm_load_si128(sp + 3); + __m128i tmp4 = _mm_load_si128(sp + 4); + __m128i tmp5 = _mm_load_si128(sp + 5); + __m128i tmp6 = _mm_load_si128(sp + 6); + __m128i tmp7 = _mm_load_si128(sp + 7); + _mm_stream_si128(dp, tmp0); + _mm_stream_si128(dp + 1, tmp1); + _mm_stream_si128(dp + 2, tmp2); + _mm_stream_si128(dp + 3, tmp3); + _mm_stream_si128(dp + 4, tmp4); + _mm_stream_si128(dp + 5, tmp5); + _mm_stream_si128(dp + 6, tmp6); + _mm_stream_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + } + else { + /* only destination is 16 byte aligned - use unaligned loads */ + if (n < (OPTERON_L2_CACHE_BYTES >> 2)) { + size_t count = n >> 7; + for (i = 0; i < count; i++) { + _mm_prefetch(((const char *) sp) + 512, _MM_HINT_NTA); + _mm_prefetch(((const char *) sp) + 576, _MM_HINT_NTA); + __m128i tmp0 = _mm_loadu_si128(sp); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + __m128i tmp4 = _mm_loadu_si128(sp + 4); + __m128i tmp5 = _mm_loadu_si128(sp + 5); + __m128i tmp6 = _mm_loadu_si128(sp + 6); + __m128i tmp7 = _mm_loadu_si128(sp + 7); + _mm_store_si128(dp, tmp0); + _mm_store_si128(dp + 1, tmp1); + _mm_store_si128(dp + 2, tmp2); + _mm_store_si128(dp + 3, tmp3); + _mm_store_si128(dp + 4, tmp4); + _mm_store_si128(dp + 5, tmp5); + _mm_store_si128(dp + 6, tmp6); + _mm_store_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + else { + size_t count = n >> 7; + for (i = 0; i < count; i++) { + /* 2 x 64 bytes of prefetch matches 8 x 16 bytes of load/store */ + /* The prefetch distance was tuned empirically */ + _mm_prefetch(((const char *) sp) + 768, _MM_HINT_NTA); + _mm_prefetch(((const char *) sp) + 832, _MM_HINT_NTA); + __m128i tmp0 = _mm_loadu_si128(sp); + _mm_stream_si128(dp, tmp0); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + __m128i tmp4 = _mm_loadu_si128(sp + 4); + __m128i tmp5 = _mm_loadu_si128(sp + 5); + __m128i tmp6 = _mm_loadu_si128(sp + 6); + __m128i tmp7 = _mm_loadu_si128(sp + 7); + _mm_stream_si128(dp + 1, tmp1); + _mm_stream_si128(dp + 2, tmp2); + _mm_stream_si128(dp + 3, tmp3); + _mm_stream_si128(dp + 4, tmp4); + _mm_stream_si128(dp + 5, tmp5); + _mm_stream_si128(dp + 6, tmp6); + _mm_stream_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + } + return 0; /* unreachable */ +} + +void *psmi_memcpyo(void *dst, const void *src, size_t n) +{ + uint8_t *d = (uint8_t *) dst; + const uint8_t *s = (uint8_t *) src; + + /* Smaller copies are detected and handled first since they are + * the most latency sensitive. Larger copies can have residual + * parts left at the end that are smaller than the unrolled loop. + * I use an outer do-loop to allow these cases to loop around to + * the smaller copy code. 
*/ + + do { + if (n < 16) { + switch (n) { + case 0: { + return dst; + } + case 1: { + * (uint8_t *) d = * (const uint8_t *) s; + return dst; + } + case 2: { + * (uint16_t *) d = * (const uint16_t *) s; + return dst; + } + case 4: { + * (uint32_t *) d = * (const uint32_t *) s; + return dst; + } + case 8: { + * (uint64_t *) d = * (const uint64_t *) s; + return dst; + } + default: { + if (n & 0x8) { + * (uint64_t *) d = * (const uint64_t *) s; + d += 8; + s += 8; + } + if (n & 0x4) { + * (uint32_t *) d = * (const uint32_t *) s; + d += 4; + s += 4; + } + if (n & 0x2) { + * (uint16_t *) d = * (const uint16_t *) s; + d += 2; + s += 2; + } + if (n & 0x1) { + * (uint8_t *) d = * (const uint8_t *) s; + } + return dst; + } + } + } + else if (n < 64) { + uint64_t *dp = (uint64_t *) d; + const uint64_t *sp = (const uint64_t *) s; + size_t count = n >> 3; + size_t i; + /* ideally would like to tell compiler not to unroll this loop further */ + for (i = 0; i < count - 1; i += 2) { + uint64_t tmp0 = sp[i]; + uint64_t tmp1 = sp[i + 1]; + dp[i] = tmp0; + dp[i + 1] = tmp1; + } + size_t bytes = i << 3; + if (n == bytes) { + return dst; /* short-cut to return */ + } + d += bytes; + s += bytes; + n -= bytes; + } + else if (n < OPTERON_L1_CACHE_BYTES) { + /* align destination up to 8 bytes */ + size_t i; + size_t a = 8 - (((uintptr_t) d) & 0x7); + if (a != 8) { + for (i = 0; i < a; i++) { + d[i] = s[i]; + } + d += a; + s += a; + n -= a; + } + uint64_t *dp = (uint64_t *) d; + const uint64_t *sp = (const uint64_t *) s; + size_t count = n >> 6; + if (count > 0) { + i = count; + do { + uint64_t tmp0 = sp[0]; + uint64_t tmp1 = sp[1]; + uint64_t tmp2 = sp[2]; + uint64_t tmp3 = sp[3]; + dp[0] = tmp0; + dp[1] = tmp1; + dp[2] = tmp2; + dp[3] = tmp3; + uint64_t tmp4 = sp[4]; + uint64_t tmp5 = sp[5]; + uint64_t tmp6 = sp[6]; + uint64_t tmp7 = sp[7]; + dp[4] = tmp4; + dp[5] = tmp5; + dp[6] = tmp6; + dp[7] = tmp7; + __asm__("lea 64(%0),%0\n" : "+r"(sp)); /* was sp += 64 */ + __asm__("lea 64(%0),%0\n" : "+r"(dp)); /* was dp += 64 */ + i--; + } while (i > 0); + } + size_t bytes = count << 6; + if (n == bytes) { + return dst; /* short-cut to return */ + } + d += bytes; + s += bytes; + n -= bytes; + } +#if 0 /* performance of rep movsq appears to be unpredictable */ + else if (n < OPTERON_L1_CACHE_BYTES) { + size_t count = n >> 3; + __asm__ ("rep movsq\n" : + "+D" (d), "+S" (s), "+c" (count) : : "memory"); + size_t bytes = count << 3; + d += bytes; + s += bytes; + n -= bytes; + } +#endif + else { + size_t bytes = __memcpy_pathscale_opteron_sse2(d, s, n); + assert(bytes > 0); + d += bytes; + s += bytes; + n -= bytes; + } + } while (n > 0); + + return dst; +} +#endif diff --git a/psm_mpool.c b/psm_mpool.c new file mode 100644 index 0000000..6aadd9a --- /dev/null +++ b/psm_mpool.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psm_user.h"
+
+#define PSMI_MPOOL_ALIGNMENT 64
+
+struct mpool_element {
+ union {
+ SLIST_ENTRY(mpool_element) me_next;
+ mpool_t me_mpool;
+ };
+
+ uint32_t me_gen_count;
+ uint32_t me_index;
+#ifdef PSM_DEBUG
+ uint32_t me_isused;
+#endif
+} __attribute__ ((aligned(8)));
+
+#ifdef PSM_DEBUG
+# define me_mark_used(me) ((me)->me_isused = 1)
+# define me_mark_unused(me) ((me)->me_isused = 0)
+#else
+# define me_mark_used(me)
+# define me_mark_unused(me)
+#endif
+
+struct mpool {
+ int mp_type;
+ int mp_flags;
+ int mp_vector_shift;
+
+ uint32_t mp_elm_vector_size;
+ uint32_t mp_elm_offset;
+ uint32_t mp_num_obj;
+ uint32_t mp_num_obj_inuse;
+ uint32_t mp_elm_size;
+ uint32_t mp_obj_size;
+ uint32_t mp_num_obj_per_chunk;
+ uint32_t mp_num_obj_max_total;
+ psmi_memtype_t mp_memtype;
+
+ SLIST_HEAD(, mpool_element) mp_head;
+ struct mpool_element ** mp_elm_vector;
+ struct mpool_element ** mp_elm_vector_free;
+ non_empty_callback_fn_t mp_non_empty_cb;
+ void * mp_non_empty_cb_context;
+
+};
+
+static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Create a memory pool that allocates objects of size
+ * <obj_size>. If more memory is needed to accommodate mpool_get()
+ * requests, the memory pool will allocate another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of objects
+ * it can allocate.
+ *
+ * <obj_size> size of each individual object
+ * <num_obj_per_chunk> number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total> total number of objects that may be allocated
+ * at any given time. Must be a power of two greater than
+ * <num_obj_per_chunk>.
+ *
+ * <flags> flags to be applied on the memory pool (ie. memory
+ * alignment)
+ *
+ * <cb> callback to be called when the memory pool has some
+ * free objects available again (after running out of them).
+ * <context> context pointer for the callback
+ *
+ * Return the mpool on success, NULL on failure.
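+ *
+ * Added illustration (not in the original source): a minimal
+ * lifecycle sketch under assumed values -- 64-byte objects, chunks of
+ * 32, at most 1024 live objects, no alignment flags and no callback.
+ * The UNDEFINED memtype is the same catch-all that psmi_mq_malloc()
+ * uses later in this patch.
+ *
+ * @verbatim
+ * mpool_t pool = psmi_mpool_create(64, 32, 1024, 0, UNDEFINED,
+ *                                  NULL, NULL);
+ * if (pool != NULL) {
+ *     void *obj = psmi_mpool_get(pool);  // NULL once 1024 are in use
+ *     if (obj != NULL)
+ *         psmi_mpool_put(obj);           // return before destroying
+ *     psmi_mpool_destroy(pool);
+ * }
+ * @endverbatim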
+ */
+mpool_t
+psmi_mpool_create(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype,
+ non_empty_callback_fn_t cb, void *context)
+{
+ mpool_t mp;
+ int s;
+ size_t hdr_size;
+
+#ifdef PSM_VALGRIND
+ /* For Valgrind we wish to define a "redzone" before and after the
+ * allocation block, so we also allocate a blank mpool_element
+ * at the end of the user's block */
+#endif
+
+ if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+ !PSMI_POWEROFTWO(num_obj_max_total) ||
+ num_obj_max_total < num_obj_per_chunk) {
+ return NULL;
+ }
+
+ mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+ if (mp == NULL) {
+ fprintf(stderr, "Failed to allocate memory for memory pool: %s\n",
+ strerror(errno));
+ return NULL;
+ }
+
+ for (s = 1; s < num_obj_per_chunk; s <<= 1)
+ mp->mp_vector_shift++;
+
+ mp->mp_flags = flags;
+ mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+ mp->mp_num_obj_max_total = num_obj_max_total;
+ mp->mp_non_empty_cb = cb;
+ mp->mp_non_empty_cb_context = context;
+
+ mp->mp_memtype = statstype;
+
+ SLIST_INIT(&mp->mp_head);
+ mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+ mp->mp_elm_vector = psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+ sizeof(struct mpool_element *));
+ if (mp->mp_elm_vector == NULL) {
+ fprintf(stderr, "Failed to allocate memory for memory pool vector: "
+ "%s\n", strerror(errno));
+ psmi_free(mp);
+ return NULL;
+ }
+
+ mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+ if (flags & PSMI_MPOOL_ALIGN) {
+ /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+ * boundary. */
+ hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+ PSMI_MPOOL_ALIGNMENT);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+
+ mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+ } else {
+ hdr_size = sizeof(struct mpool_element);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+ mp->mp_elm_offset = 0;
+ }
+
+ if (psmi_mpool_allocate_chunk(mp) != PSM_OK) {
+ psmi_mpool_destroy(mp);
+ return NULL;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */, PSM_VALGRIND_MEM_UNDEFINED);
+
+ return mp;
+}
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp> memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *
+psmi_mpool_get(mpool_t mp)
+{
+ struct mpool_element *me;
+ void *obj;
+
+ if (SLIST_EMPTY(&mp->mp_head)) {
+ if (psmi_mpool_allocate_chunk(mp) != PSM_OK)
+ return NULL;
+ }
+
+ me = SLIST_FIRST(&mp->mp_head);
+ SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+ psmi_assert(!me->me_isused);
+ me_mark_used(me);
+
+ /* store a backpointer to the memory pool */
+ me->me_mpool = mp;
+ mp->mp_num_obj_inuse++;
+ psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+ obj = (void *) ((uintptr_t) me + sizeof(struct mpool_element));
+ VALGRIND_MEMPOOL_ALLOC(mp, obj, mp->mp_obj_size);
+ return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj> object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem. This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
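+ *
+ * Added note: me_gen_count is incremented on every put, so a caller
+ * that cached an index/generation pair can later detect that its
+ * object was recycled. Illustrative sketch only; obj and mp are
+ * assumed to come from this pool:
+ *
+ * @verbatim
+ * uint32_t idx, gen;
+ * psmi_mpool_get_obj_index_gen_count(obj, &idx, &gen);
+ * // ... obj may be put and handed out again here ...
+ * void *cur = psmi_mpool_find_obj_by_index(mp, idx);
+ * if (cur != NULL && psmi_mpool_get_obj_gen_count(cur) == gen) {
+ *     // still the same incarnation of the object
+ * }
+ * @endverbatim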
+ */
+void
+psmi_mpool_put(void *obj)
+{
+ struct mpool_element *me;
+ int was_empty;
+ mpool_t mp;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+ me->me_gen_count++;
+
+ mp = me->me_mpool;
+
+ psmi_assert(mp != NULL);
+ psmi_assert(mp->mp_num_obj_inuse >= 0);
+ psmi_assert(me->me_isused);
+ me_mark_unused(me);
+
+ was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+ SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+ mp->mp_num_obj_inuse--;
+
+ VALGRIND_MEMPOOL_FREE(mp, obj);
+
+ /* tell the user that memory is available */
+ if (mp->mp_non_empty_cb && was_empty)
+ mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
+
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int
+psmi_mpool_get_obj_index(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t
+psmi_mpool_get_obj_gen_count(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+ uint32_t *gen_count)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ *index = me->me_index;
+ *gen_count = me->me_gen_count;
+ return 0;
+}
+
+/**
+ * psmi_mpool_find_obj_by_index()
+ *
+ * <mp> memory pool
+ * <index> index of the object
+ *
+ * Returns the object located at <index> in the memory pool or NULL if the
+ * <index> is invalid.
+ */
+void *
+psmi_mpool_find_obj_by_index(mpool_t mp, int index)
+{
+ struct mpool_element *me;
+
+ if_pf (index < 0 || index >= mp->mp_num_obj)
+ return NULL;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
+ (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
+ mp->mp_elm_offset);
+
+ /* If this mpool doesn't require generation counts, it's illegal to find a
+ * freed object */
+#ifdef PSM_DEBUG
+ if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
+ psmi_assert(!me->me_isused);
+#endif
+
+ return (void *)((uintptr_t) me + sizeof(struct mpool_element));
+}
+
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp> memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory. The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
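+ *
+ * Added commentary: objects are never freed one at a time. Each entry
+ * of mp_elm_vector points at a whole chunk allocated by
+ * psmi_mpool_allocate_chunk(), which is why the implementation below
+ * reclaims the pool by walking the chunk vector rather than the free
+ * list.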
+ */
+void
+psmi_mpool_destroy(mpool_t mp)
+{
+ int i = 0;
+ size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i])
+ psmi_free(mp->mp_elm_vector[i]);
+ }
+ psmi_free(mp->mp_elm_vector);
+ nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+ VALGRIND_DESTROY_MEMPOOL(mp);
+ psmi_free(mp);
+ nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_obj_info()
+ *
+ * <mp> memory pool
+ *
+ * Returns the num-obj-per-chunk in <num_obj_per_chunk>
+ * Returns the num-obj-max-total in <num_obj_max_total>
+ */
+void
+psmi_mpool_get_obj_info(mpool_t mp, uint32_t *num_obj_per_chunk,
+ uint32_t *num_obj_max_total)
+{
+ *num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+ *num_obj_max_total = mp->mp_num_obj_max_total;
+ return;
+}
+
+static int
+psmi_mpool_allocate_chunk(mpool_t mp)
+{
+ struct mpool_element *elm;
+ void *chunk;
+ uint32_t i = 0, num_to_allocate;
+
+ num_to_allocate =
+ mp->mp_num_obj + mp->mp_num_obj_per_chunk > mp->mp_num_obj_max_total ?
+ 0 : mp->mp_num_obj_per_chunk;
+
+ psmi_assert(mp->mp_num_obj + mp->mp_num_obj_per_chunk <=
+ mp->mp_num_obj_max_total);
+
+ if (num_to_allocate == 0)
+ return PSM_NO_MEMORY;
+
+ chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
+ num_to_allocate * mp->mp_elm_size);
+ if (chunk == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool chunk: %s\n",
+ strerror(errno));
+ return PSM_NO_MEMORY;
+ }
+
+ for (i = 0; i < num_to_allocate; i++) {
+ elm = (struct mpool_element *)((uintptr_t)chunk +
+ i * mp->mp_elm_size + mp->mp_elm_offset);
+ elm->me_gen_count = 0;
+ elm->me_index = mp->mp_num_obj + i;
+#ifdef PSM_DEBUG
+ elm->me_isused = 0;
+#endif
+ SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next);
+#if 0
+ fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n",
+ (long)(mp->mp_elm_vector_free - mp->mp_elm_vector), (int) i, elm,
+ (void *)((uintptr_t) elm + sizeof(struct mpool_element)),
+ SLIST_NEXT(elm, me_next));
+#endif
+ }
+
+ psmi_assert((uintptr_t) mp->mp_elm_vector_free
+ < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size
+ * sizeof(struct mpool_element *));
+
+ mp->mp_elm_vector_free[0] = chunk;
+ mp->mp_elm_vector_free++;
+ mp->mp_num_obj += num_to_allocate;
+
+ return PSM_OK;
+}
+
+#if 0
+void
+psmi_mpool_dump(mpool_t mp)
+{
+ int i, j;
+ struct mpool_element *me;
+
+ fprintf(stderr, "Memory pool %p has %d elements per chunk.\n",
+ mp, mp->mp_num_obj_per_chunk);
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i] != NULL) {
+ fprintf(stderr, "===========================\n");
+ fprintf(stderr, "mpool chunk #%d\n", i);
+
+ for (j = 0, me = mp->mp_elm_vector[i];
+ j < mp->mp_num_obj_per_chunk;
+ j++, me = (struct mpool_element *)
+ ((uintptr_t) me + mp->mp_elm_size)) {
+ fprintf(stderr, "obj=%p index=%d gen_count=%d\n",
+ (void *) ((uintptr_t) me + sizeof(struct mpool_element)),
+ me->me_index, me->me_gen_count);
+ }
+ fprintf(stderr, "===========================\n");
+ }
+ }
+}
+#endif
diff --git a/psm_mpool.h b/psm_mpool.h
new file mode 100644
index 0000000..1567dd5
--- /dev/null
+++ b/psm_mpool.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_mpool.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef PSM_MPOOL_H +#define PSM_MPOOL_H + +/* mpool flags */ +#define PSMI_MPOOL_ALIGN_CACHE 0x1 +#define PSMI_MPOOL_ALIGN_PAGE 0x2 +#define PSMI_MPOOL_NOGENERATION 0x4 + +/* Backwards compatibility */ +#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE + +typedef void (*non_empty_callback_fn_t)(void *context); +typedef struct mpool *mpool_t; + +mpool_t psmi_mpool_create(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context); + +void psmi_mpool_destroy(mpool_t mp); +void psmi_mpool_get_obj_info(mpool_t mp, uint32_t *num_obj_per_chunk, + uint32_t *num_obj_max_total); + +void * psmi_mpool_get(mpool_t mp); +void psmi_mpool_put(void *obj); + +int psmi_mpool_get_obj_index(void *obj); +uint32_t psmi_mpool_get_obj_gen_count(void *obj); +int psmi_mpool_get_obj_index_gen_count(void *obj, + uint32_t *index, + uint32_t *gen_count); + +void * psmi_mpool_find_obj_by_index(mpool_t mp, int index); + +#endif diff --git a/psm_mq.c b/psm_mq.c new file mode 100644 index 0000000..ea2655a --- /dev/null +++ b/psm_mq.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * Functions to manipulate the expected queue in mq_ep. + */ + +/* + * ! @brief PSM exposed version to allow PTLs to match + */ + +static +psm_mq_req_t +mq_req_match_with_tagsel(psm_mq_t mq, struct mqsq *q, uint64_t tag, + uint64_t tagsel, int remove) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (!((tag ^ cur->tag) & tagsel)) { /* match! */ + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; +} + +#if 0 +/* Only for psm_mq_irecv. Currently not enabled. */ +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_req_match_with_tagsel_inline(struct mqsq *q, uint64_t tag, uint64_t tagsel)) +{ + psm_mq_req_t cur = q->first; + if (cur == NULL) + return NULL; + else if (!((cur->tag ^ tag) & tagsel)) { + if ((q->first = cur->next) == NULL) + q->lastp = &q->first; + cur->next = NULL; + return cur; + } + else + return mq_req_match_with_tagsel(q, tag, tagsel, 1); +} +#endif + +static +int +mq_req_remove_single(psm_mq_t mq, struct mqsq *q, psm_mq_req_t req) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur == req) { + if ((*curp = cur->next) == NULL) + q->lastp = curp; + cur->next = NULL; + return 1; + } + } + return 0; +} + +#if 0 + /*XXX only used with cancel, for now */ + +static +psm_mq_req_t +mq_req_match_req(struct mqsq *q, psm_mq_req_t req, int remove) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur->send_req == req) { + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; /* no match */ +} +#endif + +void +psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars) +{ +#ifdef __MIC__ + memcpy(vdest, vsrc, nchars); +#else + unsigned char *dest = (unsigned char *)vdest; + const unsigned char *src = (const unsigned char *)vsrc; + if(nchars>>2) + ipath_dwordcpy((uint32_t*) dest, (uint32_t*) src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +#endif +} + +#if 0 // defined(__x86_64__) No consumers of mtucpy safe +void +psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars) +{ + unsigned char *dest = (unsigned char *)vdest; + const unsigned char *src = (const unsigned char *)vsrc; + if(nchars>>2) + ipath_dwordcpy_safe((uint32_t*) dest, (uint32_t*) src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +} +#endif + +psm_error_t +__psm_mq_iprobe(psm_mq_t mq, uint64_t tag, uint64_t tagsel, psm_mq_status_t *status) +{ + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + 
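+ /* Added commentary: probe the unexpected queue twice -- once as-is
+ * and once after a single progress poll below -- so a message that
+ * is already in flight can still match without turning iprobe into
+ * a blocking call. */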
PSMI_PLOCK(); + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 0); + + if (req != NULL) { + PSMI_PUNLOCK(); + if (status != NULL) + mq_status_copy(req, status); + return PSM_OK; + } + + psmi_poll_internal(mq->ep, 1); + /* try again */ + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 0); + + if (req != NULL) { + PSMI_PUNLOCK(); + if (status != NULL) + mq_status_copy(req, status); + return PSM_OK; + } + PSMI_PUNLOCK(); + return PSM_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm_mq_iprobe) + +psm_error_t +__psm_mq_cancel(psm_mq_req_t *ireq) +{ + psm_mq_req_t req = *ireq; + psm_mq_t mq; + psm_error_t err = PSM_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == NULL) + return PSM_MQ_NO_COMPLETIONS; + + /* Cancelling a send is a blocking operation, and expensive. + * We only allow cancellation of rendezvous sends, consider the eager sends + * as always unsuccessfully cancelled. + */ + PSMI_PLOCK(); + + mq = req->mq; + if (MQE_TYPE_IS_RECV(req->type)) { + if (req->state == MQ_STATE_POSTED) { + int rc; + + rc = mq_req_remove_single(mq, &mq->expected_q, req); + psmi_assert_always(rc); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + err = PSM_OK; + } + else + err = PSM_MQ_NO_COMPLETIONS; + } + else { + err = psmi_handle_error(mq->ep, PSM_PARAM_ERR, + "Cannot cancel send requests (req=%p)", req); + } + + PSMI_PUNLOCK(); + + return err; +} +PSMI_API_DECL(psm_mq_cancel) + +/* This is the only PSM function that blocks. + * We handle it in a special manner since we don't know what the user's + * execution environment is (threads, oversubscribing processes, etc). + * + */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_mq_wait_inner(psm_mq_req_t *ireq, psm_mq_status_t *status, int do_lock)) +{ + psm_error_t err = PSM_OK; + + psm_mq_req_t req = *ireq; + if (req == PSM_MQ_REQINVALID) { + return PSM_OK; + } + + if (do_lock) + PSMI_PLOCK(); + + if (req->state != MQ_STATE_COMPLETE) { + psm_mq_t mq = req->mq; + + /* We'll be waiting on this req, mark it as so */ + req->type |= MQE_TYPE_WAITING; + + _IPATH_VDBG("req=%p, buf=%p, len=%d, waiting\n", + req, req->buf, req->buf_len); + + if (req->testwait_callback) { + err = req->testwait_callback(ireq, 0, status); + if (do_lock) + PSMI_PUNLOCK(); + return err; + } + + PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE); + + if (err > PSM_OK_NO_PROGRESS) + goto fail_with_lock; + else + err = PSM_OK; + } + + mq_qq_remove(&req->mq->completed_q, req); + + if (status != NULL) + mq_status_copy(req, status); + psmi_mq_req_free(req); + *ireq = PSM_MQ_REQINVALID; + + _IPATH_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n", + req, req->buf, req->buf_len, req->error_code); + +fail_with_lock: + if (do_lock) + PSMI_PUNLOCK(); + return err; +} + +psm_error_t __sendpath +__psm_mq_wait(psm_mq_req_t *ireq, psm_mq_status_t *status) +{ + PSMI_ASSERT_INITIALIZED(); + return psmi_mq_wait_inner(ireq, status, 1); +} +PSMI_API_DECL(psm_mq_wait) + +psm_error_t __sendpath +psmi_mq_wait_internal(psm_mq_req_t *ireq) +{ + return psmi_mq_wait_inner(ireq, NULL, 0); +} + +psm_error_t __sendpath +__psm_mq_test(psm_mq_req_t *ireq, psm_mq_status_t *status) +{ + psm_mq_req_t req = *ireq; + psm_error_t err = PSM_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == PSM_MQ_REQINVALID) { + return PSM_OK; + } + + if (req->state != MQ_STATE_COMPLETE) { + if (req->testwait_callback) { + PSMI_PLOCK(); + err = req->testwait_callback(ireq, 1, status); + PSMI_PUNLOCK(); + return err; + } + else + return PSM_MQ_NO_COMPLETIONS; + } + + if (status != NULL) 
+ mq_status_copy(req, status); + + PSMI_PLOCK(); + mq_qq_remove(&req->mq->completed_q, req); + psmi_mq_req_free(req); + PSMI_PUNLOCK(); + + *ireq = PSM_MQ_REQINVALID; + _IPATH_VDBG("req=%p complete, tag=%llx buf=%p, len=%d, err=%d\n", + req, (unsigned long long) req->tag, req->buf, + req->buf_len, req->error_code); + + return err; +} +PSMI_API_DECL(psm_mq_test) + +psm_error_t __sendpath +__psm_mq_isend(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm_mq_req_t *req) +{ + psm_error_t err; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + err = dest->ptlctl->mq_isend(mq, dest, flags, stag, buf, len, context, req); + PSMI_PUNLOCK(); + +#if 0 +#ifdef PSM_VALGRIND + /* If the send isn't completed yet, make sure that we mark the memory as + * unaccessible + */ + if (*req != PSM_MQ_REQINVALID && + (*req)->state != MQ_STATE_COMPLETE) + VALGRIND_MAKE_MEM_NOACCESS(buf, len); +#endif +#endif + psmi_assert(*req != NULL); + return err; +} +PSMI_API_DECL(psm_mq_isend) + +psm_error_t __sendpath +__psm_mq_send(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len) +{ + psm_error_t err; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len); + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_mq_send) + +psm_error_t __recvpath +__psm_mq_irecv(psm_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm_mq_req_t *reqo) +{ + psm_error_t err = PSM_OK; + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + /* First check unexpected Queue and remove req if found */ + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 1); + + if (req == NULL) + { + /* prepost before arrival, add to expected q */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf (req == NULL) { + err = PSM_NO_MEMORY; + goto ret; + } + + req->tag = tag; + req->tagsel = tagsel; + req->state = MQ_STATE_POSTED; + req->buf = buf; + req->buf_len = len; + req->recv_msglen = len; + req->recv_msgoff = 0; + req->context = context; + + /* Nobody should touch the buffer after it's posted */ + VALGRIND_MAKE_MEM_NOACCESS(buf, len); + + mq_sq_append(&mq->expected_q, req); + _IPATH_VDBG("buf=%p,len=%d,tag=%"PRIx64 + " tagsel=%"PRIx64" req=%p\n", + buf,len,tag, tagsel, req); + } + else { + uint32_t copysz; + req->context = context; + + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + _IPATH_VDBG("unexpected buf=%p,len=%d,tag=%"PRIx64 + " tagsel=%"PRIx64" req=%p\n", buf, len, tag, tagsel, req); + + switch (req->state) { + case MQ_STATE_COMPLETE: + if (req->buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ + copysz = mq_set_msglen(req, len, req->send_msglen); + psmi_mq_mtucpy(buf, (const void *) req->buf, copysz); + psmi_mq_sysbuf_free(mq, req->buf); + } + req->buf = buf; + req->buf_len = len; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_STATE_UNEXP: /* not done yet */ + copysz = mq_set_msglen(req, len, req->send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. 
After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + psmi_mq_mtucpy(buf, (const void *) req->buf, req->recv_msgoff); + /* What's "left" is no access */ + VALGRIND_MAKE_MEM_NOACCESS( + (void *)((uintptr_t) buf + req->recv_msgoff), len - req->recv_msgoff); + psmi_mq_sysbuf_free(mq, req->buf); + req->state = MQ_STATE_MATCHED; + req->buf = buf; + req->buf_len = len; + break; + + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + copysz = mq_set_msglen(req, len, req->send_msglen); + req->state = MQ_STATE_MATCHED; + req->buf = buf; + req->buf_len = len; + VALGRIND_MAKE_MEM_NOACCESS(buf, len); + req->recv_msgoff = 0; + req->rts_callback(req, 0); + break; + + default: + fprintf(stderr, "Unexpected state %d in req %p\n", req->state, req); + fprintf(stderr, "type=%d, mq=%p, tag=%p\n", + req->type, req->mq, (void *)(uintptr_t)req->tag); + abort(); + } + } + +ret: + PSMI_PUNLOCK(); + *reqo = req; + return err; +} +PSMI_API_DECL(psm_mq_irecv) + +psm_error_t __sendpath +__psm_mq_ipeek(psm_mq_t mq, psm_mq_req_t *oreq, psm_mq_status_t *status) +{ + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_PLOCK(); + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_PUNLOCK(); + return PSM_MQ_NO_COMPLETIONS; + } + PSMI_PUNLOCK(); + } + /* something in the queue */ + *oreq = req; + if (status != NULL) + mq_status_copy(req, status); + + return PSM_OK; +} +PSMI_API_DECL(psm_mq_ipeek) + +static +psm_error_t +psmi_mqopt_ctl(psm_mq_t mq, uint32_t key, void *value, int get) +{ + psm_error_t err = PSM_OK; + uint32_t val32; + + switch (key) { + case PSM_MQ_RNDV_IPATH_SZ: + if (get) + *((uint32_t *)value) = mq->ipath_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->ipath_thresh_rv = val32; + } + _IPATH_VDBG("RNDV_IPATH_SZ = %d (%s)\n", + mq->ipath_thresh_rv, get ? "GET" : "SET"); + break; + + case PSM_MQ_RNDV_SHM_SZ: + if (get) + *((uint32_t *)value) = mq->shm_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->shm_thresh_rv = val32; + } + _IPATH_VDBG("RNDV_SHM_SZ = %d (%s)\n", + mq->shm_thresh_rv, get ? "GET" : "SET"); + break; + + case PSM_MQ_MAX_SYSBUF_MBYTES: + if (get) + *((uint32_t *)value) = (uint32_t)(mq->max_sysbuf_bytes / 1048576); + else { + val32 = *((uint32_t *) value); + /* XXX For now, don't support this */ + /* mq->max_sysbuf_bytes = 1048576ULL * val32; */ + mq->max_sysbuf_bytes = ~(0ULL); + } + break; + + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown option key=%u", key); + break; + } + return err; +} + +psm_error_t +__psm_mq_getopt(psm_mq_t mq, int key, void *value) +{ + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + return psmi_mqopt_ctl(mq, key, value, 1); +} +PSMI_API_DECL(psm_mq_getopt) + +psm_error_t +__psm_mq_setopt(psm_mq_t mq, int key, const void *value) +{ + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + return psmi_mqopt_ctl(mq, key, (void *) value, 0); +} +PSMI_API_DECL(psm_mq_setopt) + +/* + * This is the API for the user. 
We actually allocate the MQ much earlier, but + * the user can set options after obtaining an endpoint + */ +psm_error_t +__psm_mq_init(psm_ep_t ep, uint64_t tag_order_mask, + const struct psm_optkey *opts, + int numopts, psm_mq_t *mqo) +{ + psm_error_t err = PSM_OK; + psm_mq_t mq = ep->mq; + int i; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + psmi_assert(mq != NULL); + psmi_assert(mq->ep != NULL); + + /* Process options */ + for (i = 0; err == PSM_OK && i < numopts; i++) + err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0); + if (err != PSM_OK) /* error already handled */ + goto fail; + + *mqo = mq; + +fail: + return err; +} +PSMI_API_DECL(psm_mq_init) + +psm_error_t +__psm_mq_finalize(psm_mq_t mq) +{ + psm_ep_t ep; + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + + ep = mq->ep; + do { + ep->mq = NULL; + ep = ep->mctxt_next; + } while (ep != mq->ep); + + return psmi_mq_free(mq); +} +PSMI_API_DECL(psm_mq_finalize) + +void +__psm_mq_get_stats(psm_mq_t mq, psm_mq_stats_t *stats) +{ + memcpy(stats, &mq->stats, sizeof(psm_mq_stats_t)); +} +PSMI_API_DECL(psm_mq_get_stats) + +psm_error_t +psmi_mq_malloc(psm_mq_t *mqo) +{ + psm_error_t err = PSM_OK; + + psm_mq_t mq = (psm_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm_mq)); + if (mq == NULL) { + err = psmi_handle_error(NULL, PSM_NO_MEMORY, + "Couldn't allocate memory for mq endpoint"); + goto fail; + } + + mq->ep = NULL; + mq->memmode = psmi_parse_memmode(); + mq->expected_q.first = NULL; + mq->expected_q.lastp = &mq->expected_q.first; + mq->unexpected_q.first = NULL; + mq->unexpected_q.lastp = &mq->unexpected_q.first; + mq->completed_q.first = NULL; + mq->completed_q.lastp = &mq->completed_q.first; + + mq->cur_sysbuf_bytes = 0ULL; + mq->max_sysbuf_bytes = ~(0ULL); + + /* The values are overwritten in initialize_defaults, they're just set to + * sensible defaults until then */ + mq->ipath_thresh_rv = 64000; + mq->ipath_window_rv = 131072; + mq->shm_thresh_rv = 16000; + + memset(&mq->stats, 0, sizeof(psm_mq_stats_t)); + err = psmi_mq_req_init(mq); + if (err) + goto fail; + + /* Initialize the unexpected system buffer allocator */ + psmi_mq_sysbuf_init(mq); + char buf[128]; + psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf); + _IPATH_VDBG("%s", buf); + *mqo = mq; + + return PSM_OK; +fail: + if (mq != NULL) + psmi_free(mq); + return err; +} + +psm_error_t +psmi_mq_initialize_defaults(psm_mq_t mq) +{ + union psmi_envvar_val env_rvwin, env_ipathrv, env_shmrv; + + psmi_getenv("PSM_MQ_RNDV_IPATH_THRESH", + "ipath eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->ipath_thresh_rv, &env_ipathrv); + mq->ipath_thresh_rv = env_ipathrv.e_uint; + + /* Re-evaluate this since it may have changed after initializing the shm + * device */ + mq->shm_thresh_rv = psmi_shm_mq_rv_thresh; + psmi_getenv("PSM_MQ_RNDV_SHM_THRESH", + "shm eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->shm_thresh_rv, &env_shmrv); + mq->shm_thresh_rv = env_shmrv.e_uint; + + psmi_getenv("PSM_MQ_RNDV_IPATH_WINDOW", + "ipath rendezvous window size", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->ipath_window_rv, &env_rvwin); + mq->ipath_window_rv = env_rvwin.e_uint; + + return PSM_OK; +} + + +psm_error_t +psmi_mq_free(psm_mq_t mq) +{ + psmi_mq_req_fini(mq); + psmi_mq_sysbuf_fini(mq); + psmi_free(mq); + return PSM_OK; +} diff --git a/psm_mq.h b/psm_mq.h new file mode 100644 index 0000000..dd90028 --- /dev/null +++ b/psm_mq.h @@ -0,0 +1,600 @@ 
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PSM_MQ_H
+#define PSM_MQ_H
+
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+
+
+/* Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * [in] ep Endpoint over which to initialize Matched Queue
+ * [in] tag_order_mask Order mask hint to let MQ know what bits of the send
+ * tag are required to maintain MQ message order. In
+ * MPI parlance, this mask sets the bits that store
+ * the context (or communicator ID). The user can
+ * choose to pass PSM_MQ_ORDERMASK_NONE or
+ * PSM_MQ_ORDERMASK_ALL to tell MQ to respectively
+ * provide no ordering guarantees or to provide
+ * ordering over all messages by ignoring the
+ * contexts of the send tags.
+ * [in] opts Set of options for Matched Queue
+ * [in] numopts Number of options passed
+ * [out] mq User-supplied storage to return the Matched Queue handle
+ * associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ * associated to an endpoint, but options are only considered the first
+ * time the function is called.
+ *
+ * [post] The user obtains a handle to an instantiated Matched Queue.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK A new Matched Queue has been instantiated across all the
+ * members of the group.
+ *
+ * @verbatim
+ * int try_open_endpoint_and_initialize_mq(
+ * psm_ep_t *ep, // endpoint handle
+ * psm_epid_t *epid, // unique endpoint ID
+ * psm_uuid_t job_uuid, // unique job uuid, for ep_open
+ * psm_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+ * uint64_t communicator_bits) // Where we store our communicator or
+ * // context bits in the 64-bit tag.
+ * {
+ * // Simplified open, see psm_ep_open documentation for more info
+ * psm_ep_open(job_uuid,
+ * NULL, // no options
+ * ep, epid);
+ *
+ * // We initialize a matched queue by telling PSM the bits that are
+ * // order-significant in the tag. Point-to-point ordering will not be
+ * // maintained between senders where the communicator bits are not the
+ * // same.
+ * psm_mq_init(ep,
+ * communicator_bits,
+ * NULL, // no other MQ options
+ * 0, // 0 options passed
+ * mq); // newly initialized matched Queue
+ *
+ * return 1;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_init(psm_ep_t ep, uint64_t tag_order_mask,
+ const struct psm_optkey *opts, int numopts, psm_mq_t *mq);
+
+#define PSM_MQ_ORDERMASK_NONE 0ULL
+ /* Used to initialize MQ and disable all MQ message ordering
+ * guarantees (this mask may prevent the use of MQ to maintain matched
+ * message envelope delivery required in MPI). */
+
+#define PSM_MQ_ORDERMASK_ALL 0xffffffffffffffffULL
+ /* Used to initialize MQ with no message ordering hints, which forces
+ * MQ to maintain order over all messages */
+
+/* Finalize (close) an MQ handle
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK A given Matched Queue has been freed and any future
+ * use of the handle produces undefined results.
+ */
+psm_error_t
+psm_mq_finalize(psm_mq_t mq);
+
+/* MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm_mq_status {
+ uint64_t msg_tag; /* Sender's original message tag (receive reqs only) */
+ uint32_t msg_length; /* Sender's original message length (receive reqs only) */
+ uint32_t nbytes; /* Actual number of bytes transferred (receive reqs only) */
+ psm_error_t error_code; /* MQ error code for communication operation */
+ void *context; /* User-associated context for send or receive */
+}
+psm_mq_status_t;
+
+/* PSM Communication handle (opaque) */
+typedef struct psm_mq_req *psm_mq_req_t;
+
+
+
+/* Get an MQ option (Deprecated. Use psm_getopt with PSM_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * [in] mq Matched Queue handle
+ * [in] option Index of option to retrieve. Possible values are:
+ * * PSM_MQ_RNDV_IPATH_SZ
+ * * PSM_MQ_RNDV_SHM_SZ
+ * * PSM_MQ_MAX_SYSBUF_MBYTES
+ *
+ * [in] value Pointer to storage that can be used to store the value of
+ * the option to be set. It is up to the user to ensure that the
+ * pointer points to a memory location large enough to accommodate
+ * the value associated to the type. Each option documents the size
+ * associated to its value.
+ *
+ * [returns] PSM_OK if option could be retrieved.
+ * [returns] PSM_PARAM_ERR if the option is not a valid option number
+ */
+psm_error_t
+psm_mq_getopt(psm_mq_t mq, int option, void *value);
+
+/* Set an MQ option (Deprecated. Use psm_setopt with PSM_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * [in] mq Matched Queue handle
+ * [in] option Index of option to retrieve.
Possible values are: + * * PSM_MQ_RNDV_IPATH_SZ + * * PSM_MQ_RNDV_SHM_SZ + * * PSM_MQ_MAX_SYSBUF_MBYTES + * + * [in] value Pointer to storage that contains the value to be updated + * for the supplied option number. It is up to the user to + * ensure that the pointer points to a memory location with a + * correct size. + * + * [returns] PSM_OK if option could be retrieved. + * [returns] PSM_PARAM_ERR if the option is not a valid option number + * [returns] PSM_OPT_READONLY if the option to be set is a read-only option + * (currently no MQ options are read-only). + */ +psm_error_t +psm_mq_setopt(psm_mq_t mq, int option, const void *value); + + + +#define PSM_MQ_FLAG_SENDSYNC 0x01 + /* MQ Send Force synchronous send */ + +#define PSM_MQ_REQINVALID ((psm_mq_req_t)(NULL)) + /* MQ request completion value */ + +/* Post a receive to a Matched Queue with tag selection criteria + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. For every MQ message received on a particular MQ, the tag and @c + * tagsel parameters are used against the incoming message's send tag as + * described in tagmatch. + * + * [in] mq Matched Queue Handle + * [in] rtag Receive tag + * [in] rtagsel Receive tag selector + * [in] flags Receive flags (None currently supported) + * [in] buf Receive buffer + * [in] len Receive buffer length + * [in] context User context pointer, available in psm_mq_status_t + * upon completion + * [out] req PSM MQ Request handle created by the preposted receive, to + * be used for explicitly controlling message receive + * completion. + * + * [post] The supplied receive buffer is given to MQ to match against incoming + * messages unless it is cancelled via psm_mq_cancel @e before any + * match occurs. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (psm_error_register_handler). + * + * [retval] PSM_OK The receive buffer has successfully been posted to the MQ. + */ +psm_error_t +psm_mq_irecv(psm_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm_mq_req_t *req); + +/* Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * [in] mq Matched Queue Handle + * [in] dest Destination EP address + * [in] flags Message flags, currently: + * * PSM_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * [in] stag Message Send Tag + * [in] buf Source buffer pointer + * [in] len Length of message starting at buf. + * + * [post] The source buffer is reusable and the send is locally complete. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (psm_error_register_handler). + * + * [retval] PSM_OK The message has been successfully sent. + */ +psm_error_t +psm_mq_send(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len); + +/* Send a non-blocking MQ message + * + * Function to initiate the send of a non-blocking MQ message, whereby the + * user guarantees that the source data will remain unmodified until the send + * is locally completed through a call such as psm_mq_wait or @ref + * psm_mq_test. 
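+ *
+ * (Added note, not in the original header) The source buffer is only
+ * reusable after the request is retired through a completion call,
+ * e.g. (mq, dest, tag, buf and len assumed declared):
+ *
+ * @verbatim
+ * psm_mq_req_t req;
+ * psm_mq_isend(mq, dest, 0, tag, buf, len, NULL, &req);
+ * psm_mq_wait(&req, NULL); // buf may be reused once this returns
+ * @endverbatim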
+ *
+ * [in] mq Matched Queue Handle
+ * [in] dest Destination EP address
+ * [in] flags Message flags, currently:
+ * * PSM_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * [in] stag Message Send Tag
+ * [in] buf Source buffer pointer
+ * [in] len Length of message starting at buf.
+ * [in] context Optional user-provided pointer available in @ref
+ * psm_mq_status_t when the send is locally completed.
+ * [out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * [post] The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either psm_mq_test or @ref
+ * psm_mq_wait.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The message has been successfully initiated.
+ *
+ * @verbatim
+ * psm_mq_req_t
+ * non_blocking_send(const psm_mq_t mq, psm_epaddr_t dest_ep,
+ * const void *buf, uint32_t len,
+ * int context_id, int send_tag, const my_request_t *req)
+ * {
+ * psm_mq_req_t req_mq;
+ * // Set up our send tag, assume that "my_rank" is global and represents
+ * // the rank of this process in the job
+ * uint64_t tag = ( ((context_id & 0xffff) << 48) |
+ * ((my_rank & 0xffff) << 32) |
+ * ((send_tag & 0xffffffff)) );
+ *
+ * psm_mq_isend(mq, dest_ep,
+ * 0, // no flags
+ * tag,
+ * buf,
+ * len,
+ * req, // this req is available in psm_mq_status_t when one
+ * // of the synchronization functions is called.
+ * &req_mq);
+ * return req_mq;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_isend(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm_mq_req_t *req);
+
+/* Try to probe if a message is received to match tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received. The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through psm_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the msg_length member of the returned status.
+ *
+ * [in] mq Matched Queue Handle
+ * [in] rtag Message receive tag
+ * [in] rtagsel Message receive tag selector
+ * [out] status Upon return, status is filled with information
+ * regarding the matching send.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The iprobe is successful and status is updated if non-NULL.
+ * [retval] PSM_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm_error_t
+psm_mq_iprobe(psm_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+ psm_mq_status_t *status);
+
+/* Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm_mq_wait or psm_mq_test.
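+ *
+ * (Added note) As implemented in psm_mq.c earlier in this patch,
+ * ipeek only reads the head of the completed queue; the request is
+ * dequeued by the later psm_mq_test/psm_mq_wait, which is why the
+ * same request stays at the head until it is retired.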
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests that are ready for completion.
+ *
+ *
+ * [in] mq Matched Queue Handle
+ * [in,out] req MQ non-blocking request
+ * [in] status Optional MQ status, can be NULL.
+ *
+ * [post] The user has ensured progress if the function returns @ref
+ * PSM_MQ_NO_COMPLETIONS
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The peek is successful and req is updated with a request
+ * ready for completion. If status is non-NULL, it is also
+ * updated.
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The peek is not successful, meaning that there are
+ * no further requests ready for completion. The
+ * contents of req and status remain
+ * unchanged.
+ * @verbatim
+ * // Example that uses psm_mq_ipeek to make progress instead of psm_poll
+ * // We return the amount of non-blocking requests that we've completed
+ * int main_progress_loop(psm_mq_t mq)
+ * {
+ * int num_completed = 0;
+ * psm_mq_req_t req;
+ * psm_mq_status_t status;
+ * psm_error_t err;
+ * my_request_t *myreq;
+ *
+ * do {
+ * err = psm_mq_ipeek(mq, &req,
+ * NULL); // No need for status in ipeek here
+ * if (err == PSM_MQ_NO_COMPLETIONS)
+ * return num_completed;
+ * else if (err != PSM_OK)
+ * goto errh;
+ * num_completed++;
+ *
+ * // We obtained 'req' at the head of the completion queue. We can
+ * // now free the request with PSM and obtain our original request
+ * // from the status' context
+ * err = psm_mq_test(&req, // will be marked as invalid
+ * &status); // we need the status
+ * myreq = (my_request_t *) status.context;
+ *
+ * // handle the completion for myreq whether myreq is a posted receive
+ * // or a non-blocking send.
+ * }
+ * while (1);
+ * }
+ * @endverbatim */
+psm_error_t
+psm_mq_ipeek(psm_mq_t mq, psm_mq_req_t *req, psm_mq_status_t *status);
+
+/* Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in mq_progress.
+ *
+ * [in,out] request MQ non-blocking request
+ * [out] status Updated if non-NULL when request successfully completes
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * [pre] Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * [post] The request is assigned the value PSM_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * [remarks]
+ * * This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * * status can be NULL, in which case no status is written upon
+ * completion.
+ * * If request is PSM_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The request is complete or the value of request was
+ * PSM_MQ_REQINVALID.
+ *
+ */
+psm_error_t
+psm_mq_wait(psm_mq_req_t *request, psm_mq_status_t *status);
+
+/* Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike psm_mq_wait, this function
+ * tests request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensures progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * [in,out] request MQ non-blocking request
+ * [out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * [pre] The user has ensured progress on the Matched Queue if @ref
+ * psm_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * [post] If the request is complete, the request is assigned the value @ref
+ * PSM_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * request is unchanged.
+ *
+ * [post] The user will ensure progress on the Matched Queue if @ref
+ * psm_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The request is complete and request is set to @ref
+ * PSM_MQ_REQINVALID or the value of request was PSM_MQ_REQINVALID
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The request is not complete and request is
+ * unchanged.
+ *
+ * @verbatim
+ * // Function that returns the first completed request in an array
+ * // of requests.
+ * void *
+ * user_testany(psm_mq_t mq, psm_mq_req_t *allreqs, int nreqs)
+ * {
+ * int i;
+ * void *context = NULL;
+ *
+ * // Ensure progress only once
+ * psm_poll(mq);
+ *
+ * // Test for at least one completion and return its context
+ * psm_mq_status_t stat;
+ * for (i = 0; i < nreqs; i++) {
+ * if (psm_mq_test(&allreqs[i], &stat) == PSM_OK) {
+ * context = stat.context;
+ * break;
+ * }
+ * }
+ * return context;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_test(psm_mq_req_t *request, psm_mq_status_t *status);
+
+/* Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm_mq_irecv. It is currently illegal to cancel a send request initiated
+ * with psm_mq_isend.
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
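+ *
+ * (Added note) The usual pattern is to try the cancel and then retire
+ * the request either way; sketch only, req assumed to hold a posted
+ * receive:
+ *
+ * @verbatim
+ * if (psm_mq_cancel(&req) == PSM_OK)
+ *     psm_mq_test(&req, NULL); // cancelled; reclaim request storage
+ * else
+ *     psm_mq_wait(&req, NULL); // already matched; let it complete
+ * @endverbatim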
+ *
+ * [post] Whether the cancel is successful or not, the user returns the
+ * request to the library by way of psm_mq_test or @ref
+ * psm_mq_wait.
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (psm_error_register_handler):
+ *
+ * [retval] PSM_OK The request could be successfully cancelled such that the
+ * preposted receive buffer could be removed from the preposted
+ * receive queue before a match occurred. The associated @c
+ * request remains unchanged and the user must still return
+ * the storage to the MQ library.
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The request could not be successfully cancelled
+ * since the preposted receive buffer has already
+ * matched an incoming message. The request
+ * remains unchanged.
+ *
+ */
+psm_error_t
+psm_mq_cancel(psm_mq_req_t *req);
+
+struct psm_mq_stats {
+ uint64_t rx_user_bytes;/* Bytes received into a matched user buffer */
+ uint64_t rx_user_num; /* Messages received into a matched user buffer */
+ uint64_t rx_sys_bytes; /* Bytes received into an unmatched system buffer */
+ uint64_t rx_sys_num; /* Messages received into an unmatched system buffer */
+
+ uint64_t tx_num; /* Total Messages transmitted (shm and ipath) */
+ uint64_t tx_eager_num; /* Messages transmitted eagerly */
+ uint64_t tx_eager_bytes; /* Bytes transmitted eagerly */
+ uint64_t tx_rndv_num; /* Messages transmitted using expected TID mechanism */
+ uint64_t tx_rndv_bytes; /* Bytes transmitted using expected TID mechanism */
+ uint64_t tx_shm_num; /* Messages transmitted (shm only) */
+ uint64_t rx_shm_num; /* Messages received through shm */
+
+ uint64_t rx_sysbuf_num; /* Number of system buffers allocated */
+ uint64_t rx_sysbuf_bytes; /* Bytes allocated for system buffers */
+
+ uint64_t _reserved[16]; /* Internally reserved for future use */
+};
+
+#define PSM_MQ_NUM_STATS 13 /* How many stats are currently used in psm_mq_stats */
+
+typedef struct psm_mq_stats psm_mq_stats_t;
+
+/* Retrieve statistics from an instantiated MQ */
+void
+psm_mq_get_stats(psm_mq_t mq, psm_mq_stats_t *stats);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
new file mode 100644
index 0000000..7c0f645
--- /dev/null
+++ b/psm_mq_internal.h
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MQ_INT_H +#define MQ_INT_H + +#include "psm_user.h" + +#define MM_FLAG_NONE 0 +#define MM_FLAG_TRANSIENT 0x1 +#define MM_NUM_OF_POOLS 7 + +typedef struct _mem_block_ctrl mem_block_ctrl; +typedef struct _mem_ctrl mem_ctrl; + +struct _mem_ctrl { + mem_block_ctrl *free_list; + uint32_t total_alloc; + uint32_t current_available; + uint32_t block_size; + uint32_t flags; + uint32_t replenishing_rate; +}; + +struct _mem_block_ctrl { + union { + mem_ctrl *mem_handler; + mem_block_ctrl *next; + }; + char _redzone[PSM_VALGRIND_REDZONE_SZ]; +}; + +typedef psm_error_t (*psm_mq_unexpected_callback_fn_t) + (psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t send_msglen, + const void *payload, uint32_t paylen); + +struct psm_mq { + psm_ep_t ep; /**> ep back pointer */ + mpool_t sreq_pool; + mpool_t rreq_pool; + + psm_mq_unexpected_callback_fn_t unexpected_callback; + struct mqsq expected_q; /**> Preposted (expected) queue */ + struct mqsq unexpected_q; /**> Unexpected queue */ + struct mqq completed_q; /**> Completed queue */ + + uint64_t cur_sysbuf_bytes; + uint64_t max_sysbuf_bytes; + uint32_t ipath_thresh_rv; + uint32_t shm_thresh_rv; + uint32_t ipath_window_rv; + int memmode; + + psm_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */ + + mem_ctrl handler_index[MM_NUM_OF_POOLS]; + int mem_ctrl_is_init; + uint64_t mem_ctrl_total_bytes; +}; + +#define MQ_IPATH_THRESH_TINY 8 +#define MQ_IPATH_THRESH_EGR_SDMA 34000 +#define MQ_IPATH_THRESH_EGR_SDMA_SQ 8192 + +#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND) +#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV) + +#define MQE_TYPE_SEND 0x1000 +#define MQE_TYPE_RECV 0x2000 +#define MQE_TYPE_FLAGMASK 0x0fff +#define MQE_TYPE_WAITING 0x0001 +#define MQE_TYPE_WAITING_PEER 0x0004 +#define MQE_TYPE_EGRLONG 0x0008 + +#define MQ_STATE_COMPLETE 0 +#define MQ_STATE_POSTED 1 +#define MQ_STATE_MATCHED 2 +#define MQ_STATE_UNEXP 3 +#define MQ_STATE_UNEXP_RV 4 +#define MQ_STATE_FREE 5 + +#define MQ_MSG_TINY 1 +#define MQ_MSG_SHORT 2 +#define MQ_MSG_LONG 3 +#define MQ_MSG_RTS 4 +#define MQ_MSG_RTS_EGR 5 +#define MQ_MSG_RTS_WAIT 6 +#define MQ_MSG_DATA 9 +#define MQ_MSG_DATA_BLK 10 +#define MQ_MSG_DATA_REQ 11 +#define MQ_MSG_DATA_REQ_BLK 12 +#define MQ_MSG_CTS_EGR 13 + +#define MQ_MSG_USER_FIRST 64 + +/* + * Descriptor allocation limits. 
 * The 'LIMITS' predefines fill in a psmi_rlimit_mpool structure + */ +#define MQ_SENDREQ_LIMITS { \ + .env = "PSM_MQ_SENDREQS_MAX", \ + .descr = "Max num of isend requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +#define MQ_RECVREQ_LIMITS { \ + .env = "PSM_MQ_RECVREQS_MAX", \ + .descr = "Max num of irecv requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +typedef psm_error_t (*mq_rts_callback_fn_t)(psm_mq_req_t req, int was_posted); +typedef psm_error_t (*mq_testwait_callback_fn_t)(psm_mq_req_t *req, int istest, + psm_mq_status_t *status); + +/* receive mq_req, the default */ +struct psm_mq_req { + struct { + psm_mq_req_t next; + psm_mq_req_t *pprev; /* used in completion queue */ + }; + uint32_t state; + uint32_t type; + psm_mq_t mq; + + /* Tag matching vars */ + uint64_t tag; + uint64_t tagsel; /* used for receives */ + + /* Some PTLs want to get notified when there's a test/wait event */ + mq_testwait_callback_fn_t testwait_callback; + + /* Buffer attached to request. May be a system buffer for unexpected + * messages or a user buffer for an expected message */ + uint8_t *buf; + uint32_t buf_len; + uint32_t error_code; + + /* Used only for eager LONGs */ + STAILQ_ENTRY(psm_mq_req) nextq; /* used for egr-long only */ + psmi_egrid_t egrid; + psm_epaddr_t epaddr; + uint16_t msg_seqnum; /* msg seq num for mctxt */ + uint8_t tid_grant[128]; /* don't change the size unless... */ + + uint32_t recv_msglen; /* Message length we are ready to receive */ + uint32_t send_msglen; /* Message length from sender */ + uint32_t recv_msgoff; /* Message offset into buf */ + union { + uint32_t send_msgoff; /* Bytes received so far; can be larger than buf_len */ + uint32_t recv_msgposted; + }; + + /* Used for requests to send messages */ + void *context; /* user context associated with sends or receives */ + + /* Used to keep track of unexpected rendezvous */ + mq_rts_callback_fn_t rts_callback; + psm_epaddr_t rts_peer; + uint32_t rts_reqidx_peer; + uintptr_t rts_sbuf; + + /* PTLs get to store their own per-request data. MQ manages the allocation + * by allocating psm_mq_req so that ptl_req_data has enough space for all + * possible PTLs. + */ + union { + void *ptl_req_ptr; /* when used by ptl as pointer */ + uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */ + }; +}; + +void psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars); + +#if defined(__x86_64__) +void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars); +#else +#define psmi_mq_mtucpy_safe psmi_mq_mtucpy +#endif + +/* + * Optimize for 0-8 byte case, but also handle others.
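 + * The switch below relies on deliberate fall-through: a length of 5..7 + * first copies one 4-byte word and subtracts 4 before dropping into the + * 1..3 cases, which the trailing byte-copy switch finishes; lengths above + * 8 defer to psmi_mq_mtucpy().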
 + */ +PSMI_ALWAYS_INLINE( +void mq_copy_tiny(uint32_t* dest, uint32_t* src, uint8_t len) +) +{ + switch (len) { + case 8: *dest++ = *src++; + case 4: *dest++ = *src++; + case 0: return; + case 7: + case 6: + case 5: *dest++ = *src++; len -= 4; + case 3: + case 2: + case 1: break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest,src,len); + return; + } + uint8_t* dest1 = (uint8_t*) dest; + uint8_t* src1 = (uint8_t*) src; + switch(len) { + case 3: *dest1++ = *src1++; + case 2: *dest1++ = *src1++; + case 1: *dest1++ = *src1++; + } +} + +/* + * Given a req with buffer ubuf of length ubuf_len, + * fill in the req's status and return the number of bytes the request + * can receive. + * + * The function records truncation errors in the status, much as an + * MPI_Status would. + */ +PSMI_ALWAYS_INLINE( +void mq_status_copy(psm_mq_req_t req, psm_mq_status_t *status)) +{ + status->msg_tag = req->tag; + status->msg_length = req->send_msglen; + status->nbytes = req->recv_msglen; + status->error_code = req->error_code; + status->context = req->context; +} + +PSMI_ALWAYS_INLINE( +uint32_t mq_set_msglen(psm_mq_req_t req, uint32_t recvlen, uint32_t sendlen)) +{ + req->send_msglen = sendlen; + if (recvlen < sendlen) { + req->recv_msglen = recvlen; + req->error_code = PSM_MQ_TRUNCATION; + return recvlen; + } + else { + req->recv_msglen = sendlen; + req->error_code = PSM_OK; + return sendlen; + } +} + +#ifndef PSM_DEBUG + +PSMI_ALWAYS_INLINE( +void +mq_qq_append(struct mqq *q, psm_mq_req_t req)) +{ + req->next = NULL; + req->pprev = q->lastp; + *(q->lastp) = req; + q->lastp = &req->next; +} +#else +#define mq_qq_append(q,req) do { \ + (req)->next = NULL;\ + (req)->pprev = (q)->lastp;\ + *((q)->lastp) = (req); \ + (q)->lastp = &(req)->next; \ + if (q == &(req)->mq->completed_q) \ + _IPATH_VDBG("Moving (req)=%p to completed queue on %s, %d\n", (req), __FILE__, __LINE__); \ +} while (0) +#endif + +PSMI_ALWAYS_INLINE( +void +mq_sq_append(struct mqsq *q, psm_mq_req_t req)) +{ + req->next = NULL; + *(q->lastp) = req; + q->lastp = &req->next; +} + +PSMI_ALWAYS_INLINE( +void +mq_qq_remove(struct mqq *q, psm_mq_req_t req)) +{ + if (req->next != NULL) + req->next->pprev = req->pprev; + else + q->lastp = req->pprev; + *(req->pprev) = req->next; +} + +psm_error_t psmi_mq_req_init(psm_mq_t mq); +psm_error_t psmi_mq_req_fini(psm_mq_t mq); +psm_mq_req_t psmi_mq_req_alloc(psm_mq_t mq, uint32_t type); +#define psmi_mq_req_free(req) psmi_mpool_put(req) + +/* + * MQ unexpected buffer management + */ +void psmi_mq_sysbuf_init(psm_mq_t mq); +void psmi_mq_sysbuf_fini(psm_mq_t mq); +void * psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t nbytes); +void psmi_mq_sysbuf_free(psm_mq_t mq, void *); +void psmi_mq_sysbuf_getinfo(psm_mq_t mq, char *buf, size_t len); + +/* + * Main receive progress engine, for shmops and ipath, in mq.c + */ +psm_error_t psmi_mq_malloc(psm_mq_t *mqo); +psm_error_t psmi_mq_initialize_defaults(psm_mq_t mq); +psm_error_t psmi_mq_free(psm_mq_t mq); + +/* Three functions that handle all MQ stuff */ +#define MQ_RET_MATCH_OK 0 +#define MQ_RET_UNEXP_OK 1 +#define MQ_RET_UNEXP_NO_RESOURCES 2 +#define MQ_RET_DATA_OK 3 +#define MQ_RET_DATA_OUT_OF_ORDER 4 + +int psmi_mq_handle_outoforder_queue(psm_epaddr_t epaddr); +int psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, psmi_egrid_t egrid, uint32_t msglen, + const void *payload, uint32_t paylen); +int psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid,
uint32_t msglen, + const void *payload, uint32_t paylen); +int psmi_mq_handle_data(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t egrid, uint32_t offset, + const void *payload, uint32_t paylen); + +/* If rtsreq is non-NULL, it contains enough information to pull the data from + * the initiator and signal completion at a later time */ +int psmi_mq_handle_rts_outoforder(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, uint16_t msg_seqnum, + mq_rts_callback_fn_t cb, psm_mq_req_t *req_o); +int psmi_mq_handle_rts(psm_mq_t mq, uint64_t tag, uintptr_t send_buf, + uint32_t send_msglen, psm_epaddr_t peer, + mq_rts_callback_fn_t cb, psm_mq_req_t *req_o); +void psmi_mq_handle_rts_complete(psm_mq_req_t req); + +void psmi_mq_stats_register(psm_mq_t mq, mpspawn_stats_add_fn add_fn); + +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_req_match(struct mqsq *q, uint64_t tag, int remove) +) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (!((tag ^ cur->tag) & cur->tagsel)) { /* match! */ + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; /* no match */ +} + +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_ooo_match(struct mqsq *q, uint16_t msg_seqnum) +) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur->msg_seqnum == msg_seqnum) { /* match! */ + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + return cur; + } + } + return NULL; /* no match */ +} + +/* Default handler */ +int __fastpath +psmi_mq_handle_envelope_unexpected( + psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen); + +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm_mq_t mq, + psm_mq_unexpected_callback_fn_t fn); + + +PSMI_ALWAYS_INLINE( +int +psmi_mq_handle_tiny_envelope(psm_mq_t mq, psm_epaddr_t epaddr, + uint64_t tag, const void *payload, uint32_t tinylen)) +{ + psm_mq_req_t req; + uint32_t msglen; + int rc; + psmi_assert(epaddr != NULL); + + req = mq_req_match(&(mq->expected_q), tag, 1); + if (req) { /* we have a match */ + req->tag = tag; + msglen = mq_set_msglen(req, req->buf_len, tinylen); + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + _IPATH_VDBG("tiny from=%s match=YES (req=%p) mode=1 mqtag=%llu " + "msglen=%d paylen=%d\n", psmi_epaddr_get_name(epaddr->epid), req, + (unsigned long long) tag, msglen, tinylen); + rc = MQ_RET_MATCH_OK; + } + else { + rc = psmi_mq_handle_envelope_unexpected(mq, MQ_MSG_TINY, epaddr, tag, + (union psmi_egrid) 0U, tinylen, payload, tinylen); + } + return rc; +} + +PSMI_ALWAYS_INLINE( +void +psmi_mq_stats_rts_account(psm_mq_req_t req)) +{ + psm_mq_t mq = req->mq; + if (MQE_TYPE_IS_SEND(req->type)) { + mq->stats.tx_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += req->send_msglen; + } + else { + mq->stats.rx_user_num++; + mq->stats.rx_user_bytes += req->recv_msglen; + } + return; +} + +#endif diff --git a/psm_mq_recv.c b/psm_mq_recv.c new file mode 100644 
index 0000000..13a348d --- /dev/null +++ b/psm_mq_recv.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +#define psmi_mq_handle_egrdata(mq, req, epaddr) \ + do { \ + psm_mq_req_t dreq, treq; \ + dreq = STAILQ_FIRST(&epaddr->mctxt_master->egrdata); \ + while (dreq) { \ + treq = dreq; \ + dreq = STAILQ_NEXT(dreq, nextq); \ + if (treq->egrid.egr_data == req->egrid.egr_data) { \ + psmi_mq_handle_data(req, epaddr, treq->egrid.egr_data, \ + treq->recv_msgoff, treq->buf, treq->recv_msglen); \ + psmi_mq_sysbuf_free(mq, treq->buf); \ + STAILQ_REMOVE(&epaddr->mctxt_master->egrdata, \ + treq, psm_mq_req, nextq); \ + psmi_mq_req_free(treq); \ + } \ + } \ + } while (0) + +static void __recvpath +psmi_mq_req_copy(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + // recv_msglen may be changed by unexpected receive buf. 
+ uint32_t msglen_this, end; + uint8_t *msgptr = (uint8_t *)req->buf + offset; + + end = offset + nbytes; + if (end > req->recv_msglen) { + if (offset >= req->recv_msglen) msglen_this = 0; + else msglen_this = req->recv_msglen - offset; + } else { + msglen_this = nbytes; + } + + VALGRIND_MAKE_MEM_DEFINED(msgptr, msglen_this); + psmi_mq_mtucpy(msgptr, buf, msglen_this); + + if (req->recv_msgoff < end) { + req->recv_msgoff = end; + } + req->send_msgoff += nbytes; + return; +} + +int __recvpath +psmi_mq_handle_data(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t egrid, uint32_t offset, + const void *buf, uint32_t nbytes) +{ + psm_mq_t mq; + int rc; + + if (req == NULL) goto no_req; + + mq = req->mq; + if (req->state == MQ_STATE_MATCHED) + rc = MQ_RET_MATCH_OK; + else { + psmi_assert(req->state == MQ_STATE_UNEXP); + rc = MQ_RET_UNEXP_OK; + } + + psmi_assert(req->egrid.egr_data == egrid); + psmi_mq_req_copy(req, epaddr, offset, buf, nbytes); + + if (req->send_msgoff == req->send_msglen) { + if (req->type & MQE_TYPE_EGRLONG) { + STAILQ_REMOVE(&epaddr->mctxt_master->egrlong, + req, psm_mq_req, nextq); + } + + if (req->state == MQ_STATE_MATCHED) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + } + else { /* MQ_STATE_UNEXP */ + req->state = MQ_STATE_COMPLETE; + } + _IPATH_VDBG("epaddr=%s completed %d byte send, state=%d\n", + psmi_epaddr_get_name(epaddr->epid), + (int)req->send_msglen, req->state); + } + + return rc; + +no_req: + mq = epaddr->ep->mq; + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->egrid.egr_data = egrid; + req->recv_msgoff = offset; + req->recv_msglen = nbytes; + req->buf = psmi_mq_sysbuf_alloc(mq, nbytes); + psmi_mq_mtucpy(req->buf, buf, nbytes); + + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrdata, req, nextq); + + return MQ_RET_UNEXP_OK; +} + +int __recvpath +psmi_mq_handle_rts(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, mq_rts_callback_fn_t cb, + psm_mq_req_t *req_o) +{ + psm_mq_req_t req; + int rc; + + PSMI_PLOCK_ASSERT(); + + req = mq_req_match(&(mq->expected_q), tag, 1); + + if (req) { /* we have a match, no need to callback */ + (void)mq_set_msglen(req, req->buf_len, send_msglen); + req->state = MQ_STATE_MATCHED; + req->tag = tag; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + *req_o = req; /* yes match */ + rc = MQ_RET_MATCH_OK; + } + else { /* No match, keep track of callback */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->send_msglen = req->recv_msglen = send_msglen; + req->state = MQ_STATE_UNEXP_RV; + req->tag = tag; + req->rts_callback = cb; + req->recv_msgoff = 0; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + mq_sq_append(&mq->unexpected_q, req); + *req_o = req; /* no match, will callback */ + rc = MQ_RET_UNEXP_OK; + } + + _IPATH_VDBG("from=%s match=%s (req=%p) mqtag=%" PRIx64" recvlen=%d " + "sendlen=%d errcode=%d\n", psmi_epaddr_get_name(peer->epid), + rc == MQ_RET_MATCH_OK ? 
"YES" : "NO", req, req->tag, + req->recv_msglen, req->send_msglen, req->error_code); + return rc; +} + +void +psmi_mq_handle_rts_complete(psm_mq_req_t req) +{ + psm_mq_t mq = req->mq; + + /* Stats on rendez-vous messages */ + psmi_mq_stats_rts_account(req); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); +#ifdef PSM_VALGRIND + if (MQE_TYPE_IS_RECV(req->type)) + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, req->recv_msglen); + else + VALGRIND_MAKE_MEM_DEFINED(req->buf, req->buf_len); +#endif + _IPATH_VDBG("RTS complete, req=%p, recv_msglen = %d\n", + req, req->recv_msglen); + return; +} + +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm_mq_t mq, + psm_mq_unexpected_callback_fn_t fn) +{ + psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback; + mq->unexpected_callback = fn; + return old_fn; +} + +int __recvpath +psmi_mq_handle_envelope_unexpected( + psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + + /* + * Keep a callback here in case we want to fit some other high-level + * protocols over MQ (i.e. shmem). These protocols would bypass the + * normal mesage handling and go to higher-level message handlers. + */ + if (mode >= MQ_MSG_USER_FIRST && mq->unexpected_callback) { + mq->unexpected_callback(mq,mode,epaddr,tag,send_msglen,payload,paylen); + return MQ_RET_UNEXP_OK; + } + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->tag = tag; + req->recv_msgoff = 0; + req->recv_msglen = req->send_msglen = req->buf_len = msglen = send_msglen; + + _IPATH_VDBG( + "from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64 + " send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, send_msglen); +#if 0 + if (mq->cur_sysbuf_bytes+msglen > mq->max_sysbuf_bytes) { + _IPATH_VDBG("req=%p with len=%d exceeds limit of %llu sysbuf_bytes\n", + req, msglen, (unsigned long long) mq->max_sysbuf_bytes); + return MQ_RET_UNEXP_NO_RESOURCES; + } +#endif + switch (mode) { + case MQ_MSG_TINY: + if (msglen > 0) { + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + } + else + req->buf = NULL; + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_SHORT: + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->send_msgoff = 0; + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + req->state = MQ_STATE_UNEXP; + req->type |= MQE_TYPE_EGRLONG; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + mq_sq_append(&mq->unexpected_q, req); + mq->stats.rx_sys_bytes += msglen; + mq->stats.rx_sys_num++; + + return MQ_RET_UNEXP_OK; +} + +/* + * This handles the regular (i.e. 
 + +/* + * This handles regular (i.e. non-rendezvous) MPI envelopes. + */ +int __recvpath +psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + int rc; + + psmi_assert(epaddr != NULL); + + req = mq_req_match(&(mq->expected_q), tag, 1); + + if (req) { /* we have a match */ + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + req->tag = tag; + msglen = mq_set_msglen(req, req->buf_len, send_msglen); + + _IPATH_VDBG("from=%s match=YES (req=%p) mode=%x mqtag=%" + PRIx64" msglen=%d paylen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, msglen, paylen); + + switch(mode) { + case MQ_MSG_TINY: + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_SHORT: /* message fits in 1 payload */ + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->state = MQ_STATE_MATCHED; + req->type |= MQE_TYPE_EGRLONG; + req->send_msgoff = req->recv_msgoff = 0; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("exp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + + rc = MQ_RET_MATCH_OK; + if (mode == MQ_MSG_LONG) + return rc; + } + else + rc = psmi_mq_handle_envelope_unexpected(mq, mode, epaddr, tag, + egrid, send_msglen, payload, paylen); + + return rc; +} + +/* + * Note, epaddr is the master.
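 + * Each slave context stamps its messages with msg_seqnum; anything that + * arrives ahead of mctxt_recv_seqnum waits on outoforder_q and is drained + * here, in order, each time the expected sequence number advances.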
+ */ +int __recvpath +psmi_mq_handle_outoforder_queue(psm_epaddr_t epaddr) +{ + psm_mq_t mq = epaddr->ep->mq; + psm_mq_req_t ureq, ereq; + uint32_t msglen; + + next_ooo: + ureq = mq_ooo_match(&epaddr->outoforder_q, epaddr->mctxt_recv_seqnum); + if (ureq == NULL) return 0; + epaddr->mctxt_recv_seqnum++; + epaddr->outoforder_c--; + + ereq = mq_req_match(&(mq->expected_q), ureq->tag, 1); + if (ereq == NULL) { + mq_sq_append(&mq->unexpected_q, ureq); + if (epaddr->outoforder_c) goto next_ooo; + return 0; + } + + psmi_assert(MQE_TYPE_IS_RECV(ereq->type)); + ereq->tag = ureq->tag; + msglen = mq_set_msglen(ereq, ereq->buf_len, ureq->send_msglen); + + switch (ureq->state) { + case MQ_STATE_COMPLETE: + if (ureq->buf != NULL) { /* 0-byte don't alloc a sysbuf */ + psmi_mq_mtucpy(ereq->buf, + (const void *)ureq->buf, msglen); + psmi_mq_sysbuf_free(mq, ureq->buf); + } + ereq->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, ereq); + break; + case MQ_STATE_UNEXP: /* not done yet */ + ereq->type = ureq->type; + ereq->egrid = ureq->egrid; + ereq->epaddr = ureq->epaddr; + ereq->send_msgoff = ureq->send_msgoff; + ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); + psmi_mq_mtucpy(ereq->buf, + (const void *)ureq->buf, ereq->recv_msgoff); + psmi_mq_sysbuf_free(mq, ureq->buf); + ereq->state = MQ_STATE_MATCHED; + STAILQ_INSERT_AFTER(&ureq->epaddr->mctxt_master->egrlong, + ureq, ereq, nextq); + STAILQ_REMOVE(&ureq->epaddr->mctxt_master->egrlong, + ureq, psm_mq_req, nextq); + break; + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + ereq->state = MQ_STATE_MATCHED; + ereq->rts_peer = ureq->rts_peer; + ereq->rts_sbuf = ureq->rts_sbuf; + ereq->send_msgoff = 0; + ereq->rts_callback = ureq->rts_callback; + ereq->rts_reqidx_peer = ureq->rts_reqidx_peer; + ereq->type = ureq->type; + ereq->rts_callback(ereq, 0); + break; + default: + fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state, ureq); + fprintf(stderr, "type=%d, mq=%p, tag=%p\n", + ureq->type, ureq->mq, (void *)(uintptr_t)ureq->tag); + abort(); + } + + psmi_mq_req_free(ureq); + if (epaddr->outoforder_c) goto next_ooo; + return 0; +} + +int __recvpath +psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->tag = tag; + req->recv_msgoff = 0; + req->recv_msglen = req->send_msglen = req->buf_len = msglen = send_msglen; + + _IPATH_VDBG( + "from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64 + " send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, send_msglen); + switch (mode) { + case MQ_MSG_TINY: + if (msglen > 0) { + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + } + else + req->buf = NULL; + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_SHORT: + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->epaddr = epaddr; + req->send_msgoff = 0; + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + req->state = MQ_STATE_UNEXP; + req->type |= MQE_TYPE_EGRLONG; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + 
psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + + req->msg_seqnum = msg_seqnum; + mq_sq_append(&epaddr->mctxt_master->outoforder_q, req); + epaddr->mctxt_master->outoforder_c++; + mq->stats.rx_sys_bytes += msglen; + mq->stats.rx_sys_num++; + + return MQ_RET_UNEXP_OK; +} + +int __recvpath +psmi_mq_handle_rts_outoforder(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, uint16_t msg_seqnum, + mq_rts_callback_fn_t cb, + psm_mq_req_t *req_o) +{ + psm_mq_req_t req; + + PSMI_PLOCK_ASSERT(); + + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->send_msglen = req->recv_msglen = send_msglen; + req->state = MQ_STATE_UNEXP_RV; + req->tag = tag; + req->rts_callback = cb; + req->recv_msgoff = 0; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + req->msg_seqnum = msg_seqnum; + mq_sq_append(&peer->mctxt_master->outoforder_q, req); + peer->mctxt_master->outoforder_c++; + *req_o = req; /* no match, will callback */ + + _IPATH_VDBG("from=%s match=%s (req=%p) mqtag=%" PRIx64" recvlen=%d " + "sendlen=%d errcode=%d\n", psmi_epaddr_get_name(peer->epid), + "NO", req, req->tag, + req->recv_msglen, req->send_msglen, req->error_code); + return MQ_RET_UNEXP_OK; +} + diff --git a/psm_mq_utils.c b/psm_mq_utils.c new file mode 100644 index 0000000..a1a8667 --- /dev/null +++ b/psm_mq_utils.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * + * MQ request allocator + * + */ + +psm_mq_req_t __sendpath +psmi_mq_req_alloc(psm_mq_t mq, uint32_t type) +{ + psm_mq_req_t req; + + psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND); + + if (type == MQE_TYPE_SEND) + req = psmi_mpool_get(mq->sreq_pool); + else + req = psmi_mpool_get(mq->rreq_pool); + + if_pt (req != NULL) { + /* A while ago there were issues about forgetting to zero-out parts of + * the structure, so this is left in as a debug-time option */ +#ifdef PSM_DEBUG + memset(req, 0, sizeof(struct psm_mq_req)); +#endif + req->type = type; + req->state = MQ_STATE_FREE; + req->next = NULL; + req->pprev = NULL; + req->error_code = PSM_OK; + req->mq = mq; + req->testwait_callback = NULL; + req->rts_peer = NULL; + req->ptl_req_ptr = NULL; + return req; + } + else { /* we're out of reqs */ + int issend = (type == MQE_TYPE_SEND); + uint32_t reqmax, reqchunk; + psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, + &reqchunk, &reqmax); + + psmi_handle_error(PSMI_EP_NORETURN, PSM_PARAM_ERR, + "Exhausted %d MQ %s request descriptors, which usually indicates " + "a user program error or insufficient request descriptors (%s=%d)", + reqmax, issend ? "isend" : "irecv", + issend ? "PSM_MQ_SENDREQS_MAX" : "PSM_MQ_RECVREQS_MAX", reqmax); + return NULL; + } +} + +psm_error_t +psmi_mq_req_init(psm_mq_t mq) +{ + psm_mq_req_t warmup_req; + psm_error_t err = PSM_OK; + + _IPATH_VDBG("mq element sizes are %d bytes\n", + (int) sizeof(struct psm_mq_req)); + + /* + * Send MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + + if ((mq->sreq_pool = psmi_mpool_create(sizeof(struct psm_mq_req), + chunksz, maxsz, 0, DESCRIPTORS, + NULL, NULL)) == NULL) + { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* + * Receive MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + + if ((mq->rreq_pool = + psmi_mpool_create(sizeof(struct psm_mq_req), chunksz, maxsz, 0, + DESCRIPTORS, NULL, NULL)) == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* Warm up the allocators */ + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + +fail: + return err; +} + +psm_error_t +psmi_mq_req_fini(psm_mq_t mq) +{ + psmi_mpool_destroy(mq->rreq_pool); + psmi_mpool_destroy(mq->sreq_pool); + return PSM_OK; +} + +/* + * + * System buffer (unexpected message) allocator + * + */ + +#if 0 +/* There's a version with a basic wrapper around malloc, as a backup */ +void * +psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t nbytes) +{ + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += nbytes; + return malloc(nbytes); +} + +void +psmi_mq_sysbuf_free(psm_mq_t mq, void *ptr) +{ + free(ptr); +} + +#else
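 + +/* + * The allocator below keeps MM_NUM_OF_POOLS size-binned free lists (256 + * bytes up to 8192, plus a final catch-all bin). Fixed-size bins are + * replenished in batches of replenishing_rate blocks; the catch-all bin is + * flagged MM_FLAG_TRANSIENT and is simply malloc'd and freed per request. + */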
 + +void psmi_mq_sysbuf_init(psm_mq_t mq) +{ + int i; + uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1}; + uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0}; + + if (mq->mem_ctrl_is_init) + return; + mq->mem_ctrl_is_init = 1; + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + mq->handler_index[i].block_size = block_sizes[i]; + mq->handler_index[i].current_available = 0; + mq->handler_index[i].free_list = NULL; + mq->handler_index[i].total_alloc = 0; + mq->handler_index[i].replenishing_rate = replenishing_rate[i]; + + if (block_sizes[i] == -1) { + psmi_assert_always(replenishing_rate[i] == 0); + mq->handler_index[i].flags = MM_FLAG_TRANSIENT; + } + else { + psmi_assert_always(replenishing_rate[i] > 0); + mq->handler_index[i].flags = MM_FLAG_NONE; + } + } + + VALGRIND_CREATE_MEMPOOL(mq, PSM_VALGRIND_REDZONE_SZ, + PSM_VALGRIND_MEM_UNDEFINED); + + /* Hit once on each block size so we have a pool that's allocated */ + for (i=0; i < MM_NUM_OF_POOLS; i++) { + void *ptr; + if (block_sizes[i] == -1) + continue; + ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]); + psmi_mq_sysbuf_free(mq, ptr); + } +} + +void +psmi_mq_sysbuf_fini(psm_mq_t mq) // free all buffers that are currently unused +{ + mem_block_ctrl *block; + int i; + + if (mq->mem_ctrl_is_init == 0) + return; + + VALGRIND_DESTROY_MEMPOOL(mq); + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + while ((block = mq->handler_index[i].free_list) != NULL) { + mq->handler_index[i].free_list = block->next; + psmi_free(block); + } + } + mq->mem_ctrl_is_init = 0; +} + +void +psmi_mq_sysbuf_getinfo(psm_mq_t mq, char *buf, size_t len) +{ + snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n", + mq->mem_ctrl_total_bytes); + buf[len-1] = '\0'; + return; +}
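 + +/* + * Callers receive a pointer just past a mem_block_ctrl header. While a + * block is out, the header records its owning pool; while it sits on a + * free list, the same union field links to the next free block. This is + * how psmi_mq_sysbuf_free() recovers the right pool with plain pointer + * arithmetic. + */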
 + +void * +psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t alloc_size) +{ + mem_ctrl *mm_handler = mq->handler_index; + mem_block_ctrl *new_block; + int replenishing; + + /* There is a timing race with ips initialization, fix later. + * XXX */ + if (!mq->mem_ctrl_is_init) + psmi_mq_sysbuf_init(mq); + + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += alloc_size; + + while (mm_handler->block_size < alloc_size) + mm_handler++; + + replenishing = mm_handler->replenishing_rate; + + if (mm_handler->current_available == 0) { // allocate more buffers + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + uint32_t newsz = alloc_size + sizeof(mem_block_ctrl) + + PSM_VALGRIND_REDZONE_SZ; + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + + if (new_block) { + new_block->mem_handler = mm_handler; + new_block++; + mm_handler->total_alloc++; + mq->mem_ctrl_total_bytes += newsz; + VALGRIND_MEMPOOL_ALLOC(mq, new_block, alloc_size); + } + return new_block; + } + + do { + uint32_t newsz = mm_handler->block_size + sizeof(mem_block_ctrl) + + PSM_VALGRIND_REDZONE_SZ; + + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + mq->mem_ctrl_total_bytes += newsz; + + if (new_block) { + mm_handler->current_available++; + mm_handler->total_alloc++; + + new_block->next = mm_handler->free_list; + mm_handler->free_list = new_block; + } + + } while (--replenishing && new_block); + } + + if (mm_handler->current_available) { + mm_handler->current_available--; + + new_block = mm_handler->free_list; + mm_handler->free_list = new_block->next; + + new_block->mem_handler = mm_handler; + new_block++; + + VALGRIND_MEMPOOL_ALLOC(mq, new_block, mm_handler->block_size); + return new_block; + } + + return NULL; +} + +void psmi_mq_sysbuf_free(psm_mq_t mq, void * mem_to_free) +{ + mem_block_ctrl * block_to_free; + mem_ctrl *mm_handler; + + psmi_assert_always(mq->mem_ctrl_is_init); + + block_to_free = (mem_block_ctrl *)mem_to_free - 1; + mm_handler = block_to_free->mem_handler; + + VALGRIND_MEMPOOL_FREE(mq, mem_to_free); + + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + psmi_free(block_to_free); + } else { + block_to_free->next = mm_handler->free_list; + mm_handler->free_list = block_to_free; + + mm_handler->current_available++; + } + + return; +} +#endif + +/* + * Hooks to plug into QLogic MPI stats + */ + +static +void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args) +{ + uint64_t *entry = args->stats; + psm_mq_t mq = (psm_mq_t) args->context; + psm_mq_stats_t mqstats; + + psm_mq_get_stats(mq, &mqstats); + + if (args->num < 8) + return; + + entry[0] = mqstats.tx_eager_num; + entry[1] = mqstats.tx_eager_bytes; + entry[2] = mqstats.tx_rndv_num; + entry[3] = mqstats.tx_rndv_bytes; + + entry[4] = mqstats.rx_user_num; + entry[5] = mqstats.rx_user_bytes; + entry[6] = mqstats.rx_sys_num; + entry[7] = mqstats.rx_sys_bytes; +} + +void +psmi_mq_stats_register(psm_mq_t mq, mpspawn_stats_add_fn add_fn) +{ + char *desc[8]; + uint16_t flags[8]; + int i; + struct mpspawn_stats_add_args mp_add; + /* + * Hardcode flags until we correctly move mpspawn to its own repo. + * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN; + */ + for (i = 0; i < 8; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL; + + desc[0] = "Eager count sent"; + desc[1] = "Eager bytes sent"; + desc[2] = "Rendezvous count sent"; + desc[3] = "Rendezvous bytes sent"; + desc[4] = "Expected count received"; + desc[5] = "Expected bytes received"; + desc[6] = "Unexpected count received"; + desc[7] = "Unexpected bytes received"; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = 8; + mp_add.header = "MPI Statistics Summary (max,min @ rank)"; + mp_add.req_fn = psmi_mq_stats_callback; + mp_add.desc = desc; + mp_add.flags = flags; + mp_add.context = mq; + + add_fn(&mp_add); +} diff --git a/psm_noship.h b/psm_noship.h new file mode 100644 index 0000000..201af81 --- /dev/null +++ b/psm_noship.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#ifndef _PSM_NOSHIP_H_ +#define _PSM_NOSHIP_H_ + +#include "psm.h" + +typedef struct psm_epinfo { + psm_ep_t ep; + psm_epid_t epid; + psm_uuid_t uuid; + char uuid_str[64]; +} psm_epinfo_t; + +typedef struct psm_epconn { + psm_epaddr_t addr; + psm_ep_t ep; + psm_mq_t mq; +} psm_epconn_t; + +psm_error_t +psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo); + +psm_error_t +psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn); +#endif diff --git a/psm_stats.c b/psm_stats.c new file mode 100644 index 0000000..8b338ae --- /dev/null +++ b/psm_stats.c @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct psmi_stats_type { + STAILQ_ENTRY(psmi_stats_type) next; + struct psmi_stats_entry *entries; + + int num_entries; + void *heading; + uint32_t statstype; + void *context; +}; + +static STAILQ_HEAD(, psmi_stats_type) psmi_stats = + STAILQ_HEAD_INITIALIZER(psmi_stats); + +psm_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries_i, + int num_entries, + void *context) +{ + struct psmi_stats_entry *entries; + struct psmi_stats_type *type; + int i; + psm_error_t err = PSM_OK; + + entries = psmi_calloc(PSMI_EP_NONE, STATS, num_entries, sizeof(struct psmi_stats_entry)); + type = psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type)); + PSMI_CHECKMEM(err, entries); + PSMI_CHECKMEM(err, type); + + type->entries = entries; + type->num_entries = num_entries; + type->statstype = statstype; + type->context = context; + type->heading = (char *) heading; + + for (i = 0; i < num_entries; i++) { + type->entries[i].desc = entries_i[i].desc; + type->entries[i].flags = entries_i[i].flags; + type->entries[i].getfn = entries_i[i].getfn; + type->entries[i].u.val = entries_i[i].u.val; + } + + STAILQ_INSERT_TAIL(&psmi_stats, type, next); + return err; + +fail: + if (entries) psmi_free(entries); + if (type) psmi_free(type); + return err; +} + +psm_error_t +psmi_stats_deregister_all(void) +{ + struct psmi_stats_type *type; + + /* Currently our mpi still reads stats after finalize so this isn't safe + * yet */ + while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) { + STAILQ_REMOVE_HEAD(&psmi_stats, next); + psmi_free(type->entries); + psmi_free(type); + } + + return PSM_OK; +} + +static +uint32_t +typestring_to_type(const char *typestr) +{ + if (strncasecmp(typestr, "all", 4) == 0) + return PSMI_STATSTYPE_ALL; + else if (strncasecmp(typestr, "p2p", 4) == 0) + return PSMI_STATSTYPE_P2P; + else if (strncasecmp(typestr, "ipath", 6) == 0) + return PSMI_STATSTYPE_IPATH; + else if (strncasecmp(typestr, "ips", 4) == 0) + return PSMI_STATSTYPE_IPSPROTO; + else if ((strncasecmp(typestr, "intr", 5) == 0) || + (strncasecmp(typestr, "thread", 7) == 0) || + (strncasecmp(typestr, "rcvthread", 10) == 0)) + return PSMI_STATSTYPE_RCVTHREAD; + else if ((strncasecmp(typestr, "mq", 3) == 0) || + (strncasecmp(typestr, "mpi", 4) == 0)) + return PSMI_STATSTYPE_MQ; + else if ((strncasecmp(typestr, "tid", 4) == 0) || + (strncasecmp(typestr, "tids", 5) == 0)) + return PSMI_STATSTYPE_TIDS; + else if ((strncasecmp(typestr, "counter", 8) == 0) || + (strncasecmp(typestr, "counters", 9) == 0)) + return PSMI_STATSTYPE_DEVCOUNTERS; + else if (strncasecmp(typestr, "devstats", 9) == 0) + return PSMI_STATSTYPE_DEVSTATS; + else if ((strncasecmp(typestr, "memory", 7) == 0) || + (strncasecmp(typestr, "alloc", 6) == 0) || + (strncasecmp(typestr, "malloc", 7) == 0)) + return PSMI_STATSTYPE_MEMORY; + else + return 0; +} + +static +uint32_t +stats_parse_enabled_mask(const char *stats_string) +{ + char *b = (char *) stats_string; + char *e = b; + char buf[128]; + + uint32_t stats_enabled_mask = 0; + + while (*e) { + b = e; + while (*e && *e != ',' && *e != '+' && *e != '.' && + *e != '|' && *e != ':') + e++; + if (e > b) { /* something new to parse */ + int len = ((e - b) > (sizeof buf - 1)) ? 
 + (sizeof buf - 1) : (e - b); + strncpy(buf, b, len); + buf[len] = '\0'; + stats_enabled_mask |= typestring_to_type(buf); + } + if (*e) + e++; /* skip delimiter */ + } + return stats_enabled_mask; +} + +static +void +psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) +{ + const struct psmi_stats_entry *entry; + struct psmi_stats_type *type = + (struct psmi_stats_type *) args->context; + int i, num = args->num; + uint64_t *stats = args->stats; + uint64_t *c = NULL; + uint64_t *s = NULL; + + psmi_assert(num == type->num_entries); + + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || + type->statstype == PSMI_STATSTYPE_DEVSTATS) + { + int unit_id = ((psm_ep_t) type->context)->unit_id; + int portno = ((psm_ep_t) type->context)->portnum; + uintptr_t off; + uint8_t *p = NULL; + int nc, npc, ns; + int nstats = infinipath_get_stats_names_count(); + int nctrs = infinipath_get_ctrs_unit_names_count(unit_id); + int npctrs = infinipath_get_ctrs_port_names_count(unit_id); + + if (nctrs != -1 && npctrs != -1) + c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs+npctrs, + sizeof(uint64_t)); + if (nstats != -1) + s = psmi_calloc(PSMI_EP_NONE, STATS, nstats, sizeof(uint64_t)); + + /* + * If ipathfs is not loaded, we set NAN everywhere. We don't want + * stats to break just because 1 node didn't have ipath-stats + */ + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) { + nc = infinipath_get_ctrs_unit(unit_id, c, nctrs); + if (nc != -1 && nc == nctrs) + p = (uint8_t *)c; + if (nc == -1) + nc = 0; + npc = infinipath_get_ctrs_port(unit_id, portno, c+nc, npctrs); + if (!p && npc > 0 && npc == npctrs) + p = (uint8_t *)c; + } + else if (s != NULL) { + ns = infinipath_get_stats(s, nstats); + if (ns != -1) + p = (uint8_t *)s; + } + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (p) { + off = (uintptr_t) entry->u.off; + stats[i] = *((uint64_t *)(p + off)); + } + else + stats[i] = MPSPAWN_NAN_U64; + } + } + else if (type->statstype == PSMI_STATSTYPE_MEMORY) { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + stats[i] = *(uint64_t *) ((uintptr_t) &psmi_stats_memory + + (uintptr_t) entry->u.off); + } + } + else { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (entry->getfn != NULL) + stats[i] = entry->getfn(type->context); + else + stats[i] = *entry->u.val; + } + } + + if (c != NULL) + psmi_free(c); + if (s != NULL) + psmi_free(s); +} + +static +void +stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, + char *heading, + int num_entries, + struct psmi_stats_entry *entries, + mpspawn_stats_req_fn req_fn, + void *context) +{ + int i; + struct mpspawn_stats_add_args mp_add; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_entries; + mp_add.header = heading; + mp_add.req_fn = req_fn; + mp_add.context = context; + + mp_add.desc = (char **) alloca(sizeof(char *) * num_entries); + psmi_assert_always(mp_add.desc != NULL); + + mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t) * num_entries); + psmi_assert_always(mp_add.flags != NULL); + + for (i = 0; i < num_entries; i++) { + mp_add.desc[i] = (char *) entries[i].desc; + mp_add.flags[i] = entries[i].flags; + } + + /* Ignore return code, doesn't matter to *us* if register failed */ + add_fn(&mp_add); + + return; +} + +static void stats_register_ipath_counters(psm_ep_t ep); +static void stats_register_ipath_stats(psm_ep_t ep); +static void stats_register_mem_stats(psm_ep_t ep); +static psm_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args);
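 + +/* + * For illustration (the string below is an invented example): a stats-type + * string such as "mq,counters" parses to + * PSMI_STATSTYPE_MQ | PSMI_STATSTYPE_DEVCOUNTERS, while "all" enables every + * category; ',', '+', '.', '|' and ':' are all accepted as delimiters. + */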
 + +/* + * Downcall from QLogic MPI into PSM, so we can register stats + */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args) +{ + struct psmi_stats_type *type; + uint32_t statsmask; + + /* + * Args has a version string in it, but we can ignore it since mpspawn + * will decide if it supports *our* version + */ + + /* + * Eventually, parse the stats_types to add various "flavours" of stats + */ + if (args->stats_types == NULL) + return NULL; + + statsmask = stats_parse_enabled_mask(args->stats_types); + + /* MQ (MPI-level) statistics */ + if (statsmask & PSMI_STATSTYPE_MQ) + psmi_mq_stats_register(args->mq, args->add_fn); + + /* PSM and ipath level statistics */ + if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS) + stats_register_ipath_counters(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_DEVSTATS) + stats_register_ipath_stats(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_MEMORY) + stats_register_mem_stats(args->mq->ep); + + /* + * At this point all PSM and ipath-level components have registered stats + * with the PSM stats interface. We register with the mpspawn stats + * interface with an upcall in add_fn + */ + STAILQ_FOREACH(type, &psmi_stats, next) + { + if (type->statstype & statsmask) + stats_register_mpspawn_single(args->add_fn, + type->heading, + type->num_entries, + type->entries, + psmi_stats_mpspawn_callback, + type); + } + + /* + * Special handling for per-endpoint statistics + * Only MPI knows what the endpoint-addresses are in the running program, + * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how + * many endpoints it anticipates having, and PSM simply reserves that + * number of stats entries times the number of per-endpoint stats. + */ + if (statsmask & PSMI_STATSTYPE_P2P) + psmi_stats_epaddr_register(args); + + return NULL; +} + +struct stats_epaddr { + psm_ep_t ep; + mpspawn_map_epaddr_fn epaddr_map_fn; + int num_ep; + int num_ep_stats; +}; + +static +void +psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args) +{ + int i, num, off; + uint64_t *statsp; + struct stats_epaddr *stats_ctx = (struct stats_epaddr *) args->context; + psm_ep_t ep = stats_ctx->ep; + psm_epaddr_t epaddr; + + num = stats_ctx->num_ep * stats_ctx->num_ep_stats; + + /* First always NAN the entire stats request */ + for (i = 0; i < num; i++) { + if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE) + args->stats[i] = MPSPAWN_NAN; + else + args->stats[i] = MPSPAWN_NAN_U64; + } + + for (i = 0; i < stats_ctx->num_ep; i++) { + statsp = args->stats + i*stats_ctx->num_ep_stats; + off = 0; + epaddr = stats_ctx->epaddr_map_fn(i); + if (epaddr == NULL) + continue; + + /* Self */ + if (&ep->ptl_self == epaddr->ptlctl) { + if (ep->ptl_self.epaddr_stats_get != NULL) + off += ep->ptl_self.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_self.epaddr_stats_num != NULL) + off += ep->ptl_self.epaddr_stats_num(); + } + + /* Shm */ + if (&ep->ptl_amsh == epaddr->ptlctl) { + if (ep->ptl_amsh.epaddr_stats_get != NULL) + off += ep->ptl_amsh.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_amsh.epaddr_stats_num != NULL) + off += ep->ptl_amsh.epaddr_stats_num(); + } + + /* ips */ + if (&ep->ptl_ips == epaddr->ptlctl) { + if (ep->ptl_ips.epaddr_stats_get != NULL) + off += ep->ptl_ips.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_ips.epaddr_stats_num != NULL) + off += ep->ptl_ips.epaddr_stats_num(); + } + } + return; +}
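 + +/* + * Per-endpoint stats form a dense num_ep x num_ep_stats matrix: the value + * for endpoint i, statistic j lands in stats[i * num_ep_stats + j]. With, + * say, 4 endpoints and 3 per-endpoint stats (illustrative numbers only), + * endpoint 2's second statistic is stats[2 * 3 + 1]. + */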
 + +static +psm_error_t +psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) +{ + int i = 0, j; + int num_ep = args->num_epaddr; + int num_ep_stats = 0; + int nz; + char **desc, **desc_i; + uint16_t *flags, *flags_i; + char *p; + char buf[128]; + psm_ep_t ep; + struct mpspawn_stats_add_args mp_add; + struct stats_epaddr *stats_ctx; + psm_error_t err = PSM_OK; + + if (args->mq == NULL) + return PSM_OK; + ep = args->mq->ep; + + /* Figure out how many stats there are in an endpoint from all devices */ + if (ep->ptl_self.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_self.epaddr_stats_num(); + if (ep->ptl_amsh.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_amsh.epaddr_stats_num(); + if (ep->ptl_ips.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_ips.epaddr_stats_num(); + + /* Allocate desc and flags and let each device initialize their + * descriptions and flags */ + desc = psmi_malloc(ep, STATS, sizeof(char *) * num_ep_stats * (num_ep+1)); + if (desc == NULL) + return PSM_NO_MEMORY; + flags = psmi_malloc(ep, STATS, sizeof(uint16_t) * num_ep_stats * (num_ep+1)); + if (flags == NULL) { + psmi_free(desc); + return PSM_NO_MEMORY; + } + + /* Get the descriptions/flags from each device */ + i = 0; + i += ep->ptl_self.epaddr_stats_num != NULL ? + ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_amsh.epaddr_stats_num != NULL ? + ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_ips.epaddr_stats_num != NULL ? + ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0; + psmi_assert_always(i == num_ep_stats); + + /* + * Clone the descriptions for each endpoint but append "rank %d" to it + * beforehand. + */ + nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */ + (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 : + (num_ep < 100000 ? 5 : 6))))); + + desc_i = desc + num_ep_stats; + flags_i = flags + num_ep_stats; + memset(desc_i, 0, sizeof(char*)*num_ep*num_ep_stats); + + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) { + snprintf(buf, sizeof buf - 1, "<%*d> %s", nz, i, desc[j]); + buf[sizeof buf - 1] = '\0'; + p = psmi_strdup(ep, buf); + if (p == NULL) { + err = PSM_NO_MEMORY; + goto clean; + } + desc_i [i * num_ep_stats + j] = p; + flags_i[i * num_ep_stats + j] = flags[j]; + } + } + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_ep_stats * num_ep; + mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)"; + mp_add.req_fn = psmi_stats_epaddr_callback; + mp_add.desc = desc_i; + mp_add.flags = flags_i; + stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr)); + if (stats_ctx == NULL) { + err = PSM_NO_MEMORY; + goto clean; + } + stats_ctx->ep = ep; + stats_ctx->epaddr_map_fn = args->epaddr_map_fn; + stats_ctx->num_ep = num_ep; + stats_ctx->num_ep_stats = num_ep_stats; + mp_add.context = stats_ctx; + + args->add_fn(&mp_add); + +clean: + /* Now we can free all the descriptions */ + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) + if (desc_i[i * num_ep_stats + j]) psmi_free(desc_i[i * num_ep_stats + j]); + } + + psmi_free(desc); + psmi_free(flags); + + return err; +}
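 + +/* + * The registrations below export values by offset rather than by pointer: + * entry i records u.off = i * sizeof(uint64_t), and + * psmi_stats_mpspawn_callback() reads the value at that offset within the + * counter or stats array it fetches from infinipath. + */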
infinipath_get_next_name(&cnames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + for (i = nc; i < nc+npc; i++) { + entries[i].desc = infinipath_get_next_name(&pcnames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + psmi_stats_register_type("InfiniPath device counters", + PSMI_STATSTYPE_DEVCOUNTERS, + entries, + nc+npc, + ep); + +bail: + if (cnames != NULL) + psmi_free(cnames); + if (pcnames != NULL) + psmi_free(pcnames); + if (entries != NULL) + psmi_free(entries); + return; +} + +static +void +stats_register_ipath_stats(psm_ep_t ep) +{ + int i, ns; + char *snames = NULL; + struct psmi_stats_entry *entries = NULL; + + ns = infinipath_get_stats_names(&snames); + if (ns == -1 || snames == NULL) + goto bail; + entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); + if (entries == NULL) + goto bail; + + for (i = 0; i < ns; i++) { + entries[i].desc = infinipath_get_next_name(&snames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + psmi_stats_register_type("InfiniPath device statistics", + PSMI_STATSTYPE_DEVSTATS, + entries, + ns, + ep); + +bail: + if (snames != NULL) + psmi_free(snames); + if (entries != NULL) + psmi_free(entries); + return; +} + +#undef _SDECL +#define _SDECL(_desc, _param) { \ + .desc = _desc, \ + .flags = MPSPAWN_STATS_REDUCTION_ALL \ + | MPSPAWN_STATS_SKIP_IF_ZERO, \ + .getfn = NULL, \ + .u.off = offsetof(struct psmi_stats_malloc, _param) \ + } + +static +void +stats_register_mem_stats(psm_ep_t ep) +{ + struct psmi_stats_entry entries[] = { + _SDECL("Total (current)", m_all_total), + _SDECL("Total (max)", m_all_max), + _SDECL("All Peers (current)", m_perpeer_total), + _SDECL("All Peers (max)", m_perpeer_max), + _SDECL("Network Buffers (current)", m_netbufs_total), + _SDECL("Network Buffers (max)", m_netbufs_max), + _SDECL("PSM descriptors (current)", m_descriptors_total), + _SDECL("PSM descriptors (max)", m_descriptors_max), + _SDECL("Unexp. buffers (current)", m_unexpbufs_total), + _SDECL("Unexp. buffers (max)", m_unexpbufs_max), + _SDECL("Other (current)", m_undefined_total), + _SDECL("Other (max)", m_undefined_max), + }; + + psmi_stats_register_type("PSM memory allocation statistics", + PSMI_STATSTYPE_MEMORY, + entries, + PSMI_STATS_HOWMANY(entries), + ep); +} diff --git a/psm_stats.h b/psm_stats.h new file mode 100644 index 0000000..9baed27 --- /dev/null +++ b/psm_stats.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_stats.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_STATS_H +#define _PSM_STATS_H + +#include "mpspawn_stats.h" + +#define PSMI_STATSTYPE_MQ 0x00001 +#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakeups, ratio, etc. */ +#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */ +#define PSMI_STATSTYPE_TIDS 0x00400 +#define PSMI_STATSTYPE_MEMORY 0x01000 +#define PSMI_STATSTYPE_IPATH (PSMI_STATSTYPE_RCVTHREAD| \ + PSMI_STATSTYPE_IPSPROTO | \ + PSMI_STATSTYPE_MEMORY | \ + PSMI_STATSTYPE_TIDS) +#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */ +#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000 +#define PSMI_STATSTYPE_DEVSTATS 0x20000 +#define PSMI_STATSTYPE_ALL 0xfffff +#define _PSMI_STATSTYPE_DEVMASK 0xf0000 + +/* Used to determine how many stats in static array decl. */ +#define PSMI_STATS_HOWMANY(entries) \ + (sizeof(entries)/sizeof(entries[0])) + +#define PSMI_STATS_NO_HEADING NULL + +#define PSMI_STATS_DECL(_desc,_flags,_getfn,_val) \ + { .desc = _desc, \ + .flags = _flags, \ + .getfn = _getfn, \ + .u.val = _val, \ + } + +#define PSMI_STATS_DECLU64(_desc,_val) \ + PSMI_STATS_DECL(_desc, \ + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ + NULL, \ + _val) + +struct psmi_stats_entry { + const char *desc; + uint16_t flags; + uint64_t (*getfn)(void *context); /* optional fn ptr to get value */ + union { + uint64_t *val; /* where value is stored if getfn is NULL */ + uint64_t off; /* or offset if that makes more sense */ + } u; +}; + +/* + * Copy the array of entries and keep track of the context + */ +psm_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries, + int num_entries, + void *context); + +psm_error_t +psmi_stats_deregister_all(void); + +#endif /* _PSM_STATS_H */ diff --git a/psm_timer.c b/psm_timer.c new file mode 100644 index 0000000..387abd4 --- /dev/null +++ b/psm_timer.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" + +#define __timerpath __recvpath + +#if PSMI_TIMER_STATS +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++) +#else +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) +#endif + +psm_error_t +psmi_timer_init(struct psmi_timer_ctrl *ctrl) +{ + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + +#if PSMI_TIMER_STATS + ctrl->num_insertions = 0; + ctrl->num_traversals = 0; +#endif + + TAILQ_INIT(&ctrl->timerq); + return PSM_OK; +} + +void +psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context) +{ + TAILQ_NEXT(t_init, timer) = NULL; + t_init->t_timeout = 0ULL; + t_init->flags = 0; + t_init->expire_callback = expire_fn; + t_init->context = context; + return; +} + +psm_error_t +psmi_timer_fini(struct psmi_timer_ctrl *ctrl) +{ +#if PSMI_TIMER_STATS + if (ctrl->num_insertions > 0) { + _IPATH_INFO("avg elem traversals/insertion = %3.2f %%\n", + 100.0 * (double) ctrl->num_traversals / ctrl->num_insertions); + } +#endif + return PSM_OK; +} + +void __timerpath +psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire) +{ + struct psmi_timer *t_cursor; + + psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING)); + + t_insert->t_timeout = t_cyc_expire; + t_insert->flags |= PSMI_TIMER_FLAG_PENDING; + + /* + * We keep the list from oldest (head) to newest (tail), with the + * assumption that insert and remove occur much more often than search + * (when the timer expires). Newly added timers are more likely to expire + * later rather than sooner, which is why the head is older. 
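+ *
+ * Concretely, timeouts are kept in descending order from head to tail,
+ * e.g. (illustrative values, in cycles):
+ *
+ *   head [ t=900 ] -> [ t=500 ] -> [ t=120 ] tail
+ *
+ * so the soonest timeout is always TAILQ_LAST(), which is what
+ * t_cyc_next_expire caches and where psmi_timer_process_expired()
+ * starts when it walks the queue backwards.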
+ */ + PSMI_TIMER_STATS_ADD_INSERTION(ctrl); + + if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */ + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + ctrl->t_cyc_next_expire = t_cyc_expire; + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + return; + } + else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) { + TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) { + if (t_cursor->t_timeout <= t_cyc_expire) { + TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + /* Got to the end of the list -- We're the next to expire */ + ctrl->t_cyc_next_expire = t_cyc_expire; + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + return; + } + else { + TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) { + if (t_cursor->t_timeout >= t_cyc_expire) { + TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor, t_insert, timer); + ctrl->t_cyc_next_expire = min(t_cyc_expire, + ctrl->t_cyc_next_expire); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer); + /* No need to check if we inserted last, given first branch case */ + // if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) + // ctrl->t_cyc_next_expire = t_cyc_expire; + return; + } + + return; +} + +psm_error_t __timerpath +psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire) +{ + psm_error_t err = PSM_OK_NO_PROGRESS; + struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq); + + while (t_cursor) { + if (t_cursor->t_timeout > t_cyc_expire) + break; + + err = PSM_OK; + psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING); + t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer); + t_cursor->expire_callback(t_cursor, t_cyc_expire); + t_cursor = TAILQ_PREV(t_cursor, timerq, timer); + } + + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + + return err; +} + +void __timerpath +psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove) +{ + + psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING); + + t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_remove, timer); + + /* + * If we're removing the last entry, we need to reset the + * expiration cycle time. + */ + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + return; +} + + diff --git a/psm_timer.h b/psm_timer.h new file mode 100644 index 0000000..0a35c04 --- /dev/null +++ b/psm_timer.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_timer.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_TIMER_H +#define _PSMI_TIMER_H + +#include "psm_user.h" + +/* Keep timer stats */ +#define PSMI_TIMER_STATS 0 + +typedef struct psmi_timer psmi_timer; +typedef psm_error_t (*psmi_timer_expire_callback_t)(struct psmi_timer *, uint64_t); + +struct psmi_timer { + TAILQ_ENTRY(psmi_timer) timer; /* opaque */ + uint64_t t_timeout; /* opaque */ + uint8_t flags; /* opaque */ + + psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */ + void *context; /* user -- callback param */ +}; + +struct psmi_timer_ctrl { + uint64_t t_cyc_next_expire; + TAILQ_HEAD(timerq, psmi_timer) timerq; + +#if PSMI_TIMER_STATS + uint64_t num_insertions; + uint64_t num_traversals; +#endif +}; + +/* + * Some events need to be unconditionally enqueued at the beginning of the + * timerq -- they are not timers meant to expire but merely operations that + * need to be delayed. For delayed operations, there are 5 levels of + * priority. + */ +#define PSMI_TIMER_PRIO_0 0ULL +#define PSMI_TIMER_PRIO_1 1ULL +#define PSMI_TIMER_PRIO_2 2ULL +#define PSMI_TIMER_PRIO_3 3ULL +#define PSMI_TIMER_PRIO_4 4ULL +#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4 + +#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL +#define PSMI_TIMER_FLAG_PENDING 0x01 + +/* + * Timer control initialization and finalization + */ +psm_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl); +psm_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl); + +/* + * Timer entry initialization (a timer must be initialized before it can be + * added to the timer request queue). + */ + +void psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context); + +/* + * Timer requests, conditional (macro) or unconditional + */ +#define psmi_timer_request(ctrl, t_insert, t_cyc) \ + if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \ + psmi_timer_request_always((ctrl), (t_insert), (t_cyc)) + +void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire); + +/* + * Timer cancelations, conditional (macro) only (cancel_inner is internal) + */ +#define psmi_timer_cancel(ctrl, t_remove) \ + if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \ + psmi_timer_cancel_inner(ctrl, t_remove) +void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove); + +/* + * Timer processing, conditional or unconditional. + */ +#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \ + (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? 
\ + psmi_timer_process_expired(ctrl, t_cyc_expire) : \ + PSM_OK_NO_PROGRESS) + +#define psmi_timer_is_expired(ctrl, t_cyc_expire) \ + ((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) + +psm_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, + uint64_t t_cyc_expire); + +#endif /* _PSMI_TIMER_H */ diff --git a/psm_user.h b/psm_user.h new file mode 100644 index 0000000..c9aadcc --- /dev/null +++ b/psm_user.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_USER_H +#define _PSMI_USER_H + +#include <inttypes.h> +#include <pthread.h> + +#include "psm.h" +#include "psm_mq.h" + +#include "ptl.h" + +#include "ipath_user.h" +#include "ipath_queue.h" +#include "valgrind/valgrind.h" +#include "valgrind/memcheck.h" + +#define _PSMI_IN_USER_H +#include "psm_help.h" +#include "psm_error.h" +#include "psm_context.h" +#include "psm_utils.h" +#include "psm_timer.h" +#include "psm_mpool.h" +#include "psm_ep.h" +#include "psm_lock.h" +#include "psm_stats.h" +#undef _PSMI_IN_USER_H + +#define PSMI_VERNO_MAKE(major,minor) ((((major)&0xff)<<8)|((minor)&0xff)) +#define PSMI_VERNO PSMI_VERNO_MAKE(PSM_VERNO_MAJOR, PSM_VERNO_MINOR) +#define PSMI_VERNO_GET_MAJOR(verno) ( ((verno)>>8) & 0xff ) +#define PSMI_VERNO_GET_MINOR(verno) ( ((verno)>>0) & 0xff ) + +int psmi_verno_client(); +int psmi_verno_isinteroperable(uint16_t verno); +int psmi_isinitialized(); + +psm_error_t psmi_poll_internal(psm_ep_t ep, int poll_amsh); +psm_error_t psmi_mq_wait_internal(psm_mq_req_t *ireq); + +/* + * Default setting for Receive thread + * + * 0 disables rcvthread by default + * 0x1 enables ips receive thread by default + */ +#define PSMI_RCVTHREAD_FLAGS 0x1 + +/* + * Define one of these below. + * + * Spinlock gives the best performance and makes sense with the progress thread + * only because the progress thread does a "trylock" and then goes back to + * sleep in a poll. + * + * Mutexlock should be used for experimentation while the more useful + * mutexlock-debug should be enabled during development to catch potential + * errors.
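+ *
+ * In all variants the lock brackets a progress pass; a minimal sketch
+ * of the calling pattern (not a verbatim excerpt from the tree):
+ *
+ *   PSMI_PLOCK();
+ *   err = psmi_poll_internal(ep, 1);
+ *   PSMI_PUNLOCK();
+ *
+ * while the receive thread only ever uses PSMI_PLOCK_TRY() and goes
+ * back to sleep in its poll when the lock is already held.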
+ */ +#ifdef PSM_DEBUG + #define PSMI_PLOCK_IS_MUTEXLOCK_DEBUG +#else + #define PSMI_PLOCK_IS_SPINLOCK + //#define PSMI_PLOCK_IS_MUTEXLOCK + //#define PSMI_PLOCK_IS_MUTEXLOCK_DEBUG + //#define PSMI_PLOCK_IS_NOLOCK +#endif + +#ifdef PSMI_PLOCK_IS_SPINLOCK + psmi_spinlock_t psmi_progress_lock; + #define PSMI_PLOCK_INIT() psmi_spin_init(&psmi_progress_lock) + #define PSMI_PLOCK_TRY() psmi_spin_trylock(&psmi_progress_lock) + #define PSMI_PLOCK() psmi_spin_lock(&psmi_progress_lock) + #define PSMI_PUNLOCK() psmi_spin_unlock(&psmi_progress_lock) + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() + #define PSMI_PLOCK_DISABLED 0 +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutex_t psmi_progress_lock; + pthread_t psmi_progress_lock_owner; + #define PSMI_PLOCK_NO_OWNER ((pthread_t)(-1)) + + PSMI_ALWAYS_INLINE( + int _psmi_mutex_trylock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner != pthread_self(), curloc); + int ret = pthread_mutex_trylock(&psmi_progress_lock); + if (ret == 0) + psmi_progress_lock_owner = pthread_self(); + return ret; + } + + PSMI_ALWAYS_INLINE( + int _psmi_mutex_lock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner != pthread_self(), curloc); + int ret = pthread_mutex_lock(&psmi_progress_lock); + psmi_assert_always_loc(ret != EDEADLK, curloc); + psmi_progress_lock_owner = pthread_self(); + return ret; + } + + PSMI_ALWAYS_INLINE( + void _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner == pthread_self(), curloc); + psmi_progress_lock_owner = PSMI_PLOCK_NO_OWNER; + psmi_assert_always_loc( + pthread_mutex_unlock(&psmi_progress_lock) != EPERM, curloc); + return; + } + #define PSMI_PLOCK_INIT() /* static initialization */ + #define PSMI_PLOCK_TRY() \ + _psmi_mutex_trylock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PLOCK() \ + _psmi_mutex_lock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PUNLOCK() \ + _psmi_mutex_unlock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PLOCK_ASSERT() \ + psmi_assert_always(psmi_progress_lock_owner == pthread_self()); + #define PSMI_PUNLOCK_ASSERT() \ + psmi_assert_always(psmi_progress_lock_owner != pthread_self()); + + #define PSMI_PLOCK_DISABLED 0 +#elif defined (PSMI_PLOCK_IS_MUTEXLOCK) + pthread_mutex_t psmi_progress_lock; + #define PSMI_PLOCK_INIT() /* static initialization */ + #define PSMI_PLOCK_TRY() pthread_mutex_trylock(&psmi_progress_lock) + #define PSMI_PLOCK() pthread_mutex_lock(&psmi_progress_lock) + #define PSMI_PUNLOCK() pthread_mutex_unlock(&psmi_progress_lock) + #define PSMI_PLOCK_DISABLED 0 + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() +#elif defined(PSMI_PLOCK_IS_NOLOCK) + #define PSMI_PLOCK_TRY() 0 /* 0 *only* so progress thread never succeeds */ + #define PSMI_PLOCK() + #define PSMI_PUNLOCK() + #define PSMI_PLOCK_DISABLED 1 + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() +#else + #error No PLOCK lock type declared +#endif + +#define PSMI_PYIELD() \ + do { PSMI_PUNLOCK(); sched_yield(); PSMI_PLOCK(); } while (0) + +#ifdef PSM_PROFILE + void psmi_profile_block() __attribute__ ((weak)); + void psmi_profile_unblock() __attribute__ ((weak)); + void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); + + #define PSMI_PROFILE_BLOCK() psmi_profile_block() + #define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock() + #define PSMI_PROFILE_REBLOCK(noprog)
psmi_profile_reblock(noprog) +#else + #define PSMI_PROFILE_BLOCK() + #define PSMI_PROFILE_UNBLOCK() + #define PSMI_PROFILE_REBLOCK(noprog) +#endif + +#ifdef PSM_VALGRIND + #define PSM_VALGRIND_REDZONE_SZ 8 + #define PSM_VALGRIND_DEFINE_MQ_RECV(buf,posted_len,recv_len) do { \ + VALGRIND_MAKE_MEM_DEFINED((void *)(buf), (posted_len)); \ + if ((recv_len) < (posted_len)) \ + VALGRIND_MAKE_MEM_UNDEFINED( \ + (void *) ((uintptr_t) (buf) + (recv_len)), \ + (posted_len) - (recv_len)); \ + } while (0) + +#else + #define PSM_VALGRIND_REDZONE_SZ 0 + #define PSM_VALGRIND_DEFINE_MQ_RECV(buf,posted_len,recv_len) +#endif + +/* Parameters for use in valgrind's "is_zeroed" */ +#define PSM_VALGRIND_MEM_DEFINED 1 +#define PSM_VALGRIND_MEM_UNDEFINED 0 + +#endif /* _PSMI_USER_H */ diff --git a/psm_utils.c b/psm_utils.c new file mode 100644 index 0000000..c8651fe --- /dev/null +++ b/psm_utils.c @@ -0,0 +1,1278 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <netdb.h> /* gethostbyname */ +#include "psm_user.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +struct psmi_epid_table psmi_epid_table; + +/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested + */ +void +psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm_ep_t ep) +{ + itor->i = 0; + itor->ep = ep; + pthread_mutex_lock(&psmi_epid_table.tablock); +} + +void * +psmi_epid_itor_next(struct psmi_eptab_iterator *itor) +{ + int i; + struct psmi_epid_tabentry *e; + + if (itor->i >= psmi_epid_table.tabsize) + return NULL; + for (i = itor->i; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + if (!e->entry || e->entry == EPADDR_DELETED) + continue; + if (itor->ep && e->ep != itor->ep) + continue; + itor->i = i+1; + return e->entry; + } + itor->i = psmi_epid_table.tabsize; /* put at end of table */ + return NULL; +} + +void +psmi_epid_itor_fini(struct psmi_eptab_iterator *itor) +{ + pthread_mutex_unlock(&psmi_epid_table.tablock); + itor->i = 0; +} + +#define mix64(a,b,c) \ +{ \ + a -= b; a -= c; a ^= (c>>43); \ + b -= c; b -= a; b ^= (a<<9); \ + c -= a; c -= b; c ^= (b>>8); \ + a -= b; a -= c; a ^= (c>>38); \ + b -= c; b -= a; b ^= (a<<23); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>35); \ + b -= c; b -= a; b ^= (a<<49); \ + c -= a; c -= b; c ^= (b>>11); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<18); \ + c -= a; c -= b; c ^= (b>>22); \ +} + +psm_error_t +psmi_epid_init() +{ + pthread_mutexattr_t attr; + psmi_epid_table.table = NULL; + psmi_epid_table.tabsize = 0; + psmi_epid_table.tabsize_used = 0; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&psmi_epid_table.tablock, &attr); + pthread_mutexattr_destroy(&attr); + return PSM_OK; +} + +psm_error_t +psmi_epid_fini() +{ + if (psmi_epid_table.table != NULL) { + psmi_free(psmi_epid_table.table); + psmi_epid_table.table = NULL; + } + psmi_epid_table.tabsize = 0; + psmi_epid_table.tabsize_used = 0; + return PSM_OK; +} + +PSMI_ALWAYS_INLINE( +uint64_t +hash_this(const psm_ep_t ep, const psm_epid_t epid)) +{ + uint64_t ep_i = (uint64_t)(uintptr_t)ep; + uint64_t epid_i = (uint64_t) epid; + uint64_t hash = 0x9e3779b97f4a7c13LL; + mix64(ep_i,epid_i,hash); + return hash; +} + +PSMI_ALWAYS_INLINE( +void * +psmi_epid_lookup_inner(psm_ep_t ep, psm_epid_t epid, int remove)) +{ + uint64_t key = hash_this(ep, epid); + struct psmi_epid_tabentry *e; + void *entry = NULL; + int idx; + + pthread_mutex_lock(&psmi_epid_table.tablock); + if (!psmi_epid_table.table) + goto ret; + idx = (int)(key % psmi_epid_table.tabsize); + while (psmi_epid_table.table[idx].entry != NULL) { + /* An epid can be added twice if there's more than one opened endpoint, + * but really we match on epid *and* on endpoint */ + e = &psmi_epid_table.table[idx]; + if (e->entry != EPADDR_DELETED && e->key == key) + { + entry = e->entry; + if (remove) + psmi_epid_table.table[idx].entry = EPADDR_DELETED; + goto ret; + } + if (++idx == psmi_epid_table.tabsize) + idx = 0; + } +ret: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return entry; +} + +void * +psmi_epid_lookup(psm_ep_t ep, psm_epid_t epid) +{ + void *entry = psmi_epid_lookup_inner(ep, epid, 0); + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid, entry); + return entry; +} + +void * +psmi_epid_remove(psm_ep_t ep, psm_epid_t epid) +{ + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid); + return psmi_epid_lookup_inner(ep, epid, 1); +} + +psm_error_t +psmi_epid_add(psm_ep_t ep, psm_epid_t epid, void *entry) +{ + uint64_t key; + int idx, i,
newsz; + struct psmi_epid_tabentry *e; + psm_error_t err = PSM_OK; + + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid, entry); + pthread_mutex_lock(&psmi_epid_table.tablock); + /* Leave this here, mostly for sanity and for the fact that the epid + * table is currently not used in the critical path */ + if (++psmi_epid_table.tabsize_used > + (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) + { + struct psmi_epid_tabentry *newtab; + newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK; + newtab = (struct psmi_epid_tabentry *) + psmi_calloc(ep, PER_PEER_ENDPOINT, + newsz, sizeof(struct psmi_epid_tabentry)); + if (newtab == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + if (psmi_epid_table.table) { /* rehash the table */ + for (i = 0; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + if (e->entry == NULL) + continue; + /* When rehashing, mark deleted as free again */ + if (e->entry == EPADDR_DELETED) { + psmi_epid_table.tabsize_used--; + continue; + } + idx = (int)(e->key % newsz); + while (newtab[idx].entry != NULL) + if (++idx == newsz) + idx = 0; + newtab[idx].entry = e->entry; + newtab[idx].key = e->key; + newtab[idx].ep = e->ep; + newtab[idx].epid = e->epid; + } + psmi_free(psmi_epid_table.table); + } + psmi_epid_table.table = newtab; + psmi_epid_table.tabsize = newsz; + } + key = hash_this(ep, epid); + idx = (int)(key % psmi_epid_table.tabsize); + e = &psmi_epid_table.table[idx]; + while (e->entry && e->entry != EPADDR_DELETED) { + if (++idx == psmi_epid_table.tabsize) + idx = 0; + e = &psmi_epid_table.table[idx]; + } + e->entry = entry; + e->key = key; + e->epid = epid; + e->ep = ep; + +fail: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return err; +} + +char * +psmi_gethostname(void) +{ + /* XXX this will need a lock in a multi-threaded environment */ + static char hostname[80] = {'\0'}; + char *c; + + if (hostname[0] == '\0') { + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = '\0'; /* no guarantee of nul termination */ + if ((c = strchr(hostname, '.'))) + *c = '\0'; + } + + return hostname; +} + +/* + * Hostname stuff. We really only register the network portion of the epid + * since all epids from the same nid are assumed to have the same hostname. 
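+ *
+ * For example (hostname purely illustrative), one registration per nid
+ *
+ *   psmi_epid_set_hostname(psm_epid_nid(epid), "node042", 0);
+ *
+ * is enough for psmi_epaddr_get_hostname() and psmi_epaddr_get_name()
+ * to resolve every epid originating from that node.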
+ */ +psm_error_t +psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite) +{ + size_t hlen; + char *h; + psm_error_t err = PSM_OK; + + if (hostname == NULL) + return PSM_OK; + /* First see if a hostname already exists */ + if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) { + if (!overwrite) + return PSM_OK; + + h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid); + if (h != NULL) /* free the previous hostname if one exists */ + psmi_free(h); + } + + hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname)+1); + h = (char *) psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen); + if (h == NULL) + return PSM_NO_MEMORY; + snprintf(h, hlen, "%s", hostname); + h[hlen-1] = '\0'; + err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h); + return err; +} + +/* XXX These two functions are not thread safe; we'll use a rotating buffer + * trick whenever we need to make them thread safe */ +const char * +psmi_epaddr_get_hostname(psm_epid_t epid) +{ + static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; + static int bufno = 0; + uint64_t nid = psm_epid_nid(epid); + char *h, *hostname; + + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 4; + + /* First, if we have registered a host for this epid, just return that, or + * else try to return something with lid and context */ + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid); + if (h != NULL) + return h; + else { + uint64_t lid, context, subcontext; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + snprintf(hostname, PSMI_EP_HOSTNAME_LEN-1, "LID=0x%04x:%d.%d", + (unsigned int) lid, (int) context, (int) subcontext); + hostname[PSMI_EP_HOSTNAME_LEN-1] = '\0'; + return hostname; + } +} + +/* This one gives the hostname with a lid */ +const char * +psmi_epaddr_get_name(psm_epid_t epid) +{ + static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; + static int bufno = 0; + char *h, *hostname; + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 4; + + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm_epid_nid(epid)); + if (h == NULL) + return psmi_epaddr_get_hostname(epid); + else { + snprintf(hostname, PSMI_EP_HOSTNAME_LEN-1, + "%s (LID=0x%04x:%d.%d)", h, + (unsigned int) lid, (int) context, (int) subcontext); + hostname[PSMI_EP_HOSTNAME_LEN-1] = '\0'; + } + return hostname; +} + +/* Wrapper, in case we port to OS xyz that doesn't have sysconf */ +uintptr_t +psmi_getpagesize(void) +{ + static uintptr_t pagesz = (uintptr_t) -1; + long sz; + if (pagesz != (uintptr_t) -1) + return pagesz; + sz = sysconf(_SC_PAGESIZE); + if (sz == -1) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Can't query system page size"); + } + + pagesz = (uintptr_t) sz; + return pagesz; +} + +/* If PSM_VERBOSE_ENV is set in the environment, we determine + * what its verbose level is and print the environment at "INFO" + * level if the environment's level matches the desired printlevel.
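+ *
+ * Summarizing the parsing below:
+ *
+ *   PSM_VERBOSE_ENV=1   print user-level (level 1) variables
+ *   PSM_VERBOSE_ENV=2   also print hidden (level 2) variables
+ *
+ * and any value that fails to parse leaves the verbosity at 0 (off).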
+ */ +static int psmi_getenv_verblevel = -1; +static int +psmi_getenv_is_verblevel(int printlevel) +{ + if (psmi_getenv_verblevel == -1) { + char *env = getenv("PSM_VERBOSE_ENV"); + if (env && *env) { + char *ep; + int val = (int) strtol(env, &ep, 0); + if (ep == env) + psmi_getenv_verblevel = 0; + else if (val == 2) + psmi_getenv_verblevel = 2; + else + psmi_getenv_verblevel = 1; + } + else + psmi_getenv_verblevel = 0; + } + return (printlevel <= psmi_getenv_verblevel); +} + +#define GETENV_PRINTF(_level,_fmt,...) \ + do { \ + int nlevel = _level; \ + if (psmi_getenv_is_verblevel(nlevel)) \ + nlevel = 0; \ + _IPATH_ENVDBG(nlevel,_fmt,##__VA_ARGS__); \ + } while (0) + +int +psmi_getenv(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + int used_default = 0; + union psmi_envvar_val tval; + char *env = getenv(name); + int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS || + type == PSMI_ENVVAR_TYPE_UINT_FLAGS); + + /* If we're not using the default, always reset the print + * level to '1' so the changed value gets seen at low + * verbosity */ +#define _GETENV_PRINT(used_default,fmt,val,defval) do { \ + if (used_default) \ + GETENV_PRINTF(level, "%s%-25s %-40s =>%s" #fmt \ + "\n", level>1?"*":" ", name, descr, ishex?" 0x":" ", val); \ + else \ + GETENV_PRINTF(1, "%s%-25s %-40s =>%s" #fmt \ + " (default was%s" #fmt ")\n",level>1?"*":" ", \ + name, descr, ishex?" 0x":" ", val, \ + ishex?" 0x":" ", defval); \ + } while (0) + + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else if (env[0] == 'Y' || env[0] == 'y') + tval.e_int = 1; + else if (env[0] == 'N' || env[0] == 'n') + tval.e_int = 0; + else { + char *ep; + tval.e_ulong = strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + else if (tval.e_ulong != 0) + tval.e_ulong = 1; + } + _GETENV_PRINT(used_default,%s,tval.e_int?"YES":"NO", + defval.e_int?"YES":"NO"); + break; + + case PSMI_ENVVAR_TYPE_STR: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else + tval.e_str = env; + _GETENV_PRINT(used_default,%s,tval.e_str,defval.e_str); + break; + + case PSMI_ENVVAR_TYPE_INT: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_int = (int) strtol(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + _GETENV_PRINT(used_default,%d,tval.e_int,defval.e_int); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_int = (unsigned int) strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) + _GETENV_PRINT(used_default,%x,tval.e_uint,defval.e_uint); + else + _GETENV_PRINT(used_default,%u,tval.e_uint,defval.e_uint); + break; + + case PSMI_ENVVAR_TYPE_LONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_long = strtol(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + _GETENV_PRINT(used_default,%ld,tval.e_long,defval.e_long); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_ulonglong = (unsigned long long) strtoull(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } +
_GETENV_PRINT(used_default,%llu, + tval.e_ulonglong, defval.e_ulonglong); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_ulong = (unsigned long) strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) + _GETENV_PRINT(used_default,%lx,tval.e_ulong,defval.e_ulong); + else + _GETENV_PRINT(used_default,%lu,tval.e_ulong,defval.e_ulong); + break; + } +#undef _GETENV_PRINT + *newval = tval; + + return used_default; +} + +/* + * Parsing int parameters set in string tuples. + * Output array int *vals should be able to store 'ntup' elements. + * Values are only overwritten if they are parsed. + * Tuples are always separated by colons ':' + */ +int psmi_parse_str_tuples(const char *string, int ntup, int *vals) +{ + char *b = (char *) string; + char *e = b; + int tup_i = 0; + int n_parsed = 0; + char *buf = psmi_strdup(NULL, string); + psmi_assert_always(buf != NULL); + + while (*e && tup_i < ntup) { + b = e; + while (*e && *e != ':') + e++; + if (e > b) { /* something to parse */ + char *ep; + int len = e - b; + long int l; + strncpy(buf, b, len); + buf[len] = '\0'; + l = strtol(buf, &ep, 0); + if (ep != buf) { /* successful conversion */ + vals[tup_i] = (int) l; + n_parsed++; + } + } + if (*e == ':') + e++; /* skip delimiter */ + tup_i++; + } + psmi_free(buf); + return n_parsed; +} + +/* + * Memory footprint/usage mode. + * + * This can be used for debug or for separating large installations from + * small/medium ones. The default is to assume a medium installation. Large + * is not that much larger in memory footprint, but we make a conscious effort + * at consuming only the amount of memory we need. + */ +int +psmi_parse_memmode(void) +{ + union psmi_envvar_val env_mmode; + int used_default = + psmi_getenv("PSM_MEMORY", "Memory usage mode (normal or large)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "normal", &env_mmode); + if (used_default || !strcasecmp(env_mmode.e_str, "normal")) + return PSMI_MEMMODE_NORMAL; + else if (!strcasecmp(env_mmode.e_str, "min")) + return PSMI_MEMMODE_MINIMAL; + else if (!strcasecmp(env_mmode.e_str, "large") || + !strcasecmp(env_mmode.e_str, "big")) + return PSMI_MEMMODE_LARGE; + else { + _IPATH_PRDBG("PSM_MEMORY env value %s unrecognized, " + "using 'normal' memory mode instead\n", + env_mmode.e_str); + return PSMI_MEMMODE_NORMAL; + } +} + +static +const char * +psmi_memmode_string(int mode) +{ + psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); + switch (mode) { + case PSMI_MEMMODE_NORMAL: + return "normal"; + case PSMI_MEMMODE_MINIMAL: + return "minimal"; + case PSMI_MEMMODE_LARGE: + return "large"; + default: + return "unknown"; + } +} + +psm_error_t +psmi_parse_mpool_env(const psm_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo) +{ + uint32_t val; + const char *env = rlim->env; + int mode = mq->memmode; + psm_error_t err = PSM_OK; + union psmi_envvar_val env_val; + + psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); + + psmi_getenv(rlim->env, rlim->descr, rlim->env_level, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) rlim->mode[mode].obj_max, + &env_val); + + val = env_val.e_uint; + if (val < rlim->minval || val > rlim->maxval) + { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Env.
var %s=%u is invalid (valid settings in mode PSM_MEMORY=%s" + " are inclusively between %u and %u)", env, val, + psmi_memmode_string(mode), rlim->minval, rlim->maxval); + goto fail; + } + + _IPATH_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n", + env, val, rlim->mode[mode].obj_chunk, psmi_memmode_string(mode), + mode, rlim->minval, rlim->maxval); + + *valo = val; + *chunkszo = rlim->mode[mode].obj_chunk; + +fail: + return err; +} + +uint64_t +psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns) +{ + if (timeout_ns < 0) + return 0ULL; + else if (timeout_ns == 0ULL || timeout_ns == ~0ULL) + return ~0ULL; + else { + uint64_t t_end = nanosecs_to_cycles(timeout_ns); + uint64_t t_now = get_cycles() - start_cycles; + + if (t_now >= t_end) + return 0ULL; + else + return (t_end - t_now); + } +} + +uint32_t +psmi_get_ipv4addr() +{ + struct hostent *he; + uint32_t addr = 0; + + he = gethostbyname(psmi_gethostname()); + if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) { + memcpy(&addr, he->h_addr, sizeof(uint32_t)); + return addr; + } + else + return 0; +} + +#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT) + +void +psmi_syslog(psm_ep_t ep, int to_console, int level, const char *format, ...) +{ + va_list ap; + + /* If we've never syslogged anything from this ep at the PSM level, make + * sure we log context information */ + if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) { + char uuid_str[64]; + ep->did_syslog = 1; + + memset(&uuid_str, 0, sizeof uuid_str); + psmi_uuid_unparse(ep->key, uuid_str); + ipath_syslog("PSM", 0, LOG_WARNING, + "uuid_key=%s,unit=%d,context=%d,subcontext=%d", + uuid_str, + ep->context.base_info.spi_unit, + ep->context.base_info.spi_context, + ep->context.base_info.spi_subcontext); + } + + va_start(ap, format); + ipath_vsyslog("PSM", to_console, level, format, ap); + va_end(ap); +} + +/* Table of CRCs of all 8-bit messages. */ +static uint32_t crc_table[256]; + +/* Flag: has the table been computed? Initially false. */ +static int crc_table_computed = 0; + +/* Make the table for a fast CRC. */ +static void make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n; + for (k = 0; k < 8; k++) { + if (c & 1) + c = 0xedb88320 ^ (c >> 1); + else + c = c >> 1; + } + crc_table[n] = c; + } + crc_table_computed = 1; +} + +/* Update a running CRC with the bytes buf[0..len-1]--the CRC + * should be initialized to all 1's, and the transmitted value + * is the 1's complement of the final running CRC (see the + * crc() routine below). + */ + +static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len) +{ + uint32_t c = crc; + int n; + + if_pf (!crc_table_computed) + make_crc_table(); + for (n = 0; n < len; n++) { + c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); + } + return c; +} + +/* Return the CRC of the bytes buf[0..len-1]. */ +uint32_t psmi_crc(unsigned char *buf, int len) +{ + return update_crc(0xffffffff, buf, len) ^ 0xffffffff; +} + +/* Return the HCA type being used for a context */ +uint32_t psmi_get_hca_type(psmi_context_t *context) +{ + uint32_t hca_type; + + /* Determine HCA type.
Use heuristics based on runtime flags + * + * Header suppression available: QLE73XX + * NODMA_RTAIL: QLE72XX + * neither: QLE71XX + */ + + if (context->runtime_flags & IPATH_RUNTIME_HDRSUPP) + hca_type = PSMI_HCA_TYPE_QLE73XX; + else if (context->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) + hca_type = PSMI_HCA_TYPE_QLE72XX; + else + hca_type = PSMI_HCA_TYPE_QLE71XX; + + return hca_type; +} + +#define PSMI_FAULTINJ_SPEC_NAMELEN 32 +struct psmi_faultinj_spec { + STAILQ_ENTRY(psmi_faultinj_spec) next; + char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN]; + + unsigned long long num_faults; + unsigned long long num_calls; + + unsigned int seedp; + int num; + int denom; + +}; + +int psmi_faultinj_enabled = 0; +int psmi_faultinj_verbose = 0; +char *psmi_faultinj_outfile = NULL; + +static struct psmi_faultinj_spec psmi_faultinj_dummy; +static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head = + STAILQ_HEAD_INITIALIZER(psmi_faultinj_head); + +void +psmi_faultinj_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM_FI", "PSM Fault Injection (yes/no)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, &env_fi); + + psmi_faultinj_enabled = !!env_fi.e_uint; + + if (psmi_faultinj_enabled) { + char *def = NULL; + if (!psmi_getenv("PSM_FI_TRACEFILE", "PSM Fault Injection output file", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) def, &env_fi)) + { + psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str); + } + } + + return; +} + +void +psmi_faultinj_fini() +{ + struct psmi_faultinj_spec *fi; + FILE *fp; + int do_fclose = 0; + + if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL) + return; + + if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0) + fp = stdout; + else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0) + fp = stderr; + else { + char *c = psmi_faultinj_outfile; + char buf[192]; + int append = 0; + if (*c == '+') { + append = 1; + ++c; + } + do_fclose = 1; + snprintf(buf, sizeof buf - 1, "%s.%s", c, __ipath_mylabel); + buf[sizeof buf - 1] = '\0'; + fp = fopen(buf, append ? "a" : "w"); + } + + if (fp != NULL) { + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + fprintf(fp, "%s:%s PSM_FI_%-12s %2.3f%% => " + "%2.3f%% %10lld faults/%10lld events\n", __progname, + __ipath_mylabel, fi->spec_name, + (double) fi->num * 100.0 / fi->denom, + (double) fi->num_faults * 100.0 / fi->num_calls, + fi->num_faults, fi->num_calls); + } + fflush(fp); + if (do_fclose) + fclose(fp); + } + + psmi_free(psmi_faultinj_outfile); + return; +} + +/* + * Intended to be used only once, not in the critical path + */ +struct psmi_faultinj_spec * +psmi_faultinj_getspec(char *spec_name, int num, int denom) +{ + struct psmi_faultinj_spec *fi; + + if (!psmi_faultinj_enabled) + return &psmi_faultinj_dummy; + + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + if (strcmp(fi->spec_name, spec_name) == 0) + return fi; + } + + /* We got here, so no spec -- allocate one */ + fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED, sizeof(struct psmi_faultinj_spec)); + strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN-1); + fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN-1] = '\0'; + fi->num = num; + fi->denom = denom; + fi->num_faults = 0; + fi->num_calls = 0; + + /* + * See if we get a hint from the environment. + * Format is num:denom:initial_seed. + * + * By default, we choose the initial seed to be the 'pid'. If users need + * repeatability, they should set initial_seed to be the 'pid' when the + * error was observed or force the initial_seed to be a constant number in + * each running process.
Using 'pid' is useful because core dumps store + * pids and our backtrace format does as well so if a crash is observed for + * a specific seed, programs can reuse the 'pid' to regenerate the same + * error condition. + */ + { + int fvals[3] = { num, denom, (int) getpid() }; + union psmi_envvar_val env_fi; + char fvals_str[128]; + char fname[128]; + char fdesc[256]; + + snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom); + fvals_str[sizeof fvals_str - 1] = '\0'; + snprintf(fname, sizeof fname - 1, "PSM_FI_%s", spec_name); + fname[sizeof fname - 1] = '\0'; + snprintf(fdesc, sizeof fdesc - 1, "Fault Injection %s <%s>", + fname, fvals_str); + + if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) fvals_str, + &env_fi)) + { + /* not using default values */ + int n_parsed = psmi_parse_str_tuples(env_fi.e_str, 3, fvals); + if (n_parsed >= 1) + fi->num = fvals[0]; + if (n_parsed >= 2) + fi->denom = fvals[1]; + if (n_parsed >= 3) + fi->seedp = fvals[2]; + } + } + + STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next); + return fi; +} + +int +psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi) +{ + int r; + if (!psmi_faultinj_enabled) /* never fault if disabled */ + return 0; + if (fi->num == 0) + return 0; + + fi->num_calls++; + r = rand_r(&fi->seedp); + if (r % fi->denom <= fi->num) { + fi->num_faults++; + return 1; + } + else + return 0; +} + +/* For memory allocation, we kind of break the PSM error handling rules. + * If the caller gets NULL, it has to assume that the error has been handled + * and should always return PSM_NO_MEMORY */ + +/* + * Log memory increments or decrements of type memstats_t. + */ +struct psmi_memtype_hdr { + struct { + uint64_t size : 48; + uint64_t magic : 8; + uint64_t type : 8; + }; +}; + +struct psmi_stats_malloc psmi_stats_memory; + +void +psmi_log_memstats(psmi_memtype_t type, int64_t nbytes) +{ +#define _add_max_total(type,nbytes) \ + psmi_stats_memory.m_ ## type ## _total += (nbytes); \ + psmi_stats_memory.m_ ## type ## _max = max( \ + psmi_stats_memory.m_ ## type ## _total, \ + psmi_stats_memory.m_ ## type ## _max); + + switch (type) { + case PER_PEER_ENDPOINT: + _add_max_total(perpeer, nbytes); + break; + case NETWORK_BUFFERS: + _add_max_total(netbufs, nbytes); + break; + case DESCRIPTORS: + _add_max_total(descriptors, nbytes); + break; + case UNEXPECTED_BUFFERS: + _add_max_total(unexpbufs, nbytes); + break; + case STATS: + _add_max_total(stats, nbytes); + break; + case UNDEFINED: + _add_max_total(undefined, nbytes); + break; + default: + psmi_assert_always(type == TOTAL); + break; + } + _add_max_total(all, nbytes); + psmi_stats_memory.m_all_max++; +#undef _add_max_total + + return; +} + +#define psmi_stats_mask PSMI_STATSTYPE_MEMORY + +#ifdef malloc +#undef malloc +#endif +void * +psmi_malloc_internal(psm_ep_t ep, psmi_memtype_t type, + size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + + psmi_assert(sizeof(struct psmi_memtype_hdr) == 8); + + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + newsz += sizeof(struct psmi_memtype_hdr); + + newa = malloc(newsz); + if (newa == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *) newa; + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + psmi_log_memstats(type, newsz); + newa = (void *) (hdr + 1); + //_IPATH_INFO("alloc is %p\n", newa); 
+ } + return newa; +} + +#ifdef calloc +#undef calloc +#endif +void * +psmi_calloc_internal(psm_ep_t ep, psmi_memtype_t type, size_t nelem, + size_t elemsz, const char *curloc) +{ + void *newa = psmi_malloc_internal(ep, type, nelem*elemsz, curloc); + if (newa == NULL) /* error handled above */ + return NULL; + memset(newa, 0, nelem*elemsz); + return newa; +} + +#ifdef strdup +#undef strdup +#endif +void * +psmi_strdup_internal(psm_ep_t ep, const char *string, const char *curloc) +{ + size_t len = strlen(string)+1; + void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc); + if (newa == NULL) + return NULL; + memcpy(newa, string, len); /* copy with \0 */ + return newa; +} + +#ifdef free +#undef free +#endif + +void +psmi_free_internal(void *ptr) +{ + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = + (struct psmi_memtype_hdr *) ptr - 1; + //_IPATH_INFO("hdr is %p, ptr is %p\n", hdr, ptr); + psmi_memtype_t type = hdr->type; + int64_t size = hdr->size; + int magic = (int) hdr->magic; + psmi_log_memstats(type, -size); + psmi_assert_always(magic == 0x8c); + ptr = (void *) hdr; + } + free(ptr); +} + +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_coreopt_ctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm_error_t err = PSM_OK; + char err_string[256]; + + switch(optname) { + case PSM_CORE_OPT_DEBUG: + /* Sanity check length */ + if (*optlen < sizeof(unsigned)) { + snprintf(err_string, 256, "Option value length error"); + *optlen = sizeof(unsigned); + goto fail; + } + + if (get) { + *((unsigned *) optval) = infinipath_debug; + } + else + infinipath_debug = *(unsigned*) optval; + break; + case PSM_CORE_OPT_EP_CTXT: + { + /* core object is epaddr */ + psm_epaddr_t epaddr = (psm_epaddr_t) core_obj; + + /* Sanity check epaddr */ + if (!epaddr) { + snprintf(err_string, 256, "Invalid endpoint address"); + goto fail; + } + + /* Sanity check length */ + if (*optlen < sizeof(unsigned long)) { + snprintf(err_string, 256, "Option value length error"); + *optlen = sizeof(void*); + goto fail; + } + + if (get) { + *((unsigned long*) optval) = (unsigned long) epaddr->usr_ep_ctxt; + } + else + epaddr->usr_ep_ctxt = optval; + } + break; + default: + /* Unknown/unrecognized option */ + snprintf(err_string, 256, "Unknown PSM_CORE option %u.", optname); + goto fail; + } + + + return err; + + fail: + /* Unrecognized/unknown option */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, err_string, "%s"); +} + +psm_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, (void*) optval, &optlen, 0); +} + +psm_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1); +} + +/* PSM AM component option handling */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_amopt_ctl(const void *am_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm_error_t err = PSM_OK; + + switch(optname) { + case PSM_AM_OPT_FRAG_SZ: + { + /* AM object is a psm_epaddr (or NULL for global minimum sz) */ + psm_epaddr_t epaddr = (psm_epaddr_t) am_obj; + + if (!get) /* Cannot set this option */ + return psmi_handle_error(NULL, PSM_OPT_READONLY, + "Unable to set PSM_AM_OPT_FRAG_SZ. 
This is " + "a read only option."); + /* Sanity check length */ + if (*optlen < sizeof(uint32_t)) { + *optlen = sizeof(uint32_t); + return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR, + "Option value length error"); + } + + /* TODO: Currently all AMs occur over IPS which utilizes the PIO flows. + * These are limited to the PIO size of the chip. Once we have AM + * capability over shared memory then we can have different fragment + * sizes over both transport and the global fragment size will need to + * take the minimum of all possible transports used. For now if the + * endpoint is opened get the PIO size from it else hard code it to 2K + * which is "correct" for all supported chips. + */ + *((unsigned *) optval) = + (epaddr && + psmi_ep_device_is_enabled(epaddr->ep, PTL_DEVID_IPS)) ? + (epaddr->ep->context.base_info.spi_piosize - + IPATH_MESSAGE_HDR_SIZE) : 2048; + } + + break; + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown PSM_AM option %u.", optname); + } + + return err; +} + +psm_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_amopt_ctl(am_obj, optname, (void*) optval, &optlen, 0); +} + +psm_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1); +} diff --git a/psm_utils.h b/psm_utils.h new file mode 100644 index 0000000..e6420e0 --- /dev/null +++ b/psm_utils.h @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_IN_USER_H +#error psm_utils.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_UTILS_H +#define _PSMI_UTILS_H + +#include <netinet/in.h> /* ipv4addr */ +#include <stdlib.h> /* malloc/free */ + +/* + * Endpoint 'id' hash table, with iterator interface + */ +struct psmi_epid_table { + struct psmi_epid_tabentry *table; + int tabsize; + int tabsize_used; + pthread_mutex_t tablock; +}; +/* + * Endpoint address hash table + */ +struct psmi_epid_tabentry { + void *entry; + uint64_t key; + psm_ep_t ep; + psm_epid_t epid; +}; + +extern struct psmi_epid_table psmi_epid_table; +#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */ +#define PSMI_EPID_TABSIZE_CHUNK 128 +#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7) + +psm_error_t psmi_epid_init(); +psm_error_t psmi_epid_fini(); +void *psmi_epid_lookup(psm_ep_t ep, psm_epid_t epid); +void *psmi_epid_remove(psm_ep_t ep, psm_epid_t epid); +psm_error_t psmi_epid_add(psm_ep_t ep, psm_epid_t epid, void *entry); +#define PSMI_EP_HOSTNAME ((psm_ep_t) -1) /* Special endpoint handle we use + * to register hostnames */ +#define PSMI_EP_CROSSTALK ((psm_ep_t) -2) /* Second special endpoint handle + * to log which nodes we've seen + * crosstalk from */ +struct psmi_eptab_iterator { + int i; /* last index looked up */ + psm_ep_t ep; +}; +void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm_ep_t ep); +void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor); +void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor); + +uint64_t psmi_epid_hca_type(psm_epid_t epid); +uint64_t psmi_epid_sl(psm_epid_t epid); +/* + * Hostname manipulation + */ +#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */ +#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */ +char * psmi_gethostname(void); +const char * psmi_epaddr_get_hostname(psm_epid_t epid); +const char * psmi_epaddr_get_name(psm_epid_t epid); +psm_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname, + int overwrite); + +/* + * Memory allocation, use macros only. + * + * In all calls, ep can be a specific endpoint (valid psm_ep_t) or PSMI_EP_NONE + * if no endpoint is available. + * + * psmi_malloc(ep, memtype, size) + * psmi_calloc(ep, memtype, numelems, elemsz) + * psmi_strdup(ep, ptr) + * psmi_free(ptr) + * + */ +typedef enum psmi_memtype { + TOTAL = 0, /* Logged automatically by malloc/calloc */ + UNDEFINED, /* For tracking "other types" of allocations */ + PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */ + NETWORK_BUFFERS, /* For tracking network buffers */ + DESCRIPTORS, /* For tracking send/recv descriptors */ + UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */ + STATS, /* For tracking stats-related allocs */ +} +psmi_memtype_t; + +/* + * We track allocation stats. 
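+ *
+ * When the PSMI_STATSTYPE_MEMORY bit is set in psmi_stats_mask,
+ * psmi_malloc_internal() lays a struct psmi_memtype_hdr (type, size,
+ * magic 0x8c) down in front of each allocation and returns the address
+ * just past it; psmi_free_internal() steps back over that header,
+ * checks the magic and logs -size through psmi_log_memstats() before
+ * calling the real free(). Totals accumulate in the per-type counters
+ * below. A minimal sketch of using the macros defined further down:
+ *
+ *   char *buf = psmi_malloc(PSMI_EP_NONE, NETWORK_BUFFERS, 4096);
+ *   if (buf != NULL)
+ *       psmi_free(buf);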
+ */ +struct psmi_stats_malloc { + int64_t m_all_total; + int64_t m_all_max; + int64_t m_perpeer_total; + int64_t m_perpeer_max; + int64_t m_netbufs_total; + int64_t m_netbufs_max; + int64_t m_descriptors_total; + int64_t m_descriptors_max; + int64_t m_unexpbufs_total; + int64_t m_unexpbufs_max; + int64_t m_undefined_total; + int64_t m_undefined_max; + int64_t m_stats_total; + int64_t m_stats_max; +}; + +extern struct psmi_stats_malloc psmi_stats_memory; + +void *psmi_malloc_internal(psm_ep_t ep, psmi_memtype_t mt, size_t sz, + const char *curloc); +void *psmi_calloc_internal(psm_ep_t ep, psmi_memtype_t mt, size_t num, size_t sz, + const char *curloc); +void *psmi_strdup_internal(psm_ep_t ep, const char *string, const char *curloc); +void psmi_free_internal(void *ptr); + +#define psmi_strdup(ep,string) psmi_strdup_internal(ep,string, PSMI_CURLOC) +#define psmi_calloc(ep,mt,nelem,elemsz) \ + psmi_calloc_internal(ep,mt,nelem,elemsz,PSMI_CURLOC) +#define psmi_malloc(ep,mt,sz) psmi_malloc_internal(ep,mt,sz,PSMI_CURLOC) +#define psmi_free(sz) psmi_free_internal(sz) + +#ifndef PSM_IS_TEST +#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc +#define calloc(sz,nelm) _use_psmi_calloc_instead_of_plain_calloc +#ifdef strdup +#undef strdup +#endif +#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup +#define free(ptr) _use_psmi_free_instead_of_plain_free +#endif /* PSM_IS_TEST */ + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes); + +/* + * Parsing int parameters set in string tuples. + */ +int psmi_parse_str_tuples(const char *str, int ntup, int *vals); + +/* + * Resource Limiting based on PSM memory mode. + */ +#define PSMI_MEMMODE_NORMAL 0 +#define PSMI_MEMMODE_MINIMAL 1 +#define PSMI_MEMMODE_LARGE 2 +#define PSMI_MEMMODE_NUM 3 + +struct psmi_rlimit_mpool { + const char *env; + const char *descr; + int env_level; + uint32_t minval; + uint32_t maxval; + struct { + uint32_t obj_chunk; + uint32_t obj_max; + } + mode[PSMI_MEMMODE_NUM]; +}; +psm_error_t psmi_parse_mpool_env(const psm_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo); +int psmi_parse_memmode(void); + +/* + * Parsing environment variables + */ + +union psmi_envvar_val { + void *e_void; + char *e_str; + int e_int; + unsigned int e_uint; + long e_long; + unsigned long e_ulong; + unsigned long long e_ulonglong; +}; + +#define PSMI_ENVVAR_LEVEL_USER 1 +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 + +#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) +#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) + +int psmi_getenv(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval); + +/* + * Misc functionality + */ +uintptr_t psmi_getpagesize(void); +uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns); +uint32_t psmi_get_ipv4addr(); +void psmi_syslog(psm_ep_t ep, int to_console, int level, + const char *format, ...); +void psmi_uuid_unparse(const psm_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB); +void *psmi_memcpyo(void *dst, const void *src, size_t n); +uint32_t psmi_crc(unsigned char *buf, int len); +uint32_t 
psmi_get_hca_type(psmi_context_t *context); + +/* + * Diagnostics, all in psm_diags.c + */ +int psmi_diags(void); + +/* + * Fault injection + */ +struct psmi_faultinj_spec; +int psmi_faultinj_enabled; /* use macro to test */ +#if 1 /* possible to disable at compile time */ +#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled) +#else +#define PSMI_FAULTINJ_ENABLED() 0 +#endif + +void psmi_faultinj_init(); +void psmi_faultinj_fini(); +struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, + int num, int denom); +#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \ + static struct psmi_faultinj_spec *var = NULL; \ + if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \ + (var) = psmi_faultinj_getspec((spec_name), (num), (denom)); +int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); + +/* + * PSM core component set/get options + */ +psm_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen); + +psm_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen); + +/* + * PSM AM component set/get options + */ +psm_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen); + +psm_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen); + +#endif /* _PSMI_UTILS_H */ diff --git a/psmd/Makefile b/psmd/Makefile new file mode 100644 index 0000000..c70665c --- /dev/null +++ b/psmd/Makefile @@ -0,0 +1,82 @@ +# +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2012, 2017. Intel Corporation. +# Copyright(c) 2005, 2006. QLogic Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2012, 2017. Intel Corporation. +# Copyright(c) 2005, 2006. QLogic Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +include $(top_srcdir)/buildflags.mak +CFLAGS += -Wall -Werror -D_IPATH_DEBUGGING=0 +LDFLAGS += $(SCIF_LINK_FLAGS) +INCLUDES += -I$(top_srcdir)/include -I$(top_srcdir)/include/linux-x86_64 $(SCIF_INCLUDE_FLAGS) +TARGETS = psmd + +all: ${TARGETS} + +${TARGETS}-objs := psmd.o ipath_service.o ipath_sysfs.o + +${TARGETS}: ${$(TARGETS)-objs} + $(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) + +psmd.o: psmd.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_service.o: $(top_srcdir)/ipath/ipath_service.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_sysfs.o: $(top_srcdir)/ipath/ipath_sysfs.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +install: + install -D psmd ${DESTDIR}${INSTALL_SBIN_TARG}/psmd +clean: + rm -f *.o $(TARGETS) + diff --git a/psmd/psmd.c b/psmd/psmd.c new file mode 100644 index 0000000..1b14e4c --- /dev/null +++ b/psmd/psmd.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains ipath service routine interface used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +#include +#define PSMD_HOST_PORT SCIF_OFED_PORT_7 /* reserved, match psm library */ +#define BACKLOG 10 +scif_epd_t psm_epd = -1; + +static void +psmd_syslog(const char *format, ...) 
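+/* psmd runs as a root daemon with stdin/stdout/stderr redirected to
+ * /dev/null (see main() below), so syslog is its only diagnostics
+ * channel. */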
+{ + va_list ap; + va_start(ap, format); + vsyslog(LOG_ERR|LOG_USER, format, ap); + va_end(ap); +} + +static int +psmd_scif_send(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_send(psm_epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +psmd_scif_recv(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(psm_epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static void child_handler(int signo) +{ + while (waitpid(-1, NULL, WNOHANG) > 0); +} + +static int +psmd_service(void) +{ + int ret; + struct ipath_cmd cmd; + + while (1) { + ret = psmd_scif_recv(&cmd, sizeof(cmd)); + if (ret) { + //psmd_syslog("get request error\n"); + scif_close(psm_epd); + psm_epd = -1; + return 0; + } + + switch(cmd.type) { + case IPATH_CMD_CONTEXT_OPEN: + { + int fd; + + fd = ipath_context_open(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, cmd.cmd.mic_info.data3); + + cmd.cmd.mic_info.data1 = fd; + if (fd < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + close(fd); + goto err; + } + break; + } + + case IPATH_CMD_CONTEXT_CLOSE: + { + ipath_context_close(cmd.cmd.mic_info.data1); + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_ASSIGN_CONTEXT: + { + int fd; + struct ipath_base_info binfo; + + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + memset(&binfo, 0, sizeof(binfo)); + cmd.cmd.user_info.spu_base_info = (__u64)&binfo; + cmd.cmd.user_info.spu_base_info_size = sizeof(binfo); + ret = ipath_cmd_assign_context(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 >= 0) { + ret = psmd_scif_send(&binfo, sizeof(binfo)); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_USER_INIT: + { + int fd; + struct ipath_base_info binfo; + + ret = psmd_scif_recv(&binfo, sizeof(binfo)); + if (ret) goto err; + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + cmd.cmd.user_info.spu_base_info = (__u64)&binfo; + cmd.cmd.user_info.spu_base_info_size = sizeof(binfo); + ret = ipath_cmd_user_init(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 >= 0) { + ret = psmd_scif_send(&binfo, sizeof(binfo)); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_SET_PART_KEY: + case IPATH_CMD_PIOAVAILUPD: + case IPATH_CMD_ACK_EVENT: + case IPATH_CMD_POLL_TYPE: + + case IPATH_CMD_RECV_CTRL: + case IPATH_CMD_ARMLAUNCH_CTRL: + case IPATH_CMD_DISARM_BUFS: + { + int fd; + + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + ret = ipath_cmd_write(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_NUM_UNITS: + { + ret = ipath_get_num_units(); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_NUM_CTXTS: + { + ret = ipath_get_num_contexts(cmd.cmd.mic_info.unit); + + 
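/* The reply convention is the same for every query handled below:
+  * the request struct is echoed back with the result in mic_info.data1
+  * and, on failure, errno in mic_info.data2. */
+ 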
cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_LID: + { + ret = ipath_get_port_lid(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_GID: + { + ret = ipath_get_port_gid(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + (uint64_t*)&cmd.cmd.mic_info.data3, + (uint64_t*)&cmd.cmd.mic_info.data4); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_LMC: + { + ret = ipath_get_port_lmc(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_RATE: + { + ret = ipath_get_port_rate(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_S2V: + { + ret = ipath_get_port_sl2vl(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_STATS_NAMES: + { + char *name = NULL; + + ret = infinipath_get_stats_names(&name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_STATS: + { + uint64_t *s; + + s = malloc(cmd.cmd.mic_info.data1*sizeof(*s)); + if (!s) { + cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_stats(s, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (s) free(s); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(s, cmd.cmd.mic_info.data1*sizeof(*s)); + } + if (s) free(s); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_UNAMES: + { + char *name = NULL; + + ret = infinipath_get_ctrs_unit_names(cmd.cmd.mic_info.unit, &name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_UNIT: + { + uint64_t *c; + + c = malloc(cmd.cmd.mic_info.data1*sizeof(*c)); + if (!c) { + 
cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_ctrs_unit(cmd.cmd.mic_info.unit, + c, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (c) free(c); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(c, cmd.cmd.mic_info.data1*sizeof(*c)); + } + if (c) free(c); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_PNAMES: + { + char *name = NULL; + + ret = infinipath_get_ctrs_port_names(cmd.cmd.mic_info.unit, &name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_PORT: + { + uint64_t *c; + + c = malloc(cmd.cmd.mic_info.data1*sizeof(*c)); + if (!c) { + cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_ctrs_port(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + c, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (c) free(c); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(c, cmd.cmd.mic_info.data1*sizeof(*c)); + } + if (c) free(c); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CC_SETTINGS: + { + char ccabuf[256]; + + ret = ipath_get_cc_settings_bin(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, ccabuf); + + cmd.cmd.mic_info.data1 = ret; + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 == 1) { + ret = psmd_scif_send(ccabuf, 84); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_GET_CC_TABLE: + { + uint16_t *cct = NULL; + + ret = ipath_get_cc_table_bin(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, &cct); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (cct) free(cct); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(cct, + (cmd.cmd.mic_info.data1+1)*sizeof(uint16_t)); + } + if (cct) free(cct); + if (ret) goto err; + break; + } + + case IPATH_CMD_WAIT_FOR_PACKET: + { + ret = ipath_cmd_wait_for_packet(cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_UNIT_FLASH: + { + char *data = NULL; + + ret = infinipath_get_unit_flash(cmd.cmd.mic_info.unit, &data); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + else cmd.cmd.mic_info.data2 = strlen(data); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (data) free(data); + goto err; + } + + if (data) { + ret = psmd_scif_send(data, strlen(data)+1); + free(data); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_PUT_UNIT_FLASH: + { + char *data; + int len; + + len = 
cmd.cmd.mic_info.data1; + data = malloc(len + 1); + if (!data) goto err; + + ret = psmd_scif_recv(data, len); + if (ret) { + free(data); + goto err; + } + + ret = infinipath_put_unit_flash(cmd.cmd.mic_info.unit, data, len); + free(data); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + default: + goto err; + } /* switch */ + } /* while (1) */ + +err: + psmd_syslog("error, request type = %d", cmd.type); + scif_close(psm_epd); + psm_epd = -1; + return 1; +} + +int +main(int argc, char *argv[]) +{ + uid_t uid; + gid_t gid; + pid_t pid; + scif_epd_t epd; + struct scif_portID portID; + struct sigaction act; + int count; + + /* Only root can run this code */ + if (getuid()) { + fprintf(stderr, "Only root can run psmd\n"); + psmd_syslog("Only root can run psmd"); + exit(1); + } + + /* start to daemonize psmd */ + pid = fork(); + if (pid < 0) { + psmd_syslog("fork() failed with err %d", errno); + exit(1); + } + if (pid > 0) { + exit(0); + } + + /* At this point we are executing as the child process */ + + /* Change the file mode mask */ + umask(0); + + /* Create a new SID for the child process */ + if (setsid() < 0) { + psmd_syslog("setsid() failed with err %d", errno); + exit(1); + } + + /* Change the current working directory.*/ + if ((chdir("/tmp")) < 0) { + psmd_syslog("chdir() failed with err %d", errno); + exit(1); + } + + /* Redirect standard files to /dev/null */ + if (freopen( "/dev/null", "r", stdin) == NULL || + freopen( "/dev/null", "w", stdout) == NULL || + freopen( "/dev/null", "w", stderr) == NULL) { + psmd_syslog("freopen() failed with err %d", errno); + exit(1); + } + + /* Install sigchild handler */ + memset(&act, 0, sizeof act); + act.sa_handler = child_handler; + sigaction(SIGCHLD, &act, NULL); + + /* open end pt */ + if ((epd = scif_open()) < 0) { + psmd_syslog("scif_open() failed with err %d", errno); + exit(1); + } + + /* bind end pt to specified port */ + if (scif_bind(epd, PSMD_HOST_PORT) < 0) { + scif_close(epd); + psmd_syslog("scif_bind() failed with err %d", errno); + exit(1); + } + + /* marks an end pt as listening end pt and queues up a maximum of BACKLOG + * no: of incoming connection requests + */ + if (scif_listen(epd, BACKLOG) != 0) { + scif_close(epd); + psmd_syslog("scif_listen() failed with err %d", errno); + exit(1); + } + + count = 0; + while (1) { + /* accepts a conn request by creating a new end pt that connects to peer */ + if (scif_accept(epd, &portID, &psm_epd, SCIF_ACCEPT_SYNC) < 0) { + if (errno == EINTR) continue; + psmd_syslog("scif_accept() failed with err %d", errno); + count++; + if (count < 20) continue; + scif_close(epd); + exit(1); + } + count = 0; /* not error in row */ + + /* if connection is from host, reject it. 
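+ * SCIF node 0 is the host itself; psmd only serves peers running on
+ * the MIC cards. An accepted peer first sends its uid/gid, and the
+ * forked service child drops to those credentials (setgid, setgroups,
+ * setuid) before executing any ipath commands on the peer's behalf.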
*/ + if (portID.node == 0) { + psmd_syslog("reject connection from host"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + + if (scif_recv(psm_epd, &uid, sizeof(uid), SCIF_RECV_BLOCK) != sizeof(uid)) { + psmd_syslog("cannot get peer uid"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + if (scif_recv(psm_epd, &gid, sizeof(gid), SCIF_RECV_BLOCK) != sizeof(gid)) { + psmd_syslog("cannot get peer gid"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + + pid = fork(); + if (pid == 0) { + /* need to change gid first */ + if (setgid(gid)) { + psmd_syslog("cannot set peer gid"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + if (setgroups(0, 0)) { + psmd_syslog("cannot setgroups(0,0)"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + if (setuid(uid)) { + psmd_syslog("cannot set peer uid"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + + exit(psmd_service()); + } else { + scif_close(psm_epd); + psm_epd = -1; + } + } + + exit(0); +} diff --git a/ptl.h b/ptl.h new file mode 100644 index 0000000..3aefcab --- /dev/null +++ b/ptl.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Interface implemented by Packet Transport layers such as + * ips and active messages. + * + * This interface can be volatile, it is never seen by PSM clients, and it will + * probably change as the AM ptl is developed. + */ + +#ifndef PSM_PTL_H +#define PSM_PTL_H +#include <inttypes.h> +#include <psm.h> +#include <psm_mq.h> +#include <psm_am.h> + +/* We currently have 3 PTLs, 0 is reserved. 
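+ * Devid 1 is the InfiniPath hardware transport (ptl_ips), devid 2 the
+ * shared-memory active-message transport (ptl_am), and devid 3 the
+ * intra-process loopback transport (ptl_self).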
*/ +#define PTL_DEVID_IPS 1 +#define PTL_DEVID_AMSH 2 +#define PTL_DEVID_SELF 3 + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +struct ptl; +typedef struct ptl ptl_t; + +struct ptl_epaddr; +typedef struct ptl_epaddr ptl_epaddr_t; + +struct ptl_ctl; +typedef struct ptl_ctl ptl_ctl_t; + +struct ptl_mq_req; +typedef struct ptl_mq_req ptl_mq_req_t; + +/* To be filled in statically by all PTLs */ +struct ptl_ctl_init +{ + size_t + (*sizeof_ptl)(void); + + psm_error_t + (*init)(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl); + + psm_error_t + (*fini)(ptl_t *ptl, int force, uint64_t timeout_ns); + + psm_error_t + (*setopt)(const void *component_obj, int optname, + const void *optval, uint64_t optlen); + + psm_error_t + (*getopt)(const void *component_obj, int optname, + void *optval, uint64_t *optlen); +}; + +typedef +struct ptl_arg { + union { + struct { + uint16_t u16w3; + uint16_t u16w2; + uint16_t u16w1; + uint16_t u16w0; + }; + struct { + uint32_t u32w1; + uint32_t u32w0; + }; + uint64_t u64w0; + uint64_t u64; + void *uptr; + }; +} +ptl_arg_t; + +#include "ptl_self/ptl_fwd.h" +#include "ptl_ips/ptl_fwd.h" +#include "ptl_am/ptl_fwd.h" + +/* To be filled in as part of ptl_init */ +struct ptl_ctl +{ + ptl_t *ptl; /* pointer to ptl */ + + /* EP-specific stuff */ + psm_error_t (*ep_poll)(ptl_t *ptl, int replyonly); + + /* PTL-level connect + * + * This PTL-level connect is slightly different from the top-level PSM connect. + * + * pre 1: Caller has masked off epids in epid array that are already + * connected at the PSM level. + * + * post 0: PTL has allocated all epaddrs and whatever internal ptladdr that + * ptl needs. + * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i] + * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't be + * connected before a timeout occurred. + * post 3: PTL returns OK iff all epids are either OK or UNREACHABLE + * post 4: PTL defines the content of epaddr[i] only if epaddr[i] is OK. + */ + psm_error_t (*ep_connect)(ptl_t *ptl, + int num_ep, + const psm_epid_t input_array_of_epid[], + const int array_of_epid_mask[], + psm_error_t output_array_of_errors[], + psm_epaddr_t output_array_of_epaddr[], + uint64_t timeout_ns); + + psm_error_t (*ep_disconnect)(ptl_t *ptl, int force, + int num_ep, + const psm_epaddr_t input_array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t output_array_of_errors[], + uint64_t timeout_ns); + + /* MQ stuff */ + psm_error_t (*mq_send)(psm_mq_t mq, psm_epaddr_t dest, + uint32_t flags, uint64_t stag, const void *buf, uint32_t len); + psm_error_t (*mq_isend)(psm_mq_t mq, psm_epaddr_t dest, + uint32_t flags, uint64_t stag, const void *buf, uint32_t len, + void *ctxt, psm_mq_req_t *req); + + int (*epaddr_stats_num)(void); + int (*epaddr_stats_init)(char *desc[], uint16_t *flags); + int (*epaddr_stats_get)(psm_epaddr_t epaddr, uint64_t *stats); + + /* AM stuff, only for Active messages PTL. Eventually we will expose + * this to PSM clients. 
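+ * Four entry points cover the short/long request and reply cases; the
+ * reply variants take the psm_am_token_t handed to the incoming
+ * handler rather than a peer epaddr.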
*/ + psm_error_t (*am_short_request)(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + psm_error_t (*am_short_reply)(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + psm_error_t (*am_long_request)(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int flags); + psm_error_t (*am_long_reply)(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, + size_t len, void *dest, int flags); +}; +#endif diff --git a/ptl_am/Makefile b/ptl_am/Makefile new file mode 100644 index 0000000..f15e0cc --- /dev/null +++ b/ptl_am/Makefile @@ -0,0 +1,45 @@ +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +include $(top_srcdir)/buildflags.mak +INCLUDES += -I$(top_srcdir) + +${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o kcopyrwu.o knemrwu.o scifrwu.o + +all: ${${TARGLIB}-objs} + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) $(if $(PSM_HAVE_SCIF:0=),$(SCIF_INCLUDE_FLAGS)) -c $< -o $@ + +clean: + rm -f *.o + diff --git a/ptl_am/am_reqrep.c b/ptl_am/am_reqrep.c new file mode 100644 index 0000000..192c040 --- /dev/null +++ b/ptl_am/am_reqrep.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +psm_error_t +psmi_amsh_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_amarg_t req_args[NSHORT_ARGS] = {}; + + /* All sends are synchronous. Ignore PSM_AM_FLAG_ASYNC. + * TODO: Treat PSM_AM_FLAG_NOREPLY as "advisory". This was mainly + * used to optimize the IPS path though we could put a stricter interpretation + * on it to disallow any replies. + */ + + /* For now less than NSHORT_ARGS-1. We use the first arg to carry the handler + * index. + */ + psmi_assert(nargs < (NSHORT_ARGS - 1)); + req_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void*) &req_args[1], (const void*) args, + (nargs * sizeof(psm_amarg_t))); + psmi_amsh_short_request(epaddr->ptl, epaddr, am_handler_hidx, + req_args, nargs + 1, + src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM_OK; +} + +psm_error_t +psmi_amsh_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_amarg_t rep_args[NSHORT_ARGS] = {}; + + /* For now less than NSHORT_ARGS-1. We use the first arg to carry the handler + * index. + */ + psmi_assert(nargs < (NSHORT_ARGS - 1)); + rep_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void*) &rep_args[1], (const void*) args, + (nargs * sizeof(psm_amarg_t))); + + psmi_amsh_short_reply((amsh_am_token_t*) tok, am_handler_hidx, rep_args, nargs+1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM_OK; +} + diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c new file mode 100644 index 0000000..50d86f4 --- /dev/null +++ b/ptl_am/am_reqrep_shmem.c @@ -0,0 +1,3513 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include /* shm_open and signal handling */ +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "kcopyrw.h" +#include "knemrw.h" +#include "scifrw.h" + +struct psm_am_max_sizes { + uint32_t nargs; + uint32_t request_short; + uint32_t reply_short; + uint32_t request_long; + uint32_t reply_long; +}; + +int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; + +#ifdef PSM_HAVE_SCIF +#define PSM_SCIF_CONNECT_RETRIES_DEFAULT 40 +int psmi_scif_connect_retries = PSM_SCIF_CONNECT_RETRIES_DEFAULT; +#endif + +/* If we push bulk packets, we place them in the target's bulk packet region, + * if we don't push bulk packets, we place them in *our* bulk packet region and + * have the target pull the data from our region when it needs it. */ +#define AMSH_BULK_PUSH 1 + +/* When do we start using the "huge" buffers -- at 1MB */ +#define AMSH_HUGE_BYTES 1024*1024 + +#define AMMED_SZ 2048 +#define AMLONG_SZ 8192 +#define AMHUGE_SZ (524288+sizeof(am_pkt_bulk_t)) /* 512k + E */ + +/* short med long huge */ +static const amsh_qinfo_t amsh_qcounts = + { 1024, 256, 16, 1, 1024, 256, 16, 8 }; + +/* short med long huge */ +static const amsh_qinfo_t amsh_qelemsz = + { sizeof(am_pkt_short_t), AMMED_SZ+64, AMLONG_SZ, AMHUGE_SZ, + sizeof(am_pkt_short_t), AMMED_SZ+64, AMLONG_SZ, AMHUGE_SZ }; + +/* we use this internally to break up packets into MTUs */ +static const amsh_qinfo_t amsh_qpkt_max = + { NSHORT_ARGS*8, AMMED_SZ, AMLONG_SZ-sizeof(am_pkt_bulk_t), + AMHUGE_SZ-sizeof(am_pkt_bulk_t), + NSHORT_ARGS*8, AMMED_SZ, AMLONG_SZ-sizeof(am_pkt_bulk_t), + AMHUGE_SZ-sizeof(am_pkt_bulk_t), + }; + +/* We expose max sizes for the AM ptl. 
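+ * The initializer below fills struct psm_am_max_sizes in declaration
+ * order (nargs, then the short/long request and reply payload limits);
+ * (uint32_t) -1 marks a size on which no fixed cap is imposed.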
*/ +static const struct psm_am_max_sizes psmi_am_max_sizes = + { 6, AMMED_SZ, (uint32_t) -1, + AMMED_SZ, (uint32_t) -1 }; + +/* + * Macro expansion trickery to handle 8 different fifo types: + * + * _fifo is one of 'reqFifoShort', 'reqFifoMed', 'reqFifoLong', 'reqFifoHuge', + * 'repFifoShort', 'repFifoMed', 'repFifoLong', 'repFifoHuge' + * + * _fifotyp is one of 'short' or 'bulk' + */ +#define QGETPTR(ptl, _shmidx_, _fifo, _fifotyp, _idx) \ + (am_pkt_ ## _fifotyp ## _t *) \ + (((uintptr_t)ptl->ep->amsh_qdir[(_shmidx_)].q ## _fifo) + \ + (_idx) *amsh_qelemsz.q ## _fifo) + +#define QGETPTR_SCIF(ptl, _shmidx_, _node_, _fifo, _fifotyp, _idx) \ + (am_pkt_ ## _fifotyp ## _t *) \ + (((uintptr_t)ptl->ep->amsh_qdir[(_shmidx_)].qptrs[_node_].q ## _fifo) +\ + (_idx) *amsh_qelemsz.q ## _fifo) + +#ifdef PSM_HAVE_SCIF +static void *am_ctl_accept_thread(void *arg); +static psm_error_t amsh_scif_detach(psm_ep_t ep); +#endif +static psm_error_t amsh_poll(ptl_t *ptl, int replyonly); +static psm_error_t amsh_poll_internal_inner(ptl_t *ptl, int replyonly, int is_internal); +static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq); +static void amsh_conn_handler(void *toki, psm_amarg_t *args, int narg, + void *buf, size_t len); +static void am_update_directory(ptl_t *ptl, int shmidx); + +/* Kassist helper functions */ +static const char * psmi_kassist_getmode(int mode); +static int psmi_get_kassist_mode(); + +/* SCIF DMA helper functions */ +#ifdef PSM_HAVE_SCIF +static const char * psmi_scif_dma_getmode(int mode); +static int psmi_get_scif_dma_mode(); +static int psmi_get_scif_dma_threshold(); +#endif + +/* Kcopy functionality */ +int psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr); +static int psmi_kcopy_find_minor(int *minor); +static int psmi_kcopy_open_minor(int minor); + +static inline void +am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz) +{ + q->head = 0; + q->elem_cnt = elem_cnt; + q->elem_sz = elem_sz; +} + +static void +am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) +{ + int i; + am_pkt_bulk_t *bulkpkt; + uintptr_t bulkptr = (uintptr_t) base_ptr; + + for (i = 0; i < nelems; i++, bulkptr += elemsz) { + bulkpkt = (am_pkt_bulk_t *) bulkptr; + bulkpkt->idx = i; + } +} + +#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ + PSMI_PAGESIZE) +static inline uintptr_t +am_ctl_sizeof_block() +{ + return + PSMI_ALIGNUP( + PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + /* reqctrl block */ + _PA(reqFifoShort) + _PA(reqFifoMed) + _PA(reqFifoLong) + + _PA(reqFifoHuge) + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + /* repctrl block */ + _PA(repFifoShort) + _PA(repFifoMed) + _PA(repFifoLong) + + _PA(repFifoHuge), + PSMI_PAGESIZE); /* align to page size */ +} +#undef _PA + +/** + * Given a number of PEs, determine the amount of memory required. 
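+ *
+ * The segment is one page-aligned struct am_ctl_dirpage followed by
+ * num_pe * num_nodes per-process blocks, each of size
+ * am_ctl_sizeof_block() (a block header plus the eight page-aligned
+ * FIFOs, req/rep x short/med/long/huge). In other words:
+ *
+ *   segsz = align(sizeof(struct am_ctl_dirpage), PSMI_PAGESIZE)
+ *         + am_ctl_sizeof_block() * num_pe * num_nodes;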
+ */ +static +size_t +psmi_amsh_segsize(int num_pe, int num_nodes) +{ + size_t segsz; + segsz = PSMI_ALIGNUP(sizeof(struct am_ctl_dirpage), PSMI_PAGESIZE); + segsz += am_ctl_sizeof_block() * num_pe * num_nodes; + return segsz; +} + +static +void +amsh_atexit() +{ + static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER; + static int atexit_once = 0; + psm_ep_t ep; + extern psm_ep_t psmi_opened_endpoint; + + pthread_mutex_lock(&mutex_once); + if (atexit_once) { + pthread_mutex_unlock(&mutex_once); + return; + } + else + atexit_once = 1; + pthread_mutex_unlock(&mutex_once); + + ep = psmi_opened_endpoint; + while (ep) { + if (ep->amsh_keyname != NULL) { + _IPATH_VDBG("unlinking shm file %s\n", ep->amsh_keyname); + shm_unlink(ep->amsh_keyname); + } + + if (ep->psmi_kassist_fd != -1) { + close(ep->psmi_kassist_fd); + ep->psmi_kassist_fd = -1; + } + ep = ep->user_ep_next; + } + + return; +} + +static +void +amsh_mmap_fault(int sig) +{ + static char shm_errmsg[256]; + + snprintf(shm_errmsg, sizeof shm_errmsg, + "%s: Unable to allocate shared memory for intra-node messaging.\n" + "%s: Delete stale shared memory files in /dev/shm.\n", + psmi_gethostname(), psmi_gethostname()); + amsh_atexit(); + if (write(2, shm_errmsg, strlen(shm_errmsg)+1) == -1) + exit(2); + else + exit(1); /* XXX revisit this... there's probably a better way to exit */ +} + +/* + * Scif init to modify the epid of current process. + */ +#ifdef PSM_HAVE_SCIF +static +psm_error_t +amsh_scif_init(psm_ep_t ep) +{ + scif_epd_t epd; + int port, nnodes; + uint16_t self; + psm_error_t err; + union psmi_envvar_val env_retries; + + if(!psmi_getenv("PSM_SCIF_CONNECT_RETRIES", + "PSM SCIF connection retry count", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) psmi_scif_connect_retries, + &env_retries)) { + psmi_scif_connect_retries = env_retries.e_uint; + } + + /* open end pt */ + if ((epd = scif_open()) < 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_open() failed with err %d", errno); + return err; + } + + /* bind end pt to specified port */ + if ((port = scif_bind(epd, 0)) < 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_bind() failed with err %d", errno); + return err; + } + + /* marks an end pt as listening end pt and queues up a maximum of 32 + * incoming connection requests */ + if (scif_listen(epd, 40) != 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_listen() failed with err %d", errno); + return err; + } + + if ((nnodes = scif_get_nodeIDs(NULL, 0, &self)) < 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_get_nodeIDs() failed with err %d", errno); + return err; + } + + _IPATH_VDBG("listening on SCIF %d:%d\n", self, port); + + /* Save total scif node #, modify epid to include port and self node ID.*/ + ep->scif_epd = epd; + ep->scif_mynodeid = (int)self; + ep->scif_nnodes = nnodes; + + /* Modify epid with acquired info as below */ + ep->epid |= (((uint64_t)self)&0xFF)<<48; + ep->epid |= (((uint64_t)port)&0xFFFF)<<32; + + return PSM_OK; +} +#endif + +/** + * Attach endpoint shared-memory. + * + * We only try to obtain an shmidx at this point. 
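+ * The slot is reserved by writing a placeholder value of 1 into
+ * shmidx_map_epid[] under the dirpage lock; the real epid is patched in
+ * later by amsh_init_segment() once it is known.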
+ */ +psm_error_t +psmi_shm_attach(psm_ep_t ep, int *shmidx_o) +{ + int ismaster = 1; + int i; + int use_kcopy, use_kassist; + int shmidx; + int kcopy_minor = -1; + char shmbuf[256]; + void *mapptr; + size_t segsz; + psm_error_t err = PSM_OK; + + if (ep->amsh_shmidx != -1) { + *shmidx_o = ep->amsh_shmidx; + return PSM_OK; + } + + *shmidx_o = -1; + if (ep->amsh_keyname != NULL) { + if (psmi_uuid_compare(ep->amsh_keyno, ep->key) != 0) { + psmi_uuid_unparse(ep->amsh_keyno, shmbuf); + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Shared memory segment already initialized with key=%s", + shmbuf); + goto fail; + } + } + else { + char *p; + memcpy(&ep->amsh_keyno, ep->key, sizeof(psm_uuid_t)); + strncpy(shmbuf, "/psm_shm.", sizeof shmbuf); + p = shmbuf + strlen(shmbuf); + psmi_uuid_unparse(ep->amsh_keyno, p); + ep->amsh_keyname = psmi_strdup(NULL, shmbuf); + if (ep->amsh_keyname == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + +#ifdef PSM_HAVE_SCIF + ep->amsh_qdir = psmi_calloc(NULL, PER_PEER_ENDPOINT, + PTL_AMSH_MAX_LOCAL_PROCS*ep->scif_nnodes, + sizeof(struct amsh_qdirectory)); +#else + ep->amsh_qdir = psmi_calloc(NULL, PER_PEER_ENDPOINT, + PTL_AMSH_MAX_LOCAL_PROCS, + sizeof(struct amsh_qdirectory)); +#endif + + if (ep->amsh_qdir == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + + /* Get which kassist mode to use. */ + ep->psmi_kassist_mode = psmi_get_kassist_mode(); + use_kassist = (ep->psmi_kassist_mode != PSMI_KASSIST_OFF); + use_kcopy = (ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY); + +#ifdef PSM_HAVE_SCIF + ep->scif_dma_mode = psmi_get_scif_dma_mode(); + ep->scif_dma_threshold = psmi_get_scif_dma_threshold(); +#endif + + /* Reserve enough space in the shared memory region for up to + PTL_AMSH_MAX_LOCAL_PROCS. Although that much space is reserved in + virtual memory, physical pages are not allocated until the + corresponding memory location is touched. Memory in this region is + only touched as processes initialize their shared queue area in + amsh_init_segment(), and physical memory is only allocated by the OS + accordingly. So, it looks like this is consumes a lot of memory, + but really it consumes as much as necessary for each active process. */ +#ifdef PSM_HAVE_SCIF + segsz = psmi_amsh_segsize(PTL_AMSH_MAX_LOCAL_PROCS, + PTL_AMSH_MAX_LOCAL_NODES); +#else + /* In the non-SCIF case we should be able to get away with just allocating + * enough shm for the number of mpi ranks, if the number of ranks is + * unavailable, then we will fallback to the number of online cpu cores. + * This will help cut back on virtual memory usage. + */ + int nranks, rankid, nprocs; + psmi_sharedcontext_params(&nranks, &rankid); + nprocs = (nranks <= 0) ? sysconf(_SC_NPROCESSORS_ONLN) : nranks; + segsz = psmi_amsh_segsize(nprocs, PTL_AMSH_MAX_LOCAL_NODES); +#endif + + ep->amsh_shmfd = shm_open(ep->amsh_keyname, + O_RDWR | O_CREAT | O_EXCL | O_TRUNC, S_IRWXU); + if (ep->amsh_shmfd < 0) { + ismaster = 0; + if (errno != EEXIST) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error creating shared memory object in shm_open%s%s", + errno != EACCES ? 
": " : + "(/dev/shm may have stale shm files that need to be removed): ", + strerror(errno)); + goto fail; + } + + /* Try to open again, knowing we won't be the shared memory master */ + ep->amsh_shmfd = shm_open(ep->amsh_keyname, O_RDWR, S_IRWXU); + if (ep->amsh_shmfd < 0) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error attaching to shared memory object in shm_open: %s", + strerror(errno)); + goto fail; + } + } + + /* Now register the atexit handler for cleanup, whether master or slave */ + atexit(amsh_atexit); + + _IPATH_PRDBG("Registered as %s to key %s\n", ismaster ? "master" : "slave", + ep->amsh_keyname); + + if (ismaster) { + if (ftruncate(ep->amsh_shmfd, segsz) != 0) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error setting size of shared memory object to %u bytes in " + "ftruncate: %s\n", (uint32_t) segsz, strerror(errno)); + goto fail; + } + } + else { + /* Before we do the mmap, make sure that the master has had time to + * apply the ftruncate, or else we will get a successful mmap on a + * 0-sized object */ + struct stat fdstat; + off_t cursize = 0; + while (cursize == 0) { + if (fstat(ep->amsh_shmfd, &fdstat)) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error querying size of shared memory object: %s", + strerror(errno)); + goto fail; + } + cursize = fdstat.st_size; + if (cursize == 0) + usleep(1); /* be gentle in tight fstat loop */ + } + } + + /* We map the entire shared memory area, consisting of a control structure + * followed by per-process shared queue structures. The "master" creates + * the control structure and initializes it but every process must lock + * appropriate data structures before it reads or writes it. + */ + mapptr = mmap(NULL, segsz, PROT_READ|PROT_WRITE, MAP_SHARED, + ep->amsh_shmfd, 0); + if (mapptr == MAP_FAILED) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error mmapping shared memory: %s", strerror(errno)); + goto fail; + } + + ep->amsh_shmbase = (uintptr_t) mapptr; + ep->amsh_dirpage = (struct am_ctl_dirpage *) ep->amsh_shmbase; + ep->amsh_blockbase = ep->amsh_shmbase + psmi_amsh_segsize(0, 0); + + /* We core dump right after here if we don't check the mmap */ + void (*old_handler_segv)(int) = signal (SIGSEGV, amsh_mmap_fault); + void (*old_handler_bus)(int) = signal (SIGBUS, amsh_mmap_fault); + + _IPATH_PRDBG("Mapped shm control object at %p\n", mapptr); + if (ismaster) { + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&(ep->amsh_dirpage->lock), &attr); + pthread_mutexattr_destroy(&attr); + + ep->amsh_dirpage->num_attached = 0; + ep->amsh_dirpage->max_idx = -1; + + for (i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + ep->amsh_dirpage->shmidx_map_epid[i] = 0; + ep->amsh_dirpage->kassist_pids[i] = 0; + } + + for(i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS*PTL_AMSH_MAX_LOCAL_NODES; i++) { + struct amsh_qtail* qtail = &ep->amsh_dirpage->qtails[i]; + + qtail->reqFifoShort.tail = 0; + qtail->reqFifoMed.tail = 0; + qtail->reqFifoLong.tail = 0; + qtail->reqFifoHuge.tail = 0; + + qtail->repFifoShort.tail = 0; + qtail->repFifoMed.tail = 0; + qtail->repFifoLong.tail = 0; + qtail->repFifoHuge.tail = 0; + + pthread_spin_init(&qtail->reqFifoShort.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoMed.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoLong.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoHuge.lock, PTHREAD_PROCESS_SHARED); + + 
pthread_spin_init(&qtail->repFifoShort.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoMed.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoLong.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoHuge.lock, PTHREAD_PROCESS_SHARED); + } + + if (use_kassist) { + if (use_kcopy) { + ep->psmi_kassist_fd = psmi_kcopy_find_minor(&kcopy_minor); + if (ep->psmi_kassist_fd >= 0) + ep->amsh_dirpage->kcopy_minor = kcopy_minor; + else + ep->amsh_dirpage->kcopy_minor = -1; + } + else { /* Setup knem */ + psmi_assert_always(ep->psmi_kassist_mode & PSMI_KASSIST_KNEM); + ep->psmi_kassist_fd = knem_open_device(); + } + + } + else + ep->psmi_kassist_fd = -1; + + ips_mb(); + + ep->amsh_dirpage->is_init = 1; + _IPATH_PRDBG("Mapped and initialized shm object control page at %p," + "size=%zu, kcopy minor is %d (mode=%s)\n", mapptr, + segsz, kcopy_minor, + psmi_kassist_getmode(ep->psmi_kassist_mode)); + } + else { + volatile int *is_init = &ep->amsh_dirpage->is_init; + while (*is_init == 0) + usleep(1); + _IPATH_PRDBG("Slave synchronized object control page at " + "%p, size=%d, kcopy minor is %d (mode=%s)\n", + mapptr, (int) segsz, kcopy_minor, + psmi_kassist_getmode(ep->psmi_kassist_mode)); + } + + /* + * First safe point where we can try to attach to the segment. + * + * Here we reserve the shmidx slot by marking the epid to '1'. We only + * update our epid in the init phase once we actually know what our epid + * is. + */ + pthread_mutex_lock((pthread_mutex_t *) &(ep->amsh_dirpage->lock)); + shmidx = -1; + for (i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + if (ep->amsh_dirpage->shmidx_map_epid[i] == 0) { + ep->amsh_dirpage->shmidx_map_epid[i] = 1; + ep->amsh_dirpage->psm_verno[i] = PSMI_VERNO; + ep->amsh_dirpage->kassist_pids[i] = (int) getpid(); + + if (use_kassist) { + if (!use_kcopy) { + if (!ismaster) + ep->psmi_kassist_fd = knem_open_device(); + + /* If we are able to use KNEM assume everyone else on the + * node can also use it. Advertise that KNEM is active via + * the feature flag. + */ + if (ep->psmi_kassist_fd >= 0) { + ep->amsh_dirpage->amsh_features[i] |= AMSH_HAVE_KNEM; + psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_KNEM; + } + else { + ep->psmi_kassist_mode = PSMI_KASSIST_OFF; + use_kassist = 0; + psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; + } + } + else if(use_kcopy) { + psmi_assert_always(use_kcopy); + kcopy_minor = ep->amsh_dirpage->kcopy_minor; + if (!ismaster && kcopy_minor >= 0) + ep->psmi_kassist_fd = psmi_kcopy_open_minor(kcopy_minor); + + /* If we are able to use KCOPY assume everyone else on the + * node can also use it. Advertise that KCOPY is active via + * the feature flag. 
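+ * If the kcopy device cannot be opened, the else-branch below turns
+ * kassist off for this process and falls back to the
+ * PSMI_MQ_RV_THRESH_NO_KASSIST rendezvous threshold.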
+             */
+            if (ep->psmi_kassist_fd >= 0) {
+                ep->amsh_dirpage->amsh_features[i] |= AMSH_HAVE_KCOPY;
+                psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_KCOPY;
+            }
+            else {
+                ep->psmi_kassist_mode = PSMI_KASSIST_OFF;
+                use_kassist = 0; use_kcopy = 0;
+                psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+            }
+          }
+        }
+        else
+          psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+        _IPATH_PRDBG("KASSIST MODE: %s\n", psmi_kassist_getmode(ep->psmi_kassist_mode));
+#ifdef PSM_HAVE_SCIF
+        _IPATH_PRDBG("SCIF DMA MODE: %s\n", psmi_scif_dma_getmode(ep->scif_dma_mode));
+        _IPATH_PRDBG("SCIF DMA THRESHOLD: %d\n", ep->scif_dma_threshold);
+#endif
+
+        ep->amsh_shmidx = shmidx = *shmidx_o = i;
+        _IPATH_PRDBG("Grabbed shmidx %d\n", shmidx);
+        ep->amsh_dirpage->num_attached++;
+        break;
+      }
+    }
+    pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+    /* install the old sighandler back */
+    signal(SIGSEGV, old_handler_segv);
+    signal(SIGBUS, old_handler_bus);
+
+    if (shmidx == -1)
+        err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR,
+                "Exceeded maximum of %d supported local endpoints: %s",
+                PTL_AMSH_MAX_LOCAL_PROCS, strerror(errno));
+
+fail:
+    return err;
+}
+
+/**
+ * Initialize endpoint shared-memory AM.
+ *
+ * This function ensures that the given endpoint initializes enough shared
+ * memory storage to communicate with up to PTL_AMSH_MAX_LOCAL_PROCS local
+ * peers.  In reality, the implementation need not grow any shared structures
+ * whether a single endpoint needs to communicate with 2 or 20 local peers (a
+ * local peer is a peer having a context on any locally-attached LID).
+ *
+ * [pre] Endpoint address epaddr has already been allocated.
+ */
+
+#define AMSH_QSIZE(type)                                               \
+        PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type,  \
+                     PSMI_PAGESIZE)
+
+static
+psm_error_t
+amsh_init_segment(ptl_t *ptl)
+{
+    struct amsh_qptrs* qptrs;
+    int shmidx;
+    int i;
+    psm_error_t err = PSM_OK;
+    int scif_nnodes;
+
+    /* Preconditions */
+    psmi_assert_always(ptl != NULL);
+    psmi_assert_always(ptl->ep != NULL);
+    psmi_assert_always(ptl->epaddr != NULL);
+    psmi_assert_always(ptl->ep->epid != 0);
+    psmi_assert_always(ptl->ep->amsh_shmidx != -1);
+
+    shmidx = ptl->ep->amsh_shmidx;
+
+    ptl->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+    ptl->amsh_qsizes.qreqFifoMed = AMSH_QSIZE(reqFifoMed);
+    ptl->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+    ptl->amsh_qsizes.qreqFifoHuge = AMSH_QSIZE(reqFifoHuge);
+    ptl->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+    ptl->amsh_qsizes.qrepFifoMed = AMSH_QSIZE(repFifoMed);
+    ptl->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+    ptl->amsh_qsizes.qrepFifoHuge = AMSH_QSIZE(repFifoHuge);
+
+    /* We core dump right after here if we don't check the mmap */
+    void (*old_handler_segv)(int) = signal (SIGSEGV, amsh_mmap_fault);
+    void (*old_handler_bus)(int) = signal (SIGBUS, amsh_mmap_fault);
+
+    pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+
+    /*
+     * Now that we know our epid, update it in the shmidx array
+     */
+    ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] = ptl->ep->epid;
+
+    if (shmidx > ptl->ep->amsh_dirpage->max_idx) {
+        ptl->ep->amsh_dirpage->max_idx = shmidx;
+    }
+
+    ptl->shmidx = shmidx;
+    ptl->ep->amsh_qdir[shmidx].amsh_epaddr = ptl->ep->epaddr;
+    for(i = 0; i < PTL_AMSH_MAX_LOCAL_NODES; i++) {
+        ptl->reqH[i].base = ptl->reqH[i].head = ptl->reqH[i].end = NULL;
+        ptl->repH[i].base = ptl->repH[i].head = ptl->repH[i].end = NULL;
+    }
+
+    /* Update all of the local directory entries once here.
*/ + for(i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + ptl->ep->amsh_qdir[i].amsh_base = + (void *)(ptl->ep->amsh_blockbase + + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES * i); + + ptl->ep->amsh_qdir[i].amsh_shmidx = ptl->shmidx; + + /* Encode our SCIF nodeid here. The full epid for local peers isn't + known yet, but we do know their nodeid, which is the same as ours. + Marking the nodeid here enables process_packet() to work correctly + when packets arrive before this epid value has been set with the + proper epid, without extra branches in the communication path. */ +#ifdef PSM_HAVE_SCIF + ptl->ep->amsh_qdir[i].amsh_epid = + ((psm_epid_t)ptl->ep->scif_mynodeid & 0xff) << 48; +#endif + + /* Clear the SCIF socket to -1. This indicates that the socket is not + going to be used, ever -- which is true since this is a local peer. + This prevents later code from trying to connect to self. */ + //ptl->ep->amsh_qdir[i].amsh_epd[0] = -1; + + am_update_directory(ptl, i); + } + +#ifdef PSM_HAVE_SCIF + scif_nnodes = ptl->ep->scif_nnodes; +#else + /* No SCIF: assume one node. */ + scif_nnodes = 1; +#endif + + /* touch all of my pages */ + memset(ptl->ep->amsh_qdir[shmidx].amsh_base, + 0, am_ctl_sizeof_block() * scif_nnodes); + + for(i = 0; i < scif_nnodes; i++) { + qptrs = &ptl->ep->amsh_qdir[shmidx].qptrs[i]; + + am_ctl_qhdr_init(&qptrs->qreqH->shortq, + amsh_qcounts.qreqFifoShort, amsh_qelemsz.qreqFifoShort); + am_ctl_qhdr_init(&qptrs->qreqH->medbulkq, + amsh_qcounts.qreqFifoMed, amsh_qelemsz.qreqFifoMed); + am_ctl_qhdr_init(&qptrs->qreqH->longbulkq, + amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); + am_ctl_qhdr_init(&qptrs->qreqH->hugebulkq, + amsh_qcounts.qreqFifoHuge, amsh_qelemsz.qreqFifoHuge); + + am_ctl_qhdr_init(&qptrs->qrepH->shortq, + amsh_qcounts.qrepFifoShort, amsh_qelemsz.qrepFifoShort); + am_ctl_qhdr_init(&qptrs->qrepH->medbulkq, + amsh_qcounts.qrepFifoMed, amsh_qelemsz.qrepFifoMed); + am_ctl_qhdr_init(&qptrs->qrepH->longbulkq, + amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); + am_ctl_qhdr_init(&qptrs->qrepH->hugebulkq, + amsh_qcounts.qrepFifoHuge, amsh_qelemsz.qrepFifoHuge); + + /* Set bulkidx in every bulk packet */ + am_ctl_bulkpkt_init(qptrs->qreqFifoMed, + amsh_qelemsz.qreqFifoMed, + amsh_qcounts.qreqFifoMed); + am_ctl_bulkpkt_init(qptrs->qreqFifoLong, + amsh_qelemsz.qreqFifoLong, + amsh_qcounts.qreqFifoLong); + am_ctl_bulkpkt_init(qptrs->qreqFifoHuge, + amsh_qelemsz.qreqFifoHuge, + amsh_qcounts.qreqFifoHuge); + + am_ctl_bulkpkt_init(qptrs->qrepFifoMed, + amsh_qelemsz.qrepFifoMed, + amsh_qcounts.qrepFifoMed); + am_ctl_bulkpkt_init(qptrs->qrepFifoLong, + amsh_qelemsz.qrepFifoLong, + amsh_qcounts.qrepFifoLong); + am_ctl_bulkpkt_init(qptrs->qrepFifoHuge, + amsh_qelemsz.qrepFifoHuge, + amsh_qcounts.qrepFifoHuge); + } + + /* install the old sighandler back */ + signal(SIGSEGV, old_handler_segv); + signal(SIGBUS, old_handler_bus); + + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + return err; +} + +psm_error_t +psmi_shm_detach(psm_ep_t ep) +{ + psm_error_t err = PSM_OK; + + if (ep->amsh_shmidx == -1 || ep->amsh_keyname == NULL) + return err; + +#ifdef PSM_HAVE_SCIF + if (amsh_scif_detach(ep)) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error with amsh_scif_detach() of shared segment: %s", + strerror(errno)); + goto fail; + } +#endif + + _IPATH_VDBG("unlinking shm file %s\n", ep->amsh_keyname+1); + shm_unlink(ep->amsh_keyname); + psmi_free(ep->amsh_keyname); + ep->amsh_keyname = NULL; + + if (ep->psmi_kassist_fd != -1) { + 
close(ep->psmi_kassist_fd);
+        ep->psmi_kassist_fd = -1;
+    }
+
+    /* go mark my shmidx as free */
+    pthread_mutex_lock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+    ep->amsh_dirpage->num_attached--;
+    ep->amsh_dirpage->shmidx_map_epid[ep->amsh_shmidx] = 0;
+    ep->amsh_shmidx = -1;
+
+    if (ep->amsh_dirpage->num_attached == 0) { /* truncate to nothing */
+        pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+        /* Instead of dynamically shrinking the shared memory region, we
+           always leave it allocated for up to PTL_AMSH_MAX_LOCAL_PROCS or
+           the number of processors online.
+           Thus mremap() is never necessary, nor is ftruncate() here.
+           However, when the attached process count does go to 0, we should
+           fully munmap() the entire region.
+         */
+#ifdef PSM_HAVE_SCIF
+        if (munmap((void *) ep->amsh_shmbase,
+                   psmi_amsh_segsize(PTL_AMSH_MAX_LOCAL_PROCS,
+                                     PTL_AMSH_MAX_LOCAL_NODES))) {
+#else
+        int nranks, rankid, nprocs;
+        psmi_sharedcontext_params(&nranks, &rankid);
+        nprocs = (nranks <= 0) ? sysconf(_SC_NPROCESSORS_ONLN) : nranks;
+        if (munmap((void *) ep->amsh_shmbase,
+                   psmi_amsh_segsize(nprocs, PTL_AMSH_MAX_LOCAL_NODES))) {
+#endif
+            err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR,
+                    "Error with munmap of shared segment: %s", strerror(errno));
+            goto fail;
+        }
+    }
+    else {
+        int i, new_max_idx = ep->amsh_dirpage->max_idx;
+        for (i = ep->amsh_dirpage->max_idx; i >= 0; i--) {
+            if (ep->amsh_dirpage->shmidx_map_epid[i] == 0)
+                new_max_idx = i;
+            else
+                break;
+        }
+
+        ep->amsh_dirpage->max_idx = new_max_idx;
+
+        pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+    }
+
+    ep->amsh_max_idx = -1;
+    ep->amsh_shmfd = -1;
+
+    ep->amsh_shmbase = ep->amsh_blockbase = 0;
+    ep->amsh_dirpage = NULL;
+    memset(ep->amsh_keyno, 0, sizeof(ep->amsh_keyno));
+
+    return PSM_OK;
+
+fail:
+    return err;
+}
+
+/**
+ * Update pointers to our req/rep receive queues.
+ *
+ * Only called from am_update_directory()
+ */
+static
+void
+am_hdrcache_update_short(ptl_t *ptl, int shmidx,
+                         am_ctl_qshort_cache_t *reqH,
+                         am_ctl_qshort_cache_t *repH)
+{
+    int node;
+
+    for(node = 0; node < PTL_AMSH_MAX_LOCAL_NODES; node++) {
+        reqH[node].base = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, 0);
+        reqH[node].head = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, 0);
+        reqH[node].end = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, amsh_qcounts.qreqFifoShort);
+
+        repH[node].base = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, 0);
+        repH[node].head = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, 0);
+        repH[node].end = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, amsh_qcounts.qrepFifoShort);
+    }
+}
+
+/**
+ * Update locally cached shared-pointer directory.
+ *
+ * @param shmidx Endpoint index for which to update local directory.
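+ *
+ * A layout sketch for one (shmidx, node) block, derived from the pointer
+ * arithmetic below (the sizes are the ptl->amsh_qsizes values, and the
+ * whole block sits at amsh_base + AMSH_BLOCK_HEADER_SIZE):
+ *
+ *   qreqH          block header (am_ctl_blockhdr_t)
+ *   qreqFifoShort  qreqH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)
+ *   qreqFifoMed    qreqFifoShort + qsizes.qreqFifoShort
+ *   qreqFifoLong   qreqFifoMed + qsizes.qreqFifoMed
+ *   qreqFifoHuge   qreqFifoLong + qsizes.qreqFifoLong
+ *   qrepH          qreqFifoHuge + qsizes.qreqFifoHuge
+ *   qrepFifo*      same pattern as the request fifos, after qrepH
+ *
+ * One such block exists per (shmidx, node) pair; node i's block starts at
+ * base_this + i * am_ctl_sizeof_block().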
+ */ + +static +void +am_update_directory(ptl_t *ptl, int shmidx) +{ + psm_ep_t ep = ptl->ep; + uintptr_t base_this; + uintptr_t base_node; + struct amsh_qptrs* qptrs; + int i; + + psmi_assert_always(shmidx != -1); + base_this = + (uintptr_t)ep->amsh_qdir[shmidx].amsh_base + AMSH_BLOCK_HEADER_SIZE; + + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { + if(ep->amsh_dirpage->amsh_features[shmidx] & AMSH_HAVE_KASSIST) { + ep->amsh_qdir[shmidx].kassist_pid = + ep->amsh_dirpage->kassist_pids[shmidx]; + } + } else { + ep->amsh_qdir[shmidx].kassist_pid = 0; + } + + for(i = 0; i < PTL_AMSH_MAX_LOCAL_NODES; i++) { + qptrs = &ep->amsh_qdir[shmidx].qptrs[i]; + + base_node = base_this + (i * am_ctl_sizeof_block()); + + /* Request queues */ + qptrs->qreqH = (am_ctl_blockhdr_t *) base_node; + + qptrs->qreqFifoShort = (am_pkt_short_t *) + ((uintptr_t) qptrs->qreqH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + qptrs->qreqFifoMed = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoShort + + ptl->amsh_qsizes.qreqFifoShort); + qptrs->qreqFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoMed + + ptl->amsh_qsizes.qreqFifoMed); + qptrs->qreqFifoHuge = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoLong + + ptl->amsh_qsizes.qreqFifoLong); + + /* Reply queues */ + qptrs->qrepH = (am_ctl_blockhdr_t *) + ((uintptr_t) qptrs->qreqFifoHuge + + ptl->amsh_qsizes.qreqFifoHuge); + + qptrs->qrepFifoShort = (am_pkt_short_t *) + ((uintptr_t) qptrs->qrepH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + qptrs->qrepFifoMed = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoShort + + ptl->amsh_qsizes.qrepFifoShort); + qptrs->qrepFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoMed + + ptl->amsh_qsizes.qrepFifoMed); + qptrs->qrepFifoHuge = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoLong + + ptl->amsh_qsizes.qrepFifoLong); + + _IPATH_VDBG("shmidx=%d node=%d Request Hdr=%p,Pkt=%p,Med=%p,Long=%p,Huge=%p\n", + shmidx, i, + qptrs->qreqH, + qptrs->qreqFifoShort, + qptrs->qreqFifoMed, + qptrs->qreqFifoLong, + qptrs->qreqFifoHuge); + _IPATH_VDBG("shmidx=%d node=%d Reply Hdr=%p,Pkt=%p,Med=%p,Long=%p,Huge=%p\n", + shmidx, i, + qptrs->qrepH, + qptrs->qrepFifoShort, + qptrs->qrepFifoMed, + qptrs->qrepFifoLong, + qptrs->qrepFifoHuge); + } + + /* Update local shorthand pointers */ +#ifdef PSM_HAVE_SCIF + qptrs = &ep->amsh_qdir[shmidx].qptrs[ptl->ep->scif_mynodeid]; +#else + qptrs = &ep->amsh_qdir[shmidx].qptrs[0]; +#endif + + ep->amsh_qdir[shmidx].qreqH = qptrs->qreqH; + ep->amsh_qdir[shmidx].qreqFifoShort = qptrs->qreqFifoShort; + ep->amsh_qdir[shmidx].qreqFifoMed = qptrs->qreqFifoMed; + ep->amsh_qdir[shmidx].qreqFifoLong = qptrs->qreqFifoLong; + ep->amsh_qdir[shmidx].qreqFifoHuge = qptrs->qreqFifoHuge; + + ep->amsh_qdir[shmidx].qrepH = qptrs->qrepH; + ep->amsh_qdir[shmidx].qrepFifoShort = qptrs->qrepFifoShort; + ep->amsh_qdir[shmidx].qrepFifoMed = qptrs->qrepFifoMed; + ep->amsh_qdir[shmidx].qrepFifoLong = qptrs->qrepFifoLong; + ep->amsh_qdir[shmidx].qrepFifoHuge = qptrs->qrepFifoHuge; + + /* If we're updating our shmidx, we update our cached pointers */ + if (ptl->shmidx == shmidx) + am_hdrcache_update_short(ptl, shmidx, + (am_ctl_qshort_cache_t *) ptl->reqH, + (am_ctl_qshort_cache_t *) ptl->repH); + + /* Sanity check */ + uintptr_t base_next = + (uintptr_t) ep->amsh_qdir[shmidx].qptrs[PTL_AMSH_MAX_LOCAL_NODES - 1].qrepFifoHuge + ptl->amsh_qsizes.qrepFifoHuge; + + psmi_assert_always(base_next - base_this <= + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); +} + +/* ep_epid_share_memory wrapper 
*/ +static +int +amsh_epid_reachable(ptl_t *ptl, psm_epid_t epid) +{ + int result; + psm_error_t err; + err = psm_ep_epid_share_memory(ptl->ep, epid, &result); + psmi_assert_always(err == PSM_OK); + return result; +} + +static +psm_error_t +amsh_epaddr_add(ptl_t *ptl, psm_epid_t epid, int shmidx, psm_epaddr_t *epaddr_o) +{ + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + + psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL); + + if (epid == ptl->epid) { + epaddr = ptl->epaddr; + } else { + epaddr = (psm_epaddr_t) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, + 1, sizeof(struct psm_epaddr)); + if (epaddr == NULL) { + return PSM_NO_MEMORY; + } + psmi_assert_always(ptl->ep->amsh_qdir[shmidx].amsh_epaddr == NULL); + } + + epaddr->ptl = ptl; + epaddr->ptlctl = ptl->ctl; + STAILQ_INIT(&epaddr->egrlong); + epaddr->mctxt_prev = epaddr; + epaddr->mctxt_next = epaddr; + epaddr->mctxt_master = epaddr; + epaddr->epid = epid; + epaddr->ep = ptl->ep; + epaddr->_shmidx = shmidx; + AMSH_CSTATE_TO_SET(epaddr, NONE); + AMSH_CSTATE_FROM_SET(epaddr, NONE); + if ((err = psmi_epid_set_hostname(psm_epid_nid(epid), + psmi_gethostname(), 0))) + goto fail; + + ptl->ep->amsh_qdir[shmidx].amsh_epaddr = epaddr; + + /* Finally, add to table */ + if ((err = psmi_epid_add(ptl->ep, epid, epaddr))) + goto fail; + + _IPATH_VDBG("epaddr=%s added to ptl=%p\n", + psmi_epaddr_get_name(epid), ptl); + + *epaddr_o = epaddr; + return PSM_OK; +fail: + if (epaddr != ptl->epaddr) psmi_free(epaddr); + return err; +} + +struct ptl_connection_req +{ + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm_epid_t *epids; /* input epid list */ + psm_epaddr_t *epaddr; + psm_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm_amarg_t args[4]; +}; + +/* + * function to make scif connection between nodes and exchange shared memory + */ +#ifdef PSM_HAVE_SCIF +static int +amsh_scif_send(scif_epd_t epd, void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_send(epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +amsh_scif_recv(scif_epd_t epd, void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static +psm_error_t +amsh_scif_connect(uint16_t nodeid, uint16_t port, scif_epd_t *epd_o) +{ + int tries; + struct scif_portID portID; + scif_epd_t epd; + psm_error_t err; + + epd = scif_open(); + if (epd < 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_open failed with error %d\n", errno); + return err; + } + + portID.port = port; + portID.node = nodeid; + + _IPATH_VDBG("scif connecting to %d:%d\n", nodeid, port); + + for(tries = 0; tries < psmi_scif_connect_retries; tries++) { + if (scif_connect(epd, &portID) >= 0) { + break; + } else if(errno != ECONNREFUSED) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_connect failed with error %d (%s)\n", + errno, strerror(errno)); + scif_close(epd); + return err; + } + + /* Wait a bit before trying again. 
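+       As a rough worked bound (assuming psmi_scif_connect_retries were 40;
+       the real limit is configured elsewhere): the first 20 retries sleep
+       100ms each and every later retry sleeps 250ms, so the loop would
+       block for at most 20*100ms + 20*250ms = 7 seconds before failing.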
*/ + if(tries < 20) { + usleep(100000); + } else { + usleep(250000); + } + } + + if(tries == psmi_scif_connect_retries) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_connect retry limit exceeded\n"); + return err; + } + + *epd_o = epd; + return PSM_OK; +} + +/* Establish a connection to a single epid. */ +static psm_error_t amsh_scif_setup(ptl_t* ptl, psm_epid_t epid) +{ + psm_ep_t ep = ptl->ep; + psm_error_t err = PSM_OK; + scif_epd_t epd = -1; + void* addr; + int peeridx; + + /* Send this struct to identify ourselves to the peer (offset unused) */ + /* Receive this struct to get memory mapping information. */ + struct { off_t offset; int verno; psm_epid_t epid; } buf; + + int port = (int)((epid>>32)&0xffff); + int nodeid = (int)((epid>>48)&0xff); + int shmidx = (int)((epid>>56)&0xff); + + /* Skip peers on the same node */ + if (nodeid == ep->scif_mynodeid) { + return PSM_OK; + } + + /* Figure out the peer's index. */ + /* 0 1 mynodeid 3 4 */ + /* nodeid 0 1 3 4 */ + if(nodeid > ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * nodeid) + shmidx; + } else /*nodeid < ep->scif_mynodeid) */ { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * (nodeid + 1)) + shmidx; + } + + _IPATH_VDBG("%lx scif_connect to %d:%d %d %lx\n", + ep->epid, nodeid, port, peeridx, epid); + + if(ep->amsh_qdir[peeridx].amsh_epd[0] != 0) { + /* Already established this side of the connection; all done. */ + return err; + } + + buf.offset = 0; + buf.verno = PSMI_VERNO; + buf.epid = ep->epid; + + err = amsh_scif_connect(nodeid, port, &epd); + if(err) { + return err; + } + + /* Send our identification information. */ + if (amsh_scif_send(epd, &buf, sizeof(buf))) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_send failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + /* Receive memory registration information. */ + if(amsh_scif_recv(epd, &buf, sizeof(buf))) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_recv failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + addr = scif_mmap(NULL, am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES, + SCIF_PROT_READ|SCIF_PROT_WRITE, 0, epd, buf.offset); + if(addr == SCIF_MMAP_FAILED) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_mmap failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + _IPATH_PRDBG("%lx scif_mmap offset %p -> %p to addr %p -> %p length %ld\n", + ep->epid, (void*)buf.offset, + (void*)(buf.offset + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES), + addr, + (void*)((uintptr_t)addr + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES), + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); + + ep->amsh_qdir[peeridx].amsh_offset = buf.offset; + ep->amsh_qdir[peeridx].amsh_base = addr; + ep->amsh_qdir[peeridx].amsh_epid = buf.epid; + ep->amsh_qdir[peeridx].amsh_verno = buf.verno; + + /* Calculate my index from the peer's perspective. */ + /* 0 1 mynodeid 3 4 */ + /* nodeid 0 1 3 4 */ + if(ep->scif_mynodeid < nodeid) { + ep->amsh_qdir[peeridx].amsh_shmidx = + (PTL_AMSH_MAX_LOCAL_PROCS * (ep->scif_mynodeid + 1)) + + ep->amsh_shmidx; + } else { + ep->amsh_qdir[peeridx].amsh_shmidx = + (PTL_AMSH_MAX_LOCAL_PROCS * ep->scif_mynodeid) + + ep->amsh_shmidx; + } + + /* There are eventually two connections. epd[0] always has the remote + memory mapped region associated with it, and is used to make requests + to that peer. epd[1] exposes our local shared memory, and is used + to respond to remote requests. 
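+     A sketch of the resulting topology for two peers A and B on different
+     SCIF nodes (illustrative only; both sides run this same setup path):
+
+         A --scif_connect--> B's listener   => A's epd[0], B's epd[1]
+         B --scif_connect--> A's listener   => B's epd[0], A's epd[1]
+
+     so each side ends up with one outbound endpoint it issues requests on
+     and one accepted endpoint it serves remote requests on.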
*/ + ep->amsh_qdir[peeridx].amsh_epd[0] = epd; + + am_update_directory(ptl, peeridx); + + _IPATH_VDBG("shmidx %d connected! set peeridx %d amsh_shmidx %d epd %d\n", + ep->amsh_shmidx, peeridx, + ep->amsh_qdir[peeridx].amsh_shmidx, + ep->amsh_qdir[peeridx].amsh_epd[0]); + return err; +} + +static +psm_error_t +amsh_scif_detach(psm_ep_t ep) +{ + int i; + int size = am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES; + + /* do the rest scif cleanup work */ + for (i = 0; i < ep->scif_nnodes*PTL_AMSH_MAX_LOCAL_PROCS; i++) { + if (ep->amsh_qdir[i].amsh_epd[0] == 0) continue; + + if(i >= PTL_AMSH_MAX_LOCAL_PROCS) { + if(scif_munmap(ep->amsh_qdir[i].amsh_base, size)) { + _IPATH_INFO("SCIF: unmapping addr %p length %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_base, size, + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + ep->amsh_qdir[i].amsh_base = NULL; + } + + if(scif_close(ep->amsh_qdir[i].amsh_epd[0])) { + _IPATH_INFO("SCIF: closing epd[0] %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_epd[0], + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + if(scif_close(ep->amsh_qdir[i].amsh_epd[1])) { + _IPATH_INFO("SCIF: closing epd[1] %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_epd[1], + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + ep->amsh_qdir[i].amsh_epd[0] = 0; + ep->amsh_qdir[i].amsh_epd[1] = 0; + } + + /* The accept thread will detect that the listen socket has been closed + and will shut down gracefully. */ + if(scif_close(ep->scif_epd)) { + _IPATH_INFO("SCIF: closing listen epd %d failed: (%d) %s\n", + ep->scif_epd, + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + pthread_join(ep->scif_thread, NULL); + + return PSM_OK; +} + +#endif //PSM_HAVE_SCIF + +#define PTL_OP_CONNECT 0 +#define PTL_OP_DISCONNECT 1 +#define PTL_OP_ABORT 2 + +static +psm_error_t +amsh_ep_connreq_init(ptl_t *ptl, + int op, /* connect, disconnect or abort */ + int numep, + const psm_epid_t *array_of_epid, /* non-NULL on connect */ + const int array_of_epid_mask[], + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + struct ptl_connection_req **req_o) +{ + int i, cstate; + psm_epaddr_t epaddr; + psm_epid_t epid; + struct ptl_connection_req *req = NULL; + + req = (struct ptl_connection_req *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ptl_connection_req)); + if (req == NULL) + return PSM_NO_MEMORY; + + req->isdone = 0; + req->op = op; + req->numep = numep; + req->numep_left = 0; + req->phase = ptl->connect_phase; + req->epid_mask = (int *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int)); + + if (req->epid_mask == NULL) { + psmi_free(req); + return PSM_NO_MEMORY; + } + + req->epaddr = array_of_epaddr; + req->epids = array_of_epid; + req->errors = array_of_errors; + + /* First check if there's really something to connect/disconnect + * for this PTL */ + for (i = 0; i < numep; i++) { + req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ + if (!array_of_epid_mask[i]) + continue; + if (op == PTL_OP_CONNECT) { + epid = array_of_epid[i]; + if (!amsh_epid_reachable(ptl, epid)) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + _IPATH_VDBG("looking at epid %llx\n", (unsigned long long) epid); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr != NULL) { + if (epaddr->ptl != ptl) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_ESTABLISHED) { + array_of_epaddr[i] = 
epaddr; + array_of_errors[i] = PSM_OK; + } + else { + psmi_assert(cstate == AMSH_CSTATE_TO_NONE); + array_of_errors[i] = PSM_TIMEOUT; + array_of_epaddr[i] = epaddr; + req->epid_mask[i] = AMSH_CMASK_PREREQ; + } + } + else { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + array_of_epaddr[i] = NULL; + +#ifdef PSM_HAVE_SCIF + psm_error_t err = amsh_scif_setup(ptl, req->epids[i]); + if(err != PSM_OK) { + psmi_free(req->epid_mask); + psmi_free(req); + return err; + } +#endif + } + } + else { /* disc or abort */ + epaddr = array_of_epaddr[i]; + psmi_assert(epaddr != NULL); + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_ESTABLISHED) { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + _IPATH_VDBG("Just set index %d to AMSH_CMASK_PREREQ\n", i); + } + /* XXX undef ? */ + } + if (req->epid_mask[i] != AMSH_CMASK_NONE) + req->numep_left++; + } + + if (req->numep_left == 0) { /* nothing to do */ + psmi_free(req->epid_mask); + psmi_free(req); + _IPATH_VDBG("Nothing to connect, bump up phase\n"); + ptl->connect_phase++; + *req_o = NULL; + return PSM_OK; + } + else { + *req_o = req; + return PSM_OK_NO_PROGRESS; + } +} + +static +psm_error_t +amsh_ep_connreq_poll(ptl_t *ptl, struct ptl_connection_req *req) +{ + int i, j, cstate, shmidx; + psm_error_t err = PSM_OK; + psm_epid_t epid; + psm_epaddr_t epaddr; + + if (req == NULL || req->isdone) + return PSM_OK; + + psmi_assert_always(ptl->ep->amsh_dirpage != NULL); + psmi_assert_always(ptl->connect_phase == req->phase); + + if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE || + req->epid_mask[i] == AMSH_CMASK_DONE) + continue; + + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + int shmidx = epaddr->_shmidx; +#ifdef PSM_HAVE_SCIF + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { /* not remote nodes */ +#endif + /* Make sure the target of the disconnect is still there */ + pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + if (ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] != epaddr->epid) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + AMSH_CSTATE_TO_SET(epaddr, NONE); + } + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); +#ifdef PSM_HAVE_SCIF + } +#endif + } + + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + req->args[0].u32w0 = PSMI_AM_DISC_REQ; + req->args[0].u32w1 = ptl->connect_phase; + req->args[1].u64w0 = (uint64_t) ptl->epid; + req->args[2].u32w0 = PSMI_VERNO; + req->args[2].u32w1 = PSM_OK; + req->args[3].u64w0 = (uint64_t)(uintptr_t)&req->errors[i]; + psmi_amsh_short_request(ptl, epaddr, + amsh_conn_handler_hidx, + req->args, 4, NULL, 0, 0); + req->epid_mask[i] = AMSH_CMASK_POSTREQ; + } + else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_DISC_REPLIED) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + AMSH_CSTATE_TO_SET(epaddr, NONE); + } + } + } + } + else { + /* First see if we've made progress on any postreqs */ + int n_prereq = 0; + for (i = 0; i < req->numep; i++) { + int cstate; + if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) { + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) + n_prereq++; + continue; + } + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_REPLIED) { + req->numep_left--; + AMSH_CSTATE_TO_SET(epaddr, ESTABLISHED); + req->epid_mask[i] = AMSH_CMASK_DONE; + 
continue;
+            }
+        }
+        if (n_prereq > 0) {
+            char buf[32];
+            uint16_t their_verno;
+
+            psmi_assert(req->numep_left > 0);
+            /* Go through the list of peers we need to connect to and find
+             * out whether each shared ep is mapped into shm */
+            pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+            for (i = 0; i < req->numep; i++) {
+                if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+                    continue;
+                epid = req->epids[i];
+                epaddr = req->epaddr[i];
+
+#ifdef PSM_HAVE_SCIF
+                /* Get the peer node-ID and scif port # from epid */
+                int nodeid = (int)((epid>>48)&0xff);
+                if (nodeid != ptl->ep->scif_mynodeid) {
+                    int peeridx = (int)((epid>>56)&0xff);
+
+                    //Don't use a loop, compute the shmidx directly.
+                    if(nodeid < ptl->ep->scif_mynodeid) {
+                        shmidx = (nodeid + 1) * PTL_AMSH_MAX_LOCAL_PROCS + peeridx;
+                    } else {
+                        shmidx = nodeid * PTL_AMSH_MAX_LOCAL_PROCS + peeridx;
+                    }
+
+                    psmi_assert(shmidx >= PTL_AMSH_MAX_LOCAL_PROCS);
+                    their_verno = ptl->ep->amsh_qdir[shmidx].amsh_verno;
+                } else
+#endif
+                {
+                    /* Go through mapped epids and find the epid we're looking for */
+                    for (shmidx = -1, j = 0; j <= ptl->ep->amsh_dirpage->max_idx; j++) {
+                        /* epid is connected and ready to go */
+                        if (ptl->ep->amsh_dirpage->shmidx_map_epid[j] == epid) {
+                            shmidx = j;
+                            break;
+                        }
+                    }
+
+                    if (shmidx == -1) /* couldn't find epid, go to next */
+                        continue;
+                    their_verno = ptl->ep->amsh_dirpage->psm_verno[shmidx];
+                }
+
+                /* Before we even send the request out, check to see if
+                 * versions are interoperable */
+                if (!psmi_verno_isinteroperable(their_verno)) {
+                    snprintf(buf, sizeof buf, "%d.%d",
+                             PSMI_VERNO_GET_MAJOR(their_verno),
+                             PSMI_VERNO_GET_MINOR(their_verno));
+
+                    _IPATH_INFO(
+                        "Local endpoint id %" PRIx64 " has version %s "
+                        "which is not supported by library version %d.%d",
+                        epid, buf, PSM_VERNO_MAJOR, PSM_VERNO_MINOR);
+                    req->errors[i] = PSM_EPID_INVALID_VERSION;
+                    req->numep_left--;
+                    req->epid_mask[i] = AMSH_CMASK_DONE;
+                    continue;
+                }
+
+                if (epaddr != NULL) {
+                    psmi_assert(epaddr->_shmidx == shmidx);
+                }
+                else if ((epaddr = psmi_epid_lookup(ptl->ep, epid)) == NULL) {
+                    if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr))) {
+                        pthread_mutex_unlock(
+                            (pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+                        return err;
+                    }
+                }
+
+                req->epaddr[i] = epaddr;
+                req->args[0].u32w0 = PSMI_AM_CONN_REQ;
+                req->args[0].u32w1 = ptl->connect_phase;
+                req->args[1].u64w0 = (uint64_t) ptl->epid;
+                req->args[2].u32w0 = PSMI_VERNO;
+                req->args[2].u32w1 = PSM_OK;
+                req->args[3].u64w0 = (uint64_t)(uintptr_t)&req->errors[i];
+                req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+                psmi_amsh_short_request(ptl, epaddr, amsh_conn_handler_hidx,
+                                        req->args, 4, NULL, 0, 0);
+                _IPATH_PRDBG("epaddr=%p, epid=%" PRIx64 " at shmidx=%d\n",
+                             epaddr, epid, shmidx);
+            }
+            pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+        }
+    }
+
+    if (req->numep_left == 0) { /* we're all done */
+        req->isdone = 1;
+        return PSM_OK;
+    }
+    else {
+        sched_yield();
+        return PSM_OK_NO_PROGRESS;
+    }
+}
+
+static
+psm_error_t
+amsh_ep_connreq_fini(ptl_t *ptl, struct ptl_connection_req *req)
+{
+    psm_error_t err = PSM_OK;
+    int i;
+
+    /* Wherever we are in the connect process, we've been instructed to
+     * finish the connection process */
+    if (req == NULL)
+        return PSM_OK;
+
+    /* This prevents future connect replies from referencing data structures
+     * that disappeared */
+    ptl->connect_phase++;
+
+    /* First process any leftovers in postreq or prereq */
+    for (i = 0; i < req->numep; i++) {
+        if (req->epid_mask[i] == AMSH_CMASK_NONE)
+            continue;
+        else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+            int cstate;
+            req->epid_mask[i] = AMSH_CMASK_DONE;
+            cstate = AMSH_CSTATE_TO_GET(req->epaddr[i]);
+            if (cstate == AMSH_CSTATE_TO_REPLIED) {
+                req->numep_left--;
+                AMSH_CSTATE_TO_SET(req->epaddr[i], ESTABLISHED);
+            }
+            else { /* never actually got reply */
+                req->errors[i] = PSM_TIMEOUT;
+            }
+        }
+        /* If we couldn't go from prereq to postreq, that means we couldn't
+         * find the shmidx for an epid in time.  This can only be a case of
+         * time out */
+        else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+            req->errors[i] = PSM_TIMEOUT;
+            req->numep_left--;
+            req->epaddr[i] = NULL;
+            req->epid_mask[i] = AMSH_CMASK_DONE;
+        }
+    }
+
+    /* Whatever is left can only be in DONE or NONE state */
+    for (i = 0; i < req->numep; i++) {
+        if (req->epid_mask[i] == AMSH_CMASK_NONE)
+            continue;
+        psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE);
+
+        err = psmi_error_cmp(err, req->errors[i]);
+        /* Report errors in connection. */
+        /* XXX de-alloc epaddr */
+    }
+
+    psmi_free(req->epid_mask);
+    psmi_free(req);
+
+    return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect.  The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20
+static
+psm_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl, int op,
+                     int numep,
+                     const psm_epid_t *array_of_epid,
+                     const int array_of_epid_mask[],
+                     psm_error_t *array_of_errors,
+                     psm_epaddr_t *array_of_epaddr,
+                     uint64_t timeout_ns)
+{
+    psm_error_t err;
+    uint64_t t_start;
+    struct ptl_connection_req *req = NULL;
+    int num_polls_noprogress = 0;
+    static int shm_polite_attach = -1;
+
+    if (shm_polite_attach == -1) {
+        char *p = getenv("PSM_SHM_POLITE_ATTACH");
+        if (p && *p && atoi(p) != 0) {
+            fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+                    psmi_gethostname());
+            shm_polite_attach = 1;
+        }
+        else
+            shm_polite_attach = 0;
+    }
+
+    /* Initialize */
+    err = amsh_ep_connreq_init(ptl, op, numep,
+              array_of_epid, array_of_epid_mask, array_of_errors,
+              array_of_epaddr, &req);
+    if (err != PSM_OK_NO_PROGRESS) /* Either we're all done with connect or
+                                    * there was an error */
+        return err;
+
+    /* Poll until either
+     * 1. We time out
+     * 2. We are done with connecting
+     */
+    t_start = get_cycles();
+    do {
+        psmi_poll_internal(ptl->ep, 1);
+        err = amsh_ep_connreq_poll(ptl, req);
+        if (err == PSM_OK)
+            break; /* Finished before timeout */
+        else if (err != PSM_OK_NO_PROGRESS) {
+            psmi_free(req->epid_mask);
+            psmi_free(req);
+            goto fail;
+        } else if (shm_polite_attach &&
+                   ++num_polls_noprogress == CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+            num_polls_noprogress = 0;
+            PSMI_PYIELD();
+        }
+    }
+    while (psmi_cycles_left(t_start, timeout_ns));
+
+    err = amsh_ep_connreq_fini(ptl, req);
+
+    /* Ensure that both sides of all connections are established before
+       returning.  This prevents MPI-level deadlocks where one rank returns
+       from here before responding to another rank's handshake and enters a
+       barrier (which does not poll PSM).  That other rank stays in PSM, never
+       receiving the handshake, and never entering the barrier: deadlock. */
+    /* This is fixed by Intel MPI 5.0.
*/ +#if 0 + if(op == PTL_OP_CONNECT) { + while(ptl->connect_to > ptl->connect_from) { + psmi_poll_internal(ptl->ep, 1); + } + } else { //ABORT or DISCONNECT + while(ptl->connect_to < ptl->connect_from) { + psmi_poll_internal(ptl->ep, 1); + } + } +#endif + +fail: + return err; +} + +static +psm_error_t +amsh_ep_connect(ptl_t *ptl, + int numep, + const psm_epid_t *array_of_epid, + const int array_of_epid_mask[], + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid, + array_of_epid_mask, array_of_errors, + array_of_epaddr, timeout_ns); +} + +static +psm_error_t +amsh_ep_disconnect(ptl_t *ptl, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, force ? PTL_OP_ABORT : PTL_OP_DISCONNECT, + numep, NULL, array_of_epaddr_mask, array_of_errors, + (psm_epaddr_t *) array_of_epaddr, timeout_ns); +} + +/* am_ctl_getslot_remote_inner works just like am_ctl_getslot_pkt_inner, but + instead of using the tail/lock in the shq, use a separate per-domain + tail/lock. The queue is actually located on a remote node, but tailinfo + is located on the local node (and shared by peers on the same node) */ +static +am_pkt_short_t* +am_ctl_getslot_pkt_inner(struct amsh_qtail_info* tailinfo, + volatile am_ctl_qhdr_t *shq, + am_pkt_short_t *pkt0) +{ + am_pkt_short_t* pkt; + uint32_t idx; + + /* Acquire a slot/packet in the remote queue. */ + pthread_spin_lock(&tailinfo->lock); + idx = tailinfo->tail; + + /* Careful here -- pkt is pointing to memory on a remote node, so any + accesses will be expensive over PCIE. */ + pkt = (void*)((uintptr_t)pkt0 + idx * shq->elem_sz); + if(pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + + tailinfo->tail += 1; + if(tailinfo->tail == shq->elem_cnt) { + tailinfo->tail = 0; + } + } else { + pkt = NULL; + } + pthread_spin_unlock(&tailinfo->lock); + + return pkt; +} + +/* AWF - leaving this code for now. With the addition of SCIF/symmetric + support, all communication uses the 'remote' path. */ +#if 0 +#undef CSWAP +/* AWF - cswap appears to be broken.. fix? */ +PSMI_ALWAYS_INLINE( +int32_t +cswap(volatile uint32_t *p, uint32_t old_value, uint32_t new_value)) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a" (old_value) : + "r" (new_value) : + "memory"); + return old_value; +} + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0) +) +{ + am_pkt_short_t *pkt; + uint32_t idx; +#ifndef CSWAP + pthread_spin_lock(&shq->lock); + idx = shq->tail; + pkt = (am_pkt_short_t *)((uintptr_t) pkt0 + idx * shq->elem_sz); + if (pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + shq->tail += 1; + if (shq->tail == shq->elem_cnt) + shq->tail = 0; + } else { + pkt = NULL; + } + pthread_spin_unlock(&shq->lock); +#else + uint32_t idx_next; + do { + idx = shq->tail; + idx_next = (idx+1 == shq->elem_cnt) ? 0 : idx+1; + } while (cswap(&shq->tail, idx, idx_next) != idx); + + pkt = (am_pkt_short_t *)((uintptr_t) pkt0 + idx * shq->elem_sz); + //AWF - why is another cswap needed here? we already have the packet.. + //We'll wait until the packet goes from QUSED -> QFREE + // And as soon as it does, toggle it back to QUSED. 
+ while (cswap(&pkt->flag, QFREE, QUSED) != QFREE) + ; +#endif + return pkt; +} +#endif + +/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */ +#define am_ctl_getslot_bulkpkt_inner(shq,pkt0) ((am_pkt_bulk_t *) \ + am_ctl_getslot_pkt_inner(shq,(am_pkt_short_t *)(pkt0))) + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_short_t *pkt0; + + /* It's not obvious, but the packet acquisition code below is accessing + memory mapped remotely from a peer on another SCIF node. Thus we + have to make sure a SCIF connection to that peer is already + established. */ +#ifdef PSM_HAVE_SCIF + if(shmidx >= PTL_AMSH_MAX_LOCAL_PROCS && + ptl->ep->amsh_qdir[shmidx].amsh_epd[0] == 0) { + if(amsh_scif_setup(ptl, ptl->ep->amsh_qdir[shmidx].amsh_epid) + != PSM_OK) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "am_ctl_getslot_remote(): amsh_scif_setup failed"); + } + } +#endif + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoShort; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->shortq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoShort; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoShort; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->shortq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoShort; + } + + return am_ctl_getslot_pkt_inner(tailinfo, shq, pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_med(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoMed; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->medbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoMed; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoMed; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->medbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoMed; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_long(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoLong; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->longbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoLong; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoLong; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->longbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoLong; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_huge(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoHuge; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->hugebulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoHuge; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoHuge; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->hugebulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoHuge; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +psmi_handlertab_t psmi_allhandlers[] = { + { 0 }, + { amsh_conn_handler }, + { 
psmi_am_mq_handler },
+    { psmi_am_mq_handler_data },
+    { psmi_am_mq_handler_rtsmatch },
+    { psmi_am_mq_handler_rtsdone },
+    { psmi_am_handler }
+};
+
+PSMI_ALWAYS_INLINE(
+void
+advance_head(volatile am_ctl_qshort_cache_t *hdr))
+{
+    QMARKFREE(hdr->head);
+    hdr->head++;
+    if (hdr->head == hdr->end)
+        hdr->head = hdr->base;
+}
+
+#define AMSH_ZERO_POLLS_BEFORE_YIELD 64
+#define AMSH_POLLS_BEFORE_PSM_POLL 16
+
+/* XXX this can be made faster.  Instead of checking the flag of the head, keep
+ * a cached copy of the integer value of the tail and compare it against the
+ * previous one we saw.
+ * AWF this trick won't work across nodes, since the receiver doesn't have
+ * access to the tail value.
+ */
+
+PSMI_ALWAYS_INLINE(
+psm_error_t
+amsh_poll_internal_inner(ptl_t *ptl, int replyonly, int is_internal))
+{
+    psm_error_t err = PSM_OK_NO_PROGRESS;
+
+    /* poll replies */
+#ifdef PSM_HAVE_SCIF
+    int node;
+    int nnodes = ptl->ep->scif_nnodes;
+
+    for(node = 0; node < nnodes; node++) {
+        if (!QISEMPTY(ptl->repH[node].head->flag)) {
+            do {
+                ips_sync_reads();
+                process_packet(ptl, (am_pkt_short_t *) ptl->repH[node].head, 0);
+                advance_head(&ptl->repH[node]);
+                err = PSM_OK;
+            } while (!QISEMPTY(ptl->repH[node].head->flag));
+        }
+    }
+#else
+    if (!QISEMPTY(ptl->repH[0].head->flag)) {
+        do {
+            ips_sync_reads();
+            process_packet(ptl, (am_pkt_short_t *) ptl->repH[0].head, 0);
+            advance_head(&ptl->repH[0]);
+            err = PSM_OK;
+        } while (!QISEMPTY(ptl->repH[0].head->flag));
+    }
+#endif
+
+    if (!replyonly) {
+        /* Request queue not enabled for 2.0; will be re-enabled to support
+         * long replies */
+        if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+            psmi_am_reqq_drain(ptl);
+            err = PSM_OK;
+        }
+
+#ifdef PSM_HAVE_SCIF
+        for(node = 0; node < nnodes; node++) {
+            if (!QISEMPTY(ptl->reqH[node].head->flag)) {
+                do {
+                    ips_sync_reads();
+                    process_packet(ptl,
+                                   (am_pkt_short_t *) ptl->reqH[node].head, 1);
+                    advance_head(&ptl->reqH[node]);
+                    err = PSM_OK;
+                } while (!QISEMPTY(ptl->reqH[node].head->flag));
+            }
+        }
+#else
+        if (!QISEMPTY(ptl->reqH[0].head->flag)) {
+            do {
+                ips_sync_reads();
+                process_packet(ptl,
+                               (am_pkt_short_t *) ptl->reqH[0].head, 1);
+                advance_head(&ptl->reqH[0]);
+                err = PSM_OK;
+            } while (!QISEMPTY(ptl->reqH[0].head->flag));
+        }
+#endif
+    }
+
+    if (is_internal) {
+        if (err == PSM_OK) /* some progress, no yields */
+            ptl->zero_polls = 0;
+        else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+            /* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+            sched_yield();
+            ptl->zero_polls = 0;
+        }
+
+        if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+            psmi_poll_internal(ptl->ep, 0);
+            ptl->amsh_only_polls = 0;
+        }
+    }
+    return err; /* if we actually did something */
+}
+
+/* non-inlined version */
+static
+psm_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+    return amsh_poll_internal_inner(ptl, replyonly, 1);
+}
+
+#ifdef PSM_PROFILE
+  #define AMSH_POLL_UNTIL(ptl,isreply,cond) do {      \
+        PSMI_PROFILE_BLOCK();                         \
+        while (!(cond)) {                             \
+            PSMI_PROFILE_REBLOCK(                     \
+                amsh_poll_internal(ptl,isreply) ==    \
+                    PSM_OK_NO_PROGRESS);              \
+        }                                             \
+        PSMI_PROFILE_UNBLOCK();                       \
+  } while (0)
+#else
+  #define AMSH_POLL_UNTIL(ptl,isreply,cond) do {      \
+        while (!(cond)) {                             \
+            amsh_poll_internal(ptl,isreply);          \
+        }                                             \
+  } while (0)
+#endif
+
+static
+psm_error_t
+amsh_poll(ptl_t *ptl, int replyonly)
+{
+    return amsh_poll_internal_inner(ptl, replyonly, 0);
+}
+
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t bulkidx,
+                  uint16_t fmt, uint16_t nargs, uint16_t
handleridx, + psm_amarg_t *args, const void *src, uint32_t len, int isreply)) +{ + int i; + volatile am_pkt_short_t *pkt; + + AMSH_POLL_UNTIL(ptl, isreply, + (pkt = am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL); + +#ifdef __MIC__ + /* On MIC, a local copy of the packet struct should be filled in, then + copied using one vector operation. MIC does not have write combining, + and the acquired packet is in remote (via PCIE) memory, so filling in + each struct member will cause a separate PCIE transaction. Using a + single vector write reduces latency. */ + am_pkt_short_t lcl_pkt; /* Local version of packet data */ + + lcl_pkt.bulkidx = bulkidx; + lcl_pkt.shmidx = ptl->ep->amsh_qdir[destidx].amsh_shmidx; + lcl_pkt.type = fmt; + lcl_pkt.nargs = nargs; + lcl_pkt.handleridx = handleridx; + + for (i = 0; i < nargs; i++) + lcl_pkt.args[i] = args[i]; + + if (fmt == AMFMT_SHORT_INLINE) + mq_copy_tiny((uint32_t *) &lcl_pkt.args[nargs], (uint32_t *) src, len); + + /* Skip the memory fences in QMARKREADY; not necessary here. */ + //QMARKREADY(lcl_pkt); + lcl_pkt.flag = QREADY; + + /* Now copy the local packet data to the remote packet. */ + memcpy((void*)pkt, &lcl_pkt, sizeof(am_pkt_short_t)); + +#else + /* got a free pkt... fill it in */ + pkt->bulkidx = bulkidx; + pkt->shmidx = ptl->ep->amsh_qdir[destidx].amsh_shmidx; + pkt->type = fmt; + pkt->nargs = nargs; + pkt->handleridx = handleridx; + + for (i = 0; i < nargs; i++) + pkt->args[i] = args[i]; + + if (fmt == AMFMT_SHORT_INLINE) + mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src, len); + + QMARKREADY(pkt); +#endif +} + +/* It's probably unlikely that the alloca below is problematic, but + * in case we think it is, define the next to 1 + */ +#define ALLOCA_AS_SCRATCH 0 + +#if ALLOCA_AS_SCRATCH +static char amsh_medscratch[AMMED_SZ]; +#endif + +#ifdef __MIC__ +#define amsh_shm_copy_short memcpy +#define amsh_shm_copy_long memcpy +#define amsh_shm_copy_huge psmi_memcpyo +#else +#define amsh_shm_copy_short psmi_mq_mtucpy +#define amsh_shm_copy_long psmi_mq_mtucpy +#define amsh_shm_copy_huge psmi_memcpyo +#endif + +PSMI_ALWAYS_INLINE( +int +psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags)) +{ + uint16_t type; + uint32_t bulkidx; + uint16_t hidx = (uint16_t) handler; + int destidx = epaddr->_shmidx; + int is_reply = AM_IS_REPLY(amtype); + volatile am_pkt_bulk_t *bulkpkt; + + _IPATH_VDBG("%s epaddr=%s, shmidx=%d, type=%d LOOPBACK=%s\n", + is_reply ? "reply" : "request", + psmi_epaddr_get_name(epaddr->epid), epaddr->_shmidx, amtype, + ptl->epaddr == epaddr ? 
"YES" : "NO"); + if (ptl->epaddr == epaddr) { /* loopback */ + amsh_am_token_t tok; + void *bufa; + + tok.tok.epaddr_from = epaddr; + tok.ptl = ptl; + tok.mq = ptl->ep->mq; + tok.shmidx = ptl->shmidx; + if (len > 0) { + if (AM_IS_LONG(amtype)) + bufa = dst; + else { + psmi_assert_always(len <= AMMED_SZ); +#if ALLOCA_AS_SCRATCH + bufa = (void *) amsh_medscratch; +#else + bufa = alloca(len); +#endif + } + psmi_assert(bufa != NULL); + amsh_shm_copy_short((void *) bufa, src, len); + } + else + bufa = NULL; + psmi_handler_fn_t fn = + (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + fn(&tok, args, nargs, bufa, len); + + return 1; + } + + switch (amtype) { + case AMREQUEST_SHORT: + case AMREPLY_SHORT: + if (len + (nargs<<3) <= (NSHORT_ARGS<<3)) { + /* Payload fits in args packet */ + type = AMFMT_SHORT_INLINE; + bulkidx = len; + } + else { + psmi_assert(len < amsh_qelemsz.qreqFifoMed); + psmi_assert(src != NULL); + type = AMFMT_SHORT; +#if 1 + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_med(ptl, destidx, is_reply)) != NULL); +#else + /* This version exposes a compiler bug */ + while (1) { + bulkpkt = am_ctl_getslot_med(ptl, destidx, is_reply); + if (bulkpkt == NULL) + break; + amsh_poll_internal(ptl, is_reply); + } +#endif + bulkidx = bulkpkt->idx; + bulkpkt->len = len; + _IPATH_VDBG("bulkpkt %p flag is %d from idx %d\n", + bulkpkt, bulkpkt->flag, destidx); + amsh_shm_copy_short((void*) bulkpkt->payload, src, (uint32_t) len); + QMARKREADY(bulkpkt); + } + am_send_pkt_short(ptl, destidx, bulkidx, type, nargs, hidx, + args, src, len, is_reply); + break; + + case AMREQUEST_LONG: + case AMREPLY_LONG: + { + uint32_t bytes_left = len; + uint8_t *src_this = (uint8_t *) src; + uint8_t *dst_this = (uint8_t *) dst; + uint32_t bytes_this; + uint32_t mtu_this; + type = (bytes_left >= AMSH_HUGE_BYTES ? AMFMT_HUGE : AMFMT_LONG); + /* XXX put in my shm block */ + int destidx_l = AMSH_BULK_PUSH ? destidx : ptl->shmidx; + + if (type == AMFMT_HUGE) + mtu_this = is_reply ? amsh_qpkt_max.qrepFifoHuge : + amsh_qpkt_max.qreqFifoHuge; + else + mtu_this = is_reply ? amsh_qpkt_max.qrepFifoLong : + amsh_qpkt_max.qreqFifoLong; + + _IPATH_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n", + is_reply ? 
"rep" : "req", src, dst, (uint32_t)len, hidx); + + while (bytes_left) { + if (type == AMFMT_HUGE) { + bytes_this = min(bytes_left, mtu_this); + + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_huge(ptl, destidx_l, is_reply)) != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_HUGE_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_huge((void *) bulkpkt->payload, + src_this, bytes_this); + } + else { + bytes_this = min(bytes_left, mtu_this); + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_long(ptl, destidx_l, is_reply)) != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_LONG_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_long((void *) bulkpkt->payload, src_this, + bytes_this); + + } + + bulkpkt->dest = (uintptr_t) dst; + bulkpkt->dest_off = + (uint32_t)((uintptr_t)dst_this - (uintptr_t)dst); + bulkpkt->len = bytes_this; + QMARKREADY(bulkpkt); + + am_send_pkt_short(ptl, destidx, bulkidx, type, nargs, + hidx, args, NULL, 0, is_reply); + src_this += bytes_this; + dst_this += bytes_this; + } + break; + } + default: + break; + } + return 1; +} + +/* A generic version that's not inlined */ +int +psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags) +{ + return psmi_amsh_generic_inner(amtype,ptl,epaddr,handler,args,nargs,src,len, + dst,flags); +} + +int +psmi_amsh_short_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler, args, nargs, + src, len, NULL, flags); +} + +int +psmi_amsh_long_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler, args, nargs, + src, len, dest, flags); +} + +void +psmi_amsh_short_reply(amsh_am_token_t *tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_from, + handler, args, nargs, src, len, NULL, flags); + return; +} + +void +psmi_amsh_long_reply(amsh_am_token_t *tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_from, + handler, args, nargs, src, len, dest, flags); + return; +} + +void +psmi_am_reqq_init(ptl_t *ptl) +{ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; +} + +psm_error_t +psmi_am_reqq_drain(ptl_t *ptl) +{ + am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first; + am_reqq_t *req; + psm_error_t err = PSM_OK_NO_PROGRESS; + + /* We're going to process the entire list, and running the generic handler + * below can cause other requests to be enqueued in the queue that we're + * processing. 
*/ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; + + while ((req = reqn) != NULL) { + err = PSM_OK; + reqn = req->next; + _IPATH_VDBG("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n", req, + psmi_epaddr_get_hostname(req->epaddr->epid), + (void *) (uintptr_t) req->args[1].u64w0, + (void *) (uintptr_t) req->args[0].u64w0); + psmi_amsh_generic(req->amtype, req->ptl, req->epaddr, + req->handler, req->args, req->nargs, req->src, + req->len, req->dest, req->amflags); + if (req->flags & AM_FLAG_SRC_TEMP) + psmi_free(req->src); + psmi_free(req); + } + return err; +} + +void +psmi_am_reqq_add(int amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int amflags) +{ + int i; + int flags = 0; + am_reqq_t *nreq = + (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t)); + psmi_assert_always(nreq != NULL); + _IPATH_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, " + "localreq=%p, remotereq=%p\n", nreq, + psmi_epaddr_get_hostname(epaddr->epid), dest, + (int)len, (void *) (uintptr_t) args[1].u64w0, + (void *) (uintptr_t) args[0].u64w0); + + psmi_assert(nargs <= 8); + nreq->next = NULL; + nreq->amtype = amtype; + nreq->ptl = ptl; + nreq->epaddr = epaddr; + nreq->handler = handler; + for (i = 0; i < nargs; i++) + nreq->args[i] = args[i]; + nreq->nargs = nargs; + if (AM_IS_LONG(amtype) && src != NULL && + len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) + { + abort(); + flags |= AM_FLAG_SRC_TEMP; + nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len); + psmi_assert_always(nreq->src != NULL); /* XXX mem */ + amsh_shm_copy_short(nreq->src, src, len); + } + else + nreq->src = src; + nreq->len = len; + nreq->dest = dest; + nreq->amflags = amflags; + nreq->flags = flags; + + nreq->next = NULL; + *(ptl->psmi_am_reqq_fifo.lastp) = nreq; + ptl->psmi_am_reqq_fifo.lastp = &nreq->next; +} + +static +void +process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq) +{ + amsh_am_token_t tok; + psmi_handler_fn_t fn; + int shmidx = pkt->shmidx; + + tok.tok.epaddr_from = ptl->ep->amsh_qdir[shmidx].amsh_epaddr; + tok.ptl = ptl; + tok.mq = ptl->ep->mq; + tok.shmidx = shmidx; + + uint16_t hidx = (uint16_t) pkt->handleridx; + int myshmidx = ptl->shmidx; + int shmidx_l = AMSH_BULK_PUSH ? myshmidx : shmidx; + uint32_t bulkidx = pkt->bulkidx; + uintptr_t bulkptr; + am_pkt_bulk_t *bulkpkt; + + /* It is possible for packets to arrive (the initial ones for connection + establishment) before amsh_epid is set correctly. However this can only + happen for peers in the same node -- those connecting inter-node via + SCIF will always have their epid set first. Since our local nodeid is + encoded in the amsh_epid of all local proces at initialization time, + it can always be safely extracted here, even before the amsh_epid is + set to its proper value for a given peer. */ +#ifdef PSM_HAVE_SCIF + int nodeid = (int)((ptl->ep->amsh_qdir[shmidx].amsh_epid >> 48) & 0xff); +#else + const int nodeid = 0; +#endif + + fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + psmi_assert(fn != NULL); + psmi_assert((uintptr_t) pkt > ptl->ep->amsh_blockbase); + + if (pkt->type == AMFMT_SHORT_INLINE) { + _IPATH_VDBG("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n", + isreq ? "request" : "reply", + pkt->flag, pkt->nargs, shmidx, pkt, hidx); + + fn(&tok, pkt->args, pkt->nargs, pkt->length > 0 ? 
+ (void *) &pkt->args[pkt->nargs] : NULL, pkt->length); + } + else { + int isend = 0; + switch (pkt->type) { + case AMFMT_SHORT: + if (isreq) { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[myshmidx].qptrs[nodeid].qreqFifoMed; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoMed; + } else { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[myshmidx].qptrs[nodeid].qrepFifoMed; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoMed; + } + break; + + case AMFMT_LONG_END: + isend = 1; + case AMFMT_LONG: + if (isreq) { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qreqFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + } + else { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qrepFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; + } + break; + + case AMFMT_HUGE_END: + isend = 1; + case AMFMT_HUGE: + if (isreq) { + bulkptr = (uintptr_t) ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qreqFifoHuge; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoHuge; + } + else { + bulkptr = (uintptr_t) ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qrepFifoHuge; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoHuge; + } + break; + default: + bulkptr = 0; + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled packet type 0x%x", pkt->type); + return; + } + + bulkpkt = (am_pkt_bulk_t *) bulkptr; + _IPATH_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " + "from_idx=%d pkt=%p/%p hidx=%d\n", + ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, + bulkpkt->flag, pkt->nargs, shmidx, pkt, bulkpkt, hidx); + psmi_assert(bulkpkt->flag == QREADY); + if (pkt->type == AMFMT_SHORT) { + fn(&tok, pkt->args, pkt->nargs, + (void *) bulkpkt->payload, bulkpkt->len); + QMARKFREE(bulkpkt); + } + else { + if (pkt->type == AMFMT_HUGE || pkt->type == AMFMT_HUGE_END) + amsh_shm_copy_huge((void *) (bulkpkt->dest + bulkpkt->dest_off), + bulkpkt->payload, bulkpkt->len); + else + amsh_shm_copy_long((void *) (bulkpkt->dest + bulkpkt->dest_off), + bulkpkt->payload, bulkpkt->len); + + /* If this is the last packet, copy args before running the + * handler */ + if (isend) { + psm_amarg_t args[8]; + int nargs = pkt->nargs; + int i; + void *dest = (void *) bulkpkt->dest; + size_t len = (size_t) (bulkpkt->dest_off + bulkpkt->len); + for (i = 0; i < nargs; i++) + args[i] = pkt->args[i]; + QMARKFREE(bulkpkt); + fn(&tok, args, nargs, dest, len); + } + else + QMARKFREE(bulkpkt); + } + } + return; +} + +static +psm_error_t +amsh_mq_rndv(ptl_t *ptl, psm_mq_t mq, psm_mq_req_t req, + psm_epaddr_t epaddr, uint64_t tag, const void *buf, uint32_t len) +{ + psm_amarg_t args[5] = {}; + psm_error_t err = PSM_OK; + + args[0].u32w0 = MQ_MSG_RTS; + args[0].u32w1 = len; + args[1].u64w0 = tag; + args[2].u64w0 = (uint64_t)(uintptr_t) req; + args[3].u64w0 = (uint64_t)(uintptr_t) buf; + + /* OK so we want to use SCIF DMA here if enabled. + First check: same node? Use existing local path. + */ + +#ifdef PSM_HAVE_SCIF + int shmidx = epaddr->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Intra-node: consider using kassist methods */ + if (ptl->ep->psmi_kassist_mode == PSMI_KASSIST_KNEM_GET) + /* If KNEM Get is active register region for peer to get from */ + args[4].u64w0 = knem_register_region((void*) buf, len, PSMI_FALSE); + else + args[4].u64w0 = 0; +#ifdef PSM_HAVE_SCIF + } else { + /* Inter-node: use SCIF DMA */ + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= len) { + /* Register the memory region with SCIF and pass the offset over. 
*/ + off_t offset; + + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[0]; + + err = scif_register_region(epd, (void*)buf, len, &offset); + if(err != PSM_OK) { + return err; + } + + args[4].u64w0 = offset; + } else { + args[4].u64w0 = 0; + } + } +#endif + + psmi_assert(req != NULL); + req->type = MQE_TYPE_SEND; + req->buf = (void *) buf; + req->buf_len = len; + req->send_msglen = len; + req->send_msgoff = 0; + + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, 0); + + return err; +} + +/* + * All shared am mq sends, req can be NULL + */ +PSMI_ALWAYS_INLINE( +psm_error_t +amsh_mq_send_inner(psm_mq_t mq, psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, uint32_t len)) +{ + psm_amarg_t args[3] = {}; + psm_error_t err = PSM_OK; + int is_blocking = (req == NULL); + + if (!flags && len <= psmi_am_max_sizes.request_short) { + if (len <= 32) + args[0].u32w0 = MQ_MSG_TINY; + else + args[0].u32w0 = MQ_MSG_SHORT; + args[1].u64 = tag; + + psmi_amsh_short_request(epaddr->ptl, epaddr, mq_handler_hidx, args, 2, + ubuf, len, 0); + } + else if (flags & PSM_MQ_FLAG_SENDSYNC) + goto do_rendezvous; + else if (len <= mq->shm_thresh_rv) { + uint32_t bytes_left = len; + uint32_t bytes_this = min(bytes_left, psmi_am_max_sizes.request_short); + uint8_t *buf = (uint8_t *)ubuf; + args[0].u32w0 = MQ_MSG_LONG; + args[0].u32w1 = len; + args[1].u64 = tag; + psmi_amsh_short_request(epaddr->ptl, epaddr, mq_handler_hidx, args, 2, + buf, bytes_this, 0); + bytes_left -= bytes_this; + buf += bytes_this; + args[2].u32w0 = 0; + while (bytes_left) { + args[2].u32w0 += bytes_this; + bytes_this = min(bytes_left, psmi_am_max_sizes.request_short); + /* Here we kind of bend the rules, and assume that shared-memory + * active messages are delivered in order */ + psmi_amsh_short_request(epaddr->ptl, epaddr, + mq_handler_data_hidx, args, + 3, buf, bytes_this, 0); + buf += bytes_this; + bytes_left -= bytes_this; + } + } + else { +do_rendezvous: + if (is_blocking) { + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf (req == NULL) + return PSM_NO_MEMORY; + req->send_msglen = len; + req->tag = tag; + } + err = amsh_mq_rndv(epaddr->ptl,mq,req,epaddr,tag,ubuf,len); + + if (err == PSM_OK && is_blocking) { /* wait... 
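+             for the rendezvous to complete; psmi_mq_wait_internal() keeps
+             polling the progress engine while this blocking send waits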
*/
+            err = psmi_mq_wait_internal(&req);
+        }
+        return err; /* skip eager accounting below */
+    }
+
+    /* All eager async sends are always "all done" */
+    if (req != NULL) {
+        req->state = MQ_STATE_COMPLETE;
+        mq_qq_append(&mq->completed_q, req);
+    }
+
+    mq->stats.tx_num++;
+    mq->stats.tx_shm_num++;
+    mq->stats.tx_eager_num++;
+    mq->stats.tx_eager_bytes += len;
+
+    return err;
+}
+
+static
+psm_error_t
+amsh_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+              uint64_t tag, const void *ubuf, uint32_t len, void *context,
+              psm_mq_req_t *req_o)
+{
+    psm_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+    if_pf (req == NULL)
+        return PSM_NO_MEMORY;
+
+    req->send_msglen = len;
+    req->tag = tag;
+    req->context = context;
+
+    _IPATH_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%"PRIx64"]\n",
+                psmi_epaddr_get_name(epaddr->ep->epid),
+                psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag);
+
+    amsh_mq_send_inner(mq, req, epaddr, flags, tag, ubuf, len);
+
+    *req_o = req;
+    return PSM_OK;
+}
+
+static
+psm_error_t
+amsh_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+             uint64_t tag, const void *ubuf, uint32_t len)
+{
+    amsh_mq_send_inner(mq, NULL, epaddr, flags, tag, ubuf, len);
+
+    _IPATH_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%"PRIx64"]\n",
+                psmi_epaddr_get_name(epaddr->ep->epid),
+                psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag);
+
+    return PSM_OK;
+}
+
+/* Kcopy-related handling */
+int
+psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr)
+{
+    int shmidx = epaddr->_shmidx;
+    return epaddr->ep->amsh_qdir[shmidx].kassist_pid;
+}
+
+static
+int
+psmi_kcopy_find_minor(int *minor)
+{
+    int i;
+    char path[128];
+
+    /* process-wide kcopy filedescriptor */
+    static int fd = -1;
+    static int kcopy_minor = -1;
+
+    if (fd >= 0) {
+        *minor = kcopy_minor;
+        return fd;
+    }
+
+    for (i = 0; i < 256; i++) {
+        snprintf(path, sizeof(path), "/dev/kcopy/%02d", i);
+        fd = open(path, O_WRONLY | O_EXCL);
+        if (fd >= 0) {
+            *minor = kcopy_minor = i;
+            break;
+        }
+    }
+
+    return fd;
+}
+
+static
+int
+psmi_kcopy_open_minor(int minor)
+{
+    char path[128];
+
+    /* process-wide kcopy filedescriptor */
+    static int fd = -1;
+    if (fd >= 0)
+        return fd;
+
+    if (minor >= 0 && minor < 256) {
+        snprintf(path, sizeof(path), "/dev/kcopy/%02d", minor);
+        fd = open(path, O_WRONLY);
+    }
+    return fd;
+}
+
+static
+const char *
+psmi_kassist_getmode(int mode)
+{
+    switch (mode) {
+        case PSMI_KASSIST_OFF:
+            return "kassist off";
+        case PSMI_KASSIST_KCOPY_PUT:
+            return "kcopy put";
+        case PSMI_KASSIST_KCOPY_GET:
+            return "kcopy get";
+        case PSMI_KASSIST_KNEM_GET:
+            return "knem get";
+        case PSMI_KASSIST_KNEM_PUT:
+            return "knem put";
+        default:
+            return "unknown";
+    }
+}
+
+static
+int
+psmi_get_kassist_mode()
+{
+    int mode = PSMI_KASSIST_MODE_DEFAULT;
+    union psmi_envvar_val env_kassist;
+
+    /* Preserve backward compatibility */
+    if (!psmi_getenv("PSM_SHM_KCOPY",
+                     "PSM Shared Memory use kcopy (put,get,none)",
+                     PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+                     (union psmi_envvar_val) "put",
+                     &env_kassist))
+    {
+        char *s = env_kassist.e_str;
+        if (strcasecmp(s, "put") == 0)
+            mode = PSMI_KASSIST_KCOPY_PUT;
+        else if (strcasecmp(s, "get") == 0)
+            mode = PSMI_KASSIST_KCOPY_GET;
+        else
+            mode = PSMI_KASSIST_OFF;
+    }
+    else if(!psmi_getenv("PSM_KASSIST_MODE",
+                         "PSM Shared memory kernel assist mode "
+                         "(knem-put, knem-get, kcopy-put, kcopy-get, none)",
+                         PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+                         (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING,
+                         &env_kassist))
+    {
+        char *s = env_kassist.e_str;
+        if (strcasecmp(s,
"kcopy-put") == 0) + mode = PSMI_KASSIST_KCOPY_PUT; + else if (strcasecmp(s, "kcopy-get") == 0) + mode = PSMI_KASSIST_KCOPY_GET; + else if (strcasecmp(s, "knem-put") == 0) + mode = PSMI_KASSIST_KNEM_PUT; + else if (strcasecmp(s, "knem-get") == 0) + mode = PSMI_KASSIST_KNEM_GET; + else + mode = PSMI_KASSIST_OFF; + +#if !defined(PSM_USE_KNEM) + if (mode & PSMI_KASSIST_KNEM) { + _IPATH_ERROR("KNEM kassist mode requested which has not been compiled " + "into this version of PSM. Switching kassist mode off.\n"); + mode = PSMI_KASSIST_OFF; + } +#endif + } + else { + +#if defined(PSM_USE_KNEM) + int res; + + /* KNEM is the preferred access mechanism if available. Else default to + * using KCOPY. + */ + res = access(KNEM_DEVICE_FILENAME, R_OK | W_OK); + if (res == 0) + mode = PSMI_KASSIST_KNEM_PUT; + else + mode = PSMI_KASSIST_KCOPY_PUT; +#else + mode = PSMI_KASSIST_KCOPY_PUT; +#endif + } + + return mode; +} + +#ifdef PSM_HAVE_SCIF +static int +psmi_get_scif_dma_mode() +{ + int mode = PSMI_SCIF_DMA_MODE_DEFAULT; + union psmi_envvar_val env_scif_dma; + + if(!psmi_getenv("PSM_SCIF_DMA_MODE", + "PSM Shared memory SCIF DMA transport mode " + "(scif-put, scif-get, none)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) PSMI_SCIF_DMA_MODE_DEFAULT_STRING, + &env_scif_dma)) + { + char *s = env_scif_dma.e_str; + if (strcasecmp(s, "scif-put") == 0) + mode = PSMI_SCIF_DMA_PUT; + else if (strcasecmp(s, "scif-get") == 0) + mode = PSMI_SCIF_DMA_GET; + else + mode = PSMI_SCIF_DMA_OFF; + } + + return mode; +} + +static int +psmi_get_scif_dma_threshold() +{ + int threshold = PSMI_MQ_RV_THRESH_SCIF_DMA; + union psmi_envvar_val env_scif_dma; + + if(!psmi_getenv("PSM_SCIF_DMA_THRESH", + "PSM SCIF DMA (rendezvous) switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) threshold, + &env_scif_dma)) { + threshold = env_scif_dma.e_uint; + } + + return threshold; +} + +static +const char * +psmi_scif_dma_getmode(int mode) +{ + switch (mode) { + case PSMI_SCIF_DMA_OFF: + return "SCIF DMA off"; + case PSMI_SCIF_DMA_PUT: + return "SCIF put"; + case PSMI_SCIF_DMA_GET: + return "SCIF get"; + default: + return "unknown"; + } +} +#endif // PSM_HAVE_SCIF + +/* Connection handling for shared memory AM. + * + * arg0 => conn_op, result (PSM error type) + * arg1 => epid (always) + * arg2 => version. + * arg3 => pointer to error for replies. + */ +static +void +amsh_conn_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + int op = args[0].u32w0; + int phase = args[0].u32w1; + psm_epid_t epid = args[1].u64w0; + psm_error_t err = (psm_error_t) args[2].u32w1; + psm_error_t *perr = (psm_error_t *) (uintptr_t) args[3].u64w0; + + psm_epaddr_t epaddr; + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + int shmidx = tok->shmidx; + int is_valid; + ptl_t *ptl = tok->ptl; + + /* We do this because it's an assumption below */ + psmi_assert_always(buf == NULL && len == 0); + + _IPATH_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", + op, phase, (unsigned long long) epid, err); + switch (op) { + case PSMI_AM_CONN_REQ: + _IPATH_VDBG("Connect from %d:%d\n", + (int) psm_epid_nid(epid), + (int) psm_epid_context(epid)); + + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr == NULL) { + /* This can be nasty. 
If the segment moves as a result of + * adding a new peer, we have to fix the input pointer 'args' + * since it comes from a shared memory location */ + if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr))) + /* Unfortunately, no way out of here yet */ + psmi_handle_error(PSMI_EP_NORETURN, err, "Fatal error " + "in connecting to shm segment"); + psmi_assert(psmi_epid_lookup(ptl->ep, epid) != NULL); + } + + /* Do some version comparison, error checking if required. */ + /* Rewrite args */ + ptl->connect_from++; + args[0].u32w0 = PSMI_AM_CONN_REP; + args[1].u64w0 = (psm_epid_t) ptl->epid; + args[2].u32w1 = PSM_OK; + AMSH_CSTATE_FROM_SET(epaddr, ESTABLISHED); + tok->tok.epaddr_from = epaddr; /* adjust token */ + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + + break; + + case PSMI_AM_CONN_REP: + if (ptl->connect_phase != phase) { + _IPATH_VDBG("Out of phase connect reply\n"); + return; + } + epaddr = ptl->ep->amsh_qdir[shmidx].amsh_epaddr; + *perr = err; + AMSH_CSTATE_TO_SET(epaddr, REPLIED); + ptl->connect_to++; + break; + + case PSMI_AM_DISC_REQ: + epaddr = tok->tok.epaddr_from; + args[0].u32w0 = PSMI_AM_DISC_REP; + args[2].u32w1 = PSM_OK; + AMSH_CSTATE_FROM_SET(epaddr, DISC_REQ); + ptl->connect_from--; + /* Before sending the reply, make sure the process + * is still connected */ + + is_valid = 1; +#ifdef PSM_HAVE_SCIF + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + if (ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] != epaddr->epid) + is_valid = 0; + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); +#ifdef PSM_HAVE_SCIF + } +#endif + + if (is_valid) { + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + } + break; + + case PSMI_AM_DISC_REP: + if (ptl->connect_phase != phase) { + _IPATH_VDBG("Out of phase disconnect reply\n"); + return; + } + *perr = err; + epaddr = tok->tok.epaddr_from; + AMSH_CSTATE_TO_SET(epaddr, DISC_REPLIED); + ptl->connect_to--; + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled connect handler op=%d", op); + break; + } + return; +} + +static +size_t +amsh_sizeof(void) +{ + return sizeof(ptl_t); +} + +/** + * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid. 
+ * @param ptl Pointer to caller-allocated space for PTL (fill in) + * @param ctl Pointer to caller-allocated space for PTL-control + * structure (fill in) + */ +static +psm_error_t +amsh_init(psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + int shmidx; + psm_error_t err = PSM_OK; + + _IPATH_VDBG("PSM Symmetric Mode!\n"); + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + /* Setup scif listen port and query node information */ + /* This is important to get the node count for initializing queues */ +#ifdef PSM_HAVE_SCIF + if ((err = amsh_scif_init(ep))) + goto fail; +#endif + + /* If we haven't attached to the segment yet, do it now */ + if ((err = psmi_shm_attach(ep, &shmidx))) + goto fail; + + /* Modify epid with acquired info as below */ + ep->epid |= ((((uint64_t)shmidx)&0xFF)<<56); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->zero_polls = 0; + + pthread_mutex_init(&ptl->connect_lock, NULL); + ptl->connect_phase = 0; + ptl->connect_from = 0; + ptl->connect_to = 0; + + memset(&ptl->amsh_empty_shortpkt, 0, sizeof ptl->amsh_empty_shortpkt); + memset(&ptl->psmi_am_reqq_fifo, 0, sizeof ptl->psmi_am_reqq_fifo); + + if ((err = amsh_init_segment(ptl))) + goto fail; + + psmi_am_reqq_init(ptl); + memset(ctl, 0, sizeof(*ctl)); + + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = amsh_poll; + ctl->ep_connect = amsh_ep_connect; + ctl->ep_disconnect = amsh_ep_disconnect; + + ctl->mq_send = amsh_mq_send; + ctl->mq_isend = amsh_mq_isend; + + ctl->am_short_request = psmi_amsh_am_short_request; + ctl->am_short_reply = psmi_amsh_am_short_reply; + + /* No stats in shm (for now...) */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; + +#ifdef PSM_HAVE_SCIF + /* Start a thread to service incoming SCIF connections. 
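+       The accept thread registers our shared queue block with each
+       connecting peer and exits when scif_accept() is interrupted at
+       endpoint shutdown (see am_ctl_accept_thread).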
*/
+    if (pthread_create(&ptl->ep->scif_thread, NULL,
+                       am_ctl_accept_thread, (void*)ptl)) {
+        err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES,
+                "amsh_init_segment(): pthread_create() failed: %d %s",
+                errno, strerror(errno));
+        goto fail;
+    }
+#endif
+
+fail:
+    return err;
+}
+
+static
+psm_error_t
+amsh_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+    struct psmi_eptab_iterator itor;
+    psm_epaddr_t epaddr;
+    psm_error_t err = PSM_OK;
+    psm_error_t err_seg;
+    uint64_t t_start = get_cycles();
+    int i = 0;
+
+    /* Close whatever has been left open -- this will be factored out for 2.1 */
+    if (ptl->connect_to > 0) {
+        int num_disc = 0;
+        int *mask;
+        psm_error_t *errs;
+        psm_epaddr_t *epaddr_array;
+
+        psmi_epid_itor_init(&itor, ptl->ep);
+        while ((epaddr = psmi_epid_itor_next(&itor))) {
+            if (epaddr->ptl != ptl)
+                continue;
+            if (AMSH_CSTATE_TO_GET(epaddr) == AMSH_CSTATE_TO_ESTABLISHED)
+                num_disc++;
+        }
+        psmi_epid_itor_fini(&itor);
+
+        mask = (int *) psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(int));
+        errs = (psm_error_t *)
+            psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm_error_t));
+        epaddr_array = (psm_epaddr_t *)
+            psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm_epaddr_t));
+
+        if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+            if (epaddr_array) psmi_free(epaddr_array);
+            if (errs) psmi_free(errs);
+            if (mask) psmi_free(mask);
+            err = PSM_NO_MEMORY;
+            goto fail;
+        }
+        psmi_epid_itor_init(&itor, ptl->ep);
+        while ((epaddr = psmi_epid_itor_next(&itor))) {
+            if (epaddr->ptl == ptl) {
+                if (AMSH_CSTATE_TO_GET(epaddr) == AMSH_CSTATE_TO_ESTABLISHED) {
+                    mask[i] = 1;
+                    epaddr_array[i] = epaddr;
+                    i++;
+                }
+            }
+        }
+        psmi_epid_itor_fini(&itor);
+        psmi_assert(i == num_disc && num_disc > 0);
+        err = amsh_ep_disconnect(ptl, force, num_disc, epaddr_array,
+                                 mask, errs, timeout_ns);
+        psmi_free(mask);
+        psmi_free(errs);
+        psmi_free(epaddr_array);
+    }
+
+    /* XXX: at this point a disconnect request may still be outstanding from
+     * peers that are polling and waiting for a response; it is unclear
+     * whether a request arriving between the start of the disconnect and
+     * here can be lost. The loop below polls until both connection counts
+     * drain or the timeout expires. */
+
+    if (ptl->connect_from > 0 || ptl->connect_to > 0) {
+        while (ptl->connect_from > 0 || ptl->connect_to > 0) {
+            if (!psmi_cycles_left(t_start, timeout_ns)) {
+                err = PSM_TIMEOUT;
+                _IPATH_VDBG("CCC timed out with from=%d,to=%d\n",
+                            ptl->connect_from,
+                            ptl->connect_to);
+                break;
+            }
+            psmi_poll_internal(ptl->ep, 1);
+        }
+    }
+    else {
+        _IPATH_VDBG("CCC complete disconnect from=%d,to=%d\n",
+                    ptl->connect_from,
+                    ptl->connect_to);
+    }
+
+    if ((err_seg = psmi_shm_detach(ptl->ep))) {
+        err = err_seg;
+        goto fail;
+    }
+
+    /* This prevents poll calls between now and the point where the endpoint
+     * is deallocated from referencing memory that has disappeared */
+#ifdef PSM_HAVE_SCIF
+    for(i = 0; i < ptl->ep->scif_nnodes; i++) {
+        ptl->repH[i].head = &ptl->amsh_empty_shortpkt;
+        ptl->reqH[i].head = &ptl->amsh_empty_shortpkt;
+    }
+#else
+    ptl->repH[0].head = &ptl->amsh_empty_shortpkt;
+    ptl->reqH[0].head = &ptl->amsh_empty_shortpkt;
+#endif
+
+    return PSM_OK;
+fail:
+    return err;
+
+}
+
+static
+psm_error_t
+amsh_setopt(const void *component_obj, int optname,
+            const void *optval, uint64_t optlen)
+{
+    /* No options for AM PTL at the moment */
+    return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown AM ptl option %d.", optname);
+}
+
+static
+psm_error_t
+amsh_getopt(const void *component_obj, int optname,
+            void *optval, uint64_t *optlen)
+{
+    /* No options for AM PTL at the moment */
+    return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown AM ptl option %d.", optname);
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_amsh = {
+    amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt
+};
+
+#ifdef PSM_HAVE_SCIF
+/* Wait for incoming connections on the SCIF listen socket.
+   When a connection arrives, store the SCIF socket in the correct place and
+   respond so that the remote process can map our shared queue area.
+ */
+static void* am_ctl_accept_thread(void* arg)
+{
+    ptl_t* ptl = (ptl_t*)arg;
+    psm_ep_t ep = ptl->ep;
+    struct scif_portID peer;
+    scif_epd_t epd;
+    void* addr;
+    int peeridx;
+    int shmidx;
+    int nodeid;
+
+    /* Receive this struct to ID the peer (offset unused). */
+    /* Send this struct to share memory mapping information. */
+    struct { off_t offset; int verno; psm_epid_t epid; } inbuf, outbuf;
+
+    while(1) {
+        /* Block on accepting a new connection on the SCIF listen socket. */
+        if(scif_accept(ep->scif_epd, &peer, &epd, SCIF_ACCEPT_SYNC)) {
+            if(errno == EINTR) {
+                /* Time to quit! */
+                _IPATH_VDBG("SCIF accept thread quitting\n");
+                pthread_exit(NULL);
+                return NULL;
+            }
+
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "scif_accept failed: %d %s\n", errno, strerror(errno));
+            continue;
+        }
+
+        /* Register the shared memory area this peer should access. */
+        /* SCIF_MAP_FIXED is used to ensure that offset == addr, so that the
+           returned offset does not need to be tracked as well.
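+           Peers can then address our queues directly with the virtual
+           address we publish, with no separate offset translation step.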
*/ + addr = ep->amsh_qdir[ep->amsh_shmidx].amsh_base; + outbuf.offset = scif_register(epd, addr, + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES, + (off_t)addr, SCIF_PROT_READ|SCIF_PROT_WRITE, SCIF_MAP_FIXED); + + _IPATH_PRDBG("registered addr %p at offset %p length %ld\n", + addr, (void*)outbuf.offset, + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); + if(outbuf.offset == SCIF_REGISTER_FAILED) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_register failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + continue; + } + + outbuf.verno = PSMI_VERNO; + outbuf.epid = ep->epid; + + if (amsh_scif_send(epd, &outbuf, sizeof(outbuf))) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_send epd %d failed: %d %s\n", + epd, errno, strerror(errno)); + scif_close(epd); + continue; + } + + /* Receive peer identification information */ + if(amsh_scif_recv(epd, &inbuf, sizeof(inbuf))) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_recv failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + continue; + } + + /* Extract information from the peer's epid. */ + nodeid = (int)((inbuf.epid>>48)&0xff); + shmidx = (int)((inbuf.epid>>56)&0xff); + + /* Port isn't supposed to match -- we have the peer's listen port, + which won't be the same as the connect socket's port. */ + if(peer.node != nodeid) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "SCIF node:port %d:%d does not match encoded epid nodeid %d", + peer.node, peer.port, nodeid); + scif_close(epd); + continue; + } + + /* Now that the peer's identity is known, store the new connection. */ + /* 0 1 mynodeid 3 4 */ + /* mynodeid 0 1 3 4 */ + if(nodeid > ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * nodeid) + shmidx; + } else if(nodeid < ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * (nodeid + 1)) + shmidx; + } else { + peeridx = shmidx; + } + + ptl->ep->amsh_qdir[peeridx].amsh_epid = inbuf.epid; + ptl->ep->amsh_qdir[peeridx].amsh_verno = inbuf.verno; + + /* There are eventually two connections. epd[0] always has the remote + memory mapped region associated with it, and is used to make requests + to that peer. epd[1] exposes our local shared memory, and is used + to respond to remote requests. */ + ptl->ep->amsh_qdir[peeridx].amsh_epd[1] = epd; + + _IPATH_VDBG( + "shmidx %d accepted %d:%d peeridx %d epd %d shmidx %d\n", + ep->amsh_shmidx, peer.node, peer.port, peeridx, + ep->amsh_qdir[peeridx].amsh_epd[1], + ep->amsh_qdir[peeridx].amsh_shmidx); + } + + return NULL; +} +#endif //PSM_HAVE_SCIF + diff --git a/ptl_am/kcopyrw.h b/ptl_am/kcopyrw.h new file mode 100644 index 0000000..c50127c --- /dev/null +++ b/ptl_am/kcopyrw.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+
+/*
+ * read from remote process pid
+ */
+int64_t kcopy_get(int fd, pid_t pid, const void *src, void *dst, int64_t n);
+
+/*
+ * write to remote process pid
+ */
+int64_t kcopy_put(int fd, const void *src, pid_t pid, void *dst, int64_t n);
+
+/*
+ * return the ABI version or -1 on error
+ */
+int kcopy_abi(int fd);
diff --git a/ptl_am/kcopyrwu.c b/ptl_am/kcopyrwu.c
new file mode 100644
index 0000000..839846f
--- /dev/null
+++ b/ptl_am/kcopyrwu.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+
+#include "kcopyrw.h"
+
+#define KCOPY_GET_SYSCALL 1
+#define KCOPY_PUT_SYSCALL 2
+#define KCOPY_ABI_SYSCALL 3
+
+struct kcopy_syscall {
+    uint32_t tag;
+    pid_t pid;
+    uint64_t n;
+    uint64_t src;
+    uint64_t dst;
+};
+
+int64_t kcopy_get(int fd, pid_t pid, const void *src, void *dst, int64_t n) {
+    struct kcopy_syscall e = {
+        .tag = KCOPY_GET_SYSCALL,
+        .pid = pid,
+        .n = n,
+        .src = (uint64_t) (uintptr_t) src,
+        .dst = (uint64_t) (uintptr_t) dst
+    };
+    int64_t ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = n;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
+
+int64_t kcopy_put(int fd, const void *src, pid_t pid, void *dst, int64_t n) {
+    struct kcopy_syscall e = {
+        .tag = KCOPY_PUT_SYSCALL,
+        .pid = pid,
+        .n = n,
+        .src = (uint64_t) (uintptr_t) src,
+        .dst = (uint64_t) (uintptr_t) dst
+    };
+    int64_t ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = n;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
+
+int kcopy_abi(int fd) {
+    int32_t abi;
+    struct kcopy_syscall e = {
+        .tag = KCOPY_ABI_SYSCALL,
+        .dst = (uint64_t) (uintptr_t) &abi
+    };
+    int ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = abi;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
diff --git a/ptl_am/knemrw.h b/ptl_am/knemrw.h
new file mode 100644
index 0000000..4e22e0f
--- /dev/null
+++ b/ptl_am/knemrw.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(PSM_USE_KNEM)
+#include "knem_io.h"
+#endif
+
+/*
+ * Open handle to knem device.
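+ * Returns the process-wide file descriptor, or -1 when the device cannot
+ * be opened (or when KNEM support is not compiled in).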
+ */ +int knem_open_device(); + +/* + * read from remote process given a cookie + */ +int64_t knem_get(int fd, int64_t cookie, const void *src, int64_t n); + +/* + * write to remote process pid given a cookie + */ +int64_t knem_put(int fd, const void *src, int64_t n, int64_t cookie); + +/* + * register a memory region for put/get + */ +int64_t knem_register_region(void *buffer, size_t len, int write); diff --git a/ptl_am/knemrwu.c b/ptl_am/knemrwu.c new file mode 100644 index 0000000..358f555 --- /dev/null +++ b/ptl_am/knemrwu.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2010. QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <stdint.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "knemrw.h"
+
+int knem_open_device()
+{
+    /* Process wide knem handle */
+    static int fd = -1;
+
+#if defined(PSM_USE_KNEM)
+    if (fd >= 0)
+        return fd;
+
+    fd = open(KNEM_DEVICE_FILENAME, O_RDWR);
+#endif
+    return fd;
+}
+
+int64_t knem_get(int fd, int64_t cookie, const void *src, int64_t n)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_inline_copy c;
+    struct knem_cmd_param_iovec iov;
+    int err;
+
+    iov.base = (uint64_t) (uintptr_t) src;
+    iov.len = n;
+
+    c.local_iovec_array = (uintptr_t) &iov;
+    c.local_iovec_nr = 1;
+    c.remote_cookie = cookie;
+    c.remote_offset = 0;
+    c.write = 0; /* Do a Read/Get from remote memory region */
+    c.flags = 0;
+    err = ioctl(fd, KNEM_CMD_INLINE_COPY, &c);
+
+    if (c.current_status != KNEM_STATUS_SUCCESS) {
+        _IPATH_INFO("KNEM: Get request of size 0x%"PRIx64" failed with error %d.\n",
+                    n, c.current_status);
+        err = c.current_status;
+    }
+
+    return err;
+#else
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (get), support for which has "
+        "not been compiled in.");
+
+    return PSM_INTERNAL_ERR;
+#endif
+}
+
+int64_t knem_put(int fd, const void *src, int64_t n, int64_t cookie)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_inline_copy c;
+    struct knem_cmd_param_iovec iov;
+    int err;
+
+    iov.base = (uint64_t) (uintptr_t) src;
+    iov.len = n;
+
+    c.local_iovec_array = (uintptr_t) &iov;
+    c.local_iovec_nr = 1;
+    c.remote_cookie = cookie;
+    c.remote_offset = 0;
+    c.write = 1; /* Do a Write/Put to remote memory region */
+    c.flags = 0;
+    err = ioctl(fd, KNEM_CMD_INLINE_COPY, &c);
+
+    if (c.current_status != KNEM_STATUS_SUCCESS) {
+        _IPATH_INFO("KNEM: Put request of size 0x%"PRIx64" failed with error %d.\n",
+                    n, c.current_status);
+        err = c.current_status;
+    }
+
+    return err;
+#else
+
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (put), support for which has "
+        "not been compiled in.");
+
+    return PSM_INTERNAL_ERR;
+#endif
+
+}
+
+int64_t knem_register_region(void *buffer, size_t len, int write)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_create_region create;
+    struct knem_cmd_param_iovec iov;
+
+    iov.base = (uint64_t) (uintptr_t) buffer;
+    iov.len = len;
+    create.iovec_array = (uintptr_t) &iov;
+    create.iovec_nr = 1;
+    create.flags = KNEM_FLAG_SINGLEUSE; /* Automatically destroy after put */
+    create.protection = write ? PROT_WRITE : PROT_READ;
+
+    /* TODO: handle failure in memory registration; the ioctl return value
+     * is currently unchecked. */
+    ioctl(psmi_kassist_fd, KNEM_CMD_CREATE_REGION, &create);
+    return create.cookie; /* Cookie for registered memory region */
+#else
+
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (reg), support for which has "
+        "not been compiled in.");
+    return 0;
+#endif
+
+}
diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000..34c1342
--- /dev/null
+++ b/ptl_am/psm_am_internal.h
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PSMI_AM_H +#define PSMI_AM_H + +#include "../psm_am_internal.h" + +#define NSHORT_ARGS 6 +typedef +struct amsh_am_token +{ + struct psmi_am_token tok; + + + ptl_t *ptl; /**> What PTL was it received on */ + psm_mq_t mq; /**> What matched queue is this for ? */ + int shmidx; /**> what shmidx sent this */ + int loopback; /**> Whether to reply as loopback */ +} +amsh_am_token_t; + +typedef void (*psmi_handler_fn_t)(void *token, psm_amarg_t *args, int nargs, void *src, size_t len); + +typedef struct psmi_handlertab { + psmi_handler_fn_t fn; +} psmi_handlertab_t; + +/* + * Can change the rendezvous threshold based on usage of kcopy (or not) + */ +#define PSMI_MQ_RV_THRESH_KCOPY 16000 + +/* + * Can change the rendezvous threshold based on usage of knem (or not) + */ +#define PSMI_MQ_RV_THRESH_KNEM 16000 + +/* If no kernel assisted copy is available this is the rendezvous threshold */ +#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 + +/* Threshold for using SCIF DMA to do data transfers */ +#define PSMI_MQ_RV_THRESH_SCIF_DMA (150000) + +#define PSMI_AM_CONN_REQ 1 +#define PSMI_AM_CONN_REP 2 +#define PSMI_AM_DISC_REQ 3 +#define PSMI_AM_DISC_REP 4 + +#define PSMI_KASSIST_OFF 0x0 +#define PSMI_KASSIST_KCOPY_GET 0x1 +#define PSMI_KASSIST_KCOPY_PUT 0x2 +#define PSMI_KASSIST_KNEM_GET 0x4 +#define PSMI_KASSIST_KNEM_PUT 0x8 + +#define PSMI_KASSIST_KCOPY 0x3 +#define PSMI_KASSIST_KNEM 0xC +#define PSMI_KASSIST_GET 0x15 +#define PSMI_KASSIST_PUT 0x2A +#define PSMI_KASSIST_MASK 0x3F + +#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_KNEM_PUT +#define PSMI_KASSIST_MODE_DEFAULT_STRING "knem-put" + +int psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr); + +#define PSMI_SCIF_DMA_OFF 0x0 +#define PSMI_SCIF_DMA_GET 0x1 +#define PSMI_SCIF_DMA_PUT 0x2 + +#define PSMI_SCIF_DMA_MODE_DEFAULT PSMI_SCIF_DMA_GET +#define PSMI_SCIF_DMA_MODE_DEFAULT_STRING "scif-get" + +/* + * Eventually, we will allow users to register handlers as "don't reply", which + * may save on some of the buffering requirements + */ +#define PSMI_HANDLER_NEEDS_REPLY(handler) 1 +#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler)) + +int psmi_amsh_poll(ptl_t *ptl, int replyonly); + +/* Shared memory AM, forward decls */ +int +psmi_amsh_short_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t 
handler, psm_amarg_t *args, int nargs,
+                        const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+                      psm_handler_t handler, psm_amarg_t *args, int nargs,
+                      const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm_epaddr_t epaddr,
+                       psm_handler_t handler, psm_amarg_t *args, int nargs,
+                       const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+                     psm_handler_t handler, psm_amarg_t *args, int nargs,
+                     const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_data(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+
+/* AM over shared memory (forward decls) */
+psm_error_t
+psmi_amsh_am_short_request(psm_epaddr_t epaddr,
+                           psm_handler_t handler, psm_amarg_t *args, int nargs,
+                           void *src, size_t len, int flags,
+                           psm_am_completion_fn_t completion_fn,
+                           void *completion_ctxt);
+psm_error_t
+psmi_amsh_am_short_reply(psm_am_token_t tok,
+                         psm_handler_t handler, psm_amarg_t *args, int nargs,
+                         void *src, size_t len, int flags,
+                         psm_am_completion_fn_t completion_fn,
+                         void *completion_ctxt);
+
+#define amsh_conn_handler_hidx   1
+#define mq_handler_hidx          2
+#define mq_handler_data_hidx     3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx  5
+#define am_handler_hidx          6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG  1
+#define AMREPLY_SHORT   2
+#define AMREPLY_LONG    3
+#define AM_IS_REPLY(x)   ((x)&0x2)
+#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x)    ((x)&0x1)
+#define AM_IS_SHORT(x)   (!AM_IS_LONG(x))
+
+#define AM_FLAG_SRC_ASYNC 0x1
+#define AM_FLAG_SRC_TEMP  0x2
+
+/*
+ * Request Fifo.
+ */
+typedef
+struct am_reqq {
+    struct am_reqq *next;
+    int amtype;
+
+    ptl_t *ptl;
+    psm_epaddr_t epaddr;
+    psm_handler_t handler;
+    psm_amarg_t args[8];
+    int nargs;
+    void *src;
+    uint32_t len;
+    void *dest;
+    int amflags;
+    int flags;
+}
+am_reqq_t;
+
+struct am_reqq_fifo_t {
+    am_reqq_t *first;
+    am_reqq_t **lastp;
+};
+
+psm_error_t psmi_am_reqq_drain(ptl_t *ptl);
+void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm_epaddr_t epaddr,
+                      psm_handler_t handler, psm_amarg_t *args, int nargs,
+                      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted with amsh_ep_) and others are
+ * specific to the single shared memory context (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
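+ *
+ * A sketch of the slot lifecycle implied by the flag macros below (the
+ * real send and poll paths live in am_reqrep_shmem.c; the queue and
+ * argument names here are illustrative only):
+ *
+ *   am_pkt_short_t *pkt = &shortq[tail];   -- sender claims the tail slot
+ *   QMARKUSED(pkt);                        -- flag = QUSED, then fence
+ *   pkt->args[0] = arg0;                   -- fill header and payload
+ *   QMARKREADY(pkt);                       -- flag = QREADY, then fence
+ *
+ * The receiver polls its cached short queue for QREADY, dispatches the
+ * handler through process_packet(), and QMARKFREE()s the slot for reuse.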
+ */
+
+#define QFREE      0
+#define QUSED      1
+#define QREADY     2
+#define QREADYMED  3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QUSED)
+#ifdef __powerpc__
+#  define _QMARK_FLAG_FENCE() asm volatile("lwsync" : : : "memory")
+#elif defined(__x86_64__) || defined(__i386__)
+#  define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compiler fence */
+#else
+#  error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag) do { \
+            (pkt_ptr)->flag = (_flag);   \
+            _QMARK_FLAG_FENCE();         \
+        } while (0)
+
+#define QMARKFREE(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QUSED)
+
+#define AMFMT_SYSTEM       1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT        3
+#define AMFMT_LONG         4
+#define AMFMT_LONG_END     5
+#define AMFMT_HUGE         6
+#define AMFMT_HUGE_END     7
+
+#define _shmidx _ptladdr_u32[0]
+#define _cstate _ptladdr_u32[1]
+
+#define AMSH_CMASK_NONE    0
+#define AMSH_CMASK_PREREQ  1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE    3
+
+#define AMSH_CSTATE_TO_MASK         0x0f
+#define AMSH_CSTATE_TO_NONE         0x01
+#define AMSH_CSTATE_TO_REPLIED      0x02
+#define AMSH_CSTATE_TO_ESTABLISHED  0x03
+#define AMSH_CSTATE_TO_DISC_REPLIED 0x04
+#define AMSH_CSTATE_TO_GET(epaddr)  ((epaddr)->_cstate & AMSH_CSTATE_TO_MASK)
+#define AMSH_CSTATE_TO_SET(epaddr,state) \
+    (epaddr)->_cstate = (((epaddr)->_cstate & ~AMSH_CSTATE_TO_MASK) | \
+        ((AMSH_CSTATE_TO_ ## state) & AMSH_CSTATE_TO_MASK))
+
+#define AMSH_CSTATE_FROM_MASK        0xf0
+#define AMSH_CSTATE_FROM_NONE        0x10
+#define AMSH_CSTATE_FROM_DISC_REQ    0x40
+#define AMSH_CSTATE_FROM_ESTABLISHED 0x50
+#define AMSH_CSTATE_FROM_GET(epaddr) ((epaddr)->_cstate & AMSH_CSTATE_FROM_MASK)
+#define AMSH_CSTATE_FROM_SET(epaddr,state) \
+    (epaddr)->_cstate = (((epaddr)->_cstate & ~AMSH_CSTATE_FROM_MASK) | \
+        ((AMSH_CSTATE_FROM_ ## state) & AMSH_CSTATE_FROM_MASK))
+
+/**********************************
+ * Shared memory packet formats
+ **********************************/
+typedef
+struct am_pkt_short {
+    uint32_t flag;            /**> Packet state */
+    union {
+        uint32_t bulkidx;     /**> index in bulk packet queue */
+        uint32_t length;      /**> length when no bulkidx used */
+    };
+    uint16_t shmidx;          /**> index in shared segment */
+    uint16_t type;
+    uint16_t nargs;
+    uint16_t handleridx;
+
+    psm_amarg_t args[NSHORT_ARGS]; /* AM arguments */
+
+    /* We eventually will expose up to 8 arguments, but this isn't implemented
+     * For now. >6 args will probably require a medium instead of a short */
+}
+am_pkt_short_t PSMI_CACHEALIGN;
+PSMI_STRICT_SIZE_DECL(am_pkt_short_t,64);
+
+typedef struct am_pkt_bulk {
+    uint32_t flag;
+    uint32_t idx;
+    uintptr_t dest;           /* Destination pointer in "longs" */
+    uint32_t dest_off;        /* Destination pointer offset */
+    uint32_t len;             /* Destination length within offset */
+    psm_amarg_t args[2];      /* Additional "spillover" for >6 args */
+    uint8_t payload[0];
+}
+am_pkt_bulk_t;
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
+/* Each pkt queue has the same header format, although the queue
+ * consumers don't use the 'head' index in the same manner. */
+typedef struct am_ctl_qhdr {
+    uint32_t head;            /* Touched only by 1 consumer */
+    uint8_t _pad0[64-4];
+
+    /* tail is now located on the dirpage.
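+       Each local sender advances a shared tail through the matching
+       amsh_qtail_info_t (tail index plus spinlock), so multiple processes
+       can append to the same peer queue safely.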
*/
+    uint32_t elem_cnt;
+    uint32_t elem_sz;
+    uint8_t _pad1[64-2*sizeof(uint32_t)];
+}
+am_ctl_qhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t,128);
+
+/* Each block reserves some space at the beginning to store auxiliary data */
+#define AMSH_BLOCK_HEADER_SIZE 4096
+
+/* Each process has a reply qhdr and a request qhdr */
+typedef struct am_ctl_blockhdr {
+    volatile am_ctl_qhdr_t shortq;
+    volatile am_ctl_qhdr_t medbulkq;
+    volatile am_ctl_qhdr_t longbulkq;
+    volatile am_ctl_qhdr_t hugebulkq;
+}
+am_ctl_blockhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t,128*4);
+
+/* We cache the "shorts" because that's what we poll on in the critical path.
+ * We take care to always update these pointers whenever the segment is
+ * remapped.
+ */
+typedef struct am_ctl_qshort_cache {
+    volatile am_pkt_short_t *base;
+    volatile am_pkt_short_t *head;
+    volatile am_pkt_short_t *end;
+}
+am_ctl_qshort_cache_t;
+
+struct amsh_qptrs {
+    am_ctl_blockhdr_t *qreqH;
+    am_pkt_short_t *qreqFifoShort;
+    am_pkt_bulk_t *qreqFifoMed;
+    am_pkt_bulk_t *qreqFifoLong;
+    am_pkt_bulk_t *qreqFifoHuge;
+
+    am_ctl_blockhdr_t *qrepH;
+    am_pkt_short_t *qrepFifoShort;
+    am_pkt_bulk_t *qrepFifoMed;
+    am_pkt_bulk_t *qrepFifoLong;
+    am_pkt_bulk_t *qrepFifoHuge;
+};
+
+/******************************************
+ * Shared segment local directory (global)
+ ******************************************
+ *
+ * Each process keeps a directory for where request and reply structures are
+ * located at its peers.
+ */
+struct amsh_qdirectory {
+    /* These pointers are convenience aliases for the local node queues
+       also found in the qptrs array. */
+    am_ctl_blockhdr_t *qreqH;
+    am_pkt_short_t *qreqFifoShort;
+    am_pkt_bulk_t *qreqFifoMed;
+    am_pkt_bulk_t *qreqFifoLong;
+    am_pkt_bulk_t *qreqFifoHuge;
+
+    am_ctl_blockhdr_t *qrepH;
+    am_pkt_short_t *qrepFifoShort;
+    am_pkt_bulk_t *qrepFifoMed;
+    am_pkt_bulk_t *qrepFifoLong;
+    am_pkt_bulk_t *qrepFifoHuge;
+
+    struct amsh_qptrs qptrs[PTL_AMSH_MAX_LOCAL_NODES];
+
+    int kassist_pid;
+
+/*
+ * Peer's view of my index: for the initial node it is the same as
+ * ep->amsh_shmidx; for other remote nodes it is calculated by a circular
+ * offset of PTL_AMSH_MAX_LOCAL_PROCS, node-ID, and ep->amsh_shmidx.
+ */
+    int amsh_shmidx;
+    psm_epid_t amsh_epid;
+    uint16_t amsh_verno;
+#ifdef PSM_HAVE_SCIF
+    scif_epd_t amsh_epd[2];
+#endif
+    off_t amsh_offset;
+    void *amsh_base;
+    psm_epaddr_t amsh_epaddr;
+} __attribute__ ((aligned(8)));
+
+typedef struct amsh_qtail_info
+{
+    volatile uint32_t tail;
+    volatile pthread_spinlock_t lock;
+    uint8_t _pad0[64-1*4-sizeof(pthread_spinlock_t)];
+} amsh_qtail_info_t;
+PSMI_STRICT_SIZE_DECL(amsh_qtail_info_t,64);
+
+struct amsh_qtail
+{
+    amsh_qtail_info_t reqFifoShort;
+    amsh_qtail_info_t reqFifoMed;
+    amsh_qtail_info_t reqFifoLong;
+    amsh_qtail_info_t reqFifoHuge;
+
+    amsh_qtail_info_t repFifoShort;
+    amsh_qtail_info_t repFifoMed;
+    amsh_qtail_info_t repFifoLong;
+    amsh_qtail_info_t repFifoHuge;
+} __attribute__ ((aligned(64)));
+
+/* The first shared memory page is a control page to support each endpoint
+ * independently adding themselves to the shared memory segment.
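+ * Attaching processes serialize on 'lock' below: each one records its epid
+ * in shmidx_map_epid[] and bumps num_attached (a sketch of the attach
+ * protocol implemented by psmi_shm_attach()).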
*/ +struct am_ctl_dirpage { + pthread_mutex_t lock; + char _pad0[64-sizeof(pthread_mutex_t)]; + volatile int is_init; + char _pad1[64-sizeof(int)]; + + uint16_t psm_verno[PTL_AMSH_MAX_LOCAL_PROCS]; + uint32_t amsh_features[PTL_AMSH_MAX_LOCAL_PROCS]; + int num_attached; /* 0..MAX_LOCAL_PROCS-1 */ + int max_idx; + + psm_epid_t shmidx_map_epid[PTL_AMSH_MAX_LOCAL_PROCS]; + int kcopy_minor; + int kassist_pids[PTL_AMSH_MAX_LOCAL_PROCS]; + + /* A set of tail queue data for each remote domain. Each domain has + a reserved set of queues for each other domain. The queues are located + in shared memory on the target domain, while the tail pointer is + located on the source domain. */ + /* The tail pointers are located in the dirpage because each peer in this + domain will be sharing them (atomically). The dirpage is mapped by + all processes already, so just use it. */ + struct amsh_qtail qtails[PTL_AMSH_MAX_LOCAL_PROCS*PTL_AMSH_MAX_LOCAL_NODES]; +}; + +#define AMSH_HAVE_KCOPY 0x01 +#define AMSH_HAVE_KNEM 0x02 +#define AMSH_HAVE_SCIF 0x04 +#define AMSH_HAVE_KASSIST 0x7 + +/****************************************** + * Shared fifo element counts and sizes + ****************************************** + * These values are context-wide, they can only be set early on and can't be * + * modified at runtime. All endpoints are expected to use the same values. + */ +typedef +struct amsh_qinfo { + int qreqFifoShort; + int qreqFifoMed; + int qreqFifoLong; + int qreqFifoHuge; + + int qrepFifoShort; + int qrepFifoMed; + int qrepFifoLong; + int qrepFifoHuge; +} +amsh_qinfo_t; + +/****************************************** + * Per-endpoint structures (ep-local) + ****************************************** + * Each endpoint keeps its own information as to where it resides in the + * directory, and maintains its own cached copies of where the short header + * resides in shared memory. + * + * NOTE: All changes must be reflected in PSMI_AMSH_EP_SIZE + */ +struct ptl { + psm_ep_t ep; + psm_epid_t epid; + psm_epaddr_t epaddr; + ptl_ctl_t *ctl; + int shmidx; + am_ctl_qshort_cache_t reqH[PTL_AMSH_MAX_LOCAL_NODES]; + am_ctl_qshort_cache_t repH[PTL_AMSH_MAX_LOCAL_NODES]; + int zero_polls; + int amsh_only_polls; + + pthread_mutex_t connect_lock; + int connect_phase; + int connect_to; + int connect_from; + +/* List of context-specific shared variables */ + amsh_qinfo_t amsh_qsizes; + am_pkt_short_t amsh_empty_shortpkt; + struct am_reqq_fifo_t psmi_am_reqq_fifo; + +}; + +#endif diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c new file mode 100644 index 0000000..8638652 --- /dev/null +++ b/ptl_am/ptl.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "kcopyrw.h" +#include "knemrw.h" +#include "scifrw.h" + +static +psm_error_t +ptl_handle_rtsmatch_request(psm_mq_req_t req, int was_posted, amsh_am_token_t *tok) +{ + psm_amarg_t args[5] = {}; + psm_epaddr_t epaddr = req->rts_peer; + ptl_t *ptl = epaddr->ptl; + int pid = 0; + int used_get = 0; + + psmi_assert((tok != NULL && was_posted) || (tok == NULL && !was_posted)); + + _IPATH_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", + req, req->buf, req->recv_msglen, tok); + + args[0].u64w0 = (uint64_t)(uintptr_t) req->ptl_req_ptr; + args[1].u64w0 = (uint64_t)(uintptr_t) req; + args[2].u64w0 = (uint64_t)(uintptr_t) req->buf; + args[3].u32w0 = req->recv_msglen; + args[3].u32w1 = tok != NULL ? 1 : 0; + args[4].u64w0 = 0; + + /* First check: is the peer local? */ +#ifdef PSM_HAVE_SCIF + int shmidx = epaddr->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Use kassist if enabled */ + if ((ptl->ep->psmi_kassist_mode & PSMI_KASSIST_GET) && + req->recv_msglen > 0 && + (pid = psmi_epaddr_kcopy_pid(epaddr))) + { + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY) { + /* kcopy can be done in handler context or not. */ + size_t nbytes = kcopy_get(ptl->ep->psmi_kassist_fd, pid, + (void *) req->rts_sbuf, req->buf, req->recv_msglen); + psmi_assert_always(nbytes == req->recv_msglen); + } else { + psmi_assert_always(ptl->ep->psmi_kassist_mode & + PSMI_KASSIST_KNEM); + + /* knem copy can be done in handler context or not */ + knem_get(ptl->ep->psmi_kassist_fd, (int64_t) req->rts_sbuf, + (void*) req->buf, req->recv_msglen); + } + + used_get = 1; + } + + /* If KNEM PUT is active register region for peer to PUT data to */ + if (ptl->ep->psmi_kassist_mode == PSMI_KASSIST_KNEM_PUT) + args[4].u64w0 = knem_register_region(req->buf, req->recv_msglen, + PSMI_TRUE); + +#ifdef PSM_HAVE_SCIF + } else if(ptl->ep->scif_dma_threshold <= req->recv_msglen) { + /* Remote node and threshold is met, consider using SCIF DMA */ + + if(epaddr->ep->scif_dma_mode == PSMI_SCIF_DMA_GET) { + /* Read via SCIF DMA */ + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[1]; + + if(scif_vreadfrom(epd, req->buf, req->recv_msglen, + req->rts_sbuf, SCIF_RMA_USECACHE|SCIF_RMA_SYNC)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ptl_handle_rtsmatch_request(): scif_vreadfrom failed: (%d) %s", + errno, strerror(errno)); + } + + /* Give the remote offset back to the sender. 
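+               The sender unregisters that window in
+               psmi_am_mq_handler_rtsmatch() once it is notified that the
+               read has completed.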
*/ + args[4].u64w0 = req->rts_sbuf; + used_get = 1; + } + else if(epaddr->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT) { + /* Peer issues DMA commands on amsh_epd[0] */ + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[1]; + + off_t reg; + if(scif_register_region(epd, + req->buf, req->recv_msglen, ®) != PSM_OK) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ptl_handle_rtsmatch_request(): SCIF memory registration failed"); + } + + /* Stuff the SCIF registration offset into the buffer pointer. + This is needed later in psmi_am_mq_handler_rtsdone to unregister + the buffer. The registration is also passed across for the + sender side to issue a DMA write.*/ + req->buf = (void*)reg; + args[4].u64w0 = reg; + } + } +#endif + + if (tok != NULL) { + psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl, tok->tok.epaddr_from, + mq_handler_rtsmatch_hidx, args, 5, NULL, 0, NULL, 0); + } + else + psmi_amsh_short_request(ptl, epaddr, mq_handler_rtsmatch_hidx, + args, 5, NULL, 0, 0); + + /* 0-byte completion or we used kcopy */ + if (used_get == 1 || req->recv_msglen == 0) + psmi_mq_handle_rts_complete(req); + return PSM_OK; +} + +static +psm_error_t +ptl_handle_rtsmatch(psm_mq_req_t req, int was_posted) +{ + /* was_posted == 0 allows us to assume that we're not running this callback + * within am handler context (i.e. we can poll) */ + psmi_assert(was_posted == 0); + return ptl_handle_rtsmatch_request(req, 0, NULL); +} + +void +psmi_am_mq_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t *ptl = tok->ptl; + psm_mq_req_t req; + int rc; + int mode = args[0].u32w0; + uint64_t tag = args[1].u64; + uint32_t msglen = mode <= MQ_MSG_SHORT ? len : args[0].u32w1; + + _IPATH_VDBG("mq=%p mode=%d, len=%d, msglen=%d\n", + tok->mq, mode, (int) len, msglen); + + switch(mode) { + case MQ_MSG_TINY: + rc = psmi_mq_handle_tiny_envelope(tok->mq, tok->tok.epaddr_from, tag, + buf, (uint32_t) len); + return; + break; + case MQ_MSG_SHORT: + case MQ_MSG_LONG: + rc = psmi_mq_handle_envelope(tok->mq, mode, tok->tok.epaddr_from, + tag, (union psmi_egrid) 0U, + msglen, buf, (uint32_t) len); + return; + break; + default: { + void *sreq = (void *)(uintptr_t) args[2].u64w0; + uintptr_t sbuf = (uintptr_t) args[3].u64w0; + psmi_assert(narg == 5); + psmi_assert_always(mode == MQ_MSG_RTS); + rc = psmi_mq_handle_rts(tok->mq, tag, sbuf, msglen, + tok->tok.epaddr_from, + ptl_handle_rtsmatch, &req); + req->ptl_req_ptr = sreq; + + /* Overload rts_sbuf to contain the cookie for remote region */ + if(ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KNEM) + req->rts_sbuf = (uintptr_t) args[4].u64w0; +#ifdef PSM_HAVE_SCIF + else if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= msglen && + tok->tok.epaddr_from->_shmidx >= PTL_AMSH_MAX_LOCAL_PROCS) { + req->rts_sbuf = (uintptr_t) args[4].u64w0; + } +#endif + + if (rc == MQ_RET_MATCH_OK) /* handler context: issue a reply */ + ptl_handle_rtsmatch_request(req, 1, tok); + /* else will be called later */ + } + } + return; +} + +void +psmi_am_mq_handler_data(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + psm_mq_req_t req = STAILQ_FIRST(&tok->tok.epaddr_from->egrlong); + psmi_mq_handle_data(req, tok->tok.epaddr_from, 0, args[2].u32w0, buf, len); + + return; +} + +void +psmi_am_mq_handler_rtsmatch(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t 
*ptl = tok->ptl; + psm_mq_req_t sreq = (psm_mq_req_t) (uintptr_t) args[0].u64w0; + void *dest = (void *)(uintptr_t) args[2].u64w0; + uint32_t msglen = args[3].u32w0; + int pid = 0; + psm_amarg_t rarg[1] = {}; + + _IPATH_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n", + sreq, (void*)(uintptr_t)args[1].u64w0, sreq->buf, dest, msglen); + + if (msglen > 0) { + rarg[0].u64w0 = args[1].u64w0; /* rreq */ + +#ifdef PSM_HAVE_SCIF + int shmidx = tok->tok.epaddr_from->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Try Intra-node kassist */ + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_MASK) + pid = psmi_epaddr_kcopy_pid(tok->tok.epaddr_from); + else + pid = 0; + + if (!pid) + psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + sreq->buf, msglen, dest, 0); + else if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_PUT) + { + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY) { + size_t nbytes = kcopy_put(ptl->ep->psmi_kassist_fd, sreq->buf, + pid, dest, msglen); + psmi_assert_always(nbytes == msglen); + } else { + int64_t cookie = args[4].u64w0; + + psmi_assert_always( + ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KNEM); + + /* Do a PUT using KNEM */ + knem_put(ptl->ep->psmi_kassist_fd, + sreq->buf, msglen, cookie); + } + + /* Send response that PUT is complete */ + psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + NULL, 0, 0); + } +#ifdef PSM_HAVE_SCIF + } else { + /* Try SCIF DMA */ + scif_epd_t epd = + tok->tok.epaddr_from->ep->amsh_qdir[shmidx].amsh_epd[0]; + + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT && + ptl->ep->scif_dma_threshold <= msglen) { + off_t target_offset = args[4].u64w0; + + /* The DMA operation is NOT completed here. It is + initiated here, then the receiving side is notified. + The target issues a DMA fence to wait for the DMA + complete, then responds that it has completed handling + the transfer on that side. */ + /* The 'v' form takes care of local registration. */ + if(scif_vwriteto(epd, sreq->buf, msglen, target_offset, + SCIF_RMA_USECACHE)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "psmi_am_mq_handler_rtsmatch(): scif_vwriteto failed: (%d) %s", errno, strerror(errno)); + } + + /* Send response that PUT is complete */ + psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + NULL, 0, 0); + } else if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= msglen) { + /* GET mode: receiver has performed DMA read, so unregister. */ + scif_unregister_region(epd, args[4].u64w0, msglen); + } else { + /* No form of DMA is enabled -- use the memory copying path */ + psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + sreq->buf, msglen, dest, 0); + } + } +#endif + } //msglen > 0 + + psmi_mq_handle_rts_complete(sreq); +} + +void +psmi_am_mq_handler_rtsdone(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + psm_mq_req_t rreq = (psm_mq_req_t) (uintptr_t) args[0].u64w0; + psmi_assert(narg == 1); + + _IPATH_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->buf, rreq->recv_msglen); + +#ifdef PSM_HAVE_SCIF + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t *ptl = tok->ptl; + + psm_epaddr_t rmt_epaddr = rreq->rts_peer; + + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT && + ptl->ep->scif_dma_threshold <= rreq->recv_msglen && + rmt_epaddr->_shmidx >= PTL_AMSH_MAX_LOCAL_PROCS) { + /* SCIF DMA commands are initiated on amsh_epd[0]; the receive (for put) + side registration is on amsh_epd[1]. 
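+       The scif_fence_mark()/scif_fence_wait() pair below ensures the
+       peer-initiated DMA has completed before the request is marked
+       complete.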
*/
+        scif_epd_t epd =
+            rmt_epaddr->ep->amsh_qdir[rmt_epaddr->_shmidx].amsh_epd[1];
+
+        int mark;
+        if(scif_fence_mark(epd, SCIF_FENCE_INIT_PEER, &mark)) {
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "psmi_am_mq_handler_rtsdone(): scif_fence_mark failed: (%d) %s",
+                errno, strerror(errno));
+        }
+
+        /* When registered, the rreq->buf address is replaced with the SCIF
+           registration offset so that it can be used here. */
+        scif_unregister_region(epd, (off_t)rreq->buf, rreq->recv_msglen);
+
+        if(scif_fence_wait(epd, mark)) {
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "psmi_am_mq_handler_rtsdone(): scif_fence_wait failed: (%d) %s",
+                errno, strerror(errno));
+        }
+    }
+#endif
+
+    psmi_mq_handle_rts_complete(rreq);
+}
+
+void
+psmi_am_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len)
+{
+    amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+    psm_am_handler_fn_t hfn;
+
+    hfn = psm_am_get_handler_function(tok->mq->ep,
+                                      (psm_handler_t) args[0].u32w0);
+
+    /* Invoke handler function. For AM we do not support break functionality */
+    hfn(toki, tok->tok.epaddr_from, args+1, narg-1, buf, len);
+
+    return;
+}
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000..3be8f5b
--- /dev/null
+++ b/ptl_am/ptl_fwd.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PTL_FWD_AMSH_H
+#define _PTL_FWD_AMSH_H
+
+#define PTL_AMSH_MAX_LOCAL_PROCS 256
+
+/* SCIF manual says it is optimized for up to 8 nodes; use that as the
+   local node limit for now. */
+#ifdef PSM_HAVE_SCIF
+#define PTL_AMSH_MAX_LOCAL_NODES 8
+#else
+/* Compiling without SCIF: assume one node */
+#define PTL_AMSH_MAX_LOCAL_NODES 1
+#endif
+
+/* Symbol in am ptl */
+struct ptl_ctl_init psmi_ptl_amsh;
+
+/* Special non-ptl function exposed to pre-attach to shm segment */
+psm_error_t psmi_shm_attach(psm_ep_t ep, int *shmidx_o);
+psm_error_t psmi_shm_detach(psm_ep_t ep);
+
+extern int psmi_shm_mq_rv_thresh;
+
+#endif
diff --git a/ptl_am/scifrw.h b/ptl_am/scifrw.h
new file mode 100644
index 0000000..eb78126
--- /dev/null
+++ b/ptl_am/scifrw.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>	/* off_t */
+#include <stddef.h>	/* size_t */
+
+#if defined(PSM_HAVE_SCIF)
+#include <scif.h>
+
+/*
+ * register a memory region for put/get
+ */
+int scif_register_region(scif_epd_t epd, void* addr, size_t len, off_t* offset);
+
+/*
+ * unregister a memory region that was previously registered
+ */
+int scif_unregister_region(scif_epd_t epd, off_t reg, size_t len);
+
+#endif
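The pair of declarations above is the entire registration API that the shared-memory PTL wraps around SCIF DMA. A minimal usage sketch follows (not part of the original import; it assumes a PSM_HAVE_SCIF build, an already-connected scif_epd_t endpoint, and relies on PSM_OK being 0):

    #include <scif.h>
    #include "scifrw.h"

    /* hypothetical helper: expose one buffer to the peer, then retract it */
    static int example_expose_buffer(scif_epd_t epd, void *buf, size_t len)
    {
        off_t reg;
        if (scif_register_region(epd, buf, len, &reg) != 0)  /* 0 == PSM_OK */
            return -1;
        /* the peer may now target offset 'reg' with scif_readfrom(),
           scif_writeto() or scif_vwriteto() DMA operations */
        return scif_unregister_region(epd, reg, len);        /* 0 on success */
    }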
diff --git a/ptl_am/scifrwu.c b/ptl_am/scifrwu.c
new file mode 100644
index 0000000..d3ccd63
--- /dev/null
+++ b/ptl_am/scifrwu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>	/* off_t */
+#include <stdint.h>	/* uintptr_t */
+#include <errno.h>
+#include <string.h>	/* strerror */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "scifrw.h"
+
+#if defined(PSM_HAVE_SCIF)
+int scif_register_region(scif_epd_t epd, void* addr, size_t len, off_t* offset)
+{
+    /* SCIF requires registrations on page granularity. The address must be
+       rounded down to a page boundary, and the length must be rounded up. */
+    off_t addr_offset = (off_t)addr & 0xFFF;
+    uintptr_t reg_addr = (uintptr_t)addr & ~0xFFF;
+    size_t reg_len = len + addr_offset;
+
+    if(reg_len & 0xFFF) {
+        reg_len += 0x1000 - (reg_len & 0xFFF);
+    }
+
+    off_t reg = scif_register(epd, (void*)reg_addr, reg_len, 0,
+            SCIF_PROT_READ|SCIF_PROT_WRITE, 0);
+
+    if(reg == SCIF_REGISTER_FAILED) {
+        _IPATH_INFO("SCIF: Registering memory %p (%p) length %ld (%ld) epd %d failed: (%d) %s\n",
+                addr, (void*)reg_addr, len, reg_len, epd,
+                errno, strerror(errno));
+
+        *offset = SCIF_REGISTER_FAILED;
+        return PSM_INTERNAL_ERR;
+    }
+
+    /* Although the registration is rounded out to whole pages, return the
+       exact SCIF-space registration offset for the specified address. */
+    *offset = reg + addr_offset;
+    return PSM_OK;
+}
+
+int scif_unregister_region(scif_epd_t epd, off_t reg, size_t len)
+{
+    /* SCIF requires registrations on page granularity. The address must be
+       rounded down to a page boundary, and the length must be rounded up. */
+    off_t reg_addr = reg & ~0xFFF;
+    size_t reg_len = len + ((size_t)reg & 0xFFF);
+
+    if(reg_len & 0xFFF) {
+        reg_len += 0x1000 - (reg_len & 0xFFF);
+    }
+
+    if(scif_unregister(epd, reg_addr, reg_len)) {
+        _IPATH_INFO("SCIF: Unregistering offset %lx (%lx) length %ld (%ld) epd %d failed: (%d) %s\n",
+                reg, reg_addr, len, reg_len, epd,
+                errno, strerror(errno));
+        return PSM_INTERNAL_ERR;
+    }
+
+    return PSM_OK;
+}
+
+#endif /* defined(PSM_HAVE_SCIF) */
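The 0xFFF/0x1000 masks above hard-code 4 KiB pages. A standalone worked instance of the same rounding, added here for illustration (the address and length are arbitrary):

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    int main(void)
    {
        uintptr_t addr = 0x10005678;                      /* unaligned start */
        size_t len = 100;
        uintptr_t reg_addr = addr & ~(uintptr_t)0xFFF;    /* 0x10005000 */
        size_t reg_len = len + (addr & 0xFFF);            /* 100 + 0x678 = 0x6DC */
        if (reg_len & 0xFFF)
            reg_len += 0x1000 - (reg_len & 0xFFF);        /* rounds up to 0x1000 */
        assert(reg_addr == 0x10005000 && reg_len == 0x1000);
        return 0;
    }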
diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile
new file mode 100644
index 0000000..dc06808
--- /dev/null
+++ b/ptl_ips/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved.
+# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
+
+${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ipserror.o ips_recvq.o \
+		   ips_recvhdrq.o ips_spio.o ips_proto_recv.o ips_proto_connect.o \
+		   ips_proto_dump.o ips_proto_mq.o ips_subcontext.o \
+		   ips_writehdrq.o ips_proto_expected.o ips_tid.o
+
+# enable mov,0 -> xor optimization for ips
+ifeq (${CCARCH},pathcc)
+  ifeq (,${PSM_DEBUG})
+    CFLAGS += -CG:use_xortozero=1
+  endif
+endif
+
+all: ${${TARGLIB}-objs}
+
+%.o: %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	rm -f *.o
+
diff --git a/ptl_ips/ips_crc32.c b/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000..6a7b85a
--- /dev/null
+++ b/ptl_ips/ips_crc32.c
@@ -0,0 +1,91 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+   modified from its original form to suit our requirements. The zlib
+   license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.3, July 18th, 2005
+
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty. In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Table of CRCs of all 8-bit messages. */
+static uint32_t crc_table[256];
+
+/* Flag: has the table been computed? Initially false. */
+static int crc_table_computed = 0;
+
+/* Make the table for a fast CRC. */
+static void make_crc_table(void)
+{
+    uint32_t c;
+    int n, k;
+
+    for (n = 0; n < 256; n++) {
+        c = (uint32_t) n;
+        for (k = 0; k < 8; k++) {
+            if (c & 1)
+                c = 0xedb88320 ^ (c >> 1);
+            else
+                c = c >> 1;
+        }
+        crc_table[n] = c;
+    }
+    crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see
+ * ips_crc_calculate() below).
+ */
+
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+    uint32_t c = crc;
+    uint32_t n;
+
+    if (!crc_table_computed) {
+        make_crc_table();
+    }
+    for (n = 0; n < len; n++) {
+        c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8);
+    }
+    return c;
+}
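ips_crc_calculate() is a plain table-driven, reflected CRC-32 (polynomial 0xEDB88320) that leaves seeding and finalization to the caller. A conventional whole-buffer wrapper looks like the sketch below (illustrative only; the prototype is assumed to be visible via ips_proto.h, and PSM's packet code may use a different seed/finalize convention):

    #include <stdint.h>

    uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);

    /* conventional CRC-32 framing around the running update above */
    static uint32_t example_crc32(uint8_t *data, uint32_t len)
    {
        uint32_t crc = 0xFFFFFFFF;               /* "initialized to all 1's" */
        crc = ips_crc_calculate(len, data, crc);
        return crc ^ 0xFFFFFFFF;                 /* transmit 1's complement */
    }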
diff --git a/ptl_ips/ips_epstate.c b/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000..43c81ba
--- /dev/null
+++ b/ptl_ips/ips_epstate.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver. Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index. This narrows the window in which
+ * two jobs communicating with the same set of indexes can see crosstalk.
+ */
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK 128
+
+psm_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+    memset(eps, 0, sizeof(*eps));
+    eps->context = context;
+    eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) &
+        (IPS_EPSTATE_COMMIDX_MAX-1);
+    return PSM_OK;
+}
+
+psm_error_t
+ips_epstate_fini(struct ips_epstate *eps)
+{
+    if (eps->eps_tab)
+        psmi_free(eps->eps_tab);
+    memset(eps, 0, sizeof(*eps));
+    return PSM_OK;
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'commidx'.
+ */
+psm_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ptl_epaddr *ipsaddr,
+                ips_epstate_idx *commidx_o)
+{
+    int i, j;
+    ips_epstate_idx commidx;
+    uint16_t lmc_mask = ~((1 << ipsaddr->proto->epinfo.ep_lmc) - 1);
+
+    if (++eps->eps_tabsizeused > eps->eps_tabsize) { /* realloc */
+        struct ips_epstate_entry *newtab;
+        eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+        newtab = (struct ips_epstate_entry *)
+            psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT, eps->eps_tabsize,
+                        sizeof(struct ips_epstate_entry));
+        if (newtab == NULL)
+            return PSM_NO_MEMORY;
+        else if (eps->eps_tab) { /* NOT first alloc */
+            for (i = 0; i < eps->eps_tabsize-PTL_EPADDR_ALLOC_CHUNK; i++)
+                newtab[i] = eps->eps_tab[i]; /* copy over the old entries */
+            psmi_free(eps->eps_tab);
+        }
+        eps->eps_tab = newtab;
+    }
+    /* Find the next free hole. We can afford to do this since connect is not
+     * in the critical path */
+    for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+        if (j == eps->eps_tabsize)
+            j = 0;
+        if (eps->eps_tab[j].epid == 0) {
+            eps->eps_tab_nextidx = j + 1;
+            if (eps->eps_tab_nextidx == eps->eps_tabsize)
+                eps->eps_tab_nextidx = 0;
+            break;
+        }
+    }
+    psmi_assert_always(i != eps->eps_tabsize);
+    commidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    _IPATH_VDBG("node %s gets commidx=%d (table idx %d)\n",
+                psmi_epaddr_get_name(ipsaddr->epaddr->epid), commidx, j);
+    eps->eps_tab[j].epid =
+        PSMI_EPID_PACK(ipsaddr->epr.epr_base_lid & lmc_mask,
+                       ipsaddr->epr.epr_context,
+                       ipsaddr->epr.epr_subcontext);
+    eps->eps_tab[j].ipsaddr = ipsaddr;
+    if (j >= IPS_EPSTATE_COMMIDX_MAX) {
+        return psmi_handle_error(eps->context->ep, PSM_TOO_MANY_ENDPOINTS,
+            "Can't connect to more than %d non-local endpoints",
+            IPS_EPSTATE_COMMIDX_MAX);
+    }
+    *commidx_o = commidx;
+    return PSM_OK;
+}
+
+psm_error_t
+ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx commidx)
+{
+    ips_epstate_idx idx;
+    /* actual table index */
+    idx = (commidx + eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    psmi_assert_always(idx < eps->eps_tabsize);
+    _IPATH_VDBG("commidx=%d, table_idx=%d\n", commidx, idx);
+    eps->eps_tab[idx].epid = 0;
+    eps->eps_tab[idx].ipsaddr = NULL;
+    /* We may eventually want to release memory, but probably not */
+    eps->eps_tabsizeused--;
+    return PSM_OK;
+}
+
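The table above hands each peer a communication index that is the local table slot offset by a boot-time pseudo-random base (eps_base_idx), modulo IPS_EPSTATE_COMMIDX_MAX; ips_epstate_lookup() in the header below undoes the offset. A tiny round-trip check of that arithmetic (added for illustration; the base and slot values are invented):

    #include <assert.h>
    #include <stdint.h>

    #define COMMIDX_MAX (1 << 20)

    int main(void)
    {
        uint32_t base = 0xABCDE & (COMMIDX_MAX - 1); /* stand-in for eps_base_idx */
        uint32_t slot = 42;                          /* table index chosen locally */
        uint32_t wire = (slot - base) & (COMMIDX_MAX - 1);   /* sent to the peer */
        assert(((wire + base) & (COMMIDX_MAX - 1)) == slot); /* lookup recovers it */
        return 0;
    }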
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000..b6aca57
--- /dev/null
+++ b/ptl_ips/ips_epstate.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_COMMIDX_MAX (1<<20)
+#define IPS_EPSTATE_COMMIDX_MASK 0xF0000
+#define IPS_EPSTATE_COMMIDX_SHIFT 14
+#define IPS_EPSTATE_COMMIDX_PACK(ipscommidx) \
+    ((ipscommidx & IPS_EPSTATE_COMMIDX_MASK) \
+     >> IPS_EPSTATE_COMMIDX_SHIFT)
+
+struct ptl_epaddr;
+
+struct ips_epstate_entry {
+    uint64_t epid;
+    struct ptl_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+    const psmi_context_t *context;
+    ips_epstate_idx eps_base_idx;
+    int eps_tabsize;
+    int eps_tabsizeused;
+    int eps_tab_nextidx;
+
+    struct ips_epstate_entry *eps_tab;
+};
+
+psm_error_t ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context);
+psm_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm_error_t ips_epstate_add(struct ips_epstate *eps,
+                            struct ptl_epaddr *ipsaddr,
+                            ips_epstate_idx *commidx);
+psm_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx commidx);
+
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+    idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    if (idx < eps->eps_tabsize)
+        return &eps->eps_tab[idx];
+    else
+        return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/ptl_ips/ips_expected_proto.h b/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000..f45c687
--- /dev/null
+++ b/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +/* + * Control and state structure for one instance of the expected protocol. The + * protocol depends on some upcalls from internal portions of the receive + * protocol (such as opcodes dedicated for expected protocol handling) + */ + +/* Generate an expected header every 16 packets */ +#define PSM_DEFAULT_EXPECTED_HEADER 16 + +struct ips_protoexp { + const struct ptl *ptl; + struct ips_proto *proto; + struct psmi_timer_ctrl *timerq; + struct ips_tid tidc; + struct ips_tfctrl tfctrl; + + unsigned int tidflow_seed; + ptl_epaddr_flow_t tid_ep_flow; + uint32_t tid_flags; + psm_transfer_type_t tid_xfer_type; + struct ips_scbctrl tid_scbc_rv; + mpool_t tid_desc_send_pool; + mpool_t tid_desc_recv_pool; + mpool_t tid_getreq_pool; + mpool_t tid_sreq_pool; /* backptr into proto->ep->mq */ + mpool_t tid_rreq_pool; /* backptr into proto->ep->mq */ + uint32_t tid_send_fragsize; + uint32_t tid_page_offset_mask; + uint64_t tid_page_mask; + uint64_t tid_to_cyc_min; + uint64_t tid_to_cyc_max; + uint32_t tid_to_intr; + uint32_t tid_min_expsend_cnt; + uint32_t hdr_pkt_interval; + struct ips_tidinfo *tid_info; + + STAILQ_HEAD(ips_tid_send_pend, /* pending exp. sends */ + ips_tid_send_desc) pend_sendq; + struct psmi_timer timer_send; + + STAILQ_HEAD(ips_tid_get_pend, + ips_tid_get_request) pend_getreqsq; /* pending tid reqs */ + struct psmi_timer timer_getreqs; + + /* stats */ + uint64_t tid_grant_resends; + uint64_t tid_release_resends; + uint64_t tid_intr_reqs; +}; + +/* + * TID member list format used in communication. The receiver associates + * physical pages to tids and communicates a list of tid,offset,length for + * each registered page. + * + * This format is currently the only one we support, although it is not as + * compact as we would like and other formats are planned in the near future + */ +#define IPS_TID_SESSTYPE_MEMBER_LIST 1 + +typedef struct { + uint16_t tid; + uint16_t offset; + uint16_t length; +} +ips_tid_session_member; + +typedef struct { + uint16_t tsess_type; + uint16_t tsess_tidcount; + uint16_t tsess_tidlist_length; + uint16_t tsess_unaligned_start; + uint16_t tsess_unaligned_end; + + ptl_arg_t tsess_descid; + uint32_t tsess_seqno; + uint32_t tsess_srcoff; + uint32_t tsess_length; + + ips_tid_session_member tsess_list[0]; /* must be last in struct */ +} +ips_tid_session_list; + +/* + * Send-side expected send descriptors. + * + * Descriptors are allocated when tid grant requests are received (the 'target' + * side of an RDMA get request). Descriptors are added to a pending queue of + * expected sends and processed one at a time (scb's are requested and messages + * sent until all fragments of the descriptor's length are put on the wire). 
+ * + */ +#define TIDSENDC_SDMA_VEC_DEFAULT 260 + +struct ips_tid_send_desc { + struct ips_protoexp *protoexp; + STAILQ_ENTRY(ips_tid_send_desc) next; + + /* Filled in at allocation time */ + ptl_arg_t descid; + uint32_t length; + ips_epaddr_t *ipsaddr; + psm_mq_req_t mqreq; + struct ips_flow tidflow; + + uint32_t ctrl_msg_queued; /* bitmap of queued control messages for flow */ + uint32_t completion_counter; + + /* Iterated during send progress */ + void *buffer; + void *bounce_buf; + int tid_idx; + int is_complete; + uint32_t remaining_bytes; + uint32_t remaining_bytes_in_page; + uint32_t frame_send; + uint32_t offset; + uint32_t iovec_cntr_last; + uint32_t release_cnt; + uint32_t unaligned_sent; + uint32_t pad; + + psmi_timer timer_tidrelease; + + union { + ips_tid_session_list tid_list; + uint8_t filler[2096]; + }; +}; + +#define TIDRECVC_STATE_FREE 0 +#define TIDRECVC_STATE_GRANT 1 +#define TIDRECVC_STATE_GRANT_ACK 2 +#define TIDRECVC_STATE_DONE 3 + +struct ips_expected_recv_stats { + uint32_t nSeqErr; + uint32_t nGenErr; + uint32_t nReXmit; + uint32_t nErrChkReceived; +}; + +struct ips_tid_recv_desc { + const psmi_context_t *context; + struct ips_protoexp *protoexp; + ips_epaddr_t *ipsaddr; + STAILQ_ENTRY(ips_tid_recv_desc) next; + + /* desc id held in tid_list below */ + void *buffer; + uint32_t num_recv_hdrs; + uint32_t recv_msglen; + uint32_t grant_cnt; + uint32_t state; + uint32_t cksum; + uint16_t recv_framecnt; + uint16_t flags; + + /* TF protocol state (recv) */ + uint32_t tidflow_idx; + uint32_t tidflow_active_gen; + + psmi_seqnum_t tidflow_genseq; + uint16_t tidflow_nswap_gen; + uint16_t pad; + + uint32_t ctrl_msg_queued; /* bitmap of queued control messages for */ + struct ips_expected_recv_stats stats; + + struct ips_tid_get_request *getreq; + psmi_timer timer_tidreq; + + ips_tidmap_t ts_map; + union { + ips_tid_session_list tid_list; + uint8_t filler[2096]; + }; +}; + +/* + * Get requests, issued by MQ when there's a match on a large message. Unlike + * an RDMA get, the initiator identifies the location of the data at the target + * using a 'send token' instead of a virtual address. This, of course, assumes + * that the target has already registered the token and communicated it to the + * initiator beforehand (it actually sends the token as part of the initial + * MQ message that contains the MQ tag). + * + * The operation is semantically a two-sided RDMA get. + */ +struct ips_tid_get_request { + STAILQ_ENTRY(ips_tid_get_request) tidgr_next; + struct ips_protoexp *tidgr_protoexp; + psm_epaddr_t tidgr_epaddr; + + void *tidgr_lbuf; + uint32_t tidgr_length; + uint32_t tidgr_rndv_winsz; + uint32_t tidgr_sendtoken; + ips_tid_completion_callback_t tidgr_callback; + void *tidgr_ucontext; + + uint32_t tidgr_offset; /* offset in bytes */ + uint32_t tidgr_bytesdone; + uint32_t tidgr_desc_seqno; + uint32_t tidgr_flags; +}; + +/* + * For debug and/or other reasons, we can log the state of each tid and + * optionally associate it to a particular receive descriptor + */ + +#define TIDSTATE_FREE 0 +#define TIDSTATE_USED 1 + +struct ips_tidinfo { + uint16_t tid; + uint16_t state; + struct ips_tid_recv_desc *tidrecvc; +}; + +/* + * Descriptor limits, structure contents of struct psmi_rlimit_mpool for + * normal, min and large configurations. 
+ */ +#define TID_SENDSESSIONS_LIMITS { \ + .env = "PSM_TID_SENDSESSIONS_MAX", \ + .descr = "Tid max send session descriptors", \ + .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ + .minval = 1, \ + .maxval = 1<<30, \ + .mode[PSMI_MEMMODE_NORMAL] = { 256, 4096 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 512, 8192 } \ + } + +#define TID_RECVSESSIONS_LIMITS { \ + .env = "PSM_TID_RECVSESSIONS_MAX", \ + .descr = "Tid max receive session descriptors", \ + .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ + .minval = 1, \ + .maxval = 512, \ + .mode[PSMI_MEMMODE_NORMAL] = { 32, 512 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \ + } diff --git a/ptl_ips/ips_opp_path_rec.c b/ptl_ips/ips_opp_path_rec.c new file mode 100644 index 0000000..affe5da --- /dev/null +++ b/ptl_ips/ips_opp_path_rec.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include + +#define DF_OPP_LIBRARY "libofedplus.so" +#define DATA_VFABRIC_OFFSET 8 + +/* SLID and DLID are in network byte order */ +static psm_error_t +ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, + uint16_t slid, uint16_t dlid, uint16_t desthca_type, + ips_path_rec_t **path_rec) +{ + psm_error_t err = PSM_OK; + ibta_path_rec_t query; + ips_opp_path_rec_t *opp_path_rec; + int opp_err; + ENTRY elid, *epath = NULL; + char eplid[128]; + uint64_t timeout_ack_ms; + + /* Query path record query cache first */ + bzero(&query, sizeof(query)); + bzero(eplid, sizeof(eplid)); + + /* Bulk service ID is control service id + 1 */ + switch(type) { + case IPS_PATH_NORMAL_PRIORITY: + case IPS_PATH_LOW_PRIORITY: + query.service_id = + __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET); + break; + case IPS_PATH_HIGH_PRIORITY: + default: + query.service_id = __cpu_to_be64(proto->ep->service_id); + } + + query.slid = slid; + query.dlid = dlid; + + snprintf(eplid, sizeof(eplid), "%s_%x_%x", (type == IPS_PATH_HIGH_PRIORITY) ? "HIGH" : "LOW", query.slid,query.dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); + + if (!epath) { /* Unable to find path record in cache */ + elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + opp_path_rec = (ips_opp_path_rec_t*) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_opp_path_rec_t)); + if (!elid.key || !opp_path_rec) { + if (elid.key) psmi_free(elid.key); + if (opp_path_rec) psmi_free(opp_path_rec); + err = PSM_NO_MEMORY; + goto fail; + } + + /* Get path record between local LID and remote */ + opp_err = proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt, &query, + &opp_path_rec->opp_response); + if (opp_err) { + psmi_free(opp_path_rec); + psmi_free(elid.key); + err = PSM_EPID_PATH_RESOLUTION; + goto fail; + } + + /* Create path record */ + opp_path_rec->ips.epr_slid = opp_path_rec->opp_response.slid; + opp_path_rec->ips.epr_dlid = opp_path_rec->opp_response.dlid; + opp_path_rec->ips.epr_mtu = + min(ibta_mtu_enum_to_int(opp_path_rec->opp_response.mtu & 0x3f), + proto->epinfo.ep_mtu); + opp_path_rec->ips.epr_pkey = ntohs(opp_path_rec->opp_response.pkey); + opp_path_rec->ips.epr_sl = ntohs(opp_path_rec->opp_response.qos_class_sl); + opp_path_rec->ips.epr_static_rate = opp_path_rec->opp_response.rate & 0x3f; + opp_path_rec->ips.epr_static_ipd = + proto->ips_ipd_delay[opp_path_rec->ips.epr_static_rate]; + + /* Setup CCA parameters for path */ + if (opp_path_rec->ips.epr_sl > 15) { + psmi_free(opp_path_rec); + psmi_free(elid.key); + err = PSM_INTERNAL_ERR; + goto fail; + } + if (!(proto->ccti_ctrlmap&(1<ips.epr_sl))) { + _IPATH_CCADBG("No CCA for sl %d, disable CCA\n", + opp_path_rec->ips.epr_sl); + proto->flags &= ~IPS_PROTO_FLAG_CCA; + } + opp_path_rec->ips.proto = proto; + opp_path_rec->ips.epr_ccti_min = proto->cace[opp_path_rec->ips.epr_sl].ccti_min; + opp_path_rec->ips.epr_ccti = opp_path_rec->ips.epr_ccti_min; + psmi_timer_entry_init(&opp_path_rec->ips.epr_timer_cca, + ips_cca_timer_callback, &opp_path_rec->ips); + + /* Determine active IPD for path. 
+static psm_error_t
+ips_opp_path_rec(struct ips_proto *proto,
+                 uint16_t slid, uint16_t dlid, uint16_t desthca_type,
+                 unsigned long timeout,
+                 ips_epaddr_t *ipsaddr)
+{
+    psm_error_t err = PSM_OK;
+    uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
+    ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
+    ips_path_rec_t *path;
+    uint16_t path_slid, path_dlid;
+    psmi_context_t *context = &proto->ep->context;
+
+    /*
+     * High Priority Path
+     * ------------------
+     *
+     * Uses the "base" Service ID. For now there exists only 1 high priority
+     * path between nodes, even for non zero LMC fabrics.
+     * TODO: Investigate if there are any benefits for using multiple high
+     * priority paths. Initial empirical data shows that this leads to worse
+     * performance as the bulk data can induce HOL blocking.
+     * Currently the normal and low priority paths are the same, but at some
+     * point we can create separate vFabrics to further distinguish/isolate
+     * those traffic flows.
+     *
+     * Normal/Low Priority Paths
+     * -------------------------
+     *
+     * Currently these paths are the same, i.e. they are queried for the same
+     * Service ID/vFabric, which is the base (high priority) Service ID plus
+     * DATA_VFABRIC_OFFSET.
+     *
+     * Use case Scenarios
+     * ------------------
+     *
+     * Since with vFabrics we have the capability to define different QoS
+     * parameters per vFabric, it is envisioned that IPS_PATH_HIGH_PRIORITY is
+     * set up in a separate vFabric for high priority traffic.
+     * The NORMAL paths are set up in a separate vFabric optimized for high
+     * bandwidth. This allows us to potentially have control traffic (RTS,
+     * CTS etc.) not be bottlenecked by bulk transfer data. All control
+     * messages (ACKs, NAKs, TID_GRANT etc.) also use the high priority
+     * control vFabric.
+     *
+     * NOTE: In order to distinguish between the different vFabrics the user
+     * specifies the service ID to use via mpirun (or an environment
+     * variable). This is the service ID for the high priority control
+     * traffic. The bulk data vFabric is identified by that service ID plus
+     * DATA_VFABRIC_OFFSET. So for each MPI application one should specify
+     * two service IDs, for the high priority and the bulk data. Both these
+     * service IDs can be placed in the same vFabric, which can be configured
+     * for high priority or bandwidth traffic, giving us the default behavior
+     * up to the InfiniPath 2.5 release.
+     *
+     * NOTE: All of the above would have really helped if the S20 silicon
+     * could correctly support IBTA QoS features. Due to the S20 design we
+     * can only have a high priority VLarb table (a low priority VLarb table
+     * results in round robin arbitration, ignoring the weights!). But if
+     * this is fixed in a subsequent chip respin then this may potentially
+     * help our scalability on large fabrics.
+     *
+     * Mesh/Torus and DOR routed networks
+     * ----------------------------------
+     *
+     * In a mesh/torus fabric we always have a non zero LMC (at least 1, and
+     * it can be more). We would like to take advantage of dispersive routing
+     * on these fabrics as well to obtain better "worst case/congested"
+     * bandwidth. For these networks the base LIDs are currently used for
+     * UPDN routing, which is suboptimal. Higher order LIDs (+1 .. +N) use
+     * DOR routing (Dimension Ordered Routing) to avoid deadlocks and provide
+     * higher performance. If a fabric is disrupted then only the base UPDN
+     * routing is available. PSM should continue to operate in this
+     * environment, albeit with degraded performance. In a disrupted fabric
+     * the OPP path record queries may fail for some DOR routed LIDs (i.e. no
+     * path exists); PSM should hence ignore path record failures, as they
+     * indicate a disrupted fabric, and only use the valid paths that are
+     * returned from the replica. This will degenerate to using only the UPDN
+     * paths on disrupted fabrics and the DOR routes on fully configured
+     * fabrics. Note: For a clean fabric the base LIDs that are configured
+     * for the UPDN route will not exist in the replica, as DOR routes are
+     * preferred. Hence we will dispersively route across the DOR routes,
+     * falling back to the UPDN route only for disrupted fabrics.
+     *
+     * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION
+     * CAN TAKE PLACE.
+     */
+
+    /* If base lids are only used then reset num_path to 1 */
+    if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+        num_path = 1;
+
+    ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY] =
+        ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY] =
+        ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY] = 0;
+
+    /* For now there is always only one high priority path between nodes. */
+    for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
+        path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+        path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+        err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
+                                   path_slid, path_dlid,
+                                   desthca_type, &path);
+
+        if (err == PSM_OK) { /* Valid high priority path found */
+            /* Resolved high priority path successfully */
+            ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]++;
+            ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][cpath] = path;
+
+            /* Increment current path index */
+            cpath++;
+        }
+    }
+
+    /* Make sure we have at least 1 high priority path */
+    if (ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
+        err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION,
+            "OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"PRIx64" defined?",
+            ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id);
+        goto fail;
+    }
+
+    /* Next setup the bulk paths. If the subnet administrator has misconfigured
+     * or rather not configured two separate service IDs we place the bulk
+     * paths in the same vFabric as the control paths.
+     */
+    for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+        path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+        path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+    retry_path_res:
+        err = ips_opp_get_path_rec(path_type, proto,
+                                   path_slid, path_dlid, desthca_type,
+                                   &path);
+        if (err != PSM_OK) {
+            if (path_type == IPS_PATH_NORMAL_PRIORITY) {
+                /* Subnet may only be configured for one service ID/vFabric. Default
+                 * to using the control vFabric/service ID for bulk data as well.
+                 */
+                path_type = IPS_PATH_HIGH_PRIORITY;
+                goto retry_path_res;
+            }
+
+            /* Unable to resolve path for <path_slid, path_dlid>. This is
+             * possible for disrupted fabrics using DOR routing so continue
+             * to acquire paths */
+            err = PSM_OK;
+            continue;
+        }
+
+        /* Valid path. For now both normal and low priority paths are the same */
+        ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][cpath] = path;
+        ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][cpath] = path;
+        ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
+        ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY]++;
+        cpath++;
+    }
+
+    /* Make sure we have at least a single bulk data transfer path */
+    if ((ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) ||
+        (ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY] == 0)) {
+        err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION,
+            "OFED Plus path lookup failed. Unable to resolve normal/low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"PRIx64" defined?",
+            ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id);
+        goto fail;
+    }
+
+    if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+        ipsaddr->epr.epr_hpp_index = 0;
+        ipsaddr->epr.epr_next_path[IPS_PATH_NORMAL_PRIORITY] =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY];
+        ipsaddr->epr.epr_next_path[IPS_PATH_LOW_PRIORITY] =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY];
+    }
+    else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+        ipsaddr->epr.epr_hpp_index =
+            ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY];
+    else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+        ipsaddr->epr.epr_hpp_index =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY];
+    else /* Base LID */
+        ipsaddr->epr.epr_hpp_index = 0;
+
+ fail:
+    if (err != PSM_OK)
+        _IPATH_PRDBG("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n", slid, dlid);
+    return err;
+}
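A compact model of the four selection policies seeded above (illustrative only: the per-message advance of `epr_next_path` under the adaptive policy happens elsewhere in the send path, so the rotation step shown is an assumption about intent, not a copy of that code):

    #include <stdint.h>

    enum policy { ADAPTIVE, STATIC_SRC, STATIC_DST, STATIC_BASE };

    /* pick a path index for the next bulk message */
    static uint32_t pick_path(enum policy p, uint32_t *next,
                              uint32_t my_context, uint32_t peer_context,
                              uint32_t num_paths)
    {
        switch (p) {
        case ADAPTIVE:    return (*next)++ % num_paths;    /* rotate over paths */
        case STATIC_SRC:  return my_context % num_paths;   /* fixed, sender-derived */
        case STATIC_DST:  return peer_context % num_paths; /* fixed, receiver-derived */
        case STATIC_BASE: default: return 0;               /* always the base LID pair */
        }
    }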
+static psm_error_t ips_opp_fini(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+
+    if (proto->opp_lib)
+        dlclose(proto->opp_lib);
+
+    return err;
+}
+
+psm_error_t ips_opp_init(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+    struct ipath_base_info *base_info = &proto->ep->context.base_info;
+    char hcaName[32];
+
+    proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+    if (!proto->opp_lib) {
+        char *err = dlerror();
+        _IPATH_ERROR("Unable to open OFED Plus Plus library %s. Error: %s\n",
+                     DF_OPP_LIBRARY, err ? err : "no dlerror()");
+        goto fail;
+    }
+
+    /* Resolve symbols that we require within opp library */
+    proto->opp_fn.op_path_find_hca = dlsym(proto->opp_lib, "op_path_find_hca");
+    proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+    proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+    proto->opp_fn.op_path_get_path_by_rec = dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+    /* If we can't resolve any symbol then fail to load the opp module */
+    if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+        !proto->opp_fn.op_path_close || !proto->opp_fn.op_path_get_path_by_rec) {
+        _IPATH_PRDBG("Unable to resolve symbols in OPP library. Unloading.\n");
+        goto fail;
+    }
+
+    /* If PSM_IDENTIFY is set, display the OPP library location being used. */
+    if (getenv("PSM_IDENTIFY")) {
+        Dl_info info_opp;
+        _IPATH_INFO("PSM path record queries using OFED Plus Plus (%s) from %s\n",
+                    DF_OPP_LIBRARY,
+                    dladdr(proto->opp_fn.op_path_open, &info_opp) ? info_opp.dli_fname :
+                    "Unknown/unsupported version of OPP library found!");
+    }
+
+    /* Obtain handle to hca (requires verbs on node) */
+    snprintf(hcaName, sizeof(hcaName), "qib%d", base_info->spi_unit);
+    proto->hndl = proto->opp_fn.op_path_find_hca(hcaName, &proto->device);
+    if (!proto->hndl) {
+        _IPATH_ERROR("OPP: Unable to find HCA %s. Disabling OPP interface for path record queries.\n", hcaName);
+        goto fail;
+    }
+
+    /* Get OPP context */
+    proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, base_info->spi_port);
+    if (!proto->opp_ctxt) {
+        _IPATH_ERROR("OPP: Unable to obtain OPP context. 
Disabling OPP interface for path record queries.\n"); + goto fail; + } + + /* OPP initialized successfully */ + proto->ibta.get_path_rec = ips_opp_path_rec; + proto->ibta.fini = ips_opp_fini; + proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC; + + return err; + + fail: + _IPATH_ERROR("Make sure SM is running...\n"); + _IPATH_ERROR("Make sure service dist_sa is running...\n"); + _IPATH_ERROR("to start dist_sa: service dist_sa start\n"); + _IPATH_ERROR("or enable it at boot time: iba_config -E dist_sa\n\n"); + + err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION, + "Unable to initialize OFED Plus library successfully.\n"); + + if (proto->opp_lib) + dlclose(proto->opp_lib); + + return err; +} + diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c new file mode 100644 index 0000000..be3b41c --- /dev/null +++ b/ptl_ips/ips_path_rec.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <stdio.h>	/* snprintf */
+#include <string.h>	/* strcpy, strerror */
+#include <search.h>	/* hsearch_r, ENTRY */
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+static void
+ips_gen_ipd_table(struct ips_proto *proto)
+{
+    /* Based on our current link rate setup the IPD table */
+    switch(proto->epinfo.ep_link_rate) {
+    case IBTA_RATE_10_GBPS:
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 3;
+        break;
+    case IBTA_RATE_20_GBPS:
+        proto->ips_ipd_delay[IBTA_RATE_20_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 3;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 7;
+        break;
+    case IBTA_RATE_40_GBPS:
+    default:
+        proto->ips_ipd_delay[IBTA_RATE_40_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_30_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_20_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 3;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 7;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 15;
+        break;
+    }
+}
+
+static psm_error_t
+ips_gen_cct_table(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+    uint32_t cca_divisor, ipdidx, ipdval = 1;
+    uint16_t *cct_table;
+
+    /* The CCT table is static currently. If it's already created then return */
+    if (proto->cct)
+        goto fail;
+
+    /* Allocate the CCT table */
+    cct_table = psmi_calloc(proto->ep, UNDEFINED,
+                            proto->ccti_size, sizeof(uint16_t));
+    if (!cct_table) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+
+    /* The first table entry is always 0 i.e. no IPD delay */
+    cct_table[0] = 0;
+
+    /* Generate the remaining CCT table entries */
+    for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++)
+        for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) {
+            if ((ipdidx+cca_divisor) == proto->ccti_size) break;
+            cct_table[ipdidx+cca_divisor] =
+                (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) | (ipdval & 0x3FFF));
+            _IPATH_VDBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n",
+                        ipdidx+cca_divisor, cct_table[ipdidx+cca_divisor],
+                        (cct_table[ipdidx+cca_divisor] >> CCA_DIVISOR_SHIFT),
+                        cct_table[ipdidx+cca_divisor] & CCA_IPD_MASK);
+        }
+
+    /* On link up/down the CCT is re-generated. If a CCT table was previously
+     * created, free it.
+     */
+    if (proto->cct) {
+        psmi_free(proto->cct);
+        proto->cct = NULL;
+    }
+
+    /* Update to the new CCT table */
+    proto->cct = cct_table;
+
+ fail:
+    return err;
+}
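Each 16-bit CCT entry generated above packs a shift divisor into the top two bits (stored inverted) and an inter-packet delay value into the low 14 bits. A small encode/decode check (added for illustration; the constants are assumed to match CCA_DIVISOR_SHIFT == 14 and CCA_IPD_MASK == 0x3FFF, which the packing expression implies):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const unsigned SHIFT = 14;          /* assumed CCA_DIVISOR_SHIFT */
        const uint16_t IPD_MASK = 0x3FFF;   /* assumed CCA_IPD_MASK */

        /* encode as in ips_gen_cct_table(): divisor 1, ipd value 5 */
        uint16_t entry = (uint16_t)(((1u ^ 0x3) << SHIFT) | (5u & IPD_MASK));

        /* decode the way the path-record code does */
        assert((entry >> SHIFT) == 2);      /* divisor stored inverted: 1^3 == 2 */
        assert((entry & IPD_MASK) == 5);    /* inter-packet delay value */
        return 0;
    }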
+static ibta_rate
+ips_default_hca_rate(uint16_t hca_type)
+{
+    ibta_rate rate = IBTA_RATE_40_GBPS;
+
+    switch(hca_type){
+    case PSMI_HCA_TYPE_QLE73XX:
+        rate = IBTA_RATE_40_GBPS;
+        break;
+    case PSMI_HCA_TYPE_QLE72XX:
+        rate = IBTA_RATE_20_GBPS;
+        break;
+    case PSMI_HCA_TYPE_QLE71XX:
+        rate = IBTA_RATE_10_GBPS;
+        break;
+    }
+
+    return rate;
+}
+
+static ibta_rate
+ips_rate_to_enum(int link_rate)
+{
+    ibta_rate rate;
+
+    switch(link_rate) {
+    case 40:
+        rate = IBTA_RATE_40_GBPS;
+        break;
+    case 20:
+        rate = IBTA_RATE_20_GBPS;
+        break;
+    case 10:
+        rate = IBTA_RATE_10_GBPS;
+        break;
+    case 5:
+        rate = IBTA_RATE_5_GBPS;
+        break;
+    case 2:
+        rate = IBTA_RATE_2_5_GBPS;
+        break;
+    default:
+        rate = IBTA_RATE_PORT_CURRENT;
+    }
+
+    return rate;
+}
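One reading of the IPD table in ips_gen_ipd_table() above, consistent with the IBTA static-rate convention but not spelled out in the source: a delay of N throttles transmission to roughly 1/(N+1) of the local link rate, so a faster sender does not overrun a slower peer. A quick arithmetic check of the 40 Gbps row under that assumption:

    #include <assert.h>

    int main(void)
    {
        int local_gbps = 40;
        int delay_for_10g_peer = 3;            /* 40 Gbps table row, 10 Gbps peer */
        assert(local_gbps / (delay_for_10g_peer + 1) == 10);
        int delay_for_5g_peer = 7;             /* same row, 5 Gbps peer */
        assert(local_gbps / (delay_for_5g_peer + 1) == 5);
        return 0;
    }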
+static psm_error_t
+ips_none_get_path_rec(struct ips_proto *proto,
+                      uint16_t slid, uint16_t dlid, uint16_t desthca_type,
+                      unsigned long timeout, ips_path_rec_t **prec)
+{
+    psm_error_t err = PSM_OK;
+    ENTRY elid, *epath = NULL;
+    char eplid[128];
+    ips_path_rec_t *path_rec;
+
+    /* Query the path record cache */
+    snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+    elid.key = eplid;
+    hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+    if (!epath) {
+        elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+        path_rec = (ips_path_rec_t*)
+            psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_rec_t));
+        if (!elid.key || !path_rec) {
+            if (elid.key) psmi_free(elid.key);
+            if (path_rec) psmi_free(path_rec);
+            return PSM_NO_MEMORY;
+        }
+
+        /* Create path record */
+        path_rec->epr_slid = slid;
+        path_rec->epr_dlid = dlid;
+        path_rec->epr_mtu = proto->epinfo.ep_mtu;
+        path_rec->epr_pkey = proto->epinfo.ep_pkey;
+        path_rec->epr_sl = proto->epinfo.ep_sl;
+
+        /* Determine the IPD based on our local link rate and the default link
+         * rate for the remote hca type.
+         */
+        path_rec->epr_static_rate =
+            ips_default_hca_rate(desthca_type);
+        path_rec->epr_static_ipd =
+            proto->ips_ipd_delay[path_rec->epr_static_rate];
+
+        /* Setup CCA parameters for path */
+        if (path_rec->epr_sl > 15) {
+            psmi_free(elid.key);
+            psmi_free(path_rec);
+            return PSM_INTERNAL_ERR;
+        }
+        if (!(proto->ccti_ctrlmap&(1<<path_rec->epr_sl))) {
+            _IPATH_CCADBG("No CCA for sl %d, disable CCA\n", path_rec->epr_sl);
+            proto->flags &= ~IPS_PROTO_FLAG_CCA;
+        }
+        path_rec->proto = proto;
+        path_rec->epr_ccti_min = proto->cace[path_rec->epr_sl].ccti_min;
+        path_rec->epr_ccti = path_rec->epr_ccti_min;
+        psmi_timer_entry_init(&path_rec->epr_timer_cca,
+                              ips_cca_timer_callback, path_rec);
+
+        /* Determine active IPD for path. It is the max of the static rate
+         * and the CCT table entry. */
+        if ((path_rec->epr_static_ipd) &&
+            ((path_rec->epr_static_ipd + 1) >
+             (proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK))) {
+            path_rec->epr_active_ipd = path_rec->epr_static_ipd + 1;
+            path_rec->epr_cca_divisor = 0;
+        }
+        else {
+            /* Pick it from the CCT table */
+            path_rec->epr_active_ipd = proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK;
+            path_rec->epr_cca_divisor =
+                proto->cct[path_rec->epr_ccti] >> CCA_DIVISOR_SHIFT;
+        }
+
+        /* Setup default errorcheck timeout. */
+        path_rec->epr_timeout_ack =
+            proto->epinfo.ep_timeout_ack;
+        path_rec->epr_timeout_ack_max =
+            proto->epinfo.ep_timeout_ack_max;
+        path_rec->epr_timeout_ack_factor =
+            proto->epinfo.ep_timeout_ack_factor;
+
+        /* Add path record into cache */
+        strcpy(elid.key, eplid);
+        elid.data = (void*) path_rec;
+        hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+    }
+    else
+        path_rec = (ips_path_rec_t*) epath->data;
+
+    /* Return IPS path record */
+    *prec = path_rec;
+
+    return err;
+}
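Both path-record backends apply the same "active IPD" rule seen above: the slower of the peer's static rate and the congestion table wins. A worked instance of that comparison (values invented for illustration): a static IPD of 7 beats a CCT entry whose IPD field is 3, so the static rate dominates and the CCA divisor is forced to 0.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t static_ipd = 7;      /* from the IPD table, illustrative */
        uint16_t cct_ipd = 3;         /* low 14 bits of a CCT entry, illustrative */
        uint16_t active_ipd, divisor;

        if (static_ipd && (uint16_t)(static_ipd + 1) > cct_ipd) {
            active_ipd = static_ipd + 1;  /* static rate dominates */
            divisor = 0;                  /* static rate has no CCA divisor */
        } else {
            active_ipd = cct_ipd;         /* congestion table dominates */
            divisor = 2;                  /* would come from the entry's top bits */
        }
        assert(active_ipd == 8 && divisor == 0);
        return 0;
    }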
To get better load + * balance + */ + for (pidx = 0; pidx < num_path; pidx++) { + ips_path_rec_t *path; + + base_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + base_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + + err = ips_none_get_path_rec(proto, base_slid, base_dlid, desthca_type, + timeout, &path); + if (err != PSM_OK) + goto fail; + + if (num_path > 1) { + if (pidx == 0) { + /* First path is always the high priority path */ + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0] = path; + } + else { + ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][pidx-1] = path; + ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][pidx-1] = path; + } + } + else { + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0] = path; + ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][0] = path; + ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][0] = path; + } + } + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + ipsaddr->epr.epr_hpp_index = 0; + ipsaddr->epr.epr_next_path[IPS_PATH_NORMAL_PRIORITY] = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY]; + ipsaddr->epr.epr_next_path[IPS_PATH_LOW_PRIORITY] = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY]; + } + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + ipsaddr->epr.epr_hpp_index = + ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + ipsaddr->epr.epr_hpp_index = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]; + else /* Base LID */ + ipsaddr->epr.epr_hpp_index = 0; + + fail: + if (err != PSM_OK) + _IPATH_PRDBG("Unable to get path record for LID %x <---> DLID %x.\n", slid, dlid); + return err; +} + +static psm_error_t ips_none_path_rec_init(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val psm_set_hca_pkey; + + /* Obtain the SL and PKEY to use from the environment (IPATH_SL & PSM_KEY) */ + proto->epinfo.ep_sl = psmi_epid_sl(proto->ep->epid); + proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey; + + /* + * Parse the err_chk settings from the environment. + * :: + */ + { + union psmi_envvar_val env_to; + char *errchk_to = PSM_TID_TIMEOUT_DEFAULT; + int tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; + + if (!psmi_getenv("PSM_ERRCHK_TIMEOUT", + "Errchk timeouts in mS ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) errchk_to, &env_to)) + { + /* Not using default values, parse what we can */ + errchk_to = env_to.e_str; + psmi_parse_str_tuples(errchk_to, 3, tvals); + /* Adjust for max smaller than min, things would break */ + if (tvals[1] < tvals[0]) + tvals[1] = tvals[0]; + } + proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); + proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]); + proto->epinfo.ep_timeout_ack_factor = tvals[2]; + } + + /* With no path records queries set pkey manually if PSM_SET_HCA_PKEY is + * set. 
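+ * (e.g. running with PSM_SET_HCA_PKEY=1 in the environment forces the
+ * ipath_set_pkey() write below; by default the pkey is left untouched.)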
*/ + psmi_getenv("PSM_SET_HCA_PKEY", + "Force write of PKey to HCA (default is disabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 0, &psm_set_hca_pkey); + + if (psm_set_hca_pkey.e_uint) { + if (ipath_set_pkey(proto->ep->context.ctrl, + (uint16_t) proto->ep->network_pkey) != 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Couldn't set device pkey %d: %s", + (int) proto->ep->network_pkey, + strerror(errno)); + goto fail; + } + } + + proto->ibta.get_path_rec = ips_none_path_rec; + proto->ibta.fini = NULL; + + fail: + return err; +} + +/* (Re)load the SL2VL table */ +psm_error_t ips_ibta_init_sl2vl_table(struct ips_proto *proto) +{ + int ret, sli; + + /* Get SL2VL table for unit, port */ + for (sli = 0; sli < 16; sli++) { + if ((ret = ipath_get_port_sl2vl(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + (uint8_t) sli)) < 0) { + /* Unable to get SL2VL. Set it to default */ + ret = PSMI_VL_DEFAULT; + } + + proto->sl2vl[sli] = ret; + } + + return PSM_OK; +} + +/* On link up/down we need to update some state */ +psm_error_t ips_ibta_link_updown_event(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + int ret; + + /* Get base lid, lmc and rate as these may have changed if the link bounced */ + proto->epinfo.ep_base_lid = + __cpu_to_be16((uint16_t) psm_epid_nid(proto->ep->context.epid)); + if ((ret = ipath_get_port_lmc(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port)) < 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Could not obtain LMC for unit %u:%d. Error: %s", + proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + strerror(errno)); + goto fail; + } + proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC); + + if ((ret = ipath_get_port_rate(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port)) < 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Could not obtain link rate for unit %u:%d. Error: %s", + proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + strerror(errno)); + goto fail; + } + proto->epinfo.ep_link_rate = ips_rate_to_enum(ret); + + /* Load the SL2VL table */ + ips_ibta_init_sl2vl_table(proto); + + /* Regenerate new IPD table for the updated link rate. */ + ips_gen_ipd_table(proto); + + /* Generate the CCT table. */ + err = ips_gen_cct_table(proto); + + fail: + return err; +} + +psm_error_t ips_ibta_init(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val psm_path_policy; + union psmi_envvar_val disable_cca; + + /* Get the path selection policy */ + psmi_getenv("PSM_PATH_SELECTION", + "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base.
Default is adaptive.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "adaptive", + &psm_path_policy); + + if (!strcasecmp((const char*) psm_path_policy.e_str, "adaptive")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_src")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_dest")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_base")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE; + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) + _IPATH_PRDBG("Using adaptive path selection.\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + _IPATH_PRDBG("Static path selection: Src Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + _IPATH_PRDBG("Static path selection: Dest Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + _IPATH_PRDBG("Static path selection: Base LID \n"); + + psmi_getenv("PSM_DISABLE_CCA", + "Disable use of Congestion Control Architecture (CCA) [enabled] ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &disable_cca); + if (disable_cca.e_uint) + _IPATH_CCADBG("CCA is disabled for congestion control.\n"); + else + proto->flags |= IPS_PROTO_FLAG_CCA; + + { + /* Get CCA related parameters from the environment */ + union psmi_envvar_val ccti_incr; + union psmi_envvar_val ccti_timer; + union psmi_envvar_val ccti_size; + int i; + char ccabuf[256]; + uint8_t *p; + +/* + * If the user set any of these environment variables, use self-configured CCA. + */ + if (getenv("PSM_CCTI_INCREMENT") || getenv("PSM_CCTI_TIMER") || getenv("PSM_CCTI_TABLE_SIZE")) { + goto selfcca; + } + +/* + * Check the qib driver CCA setting, and try to use it if available. + * Fall back to the self CCA setting on errors. + */ + i = ipath_get_cc_settings_bin(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, ccabuf); + if (i <= 0) { + goto selfcca; + } + p = (uint8_t *)ccabuf; + memcpy(&proto->ccti_portctrl, p, 2); p += 2; + memcpy(&proto->ccti_ctrlmap, p, 2); p += 2; + for (i=0; i<16; i++) { + proto->cace[i].ccti_increase = *p; p++; + memcpy(&proto->cace[i].ccti_timer_cycles, p, 2); p += 2; + proto->cace[i].ccti_timer_cycles = + us_2_cycles(proto->cace[i].ccti_timer_cycles); + proto->cace[i].ccti_threshold = *p; p++; + proto->cace[i].ccti_min = *p; p++; + } + + i = ipath_get_cc_table_bin(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, &proto->cct); + if (i < 0) { + err = PSM_NO_MEMORY; + goto fail; + } else if (i == 0) { + goto selfcca; + } + proto->ccti_limit = i; + proto->ccti_size = proto->ccti_limit + 1; + goto finishcca; + +/* + * Since there are no qib driver CCA settings, use the self-built CCA.
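+ * The self-built table is driven by the three PSM_CCTI_* variables read
+ * below, e.g. PSM_CCTI_TABLE_SIZE=128 PSM_CCTI_TIMER=1
+ * PSM_CCTI_INCREMENT=1 (an illustrative setting that matches the
+ * defaults).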
+ */ + selfcca: + psmi_getenv("PSM_CCTI_INCREMENT", + "IBTA_CCA: Index increment for CCT table on receipt of a BECN packet (less than table size, default 1)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, + &ccti_incr); + + psmi_getenv("PSM_CCTI_TIMER", + "IBTA_CCA: CCT table congestion timer (>0, default 1 us)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, + &ccti_timer); + + psmi_getenv("PSM_CCTI_TABLE_SIZE", + "IBTA_CCA: Number of entries in CCT table (multiple of 64, default 128)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) DF_CCT_TABLE_SIZE, //128 + &ccti_size); + + /* Check the invalid values. */ + if (ccti_size.e_uint < 64 || ccti_size.e_uint%64) { + _IPATH_INFO("Invalid PSM_CCTI_TABLE_SIZE=%d, at least 64 and multiple of 64, setting to default 128\n", + ccti_size.e_uint); + ccti_size.e_uint = 128; + } + proto->ccti_size = ccti_size.e_uint; + /* For now the CCT limit is same as table size. + * This does not have to be the case. */ + proto->ccti_limit = proto->ccti_size - 1; + + if (ccti_timer.e_uint <= 0) { + _IPATH_INFO("Invalid PSM_CCTI_TIMER=%d, should be bigger than 0, setting to default 1\n", + ccti_timer.e_uint); + ccti_timer.e_uint = 1; + } + if (ccti_incr.e_uint <= 0 || ccti_incr.e_uint >= ccti_size.e_uint) { + _IPATH_INFO("Invalid PSM_CCTI_INCREMENT=%d, should be less than table size, setting to default 1\n", + ccti_incr.e_uint); + ccti_incr.e_uint = 1; + } + + /* Setup CCA parameters for port */ + proto->ccti_portctrl = 1; /* SL/Port based congestion control */ + proto->ccti_ctrlmap = 0xFFFF; + for (i=0; i<16; i++) { + proto->cace[i].ccti_increase = ccti_incr.e_uint; + proto->cace[i].ccti_timer_cycles = us_2_cycles(ccti_timer.e_uint); + proto->cace[i].ccti_threshold = 8; + proto->cace[i].ccti_min = 0; + } + } + + finishcca: + /* Seed the random number generator with our pid */ + srand(getpid()); + + /* Initialize path record hash table */ + hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash); + + /* On startup treat it as a link up/down event to setup state . */ + if ((err = ips_ibta_link_updown_event(proto)) != PSM_OK) + goto fail; + + /* Setup the appropriate query interface for the endpoint */ + switch(proto->ep->path_res_type) { + case PSM_PATH_RES_OPP: + err = ips_opp_init(proto); + if (err != PSM_OK) + _IPATH_ERROR("Unable to use OFED Plus Plus for path record queries.\n"); + break; + case PSM_PATH_RES_UMAD: + _IPATH_ERROR("Path record queries using UMAD is not supported in PSM version %d.%dx\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR); + err = PSM_EPID_PATH_RESOLUTION; + break; + case PSM_PATH_RES_NONE: + default: + err = ips_none_path_rec_init(proto); + } + + fail: + return err; +} + +psm_error_t ips_ibta_fini(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + + if (proto->ibta.fini) + err = proto->ibta.fini(proto); + + /* Destroy the path record hash */ + hdestroy_r(&proto->ips_path_rec_hash); + + return err; +} diff --git a/ptl_ips/ips_path_rec.h b/ptl_ips/ips_path_rec.h new file mode 100644 index 0000000..5d43cac --- /dev/null +++ b/ptl_ips/ips_path_rec.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * 2009,2010. QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PATH_REC_H_ +#define _IPS_PATH_REC_H_ + +#include + +/* Default size of path record hash table */ +#define DF_PATH_REC_HASH_SIZE 2047 + +/* Default size of CCT table. Must be multiple of 64 */ +#define DF_CCT_TABLE_SIZE 128 + +/* CCT max IPD delay. QLE73XX is limited to 32us */ +#define DF_CCT_MAX_IPD_DELAY_US 21 + +/* CCA divisor shift */ +#define CCA_DIVISOR_SHIFT 14 + +/* CCA ipd mask */ +#define CCA_IPD_MASK 0x3FFF + +/* A lot of these are IBTA-specific defines that are available in other header + * files. To minimize dependencies with the PSM build process they are listed + * here. Most of this is used to implement IBTA compliance features with PSM, + * like path record query etc. + */ + +enum ibta_mtu { + IBTA_MTU_256 = 1, + IBTA_MTU_512 = 2, + IBTA_MTU_1024 = 3, + IBTA_MTU_2048 = 4, + IBTA_MTU_4096 = 5 +}; + +typedef enum { + IBTA_RATE_PORT_CURRENT = 0, + IBTA_RATE_2_5_GBPS = 2, + IBTA_RATE_5_GBPS = 5, + IBTA_RATE_10_GBPS = 3, + IBTA_RATE_20_GBPS = 6, + IBTA_RATE_30_GBPS = 4, + IBTA_RATE_40_GBPS = 7, + IBTA_RATE_60_GBPS = 8, + IBTA_RATE_80_GBPS = 9, + IBTA_RATE_120_GBPS = 10 +} ibta_rate; + +static inline int ibta_mtu_enum_to_int(enum ibta_mtu mtu) +{ + switch (mtu) { + case IBTA_MTU_256: return 256; + case IBTA_MTU_512: return 512; + case IBTA_MTU_1024: return 1024; + case IBTA_MTU_2048: return 2048; + case IBTA_MTU_4096: return 4096; + default: return -1; + } +} + +/* This is the same as ib_path_rec from ib_types.h. Listed here to be + * self-contained and minimize dependencies during build. + */ +typedef struct _ibta_path_rec { + uint64_t service_id; /* net order */ + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; /* net order */ + uint16_t slid; /* net order */ + uint32_t hop_flow_raw; /* net order */ + uint8_t tclass; + uint8_t num_path; + uint16_t pkey; /* net order */ + uint16_t qos_class_sl; /* net order */ + uint8_t mtu; /* IBTA encoded */ + uint8_t rate; /* IBTA encoded */ + uint8_t pkt_life; /* IBTA encoded */ + uint8_t preference; + uint8_t resv2[6]; +} ibta_path_rec_t; + +/* + * PSM IPS path record components for endpoint.
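+ * Each cached record couples link addressing (epr_slid/epr_dlid), QoS
+ * parameters (epr_sl, epr_pkey, epr_mtu) and per-path CCA pacing state
+ * (epr_ccti plus the static/active IPD pair) for a single LID pair.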
+ */ +struct ips_proto; +typedef struct ips_path_rec { + uint16_t epr_slid; /* For Torus/non zero LMC fabrics this can be diff */ + uint16_t epr_dlid; + uint16_t epr_mtu; + uint16_t epr_pkey; + uint8_t epr_sl; + uint8_t epr_static_rate; + uint16_t epr_static_ipd; /* Static rate IPD from path record */ + + /* IBTA CCA parameters per path */ + struct ips_proto *proto; + uint16_t epr_ccti; + uint16_t epr_ccti_min; + psmi_timer epr_timer_cca; /* Congestion timer for epr_ccti increment. */ + uint16_t epr_active_ipd; /* The current active IPD. max(static,cct) */ + uint8_t epr_cca_divisor; /* CCA divisor [14:15] in CCT entry */ + uint8_t epr_pad; + + /* TODO: The endpoint timeout should also adjust based on epr_ird */ + uint32_t epr_timeout_ack_factor; + uint64_t epr_timeout_ack; + uint64_t epr_timeout_ack_max; +} ips_path_rec_t; + +typedef struct _ips_opp_path_rec { + ibta_path_rec_t opp_response; + ips_path_rec_t ips; +} ips_opp_path_rec_t; + +psm_error_t ips_opp_init(struct ips_proto *proto); + +#endif diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c new file mode 100644 index 0000000..e9715e4 --- /dev/null +++ b/ptl_ips/ips_proto.c @@ -0,0 +1,2061 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * IPS - Interconnect Protocol Stack. + */ + +#include +#include /* writev */ +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_help.h" + +/* + * host ipv4 and pid used in ERR_CHK messages to detect stray processes + */ +static uint32_t host_ipv4addr = 0; /* be */ +static uint32_t host_pid = 0; /* be */ + +/* + * Control message types have their own flag to determine whether a message of + * that type is queued or not. These flags are kept in a state bitfield. 
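+ * A type's bit is set in the destination's msg_queue_mask when the
+ * message is queued and cleared once it goes out on the wire, so each
+ * type is enqueued at most once per peer (see
+ * ips_proto_send_ctrl_message and the ctrlq timer callback).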
+ */ +#define CTRL_MSG_ACK_QUEUED 0x0001 +#define CTRL_MSG_NAK_QUEUED 0x0002 +#define CTRL_MSG_ERR_CHK_QUEUED 0x0004 +#define CTRL_MSG_ERR_CHK_PLS_QUEUED 0x0008 +#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0010 +#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0020 +#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0040 +#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0080 +#define CTRL_MSG_TIDS_RELEASE_QUEUED 0x0100 +#define CTRL_MSG_TIDS_RELEASE_CONFIRM_QUEUED 0x0200 +#define CTRL_MSG_CLOSE_QUEUED 0x0400 +#define CTRL_MSG_CLOSE_ACK_QUEUED 0x0800 +#define CTRL_MSG_ABORT_QUEUED 0x1000 +#define CTRL_MSG_TIDS_GRANT_QUEUED 0x2000 +#define CTRL_MSG_TIDS_GRANT_ACK_QUEUED 0x4000 +#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x8000 +#define CTRL_MSG_FLOW_CCA_BECN 0x10000 + +#define CTRL_MSG_QUEUE_ALWAYS 0x80000000 + +#define _desc_idx u32w0 +#define _desc_genc u32w1 + +static void ctrlq_init(struct ips_ctrlq *ctrlq, int flowid, + struct ips_proto *proto); +static psm_error_t proto_sdma_init(struct ips_proto *proto, + const psmi_context_t *context); + +psm_error_t +ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, + const struct ips_epstate *epstate, + const struct ips_spio *spioc, + struct ips_proto *proto) +{ + const struct ipath_base_info *base_info = &context->base_info; + uint32_t protoexp_flags, cksum_sz = 0; + union psmi_envvar_val env_tid, env_cksum, env_mtu; + psm_error_t err = PSM_OK; + + /* + * Checksum packets within PSM. Default is off. + * This is heavy weight and done in software so not recommended for + * production runs. + */ + + psmi_getenv("PSM_CHECKSUM", + "Enable checksum of messages (0 disables checksum)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 0, + &env_cksum); + + memset(proto, 0, sizeof(struct ips_proto)); + proto->ptl = (ptl_t *) ptl; + proto->ep = context->ep; /* cached */ + proto->mq = context->ep->mq; /* cached */ + proto->fd = context->fd; /* cached */ + proto->pend_sends.proto = proto; + psmi_timer_entry_init(&proto->pend_sends.timer, + ips_proto_timer_pendq_callback, &proto->pend_sends); + STAILQ_INIT(&proto->pend_sends.pendq); + proto->epstate = (struct ips_epstate *) epstate; + proto->timerq = (struct psmi_timer_ctrl *) timerq; + proto->spioc = (struct ips_spio *) spioc; + + proto->epinfo.ep_baseqp = base_info->spi_qpair; + proto->epinfo.ep_context = base_info->spi_context; /* "real" context */ + + proto->epinfo.ep_subcontext = base_info->spi_subcontext; + proto->epinfo.ep_hca_type = psmi_epid_hca_type(context->epid); + + proto->epinfo.ep_unit = base_info->spi_unit; + proto->epinfo.ep_hdrq_msg_size = (IPS_HEADER_QUEUE_HWORDS + + IPS_HEADER_QUEUE_IWORDS + + IPS_HEADER_QUEUE_UWORDS_MIN) << 2; + + /* If checksums enabled we insert checksum at end of packet */ + cksum_sz = env_cksum.e_uint ? 
PSM_CRC_SIZE_IN_BYTES : 0; + + proto->epinfo.ep_mtu = base_info->spi_mtu - + proto->epinfo.ep_hdrq_msg_size - + CRC_SIZE_IN_BYTES - PCB_SIZE_IN_BYTES; + proto->epinfo.ep_mtu = ips_next_low_pow2(proto->epinfo.ep_mtu); + /* Decrement checksum accounting AFTER lowering power of two */ + proto->epinfo.ep_mtu -= cksum_sz; + + /* See if user specifies a lower MTU to use */ + if (!psmi_getenv("PSM_MTU", "MTU specified by user: 1-5,256-4096[4/2048]", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_mtu)) { + if (env_mtu.e_int != 256 && env_mtu.e_int != 512 + && env_mtu.e_int != 1024 && env_mtu.e_int != 2048 + && env_mtu.e_int != 4096) { + if (env_mtu.e_int < 1 || env_mtu.e_int > 5) env_mtu.e_int = 4; + env_mtu.e_int = ibta_mtu_enum_to_int((enum ibta_mtu)env_mtu.e_int); + } + if (proto->epinfo.ep_mtu > env_mtu.e_int) + proto->epinfo.ep_mtu = env_mtu.e_int; + } + + proto->epinfo.ep_piosize = base_info->spi_piosize - + proto->epinfo.ep_hdrq_msg_size - + CRC_SIZE_IN_BYTES - PCB_SIZE_IN_BYTES - cksum_sz; + + /* Keep PIO as multiple of cache line size */ + if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES) + proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); + + + proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT); + + proto->iovec_cntr_next_inflight = 0; + proto->iovec_thresh_eager= proto->iovec_thresh_eager_blocking = ~0U; + proto->scb_max_inflight = 2*num_of_send_desc; + proto->scb_bufsize = PSMI_ALIGNUP(max(base_info->spi_piosize, + base_info->spi_mtu), + PSMI_PAGESIZE), + proto->t_init = get_cycles(); + proto->t_fini = 0; + proto->flags = env_cksum.e_uint ? + IPS_PROTO_FLAG_CKSUM : 0; + + proto->num_connected_to = 0; + proto->num_connected_from = 0; + proto->num_disconnect_requests = 0; + proto->stray_warn_interval = (uint64_t) -1; + proto->done_warning = 0; + proto->done_once = 0; + proto->num_bogus_warnings = 0; + proto->psmi_logevent_tid_send_reqs.interval_secs = 15; + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + proto->psmi_logevent_tid_send_reqs.count = 0; + + /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ + if ((err = ips_ibta_init(proto))) + goto fail; + + { + /* Disable coalesced ACKs? */ + union psmi_envvar_val env_coalesce_acks; + + psmi_getenv("PSM_COALESCE_ACKS", + "Coalesce ACKs on the wire (default is enabled i.e. 
1)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, /* Enabled by default */ + &env_coalesce_acks); + + if (env_coalesce_acks.e_uint) + proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; + } + + { + /* Number of credits per flow */ + union psmi_envvar_val env_flow_credits; + int df_flow_credits = min(PSM_FLOW_CREDITS, num_of_send_desc); + + psmi_getenv("PSM_FLOW_CREDITS", + "Number of unacked packets (credits) per flow (default is 64)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) df_flow_credits, + &env_flow_credits); + proto->flow_credits = env_flow_credits.e_uint; + } + + if ((context->runtime_flags & IPATH_RUNTIME_SDMA)) + if ((err = proto_sdma_init(proto, context))) + goto fail; + + /* + * Clone sendreq mpool configuration for pend sends config + */ + { + uint32_t chunks, maxsz; + + psmi_assert_always(proto->ep->mq->sreq_pool != NULL); + psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks, &maxsz); + + proto->pend_sends_pool = + psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks, maxsz, + 0, DESCRIPTORS, NULL, NULL); + if (proto->pend_sends_pool == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* + * Register ips protocol statistics + * + * We put a (*) in the output to denote stats that may cause a drop in + * performance. + * + * We put a (**) in the output of those stats that "should never happen" + */ + { + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("pio busy count", + &proto->stats.pio_busy_cnt), + /* Throttling by kernel */ + PSMI_STATS_DECLU64("writev busy cnt", + &proto->stats.writev_busy_cnt), + /* When local dma completion is in the way... */ + PSMI_STATS_DECLU64("writev compl. eagain", + &proto->stats.writev_compl_eagain), + /* When remote completion happens before local completion */ + PSMI_STATS_DECLU64("writev compl. 
delay (*)", + &proto->stats.writev_compl_delay), + PSMI_STATS_DECLU64("scb unavail eager count", + &proto->stats.scb_egr_unavail_cnt), + PSMI_STATS_DECLU64("scb unavail exp count", + &proto->stats.scb_exp_unavail_cnt), + PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */ + &proto->stats.hdr_overflow), + PSMI_STATS_DECLU64("rcveager overflows", + &proto->stats.egr_overflow), + PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */ + &proto->stats.lid_zero_errs), + PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */ + &proto->stats.unknown_packets), + PSMI_STATS_DECLU64("stray packets (*)", + &proto->stats.stray_packets), + PSMI_STATS_DECLU64("send dma misaligns (*)", + &proto->stats.send_dma_misaligns), + PSMI_STATS_DECLU64("amreply no bufs (*)", + &proto->proto_am.amreply_nobufs), + PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */ + &proto->spioc->spio_num_stall_total), + PSMI_STATS_DECLU64("Invariant CRC error (*)", + &proto->error_stats.num_icrc_err), + PSMI_STATS_DECLU64("Variant CRC error (*)", + &proto->error_stats.num_vcrc_err), + PSMI_STATS_DECLU64("ECC error ", + &proto->error_stats.num_ecc_err), + PSMI_STATS_DECLU64("IB Len error", + &proto->error_stats.num_len_err), + PSMI_STATS_DECLU64("IB MTU error ", + &proto->error_stats.num_mtu_err), + PSMI_STATS_DECLU64("KDETH error ", + &proto->error_stats.num_khdr_err), + PSMI_STATS_DECLU64("TID error ", + &proto->error_stats.num_tid_err), + PSMI_STATS_DECLU64("MK error ", + &proto->error_stats.num_mk_err), + PSMI_STATS_DECLU64("IB error ", + &proto->error_stats.num_ib_err), + + }; + + err = psmi_stats_register_type("InfiniPath low-level protocol stats", + PSMI_STATSTYPE_IPSPROTO, + entries, + PSMI_STATS_HOWMANY(entries), + NULL); + if (err != PSM_OK) + goto fail; + } + + /* + * Control Queue and messaging + */ + { + int idx; + + for (idx = 0; idx < EP_FLOW_LAST; idx++) + ctrlq_init(&proto->ctrlq[idx], idx, proto); + } + + /* + * Receive-side handling + */ + if ((err = ips_proto_recv_init(proto))) + goto fail; + + /* + * Eager buffers. We don't care to receive a callback when eager buffers + * are newly released since we actively poll for new bufs. + */ + if ((err = ips_scbctrl_init(context, num_of_send_desc, + num_of_send_bufs, imm_size, proto->scb_bufsize, + NULL, NULL, &proto->scbc_egr))) + goto fail; + + /* + * Expected protocol handling. + * If we enable tid-based expected rendezvous, the expected protocol code + * handles its own rv scb buffers. If not, we have to enable eager-based + * rendezvous and we allocate scb buffers for it. + */ + psmi_getenv("PSM_TID", + "Tid proto flags (0 disables protocol)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) IPS_PROTOEXP_FLAGS_DEFAULT, + &env_tid); + protoexp_flags = env_tid.e_uint; + + if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { + proto->scbc_rv = NULL; + if ((err = ips_protoexp_init(context, proto, protoexp_flags, + num_of_send_bufs, num_of_send_desc, + &proto->protoexp))) + goto fail; + } + else { + proto->protoexp = NULL; + proto->scbc_rv = (struct ips_scbctrl *) + psmi_calloc(proto->ep, DESCRIPTORS, + 1, sizeof(struct ips_scbctrl)); + if (proto->scbc_rv == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + /* + * Rendezvous buffers. We want to get a callback for rendezvous bufs + * since we asynchronously try to make progress on these sends and only + * schedule them on the timerq if there are pending sends and available + * bufs. 
+ */ + if ((err = ips_scbctrl_init(context, num_of_send_desc, 0 /* no bufs */, + 0, 0 /* bufsize==0 */, ips_proto_rv_scbavail_callback, + proto, proto->scbc_rv))) + goto fail; + } + + /* + * Parse the tid error settings from the environment. + * : + */ + { + int tvals[2]; + char *tid_err; + union psmi_envvar_val env_tiderr; + + tid_err = "-1:0"; /* no tiderr warnings, never exits */ + tvals[0] = -1; + tvals[1] = 0; + + if (!psmi_getenv("PSM_TID_ERROR", + "Tid error control ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) tid_err, + &env_tiderr)) { + /* not using default values */ + tid_err = env_tiderr.e_str; + psmi_parse_str_tuples(tid_err, 2, tvals); + } + if (tvals[0] >= 0) + proto->tiderr_warn_interval = sec_2_cycles(tvals[0]); + else + proto->tiderr_warn_interval = UINT64_MAX; + proto->tiderr_max = tvals[1]; + _IPATH_PRDBG("Tid error control: warning every %d secs%s, " + "fatal error after %d tid errors%s\n", + tvals[0], (tvals[0] < 0) ? " (no warnings)" : "", + tvals[1], (tvals[1] == 0) ? " (never fatal)" : ""); + } + + /* + * Active Message interface. AM requests compete with MQ for eager + * buffers, since request establish the amount of buffering in the network + * (maximum number of requests in flight). AM replies use the same amount + * of request buffers -- we can never run out of AM reply buffers because a + * request handler can only be run if we have at least one reply buffer (or + * else the AM request is dropped). + */ + if ((err = ips_proto_am_init(proto, num_of_send_bufs, num_of_send_desc, + imm_size, &proto->proto_am))) + goto fail; + + if (!host_pid) { + char ipbuf[INET_ADDRSTRLEN], *p; + host_pid = (uint32_t) getpid(); + host_ipv4addr = psmi_get_ipv4addr(); /* already be */ + if (host_ipv4addr == 0) { + _IPATH_DBG("Unable to obtain local IP address, " + "not fatal but some features may be disabled\n"); + } + else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) { + _IPATH_INFO("Localhost IP address is set to the " + "loopback address 127.0.0.1, " + "not fatal but some features may be disabled\n"); + } + else { + p = (char *) inet_ntop(AF_INET, (const void *) &host_ipv4addr, + ipbuf, sizeof ipbuf); + _IPATH_PRDBG("Ethernet Host IP=%s and PID=%d\n", p, host_pid); + } + + /* Store in big endian for use in ERR_CHK */ + host_pid = __cpu_to_be32(host_pid); + } + +fail: + return err; +} + +psm_error_t +ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) +{ + struct psmi_eptab_iterator itor; + uint64_t t_start; + uint64_t t_grace_start, t_grace_time, t_grace_finish, t_grace_interval; + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + int i; + union psmi_envvar_val grace_intval; + + psmi_getenv("PSM_CLOSE_GRACE_PERIOD", + "Additional grace period in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &grace_intval); + + if (getenv("PSM_CLOSE_GRACE_PERIOD")) { + t_grace_time = grace_intval.e_uint * SEC_ULL; + } + else if (timeout_in > 0) { + /* default to half of the close time-out */ + t_grace_time = timeout_in / 2; + } + else { + /* propagate the infinite time-out case */ + t_grace_time = 0; + } + + if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT) + t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* At close we will busy wait for the grace interval to see if any + * receive progress is made. If progress is made we will wait for + * another grace interval, until either no progress is made or the + * entire grace period has passed. 
If the grace interval is too low + * we may miss traffic and exit too early. If the grace interval is + * too large the additional time spent while closing the program + * will become visible to the user. */ + psmi_getenv("PSM_CLOSE_GRACE_INTERVAL", + "Grace interval in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &grace_intval); + + if (getenv("PSM_CLOSE_GRACE_INTERVAL")) { + t_grace_interval = grace_intval.e_uint * SEC_ULL; + } + else { + /* A heuristic is used to scale up the timeout linearly with + * the number of endpoints, and we allow one second per 1000 + * endpoints. */ + t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000; + } + + if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL; + if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL; + + PSMI_PLOCK_ASSERT(); + + t_start = proto->t_fini = get_cycles(); + + /* Close whatever has been left open */ + if (proto->num_connected_to > 0) { + int num_disc = 0; + int *mask; + psm_error_t *errs; + psm_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, proto->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptl == proto->ptl) + num_disc++; + } + psmi_epid_itor_fini(&itor); + mask = (int *) psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(int)); + errs = (psm_error_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm_error_t)); + epaddr_array = (psm_epaddr_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) psmi_free(epaddr_array); + if (errs) psmi_free(errs); + if (mask) psmi_free(mask); + err = PSM_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, proto->ep); + i = 0; + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptl == proto->ptl) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + PSM_MCTXT_REMOVE(epaddr); + } + } + psmi_epid_itor_fini(&itor); + err = ips_proto_disconnect(proto, force, num_disc, epaddr_array, + mask, errs, timeout_in); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + t_grace_start = get_cycles(); + + while (psmi_cycles_left(t_grace_start, t_grace_time)) { + uint64_t t_grace_interval_start = get_cycles(); + int num_disconnect_requests = proto->num_disconnect_requests; + PSMI_BLOCKUNTIL(proto->ep, err, + (proto->num_connected_from == 0 || + !psmi_cycles_left(t_start, timeout_in)) && + (!psmi_cycles_left(t_grace_interval_start, t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time))); + if (num_disconnect_requests == proto->num_disconnect_requests) { + /* nothing happened in this grace interval so break out early */ + break; + } + } + + t_grace_finish = get_cycles(); + + _IPATH_PRDBG("Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n", + proto->num_connected_to, proto->num_connected_from, + (int) (cycles_to_nanosecs(t_grace_finish - t_grace_start) / MSEC_ULL), + (int) (t_grace_time / MSEC_ULL)); + + if ((err = ips_ibta_fini(proto))) + goto fail; + + if ((err = ips_proto_am_fini(&proto->proto_am))) + goto fail; + + if ((err = ips_scbctrl_fini(&proto->scbc_egr))) + goto fail; + + ips_proto_recv_fini(proto); + + if (proto->protoexp) { + if ((err = ips_protoexp_fini(proto->protoexp))) + goto fail; + } + else { + ips_scbctrl_fini(proto->scbc_rv); + psmi_free(proto->scbc_rv); + } + + 
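+  /* Finally release the pending-sends pool that was cloned from the MQ
+   * sreq pool configuration at init time. */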
psmi_mpool_destroy(proto->pend_sends_pool); + +fail: + proto->t_fini = proto->t_init = 0; + return err; +} + +static +psm_error_t +proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context) +{ + union psmi_envvar_val env_sdma, env_ipathegr; + char *c; + uint32_t defval = IPS_PROTO_FLAGS_DEFAULT & IPS_PROTO_FLAGS_ALL_SDMA; + psm_error_t err = PSM_OK; + int egrmode; + + /* + * Only initialize if RUNTIME_SDMA is enabled. + */ + psmi_assert_always(context->runtime_flags & IPATH_RUNTIME_SDMA); + + if ((c = getenv("PSM_SDMA")) && *c && !strncmp("always", c, 7)) + defval = IPS_PROTO_FLAGS_ALL_SDMA; + + psmi_getenv("PSM_SDMA", + "ipath send dma flags (0 disables send dma)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) defval, + &env_sdma); + + if(env_sdma.e_uint != 1) + proto->flags |= env_sdma.e_uint & IPS_PROTO_FLAGS_ALL_SDMA; + + /* If anything uses send dma, figure out our max packet threshold to call + * send dma with */ + proto->scb_max_sdma = IPS_SDMA_MAX_SCB; + if (proto->flags & IPS_PROTO_FLAGS_ALL_SDMA) { + psmi_getenv("PSM_SDMA_THRESH", + "ipath send dma max packet per call", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) proto->scb_max_sdma, + &env_sdma); + proto->scb_max_sdma = env_sdma.e_uint; + if (proto->scb_max_sdma < 1) { + _IPATH_ERROR("Overriding PSM_SDMA_THRESH=%u to be '%u'\n", + proto->scb_max_sdma, 1); + proto->scb_max_sdma = 1; + } + } + + egrmode = proto->flags & + (IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA|IPS_PROTO_FLAG_MQ_EAGER_SDMA); + + /* Some modes don't make sense or at least, MQ doesn't expect them to + * be a functional mode. For example, it's not possible to use DMA + * message envelopes with PIO eager data. + */ + if (egrmode == IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA) { + err = psmi_handle_error(proto->ep, PSM_PARAM_ERR, + "Unsupported Send DMA mode 0x%x: dma envelopes and pio eager", + proto->flags); + goto fail; + } + /* Only bother with switchover for pio-envelope,dma-eagerdata */ + else if (egrmode == IPS_PROTO_FLAG_MQ_EAGER_SDMA) { + /* Reduce threshold to use SDMA for QLE73XX as we are PIO limited for + * medium message sizes on it. + */ + uint32_t hca_type = psmi_get_hca_type((psmi_context_t*) context); + + defval = (hca_type == PSMI_HCA_TYPE_QLE73XX) ? + MQ_IPATH_THRESH_EGR_SDMA_SQ : MQ_IPATH_THRESH_EGR_SDMA; + psmi_getenv("PSM_MQ_EAGER_SDMA_SZ", + "ipath pio-to-sdma eager switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) defval, &env_ipathegr); + + /* Has to be at least 1 MTU */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + max(proto->epinfo.ep_piosize, env_ipathegr.e_uint); + + /* For QLE73XX bump up the eager SDMA threshold for blocking sends if + * the user has not explicitly set one. 
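+ * (i.e. blocking sends move up to MQ_IPATH_THRESH_EGR_SDMA while
+ * non-blocking sends keep the lower QLE73XX switchover point.)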
+ */ + if ((hca_type == PSMI_HCA_TYPE_QLE73XX) && + (proto->iovec_thresh_eager == defval)) + proto->iovec_thresh_eager_blocking = MQ_IPATH_THRESH_EGR_SDMA; + } + else if (egrmode == + (IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA|IPS_PROTO_FLAG_MQ_EAGER_SDMA)) + { + /* Has to be 0 so we never try to split pio and dma */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = 0; + } + else if (egrmode == 0) { /* all pio */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; + } + +fail: + return err; +} + +static +void +ctrlq_init(struct ips_ctrlq *ctrlq, int flowid, struct ips_proto *proto) +{ + // clear the ctrl send queue + memset(ctrlq, 0, sizeof(*ctrlq)); + + proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED; + proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK_PLS] = CTRL_MSG_ERR_CHK_PLS_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REQUEST] = + CTRL_MSG_CONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REPLY] = + CTRL_MSG_CONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] = + CTRL_MSG_DISCONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] = + CTRL_MSG_DISCONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_CLOSE] = CTRL_MSG_CLOSE_QUEUED; + proto->message_type_to_index[OPCODE_CLOSE_ACK] = CTRL_MSG_CLOSE_ACK_QUEUED; + proto->message_type_to_index[OPCODE_ABORT] = CTRL_MSG_ABORT_QUEUED; + proto->message_type_to_index[OPCODE_TIDS_GRANT] = CTRL_MSG_TIDS_GRANT_QUEUED; + proto->message_type_to_index[OPCODE_TIDS_GRANT_ACK] = CTRL_MSG_TIDS_GRANT_ACK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK_GEN] = CTRL_MSG_ERR_CHK_GEN_QUEUED; + proto->message_type_to_index[OPCODE_FLOW_CCA_BECN] = CTRL_MSG_FLOW_CCA_BECN; + + ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0; + ctrlq->ctrlq_overflow = 0; + ctrlq->ctrlq_proto = proto; + ctrlq->ctrlq_flowid = flowid; + /* We never enqueue connect messages. They require 512 bytes and we don't + * want to stack allocate 512 bytes just when sending back acks. + */ + proto->ctrl_msg_queue_never_enqueue = CTRL_MSG_CONNECT_REQUEST_QUEUED | + CTRL_MSG_CONNECT_REPLY_QUEUED | + CTRL_MSG_DISCONNECT_REQUEST_QUEUED | + CTRL_MSG_DISCONNECT_REPLY_QUEUED | + CTRL_MSG_ERR_CHK_GEN_QUEUED | + CTRL_MSG_TIDS_GRANT_QUEUED; + + psmi_timer_entry_init(&ctrlq->ctrlq_timer, + ips_proto_timer_ctrlq_callback, ctrlq); + + return; +} + +static int inline +_build_ctrl_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ipsaddr, uint8_t message_type, + struct ips_flow *flow, + void *payload, uint8_t *discard_msg) +{ + uint32_t tot_paywords = sizeof(struct ips_message_header) >> 2; + struct ips_epinfo *epinfo = &proto->epinfo; + struct ips_epinfo_remote *epr = &ipsaddr->epr; + uint16_t pkt_flags = IPS_EPSTATE_COMMIDX_PACK(epr->epr_commidx_to); + struct ips_message_header *p_hdr = &msg->pbc_hdr.hdr; + ips_path_rec_t *ctrl_path = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][ipsaddr->epr.epr_hpp_index]; + int paylen = 0; + + if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) && + (++ipsaddr->epr.epr_hpp_index >= + ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY])) + ipsaddr->epr.epr_hpp_index = 0; + + /* Control messages go over the control path. 
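+ * The control path is the high-priority path at epr_hpp_index; under
+ * adaptive path selection that index is advanced round-robin just above
+ * when more than one high-priority path exists.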
*/ + p_hdr->lrh[0] = __cpu_to_be16(IPATH_LRH_BTH | + (ctrl_path->epr_sl << 4) | + (proto->sl2vl[ctrl_path->epr_sl] << LRH_VL_SHIFT)); + p_hdr->lrh[1] = ctrl_path->epr_dlid; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + p_hdr->lrh[3] = ctrl_path->epr_slid; + + p_hdr->bth[0] = __cpu_to_be32((IPATH_OPCODE_USER1 << 24) + + ctrl_path->epr_pkey); + + /* If flow is congested then generate a BECN for path. */ + if_pf (flow->flags & IPS_FLOW_FLAG_GEN_BECN) { + _IPATH_CCADBG("Generating BECN for flow %x ----> %x. Num congested packets: 0x%"PRIx64". Msg type: %d\n", __be16_to_cpu(flow->path->epr_slid), __be16_to_cpu(flow->path->epr_dlid), ipsaddr->stats.congestion_pkts, message_type); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp | 1 << BTH_BECN_SHIFT); + flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; + } + else + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp); + p_hdr->bth[2] = 0; + + p_hdr->commidx = epr->epr_commidx_to; + p_hdr->sub_opcode = message_type; + p_hdr->ack_seq_num = 0; + IPS_HEADER_SRCCONTEXT_SET(p_hdr, epinfo->ep_context); + p_hdr->src_subcontext = epinfo->ep_subcontext; + p_hdr->dst_subcontext = epr->epr_subcontext; + p_hdr->flags = 0; + p_hdr->mqhdr = 0; + p_hdr->flowid = flow->flowid; + + switch (message_type) { + case OPCODE_ACK: + if_pt (flow->protocol != PSM_PROTOCOL_TIDFLOW) + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + else { + ptl_arg_t *args = (ptl_arg_t*) payload; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc *tidrecvc; + + /* TIDFLOW ACK. + * args[0] = send descriptor id + * args[1] = receive descriptor id + */ + ips_ptladdr_lock(ipsaddr); + + tid_recv_sessid = args[1]._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(proto->protoexp->tid_desc_recv_pool, + tid_recv_sessid); + if (tidrecvc == NULL) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != args[1]._desc_genc) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + + p_hdr->data[0].u64 = args[0].u64; + p_hdr->ack_seq_num = tidrecvc->tidflow_genseq.psn; + ips_ptladdr_unlock(ipsaddr); + } + break; + + case OPCODE_NAK: + if_pf (flow->protocol != PSM_PROTOCOL_TIDFLOW) { + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + } + else { + ptl_arg_t *args = (ptl_arg_t*) payload; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc *tidrecvc; + psmi_seqnum_t ack_seq_num; + + /* TIDFLOW NAK. + * args[0] = send descriptor id + * args[1] = receive descriptor id + * args[2].u16w0 = Old generation to NAK + */ + ips_ptladdr_lock(ipsaddr); + + tid_recv_sessid = args[1]._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(proto->protoexp->tid_desc_recv_pool, + tid_recv_sessid); + if (tidrecvc == NULL) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != args[1]._desc_genc) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + + p_hdr->data[0].u64 = args[0].u64; /* Send descriptor id */ + p_hdr->data[1].u32w0 = tidrecvc->tidflow_genseq.val; /*New flowgenseq*/ + + /* Ack seqnum contains the old generation we are acking for */ + ack_seq_num = tidrecvc->tidflow_genseq; + ack_seq_num.gen = args[2].u16w0; + p_hdr->ack_seq_num = ack_seq_num.psn; + + ips_ptladdr_unlock(ipsaddr); + } + break; + + case OPCODE_ERR_CHK: + { + psmi_seqnum_t err_chk_seq; + ips_ptladdr_lock(ipsaddr); + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? 
+ flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; + err_chk_seq.pkt -= 1; + p_hdr->bth[2] = __cpu_to_be32(err_chk_seq.psn); + ips_ptladdr_unlock(ipsaddr); + p_hdr->data[0].u32w0 = host_ipv4addr; + p_hdr->data[0].u32w1 = host_pid; + + if (ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; + } + break; + + case OPCODE_ERR_CHK_GEN: + { + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; + + /* TIDFLOW ERR_CHK_GEN + * args[0] = receive descriptor id + * args[1] = send descriptor id + */ + if (!STAILQ_EMPTY(unackedq)) { + ips_scb_t *scb = STAILQ_FIRST(unackedq); + psmi_seqnum_t err_chk_seq; + + ips_ptladdr_lock(ipsaddr); + + psmi_assert_always(scb->tidsendc); + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? + flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; + err_chk_seq.seq -= 1; + + /* NOTE: If error check gen is cached and we get a NAK + * the scbs are flushed again. This can increase the DMA counter + * as scb's are retransmitted which we don't check for here. + * One way is never cache the ERR_CHK_GEN messages so it's only + * called from the ack timeout callback. Other way is that we + * send the ERR_CHK_GEN message over SDMA so they are serialized with + * respect to each other. Note: In this case we don't need to + * wait for the DMA completion counters in the ack timeout. + */ + p_hdr->bth[2] = __cpu_to_be32(err_chk_seq.psn); + + /* Receive descriptor index */ + p_hdr->data[0].u64 = scb->tidsendc->tid_list.tsess_descid.u64; + /* Send descriptor index */ + p_hdr->data[1].u64 = scb->tidsendc->descid.u64; + + ips_ptladdr_unlock(ipsaddr); + + if (ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; + } + else + *discard_msg = 1; + } + break; + + case OPCODE_FLOW_CCA_BECN: + _IPATH_CCADBG("Generating Explicit BECN for flow %x ----> %x. Num congested packets: 0x%"PRIx64"\n", __be16_to_cpu(flow->path->epr_slid), __be16_to_cpu(flow->path->epr_dlid), ipsaddr->stats.congestion_pkts); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp | 1 << BTH_BECN_SHIFT); + p_hdr->data[0].u32w0 = flow->cca_ooo_pkts; + break; + + case OPCODE_ERR_CHK_BAD: + p_hdr->data[0].u32w0 = host_ipv4addr; + p_hdr->data[0].u32w1 = host_pid; + break; + + case OPCODE_STARTUP: + case OPCODE_STARTUP_ACK: + case OPCODE_STARTUP_EXT: + case OPCODE_STARTUP_ACK_EXT: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected use of old connect protocol"); + break; + + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + p_hdr->hdr_dlen = (epinfo->ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + p_hdr->bth[0] = __cpu_to_be32((IPATH_OPCODE_USER1 << 24) + + ctrl_path->epr_pkey); + paylen = + ips_proto_build_connect_message(proto, msg, ipsaddr, + message_type, payload); + /* Rewrite packet length since this subopcode has an eager payload */ + tot_paywords += paylen >> 2; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + +#if 0 /* MARKDEBBAGE - disabled this as it slows down connect at scale */ + /* On request message, always set the kpf flag. 
If reply, only set it + * if we know that the recvthread is running */ + if (message_type == OPCODE_CONNECT_REQUEST || + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; +#endif + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = + ips_proto_build_connect_message(proto, msg, ipsaddr, + message_type, payload); + tot_paywords += paylen >> 2; + p_hdr->hdr_dlen = (epinfo->ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + break; + + case OPCODE_TIDS_RELEASE: + case OPCODE_TIDS_RELEASE_CONFIRM: + case OPCODE_TIDS_GRANT_ACK: + case OPCODE_TIDS_GRANT: + paylen = ips_protoexp_build_ctrl_message(proto->protoexp, ipsaddr, + p_hdr->data, &pkt_flags, message_type, payload); + if (paylen < 0) { + *discard_msg = 1; + break; + } + tot_paywords += paylen >> 2; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + break; + + default: + break; + } + + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (IPATH_EAGER_TID_ID << INFINIPATH_I_TID_SHIFT)); + p_hdr->iph.pkt_flags = __cpu_to_le16(pkt_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + /* Require 4-byte alignment always */ + psmi_assert(!(paylen & 0x3)); + return paylen; +} + +psm_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); + +psm_error_t __recvpath +ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload) +{ + struct ips_proto_ctrl_message msg; + psm_error_t err = PSM_EP_NO_RESOURCES; + ptl_arg_t *args = (ptl_arg_t *) payload; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ipsaddr->proto; + struct ips_ctrlq *ctrlq = &proto->ctrlq[IPS_FLOWID2INDEX(flow->flowid)&0x3]; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + uint32_t cksum = 0; + int paylen; + uint8_t discard_msg = 0; + + /* Drain queue if non-empty */ + if (cqe[ctrlq->ctrlq_tail].ipsaddr) + ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL); + + if (!cqe[ctrlq->ctrlq_tail].ipsaddr) { + paylen = _build_ctrl_message(proto, &msg, ipsaddr, message_type, + flow, payload, &discard_msg); + + if_pt (!discard_msg) { + /* If enabled checksum control message */ + ips_do_cksum(proto, &msg.pbc_hdr.hdr, payload, paylen, &cksum); + + /* Error check messages are serialized with respect to the underlying + * transfer mechanism. + */ + if ((message_type == OPCODE_ERR_CHK) || + (message_type == OPCODE_ERR_CHK_GEN) || + (message_type == OPCODE_ERR_CHK_BAD)) { + switch(flow->transfer) { + case PSM_TRANSFER_PIO: + case PSM_TRANSFER_LAST: + err = ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + payload, paylen, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + break; + case PSM_TRANSFER_DMA: + err = ips_dma_transfer_frame(proto, flow, &msg.pbc_hdr, payload, + paylen, cksum); + break; + } + } + else + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + err = ips_dma_transfer_frame(proto, flow, &msg.pbc_hdr, payload, + paylen, cksum); + else + err = ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + payload, paylen, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + + if (err == PSM_OK) + ips_epaddr_stats_send(ipsaddr, message_type); + } + else + err = PSM_OK; /* Ctrl message is discarded. 
May want to add stats */ + + _IPATH_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d," + "src=%p,len=%d returns %d\n", (int) msg.pbc_hdr.hdr.sub_opcode, + __be16_to_cpu(msg.pbc_hdr.hdr.lrh[1]), payload, paylen, err); + } + if (err != PSM_EP_NO_RESOURCES) + return err; + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + + if (!(proto->ctrl_msg_queue_never_enqueue & proto->message_type_to_index[message_type])) { + + if ((*msg_queue_mask) & proto->message_type_to_index[message_type]) { + /* This type of control message is already queued, skip it */ + err = PSM_OK; + } else if (cqe[ctrlq->ctrlq_head].ipsaddr == NULL) { + // entry is free + *msg_queue_mask |= message_type2index(proto, message_type); + + cqe[ctrlq->ctrlq_head].ipsaddr = ipsaddr; + cqe[ctrlq->ctrlq_head].message_type = message_type; + cqe[ctrlq->ctrlq_head].msg_queue_mask = msg_queue_mask; + cqe[ctrlq->ctrlq_head].flow = flow; + + if (args) { + cqe[ctrlq->ctrlq_head].args[0].u64w0 = args[0].u64w0; + cqe[ctrlq->ctrlq_head].args[1].u64w0 = args[1].u64w0; + cqe[ctrlq->ctrlq_head].args[2].u64w0 = args[2].u64w0; + } + + ctrlq->ctrlq_head = (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE; + //_IPATH_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + + err = PSM_OK; + } else { + proto->ctrl_msg_queue_overflow++; + } + } + + return err; +} + +psm_error_t __recvpath +ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire) +{ + struct ips_ctrlq *ctrlq = (struct ips_ctrlq *) timer->context; + struct ips_proto *proto = ctrlq->ctrlq_proto; + struct ips_proto_ctrl_message msg; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + struct ips_flow *flow; + uint8_t msg_type; + psm_error_t err; + struct ptl_epaddr *ipsaddr; + uint32_t cksum = 0; + int paylen; + uint8_t discard_msg = 0; + + // service ctrl send queue first + while (cqe[ctrlq->ctrlq_tail].ipsaddr) { + msg_type = cqe[ctrlq->ctrlq_tail].message_type; + ipsaddr = cqe[ctrlq->ctrlq_tail].ipsaddr; + flow = cqe[ctrlq->ctrlq_tail].flow; + + paylen = _build_ctrl_message(proto, &msg, + ipsaddr, msg_type, flow, + cqe[ctrlq->ctrlq_tail].args, + &discard_msg); + + psmi_assert_always(paylen == 0); + + if_pt (!discard_msg) { + /* If enabled checksum control message */ + ips_do_cksum(proto, &msg.pbc_hdr.hdr, NULL, 0, &cksum); + + /* Error check messages are serialized with respect to the underlying + * transfer mechanism. 
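+ * They go out over PIO or DMA according to flow->transfer, so a probe
+ * stays ordered behind any data already queued on that mechanism.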
+ */ + if ((msg_type == OPCODE_ERR_CHK) || + (msg_type == OPCODE_ERR_CHK_GEN) || + (msg_type == OPCODE_ERR_CHK_BAD)) { + switch(flow->transfer) { + case PSM_TRANSFER_DMA: + err = ips_dma_transfer_frame(proto,flow,&msg.pbc_hdr,0,0, cksum); + break; + case PSM_TRANSFER_PIO: + default: + err = + ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + NULL, 0, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + break; + } + } + else + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + err = ips_dma_transfer_frame(proto,flow,&msg.pbc_hdr,NULL,0,cksum); + else + err = + ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + 0, 0, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + } + else + err = PSM_OK; /* Discard ctrl message */ + + if (err == PSM_OK) { + ips_epaddr_stats_send(ipsaddr, msg_type); + *cqe[ctrlq->ctrlq_tail].msg_queue_mask &= + ~message_type2index(proto, cqe[ctrlq->ctrlq_tail].message_type); + cqe[ctrlq->ctrlq_tail].ipsaddr = NULL; + ctrlq->ctrlq_tail = (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE; + } else { + psmi_assert(err == PSM_EP_NO_RESOURCES); + + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + /* re-request a timer expiration */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + return PSM_OK; + } + } + + return PSM_OK; +} + +void __sendpath +ips_proto_flow_enqueue(struct ips_flow *flow, ips_scb_t *scb) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + + /* Don't support send to self */ + psmi_assert(flow->path->epr_dlid != flow->path->epr_slid); + + ips_scb_prepare_flow_inner(scb, flow->epinfo, &ipsaddr->epr, flow); + ips_do_cksum(ipsaddr->proto, &scb->ips_lrh, + scb->payload, scb->payload_size, &scb->cksum); + + STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq); + flow->scb_num_pending++; + flow->scb_num_unacked++; + + /* Every ipsaddr has a pending head that points into the unacked queue. + * If sends are already pending, process those first */ + if (SLIST_EMPTY(&flow->scb_pend)) + SLIST_FIRST(&flow->scb_pend) = scb; +} + +/* + * This function attempts to flush the current list of pending + * packets through PIO. + * + * Recoverable errors: + * PSM_OK: Packet triggered through PIO. + * PSM_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM_EP_NO_NETWORK: No network, no lid, ... + * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. 
+ */ +psm_error_t __sendpath +ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = flow->ipsaddr->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + int num_sent = 0; + uint64_t t_cyc; + ips_scb_t *scb; + psm_error_t err = PSM_OK; + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf ((!flow->credits) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) + return PSM_OK; + + while (!SLIST_EMPTY(scb_pend) && flow->credits) { + scb = SLIST_FIRST(scb_pend); + + if ((err = ips_spio_transfer_frame(proto->spioc, flow, &scb->ips_lrh, + scb->payload, scb->payload_size, + PSMI_FALSE, + (proto->flags & IPS_PROTO_FLAG_CKSUM) && (scb->tid == IPATH_EAGER_TID_ID), + scb->cksum)) == PSM_OK) + { + t_cyc = get_cycles(); + scb->flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = flow->path->epr_timeout_ack; + scb->abs_timeout = flow->path->epr_timeout_ack + t_cyc; + psmi_timer_request(proto->timerq, &flow->timer_ack, + scb->abs_timeout); + num_sent++; + flow->scb_num_pending--; + flow->credits--; + SLIST_REMOVE_HEAD(scb_pend, next); + + } + else + break; + } + + /* If out of flow credits re-schedule send timer */ + if (!SLIST_EMPTY(scb_pend)) { + proto->stats.pio_busy_cnt++; + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + proto->timeout_send); + } + + if (nflushed != NULL) + *nflushed = num_sent; + + return err; +} + +/* + * Flush all packets currently marked as pending + */ +static psm_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int num, + int *num_sent); + +#ifdef PSM_DEBUG +#define PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto) \ + do { \ + uint32_t cntr_inflight; \ + ipath_sdma_inflight(proto->ptl->context->ctrl, &cntr_inflight); \ + VALGRIND_MAKE_MEM_DEFINED(&cntr_inflight, sizeof(uint32_t)); \ + psmi_assert_always(cntr_inflight == \ + proto->iovec_cntr_next_inflight); \ + } while (0) +#else +#define PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto) +#endif + +/* + * Flush all packets queued up on a flow via send DMA. + * + * Recoverable errors: + * PSM_OK: Able to flush entire pending queue for DMA. + * PSM_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. + * PSM_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * + * Unrecoverable errors: + * PSM_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... + */ +psm_error_t __sendpath +ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = flow->ipsaddr->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + uint32_t cntr_init; + ips_scb_t *scb; + psm_error_t err = PSM_OK; + int howmany = 0; + int nsent = 0; + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf ((!flow->credits) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { + if (nflushed) + *nflushed = 0; + return PSM_EP_NO_RESOURCES; + } + + if (SLIST_EMPTY(scb_pend)) + goto success; + + /* + * Count how many are to be sent and fire dma. 
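+ * In debug builds the pending list is walked and its length checked
+ * against scb_num_pending; release builds simply clamp the count to the
+ * credits available. Either way the count is then bounded by
+ * scb_max_sdma per call.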
+ */ +#ifdef PSM_DEBUG + SLIST_FOREACH(scb, scb_pend, next) + howmany++; + psmi_assert_always(howmany == flow->scb_num_pending); +#else + howmany = min(flow->scb_num_pending, flow->credits); +#endif + + howmany = min(howmany, proto->scb_max_sdma); + + if (howmany == 0) + goto success; + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); /* Pre-check */ + + cntr_init = proto->iovec_cntr_next_inflight; + err = scb_dma_send(proto, flow, scb_pend, howmany, &nsent); + if (err != PSM_OK && err != PSM_EP_NO_RESOURCES && + err != PSM_OK_NO_PROGRESS) + goto fail; + + /* scb_dma_send shouldn't modify iovec_cntr_next_inflight */ + psmi_assert_always(cntr_init == proto->iovec_cntr_next_inflight); + + if (nsent > 0) { + uint64_t t_cyc = get_cycles(); + uint32_t new_inflight = proto->iovec_cntr_next_inflight + nsent; + int i = 0; + + /* We have to ensure that the inflight counter doesn't drift away too + * far from the completion counter or else our wraparound arithmetic + * in ips_proto_dma_wait_until will fail. + */ + if ((int) new_inflight - (int) proto->iovec_cntr_last_completed < 0) + ips_proto_dma_wait_until(proto, + proto->iovec_cntr_last_completed + nsent); + + flow->scb_num_pending -= nsent; + flow->credits = max((int) flow->credits - nsent, 0); + + SLIST_FOREACH(scb, scb_pend, next) { + if (++i > nsent) + break; + scb->flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = scb->nfrag*flow->path->epr_timeout_ack; + scb->abs_timeout = scb->nfrag*flow->path->epr_timeout_ack + t_cyc; + scb->dma_ctr = proto->iovec_cntr_next_inflight++; + if (scb->tidsendc) + ips_protoexp_scb_inflight(scb); + } + SLIST_FIRST(scb_pend) = scb; + } + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); /* Post Check */ + + if (SLIST_FIRST(scb_pend) != NULL) { + psmi_assert(flow->scb_num_pending > 0); + + switch(flow->protocol) { + case PSM_PROTOCOL_TIDFLOW: + /* For Tidflow we can cancel the ack timer if we have flow credits + * available and schedule the send timer. If we are out of flow + * credits then the ack timer is scheduled as we are waiting for + * an ACK to reclaim credits. This is required since multiple + * tidflows may be active concurrently. + */ + if (flow->credits) { + /* Cancel ack timer and reschedule send timer. Increment + * writev_busy_cnt as this really is DMA buffer exhaustion. 
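+ * The send timer backs off to twice timeout_send, while the
+ * credit-reaping ack timer in the else branch fires after a quarter of
+ * the ack timeout so credits are reclaimed promptly.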
+ */ + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + (proto->timeout_send << 1)); + proto->stats.writev_busy_cnt++; + } + else { + /* Re-instate ACK timer to reap flow credits */ + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + (flow->path->epr_timeout_ack>>2)); + } + + break; + case PSM_PROTOCOL_GO_BACK_N: + default: + if (flow->credits) { + /* Schedule send timer and increment writev_busy_cnt */ + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + (proto->timeout_send << 1)); + proto->stats.writev_busy_cnt++; + } + else { + /* Schedule ACK timer to reap flow credits */ + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + (flow->path->epr_timeout_ack>>2)); + } + break; + } + } + else { + /* Schedule ack timer */ + psmi_timer_cancel(proto->timerq, &flow->timer_send); + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + flow->path->epr_timeout_ack); + } + + /* We overwrite error with its new meaning for flushing packets */ + if (nsent > 0) + if (nsent < howmany) + err = PSM_OK_NO_PROGRESS; /* partial flush */ + else + err = PSM_OK; /* complete flush */ + else + err = PSM_EP_NO_RESOURCES; /* no flush at all */ + +success: +fail: + if (nflushed) + *nflushed = nsent; + + return err; +} + +/* + * Fault injection in dma sends. Since DMA through writev() is all-or-nothing, + * we don't inject faults on a packet-per-packet basis since the code gets + * quite complex. Instead, each call to flush_dma or transfer_frame is treated + * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND + * setting. + * + * The effect is as if the event was successful but dropped on the wire + * somewhere. + */ +PSMI_ALWAYS_INLINE( +int +dma_do_fault()) +{ + + if_pf (PSMI_FAULTINJ_ENABLED()) { + PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1, IPS_FAULTINJ_DMALOST); + return psmi_faultinj_is_fault(fi); + } + else + return 0; +} + +/* ips_dma_transfer_frame is used only for control messages, and is + * not enabled by default, and not tested by QA; expected send + * dma goes through scb_dma_send() */ +psm_error_t __sendpath +ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, + struct ips_pbc_header *pbc_hdr_i, + void *payload, uint32_t paylen, uint32_t cksum) +{ + struct iovec iovec; + ssize_t ret; + psm_error_t err; + uint32_t have_cksum = + ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (((__le32_to_cpu(pbc_hdr_i->hdr.iph.ver_context_tid_offset) >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) == IPATH_EAGER_TID_ID) && (pbc_hdr_i->hdr.mqhdr != MQ_MSG_DATA_BLK) && (pbc_hdr_i->hdr.mqhdr != MQ_MSG_DATA_REQ_BLK)); + + psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */ + psmi_assert(((uintptr_t) payload & 0x3) == 0); /* require 4-byte alignment */ + psmi_assert(paylen < proto->epinfo.ep_mtu); + + /* See comments above for fault injection */ + if_pf (dma_do_fault()) + return PSM_OK; + + ips_proto_pbc_update(proto, flow, PSMI_TRUE, &pbc_hdr_i->pbc, + sizeof(struct ips_message_header), + payload, paylen + + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); + + /* If we have a payload, we need to copy it inline to a single element to + * ensure that the driver copies it out completely as part of the writev + * call since the payload can be stack-allocated memory. + */ + if (paylen > 0) { + uint32_t len = sizeof(struct ips_pbc_header) + + paylen + (have_cksum ? 
PSM_CRC_SIZE_IN_BYTES : 0); + struct ips_pbc_header *pbc_hdr = alloca(len); + + if_pf (pbc_hdr == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", len); + goto fail; + } + + psmi_mq_mtucpy(pbc_hdr, pbc_hdr_i, sizeof(struct ips_pbc_header)); + psmi_mq_mtucpy(pbc_hdr+1, payload, paylen); + + if (have_cksum) { + uint32_t *ckptr = (uint32_t*) ((uint8_t*) pbc_hdr + + (len - PSM_CRC_SIZE_IN_BYTES)); + *ckptr = cksum; + ckptr++; + *ckptr = cksum; + } + + iovec.iov_base = pbc_hdr; + iovec.iov_len = len; + ret = ipath_cmd_writev(proto->fd, &iovec, 1); + } + else { + uint32_t len = sizeof(struct ips_pbc_header) + + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0); + struct ips_pbc_header *pbc_hdr = have_cksum ? alloca(len) : pbc_hdr_i; + + if_pf (pbc_hdr == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", len); + goto fail; + } + + if (have_cksum) { + uint32_t *ckptr = (uint32_t*) (pbc_hdr + 1); + psmi_mq_mtucpy(pbc_hdr, pbc_hdr_i, sizeof(struct ips_pbc_header)); + *ckptr = cksum; + ckptr++; + *ckptr = cksum; + } + + iovec.iov_base = pbc_hdr; + iovec.iov_len = len; + ret = ipath_cmd_writev(proto->fd, &iovec, 1); + } + + if (ret > 0) { + /* Even though we won't care about a completion in this frame send, we + * still increment the iovec packet counter */ + proto->iovec_cntr_next_inflight += ret; + err = PSM_OK; + psmi_assert_always(ret == 1); + } + else { + /* + * ret == 0: Driver did not queue packet. Try later. + * ENOMEM: No kernel memory to queue request, try later? * + * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (ret == 0 || errno == ENOMEM || errno == ECOMM || errno == EINTR) + err = PSM_EP_NO_RESOURCES; + else + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unhandled error in writev(): %s (fd=%d,iovec=%p,len=%d)", + strerror(errno), proto->fd, &iovec, 1); + } + +fail: + return err; +} + +/* + * Caller still expects num_sent to always be correctly set in case of an + * error. + * + * Recoverable errors: + * PSM_OK: At least one packet was successfully queued up for DMA. + * PSM_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * PSM_OK_NO_PROGRESS: Cable pulled. + * + * Unrecoverable errors: + * PSM_EP_DEVICE_FAILURE: Error calling ipath_sdma_inflight() or unexpected + * error in calling writev(), or chip failure, rxe/txe + * parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... 
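+ *
+ * Each scb expands into at most three iovec entries (pbc plus header,
+ * payload, and tid session info for multi-fragment expected sends),
+ * which is why the iovec array below is sized at 3*num.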
+ */ +static +psm_error_t __sendpath +scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int num, int *num_sent) +{ + ssize_t ret; + struct ips_scb *scb = SLIST_FIRST(slist); + unsigned int vec_idx = 0, scb_idx = 0, scb_sent = 0; + unsigned int max_elem; + struct iovec *iovec; + psm_error_t err = PSM_OK; + uint32_t cksum; + + psmi_assert(num > 0); + psmi_assert(scb != NULL); + + /* See comments above for fault injection */ + if_pf (dma_do_fault()) + goto fail; + + max_elem = 3*num; + iovec = alloca(sizeof(struct iovec) * max_elem); + + if_pf (iovec == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", + (int)(sizeof(struct iovec) * max_elem)); + goto fail; + } + +writev_again: + vec_idx = 0; + + SLIST_FOREACH(scb, slist, next) { + /* Can't exceed posix max writev count */ + if (vec_idx + (int) !!(scb->payload_size > 0) >= UIO_MAXIOV) + break; + + psmi_assert(vec_idx < max_elem); + psmi_assert_always((scb->payload_size & 0x3) == 0); + + /* Checksum all eager packets */ + cksum = ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (scb->tid == IPATH_EAGER_TID_ID) && + (scb->ips_lrh.mqhdr != MQ_MSG_DATA_BLK) && + (scb->ips_lrh.mqhdr != MQ_MSG_DATA_REQ_BLK)); + + ips_proto_pbc_update(proto, flow, PSMI_FALSE, &scb->pbc, + sizeof(struct ips_message_header), + scb->payload, + scb->payload_size + + (cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); + + iovec[vec_idx].iov_base = &scb->pbc; + iovec[vec_idx].iov_len = sizeof(struct ips_message_header) + + sizeof(union ipath_pbc); + vec_idx++; + + if (scb->payload_size > 0) { + /* + * Payloads must be 4-byte aligned. If not, we need a bounce + * buffer for them. This should be rare, but may be a performance + * penalty, so we log it as a stat in case we need to narrow in + * on a performance problem. + * + * If checksum is enabled use a bounce buffer. + */ + if ((((uintptr_t) scb->payload) & 0x3) || cksum) { + void *buf = scb->payload; + uint32_t len = scb->payload_size; + + if (scb->nfrag > 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "buffer alignment for sdma error"); + goto fail; + } + + /* Only allocate buffer if current buffer is a user buffer */ + if (!((scb->payload >= scb->scbc->sbuf_buf_base) && + (scb->payload <= scb->scbc->sbuf_buf_last))){ + + if (!ips_scbctrl_bufalloc(scb)) { + err = PSM_EP_NO_RESOURCES; + if (--vec_idx == 0) /* Remove header, nothing to send */ + goto fail; + else /* send what we have so far, but no more */ + break; + } + + /* Only need to copy if bounce buffer is used. */ + psmi_mq_mtucpy(scb->payload, buf, len); + scb->payload_size = len; + } + + /* If checksum then update checksum */ + if (cksum) { + uint32_t *ckptr = (uint32_t*) ((uint8_t*) scb->payload + len); + *ckptr = scb->cksum; + ckptr++; + *ckptr = scb->cksum; + } + + if (((uintptr_t) buf) & 0x3) + proto->stats.send_dma_misaligns++; + } + + iovec[vec_idx].iov_base = scb->payload; + iovec[vec_idx].iov_len = scb->payload_size + + (cksum ? PSM_CRC_SIZE_IN_BYTES : 0); + vec_idx++; + + _IPATH_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n", + scb->seq_num.psn, + iovec[vec_idx-2].iov_base, (int) iovec[vec_idx-2].iov_len, + iovec[vec_idx-1].iov_base, (int) iovec[vec_idx-1].iov_len); + + /* + * if there are multiple frag payload, set the right frag size. 
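+ * For a multi-fragment (nfrag > 1) send, frag_size is passed to the
+ * driver in the pbc fill1 word so the payload can be carved into
+ * packets of that size; tidinfo, when present, rides as one more iovec
+ * entry.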
+ */ + if (scb->nfrag > 1) { + scb->pbc.fill1 = __cpu_to_le16(scb->frag_size); + + /* give tidinfo to qib driver */ + if (scb->tidsendc) { + iovec[vec_idx].iov_base = scb->tsess; + iovec[vec_idx].iov_len = scb->tsess_length; + vec_idx++; + } + } + } + else { + /* If checksum enabled need to send checksum at end of header + * as we have no payload. + */ + if (cksum) { + char *pbc_hdr = alloca(iovec[vec_idx-1].iov_len + + PSM_CRC_SIZE_IN_BYTES); + uint32_t *ckptr = (uint32_t*) + ((uint8_t*) pbc_hdr + iovec[vec_idx-1].iov_len); + + psmi_mq_mtucpy(pbc_hdr, iovec[vec_idx-1].iov_base,iovec[vec_idx-1].iov_len); + *ckptr = scb->cksum; + ckptr++; + *ckptr = scb->cksum; + + iovec[vec_idx-1].iov_base = pbc_hdr; + iovec[vec_idx-1].iov_len += PSM_CRC_SIZE_IN_BYTES; + + } + + _IPATH_VDBG("hdr=%p,%d\n", + iovec[vec_idx-1].iov_base, (int) iovec[vec_idx-1].iov_len); + } + + /* Can bound the number to send by 'num' */ + if (++scb_idx == num) + break; + } + psmi_assert(vec_idx > 0); + ret = ipath_cmd_writev(proto->fd, iovec, vec_idx); + + /* + * Successfully wrote entire vector + */ + if (ret == scb_idx) { + scb_sent += ret; + /* scbs are left if we didn't want to send less and didn't have + * to break out of scbctrl_bufalloc */ + if (scb != NULL && scb_idx < num && err == PSM_OK) + goto writev_again; + } + else { + if (ret < 0) { + uint32_t cntr_fini; + + /* ENOMEM: No kernel memory to queue request, try later? + * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (errno == ENOMEM || errno == ECOMM || errno == EINTR) { + err = psmi_context_check_status( + (const psmi_context_t *) &proto->ep->context); + if (err == PSM_OK) + err = PSM_EP_NO_RESOURCES; + } + else { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unexpected error in writev(): %s (errno=%d) " + "(fd=%d,iovec=%p,len=%d)", strerror(errno), errno, + proto->fd, iovec, vec_idx); + goto fail; + } + /* Find out the latest packet that we were able to put in flight */ + if (ipath_sdma_inflight(proto->ptl->context->ctrl, &cntr_fini) < 0) + { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unable to retrieve inflight sdma counter: %s", + strerror(errno)); + goto fail; + } + + /* Re-write ret to actual inflight count */ + scb_sent += cntr_fini - proto->iovec_cntr_next_inflight; + } + else { + /* No need for inflight system call, we can infer it's value from + * writev's return value */ + scb_sent += ret; + } + } + +fail: + *num_sent = scb_sent; + psmi_assert(*num_sent <= num && *num_sent >= 0); + return err; +} + +/* + * Because we only lazily reap send dma completions, it's possible that we + * receive a packet's remote acknowledgement before seeing that packet's local + * completion. As part of processing ack packets and releasing scbs, we issue + * a wait for the local completion if the scb is marked as having been sent via + * send dma. + */ +psm_error_t __sendpath +ips_proto_dma_wait_until(struct ips_proto *proto, uint32_t dma_cntr) +{ + psm_error_t err = PSM_OK; + int spin_cnt = 0; + int did_yield = 0; + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); + + if ((int) proto->iovec_cntr_last_completed - (int) dma_cntr >= 0) + return PSM_OK; + + PSMI_PROFILE_BLOCK(); + + while ((int) proto->iovec_cntr_last_completed - (int) dma_cntr < 0) + { + if (spin_cnt++ == proto->ep->yield_spin_cnt) { + /* Have to yield holding the PSM lock, mostly because we don't + * support another thread changing internal state at this point in + * the code. 
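+ * sched_yield() runs at most once per wait, and writev_compl_delay
+ * below counts how many waits had to yield. The signed 32-bit
+ * difference in the loop condition keeps the comparison correct across
+ * counter wraparound.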
+ */ + did_yield = 1; + sched_yield(); + } + + /* Not there yet in completion count. Update our view of + * last_completed. */ + if (ipath_sdma_complete(proto->ptl->context->ctrl, + &proto->iovec_cntr_last_completed) == -1) + { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "unable to retrieve completion sdma counter: %s", + strerror(errno)); + break; + } + } + + if (did_yield) + proto->stats.writev_compl_delay++; + + PSMI_PROFILE_UNBLOCK(); + + return err; +} + +#define ERRCHK_NOT_SERIALIZED 1 + +psm_error_t +ips_proto_timer_ack_callback(struct psmi_timer *current_timer, uint64_t current) +{ + struct ips_flow *flow = (struct ips_flow *) current_timer->context; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ipsaddr->proto; + uint64_t t_cyc_next = get_cycles(); + ips_scb_t *scb; + + if (STAILQ_EMPTY(&flow->scb_unacked)) + return PSM_OK; + + scb = STAILQ_FIRST(&flow->scb_unacked); + + if (current >= scb->abs_timeout) { + int done_local; + +#if ERRCHK_NOT_SERIALIZED + /* We have to ensure that the send is at least locally complete before + * sending an error check or else earlier data can get to the + * destination *after* we pio this err_chk. + */ + if (flow->transfer == PSM_TRANSFER_DMA) { + uint32_t dma_cntr; + uint32_t scb_cntr = + STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->dma_ctr; + done_local = + (ipath_sdma_complete(proto->ptl->context->ctrl, &dma_cntr) > 0 && + ((int) dma_cntr - (int) scb_cntr >= 0)); + if (!done_local) + proto->stats.writev_compl_eagain++; + } + else + done_local = 1; /* Always done for PIO flows */ +#else + done_local = 1; /* Otherwise always done */ +#endif + + scb->ack_timeout = + min(scb->ack_timeout * flow->path->epr_timeout_ack_factor, + flow->path->epr_timeout_ack_max); + scb->abs_timeout = t_cyc_next + scb->ack_timeout; + + if (done_local) { + _IPATH_VDBG("sending err_chk flow=%d with first=%d,last=%d\n", + flow->flowid, STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn, + STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->seq_num.psn); + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) + ips_proto_send_ctrl_message(flow, + OPCODE_ERR_CHK_GEN, + &scb->tidsendc->ctrl_msg_queued, + NULL); + else + ips_proto_send_ctrl_message(flow, + OPCODE_ERR_CHK, + &flow->ipsaddr->ctrl_msg_queued, + NULL); + } + + t_cyc_next = get_cycles() + scb->ack_timeout; + } + else + t_cyc_next += (scb->abs_timeout - current); + + psmi_timer_request(proto->timerq, current_timer, t_cyc_next); + + return PSM_OK; +} + +psm_error_t +ips_proto_timer_send_callback(struct psmi_timer *current_timer, uint64_t current) +{ + struct ips_flow *flow = (struct ips_flow *) current_timer->context; + + /* If flow is marked as congested adjust injection rate - see process nak + * when a congestion NAK is received. 
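+ * Clearing the flag and raising the path's CCTI by the service level's
+ * ccti_increase (bounded by ccti_limit) lowers the injection rate; see
+ * ips_cca_adjust_rate() below.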
+ */ + if_pf (flow->flags & IPS_FLOW_FLAG_CONGESTED) { + struct ips_proto *proto = flow->ipsaddr->proto; + + /* Clear congestion flag and decrease injection rate */ + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + } + + flow->fn.xfer.flush(flow, NULL); + return PSM_OK; +} + +psm_error_t +ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment) +{ + struct ips_proto *proto = path_rec->proto; + uint16_t prev_ipd, prev_divisor; + + /* Increment/decrement ccti for path */ + psmi_assert_always(path_rec->epr_ccti >= path_rec->epr_ccti_min); + path_rec->epr_ccti += cct_increment; + + /* Determine new active IPD. */ + prev_ipd = path_rec->epr_active_ipd; + prev_divisor = path_rec->epr_cca_divisor; + if ((path_rec->epr_static_ipd) && + ((path_rec->epr_static_ipd + 1) > + (proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK))) { + path_rec->epr_active_ipd = path_rec->epr_static_ipd + 1; + path_rec->epr_cca_divisor = 0; + } + else { + path_rec->epr_active_ipd = proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK; + path_rec->epr_cca_divisor = + proto->cct[path_rec->epr_ccti] >> CCA_DIVISOR_SHIFT; + } + + _IPATH_CCADBG("CCA: %s injection rate to <%x.%x> from <%x.%x>\n", (cct_increment > 0) ? "Decreasing" : "Increasing", path_rec->epr_cca_divisor, path_rec->epr_active_ipd, prev_divisor, prev_ipd); + + /* Reschedule CCA timer if this path is still marked as congested */ + if (path_rec->epr_ccti > path_rec->epr_ccti_min) { + psmi_timer_request(proto->timerq, + &path_rec->epr_timer_cca, + get_cycles() + + proto->cace[path_rec->epr_sl].ccti_timer_cycles); + } + + return PSM_OK; +} + +psm_error_t +ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current) +{ + ips_path_rec_t *path_rec = (ips_path_rec_t *) current_timer->context; + + /* Increase injection rate for flow. Decrement CCTI */ + if (path_rec->epr_ccti > path_rec->epr_ccti_min) + return ips_cca_adjust_rate(path_rec, -1); + else + return PSM_OK; +} diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h new file mode 100644 index 0000000..12f55f1 --- /dev/null +++ b/ptl_ips/ips_proto.h @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PROTO_H +#define _IPS_PROTO_H + +#include "psm_user.h" + +#include "ips_recvhdrq.h" +#include "ips_tid.h" +#include "ips_scb.h" +#include "ips_epstate.h" +#include "ips_spio.h" +#include "ips_stats.h" +#include "ips_proto_am.h" +#include "ips_tidflow.h" +#include "ips_path_rec.h" + +typedef enum ips_path_type { + IPS_PATH_LOW_PRIORITY, + IPS_PATH_NORMAL_PRIORITY, + IPS_PATH_HIGH_PRIORITY, + IPS_PATH_MAX_PRIORITY +} ips_path_type_t; + +/* + * Local Endpoint info. + * + * Contains information necessary for composing packets for the local endpoint + */ +struct ips_epinfo { + uint32_t ep_baseqp; + uint16_t ep_base_lid; + uint8_t ep_lmc; + uint8_t ep_pad; + ibta_rate ep_link_rate; + uint16_t ep_context; + uint16_t ep_subcontext; + uint16_t ep_hca_type; + uint16_t ep_sl; /* IPATH_SL only when path record not used */ + uint16_t ep_unit; + uint16_t ep_mtu; + uint16_t ep_piosize; + uint16_t ep_hdrq_msg_size; + uint16_t ep_pkey; /* PSM_PKEY only when path record not used */ + uint64_t ep_timeout_ack; /* PSM_ERRCHK_TIMEOUT if no path record */ + uint64_t ep_timeout_ack_max; + uint32_t ep_timeout_ack_factor; +}; + +/* + * Remote Endpoint info. + * + * Contains information necessary for composing packets for a remote endpoint + */ +#define IPS_MAX_PATH_LMC 3 +struct ips_epinfo_remote { + uint32_t epr_qp; /* qp+context encoding */ + uint32_t epr_commidx_to; + uint32_t epr_commidx_from; + uint16_t epr_piosize; + uint16_t epr_context; /* Real context value */ + uint16_t epr_subcontext; + uint8_t epr_hca_type; + uint8_t epr_hpp_index; + + /* For LMC/Torus keep list of base and max dlid. Used for pkt verification */ + uint16_t epr_base_lid; + uint16_t epr_pkt_context; /* Context encoding in packet header */ + uint16_t epr_max_lid; + uint8_t epr_num_paths[IPS_PATH_MAX_PRIORITY]; + uint8_t epr_next_path[IPS_PATH_MAX_PRIORITY]; + ips_path_rec_t *epr_path[IPS_PATH_MAX_PRIORITY][1 << IPS_MAX_PATH_LMC]; +}; + +/* + * Control messages. + * + * ips low-level control messages to ensure reliability of eager packets. + * + */ +struct ips_proto; +psm_error_t +ips_proto_init(const psmi_context_t *context, + const struct ptl *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, /* PTL's timerq */ + const struct ips_epstate *epstate, /* PTL's epstate */ + const struct ips_spio *spioc, /* PTL's spio control */ + struct ips_proto *proto); /* output protocol */ + +psm_error_t ips_proto_fini(struct ips_proto *proto, int force, + uint64_t timeout); + +/* + * For writev support, we need to pass the pbc along with the message header + */ +struct ips_pbc_header { + union ipath_pbc pbc; + struct ips_message_header hdr; +} PSMI_CACHEALIGN; + +/* + * Control message structures + */ +#define CTRL_MSG_QEUEUE_SIZE 32 /* power of two */ + +struct ips_proto_ctrl_message { + struct ips_pbc_header pbc_hdr; + uint8_t _hdr_uwords[IPS_HEADER_QUEUE_UWORDS_MAX<<2]; +} PSMI_CACHEALIGN; + +/* Control messages saved in the control queue. Even though we only + * always send 2 ptl_args on the wire, some message types will save + * more than 16 bytes in arguments. 
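+ * A queue slot is free when its ipsaddr is NULL; the queue holds
+ * CTRL_MSG_QEUEUE_SIZE entries, each keeping up to three ptl_arg_t
+ * arguments for the deferred message.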
+ */ +struct ips_flow; +struct ips_tid_recv_desc; + +struct ips_ctrlq_elem { + struct ptl_epaddr *ipsaddr; + uint8_t message_type; + uint8_t flowid; + uint16_t pad; + uint32_t *msg_queue_mask; + struct ips_flow *flow; + ptl_arg_t args[3]; +}; + +struct ips_ctrlq { + /* Queued control messages, queued when pio is busy */ + struct ips_proto *ctrlq_proto; + + int ctrlq_flowid; + uint32_t ctrlq_head; + uint32_t ctrlq_tail; + uint32_t ctrlq_overflow; + uint32_t ctrlq_never_enqueue; + + struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN; + struct psmi_timer ctrlq_timer; /* when in timerq */ +}; + +/* + * Connect/disconnect, as implemented by ips + */ +psm_error_t ips_proto_connect(struct ips_proto *proto, int numep, + const psm_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_in); + +int ips_proto_isconnected(struct ptl_epaddr *ipsaddr); + +/* + * Pending operation structures + */ +struct ips_pend_sreq { + STAILQ_ENTRY(ips_pend_sreq) next; + psm_mq_req_t req; + uint32_t type; +}; + +#define IPS_PENDSEND_EAGER_DATA 1 +#define IPS_PENDSEND_EAGER_REQ 2 +#define IPS_PENDSEND_EXP_TIDS 3 +#define IPS_PENDSEND_EXP_SENDS 4 + +STAILQ_HEAD(ips_pendsendq, ips_pend_sreq); + +struct ips_pend_sends { + struct ips_proto *proto; /* back ptr */ + struct psmi_timer timer; + struct ips_pendsendq pendq; +}; + +/* + * One instance of the protocol + */ + +struct ips_protoexp; + +struct ips_proto_stats { + uint64_t pio_busy_cnt; + uint64_t writev_busy_cnt; + uint64_t writev_compl_eagain; + uint64_t writev_compl_delay; + uint64_t scb_egr_unavail_cnt; + uint64_t scb_exp_unavail_cnt; + uint64_t hdr_overflow; + uint64_t egr_overflow; + uint64_t lid_zero_errs; + uint64_t unknown_packets; + uint64_t stray_packets; + uint64_t send_dma_misaligns; +}; + +struct ips_proto_error_stats { + uint64_t num_icrc_err; + uint64_t num_vcrc_err; + uint64_t num_ecc_err; + uint64_t num_len_err; + uint64_t num_mtu_err; + uint64_t num_khdr_err; + uint64_t num_tid_err; + uint64_t num_mk_err; + uint64_t num_ib_err; +}; + +// OPP support structure. +struct opp_api { + void* (*op_path_find_hca)(const char*name, void **device); + void* (*op_path_open)(void *device, int port_num); + void (*op_path_close)(void *context); + int (*op_path_get_path_by_rec)(void *context, ibta_path_rec_t *query, ibta_path_rec_t *response); + /* TODO: Need symbol to ibv_close_device. 
*/
+};
+
+struct ips_ibta_compliance_fn {
+ psm_error_t (*get_path_rec)(struct ips_proto *proto, uint16_t slid,
+ uint16_t dlid, uint16_t desthca_type,
+ unsigned long timeout,
+ ips_epaddr_t *ipsaddr);
+ psm_error_t (*fini)(struct ips_proto *proto);
+};
+
+typedef enum ptl_epaddr_flow {
+ EP_FLOW_GO_BACK_N_PIO,
+ EP_FLOW_GO_BACK_N_DMA,
+ EP_FLOW_GO_BACK_N_AM_REQ,
+ EP_FLOW_GO_BACK_N_AM_RSP,
+ EP_FLOW_LAST /* Keep this the last endpoint flow */
+} ptl_epaddr_flow_t;
+
+struct ips_proto {
+ struct ptl *ptl; /* cached */
+ psm_ep_t ep; /* cached, for errors */
+ psm_mq_t mq; /* cached, for mq handling */
+ int fd; /* cached, for writev ops */
+
+ /* Pending sends */
+ struct ips_pend_sends pend_sends;
+ struct ips_epstate *epstate;
+ struct psmi_timer_ctrl *timerq;
+
+ struct ips_protoexp *protoexp;
+ struct ips_scbctrl *scbc_rv;
+ struct ips_spio *spioc;
+ struct ips_scbctrl scbc_egr;
+ struct ips_epinfo epinfo;
+ uint64_t timeout_send;
+ uint32_t flags;
+ uint32_t iovec_cntr_next_inflight;
+ uint32_t iovec_cntr_last_completed;
+ uint32_t iovec_thresh_eager;
+ uint32_t iovec_thresh_eager_blocking;
+ uint32_t scb_max_sdma;
+ uint32_t scb_bufsize;
+ uint16_t scb_max_inflight;
+ uint16_t flow_credits;
+ mpool_t pend_sends_pool;
+ struct ips_ibta_compliance_fn ibta;
+ struct ips_proto_stats stats;
+ struct ips_proto_error_stats error_stats;
+
+ struct ips_proto_am proto_am;
+
+ struct ips_ctrlq ctrlq[EP_FLOW_LAST];
+
+ /* Handling tid errors */
+ uint32_t tiderr_cnt;
+ uint32_t tiderr_max;
+ uint64_t tiderr_tnext;
+ uint64_t tiderr_warn_interval;
+ uint32_t tiderr_context_tid_off;
+ psm_epid_t tiderr_epid;
+
+ uint64_t t_init;
+ uint64_t t_fini;
+ uint32_t runid_key;
+
+ int num_connected_to;
+ int num_connected_from;
+ int num_disconnect_requests;
+
+ /* Misc state variables. */
+ /* Smallest interval in cycles between warnings about stray messages.
+ * This is a per-endpoint quantity, overridable with
+ * PSM_STRAY_WARN_INTERVAL. We use the same interval to send the "die"
+ * message. */
+ uint64_t stray_warn_interval;
+ int done_warning;
+ int done_once;
+ int num_bogus_warnings;
+ struct {
+ uint32_t interval_secs;
+ uint64_t next_warning;
+ uint64_t count;
+ } psmi_logevent_tid_send_reqs;
+
+ /* SL2VL table for protocol */
+ int sl2vl[16];
+
+ /* CCA per port */
+ uint16_t *cct; /* cct table */
+ uint16_t ccti_size; /* ccti table size */
+ uint16_t ccti_limit; /* should be <= size-1 */
+
+ uint16_t ccti_portctrl; /* QP or SL CC */
+ uint16_t ccti_ctrlmap; /* map for valid sl */
+ struct cace { /* CACongestionEntry */
+ uint8_t ccti_increase; /* steps to increase */
+ //uint16_t ccti_timer; /* CCTI Timer in units of 1.024 usec */
+ uint64_t ccti_timer_cycles; /* converted via us_2_cycles() */
+ uint8_t ccti_threshold; /* threshold at which to log */
+ uint8_t ccti_min; /* min value for ccti */
+ } cace[16]; /* 16 service levels */
+
+ /* Path record support */
+ uint8_t ips_ipd_delay[IBTA_RATE_120_GBPS + 1];
+ struct hsearch_data ips_path_rec_hash;
+ void *opp_lib;
+ void *hndl;
+ void *device;
+ void *opp_ctxt;
+ struct opp_api opp_fn;
+
+/*
+ * Control message queue for pending messages.
+ *
+ * Control messages are queued as pending when no PIO is available for sending
+ * the message. They are composed on the fly and do not need buffering.
+ *
+ * Variables here are write once (at init) and read afterwards (except the msg
+ * queue overflow counters).
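+ *
+ * message_type_to_index maps a control opcode to the bit it occupies in
+ * a per-flow msg_queue_mask; message_type2index() strips the
+ * CTRL_MSG_QUEUE_ALWAYS flag so only the index bit remains.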
+ */ + uint32_t ctrl_msg_queue_overflow; + uint32_t ctrl_msg_queue_never_enqueue; + uint32_t message_type_to_index[256]; +#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)] & ~CTRL_MSG_QUEUE_ALWAYS) + +}; + +/* + * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init + */ +struct ptl_epaddr_stats { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t nak_send; + uint64_t nak_recv; + uint64_t connect_req; + uint64_t disconnect_req; + uint64_t tids_grant_send; + uint64_t tids_grant_recv; + uint64_t send_rexmit; + uint64_t congestion_pkts; /* IB CCA FECN packets */ +}; + +/* + * Endpoint address, encapsulates per-endpoint protocol metadata + * + * Directly implements the ptl epaddr. + */ + +/* + * Flow index (6 bits) encodes the following: + * + * Protocol: 3 bits + * Flow Index: 3 bits + * + * Currently only two protocols supported: Go Back N (the "original" flow) + * and the TIDFLOW. We may look at adding other protocols like + * Selective ACK and maybe even STCP. + * + * The Flow index is protocol specific. For a Go Back N protocol this usually + * refers to the index of the flow between two endpoints. For TIDFLOWS + * this is not currently used. + */ + +#define IPS_MAX_PROTOCOL 8 +#define IPS_MAX_FLOWINDEX 8 + +#define IPS_FLOWID_PACK(protocol,flowindex) \ + ( ((((uint16_t)protocol)&0x7) << 3) | \ + (((uint16_t)flowindex)&0x7) ) + +#define IPS_FLOWID_GET_PROTO(flow) (((flow)>>3)&0x7) +#define IPS_FLOWID_GET_INDEX(flow) ((flow) % 4) + +#define IPS_FLOWID2INDEX(flow) \ + ((flow)&0x7) + +typedef void (*ips_flow_enqueue_fn_t)(struct ips_flow *flow, ips_scb_t *scb); +typedef psm_error_t (*ips_flow_flush_fn_t)(struct ips_flow *, int *nflushed); +typedef void (*ips_flow_nak_postprocess_fn_t)(struct ips_flow *, struct ips_message_header *p_hdr); + +typedef enum psm_transfer_type { + PSM_TRANSFER_PIO, + PSM_TRANSFER_DMA, + PSM_TRANSFER_LAST /* Keep this the last transfer type */ +} psm_transfer_type_t; + +typedef enum psm_protocol_type { + PSM_PROTOCOL_GO_BACK_N, + PSM_PROTOCOL_TIDFLOW, + PSM_PROTOCOL_LAST /* Keep this the last protocol type */ +} psm_protocol_type_t; + +struct ips_transfer_fn { + /* Functions dealing with enqueuing and flushing scbs to the network */ + ips_flow_enqueue_fn_t enqueue; + ips_flow_flush_fn_t flush; +}; + +struct ips_protocol_fn { + /* FLOW_ADD: Other functions for is_valid etc. 
*/
+ ips_flow_nak_postprocess_fn_t nak_post_process;
+};
+
+struct ips_flow_fn {
+ struct ips_transfer_fn xfer;
+ struct ips_protocol_fn protocol;
+};
+
+#define PIO_TRANSFER_FUNCTIONS { \
+ .enqueue = ips_proto_flow_enqueue, \
+ .flush = ips_proto_flow_flush_pio \
+}
+
+#define DMA_TRANSFER_FUNCTIONS { \
+ .enqueue = ips_proto_flow_enqueue, \
+ .flush = ips_proto_flow_flush_dma \
+}
+
+#define GO_BACK_N_PROTOCOL_FUNCTIONS { \
+ .nak_post_process = NULL \
+}
+
+#define TIDFLOW_PROTOCOL_FUNCTIONS { \
+ .nak_post_process = ips_tidflow_nak_post_process \
+}
+
+struct ips_flow {
+ SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */
+ struct ips_flow_fn fn;
+
+ struct ptl_epaddr *ipsaddr; /* back pointer, remote endpoint */
+ struct ips_epinfo *epinfo; /* back pointer, local epinfo */
+ ips_path_rec_t *path; /* Path to use for flow */
+ psm_transfer_type_t transfer;
+ psm_protocol_type_t protocol;
+
+ uint32_t flowid;
+ uint32_t frag_size;
+ uint16_t flags;
+ uint16_t sl;
+ uint16_t cca_ooo_pkts;
+ uint16_t credits; /* Current credits available to send on flow */
+ uint16_t cwin; /* Size of congestion window */
+ uint16_t ack_interval;
+ uint16_t msg_ooo_toggle; /* toggle for OOO message */
+ uint16_t msg_ooo_seqnum; /* seqnum for OOO message */
+
+ psmi_seqnum_t xmit_seq_num;
+ psmi_seqnum_t xmit_ack_num;
+ psmi_seqnum_t recv_seq_num;
+ psmi_seqnum_t last_seq_num;
+
+ uint32_t scb_num_pending;
+ uint32_t scb_num_unacked;
+
+ psmi_timer timer_send; /* timer for frames that got a busy PIO */
+ psmi_timer timer_ack; /* timer for unacked frames */
+
+ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked;
+ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend;
+};
+
+struct ptl_epaddr {
+ struct ptl *ptl; /* cached */
+ psm_epaddr_t epaddr; /* back pointer to psm top-level epaddr */
+ struct ips_proto *proto; /* back pointer to protocol */
+ psm_mq_t mq; /* cached */
+
+ uint16_t flags; /* per-endpoint flags */
+ struct ips_epinfo_remote epr; /* remote endpoint params */
+ struct ips_flow flows[EP_FLOW_LAST] PSMI_CACHEALIGN;
+ struct ips_flow tidgr_flow; /* tidflow */
+
+ uint32_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */
+ uint32_t delay_in_ms; /* used in close */
+ uint64_t s_timeout; /* used as a time in close */
+ int credit;
+
+ pthread_mutex_t sesslock;
+ struct ptl_epaddr_stats stats;
+
+ uint32_t runid_key;
+ uint16_t psm_verno;
+ uint16_t connect_verno; /* The lowest connect version we can support */
+ uint16_t cstate_to;
+ uint16_t cstate_from;
+ psm_error_t cerror_to;
+ psm_error_t cerror_from;
+}
+__attribute__((aligned(64)));
+
+
+/*
+ * Send support on scbs.
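+ *
+ * ips_proto_flow_enqueue() appends an scb to the flow's unacked queue
+ * and seeds the pending list; the flush functions then drain the
+ * pending list, subject to flow credits, through PIO or send DMA.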
+ *
+ */
+void ips_flow_init(struct ips_flow *flow, ips_path_rec_t *path,
+ ips_epaddr_t *ipsaddr,
+ psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index);
+
+void ips_scb_prepare_flow(ips_scb_t *scb, struct ips_epinfo *epinfo,
+ struct ips_epinfo_remote *epr, struct ips_flow *flow);
+
+void ips_proto_flow_enqueue(struct ips_flow *flow, ips_scb_t *scb);
+
+psm_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed);
+psm_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed);
+
+/* Wrapper for enqueue + flush */
+psm_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb);
+
+void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb);
+psm_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, ips_epaddr_t *ipsaddr,
+ int *nflushed);
+psm_error_t ips_proto_dma_wait_until(struct ips_proto *proto, uint32_t dma_ctr);
+psm_error_t ips_proto_dma_wait(struct ips_proto *proto, uint32_t dma_ctr,
+ uint32_t *dma_ctr_out);
+
+psm_error_t ips_dma_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow,
+ struct ips_pbc_header *pbc_hdr,
+ void *payload, uint32_t paylen,
+ uint32_t cksum);
+
+/* Special-case for expected sends */
+void ips_protoexp_scb_inflight(ips_scb_t *scb);
+
+/*
+ * Protocol receive processing
+ */
+/* NAK post processing for tidflows */
+void ips_tidflow_nak_post_process(struct ips_flow *flow,
+ struct ips_message_header *p_hdr);
+/* Actual receive processing is an inline in ips_proto_help.h */
+int ips_proto_process_packet_inner(struct ips_recvhdrq_event *rcv_ev);
+/* Error handling for unknown packets; a packet is unknown when its epid
+ * doesn't match in the epstate table */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev);
+/* Exposed for fastpath only */
+void ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev);
+/* Handling error cases */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * Protocol exception handling and frame dumps
+ */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len);
+void ips_proto_dump_err_stats(struct ips_proto *proto);
+void ips_proto_show_rhf_errors(const uint32_t *rhdr);
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg);
+void ips_proto_dump_frame(void *frame, int length, char *message);
+void ips_proto_dump_data(void *data, int data_length);
+void ips_proto_dump_eager(uint32_t *curr_rcv_hdr);
+
+/*
+ * Checksum of ips packets
+ */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to an MQ request. It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
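+ * The completion callback registered through
+ * ips_protoexp_tid_get_from_token() below is invoked with the
+ * caller-supplied context pointer once the transfer finishes.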
+ */ +typedef void (*ips_tid_completion_callback_t)(void *); + +psm_error_t ips_protoexp_init(const psmi_context_t *context, + const struct ips_proto *proto, + uint32_t protoexp_flags, + int num_of_send_bufs, + int num_of_send_desc, + struct ips_protoexp **protoexp_o); + +psm_error_t ips_protoexp_fini(struct ips_protoexp *protoexp); +void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev); + +void ips_protoexp_recv_unaligned_data(struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev); + +void ips_protoexp_tid_grant(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_tid_grant_ack(const struct ips_recvhdrq_event *rcv_ev); +int ips_protoexp_tid_release(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_tid_release_ack(const struct ips_recvhdrq_event *rcv_ev); + +int ips_protoexp_build_ctrl_message(struct ips_protoexp *protoexp, + struct ptl_epaddr *ipsaddr, + ptl_arg_t *args, + uint16_t *pkt_flags, + uint8_t opcode, void *payload); +psm_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc); + +/* + * Peer is waiting (blocked) for this request + */ +#define IPS_PROTOEXP_TIDGET_WAIT 0x1 +#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2 +psm_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, uint32_t length, + psm_epaddr_t epaddr, + uint32_t remote_tok, uint32_t flags, + ips_tid_completion_callback_t callback, + void *context); + +/* + * Matched-Queue processing and sends + */ +psm_error_t ips_proto_mq_push_eager_req(struct ips_proto *proto, + psm_mq_req_t req); +psm_error_t ips_proto_mq_push_eager_data(struct ips_proto *proto, + psm_mq_req_t req); + +int ips_proto_mq_handle_cts(struct ips_proto *proto, ptl_arg_t *args); + +int ips_proto_mq_handle_rts_envelope(psm_mq_t mq, int mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen); +int ips_proto_mq_handle_rts_envelope_outoforder(psm_mq_t mq, int mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen); + +psm_error_t ips_proto_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, + uint32_t len); + +psm_error_t ips_proto_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, + uint32_t len, void *context, psm_mq_req_t *req_o); + +int ips_proto_am(struct ips_recvhdrq_event *rcv_ev); + +/* IBTA feature related functions (path record, sl2vl etc.) */ +psm_error_t ips_ibta_init_sl2vl_table(struct ips_proto *proto); +psm_error_t ips_ibta_link_updown_event(struct ips_proto *proto); +psm_error_t ips_ibta_init(struct ips_proto *proto); +psm_error_t ips_ibta_fini(struct ips_proto *proto); + +#endif /* _IPS_PROTO_H */ diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c new file mode 100644 index 0000000..9f2bf18 --- /dev/null +++ b/ptl_ips/ips_proto_am.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_am_internal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define IPS_AMFLAG_ISTINY 1 + +struct ips_am_token { + struct psmi_am_token tok; + + /* ptl-specific token stuff */ + struct ips_proto_am *proto_am; +}; + +psm_error_t +ips_proto_am_init(struct ips_proto *proto, + int num_of_send_bufs, int num_of_send_desc, + uint32_t imm_size, struct ips_proto_am *proto_am) +{ + psm_error_t err = PSM_OK; + int send_buf_size = proto->scb_bufsize; + + proto_am->proto = proto; + proto_am->scbc_request = &proto->scbc_egr; + + if ((err = ips_scbctrl_init(&proto->ep->context, num_of_send_desc, + num_of_send_bufs, imm_size, send_buf_size, + NULL, NULL, &proto_am->scbc_reply))) + goto fail; +fail: + return err; +} + +psm_error_t +ips_proto_am_fini(struct ips_proto_am *proto_am) +{ + return PSM_OK; +} + +static +psm_error_t +am_short_reqrep(struct ips_proto_am *proto_am, ips_scb_t *scb, + struct ptl_epaddr *ipsaddr, + psm_amarg_t *args, int nargs, uint8_t sub_opcode, + void *src, size_t len, int flags, int pad_bytes) + +{ + int i, hdr_qwords = PSM_AM_HDR_QWORDS; + ptl_epaddr_flow_t flowid = ((sub_opcode == OPCODE_AM_REQUEST) || + (sub_opcode == OPCODE_AM_REQUEST_NOREPLY)) ? + EP_FLOW_GO_BACK_N_AM_REQ : EP_FLOW_GO_BACK_N_AM_RSP; + struct ips_flow *flow = &ipsaddr->flows[flowid]; + + _IPATH_VDBG("%s src=%p len=%d, nargs=%d\n", + ((sub_opcode == OPCODE_AM_REQUEST) || + (sub_opcode == OPCODE_AM_REQUEST_NOREPLY)) ? 
"req" : "rep", + src, (int) len, nargs); + + if (nargs == 1) { /* fastpath */ + scb->ips_lrh.data[0].u64w0 = args[0].u64w0; + hdr_qwords--; + } + else if (nargs > 1) { + /* Easily unrollable but leave as is in case we can increase qwords + * on the chip in the near future */ + for (i = 0; i < PSM_AM_HDR_QWORDS; i++, hdr_qwords--) + scb->ips_lrh.data[i].u64w0 = args[i].u64w0; + + if (nargs > PSM_AM_HDR_QWORDS) { + /* Slow case -- we don't have iovec and not enough space in the + * message header, so we have to copy the user's arguments even if + * the payload is marked ASYNC */ + uintptr_t bufp = (uintptr_t) scb->payload; + psmi_mq_mtucpy((void *) bufp, &args[PSM_AM_HDR_QWORDS], + sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS)); + bufp += sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS); + scb->payload_size = sizeof(psm_amarg_t) * (nargs-PSM_AM_HDR_QWORDS); + if (src != NULL && len > 0) { + psmi_mq_mtucpy((void *) bufp, src, len); + scb->payload_size += len; + } + scb->payload_size += pad_bytes; + scb->ips_lrh.hdr_dlen = pad_bytes; + goto send_scb; + } + } + + /* + * If small enough, try to stuff the message in a header only + */ + if (len <= (hdr_qwords<<3)) { /* can handle len == 0 */ + psmi_mq_mtucpy(&scb->ips_lrh.data[PSM_AM_HDR_QWORDS-hdr_qwords], src, len); + scb->payload_size = 0; + scb->ips_lrh.hdr_dlen = len; + scb->ips_lrh.amhdr_flags |= IPS_AMFLAG_ISTINY; + } + else { /* Whatever's left requires a separate payload */ + if (scb->payload == NULL) { /* Just attach the buffer */ + scb->payload = src; + } + else { /* May need to re-xmit user data, keep it around */ + psmi_mq_mtucpy(scb->payload, src, len); + } + scb->payload_size = len + pad_bytes; + scb->ips_lrh.hdr_dlen = pad_bytes; + } + +send_scb: + scb->ips_lrh.sub_opcode = sub_opcode; + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + return PSM_OK; +} + +static inline int +calculate_pad_bytes (struct ips_proto_am *proto_am, int nargs, size_t len) +{ + if ((nargs <= PSM_AM_HDR_QWORDS) && + (len <= ((PSM_AM_HDR_QWORDS - nargs) << 3))) + return 0; + else { + size_t arg_overflow = (nargs > PSM_AM_HDR_QWORDS) ? + (sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS)) : 0; + size_t cache_aligned_len = (len + arg_overflow + PSM_CACHE_LINE_BYTES-1) & + ~(PSM_CACHE_LINE_BYTES - 1); + if (cache_aligned_len <= proto_am->proto->scb_bufsize) + return cache_aligned_len - (len + arg_overflow); + else + return 0; + } +} + +static inline +void +ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs, + int pad_bytes, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + scb->completion_am = completion_fn; + scb->cb_param = completion_ctxt; + scb->ips_lrh.amhdr_hidx = handler; + scb->ips_lrh.hdr_dlen = pad_bytes; + scb->ips_lrh.amhdr_nargs = nargs; + scb->ips_lrh.amhdr_flags = 0; + if (completion_fn) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + return; +} + +psm_error_t +ips_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + struct ips_proto_am *proto_am = &epaddr->ptl->proto.proto_am; + psm_error_t err; + ips_scb_t *scb; + int pad_bytes = calculate_pad_bytes(proto_am, nargs, len); + int payload_sz = (nargs << 3) + pad_bytes; + + if_pt (!(flags & PSM_AM_FLAG_ASYNC)) + payload_sz += len; + + if (payload_sz > (PSM_AM_HDR_QWORDS << 3)) { + /* Payload can't fit in header - allocate buffer to carry data */ + int arg_sz = (nargs > PSM_AM_HDR_QWORDS) ? 
+ ((nargs - PSM_AM_HDR_QWORDS) << 3) : 0; + + /* len + pad_bytes + overflow_args */ + PSMI_BLOCKUNTIL(epaddr->ep,err, + ((scb = ips_scbctrl_alloc(proto_am->scbc_request, 1, + len + pad_bytes + arg_sz, + IPS_SCB_FLAG_ADD_BUFFER)) != NULL)); + } + else { + PSMI_BLOCKUNTIL(epaddr->ep,err, + ((scb = ips_scbctrl_alloc_tiny(proto_am->scbc_request)) != NULL)); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + + return am_short_reqrep(proto_am, scb, epaddr->ptladdr, args, nargs, + (flags & PSM_AM_FLAG_NOREPLY) ? + OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST, + src, len, flags, pad_bytes); +} + +psm_error_t +ips_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + ips_scb_t *scb; + struct ips_am_token *token = (struct ips_am_token *) tok; + struct ips_proto_am *proto_am = token->proto_am; + struct ptl_epaddr *ipsaddr = token->tok.epaddr_from->ptladdr; + int scb_flags = 0; + int pad_bytes = calculate_pad_bytes(proto_am, nargs, len); + + if (!token->tok.can_reply) { + /* Trying to reply for an AM request that did not expect a reply */ + _IPATH_ERROR("Invalid AM reply for request!"); + return PSM_AM_INVALID_REPLY; + } + + psmi_assert_always(ips_scbctrl_avail(&proto_am->scbc_reply)); + + if ((nargs<<3) + len <= (PSM_AM_HDR_QWORDS<<3)) { + psmi_assert_always(pad_bytes == 0); + scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply); + } + else { + int payload_sz = (nargs << 3) + pad_bytes; + + payload_sz += (flags & PSM_AM_FLAG_ASYNC) ? 0 : len; + scb_flags |= (payload_sz > (PSM_AM_HDR_QWORDS << 3)) ? + IPS_SCB_FLAG_ADD_BUFFER : 0; + + scb = ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz, scb_flags); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + am_short_reqrep(proto_am, scb, ipsaddr, args, nargs, OPCODE_AM_REPLY, + src, len, flags, pad_bytes); + return PSM_OK; +} + +/* Prepares and runs a handler from a receive event. */ +static int +ips_am_run_handler(struct ips_am_token *tok, + const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; + psm_am_handler_fn_t hfn; + + int nargs = p_hdr->amhdr_nargs; + tok->tok.flags = p_hdr->amhdr_flags; + tok->tok.epaddr_from = rcv_ev->ipsaddr->epaddr; + tok->tok.can_reply = (p_hdr->sub_opcode == OPCODE_AM_REQUEST); + tok->proto_am = proto_am; + + hfn = psm_am_get_handler_function(rcv_ev->proto->ep, + p_hdr->amhdr_hidx); + _IPATH_VDBG("amhdr_len=%d, amhdr_flags=%x, amhdr_nargs=%d, p_hdr=%p\n", + p_hdr->hdr_dlen, p_hdr->amhdr_flags, p_hdr->amhdr_nargs, p_hdr); + + /* Fast path: everything fits only in a header */ + if (tok->tok.flags & IPS_AMFLAG_ISTINY) { + return hfn(tok, tok->tok.epaddr_from, + (psm_amarg_t *) &p_hdr->data[0].u64, nargs, + &p_hdr->data[nargs].u64, p_hdr->hdr_dlen); + } + else { + /* Arguments and payload may split across header/eager_payload + * boundaries. 
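+ * The first PSM_AM_HDR_QWORDS arguments travel in the header; overflow
+ * arguments occupy the start of the eager payload, so paylen is
+ * trimmed by 8 bytes per overflow argument, and by hdr_dlen pad bytes,
+ * before the handler runs.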
*/ + psm_amarg_t args[8] = {}; + int i; + uint64_t *payload = (uint64_t *) ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + for (i = 0; i < nargs; i++) { + if (i < PSM_AM_HDR_QWORDS) + args[i].u64 = p_hdr->data[i].u64; + else { + args[i].u64 = *payload++; + paylen -= 8; + } + } + + paylen -= p_hdr->hdr_dlen; + return hfn(tok, tok->tok.epaddr_from, args, nargs, payload, paylen); + } +} + +int +ips_proto_am(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_am_token token; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ptl_epaddr *ipsaddr = rcv_ev->ipsaddr; + struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + +/* + * Based on AM request/reply traffic pattern, if we don't have + * a reply scb slot then we can't process the request packet, + * we just silently drop it. Otherwise, it will be a deadlock. + * note: ips_proto_is_expected_or_nak() can not be called in this case. + */ + if (p_hdr->sub_opcode == OPCODE_AM_REQUEST && + !ips_scbctrl_avail(&proto_am->scbc_reply)) { + proto_am->amreply_nobufs++; + return ret; + } + + if (ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) { + /* run handler */ + if (ips_am_run_handler(&token, rcv_ev)) + ret = IPS_RECVHDRQ_BREAK; + + /* Look if the handler replied, if it didn't, ack the request */ + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + ips_proto_process_ack(rcv_ev); + return ret; +} diff --git a/ptl_ips/ips_proto_am.h b/ptl_ips/ips_proto_am.h new file mode 100644 index 0000000..9e9ad06 --- /dev/null +++ b/ptl_ips/ips_proto_am.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _IPS_PROTO_AM_H +#define _IPS_PROTO_AM_H + +#include "psm_user.h" +#include "ips_scb.h" + +#define PSM_AM_HDR_QWORDS 2 /* Needs to be at least 2 */ + +struct ips_proto_am { + struct ips_proto *proto; /* back pointer */ + struct ips_scbctrl *scbc_request; + struct ips_scbctrl scbc_reply; + + uint64_t amreply_nobufs; +}; + +psm_error_t +ips_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm_error_t +ips_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm_error_t ips_proto_am_init(struct ips_proto *proto, int num_of_send_bufs, + int num_of_send_desc, uint32_t imm_size, + struct ips_proto_am *proto_am); + +psm_error_t ips_proto_am_fini(struct ips_proto_am *proto_am); + +#endif /* _IPS_PROTO_AM_H */ diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c new file mode 100644 index 0000000..3e73de8 --- /dev/null +++ b/ptl_ips/ips_proto_connect.c @@ -0,0 +1,1639 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define COMMIDX_MAX 65535 /* Last valid communication idx is 65535 */ + +/* Connections are not pairwise but we keep a single 'epaddr' for messages from + * and messages to a remote 'epaddr'. 
State transitions for connecting TO and + * FROM 'epaddrs' are the following: + * Connect TO: + * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE + * + * Connect FROM (we receive a connect request) + * NONE -> ESTABLISHED -> NONE + */ +#define CSTATE_ESTABLISHED 1 +#define CSTATE_NONE 2 +#define CSTATE_TO_DISCONNECTED 3 +#define CSTATE_TO_WAITING 4 +#define CSTATE_TO_WAITING_DISC 5 + +#define IPS_CONNECT_VERNO 0x0201 /* major,major,minor,minor */ +#define BIG_ENDIAN_TEST_WORD 0xA5A5 + +/* We can use up to 16 bits of features; we only use 6 of them for now. */ +#define EP_FEATURES_ENDIAN_BIG 0x0001 +#define EP_FEATURES_ENDIAN_LITTLE 0x0002 +#define EP_FEATURES_BITWIDTH_32 0x0004 +#define EP_FEATURES_BITWIDTH_64 0x0008 +#define EP_FEATURES_RCVTHREAD 0x8000 +#define EP_FEATURES_MULTIFLOW 0x4000 + +#define EP_FEATURES_NODETYPE 0x0f + +struct connect_msghdr { + uint8_t opcode; + uint8_t _unused1; + + uint16_t connect_verno; /* be */ + uint16_t psm_verno; /* be */ + uint16_t phase; /* be connect/disconnect phase (unused now) */ + + uint16_t hca_type; /* HCA type of remote endpoint */ + uint16_t sl; /* Default SL request for remote endpoint */ + uint32_t _unused[1]; + + psm_uuid_t uuid; +}; +#define IPS_CONNECT_MSGHDR_SIZE 32 /* 16 + 16-byte-uuid */ + +struct ips_connect_reqrep { + struct connect_msghdr hdr; + uint32_t flags; /* unused */ + uint16_t connect_result; /* be */ + + /* Per-job info */ + uint32_t commidx; /* ignore if 0xffffffff */ + uint32_t runid_key; /* one-time stamp connect key */ + uint16_t job_pkey; /* (future use) */ + uint64_t _unused1[4]; + + /* Per-node characteristics */ + uint32_t features; /* be - endpoint desc (endian + bitwidth) */ + uint16_t hdrq_msg_size; /* where is the header/eager cutoff */ + uint16_t mtu; /* receive payload */ + char hostname[128]; /* always NULL-terminated */ + uint64_t _unused2[4]; + + uint8_t version_1_offset[0]; +}; + +/* Used for sanity checking in processing message arrivals */ +#define IPS_CONNECT_REQREP_MINIMUM_SIZE \ + (offsetof(struct ips_connect_reqrep, version_1_offset)) +#define IPS_MAX_CONNECT_PAYLEN 512 + +struct ips_disconnect_reqrep { + struct connect_msghdr hdr; + uint32_t flags; /* unused */ + + uint16_t mode; + uint16_t _unused1[3]; + uint64_t _unused2[4]; + uint8_t version_1_offset[0]; +}; +/* Used for sanity checking in processing message arrivals */ +#define IPS_DISCONNECT_REQREP_MINIMUM_SIZE \ + (offsetof(struct ips_disconnect_reqrep, version_1_offset)) + +const struct ips_transfer_fn psmi_xfer_fn[PSM_TRANSFER_LAST] = + { + PIO_TRANSFER_FUNCTIONS, + DMA_TRANSFER_FUNCTIONS + }; + +const struct ips_protocol_fn psmi_protocol_fn[PSM_PROTOCOL_LAST] = + { + GO_BACK_N_PROTOCOL_FUNCTIONS, + TIDFLOW_PROTOCOL_FUNCTIONS + }; + +/* Startup protocol in PSM/IPS + * + * Start timer. + * + * For all nodes to connect to: + * Grab connect lock + * Look up epid in table + * MATCH. + * assert cstate_to != CONNECT_WAITING (no re-entrancy) + * If cstate_to == CONNECT_DONE + * return the already connected address. + * else + * assert cstate_to == CONNECT_NONE + * assert cstate_from == CONNECT_DONE + * cstate_to := CONNECT_WAITING + * assert commidx_to != UNKNOWN && commidx_from != UNKNOWN + * req->commidx := epaddr->commidx_from + * add to list of pending connect.
+ * NO MATCH + * allocate epaddr and put in table + * cstate_to := CONNECT_WAITING + * cstate_from := CONNECT_NONE + * commidx_to := UNKNOWN + * req->commidx := epaddr->commidx_from := NEW commidx integer + * add to list of pending connect + * Release connect lock + * + * expected_connect_count = ep->total_connect_count + num_to_connect + * while (expected_connect_count != ep->total_connect_count) + * check for timeout + * progress(); + * + * For all connection requests received (within progress loop) + * If uuid doesn't match, NAK the connect and skip request + * Grab connect lock + * Look up epid in table + * MATCH + * if cstate_from == CONNECT_DONE + * req->commidx := epaddr->commidx_from + * compose reply and send again (this is a dupe request). + * else + * assert cstate_from == CONNECT_NONE + * assert cstate_to == (CONNECT_WAITING | CONNECT_DONE) + * cstate_from := CONNECT_DONE + * epaddr->commidx_to := req->commidx + * req->commidx := epaddr->commidx_from + * NO MATCH + * allocate epaddr and put in table + * cstate_from := CONNECT_DONE + * epaddr->commidx_to = req->commidx; + * rep->commidx := epaddr->commidx_from := NEW commidx integer + * compose connect reply and send + * Release connect lock + * + * For all connection replies received: + * If connect_result != 0, process error and skip. + * assert cstate_to == CONNECT_WAITING + * if cstate_from == CONNECT_DONE + * assert rep->commidx == epaddr->commidx_to + * else + * epaddr->commidx_to := rep->commidx + * cstate_to := CONNECT_DONE + * ep->total_connect_count ++ + * + * * Fill in a connection request: + * 1. Set connect protocol version and PSM versions + * 2. Set the uuid attached to current endpoint and add the job_pkey + * the node wishes to communicate post-connect. + * 3. Set our mtu, bitwidth and endianness to detect inconsistencies + * + */ + +/* Due to an oversight in the initial protocol, only 16 of the 32 bits can + * actually be used because the little-to-big endian conversion was done with + * 16 bits from the first version in 2.0. */ +static +uint32_t +psmi_ips_node_features(psm_ep_t ep) +{ + uint32_t features = 0; + if (BIG_ENDIAN_TEST_WORD == __cpu_to_be16(BIG_ENDIAN_TEST_WORD)) + features |= EP_FEATURES_ENDIAN_BIG; + else + features |= EP_FEATURES_ENDIAN_LITTLE; + if (sizeof(uintptr_t) == 8) + features |= EP_FEATURES_BITWIDTH_64; + else + features |= EP_FEATURES_BITWIDTH_32; + if (ep->context.runtime_flags & PSMI_RUNTIME_RCVTHREAD) + features |= EP_FEATURES_RCVTHREAD; + features |= EP_FEATURES_MULTIFLOW; + + return features; +} + +static +int +node_matches_bitendian(psm_ep_t ep, uint32_t features) +{ + if ((features & EP_FEATURES_NODETYPE) == + (psmi_ips_node_features(ep) & EP_FEATURES_NODETYPE)) + return 1; + else + return 0; +} + +/* + * Given a connection request, set mtu, communication index and hdr length + * parameters. + * + * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu + * is our connecting peer's declared mtu (which may not be the same as our + * mtu). The approach is to take the smaller of both mtus when communicating + * with that peer. Also, when using pio, the size can be further restricted by + * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers). + */
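+
+/* Editorial sketch (not part of the original import): the size negotiation
+ * described above, reduced to its arithmetic.  The function name and
+ * parameters are illustrative only; the real logic lives in
+ * ips_ipsaddr_set_req_params() directly below.  Compiled out via #if 0. */
+#if 0
+static uint32_t
+example_negotiated_piosize(uint32_t peer_declared_mtu, uint32_t local_mtu,
+			   uint32_t local_piosize, uint32_t cacheline)
+{
+	uint32_t peer_mtu = min(peer_declared_mtu, local_mtu);
+	uint32_t piosize = min(peer_mtu, local_piosize);
+	/* e.g. 4K IB MTU but 2K PIO buffers: min(4096, 2048) == 2048 */
+	if (piosize > cacheline)
+		piosize &= ~(cacheline - 1);	/* round down to a cache line */
+	return piosize;
+}
+#endif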
+static +psm_error_t +ips_ipsaddr_set_req_params(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, + const struct ips_connect_reqrep *req, + uint32_t paylen) +{ + psmi_assert_always(req->mtu > 0); + + uint32_t peer_mtu = min(req->mtu, proto->epinfo.ep_mtu); + + ipsaddr->epr.epr_piosize = min(peer_mtu, proto->epinfo.ep_piosize); + ipsaddr->epr.epr_hca_type= req->hdr.hca_type; + + if (ipsaddr->epr.epr_piosize > PSM_CACHE_LINE_BYTES) + ipsaddr->epr.epr_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); + + /* + * DMA is bounded by the peer's mtu but also by our local PIO send size + */ + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size = ipsaddr->epr.epr_piosize; + ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].frag_size = peer_mtu; + ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ].frag_size=ipsaddr->epr.epr_piosize; + ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP].frag_size=ipsaddr->epr.epr_piosize; + + ipsaddr->epr.epr_commidx_to = req->commidx; + + /* + * For static routes, i.e. "none" path resolution, update all paths to + * have the same profile (mtu, sl etc.). + * + * For path record queries the epr_mtu and epr_sl are setup correctly + * from the path itself. + */ + if (proto->ep->path_res_type == PSM_PATH_RES_NONE) { + int ptype, pidx; + for (ptype = IPS_PATH_LOW_PRIORITY; ptype < IPS_PATH_MAX_PRIORITY;ptype++) + for (pidx = 0; pidx < ipsaddr->epr.epr_num_paths[ptype]; pidx++) { + ipsaddr->epr.epr_path[ptype][pidx]->epr_mtu = peer_mtu; + ipsaddr->epr.epr_path[ptype][pidx]->epr_sl = req->hdr.sl; + } + } + + if (paylen > sizeof(struct ips_connect_reqrep)) { + int count; + char *p = (char *)(req + 1); + paylen -= sizeof(struct ips_connect_reqrep); + if (paylen%(sizeof(uint64_t)+sizeof(psm_epid_t))) { + return PSM_INTERNAL_ERR; + } + count = paylen / (sizeof(uint64_t)+sizeof(psm_epid_t)); + if (count > IPATH_MAX_UNIT) return PSM_INTERNAL_ERR; + + memcpy(ipsaddr->epaddr->mctxt_gidhi, p, count*sizeof(uint64_t)); + p += count*sizeof(uint64_t); + memcpy(ipsaddr->epaddr->mctxt_epid, p, count*sizeof(psm_epid_t)); + ipsaddr->epaddr->mctxt_epcount = count; + } + + return psmi_epid_set_hostname(psm_epid_nid(ipsaddr->epaddr->epid), + (char*) req->hostname, 0); +} + +static psm_error_t __recvpath +ips_proto_send_ctrl_message_request(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload, + uint64_t timeout) +{ + psm_error_t err = PSM_OK; + + while (get_cycles() < timeout) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, payload); + if (err == PSM_OK) { + break; + } + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) { + break; + } + } + return err; +} + +static psm_error_t __recvpath +ips_proto_send_ctrl_message_reply(struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload) +{ + /* This will try up to 100 times until the message is sent. The code + * is persistent because dropping replies will lead to a lack of + * overall progress on the connection/disconnection. We do not want + * to poll from here, and we cannot afford a lengthy timeout, since + * this is called from the receive path.
+ */ + psm_error_t err = PSM_OK; + int i; + for (i = 0; i < 100; i++) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, payload); + if (err == PSM_OK) { + break; + } + } + return err; +} + +int +ips_proto_build_connect_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ipsaddr, uint8_t opcode, + void *payload) +{ + struct connect_msghdr *hdr = (struct connect_msghdr *) payload; + struct ips_connect_reqrep *req = + (struct ips_connect_reqrep *) payload; + uint32_t paylen = sizeof(struct connect_msghdr); + + /* Write standard header that goes out on all connect msgs */ + hdr->connect_verno = __cpu_to_be16(IPS_CONNECT_VERNO); + hdr->psm_verno = __cpu_to_be16(PSMI_VERNO); + hdr->opcode = opcode; + hdr->phase = 0; + hdr->hca_type = proto->epinfo.ep_hca_type; + hdr->sl = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl; + + /* Sometimes we simply echo disconnect requests since we can get dupe + * disconnect requests. Unless that's the case, we always send the full + * uuid */ + psmi_assert_always(proto != NULL); + memcpy(&hdr->uuid, &proto->ep->key, sizeof(psm_uuid_t)); + + switch (opcode) { + case OPCODE_CONNECT_REPLY: + case OPCODE_CONNECT_REQUEST: +#if 0 + psmi_assert_always(ipsaddr->cerror_from != PSM_OK || + !COMMIDX_IS_UNKNOWN(proto, ipsaddr->commidx_from)); +#endif + if (opcode == OPCODE_CONNECT_REQUEST) { + req->connect_result = __cpu_to_be16(PSM_OK); + req->runid_key = proto->runid_key; + } + else { + req->connect_result = __cpu_to_be16(ipsaddr->cerror_from); + req->runid_key = ipsaddr->runid_key; + } + req->flags = 0; + req->commidx = (uint32_t) ipsaddr->epr.epr_commidx_from; + req->job_pkey = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_pkey; + + req->features = + __cpu_to_be16(psmi_ips_node_features(proto->ep)); + req->hdrq_msg_size = proto->epinfo.ep_hdrq_msg_size; + req->mtu = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_mtu; + strncpy(req->hostname, psmi_gethostname(), + sizeof(req->hostname) - 1); + req->hostname[sizeof(req->hostname) - 1] = '\0'; + paylen = sizeof(struct ips_connect_reqrep); + + /* Attach all multi-context subnetids and epids. */
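+	/* Editorial illustration (not part of the original import): for a
+	 * master endpoint with N secondary contexts, the request payload
+	 * built below is laid out as
+	 *
+	 *   [struct ips_connect_reqrep][uint64_t gid_hi x N][psm_epid_t epid x N]
+	 *
+	 * and is unpacked by the peer in ips_ipsaddr_set_req_params(). */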
+ if (proto->ep->mctxt_master == proto->ep) { + psm_epid_t *epid; + psm_ep_t ep = proto->ep->mctxt_next; + uint64_t *subnetid = (uint64_t *)(req + 1); + /* first all subnetids */ + while (ep != proto->ep) { + *subnetid = ep->gid_hi; + subnetid++; + ep = ep->mctxt_next; + paylen += sizeof(uint64_t); + } + ep = proto->ep->mctxt_next; + epid = (psm_epid_t *)subnetid; + /* second all epids */ + while (ep != proto->ep) { + *epid = ep->epid; + epid++; + ep = ep->mctxt_next; + paylen += sizeof(psm_epid_t); + } + } + psmi_assert_always(paylen <= IPS_MAX_CONNECT_PAYLEN); + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = sizeof(struct ips_disconnect_reqrep); + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected/unhandled connection opcode 0x%x\n", + opcode); + break; + } + return paylen; +} + +void +ips_flow_init(struct ips_flow *flow, ips_path_rec_t *path, ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, psm_protocol_type_t protocol, ips_path_type_t path_type, uint32_t flow_index) +{ + struct ips_proto *proto = ipsaddr->proto; + + psmi_assert_always(protocol < IPS_MAX_PROTOCOL); + psmi_assert_always(flow_index < IPS_MAX_FLOWINDEX); + + SLIST_NEXT(flow, next) = NULL; + flow->fn.xfer = psmi_xfer_fn[transfer_type]; + flow->fn.protocol = psmi_protocol_fn[protocol]; + + /* If path is not specified pick one accordingly */ + if (!path) + path = ips_select_path(proto, path_type, ipsaddr); + + flow->path = path; + flow->ipsaddr = ipsaddr; + flow->epinfo = &proto->epinfo; + flow->transfer= transfer_type; + flow->protocol= protocol; + flow->flowid = IPS_FLOWID_PACK(protocol, flow_index); + flow->xmit_seq_num.val = 0; + flow->xmit_ack_num.val = 0; + flow->xmit_ack_num.pkt--; /* last acked */ + flow->recv_seq_num.val = 0; + flow->flags = 0; + flow->sl = flow->path->epr_sl; + flow->cca_ooo_pkts = 0; + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); + flow->scb_num_pending = 0; + flow->scb_num_unacked = 0; + + psmi_timer_entry_init(&(flow->timer_ack), + ips_proto_timer_ack_callback, flow); + + psmi_timer_entry_init(&(flow->timer_send), + ips_proto_timer_send_callback, flow); + + STAILQ_INIT(&flow->scb_unacked); + SLIST_INIT(&flow->scb_pend); + return; +} + +static +size_t +epaddr_size() +{ + return (size_t) (sizeof(struct psm_epaddr) + sizeof(struct ptl_epaddr)); +} + +static +psm_error_t +ips_init_ep_qp_and_pkt_context(uint16_t hca_type, uint32_t qp, + uint32_t context, ips_epaddr_t *ipsaddr) +{ + psm_error_t err = PSM_OK; + switch(hca_type) { + case PSMI_HCA_TYPE_QLE73XX: + /* Bit 5 of the context is inserted into bit 0 of QP */ + ipsaddr->epr.epr_qp = (qp & ~0x1) | (context >> 4); + ipsaddr->epr.epr_pkt_context = context & 0xf; + break; + case PSMI_HCA_TYPE_QLE72XX: + if (context == 16) { + /* For context 16, the bottom bit of qp is toggled */ + ipsaddr->epr.epr_qp = qp ^ 1; + ipsaddr->epr.epr_pkt_context = 15; + } + else { + ipsaddr->epr.epr_qp = qp; + ipsaddr->epr.epr_pkt_context = context; + } + break; + case PSMI_HCA_TYPE_QLE71XX: + ipsaddr->epr.epr_qp = qp; + ipsaddr->epr.epr_pkt_context = context; + break; + default: + err = PSM_PARAM_ERR; + break; + } + return err; +} + +static +psm_epaddr_t +ips_alloc_epaddr(struct ips_proto *proto, psm_epid_t epid, + const char *hostname, unsigned long timeout) +{ + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + uint64_t lid, context, subcontext; + uint16_t hca_type, path_dlid; + uint16_t
lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + int i; + ips_path_type_t prio; + + /* The PSM/PTL-level and ips-level epaddr structures are colocated in + * memory for performance reasons -- this is why ips allocates memory for + * both the PSM/PTL-level and ips-level epaddr structure. + * + * The PSM/PTL structure data is filled in upon successful ep connect in + * ips_ptl_connect(). + */ + epaddr = (psm_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, + 1, epaddr_size()); + if (epaddr == NULL) + return NULL; + + epaddr->ptl = proto->ptl; + epaddr->ptlctl = proto->ptl->ctl; + epaddr->ep = proto->ep; + STAILQ_INIT(&epaddr->egrlong); + STAILQ_INIT(&epaddr->egrdata); + epaddr->xmit_egrlong.egr_data = 0; + epaddr->outoforder_q.first = NULL; + epaddr->outoforder_q.lastp = &epaddr->outoforder_q.first; + epaddr->mctxt_master = epaddr; + epaddr->mctxt_current = epaddr; + epaddr->mctxt_prev = epaddr->mctxt_next = epaddr; + + /* IPS-level epaddr */ + ipsaddr = (ips_epaddr_t *)(epaddr+1); + epaddr->ptladdr = ipsaddr; + + ipsaddr->ptl = proto->ptl; + ipsaddr->mq = proto->mq; + ipsaddr->epaddr = epaddr; + ipsaddr->proto = proto; + + /* Setup base fields for remote epid before doing path record lookup: + */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + /* Actual context of peer */ + ipsaddr->epr.epr_context = context; + + /* Setup remote endpoint */ + err = ips_init_ep_qp_and_pkt_context(hca_type, proto->epinfo.ep_baseqp, + context, ipsaddr); + if (err != PSM_OK) { + _IPATH_ERROR("Connect: Warning! unknown HCA type %d. Assuming remote HCA is same as local.\n", hca_type); + ips_init_ep_qp_and_pkt_context(hca_type, proto->epinfo.ep_baseqp, + PSMI_EPID_GET_CONTEXT(proto->ep->epid), ipsaddr); + } + + /* Subcontext */ + ipsaddr->epr.epr_subcontext = subcontext; + + /* Get path record for tuple */ + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), hca_type, timeout, + ipsaddr); + if (err != PSM_OK) { + psmi_free(epaddr); + return NULL; + } + + /* Determine base lid across all paths */ + ipsaddr->epr.epr_base_lid = + __be16_to_cpu(ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_dlid); + + for (prio = IPS_PATH_LOW_PRIORITY; prio < IPS_PATH_MAX_PRIORITY; prio++) + for (i = 0; i < ipsaddr->epr.epr_num_paths[prio]; i++) { + path_dlid = __be16_to_cpu(ipsaddr->epr.epr_path[prio][i]->epr_dlid); + if (path_dlid < ipsaddr->epr.epr_base_lid) + ipsaddr->epr.epr_base_lid = path_dlid; + } + + + /* Finally construct the resolved epaddr->epid for this peer (For torus, + * SL and even the lid may be different!) + */ + path_dlid = ipsaddr->epr.epr_base_lid & lmc_mask; + + epaddr->epid = + PSMI_EPID_PACK_EXT(path_dlid, + context, subcontext, + hca_type, + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl); + + /* Add this epid as a known hostname to our epid hostname db */ + if (psmi_epid_set_hostname(psm_epid_nid(epid), hostname, 0)) + return NULL; + + ipsaddr->flags = 0; + + /* All flows are over BULK path. Only control messages use the high + * priority CONTROL path. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], NULL, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO); + + /* DMA flow uses the same path as PIO flow due to multi MTU sized + * eager messages.
If we use separate paths we are more likely to have + * payload arrive out of order with respect to envelope leading to + * unnecessary NAKs. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path, + ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_DMA); + + /* AM Request messages also use the same path as the PIO flow as they + * also require order with respect to the MPI request messages. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ], + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_AM_REQ); + + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP], NULL, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_AM_RSP); + + /* tidflow for tid get request */ + ips_flow_init(&ipsaddr->tidgr_flow, NULL, ipsaddr, + PSM_TRANSFER_DMA, PSM_PROTOCOL_TIDFLOW, + IPS_PATH_LOW_PRIORITY, 0); + + ipsaddr->cstate_to = CSTATE_NONE; + ipsaddr->cstate_from = CSTATE_NONE; + + /* For now, set these to our PSM versions and connect versions. They will + * be overwritten to the peer's versions in handling connection reqs + */ + ipsaddr->psm_verno = PSMI_VERNO; + ipsaddr->connect_verno = IPS_CONNECT_VERNO; + + /* Add epaddr to PSM's epid table */ + psmi_epid_add(proto->ep, epaddr->epid, epaddr); + psmi_assert_always(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr); + + return epaddr; +} + +static +void +ips_free_epaddr(ips_epaddr_t *ipsaddr) +{ + psm_epaddr_t epaddr = ipsaddr->epaddr; + _IPATH_VDBG("epaddr=%p,ipsaddr=%p,commidx_from=%d\n", epaddr, ipsaddr, + ipsaddr->epr.epr_commidx_from); + psmi_epid_remove(ipsaddr->proto->ep, epaddr->epid); + ips_epstate_del(ipsaddr->proto->epstate, ipsaddr->epr.epr_commidx_from); + psmi_free(epaddr); + return; +} + +static psm_error_t ips_get_addr_from_epid(struct ips_proto *proto, + psm_epid_t epid, + unsigned long timeout, + psm_epaddr_t *epaddr) +{ + psm_error_t err; + uint64_t lid, context, subcontext; + uint16_t hca_type, path_dlid; + psm_epid_t path_epid; + psm_epaddr_t ep_address = NULL; + uint16_t lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + ips_epaddr_t ipsaddr; + + /* First unpack to get slid/dlid. */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + + /* Get path record for tuple */ + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), hca_type, + timeout, &ipsaddr); + if (err != PSM_OK) + goto fail; + + /* Generate path epid to do lookup on - uses the SL from the path record. + */
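+	/* Editorial illustration (not part of the original import): with
+	 * ep_lmc == 2, lmc_mask == (uint16_t)~0x3 == 0xfffc, so LMC-derived
+	 * path dlids 0x1c04..0x1c07 all mask down to base lid 0x1c04 and
+	 * therefore look up the same cached epid. */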
+ path_dlid = (__be16_to_cpu(ipsaddr.epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_dlid)) & lmc_mask; + + path_epid = + PSMI_EPID_PACK_EXT(path_dlid, + context, subcontext, hca_type, + ipsaddr.epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl); + ep_address = psmi_epid_lookup(proto->ep, path_epid); + + fail: + *epaddr = ep_address; + return err; +} + +static +psm_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm_epid_t epid, + psm_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen, int uuid_valid); + +psm_error_t +ips_proto_process_connect(struct ips_proto *proto, psm_epid_t epid, + uint8_t opcode, struct ips_message_header *p_hdr, + void *payload, uint32_t paylen) +{ + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + struct connect_msghdr *hdr; + uint16_t connect_result; + psm_ep_t ep = proto->ep; + int uuid_valid; + int uwords = (proto->epinfo.ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + int hdrq_extra; + uint32_t lid, context, subcontext; + uint16_t lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + + PSMI_PLOCK_ASSERT(); + + struct ips_connect_reqrep *req; + psm_error_t err = PSM_OK; + + /* If the sender doesn't have the same header/eager cutoff, we need to make + * sure we copy the connect data into a contiguous buffer */ + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + + hdrq_extra = uwords - p_hdr->hdr_dlen; + if (hdrq_extra != 0) { + uint32_t *bufp = (uint32_t *) buf; + uint32_t *payp = (uint32_t *) payload; + _IPATH_VDBG("hdrq_extra is %d, uwords=%d, inwords=%d\n", + hdrq_extra, uwords, p_hdr->hdr_dlen); + if (hdrq_extra > 0) { /* some of it went into our hdrq */ + psmi_mq_mtucpy(bufp, &p_hdr->data[0].u32w0 + p_hdr->hdr_dlen, + hdrq_extra<<2); + psmi_mq_mtucpy(bufp+hdrq_extra, payload, paylen); + paylen += (hdrq_extra<<2); + } + else { /* we got some useless padding in eager */ + hdrq_extra = -hdrq_extra; + paylen -= (hdrq_extra<<2); + psmi_mq_mtucpy(bufp, payp + hdrq_extra, paylen); + } + payload = buf; + } + + hdr = (struct connect_msghdr *) payload; + if (paylen < sizeof(struct connect_msghdr)) { /* drop */ + _IPATH_PRDBG("dropping unknown connect message of length %d\n", paylen); + return PSM_OK; + } + + /* Obtain HCA type and SL from request and regenerate epid */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + epid = PSMI_EPID_PACK_EXT(lid & lmc_mask, context, subcontext, hdr->hca_type, hdr->sl); + + /* Don't need to call ips_get_addr_from_epid as the epid cache is keyed + * off the IPS_PATH_HIGH_PRIORITY dlid and the SL which we already have from + * the connect request (as all control messages use the CONTROL path). + */ + epaddr = psmi_epid_lookup(proto->ep, epid); + ipsaddr = epaddr ? epaddr->ptladdr : NULL; + + uuid_valid = (psmi_uuid_compare(ep->key, hdr->uuid) == 0); + + if ((opcode == OPCODE_CONNECT_REQUEST || opcode == OPCODE_CONNECT_REPLY) && + paylen < IPS_CONNECT_REQREP_MINIMUM_SIZE) + { + uint64_t lid, context, subcontext; + char *type = opcode == OPCODE_CONNECT_REQUEST ? "request" : "reply"; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + psmi_syslog(proto->ep, 1, LOG_INFO, + "Unrecognized connect %s (size is %d instead of %d) " + "from epid %ld:%ld:%ld\n", type, paylen, + (int) IPS_CONNECT_REQREP_MINIMUM_SIZE, + (long) lid, (long) context, (long) subcontext); + goto fail; /* Not fatal, just drop the packet */ + } + + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + err = ptl_handle_connect_req(proto, epid, epaddr, + (struct ips_connect_reqrep *) payload, paylen, uuid_valid); + break; + + case OPCODE_CONNECT_REPLY: + req = (struct ips_connect_reqrep *) payload; + if (!ipsaddr || req->runid_key != proto->runid_key) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_PRDBG("Unknown connectrep (ipsaddr=%p, %d,%d) " + "from epid %ld:%ld:%ld bad_uuid=%s\n", + ipsaddr, req->runid_key, proto->runid_key, + (long) lid, (long) context, (long) subcontext, + uuid_valid ? "NO" : "YES"); + break; + } + if (ipsaddr->cstate_to != CSTATE_TO_WAITING) { + /* possible dupe */ + _IPATH_VDBG("connect dupe, expected %d got %d\n", + CSTATE_TO_WAITING, ipsaddr->cstate_to); + break; + } + connect_result = __be16_to_cpu(req->connect_result); + + /* Reply to our request for connection (i.e. outgoing connection) */ + if (ipsaddr->cstate_from != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) goto fail; + } + ipsaddr->cstate_to = CSTATE_ESTABLISHED; + ipsaddr->cerror_to = connect_result; + + break; + + case OPCODE_DISCONNECT_REQUEST: + { + ips_epaddr_t ipsaddr_f; /* fake a ptl addr */ + int ipsaddr_do_free = 0; + psmi_assert_always(paylen >= IPS_DISCONNECT_REQREP_MINIMUM_SIZE); + _IPATH_VDBG("Got a disconnect from %s\n", psmi_epaddr_get_name(epid)); + proto->num_disconnect_requests++; + /* It's possible to get a disconnection request on an ipsaddr that + * we've since removed if the request is a dupe. Instead of + * silently dropping the packet, we "echo" the request in the + * reply. */ + if (ipsaddr == NULL) { + uint16_t src_context = IPS_HEADER_SRCCONTEXT_GET(p_hdr); + uint32_t qp; + + ipsaddr = &ipsaddr_f; + memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); + ipsaddr_f.epr.epr_context = src_context; + ipsaddr_f.epr.epr_subcontext = p_hdr->src_subcontext; + + /* QLE72XX is special for context 16 */ + if ((hdr->hca_type == PSMI_HCA_TYPE_QLE72XX) && + (src_context == 16)) + ipsaddr_f.epr.epr_pkt_context = 15; + + /* Get path record for peer */ + err = proto->ibta.get_path_rec(proto, + proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), + hdr->hca_type, + 3000, &ipsaddr_f); + if (err != PSM_OK) + goto fail; + + qp = proto->epinfo.ep_baseqp; + err = ips_init_ep_qp_and_pkt_context(hdr->hca_type, qp, + src_context, &ipsaddr_f); + if (err != PSM_OK) { + _IPATH_ERROR("Disconnect: Warning!
unknown HCA type %d.\n", hdr->hca_type); + goto fail; + } + + ipsaddr_f.proto = proto; + ipsaddr_f.ptl = (ptl_t *) -1; + /* If the send fails because of pio_busy, don't let ips queue + * the request on an invalid ipsaddr, just drop the reply */ + ipsaddr_f.ctrl_msg_queued = ~0; + ips_flow_init(&ipsaddr_f.flows[EP_FLOW_GO_BACK_N_PIO], NULL, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, + EP_FLOW_GO_BACK_N_PIO); + _IPATH_VDBG("Disconnect on unknown epaddr, just echo request\n"); + } + else if (ipsaddr->cstate_from != CSTATE_NONE) { + ipsaddr->cstate_from = CSTATE_NONE; + proto->num_connected_from--; + if (ipsaddr->cstate_to == CSTATE_NONE) { + ipsaddr_do_free = 1; + } + if (!uuid_valid) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_VDBG("Unknown disconnect request from epid %d:%d.%d " + "bad_uuid=%s\n", (int) lid, + (int) context, (int) subcontext, uuid_valid ? "NO" : "YES"); + } + } + + memset(buf, 0, sizeof buf); + ips_proto_send_ctrl_message_reply(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REPLY, + &ipsaddr->ctrl_msg_queued, buf); + /* We can safely free the ipsaddr if required since disconnect + * messages are never enqueued so no reference to ipsaddr is kept */ + if (ipsaddr_do_free) + ips_free_epaddr(ipsaddr); + } + break; + + case OPCODE_DISCONNECT_REPLY: + if (!ipsaddr || !uuid_valid) { + uint64_t lid, context, subcontext; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_VDBG("Unknown disconnect reply from epid %d:%d.%d bad_uuid=%s\n", + (int) lid, (int) context, (int) subcontext, + uuid_valid ? "NO" : "YES"); + break; + } + else if (ipsaddr->cstate_to == CSTATE_TO_WAITING_DISC) { + ipsaddr->cstate_to = CSTATE_TO_DISCONNECTED; + /* Freed in disconnect() if cstate_from == NONE */ + } /* else dupe reply */ + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected/unhandled connect opcode 0x%x\n", + opcode); + } +fail: + return err; +} + +static +psm_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm_epid_t epid, + psm_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen, int uuid_valid) +{ + ips_epaddr_t *ipsaddr; + psm_error_t err = PSM_OK; + uint16_t connect_result = PSM_OK; + uint16_t psm_verno; + uint16_t c_verno; + uint16_t features; + int newconnect = 0; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + + if (epid == proto->ep->epid) { + /* For 2.0, we won't expose handling for this error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, + "Network connectivity problem: Locally detected duplicate " + "LIDs 0x%04x on hosts %s and %s. (Exiting)", + (uint32_t) psm_epid_nid(epid), + psmi_epaddr_get_hostname(epid), + psmi_gethostname()); + /* XXX no return */ + abort(); + } + else if (epaddr == NULL) { /* new ep connect before we call into connect */ + newconnect = 1; + if ((epaddr = ips_alloc_epaddr(proto, epid, req->hostname, + 5000)) == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + ipsaddr = epaddr->ptladdr; + if (ipsaddr->cstate_from == CSTATE_ESTABLISHED) { + /* Duplicate lid detection. 
*/ + if (ipsaddr->runid_key == req->runid_key && uuid_valid) + goto do_reply; /* duplicate request, not duplicate lid */ + else if (uuid_valid) { + /* True blue duplicate lid, both connect messages are part of the + * same context since they use the same uuid */ + /* For 2.0, we won't expose handling for this error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, + "Network connectivity problem: Detected duplicate " + "LIDs 0x%x on hosts %s (key=%d) and %s (key=%d). (Exiting)", + (uint32_t) psm_epid_nid(ipsaddr->epaddr->epid), + psmi_epaddr_get_hostname(epid), + ipsaddr->runid_key, + req->hostname, + req->runid_key); + } + else { /* Some out of context message. Just drop it */ + if (!proto->done_warning) { + psmi_syslog(proto->ep, 1, LOG_INFO, + "Non-fatal connection problem: Received an out-of-context " + "connection message from host %s LID=0x%x context=%d. (Ignoring)", + req->hostname, (int) psm_epid_nid(epid), psm_epid_context(epid)); + proto->done_warning = 1; + } + goto no_reply; + } + } + psmi_assert_always(ipsaddr->cstate_from == CSTATE_NONE); + + /* Save requestor's connection and psm version numbers */ + c_verno = __be16_to_cpu(req->hdr.connect_verno); + psm_verno = __be16_to_cpu(req->hdr.psm_verno); + features = __be16_to_cpu(req->features); + + /* On PSM pre-2.0, just print message and exit if the connect version + * number is not at least 0x0201 */ + if (c_verno < 0x0201) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_INVALID_VERSION, + "Connect protocol (%x,%x) is obsolete and incompatible", + (c_verno >> 8) & 0xff, c_verno & 0xff); + connect_result = PSM_EPID_INVALID_CONNECT; + } + /* Whenever there's a protocol change, adjust handling here */ + else if ((IPS_CONNECT_VERNO & 0xff00) != (ipsaddr->connect_verno & 0xff00)) { + connect_result = PSM_EPID_INVALID_VERSION; + } + else if (!node_matches_bitendian(proto->ep, features)) + connect_result = PSM_EPID_INVALID_NODE; + else if (!psmi_verno_isinteroperable(__be16_to_cpu(req->hdr.psm_verno))) { + connect_result = PSM_EPID_INVALID_VERSION; + } + else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && + proto->epinfo.ep_pkey != IPATH_DEFAULT_P_KEY && + proto->epinfo.ep_pkey != req->job_pkey) { + connect_result = PSM_EPID_INVALID_PKEY; + } + else if (!uuid_valid) { + char ep_key[37], req_key[37]; + connect_result = PSM_EPID_INVALID_UUID_KEY; + psmi_uuid_unparse(proto->ep->key, ep_key); + psmi_uuid_unparse(req->hdr.uuid, req_key); + _IPATH_PRDBG("UUID key mismatch request key=%s endpoint key=%s\n", + req_key, ep_key); + } + else if (!psmi_verno_isinteroperable(ipsaddr->psm_verno)) { + connect_result = PSM_INIT_BAD_API_VERSION; + } + else { + connect_result = PSM_OK; + if (ipsaddr->cstate_to == CSTATE_NONE) { + ips_epstate_idx idx; + psmi_assert_always(newconnect == 1); + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->epr.epr_commidx_from = idx; + } + } + ipsaddr->connect_verno = c_verno; + ipsaddr->psm_verno = psm_verno; + + /* Incoming connection request */ + if (ipsaddr->cstate_to != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) goto fail; + } + ipsaddr->cstate_from = CSTATE_ESTABLISHED; + ipsaddr->cerror_from = connect_result; + + ipsaddr->runid_key = req->runid_key; + ipsaddr->flags |= features & EP_FEATURES_RCVTHREAD ? + SESS_FLAG_HAS_RCVTHREAD : 0; + ipsaddr->flags |= proto->ep->context.runtime_flags & PSMI_RUNTIME_RCVTHREAD ? + SESS_FLAG_LOCK_SESS : 0; + ipsaddr->flags |= features & EP_FEATURES_MULTIFLOW ? 
+ SESS_FLAG_HAS_FLOWID : 0; + + pthread_mutex_init(&ipsaddr->sesslock, NULL); + + proto->num_connected_from++; + +do_reply: + ips_proto_send_ctrl_message_reply(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_CONNECT_REPLY, + &ipsaddr->ctrl_msg_queued, buf); +no_reply: +fail: + return err; +} + +psm_error_t +ips_proto_connect(struct ips_proto *proto, int numep, + const psm_epid_t *array_of_epid, + const int *array_of_epid_mask, psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, uint64_t timeout_in) +{ + int i, n, n_first; + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr = NULL; + int numep_toconnect = 0, numep_left; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + union psmi_envvar_val credits_intval; + int connect_credits; + + psmi_getenv("PSM_CONNECT_CREDITS", + "End-point connect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 100, + &credits_intval); + + connect_credits = credits_intval.e_uint; + + PSMI_PLOCK_ASSERT(); + + /* All timeout values are in cycles */ + uint64_t t_start = get_cycles(); + /* Print a timeout at the warning interval */ + union psmi_envvar_val warn_intval; + uint64_t to_warning_interval; + uint64_t to_warning_next; + + /* Setup warning interval */ + psmi_getenv("PSM_CONNECT_WARN_INTERVAL", + "Period in seconds to warn if connections are not completed. " + "Default is 300 seconds, 0 to disable", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 300, + &warn_intval); + + to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL); + to_warning_next = t_start + to_warning_interval; + + /* Some sanity checks */ + psmi_assert_always(sizeof(struct connect_msghdr) == IPS_CONNECT_MSGHDR_SIZE); + psmi_assert_always(array_of_epid_mask != NULL); + psmi_assert_always(sizeof(struct ips_connect_reqrep) >= + IPS_CONNECT_REQREP_MINIMUM_SIZE); + psmi_assert_always(sizeof(struct ips_disconnect_reqrep) >= + IPS_DISCONNECT_REQREP_MINIMUM_SIZE); + + /* First pass: make sure array of errors is at least fully defined */ + for (i = 0; i < numep; i++) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(array_of_epid[i]); + context = PSMI_EPID_GET_CONTEXT(array_of_epid[i]); + subcontext = PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i]); + _IPATH_VDBG("epid-connect=%s connect to %ld:%ld:%ld\n", + array_of_epid_mask[i] ? "YES" : " NO", + (long) lid, (long) context, (long) subcontext); + if (array_of_epid_mask[i]) { + array_of_errors[i] = PSM_EPID_UNKNOWN; + array_of_epaddr[i] = NULL; + } + } + + /* Second pass: see what to connect and what is connectable.
*/ + for (i = 0, numep_toconnect = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + /* Can't send to epid on same lid */ + if (psm_epid_nid(proto->ep->epid) == psm_epid_nid(array_of_epid[i])) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + continue; + } + + err = ips_get_addr_from_epid(proto, array_of_epid[i], 30000, &epaddr); + if (err) + goto fail; + if (epaddr == NULL) { + ips_epstate_idx idx; + /* We're sending a connect request message before some other node + * has sent its connect message */ + epaddr = ips_alloc_epaddr(proto, array_of_epid[i], + NULL, (timeout_in / 1000000UL)); + if (epaddr == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + ipsaddr = epaddr->ptladdr; + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->epr.epr_commidx_from = idx; + ipsaddr->cstate_from = CSTATE_NONE; + } else if (epaddr->ptladdr->cstate_to != CSTATE_NONE) { /* already connected */ + psmi_assert_always(epaddr->ptladdr->cstate_to == CSTATE_ESTABLISHED); + array_of_errors[i] = PSM_EPID_ALREADY_CONNECTED; + array_of_epaddr[i] = epaddr; + continue; + } else { + /* We've already received a connect request message from a remote + * peer, it's time to send our own. */ + ipsaddr = epaddr->ptladdr; + /* Sanity check against re-entrancy, and make sure we are not connected + * twice (caller's precondition) */ + psmi_assert_always(ipsaddr->cstate_to == CSTATE_NONE); + psmi_assert_always(ipsaddr->cstate_from != CSTATE_NONE); +#if 0 + psmi_assert_always(ipsaddr->cerror_from != PSM_OK || + !COMMIDX_IS_UNKNOWN(ptl, ipsaddr->commidx_from)); + psmi_assert_always(!COMMIDX_IS_UNKNOWN(ptl, ipsaddr->commidx_to)); +#endif + } + + ipsaddr->cstate_to = CSTATE_TO_WAITING; + ipsaddr->cerror_to = PSM_OK; + array_of_epaddr[i] = epaddr; + ipsaddr->s_timeout = get_cycles(); + ipsaddr->delay_in_ms = 1; + ipsaddr->credit = 0; + numep_toconnect++; + } + + /* Third pass: do the actual connect. + * PSM_EPID_UNKNOWN: Not connected yet. + * PSM_EPID_UNREACHABLE: Not to be connected. + * PSM_OK: Successfully connected. + * Start sending connect messages at a random index between 0 and numep-1 + */ + numep_left = numep_toconnect; + n_first = ((uint32_t) get_cycles()) % numep; + while (numep_left > 0) { + for (n = 0; n < numep; n++) { + int keep_polling = 1; + i = (n_first + n) % numep; + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + case PSM_EPID_UNREACHABLE: + case PSM_EPID_ALREADY_CONNECTED: + case PSM_OK: + continue; + default: + break; + } + psmi_assert_always(array_of_epaddr[i] != NULL); + ipsaddr = array_of_epaddr[i]->ptladdr; + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_to */ + array_of_errors[i] = PSM_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + continue; + } + while (keep_polling) { + if (!psmi_cycles_left(t_start, timeout_in)) { + err = PSM_TIMEOUT; + goto err_timeout; + } + if (to_warning_interval && get_cycles() >= to_warning_next) { + uint64_t waiting_time = + cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL; + const char *first_name = NULL; + int num_waiting = 0; + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i] || + array_of_errors[i] != PSM_EPID_UNKNOWN) + continue; + if (!first_name) + first_name = psmi_epaddr_get_name(array_of_epid[i]); + num_waiting++; + } + if (first_name) { + _IPATH_INFO("Couldn't connect to %s (and %d others). " + "Time elapsed %02i:%02i:%02i. Still trying...\n", + first_name, num_waiting, + (int) (waiting_time / 3600), + (int) ((waiting_time / 60) - + ((waiting_time / 3600) * 60)), + (int) (waiting_time - ((waiting_time / 60) * 60))); + } + to_warning_next = get_cycles() + to_warning_interval; + } + + if (get_cycles() > ipsaddr->s_timeout) { + if (!ipsaddr->credit && connect_credits) { + ipsaddr->credit = 1; + connect_credits--; + } + if (ipsaddr->credit) { + _IPATH_VDBG("Connect req to %u:%u:%u\n", + __be16_to_cpu(ipsaddr->epr.epr_base_lid), + ipsaddr->epr.epr_context, + ipsaddr->epr.epr_subcontext); + if (ips_proto_send_ctrl_message(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_CONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf) == PSM_OK) { + keep_polling = 0; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr->delay_in_ms * MSEC_ULL); + } + /* If not, the send got "busy"; keep trying */ + } + else { + keep_polling = 0; + } + } + + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_to */ + array_of_errors[i] = PSM_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + break; + } + } + } + } + +err_timeout: + /* Find the worst error to report */ + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + /* These are benign */ + case PSM_EPID_UNREACHABLE: + case PSM_EPID_ALREADY_CONNECTED: + break; + case PSM_EPID_UNKNOWN: + array_of_errors[i] = PSM_TIMEOUT; + err = psmi_error_cmp(err, PSM_TIMEOUT); + break; + case PSM_OK: + /* Restore the real connect error */ + ipsaddr = array_of_epaddr[i]->ptladdr; + array_of_errors[i] = ipsaddr->cerror_to; + psmi_assert_always( + array_of_epaddr[i]->ptladdr->cstate_to == CSTATE_ESTABLISHED); + if (ipsaddr->cerror_to != PSM_OK) { + err = psmi_error_cmp(err, ipsaddr->cerror_to); + ips_free_epaddr(array_of_epaddr[i]->ptladdr); + array_of_epaddr[i] = NULL; + } + else { + proto->num_connected_to++; + psmi_assert_always(ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_mtu > 0); + } + break; + default: + break; + } + } + +fail: + return err; +} + +/* Repercussions on MQ. + * + * If num_connected==0, everything that exists in the posted queue should + * complete and the error must be marked epid_was_closed. + * + */
+ +psm_error_t +ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_in) +{ + ips_epaddr_t *ipsaddr; + int numep_left, numep_todisc, i, n; + int n_first; + int cstate; + int has_pending; + uint64_t timeout; + psm_error_t err = PSM_OK; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + uint64_t reqs_sent = 0; + union psmi_envvar_val credits_intval; + int disconnect_credits; + uint64_t t_warning, t_start; + union psmi_envvar_val warn_intval; + unsigned warning_secs; + + psmi_assert_always(numep > 0); + + psmi_getenv("PSM_DISCONNECT_CREDITS", + "End-point disconnect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 100, + &credits_intval); + + disconnect_credits = credits_intval.e_uint; + + /* Setup warning interval */ + psmi_getenv("PSM_DISCONNECT_WARN_INTERVAL", + "Period in seconds to warn if disconnections are not completed. " + "Default is 300 seconds, 0 to disable.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 300, + &warn_intval); + + warning_secs = warn_intval.e_uint; + + PSMI_PLOCK_ASSERT(); + + /* First pass: see what to disconnect and what is disconnectable */ + for (i = 0, numep_todisc = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + psmi_assert_always(array_of_epaddr[i]->ptl == proto->ptl); + cstate = array_of_epaddr[i]->ptladdr->cstate_to; + array_of_epaddr[i]->ptladdr->credit = 0; + if (cstate == CSTATE_NONE) { + array_of_errors[i] = PSM_OK; + continue; + } + else { + psmi_assert_always(cstate == CSTATE_ESTABLISHED); + } + _IPATH_VDBG("disconnecting %p\n", array_of_epaddr[i]); + array_of_errors[i] = PSM_EPID_UNKNOWN; + numep_todisc++; + } + if (numep_todisc == 0) + goto success; + + /* Wait for everyone to ack previous packets before sending disconnects */ + if (timeout_in == 0) + timeout = ~0ULL; + else + timeout = get_cycles() + nanosecs_to_cycles(timeout_in); + + t_start = get_cycles(); + t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL); + + n_first = ((uint32_t) get_cycles()) % numep; + if (!force) { + numep_left = numep_todisc; + do { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i] || array_of_errors[i] == PSM_OK) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + switch (ipsaddr->cstate_to) { + case CSTATE_TO_DISCONNECTED: + array_of_errors[i] = PSM_OK; + numep_left--; + disconnect_credits++; + ipsaddr->credit = 0; + continue; + case CSTATE_TO_WAITING_DISC: + if (ipsaddr->s_timeout > get_cycles()) + continue; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr->delay_in_ms*MSEC_ULL); + ips_proto_send_ctrl_message_request(proto, &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf, timeout); + reqs_sent++; + break; + case CSTATE_ESTABLISHED: + /* Still pending acks, hold off for now */ + ips_ptladdr_lock(ipsaddr); + has_pending = + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_PIO].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_DMA].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_AM_REQ].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_AM_RSP].scb_unacked); + ips_ptladdr_unlock(ipsaddr); + if (has_pending) + continue; + if (!ipsaddr->credit && disconnect_credits) {
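+					/* Editorial note (not part of the original
+					 * import): taking a credit here bounds the
+					 * number of peers with an outstanding
+					 * DISCONNECT_REQUEST to the value of
+					 * PSM_DISCONNECT_CREDITS (default 100). */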
ipsaddr->credit = 1; + disconnect_credits--; + } + if (!ipsaddr->credit) + continue; + ipsaddr->delay_in_ms = 1; + ipsaddr->cstate_to = CSTATE_TO_WAITING_DISC; + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(MSEC_ULL); + ips_proto_send_ctrl_message_request(proto, &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf, timeout); + reqs_sent++; + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unhandled/unknown close state %d", + ipsaddr->cstate_to); + break; + } + } + if (numep_left == 0) + break; + + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (warning_secs && get_cycles() > t_warning) { + _IPATH_INFO("graceful close in progress for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (int) (timeout_in / MSEC_ULL), + (unsigned long long) reqs_sent); + t_warning = get_cycles() + nanosecs_to_cycles(warning_secs * SEC_ULL); + } + } + while (timeout > get_cycles()); + + if (numep_left > 0) { + err = PSM_TIMEOUT; + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM_EPID_UNKNOWN) { + array_of_errors[i] = PSM_TIMEOUT; + _IPATH_VDBG("disc timeout on index %d, epaddr %s\n", + i, psmi_epaddr_get_name(array_of_epaddr[i]->epid)); + } + } + _IPATH_PRDBG("graceful close incomplete for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (int) (timeout_in / MSEC_ULL), + (unsigned long long) reqs_sent); + } + else + _IPATH_PRDBG("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n", + numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (unsigned long long) reqs_sent); + } else { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i]) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + psmi_assert_always(ipsaddr->cstate_to == CSTATE_ESTABLISHED); + ips_proto_send_ctrl_message(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf); + /* Force state to DISCONNECTED */ + ipsaddr->cstate_to = CSTATE_TO_DISCONNECTED; + array_of_errors[i] = PSM_OK; + } + _IPATH_VDBG("non-graceful close complete from %d peers\n", numep); + } + + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM_OK) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + if (ipsaddr->cstate_to == CSTATE_NONE) + continue; + psmi_assert_always(ipsaddr->cstate_to == CSTATE_TO_DISCONNECTED); + proto->num_connected_to--; + /* Remote disconnect req arrived already, remove this epid. If it + * hasn't arrived yet, that's okay, we'll pick it up later and just + * mark our connect-to status as being "none". */ + if (ipsaddr->cstate_from == CSTATE_NONE) { + ips_free_epaddr(ipsaddr); + } + else + ipsaddr->cstate_to = CSTATE_NONE; + } + +fail: +success: + return err; +} + +int +ips_proto_isconnected(ips_epaddr_t *ipsaddr) +{ + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED || + ipsaddr->cstate_from == CSTATE_ESTABLISHED) + return 1; + else + return 0; +} + diff --git a/ptl_ips/ips_proto_dump.c b/ptl_ips/ips_proto_dump.c new file mode 100644 index 0000000..3cbfa28 --- /dev/null +++ b/ptl_ips/ips_proto_dump.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2013. Intel Corporation. 
All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_header.h" +#include "ips_proto_help.h" +#include "ips_epstate.h" + +void ips_proto_dump_frame(void *frame, int length, char *message) +{ + uint8_t *raw_frame = frame; + int counter; + char default_message[] = ""; + + if(!message) + message = default_message; + + printf("\nHex dump of %i bytes at %p from %s\n", length, frame, message); + + for(counter = 0; counter < length; counter++) { + if((counter % 16) == 0) + printf("\n"); + + if((counter % 4) == 0) + printf(" "); + + printf("%02X ", raw_frame[counter]); + } + printf("\n"); +} + +void ips_proto_dump_data(void *data, int data_length) +{ + int counter; + uint8_t *payload = (uint8_t *)data; + + printf("\nHex dump of data, length = %i\n", + data_length); + + for(counter = 0; counter < data_length; counter++) { + if((counter % 16) == 0) + printf("\n %04d: ", counter); + + if((counter % 4) == 0) + printf(" "); + + printf("%02X ", payload[counter]); + } + printf("\n"); +}
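+
+/* Editorial sketch (not part of the original import): typical use of the
+ * dump helpers in this file when chasing a mangled packet; the 56-byte
+ * header length here is illustrative only.
+ *
+ *   ips_proto_dump_frame(p_hdr, 56, "rx header");
+ *   ips_proto_dump_data(payload, paylen);
+ *   ips_proto_show_header(p_hdr, "rx header");
+ */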
context %i\n", + (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_CONTEXT_SHIFT) & INFINIPATH_I_CONTEXT_MASK); + printf("IPH: subcontext %i\n", p_hdr->dst_subcontext); + tid = (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK; + printf("IPH: tid %x\n", tid); + printf("IPH: offset %x\n", + (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_OFFSET_SHIFT) & INFINIPATH_I_OFFSET_MASK); + + printf("sub-opcode %x\n", p_hdr->sub_opcode); + + ack_seq_num.psn = p_hdr->ack_seq_num; + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + if (protocol == PSM_PROTOCOL_GO_BACK_N) + printf("ack_seq_num %x\n", ack_seq_num.psn); + else + printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n", ack_seq_num.flow, ack_seq_num.gen, ack_seq_num.seq); + + printf("context %d (src_context %d src_context_ext %d) src_subcontext %d\n", + IPS_HEADER_SRCCONTEXT_GET(p_hdr), p_hdr->src_context, p_hdr->src_context_ext, + p_hdr->src_subcontext); + printf("src_rank/commidx %i\n", p_hdr->commidx | + INFINIPATH_KPF_RESERVED_BITS(p_hdr->iph.pkt_flags)); + if (tid != IPATH_EAGER_TID_ID) + printf("expected_tid_session_id %i\n", p_hdr->data[0].u32w0); + printf("flags %x\n", p_hdr->flags); + printf("mqhdr %x\n", p_hdr->mqhdr); +} + +// linux doesn't have strlcat; this is a stripped down implementation +// not super-efficient, but we use it rarely, and only for short strings +// not fully standards conforming! +static size_t strlcat(char *d, const char *s, size_t l) +{ + int dlen = strlen(d), slen, max; + if(l<=dlen) // bug + return l; + slen = strlen(s); + max = l-(dlen+1); + if(slen>max) + slen = max; + memcpy(d+dlen, s, slen); + d[dlen+slen] = '\0'; + return dlen+slen+1; // standard says to return full length, not actual +} + +// decode RHF errors; only used one place now, may want more later +void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len) +{ + *msg = '\0'; // if no errors, and so don't need to check what's first + + if(err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if(err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if(err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if(err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if(err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if(err & INFINIPATH_RHF_H_IHDRERR) + strlcat(msg, "ipathhdrerr ", len); + if(err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if(err & INFINIPATH_RHF_H_MKERR) + strlcat(msg, "mkerr ", len); + if(err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if(err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if(err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +void ips_proto_dump_err_stats(struct ips_proto *proto) +{ + char err_stat_msg[2048]; + char tmp_buf[128]; + int len = sizeof(err_stat_msg); + + if (!(infinipath_debug & __IPATH_PKTDBG)) + return; + + *err_stat_msg = '\0'; + + if (proto->error_stats.num_icrc_err || + proto->error_stats.num_vcrc_err || + proto->error_stats.num_ecc_err || + proto->error_stats.num_len_err || + proto->error_stats.num_mtu_err || + proto->error_stats.num_khdr_err || + proto->error_stats.num_tid_err || + proto->error_stats.num_mk_err || + proto->error_stats.num_ib_err) { + + snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: "); + + if (proto->error_stats.num_icrc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %"PRIu64" ", proto->error_stats.num_icrc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if 
(proto->error_stats.num_vcrc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "VCRC: %"PRIu64" ", proto->error_stats.num_vcrc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_ecc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %"PRIu64" ", proto->error_stats.num_ecc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_len_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %"PRIu64" ", proto->error_stats.num_len_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_mtu_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "MTU: %"PRIu64" ", proto->error_stats.num_mtu_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_khdr_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "KHDR: %"PRIu64" ", proto->error_stats.num_khdr_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_tid_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "TID: %"PRIu64" ", proto->error_stats.num_tid_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_mk_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "MKERR: %"PRIu64" ", proto->error_stats.num_mk_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_ib_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "IBERR: %"PRIu64" ", proto->error_stats.num_ib_err); + strlcat(err_stat_msg, tmp_buf, len); + } + strlcat(err_stat_msg, "\n", len); + } + else + strlcat(err_stat_msg, "No previous errors.\n", len); + + _IPATH_ERROR("%s", err_stat_msg); +} + diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c new file mode 100644 index 0000000..05d7a6e --- /dev/null +++ b/ptl_ips/ips_proto_expected.c @@ -0,0 +1,2489 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +/* + * Expected tid operations are carried out over "sessions". 
One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM_MQ_RNDV_IPATH_WINDOW). Since naks can cause
+ * retransmissions, each session has a session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify if retransmitted
+ * packets reference the correct session.
+ *
+ * index and generation count are each 4 bytes encoded in one ptl_arg. They
+ * could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+#define _desc_idx u32w0
+#define _desc_genc u32w1
+
+/*
+ * Easy switch to (say) _IPATH_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _IPATH_EXP _IPATH_VDBG
+
+/*
+ * Timer callbacks. When we need work to be done out of the receive process
+ * loop, we schedule work on timers to be done at a later time.
+ */
+static psm_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_release_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_grant_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, psm_mq_req_t req,
+                           uint32_t msglen, int flags, ptl_epaddr_t *ipsaddr,
+                           psmi_seqnum_t flowgenseq,
+                           ips_tid_session_list *tid_list,
+                           uint32_t tid_list_size);
+
+static void
+ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+
+static void
+ips_tid_flowavail_callback(struct ips_tfctrl *tfctrl, void *context);
+
+static void
+ips_tid_mpool_tidrecv_callback(void *context);
+
+/* Defined at the ptl-level (breaks abstractions but needed for shared vs
+ * non-shared contexts) */
+extern int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+static psm_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+
+psm_error_t
+ips_protoexp_init(const psmi_context_t *context,
+                  const struct ips_proto *proto,
+                  uint32_t protoexp_flags,
+                  int num_of_send_bufs,
+                  int num_of_send_desc,
+                  struct ips_protoexp **protoexp_o)
+{
+    struct ips_protoexp *protoexp = NULL;
+    uint32_t tidmtu_max;
+    psm_error_t err = PSM_OK;
+
+    protoexp = (struct ips_protoexp *)
+        psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+    if (protoexp == NULL) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+    *protoexp_o = protoexp;
+
+    protoexp->ptl = (const struct ptl *) proto->ptl;
+    protoexp->proto = (struct ips_proto *) proto;
+    protoexp->timerq = proto->timerq;
+    protoexp->tid_flags = protoexp_flags;
+    protoexp->tidflow_seed = (unsigned int) getpid();
+
+    /* Must be initialized already */
+    /* Comment out because of Klockwork scanning critical error.
       CQ 11/16/2012
+       psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+                          proto->ep->mq->rreq_pool != NULL &&
+                          proto->ep->mq->sreq_pool != NULL);
+    */
+    psmi_assert_always(proto->timerq != NULL);
+    /* Make sure pbc is at the right place before the message header */
+    psmi_assert_always(sizeof(union ipath_pbc) == (size_t)
+        (offsetof(struct ips_scb, ips_lrh) - offsetof(struct ips_scb, pbc)));
+
+    /* These request pools are managed by the MQ component */
+    protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+    protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+    if (proto->flags & IPS_PROTO_FLAG_MQ_EXPECTED_SDMA) {
+        protoexp->tid_ep_flow = EP_FLOW_GO_BACK_N_DMA;
+        protoexp->tid_xfer_type = PSM_TRANSFER_DMA;
+    }
+    else {
+        protoexp->tid_ep_flow = EP_FLOW_GO_BACK_N_PIO;
+        protoexp->tid_xfer_type = PSM_TRANSFER_PIO;
+    }
+
+    /* Initialize tid flow control. */
+    {
+        const struct ipath_user_info *user_info = &context->user_info;
+        const struct ipath_base_info *base_info = &context->base_info;
+        uint32_t num_flow, start_flow, end_flow;
+        uint32_t has_hw_hdrsupp = (context->runtime_flags & IPATH_RUNTIME_HDRSUPP);
+
+        if (!user_info->spu_subcontext_cnt || !has_hw_hdrsupp) {
+            /* If context sharing is not enabled, we can use the full tidflow
+             * table for all HCAs.
+             */
+            start_flow = 0;
+            num_flow = INFINIPATH_TF_NFLOWS;
+        }
+        else {
+            /* Context sharing on QLE73XX requires the hardware tidflow table
+             * to be shared as well.
+             */
+            num_flow = (uint32_t) (INFINIPATH_TF_NFLOWS / user_info->spu_subcontext_cnt);
+            start_flow = base_info->spi_subcontext * num_flow;
+        }
+
+        end_flow = start_flow + num_flow;
+
+        if ((err = ips_tf_init(context, &protoexp->tfctrl,
+                               start_flow, end_flow,
+                               ips_tid_flowavail_callback, protoexp)))
+            goto fail;
+    }
+
+    /* Fix the fragsize to be a power of two (usually 2048) */
+    protoexp->tid_send_fragsize = context->base_info.spi_tid_maxsize;
+    if (proto->flags & IPS_PROTO_FLAG_MQ_EXPECTED_SDMA)
+        tidmtu_max = proto->epinfo.ep_mtu;
+    else
+        tidmtu_max = proto->epinfo.ep_piosize;
+
+    while (protoexp->tid_send_fragsize > tidmtu_max)
+        protoexp->tid_send_fragsize /= 2;
+
+    if ((err = ips_tid_init(&protoexp->tidc, context)))
+        goto fail;
+
+    {
+        uint32_t bounce_size, num_bounce_bufs;
+
+        if ((protoexp->tid_xfer_type == PSM_TRANSFER_DMA) ||
+            (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM)) {
+            num_bounce_bufs = max(8, num_of_send_bufs >> 2);
+            bounce_size = protoexp->tid_send_fragsize;
+        }
+        else {
+            /* no bufs, we only need the buffers to handle misalignment on the
+             * sender when using send dma. */
+            num_bounce_bufs = 0;
+            bounce_size = 0;
+        }
+        if ((err = ips_scbctrl_init(context, num_of_send_desc, num_bounce_bufs,
+                                    0, bounce_size, ips_tid_scbavail_callback,
+                                    protoexp, &protoexp->tid_scbc_rv)))
+            goto fail;
+    }
+
+    {
+        /* Determine the interval at which to generate headers (relevant only
+         * when header suppression is enabled); otherwise headers are always
+         * generated.
+         *
+         * The PSM_EXPECTED_HEADERS environment variable can specify the
+         * packet interval to generate headers at. Otherwise a header packet
+         * is generated every
+         * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize).
+         * Note: A header is always generated for the last packet in the flow.
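+         *
+         * Worked example (illustrative numbers, not configured defaults):
+         * with a 131072-byte rndv window and tid_send_fragsize = 2048, the
+         * window spans 64 packets; if the resulting PSM_EXPECTED_HEADERS
+         * value is 64 and flow_credits is 64, the clamp below gives
+         * hdr_pkt_interval = max(min(64, 64 >> 2), 1) = 16, i.e. one
+         * non-suppressed header (with ACK request) every 16th packet.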
+         */
+
+        union psmi_envvar_val env_exp_hdr;
+        uint32_t defval =
+            min(PSM_DEFAULT_EXPECTED_HEADER,
+                proto->mq->ipath_window_rv/protoexp->tid_send_fragsize);
+
+        psmi_getenv("PSM_EXPECTED_HEADERS",
+                    "Interval to generate expected protocol headers",
+                    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                    (union psmi_envvar_val) defval, &env_exp_hdr);
+
+        protoexp->hdr_pkt_interval = env_exp_hdr.e_uint;
+        /* Account for flow credits - should try to have at least 4 headers
+         * generated per window.
+         */
+        protoexp->hdr_pkt_interval =
+            max(min(protoexp->hdr_pkt_interval, proto->flow_credits >> 2), 1);
+
+        if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) {
+            _IPATH_VDBG("Overriding PSM_EXPECTED_HEADERS=%u to be '%u'\n",
+                        env_exp_hdr.e_uint, protoexp->hdr_pkt_interval);
+        }
+
+    }
+
+    /* Send descriptors.
+     *
+     * There can be up to 2^32 of these send descriptors. We conservatively
+     * allocate 256 but large node configurations can allocate up to sdesc_num
+     * of these (they are about 2k each).
+     * We impose a theoretical limit of 2^30.
+     */
+    {
+        struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS;
+        uint32_t maxsz, chunksz;
+
+        if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+                                        &rlim, &maxsz, &chunksz)))
+            goto fail;
+
+        protoexp->tid_desc_send_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz, maxsz,
+                              0, DESCRIPTORS, NULL, NULL);
+
+        if (protoexp->tid_desc_send_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate tid descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /* Receive descriptors.
+     *
+     * There can only be 256 of these because the field to identify the receive
+     * descriptor is only 8 bits. This currently isn't a problem because we
+     * only have 512 tids and each descriptor consumes ~32 tids per tid window.
+     * This means only roughly 16 descriptors are ever used.
+     */
+
+    {
+        struct psmi_rlimit_mpool rlim = TID_RECVSESSIONS_LIMITS;
+        uint32_t maxsz, chunksz;
+
+        if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+                                        &rlim, &maxsz, &chunksz)))
+            goto fail;
+
+        protoexp->tid_desc_recv_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_recv_desc), chunksz, maxsz,
+                              0, DESCRIPTORS, ips_tid_mpool_tidrecv_callback,
+                              protoexp);
+
+        if (protoexp->tid_desc_recv_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate tid descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /* This pool can never be smaller than the max number of rreqs that can be
+     * allocated. */
+    {
+        uint32_t rreq_per_chunk, rreq_max;
+
+        psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL);
+
+        psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool,
+                                &rreq_per_chunk,
+                                &rreq_max);
+
+        protoexp->tid_getreq_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_get_request),
+                              rreq_per_chunk, rreq_max, 0, DESCRIPTORS, NULL, NULL);
+
+        if (protoexp->tid_getreq_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate getreq descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /*
+     * Parse the tid timeout settings from the environment.
+ * :: + * + */ + { + int tvals[3]; + char *tid_to; + union psmi_envvar_val env_to; + + if (context->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + tvals[0] = 200; + tvals[1] = 1000; + tvals[2] = 2; + tid_to = "200:1000:2"; + } + else { + /* This has always been the behavior ips < 2.1 */ + tid_to = "100:100:3"; + tvals[0] = 100; + tvals[1] = 100; + tvals[2] = 3; + } + + if (!psmi_getenv("PSM_TID_TIMEOUT", + "Tid timeout control ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) tid_to, + &env_to)) { + /* not using default values */ + tid_to = env_to.e_str; + psmi_parse_str_tuples(tid_to, 3, tvals); + } + protoexp->tid_to_cyc_min = us_2_cycles((uint64_t) tvals[0]); + protoexp->tid_to_cyc_max = us_2_cycles((uint64_t) tvals[1]); + protoexp->tid_to_intr = tvals[2]; + _IPATH_PRDBG("Tid control message settings: timeout min=%dus/max=%dus, " + "interrupt when trying attempt #%d\n", + tvals[0], tvals[1], tvals[2]); + } + + /* + * Make sure that the rendezvous window size settings are not larger than + * the largest packet we can put on the wire. + */ + { + uint32_t winsize = protoexp->proto->mq->ipath_window_rv; + + if (winsize < ips_tid_page_size(&protoexp->tidc)) { + _IPATH_INFO("Overriding request for rndv window size %d " + "to minimum supported value %d bytes\n", + winsize, ips_tid_page_size(&protoexp->tidc)); + protoexp->proto->mq->ipath_window_rv = + ips_tid_page_size(&protoexp->tidc); + } + else { /* Figure out maximum supportable value assuming we can + * send a maxmium payload of 2048 bytes */ + int maxtids = 0; + + while (PSMI_ALIGNUP((sizeof(ips_tid_session_list) + + ((maxtids+1) * sizeof(ips_tid_session_member))), 4) + < IPS_PROTOEXP_MIN_MTU) + { + maxtids++; + } + + /* Assume worse-case alignment when deriving the amount of tids, + * need one tid for bad page-alignment and another for spillover + * into last page */ + winsize = (maxtids-2) * ips_tid_page_size(&protoexp->tidc); + + if (protoexp->proto->mq->ipath_window_rv > winsize) { + _IPATH_INFO("Overriding request for rndv window size %d " + "to maximum supported value %d bytes\n", + protoexp->proto->mq->ipath_window_rv, + winsize); + protoexp->proto->mq->ipath_window_rv = winsize; + } + } + } + + /* + * Allow setting of PSM_TID_MIN_EXPSEND, the minimum amount of expected + * send packets we send before checking the receive queue. 
+ */ + { + union psmi_envvar_val env_mincnt; + + psmi_getenv("PSM_TID_MIN_EXPSEND", + "Min expsend pkt cnt before recv", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 3, &env_mincnt); + protoexp->tid_min_expsend_cnt = env_mincnt.e_uint; + } + + /* Timers to handle requeueing of work out of the receive path */ + psmi_timer_entry_init(&protoexp->timer_send, + ips_tid_pendsend_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_sendq); + psmi_timer_entry_init(&protoexp->timer_getreqs, + ips_tid_pendtids_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_getreqsq); + + protoexp->tid_page_offset_mask = + (uint32_t) context->base_info.spi_tid_maxsize - 1; + protoexp->tid_page_mask = + ~((uint64_t) context->base_info.spi_tid_maxsize - 1); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { + protoexp->tid_info = (struct ips_tidinfo *) + psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS, + sizeof (struct ips_tidinfo)); + if (protoexp->tid_info == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + else + protoexp->tid_info = NULL; + + psmi_assert(err == PSM_OK); + return err; + +fail: + if (protoexp != NULL && protoexp->tid_getreq_pool != NULL) + psmi_mpool_destroy(protoexp->tid_getreq_pool); + if (protoexp != NULL && protoexp->tid_desc_recv_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_recv_pool); + if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + if (protoexp != NULL) + ips_scbctrl_fini(&protoexp->tid_scbc_rv); + if (protoexp != NULL) + psmi_free(protoexp); + return err; +} + +psm_error_t +ips_protoexp_fini(struct ips_protoexp *protoexp) +{ + psm_error_t err = PSM_OK; + + psmi_mpool_destroy(protoexp->tid_getreq_pool); + psmi_mpool_destroy(protoexp->tid_desc_recv_pool); + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + + if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv))) + goto fail; + + if ((err = ips_tid_fini(&protoexp->tidc))) + goto fail; + + if ((err = ips_tf_fini(&protoexp->tfctrl))) + goto fail; + + _IPATH_PRDBG("Tid control resends: tid_grant=%lld,tid_release=%lld," + "request_intr=%lld\n", + (long long) protoexp->tid_grant_resends, + (long long) protoexp->tid_release_resends, + (long long) protoexp->tid_intr_reqs); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) + psmi_free(protoexp->tid_info); + + psmi_free(protoexp); + +fail: + return err; +} + +/* New scbs now available. If we have pending sends because we were out of + * scbs, put the pendq on the timerq so it can be processed. */ +static +void +ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) context; + + if (!STAILQ_EMPTY(&protoexp->pend_sendq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + return; +} + +/* New Tid Flows are available. If there are pending get requests put the + * get timer on the timerq so it can be processed. */ +static +void +ips_tid_flowavail_callback(struct ips_tfctrl *tfctrl, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) context; + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + return; +} + +/* + * The tid get request is always issued from within the receive progress loop, + * which is why we always enqueue the request instead of issuing it directly. 
+ * Eventually, if we expose tid_get to users, we will want to differentiate + * when the request comes from the receive progress loop from cases where the + * tid_get is issued directly from user code. + * + */ +psm_error_t +ips_protoexp_tid_get_from_token( + struct ips_protoexp *protoexp, + void *buf, + uint32_t length, + psm_epaddr_t epaddr, + uint32_t remote_tok, + uint32_t flags, + ips_tid_completion_callback_t callback, + void *context) +{ + struct ips_tid_get_request *getreq; + int count, fragsize; + + getreq = (struct ips_tid_get_request *) + psmi_mpool_get(protoexp->tid_getreq_pool); + + /* We can't *really* run out of these here because we always allocate as + * much as available receive reqs */ + if_pf (getreq == NULL) + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Ran out of 'getreq' descriptors"); + + getreq->tidgr_protoexp = protoexp; + getreq->tidgr_epaddr = epaddr; + getreq->tidgr_lbuf = buf; + getreq->tidgr_length = length; + getreq->tidgr_sendtoken = remote_tok; + getreq->tidgr_ucontext = context; + getreq->tidgr_callback = callback; + getreq->tidgr_offset = 0; + getreq->tidgr_bytesdone = 0; + getreq->tidgr_desc_seqno= 0; + getreq->tidgr_flags = flags; + + /* nsconn is the # of slave channels. */ + /* fragsize is the bytes each channel should transfer. */ + count = epaddr->mctxt_master->mctxt_nsconn; + fragsize = (length+count)/(count+1); + if (fragsize < 4096) fragsize = 4096; + getreq->tidgr_rndv_winsz= min(fragsize, epaddr->ep->mq->ipath_window_rv); + + STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); + if (ips_tid_num_available(&protoexp->tidc) >= + ips_tid_num_required(&protoexp->tidc, (void *) NULL, + getreq->tidgr_rndv_winsz)) + ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); + else + psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + return PSM_OK; +} + +/* List of perf events */ +#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */ + +#define ips_logevent_id(event) _ips_logeventid_ ## event +#define ips_logevent(proto, event,ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr) + +static +void +ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) +{ + uint64_t t_now = get_cycles(); + + switch (eventid) { + case ips_logevent_id(tid_send_reqs): { + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) context; + proto->psmi_logevent_tid_send_reqs.count++; + + if (t_now >= proto->psmi_logevent_tid_send_reqs.next_warning) { + psmi_handle_error(PSMI_EP_LOGEVENT, PSM_OK, + "Non-fatal temporary exhaustion of send tid dma descriptors " + "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)", + (double) cycles_to_nanosecs(t_now - ipsaddr->proto->t_init) / 1.0e9, + (int) psm_epid_nid(ipsaddr->epaddr->epid), + (int) psm_epid_context(ipsaddr->epaddr->epid), + (long long) proto->psmi_logevent_tid_send_reqs.count); + proto->psmi_logevent_tid_send_reqs.next_warning = t_now + + sec_2_cycles(proto->psmi_logevent_tid_send_reqs.interval_secs); + } + } + break; + + default: + break; + } + + return; +} + +/* + * Expected Protocol. + * + * We're granted tids (as part of a tid get request) and expected to fulfill + * the request by associating the request's sendtoken to a tid send descriptor. + * + * It's possible to be out of tid send descriptors when somehow all allocated + * descriptors can't complete all of their sends. For example, the targets of + * the sends may be busy in computation loops and not processing incoming + * packets. 
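+ *
+ * Pieced together from the handlers and opcodes in this file, the overall
+ * exchange looks roughly like this (a sketch, not a normative trace):
+ *
+ *   receiver --OPCODE_TIDS_GRANT (tid list)-------> ips_protoexp_tid_grant()
+ *   receiver <-OPCODE_TIDS_GRANT_ACK--------------- sender
+ *   receiver <-OPCODE_SEQ_MQ_EXPTID data packets--- sender
+ *   receiver <-OPCODE_SEQ_MQ_EXPTID_UNALIGNED------ sender (if misaligned)
+ *   receiver <-OPCODE_TIDS_RELEASE----------------- sender
+ *   receiver --OPCODE_TIDS_RELEASE_CONFIRM--------> ips_protoexp_tid_release_ack()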
+ */ + +void __fastpath +ips_protoexp_tid_grant(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_tid_session_list *tid_list; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + uint32_t paylen, msglen; + uint32_t reqidx; + psmi_seqnum_t flowgenseq; + psm_error_t err = PSM_OK; + psm_mq_req_t req; + ptl_arg_t args[3]; + uint8_t index, seqno; + + paylen = ips_recvhdrq_event_paylen(rcv_ev); + tid_list = (ips_tid_session_list *) ips_recvhdrq_event_payload(rcv_ev); + reqidx = p_hdr->data[0].u32w0; + msglen = p_hdr->data[0].u32w1; + flowgenseq.val = p_hdr->data[1].u32w0; + + /* Increment grant received stats for endpoint */ + ipsaddr->stats.tids_grant_recv++; + index = tid_list->tsess_seqno % sizeof(req->tid_grant); + seqno = tid_list->tsess_seqno / sizeof(req->tid_grant); + + req = psmi_mpool_find_obj_by_index(protoexp->tid_sreq_pool, reqidx); + + if (req) { + _IPATH_VDBG("req=%p (%d) wait=%s req_seqno=%d pkt_len=%d, seqno=%d, msglen=%d\n", + req, reqidx, req->type & MQE_TYPE_WAITING ? "yes" : "no", + req->recv_msgoff, paylen, tid_list->tsess_seqno, msglen); + } + + /* We use recv_msgoff to track the latest receive sequence number */ + + if (req == NULL) { + /* Not found, bogus req, ack it anyway */ + } + else if (seqno < req->tid_grant[index]) { + /* dupe, ack it */ + } + else if (seqno > req->tid_grant[index]) { + /* lost tidreq, wait for rexmit */ + /* XXX count this to see if it's worth handling instead of dropping */ + goto no_ack; + } + else { + req->tid_grant[index]++; + /* Safe to keep updating every time */ + req->send_msglen = msglen; + if ((err = ips_tid_send_handle_tidreq(protoexp, req, msglen, 0, ipsaddr, flowgenseq, tid_list, paylen)) != PSM_OK) + { + ips_logevent(rcv_ev->proto, tid_send_reqs, ipsaddr); + /* Out of send reqs, wait for rexmit */ + goto no_ack; + } + req->recv_msgoff = tid_list->tsess_seqno + 1; + rcv_ev->proto->psmi_logevent_tid_send_reqs.next_warning = 0; + } + + /* At this point we can ack the request */ + args[0] = tid_list->tsess_descid; + + ips_proto_send_ctrl_message(&ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_GRANT_ACK, + &ipsaddr->ctrl_msg_queued, args); + +no_ack: + return; +} + +void __fastpath +ips_protoexp_tid_grant_ack(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_tid_recv_desc *tidrecvc; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) /* dupe or gone, drop it */ + return; + + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + _IPATH_VDBG("desc_req:id=%d,gen=%d desc_tidc:id=%d,gen=%d\n", + desc_id._desc_idx, desc_id._desc_genc, + desc_tidrecvc._desc_idx, desc_tidrecvc._desc_genc); + + if (desc_tidrecvc.u64 == desc_id.u64 && + tidrecvc->state == TIDRECVC_STATE_GRANT) + { + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + tidrecvc->state = TIDRECVC_STATE_GRANT_ACK; + } + return; +} + +void +__fastpath +ips_protoexp_recv_unaligned_data(struct ips_recvhdrq_event *rcv_ev) +{ + + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ptl_epaddr *ipsaddr = rcv_ev->ipsaddr; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc 
*tidrecvc; + ptl_arg_t desc_id = rcv_ev->p_hdr->data[0]; + int i; + uint8_t *byte_index = (uint8_t *) &p_hdr->data[1].u32w0; + uint8_t *buffer; + + if (!ips_proto_is_expected_or_nak(rcv_ev)) goto process_ack; + + psmi_assert(p_hdr->flags & (IPS_SEND_FLAG_UNALIGNED_DATA | IPS_SEND_FLAG_ACK_REQ)); + + tid_recv_sessid = desc_id._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + tid_recv_sessid); + + if_pf (tidrecvc == NULL) { + _IPATH_ERROR("No tidrecv session with index %d\n", + tid_recv_sessid); + goto process_ack; + } + + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) { + _IPATH_ERROR("Expected packet to tid session %d, now %d instead " + "of %d; skipping\n", tid_recv_sessid, + psmi_mpool_get_obj_gen_count(tidrecvc), + desc_id._desc_genc); + goto process_ack; /* skip */ + } + + psmi_assert(p_hdr->hdr_dlen == + (tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end)); + + /* Cancel tid grant timer (if still active) */ + if (tidrecvc->num_recv_hdrs++ == 0) + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + + buffer = tidrecvc->buffer; + for (i = 0; i < tidrecvc->tid_list.tsess_unaligned_start; i++) + *buffer++ = *byte_index++; + + buffer = + (uint8_t *) tidrecvc->buffer + tidrecvc->recv_msglen - + tidrecvc->tid_list.tsess_unaligned_end; + byte_index = (uint8_t *)&p_hdr->data[1].u32w1; + + for (i = 0; i < tidrecvc->tid_list.tsess_unaligned_end; i++) + *buffer++ = *byte_index++; + + /* If packet has checksum for window cache it */ + if (p_hdr->flags & IPS_SEND_FLAG_HAS_CKSUM) { + uint32_t *cksum = (uint32_t*) ips_recvhdrq_event_payload(rcv_ev); + + psmi_assert_always(protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM); + psmi_assert_always(ips_recvhdrq_event_payload(rcv_ev)); + psmi_assert_always(ips_recvhdrq_event_paylen(rcv_ev)); + tidrecvc->cksum = *cksum; + } + +process_ack: + ips_proto_process_ack(rcv_ev); + /* May require ACK for this packet. 
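+ * The flow to ACK on is taken from the packet header itself, via
+ * ips_proto_flowid(p_hdr) in the call below.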
*/
+    if (p_hdr->flags & IPS_SEND_FLAG_ACK_REQ)
+        ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq,
+                           &ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+    return;
+}
+
+void
+__fastpath
+ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
+{
+    struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+    struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+    uint32_t tid_recv_sessid;
+    struct ips_tid_recv_desc *tidrecvc;
+    ptl_arg_t desc_id = rcv_ev->p_hdr->data[0];
+    ptl_arg_t send_descid = rcv_ev->p_hdr->data[1];
+    uint32_t paylen;
+    psmi_seqnum_t sequence_num, expected_sequence_num;
+    uint32_t has_hw_hdrsupp = (protoexp->ptl->context->runtime_flags & IPATH_RUNTIME_HDRSUPP);
+    ptl_arg_t args[3];
+
+    paylen = ips_recvhdrq_event_paylen(rcv_ev);
+    tid_recv_sessid = desc_id._desc_idx;
+    tidrecvc =
+        psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+                                     tid_recv_sessid);
+
+    if_pf (tidrecvc == NULL) {
+        _IPATH_ERROR("No tidrecv session with index %d\n",
+                     tid_recv_sessid);
+        return;
+    }
+
+    if_pf (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER && paylen != 0) {
+        _IPATH_ERROR("Expected packet, but eager index is set; skipping\n");
+        return;
+    }
+
+    if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) {
+        _IPATH_ERROR("Expected packet to tid session %d, now %d instead "
+                     "of %d; skipping\n", tid_recv_sessid,
+                     psmi_mpool_get_obj_gen_count(tidrecvc),
+                     desc_id._desc_genc);
+        return; /* skip */
+    }
+
+    sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+    expected_sequence_num = tidrecvc->tidflow_genseq;
+
+    /* On QLE73XX this is only called if data was fully received or the ACK
+     * interval was reached; otherwise the gen/seq error handlers are called
+     * from ips_proto_recv.
+     */
+    if (has_hw_hdrsupp) {
+
+        /* Drop packet if generation number does not match */
+        if (expected_sequence_num.gen != sequence_num.gen)
+            return;
+
+        /* Increment the expected sequence number taking into account the
+         * number of headers that were suppressed.
+         */
+        expected_sequence_num.seq += (protoexp->hdr_pkt_interval - 1);
+
+        /* Special case for the last packet, as it may be less than the
+         * interval. */
+        if (p_hdr->flags & IPS_SEND_FLAG_EXPECTED_DONE)
+            expected_sequence_num = sequence_num;
+
+        /* TIDFLOW will restart in the if block below */
+        if_pf (sequence_num.psn != expected_sequence_num.psn) {
+            _IPATH_EPDBG("Expected: Packet PSN %d received but expecting %d. Restarting flow.\n", sequence_num.psn, expected_sequence_num.psn);
+        }
+
+    }
+
+    /* IBTA CCA handling for expected flow.
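+     * A rough summary of the block below: an expected packet that arrives
+     * with the FECN event set marks the receiver's tidgr_flow with
+     * IPS_FLOW_FLAG_GEN_BECN, so a subsequent control packet (e.g. the ACK
+     * at the end of this function) carries a BECN back to the sender; the
+     * congestion_pkts counter is bumped for stats and the FECN event is
+     * cleared.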
*/ + if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) { + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + /* Update stats for congestion encountered */ + rcv_ev->ipsaddr->stats.congestion_pkts++; + /* Clear FECN event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + } + + if_pf (sequence_num.psn != expected_sequence_num.psn) { + psmi_assert(sequence_num.flow == tidrecvc->tidflow_idx); + psmi_assert(sequence_num.flow == tidrecvc->tidflow_genseq.flow); + + /* Generation mismatch */ + if (sequence_num.gen != tidrecvc->tidflow_genseq.gen) + return ips_protoexp_handle_tf_generr(rcv_ev); + + /* Sequence mismatch error */ + return ips_protoexp_handle_tf_seqerr(rcv_ev); + } + else { + + /* Update the shadow tidflow_genseq */ + tidrecvc->tidflow_genseq.seq = sequence_num.seq + 1; + + /* On QLE71XX/QLE72XX update tidflow table in software */ + if (!has_hw_hdrsupp) + ipath_tidflow_set_entry(tidrecvc->context->ctrl, + tidrecvc->tidflow_idx, + tidrecvc->tidflow_genseq.gen, + tidrecvc->tidflow_genseq.seq); + + /* Reset the swapped generation count as we received a valid packet */ + tidrecvc->tidflow_nswap_gen = 0; + } + + /* Do some sanity checking */ + psmi_assert_always(((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3) == 0); + psmi_assert_always(tidrecvc->state != TIDRECVC_STATE_DONE); + + /* If first packet received cancel tid grant timer */ + if (tidrecvc->num_recv_hdrs++ == 0) + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + + /* If last packet we can close the tidflow. + * We can deallocate tidflow even if the unaligned data has not been + * received. The TID_RELEASE message will deallocate the receive + * descriptor. + * + * Note: If we were out of tidflows this will invoke the callback to + * schedule pending transfers. + */ + + if (p_hdr->flags & IPS_SEND_FLAG_EXPECTED_DONE) { + + psm_error_t ret = PSM_OK; + + /* Acquire lock before updating state (ERR_CHK_GEN also tests for + * state before responding. + */ + + ips_ptladdr_lock(rcv_ev->ipsaddr); + + /* Mark receive as done */ + tidrecvc->state = TIDRECVC_STATE_DONE; + + ret = ips_tf_deallocate(&protoexp->tfctrl, + tidrecvc->tidflow_idx); + psmi_assert_always (ret == PSM_OK); + + /* Release lock */ + ips_ptladdr_unlock(rcv_ev->ipsaddr); + } + + /* Respond with an ACK if sender requested one or incoming flow faced + * congestion. The ACK in this case will have the BECN bit set. + */ + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (tidrecvc->ipsaddr->tidgr_flow.flags & IPS_FLOW_FLAG_GEN_BECN)) { + + /* Ack sender with descriptor index */ + args[0] = send_descid; + args[1] = tidrecvc->tid_list.tsess_descid; + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + OPCODE_ACK, + &tidrecvc->ctrl_msg_queued, args); + } + + return; +} + +#ifndef PSM_DEBUG +# define ips_dump_tids(tid_list,msg,...) +#else +static +void +ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...) +{ + char buf[256]; + size_t off = 0; + int i, num_tids = tid_list->tsess_tidcount; + + va_list argptr; + va_start(argptr, msg); + off += vsnprintf(buf, sizeof buf - off, msg, argptr); + va_end(argptr); + + for (i = 0; i < num_tids && off < (sizeof buf - 1); i++) + off += snprintf(buf + off, sizeof buf - off, "%d%s", + (int) tid_list->tsess_list[i].tid, + i < num_tids-1 ? 
"," : ""); + + _IPATH_VDBG("%s\n", buf); + return; +} +#endif + +static +void +ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc) +{ + char buf[256]; + size_t off = 0; + int i; + + off += snprintf(buf + off, sizeof buf - off, + "Remaining bytes: %d Member id %d is not in tid_session_id=%d :", tidsendc->remaining_bytes, tidsendc->tid_idx, + tidsendc->tid_list.tsess_descid._desc_idx); + + for (i = 0; i < tidsendc->tid_list.tsess_tidcount+1; i++) + off += snprintf(buf + off, sizeof buf - off, "%d,", + tidsendc->tid_list.tsess_list[i].tid); + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Trying to use tid idx %d and there are %d members: %s\n", + tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount, buf); + return; +} + +void +ips_protoexp_scb_inflight(ips_scb_t *scb) +{ + if (scb->tidsendc) + scb->tidsendc->iovec_cntr_last = scb->dma_ctr; + return; +} + +static +void __fastpath +ips_tid_send_tid_release_msg(struct ips_tid_send_desc *tidsendc) +{ + psm_error_t err; + struct ips_protoexp *protoexp = tidsendc->protoexp; + psm_mq_req_t req = tidsendc->mqreq; + ptl_arg_t desc_id[3] = {}; + uint64_t t_cyc; + + desc_id[0] = tidsendc->tid_list.tsess_descid; + desc_id[1] = tidsendc->descid; + desc_id[2].u32w0 = tidsendc->release_cnt; + + err = ips_proto_send_ctrl_message(&tidsendc->ipsaddr-> + flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE, + &tidsendc->ctrl_msg_queued, + desc_id); + + if (err != PSM_EP_NO_RESOURCES) { + tidsendc->release_cnt++; + t_cyc = get_cycles() + protoexp->tid_to_cyc_min; + } + else + t_cyc = get_cycles() + protoexp->proto->timeout_send; + + psmi_timer_request_always(protoexp->timerq, &tidsendc->timer_tidrelease, + t_cyc); + + req->send_msgoff += tidsendc->length; + + _IPATH_VDBG("[rndv][send] tid chunk of size %d done %d/%d for req=%p%s\n", + tidsendc->length, req->send_msgoff, req->send_msglen, req, + req->send_msgoff == req->send_msglen ? 
" (complete)" : ""); + + if (req->send_msgoff == req->send_msglen) + psmi_mq_handle_rts_complete(req); +} + +static +int __fastpath +ips_tid_send_completion_unaligned_callback(void * param, uint32_t nbytes) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *) param; + + /* Decrement completion counter and complete if unaligned data sent */ + tidsendc->completion_counter--; + + psmi_assert(tidsendc->completion_counter >= 0); + + if (tidsendc->completion_counter == 0) + ips_tid_send_tid_release_msg(tidsendc); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +int __fastpath +ips_tid_send_completion_callback(void * param, uint32_t nbytes) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *) param; + struct ips_protoexp *protoexp = tidsendc->protoexp; + + if (protoexp->tid_xfer_type == PSM_TRANSFER_DMA) + ips_proto_dma_wait_until(protoexp->proto, tidsendc->iovec_cntr_last); + + if (tidsendc->bounce_buf) psmi_free(tidsendc->bounce_buf); + + /* Decrement completion counter and complete if unaligned data sent */ + tidsendc->completion_counter--; + + psmi_assert(tidsendc->completion_counter >= 0); + + if (tidsendc->completion_counter == 0) + ips_tid_send_tid_release_msg(tidsendc); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +psm_error_t __fastpath +ips_tid_release_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_send_desc *tidsendc = + (struct ips_tid_send_desc *) timer->context; + struct ips_protoexp *protoexp = tidsendc->protoexp; + uint64_t t_cyc; + psm_error_t err; + ptl_arg_t desc_id[3] = {}; + + /* 0 contain's the receiver's desc_id, 1 contains the sender's desc_id */ + desc_id[0] = tidsendc->tid_list.tsess_descid; + desc_id[1] = tidsendc->descid; + desc_id[2].u32w0 = tidsendc->release_cnt; + + err = ips_proto_send_ctrl_message(&tidsendc->ipsaddr-> + flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE, + &tidsendc->ctrl_msg_queued, + desc_id); + + if (err == PSM_EP_NO_RESOURCES) { + t_cyc = get_cycles() + protoexp->proto->timeout_send; + } + else { + tidsendc->release_cnt++; + protoexp->tid_release_resends++; + t_cyc = get_cycles() + + min(tidsendc->release_cnt * protoexp->tid_to_cyc_min, + protoexp->tid_to_cyc_max); + } + + psmi_timer_request_always(protoexp->timerq, + &tidsendc->timer_tidrelease, + t_cyc); + + return PSM_OK; +} + +static +psm_error_t __fastpath +ips_tid_grant_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_recv_desc *tidrecvc = + (struct ips_tid_recv_desc *) timer->context; + struct ips_protoexp *protoexp = tidrecvc->protoexp; + ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; + psm_error_t err; + uint64_t t_cyc; + + err = ips_proto_send_ctrl_message(&ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_GRANT, + &tidrecvc->ctrl_msg_queued, + &tidrecvc->tid_list); + + if (err == PSM_EP_NO_RESOURCES) { + t_cyc = get_cycles() + protoexp->proto->timeout_send; + } + else { + tidrecvc->grant_cnt++; + protoexp->tid_grant_resends++; + t_cyc = get_cycles() + + min(tidrecvc->grant_cnt * protoexp->tid_to_cyc_min, + protoexp->tid_to_cyc_max); + } + + psmi_timer_request_always(protoexp->timerq, timer, t_cyc); + + return PSM_OK; +} + +static +__fastpath +psm_error_t +ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, + psm_mq_req_t req, uint32_t msglen, + int flags, ptl_epaddr_t *ipsaddr, + psmi_seqnum_t flowgenseq, + ips_tid_session_list *tid_list, + uint32_t tid_list_size) +{ + struct ips_tid_send_desc *tidsendc; + req->send_msglen = msglen; + + psmi_assert(tid_list_size >= 
                sizeof(ips_tid_session_list));
+    psmi_assert(tid_list_size <= 2096);
+
+    tidsendc = (struct ips_tid_send_desc *)
+        psmi_mpool_get(protoexp->tid_desc_send_pool);
+    if (tidsendc == NULL)
+        return PSM_EP_NO_RESOURCES;
+
+    tidsendc->protoexp = protoexp;
+
+    /* Uniquely identify this send descriptor in space and time */
+    tidsendc->descid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+    tidsendc->descid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+
+    psmi_mq_mtucpy(&tidsendc->tid_list, tid_list, tid_list_size);
+    tid_list = &tidsendc->tid_list;
+
+    tidsendc->length = tid_list->tsess_length;
+    tidsendc->ipsaddr = ipsaddr;
+    tidsendc->mqreq = req;
+    tidsendc->bounce_buf = NULL;
+    tidsendc->buffer =
+        (void *)((uintptr_t)req->buf + tid_list->tsess_srcoff);
+    tidsendc->tid_idx = 0;
+    tidsendc->is_complete = 0;
+    tidsendc->release_cnt = 0;
+
+    /* Initialize tidflow for window. Use path requested by remote endpoint */
+    ips_flow_init(&tidsendc->tidflow, NULL, ipsaddr, protoexp->tid_xfer_type,
+                  PSM_PROTOCOL_TIDFLOW, IPS_PATH_LOW_PRIORITY, 0);
+
+    tidsendc->tidflow.xmit_seq_num = flowgenseq;
+    tidsendc->tidflow.xmit_ack_num = flowgenseq;
+    tidsendc->tidflow.xmit_ack_num.seq--; /* last acked */
+    tidsendc->ctrl_msg_queued = 0;
+    tidsendc->completion_counter = 1;
+
+    /* If there is unaligned data we will need to send a separate packet
+     * containing the unaligned data.
+     */
+    if ((tidsendc->tid_list.tsess_unaligned_start) ||
+        (tidsendc->tid_list.tsess_unaligned_end) ||
+        (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM))
+        tidsendc->completion_counter += 1;
+
+    if (tid_list->tsess_tidcount == 0) {
+        _IPATH_VDBG("no tids used, alloc eager tid\n");
+        tid_list->tsess_list[0].tid = IPATH_EAGER_TID_ID;
+        tid_list->tsess_list[0].length = 0;
+        tid_list->tsess_list[0].offset = 0;
+    }
+
+    tidsendc->frame_send = 0;
+    tidsendc->remaining_bytes = tid_list->tsess_length;
+    tidsendc->remaining_bytes_in_page =
+        tid_list->tsess_list[0].length;
+    tidsendc->offset = tid_list->tsess_list[0].offset;
+    tidsendc->unaligned_sent = 0;
+
+    psmi_timer_entry_init(&tidsendc->timer_tidrelease,
+                          ips_tid_release_timer_callback, tidsendc);
+
+    _IPATH_EXP("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n",
+               tidsendc->descid._desc_idx, tid_list->tsess_descid._desc_idx,
+               tid_list->tsess_srcoff, tid_list->tsess_length,
+               tid_list->tsess_unaligned_start,
+               tid_list->tsess_unaligned_end
+               );
+
+    /* We have no tids, we're expected to stuff everything in user
+     * header words, so mark it as an eager packet */
+    if (tid_list->tsess_tidcount > 0) {
+        ips_dump_tids(&tidsendc->tid_list,
+                      "Received %d tids: ", tidsendc->tid_list.tsess_tidcount);
+    }
+
+    /* Add as a pending op and ring up the timer */
+    STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+    psmi_timer_request(protoexp->timerq, &protoexp->timer_send, PSMI_TIMER_PRIO_1);
+
+    /* Consider breaking out of progress engine here */
+    return PSM_OK;
+}
+
+void __fastpath
+ips_protoexp_tid_release_ack(const struct ips_recvhdrq_event *rcv_ev)
+{
+    struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+    struct ips_tid_send_desc *tidsendc;
+    ptl_arg_t desc_id = rcv_ev->p_hdr->data[1];
+
+    tidsendc = (struct ips_tid_send_desc *)
+        psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+                                     desc_id._desc_idx);
+    _IPATH_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc);
+    if (tidsendc == NULL) {
+        _IPATH_ERROR("OPCODE_TIDS_RELEASE_CONFIRM ERROR: Index %d is out of range\n",
+                     desc_id._desc_idx);
+    }
+    else {
+        ptl_arg_t desc_tidsendc;
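+
+        /* A sketch of the liveness check that follows: repack this
+         * descriptor's current {index, generation} into a ptl_arg and
+         * compare it, as one 64-bit value, against the id echoed back by
+         * the peer; a stale (since reused) descriptor differs in the
+         * generation half and is ignored. */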
+        psmi_mpool_get_obj_index_gen_count(tidsendc,
+                                           &desc_tidsendc._desc_idx,
+                                           &desc_tidsendc._desc_genc);
+
+        _IPATH_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n",
+                    desc_id._desc_idx, desc_id._desc_genc,
+                    desc_tidsendc._desc_idx, desc_tidsendc._desc_genc);
+
+        /* See if the reference is still live and valid */
+        if (desc_tidsendc.u64 == desc_id.u64) {
+            psmi_timer_cancel(protoexp->timerq, &tidsendc->timer_tidrelease);
+            psmi_timer_cancel(rcv_ev->proto->timerq,
+                              &tidsendc->tidflow.timer_send);
+            psmi_timer_cancel(rcv_ev->proto->timerq,
+                              &tidsendc->tidflow.timer_ack);
+            psmi_mpool_put(tidsendc);
+        }
+    }
+    return;
+}
+
+static
+psm_error_t __fastpath
+ips_scb_send_unaligned_data(ips_scb_t *scb)
+{
+    struct ips_tid_send_desc *tidsendc = scb->tidsendc;
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    uint8_t *bufptr = tidsendc->buffer;
+    int frame_extra, i;
+    uint8_t *packptr;
+    uint8_t *unptr_beg = bufptr;
+    uint8_t *unptr_end = bufptr + tidsendc->length -
+        tidsendc->tid_list.tsess_unaligned_end;
+    struct ips_flow *flow = &tidsendc->ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+
+    psmi_assert(tidsendc->tid_idx == 0);
+
+    /* arg[0] is recv descriptor id */
+    scb->ips_lrh.data[0] = tidsendc->tid_list.tsess_descid;
+
+    if (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM) {
+        uint32_t cksum = 0xffffffff;
+
+        if (!ips_scbctrl_bufalloc(scb)) {
+            ips_scbctrl_free(scb);
+            return PSM_EP_NO_RESOURCES;
+        }
+
+        cksum = ips_crc_calculate(tidsendc->length,
+                                  (uint8_t*) tidsendc->buffer, cksum);
+        *(uint32_t*) ips_scb_buffer(scb) = cksum;
+        ips_scb_length(scb) = sizeof(cksum);
+        scb->flags |= IPS_SEND_FLAG_HAS_CKSUM;
+    }
+
+    // Make sure not to over-read the unaligned buffer
+    packptr = (uint8_t *)&scb->ips_lrh.data[1].u32w0;
+    for (i = 0; i < tidsendc->tid_list.tsess_unaligned_start; i++)
+        packptr[i] = unptr_beg[i];
+
+    packptr = (uint8_t *)&scb->ips_lrh.data[1].u32w1;
+    for (i = 0; i < tidsendc->tid_list.tsess_unaligned_end; i++)
+        packptr[i] = unptr_end[i];
+
+    ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_EXPTID_UNALIGNED;
+    ips_scb_hdr_dlen(scb) = tidsendc->tid_list.tsess_unaligned_start +
+        tidsendc->tid_list.tsess_unaligned_end;
+
+    ips_scb_cb(scb) = ips_tid_send_completion_unaligned_callback;
+    ips_scb_cb_param(scb) = tidsendc;
+    scb->flags |= IPS_SEND_FLAG_UNALIGNED_DATA | IPS_SEND_FLAG_ACK_REQ;
+
+    bufptr += tidsendc->tid_list.tsess_unaligned_start;
+    frame_extra = tidsendc->tid_list.tsess_unaligned_start +
+        tidsendc->tid_list.tsess_unaligned_end;
+
+    tidsendc->remaining_bytes -= frame_extra;
+
+    tidsendc->buffer = bufptr;
+
+    /* Enqueue scb on the flow and flush */
+    flow->fn.xfer.enqueue(flow, scb);
+    flow->fn.xfer.flush(flow, NULL);
+
+    return PSM_OK;
+}
+
+static
+ips_scb_t * __fastpath
+ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
+                             struct ips_tid_send_desc *tidsendc)
+{
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    uint8_t *bufptr = tidsendc->buffer;
+    uint16_t frame_len, frag_size, nfrag;
+    int payload_size, idx;
+    ips_scb_t *scb;
+
+    if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL)
+        return NULL;
+
+    /*
+     * Expected sends require 4-byte alignment, so we stuff whatever
+     * misalignment into the header's available user bytes.
+     *
+     * In the current interface, misalignment can only occur at the
+     * start or end of the packet, so we handle it as a special packet
+     * before the first packet can be sent off.
+     *
+     * If checksum is enabled we send the checksum for the send window
+     * within/as an unaligned packet as well.
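+     *
+     * Worked example (assumed address/length): sending 4101 bytes from an
+     * address with (addr & 3) == 1 gives tsess_unaligned_start = 3 and
+     * tsess_unaligned_end = (4101 - 3) & 3 = 2; those 5 bytes travel in
+     * data[1].u32w0/u32w1 of the UNALIGNED packet's header, leaving a
+     * 4096-byte, 4-byte-aligned payload for the tid path.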
+ */ + + if (tidsendc->length && + (tidsendc->tid_list.tsess_unaligned_start || + tidsendc->tid_list.tsess_unaligned_end || + (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM)) && + !(tidsendc->unaligned_sent)) { + + /* Send unaligned data separately over ipsaddr->flow. Completion over + * both flows is synchronized to generate TIDS_RELEASE. The receive will + * only finish when tid release is received. */ + scb->tidsendc = tidsendc; + if (ips_scb_send_unaligned_data(scb) != PSM_OK) + return NULL; + + /* Sent unaligned data */ + tidsendc->unaligned_sent = 1; + + + /* Buffer may have been updated (unaligned start) */ + bufptr = tidsendc->buffer; + + /* Try to obtain another scb after sending unaligned data */ + if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL) + return NULL; + } + + if ((uintptr_t)bufptr & 0x3) { + bufptr = psmi_malloc(protoexp->proto->ep, + UNDEFINED, tidsendc->remaining_bytes); + if (!bufptr) { + ips_scbctrl_free(scb); + return NULL; + } + + memcpy(bufptr, tidsendc->buffer, tidsendc->remaining_bytes); + tidsendc->buffer = tidsendc->bounce_buf = bufptr; + } + + idx = tidsendc->tid_idx; + scb->tidsendc = tidsendc; + SLIST_NEXT(scb,next) = NULL; + + scb->ips_lrh.sub_opcode = OPCODE_SEQ_MQ_EXPTID; + scb->ips_lrh.data[0] = tidsendc->tid_list.tsess_descid; + scb->ips_lrh.data[1] = tidsendc->descid; + scb->tid = tidsendc->tid_list.tsess_list[idx].tid; + scb->tsess = (void *)&tidsendc->tid_list.tsess_list[idx]; + scb->offset = tidsendc->offset; + scb->payload = (void *) bufptr; + + /* + * Loop over the tid session list, count the frag number and payload size. + * The payload size is limited by the pbc.length field which is 16 bits in + * DWORD, including both message header and payload. This translates to + * less than 256K payload. So 128K is used. + */ + nfrag = 0; + payload_size = 0; + frag_size = min(protoexp->tid_send_fragsize, flow->path->epr_mtu); + frame_len = min(tidsendc->remaining_bytes_in_page, frag_size); + while (1) { + nfrag++; + payload_size += frame_len; + + /* adjust counter and pointers */ + tidsendc->remaining_bytes -= frame_len; + tidsendc->remaining_bytes_in_page -= frame_len; + tidsendc->offset += frame_len; + + if (!tidsendc->remaining_bytes_in_page) { + /* Done with this page, move on to the next tid */ + tidsendc->tid_idx++; + tidsendc->remaining_bytes_in_page = + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].length; + tidsendc->offset = + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].offset; + + /* The payload size is limited by the pbc.length field which + * is 16 bits in DWORD, including both message header and + * payload. This translates to less than 256K payload. So 128K + * is used. 
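+             *
+             * The arithmetic behind the cap: pbc.length is a 16-bit count
+             * of DWORDs, so at most 65535 * 4 = 262140 bytes including the
+             * message header; capping payload at 131072 (128K) leaves
+             * ample headroom.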
+             */
+            /* break when current page is done */
+            if (payload_size > 131072) break;
+        }
+
+#if 0
+        if (1) {
+#else
+        if (flow->transfer == PSM_TRANSFER_PIO) {
+#endif
+            break; /* turn on to use single frag-size packet */
+        }
+
+        if (!tidsendc->remaining_bytes) break;
+        frame_len = min(tidsendc->remaining_bytes_in_page, frag_size);
+    }
+    scb->nfrag = nfrag;
+    scb->frag_size = frag_size;
+    scb->payload_size = payload_size;
+    scb->tsess_length = sizeof(ips_tid_session_member) *
+        (tidsendc->tid_idx - idx);
+
+    /* Keep track of the latest buffer location so we restart at the
+     * right location if we don't complete the transfer */
+    tidsendc->buffer = bufptr + payload_size;
+
+    /* If last packet, we want a completion notification */
+    if (!tidsendc->remaining_bytes) {
+        scb->flags = (IPS_SEND_FLAG_ACK_REQ | IPS_SEND_FLAG_EXPECTED_DONE);
+        scb->callback = ips_tid_send_completion_callback;
+        scb->cb_param = tidsendc;
+
+        tidsendc->is_complete = 1;
+    } else {
+        scb->flags = IPS_SEND_FLAG_HDR_SUPPRESS;
+        scb->callback = NULL;
+        scb->cb_param = NULL;
+    }
+
+#if 0
+    if (1) {
+#else
+    if (flow->transfer == PSM_TRANSFER_PIO) {
+#endif
+        /* turn on to use single frag-size packet */
+        /* Do not suppress the header every hdr_pkt_interval packets, nor for
+         * the last packet */
+        if ((++tidsendc->frame_send % protoexp->hdr_pkt_interval) == 0) {
+            scb->flags &= ~IPS_SEND_FLAG_HDR_SUPPRESS;
+            scb->flags |= IPS_SEND_FLAG_ACK_REQ; /* Request an ACK */
+        }
+    }
+
+    return scb;
+}
+
+/*
+ * Returns:
+ *
+ * PSM_OK: scb was allocated for at least one frame, the packet may be queued
+ *         or actually sent.
+ *
+ * PSM_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ *                     to be enqueued before polling the receive queue.
+ *
+ * PSM_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ *                      scbs become available.
+ *
+ * PSM_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+psm_error_t __fastpath
+ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+    ips_scb_t *scb = NULL;
+    psm_error_t err = PSM_OK, err_f;
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    struct ips_proto *proto = protoexp->proto;
+    struct ips_flow *flow = &tidsendc->tidflow;
+
+    /*
+     * We aggressively try to grab as many scbs as possible, enqueue them to a
+     * flow and flush them when either we're out of scbs or we've completely
+     * filled the send request.
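+     *
+     * The error mapping after the flush below follows the contract stated
+     * above this function: a flush that fails with PSM_EP_NO_RESOURCES
+     * (no PIO/DMA room) is surfaced as PSM_TIMEOUT so the caller
+     * reschedules, keeping PSM_EP_NO_RESOURCES reserved for running out
+     * of scbs.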
+ */ + while (!tidsendc->is_complete) + { + if_pf (tidsendc->tid_list.tsess_tidcount && + (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount || + tidsendc->tid_idx < 0) ) + ips_expsend_tiderr(tidsendc); + + if ((scb = ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) { + proto->stats.scb_exp_unavail_cnt++; + err = PSM_EP_NO_RESOURCES; + break; + } + else { + flow->fn.xfer.enqueue(flow, scb); + } + } + + if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */ + int num_sent; + err_f = flow->fn.xfer.flush(flow, &num_sent); + + if (err != PSM_EP_NO_RESOURCES) { + /* PSM_EP_NO_RESOURCES is reserved for out-of-scbs */ + if (err_f == PSM_EP_NO_RESOURCES) + err = PSM_TIMEOUT; /* force a resend reschedule */ + else if (err_f == PSM_OK && num_sent > 0 && + !ips_ptl_recvq_isempty(protoexp->ptl)) + err = PSM_OK_NO_PROGRESS; /* force a rcvhdrq service */ + } + } + + return err; +} + +static +psm_error_t __recvpath +ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) timer->context; + struct ips_tid_send_pend *phead = &protoexp->pend_sendq; + struct ips_tid_send_desc *tidsendc; + psm_error_t err = PSM_OK; + + while (!STAILQ_EMPTY(phead)) { + tidsendc = STAILQ_FIRST(phead); + + err = ips_tid_send_exp(tidsendc); + + if (tidsendc->is_complete) + STAILQ_REMOVE_HEAD(phead, next); + + if (err == PSM_OK) { + /* Was able to complete the send, keep going */ + +#if 0 + _IPATH_EXP("tidsess=%6d tid=%4d @ %3d size=%4d offset=%4d, next=%p\n", + tidsendc->descid.u32w0, + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].tid, + tidsendc->tid_idx, + tidsendc->length, + tidsendc->length - tidsendc->remaining_bytes, + STAILQ_FIRST(phead) + ); +#endif + } + else if (err == PSM_EP_NO_RESOURCES) { + /* No more sendbufs available, sendbuf callback will requeue this + * timer */ + break; + } + else if (err == PSM_TIMEOUT) { + /* Always a case of try later: + * On PIO flow, means no send pio bufs available + * On DMA flow, means kernel can't queue request or would have to block + */ + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + get_cycles() + protoexp->proto->timeout_send); + break; + } + else { + /* Forced to reschedule later so we can check receive queue */ + psmi_assert(err == PSM_OK_NO_PROGRESS); + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + break; + } + } + + return PSM_OK; +} + +// Right now, in the kernel we are allowing for virtually non-contiguous pages, +// in a single call, and we are therefore locking one page at a time, but since +// the intended use of this routine is for a single group of +// virtually contiguous pages, that should change to improve +// performance. That means possibly changing the calling MPI code. +// Doing so gets rid of some of the loop stuff here, and in the driver, +// and allows for a single call to the core VM code in the kernel, +// rather than one per page, definitely improving performance. 
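+// As a ballpark (illustrative numbers): with 4KB tid pages, a 128KB
+// rendezvous window means 32 page-at-a-time locking calls per window
+// that could, in principle, collapse into a single call covering the
+// whole virtually contiguous range.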
+
+static
+psm_error_t __fastpath
+ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
+			void *buf, uint32_t buflen,
+			ips_tid_session_list *tid_list,
+			uint64_t *ts_map)
+{
+  uint16_t unalignment;
+  uint32_t remaining_buffer_size = buflen;
+  uint32_t num_tids;
+  uint32_t num_tids_avail = ips_tid_num_available(&protoexp->tidc);
+  uint16_t tidids[IPS_TID_MAX_TIDS];
+  void *bufmap;
+  uint8_t *bufptr = (uint8_t *) buf;
+  const uint32_t page_size = ips_tid_page_size(&protoexp->tidc);
+  const uint32_t page_offset_mask = protoexp->tid_page_offset_mask;
+  int i;
+  psm_error_t err = PSM_OK;
+
+  /*
+   * The remaining_buffer_size calculation below does not work when
+   * buflen < 4 and buf is only byte-aligned: it can go negative.
+   * ips_tid_pendtids_timer_callback() therefore tries to avoid making
+   * nbytes_this (which becomes buflen here) just a few bytes.
+   */
+  if (buflen < 4) {
+    tid_list->tsess_unaligned_start = buflen;
+    tid_list->tsess_unaligned_end = 0;
+    remaining_buffer_size = 0;
+  } else {
+    tid_list->tsess_unaligned_start = unalignment =
+      ((uintptr_t) buf & 3) ? (4 - ((uintptr_t) buf & 3)) : 0;
+    remaining_buffer_size -= unalignment;
+    bufptr += unalignment;
+
+    tid_list->tsess_unaligned_end = unalignment =
+      remaining_buffer_size & 3;
+    remaining_buffer_size -= unalignment;
+  }
+
+  bufmap = bufptr;
+  psmi_assert_always(ips_tid_num_required(&protoexp->tidc, bufmap,
+			remaining_buffer_size) <= num_tids_avail);
+
+  tid_list->tsess_list[0].tid = 0;
+  tid_list->tsess_list[0].offset = 0;
+  tid_list->tsess_list[0].length = 0;
+
+  for (i = 0, num_tids = 0; remaining_buffer_size && i < num_tids_avail; i++) {
+    uint32_t page_off = (uintptr_t) bufptr & page_offset_mask;
+    uint32_t page_len = min(remaining_buffer_size, page_size - page_off);
+    tid_list->tsess_list[i].offset = page_off;
+    tid_list->tsess_list[i].length = page_len;
+    bufptr += page_len;
+    remaining_buffer_size -= page_len;
+    tidids[i] = 0; /* Ensure tidids[i] is never seen as uninitialized */
+    num_tids++;
+  }
+  psmi_assert_always(remaining_buffer_size == 0);
+
+  if (num_tids &&
+      (err = ips_tid_acquire(&protoexp->tidc,
+			     (void *) ((uintptr_t) bufmap &
+				       (uintptr_t) protoexp->tid_page_mask),
+			     num_tids, ts_map, tidids)))
+    goto fail;
+
+  tid_list->tsess_tidcount = num_tids;
+  for (i = 0; i < num_tids; i++)
+    tid_list->tsess_list[i].tid = tidids[i];
+
+  ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);
+
+fail:
+  return err;
+}
+
+static
+void
+ips_tid_mpool_tidrecv_callback(void *context)
+{
+  struct ips_protoexp *protoexp = (struct ips_protoexp *) context;
+
+  if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+    psmi_timer_request(protoexp->proto->timerq,
+		       &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+
+  return;
+}
+
+static
+__fastpath
+struct ips_tid_recv_desc *
+ips_tid_recv_alloc(struct ips_protoexp *protoexp, ips_epaddr_t *ipsaddr,
+		   const struct ips_tid_get_request *getreq, uint32_t nbytes_this)
+{
+  struct ips_tid_recv_desc *tidrecvc;
+  psm_error_t err = PSM_OK;
+
+  tidrecvc = (struct ips_tid_recv_desc *)
+    psmi_mpool_get(protoexp->tid_desc_recv_pool);
+  if (tidrecvc == NULL)
+    return NULL;
+
+  tidrecvc->context = &protoexp->proto->ep->context;
+  tidrecvc->protoexp = protoexp;
+  tidrecvc->ipsaddr = ipsaddr;
+  tidrecvc->state = TIDRECVC_STATE_GRANT;
+  tidrecvc->buffer =
+    (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset);
+  tidrecvc->num_recv_hdrs = 0;
+  tidrecvc->recv_msglen = nbytes_this;
+  tidrecvc->tid_list.tsess_tidcount = 0;
+  tidrecvc->getreq = (struct ips_tid_get_request *) getreq;
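+
+  /* The rest of the receive descriptor is initialized below: a tidflow is
+   * reserved, seeded with a random 10-bit starting sequence number, and
+   * programmed into the hardware tidflow table before any TIDs are
+   * registered for the window. */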
tidrecvc->grant_cnt = 0; + tidrecvc->recv_framecnt = 0; + tidrecvc->flags = 0; + tidrecvc->tidflow_active_gen = IPS_TF_INVALID_GENERATION; + tidrecvc->ctrl_msg_queued = 0; + tidrecvc->cksum = 0xb5b5b5b5; + tidrecvc->stats.nSeqErr = 0; + tidrecvc->stats.nGenErr = 0; + tidrecvc->stats.nReXmit = 0; + tidrecvc->stats.nErrChkReceived = 0; + + if ((err = ips_tf_allocate(&protoexp->tfctrl, + &tidrecvc->tidflow_idx, + &tidrecvc->tidflow_active_gen))){ + /* Unable to get a tidflow for expected protocol. */ + psmi_mpool_put(tidrecvc); + /* XXX log this event */ + return NULL; + } + + tidrecvc->tidflow_genseq.flow = tidrecvc->tidflow_idx; + tidrecvc->tidflow_genseq.gen = tidrecvc->tidflow_active_gen; + tidrecvc->tidflow_genseq.seq = rand_r(&protoexp->tidflow_seed) & 0x3ff; + + ipath_tidflow_set_entry(tidrecvc->context->ctrl, + tidrecvc->tidflow_genseq.flow, + tidrecvc->tidflow_genseq.gen, + tidrecvc->tidflow_genseq.seq); + + tidrecvc->tidflow_nswap_gen = 0; + tidrecvc->tid_list.tsess_type = IPS_TID_SESSTYPE_MEMBER_LIST; + tidrecvc->tid_list.tsess_tidcount = 0; + tidrecvc->tid_list.tsess_tidlist_length = 0; + tidrecvc->tid_list.tsess_unaligned_start = 0; + tidrecvc->tid_list.tsess_unaligned_end = 0; + + tidrecvc->tid_list.tsess_descid._desc_idx = + psmi_mpool_get_obj_index(tidrecvc); + tidrecvc->tid_list.tsess_descid._desc_genc = + psmi_mpool_get_obj_gen_count(tidrecvc); + + tidrecvc->tid_list.tsess_seqno = getreq->tidgr_desc_seqno; + tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; + tidrecvc->tid_list.tsess_length = nbytes_this; + + psmi_timer_entry_init(&tidrecvc->timer_tidreq, + ips_tid_grant_timer_callback, tidrecvc); + + if (nbytes_this > 0) { + if ((err = ips_tid_recv_alloc_frag(protoexp, tidrecvc->buffer, + nbytes_this, &tidrecvc->tid_list, tidrecvc->ts_map))) { + tidrecvc->tid_list.tsess_tidcount = 0; + ips_tf_deallocate(&protoexp->tfctrl, tidrecvc->tidflow_idx); + psmi_mpool_put(tidrecvc); + /* XXX log me !!! 
*/
+      return NULL;
+    }
+    if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG)
+    {
+      int num_tids = tidrecvc->tid_list.tsess_tidcount;
+      int tid, i;
+      for (i = 0; i < num_tids; i++) {
+	tid = tidrecvc->tid_list.tsess_list[i].tid;
+	psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_FREE);
+	protoexp->tid_info[tid].tid = tid;
+	protoexp->tid_info[tid].state = TIDSTATE_USED;
+	protoexp->tid_info[tid].tidrecvc = tidrecvc;
+      }
+    }
+  }
+
+  /* This gets sent out as a control message, so we need to force 4-byte IB
+   * alignment */
+  tidrecvc->tid_list.tsess_tidlist_length = (uint16_t)
+    PSMI_ALIGNUP((sizeof(ips_tid_session_list) +
+		  (tidrecvc->tid_list.tsess_tidcount *
+		   sizeof(ips_tid_session_member))), 4);
+
+  _IPATH_EXP("alloc tidrecv=%d, ntid=%d, paylen=%d\n",
+	     tidrecvc->tid_list.tsess_descid._desc_idx,
+	     tidrecvc->tid_list.tsess_tidcount,
+	     tidrecvc->tid_list.tsess_tidlist_length);
+
+  return tidrecvc;
+}
+
+static
+psm_error_t __recvpath
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+  struct ips_protoexp *protoexp = (struct ips_protoexp *) timer->context;
+  struct ips_tid_get_pend *phead = &protoexp->pend_getreqsq;
+  struct ips_tid_get_request *getreq;
+  struct ips_tid_recv_desc *tidrecvc;
+  uint32_t nbytes_this, leftover;
+  uint64_t t_cyc;
+  uintptr_t bufptr;
+  psm_epaddr_t epaddr;
+  ptl_epaddr_t *ipsaddr;
+  psm_error_t err = PSM_OK;
+
+  while (!STAILQ_EMPTY(phead)) {
+    getreq = STAILQ_FIRST(phead);
+    epaddr = getreq->tidgr_epaddr;
+
+next_epaddr:
+    ipsaddr = epaddr->ptladdr;
+    protoexp = ipsaddr->proto->protoexp;
+    nbytes_this = min(getreq->tidgr_length - getreq->tidgr_offset,
+		      getreq->tidgr_rndv_winsz);
+    /*
+     * If the leftover would be less than half the window size, reduce
+     * nbytes_this by half; we want to avoid sending just a few bytes in a
+     * tid transaction.  For example, with a 128KB window a 160KB request
+     * goes out as 64KB + 96KB rather than 128KB + 32KB.
+     */
+    leftover = getreq->tidgr_length -
+	       (getreq->tidgr_offset + nbytes_this);
+    if (leftover && leftover < getreq->tidgr_rndv_winsz/2) {
+      nbytes_this /= 2;
+    }
+
+    bufptr = (uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset;
+
+    if ((ips_tid_num_required(&protoexp->tidc, (void *) bufptr, nbytes_this) >
+	 ips_tid_num_available(&protoexp->tidc)) ||
+	!ips_tf_available(&protoexp->tfctrl)) {
+      /* We're out of tids/tidflows, tid release will requeue the callback */
+      ;
+    }
+    else if ((tidrecvc = ips_tid_recv_alloc(protoexp, ipsaddr,
+					    getreq, nbytes_this)) != NULL) {
+
+      err = ips_proto_send_ctrl_message(&ipsaddr->
+					flows[protoexp->tid_ep_flow],
+					OPCODE_TIDS_GRANT,
+					&tidrecvc->ctrl_msg_queued,
+					&tidrecvc->tid_list);
+
+      if (err != PSM_EP_NO_RESOURCES) {
+	tidrecvc->grant_cnt++;
+	t_cyc = get_cycles() + protoexp->tid_to_cyc_min;
+      }
+      else
+	t_cyc = get_cycles() + protoexp->proto->timeout_send;
+
+      psmi_timer_request_always(protoexp->timerq,
+				&tidrecvc->timer_tidreq, t_cyc);
+
+      getreq->tidgr_offset += nbytes_this;
+      _IPATH_VDBG("GRANT tididx=%d.%d srcoff=%d nbytes=%d/%d\n",
+		  tidrecvc->tid_list.tsess_descid._desc_idx,
+		  getreq->tidgr_desc_seqno,
+		  getreq->tidgr_offset, nbytes_this, getreq->tidgr_length);
+
+      getreq->tidgr_desc_seqno++;
+      if (getreq->tidgr_offset == getreq->tidgr_length) {
+	getreq->tidgr_protoexp = NULL;
+	getreq->tidgr_epaddr = NULL;
+	STAILQ_REMOVE_HEAD(phead, tidgr_next);
+	continue;
+      }
+      epaddr = epaddr->mctxt_next;
+      goto next_epaddr;
+    }
+    else {
+      /* out of tidrecv descriptors.
The not-empty tidrecv mpool callback will + * cause us to requeue the getreq on the active timer queue */ + ; + } + + epaddr = epaddr->mctxt_next; + if (epaddr != getreq->tidgr_epaddr) goto next_epaddr; + break; + } + return PSM_OK; /* XXX err-broken */ +} + +static +psm_error_t __fastpath +ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_tid_get_request *getreq = tidrecvc->getreq; + struct ips_protoexp *protoexp = tidrecvc->protoexp; + int tidcount = tidrecvc->tid_list.tsess_tidcount; + psm_error_t err = PSM_OK; + + psmi_assert(getreq != NULL); + + /* If checksum is enabled, make sure we have valid data for window */ + if (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM) { + uint32_t cksum = ips_crc_calculate(tidrecvc->recv_msglen, + (uint8_t*) tidrecvc->buffer, + 0xffffffff); + if (tidrecvc->cksum != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Rendezvous stats: nSeqErr: %d, nGenErr: %d, nReXmits: %d, nErrChkGen: %d. Aborting! \n", tidrecvc->cksum, cksum, __be16_to_cpu(tidrecvc->ipsaddr->tidgr_flow.path->epr_dlid), tidrecvc->stats.nSeqErr, tidrecvc->stats.nGenErr, tidrecvc->stats.nReXmit, tidrecvc->stats.nErrChkReceived); + ips_proto_dump_data(tidrecvc->buffer, tidrecvc->recv_msglen); + + /* TODO: In order to recover from this we need to restart the rendezvous + * window again. This requires modifying the sender to not complete the + * send locally till TID_RELEASE_CONFIRM is released - currently it + * locally completes before sending the TID_RELEASE message. + */ + } + } + + psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_DONE); + + if (tidcount > 0) { + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) + { + int num_tids = tidrecvc->tid_list.tsess_tidcount; + int tid, i; + for (i = 0; i < num_tids; i++) { + tid = tidrecvc->tid_list.tsess_list[i].tid; + psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_USED); + psmi_assert(protoexp->tid_info[tid].tidrecvc == tidrecvc); + protoexp->tid_info[tid].state = TIDSTATE_FREE; + } + } + + ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ", + tidrecvc->tid_list.tsess_tidcount); + + if ((err = ips_tid_release(&tidrecvc->protoexp->tidc, + tidrecvc->ts_map, tidcount))) + goto fail; + + } + + getreq->tidgr_bytesdone += tidrecvc->recv_msglen; + + _IPATH_EXP("req=%p bytes=%d/%d\n", + getreq->tidgr_ucontext, + getreq->tidgr_bytesdone, + getreq->tidgr_length); + + tidrecvc->state = TIDRECVC_STATE_FREE; + psmi_mpool_put(tidrecvc); + + if (getreq->tidgr_bytesdone == getreq->tidgr_length) { + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_ucontext); + psmi_mpool_put(getreq); + } + + /* We just released some tids. 
If requests are waiting on tids to be + * freed, queue up the timer */ + if (tidcount > 0) { + if (getreq->tidgr_offset < getreq->tidgr_length) { +#if 0 + psmi_timer_request(getreq->tidgr_protoexp->timerq, + &getreq->tidgr_protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); +#endif + ips_tid_pendtids_timer_callback( + &getreq->tidgr_protoexp->timer_getreqs, 0); + } + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + } + } + +fail: + return err; +} + +int +__fastpath +ips_protoexp_tid_release(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_tid_recv_desc *tidrecvc; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t args[3]; + int rc = IPS_RECVHDRQ_CONTINUE; + + args[0] = p_hdr->data[0]; + args[1] = p_hdr->data[1]; + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) + _IPATH_ERROR("OPCODE_TIDS_RELEASE: ERROR: Index %d is out of range\n", + desc_id._desc_idx); + else { + ptl_arg_t desc_tidrecvc; + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + _IPATH_VDBG("desc_req:id=%d,gen=%d desc_tidc:id=%d,gen=%d\n", + desc_id._desc_idx, desc_id._desc_genc, + desc_tidrecvc._desc_idx, desc_tidrecvc._desc_genc); + + /* See if the reference is still live and valid */ + if (desc_tidrecvc.u64 == desc_id.u64) + ips_tid_recv_free(tidrecvc); + } + + /* Unconditionally echo back the confirmation. If the release is a dupe + * because a previous confirmation was lost, it still needs to be released + * at the other end. 
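+ * This is safe because the release/confirm exchange is idempotent: the
+ * generation count compared above lets a stale release be detected and
+ * ignored, while the confirmation itself carries no state.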
*/ + ips_proto_send_ctrl_message(&rcv_ev->ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE_CONFIRM, + &rcv_ev->ipsaddr->ctrl_msg_queued, + args); + return rc; +} + +int __fastpath +ips_protoexp_build_ctrl_message(struct ips_protoexp *protoexp, + struct ptl_epaddr *ipsaddr, + ptl_arg_t *pargs, + uint16_t *pkt_flags, uint8_t opcode, + void *payload) +{ + switch (opcode) { + case OPCODE_TIDS_GRANT: + { + ips_tid_session_list *tid_list = (ips_tid_session_list *) payload; + uint32_t desc_idx = tid_list->tsess_descid._desc_idx; + struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_idx); + if (tidrecvc == NULL) return -1; + + pargs[0].u32w0 = tidrecvc->getreq->tidgr_sendtoken; + pargs[0].u32w1 = tidrecvc->getreq->tidgr_length; + pargs[1].u32w0 = tidrecvc->tidflow_genseq.val; + + if (tidrecvc->grant_cnt >= protoexp->tid_to_intr && + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD && + !(tidrecvc->getreq->tidgr_flags & IPS_PROTOEXP_TIDGET_PEERWAIT)) + { + + *pkt_flags |= INFINIPATH_KPF_INTR; + protoexp->tid_intr_reqs++; + } + return tid_list->tsess_tidlist_length; + break; + } + + case OPCODE_TIDS_RELEASE: + case OPCODE_TIDS_RELEASE_CONFIRM: + case OPCODE_TIDS_GRANT_ACK: + { + ptl_arg_t *args = (ptl_arg_t *) payload; + pargs[0].u64w0 = args[0].u64w0; + pargs[1].u64w0 = args[1].u64w0; + if (opcode == OPCODE_TIDS_RELEASE) { + uint32_t release_cnt = args[2].u32w0; + if (release_cnt >= protoexp->tid_to_intr && + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + { + *pkt_flags |= INFINIPATH_KPF_INTR; + protoexp->tid_intr_reqs++; + } + } + return 0; + } + default: + return 0; + } +} + +void +__fastpath +ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_tid_recv_desc *tidrecvc; + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + int tid = IPS_HDR_TID(p_hdr); + + /* Expected sends not enabled */ + if (protoexp == NULL) + return; + + /* Not doing extra tid debugging or not really a tiderr */ + if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) || + !(rcv_ev->error_flags & INFINIPATH_RHF_H_TIDERR)) + return; + + if (tid >= IPS_TID_MAX_TIDS || rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) { + _IPATH_ERROR("Unexpected tid value %d or ptype %d is not expected " + "in tid debugging\n", tid, rcv_ev->ptype); + return; + } + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc != NULL) + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + if (protoexp->tid_info[tid].state != TIDSTATE_USED) { + char buf[128]; + char *s = "invalid (not even in table)"; + if (tidrecvc != NULL) { + if (desc_tidrecvc._desc_idx == desc_id._desc_idx) { + if (desc_tidrecvc._desc_genc == desc_id._desc_genc) + s = "valid"; + else { + snprintf(buf, sizeof buf - 1, "valid session, but wrong " + "generation (gen=%d,received=%d)", + desc_tidrecvc._desc_genc, desc_id._desc_genc); + buf[sizeof buf - 1] = '\0'; + s = buf; + } + } + else { + snprintf(buf, sizeof buf - 1, "invalid session %d", + desc_id._desc_idx); + buf[sizeof buf - 1] = '\0'; + s = buf; + } + + if (protoexp->tid_info[tid].tidrecvc != tidrecvc) { + _IPATH_ERROR("tid %d not a known member of tidsess %d\n", tid, + desc_id._desc_idx); + } + } + + _IPATH_ERROR("tid %d is marked unused (session=%d): %s\n", 
tid,
+		 desc_id._desc_idx, s);
+  }
+  return;
+}
+
+void
+__fastpath
+ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev)
+{
+  struct ips_tid_recv_desc *tidrecvc;
+  struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+  struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+  int hdr_err = rcv_ev->error_flags & INFINIPATH_RHF_H_IHDRERR;
+  uint8_t op_code = __be32_to_cpu(p_hdr->bth[0]) >> 24 & 0xFF;
+  char pktmsg[128];
+  char errmsg[256];
+
+  ips_proto_get_rhf_errstring(rcv_ev->error_flags, pktmsg, sizeof(pktmsg));
+
+  snprintf(errmsg, sizeof(errmsg),
+	   "%s pkt type opcode 0x%x at hd=0x%x %s\n",
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "Eager" :
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "Expected" :
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "Non-kd" :
+	   "",
+	   op_code, rcv_ev->recvq->state->hdrq_head, pktmsg);
+
+  if (!hdr_err) {
+    uint32_t tid_recv_sessid;
+    ptl_arg_t desc_id = p_hdr->data[0];
+    psmi_seqnum_t sequence_num;
+    uint32_t cur_flowgenseq, tfgen, tfseq;
+    uint16_t kdeth_cksum;
+
+    /* See if the KDETH checksum validates */
+    kdeth_cksum =
+      (uint16_t) IPATH_LRH_BTH +
+      (uint16_t) (__be16_to_cpu(p_hdr->lrh[2])) -
+      (uint16_t) ((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset)>>16) &
+		  LOWER_16_BITS) -
+      (uint16_t) (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) &
+		  LOWER_16_BITS) -
+      (uint16_t) __le16_to_cpu(p_hdr->iph.pkt_flags);
+
+    if (kdeth_cksum != __le16_to_cpu(p_hdr->iph.chksum)) {
+      _IPATH_EPDBG("Data Error Pkt With Invalid KDETH Checksum: Computed: 0x%04x, IPH_CKSUM: 0x%04x %s", kdeth_cksum, __le16_to_cpu(p_hdr->iph.chksum), errmsg);
+      return;
+    }
+
+    tid_recv_sessid = desc_id._desc_idx;
+    tidrecvc =
+      psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+				   tid_recv_sessid);
+
+    if_pf (tidrecvc == NULL) {
+      _IPATH_EPDBG("Data Error Pkt and Invalid Recv Handle: %s", errmsg);
+      return;
+    }
+
+    if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) {
+      /* Print this at a very verbose level. Noisy links can have a few of
+       * these! */
+      _IPATH_VDBG("Data Error Pkt and Recv Generation Mismatch: %s", errmsg);
+      return; /* skip */
+    }
+
+    if (tidrecvc->state == TIDRECVC_STATE_DONE) {
+      _IPATH_EPDBG("Data Error Pkt for a Completed Rendezvous: %s", errmsg);
+      return; /* skip */
+    }
+
+    /* See if the CRC error was for a previous packet */
+    cur_flowgenseq = ipath_tidflow_get(tidrecvc->context->ctrl,
+				       tidrecvc->tidflow_idx);
+    tfgen = ipath_tidflow_get_genval(cur_flowgenseq);
+    tfseq = ipath_tidflow_get_seqnum(cur_flowgenseq);
+
+    sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+
+    if ((sequence_num.gen == tfgen) && (sequence_num.seq < tfseq)) {
+      /* Try to recover the flow by restarting from the previous known good
+       * sequence (possible if the packet with the CRC error is after the
+       * "known good PSN"; otherwise we can't restart the flow).
+       */
+      if (tidrecvc->tidflow_genseq.seq < sequence_num.seq)
+	return ips_protoexp_handle_tf_seqerr(rcv_ev);
+      else
+	_IPATH_EPDBG("ErrPkt: CRC Error for packet %d.%d. Currently at %d.%d. %s.\n", sequence_num.gen, sequence_num.seq, tfgen, tfseq, errmsg);
+    }
+    else {
+      /* Print this at a very verbose level */
+      _IPATH_VDBG("Data Error Packet. GenMismatch: %s. Tidrecvc: %p. Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n", (sequence_num.gen != tfgen) ?
+		  "Yes" : "No", tidrecvc, sequence_num.gen, sequence_num.seq, tfgen, tfseq, errmsg);
+    }
+
+  }
+  else {
+    _IPATH_VDBG("HDR_ERROR: %s\n", errmsg);
+  }
+
+}
+
+psm_error_t
+__fastpath
+ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc)
+{
+  psmi_assert_always(tidrecvc->state != TIDRECVC_STATE_DONE);
+  ips_tfgen_allocate(&tidrecvc->protoexp->tfctrl,
+		     tidrecvc->tidflow_idx,
+		     &tidrecvc->tidflow_active_gen);
+
+  /* Update the tidflow table with the new generation number */
+  tidrecvc->tidflow_genseq.gen = tidrecvc->tidflow_active_gen;
+  ipath_tidflow_set_entry(tidrecvc->context->ctrl,
+			  tidrecvc->tidflow_genseq.flow,
+			  tidrecvc->tidflow_genseq.gen,
+			  tidrecvc->tidflow_genseq.seq);
+
+  /* Increment the swapped generation count for the tidflow */
+  tidrecvc->tidflow_nswap_gen++;
+  return PSM_OK;
+}
+
+void
+__fastpath
+ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev)
+{
+  struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+  struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+  struct ips_tid_recv_desc *tidrecvc;
+  ptl_arg_t desc_id = rcv_ev->p_hdr->hdr_data[0];
+  ptl_arg_t send_descid = rcv_ev->p_hdr->hdr_data[1];
+  ptl_arg_t desc_tidrecvc;
+  psmi_seqnum_t sequence_num;
+  ptl_arg_t args[3] = {};
+  psm_error_t err;
+
+  psmi_assert_always(protoexp != NULL);
+
+  desc_tidrecvc.u64 = 0;
+  tidrecvc = (struct ips_tid_recv_desc *)
+    psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+				 desc_id._desc_idx);
+
+  if (tidrecvc != NULL)
+    psmi_mpool_get_obj_index_gen_count(tidrecvc,
+				       &desc_tidrecvc._desc_idx,
+				       &desc_tidrecvc._desc_genc);
+
+  if (tidrecvc && desc_tidrecvc.u64 == desc_id.u64) {
+
+    /* Update stats for sequence errors */
+    tidrecvc->stats.nSeqErr++;
+
+    if (tidrecvc->state != TIDRECVC_STATE_DONE) {
+
+      sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+
+      /* Only care about a sequence error for the currently active
+       * generation */
+      if (tidrecvc->tidflow_active_gen == sequence_num.gen) {
+
+	/* For a sequence error we restart from where the last header
+	 * was successfully delivered for us, since this is the last
+	 * known good state for this flow. The PSM version of the flow
+	 * sequence is the "safe" sequence number to restart at.
+	 */
+
+	/* If we see a "large" number of swapped generations, we are losing
+	 * packets for this flow. Request throttling of the tidflow by
+	 * generating a BECN. With header suppression we will miss some FECN
+	 * packets on QLE73XX, hence keeping track of swapped generations is
+	 * another mechanism to do congestion control for tidflows.
+	 *
+	 * For mismatched sender/receiver/link speeds we can get into a
+	 * deadly embrace where minimal progress is made due to generation
+	 * mismatch errors. This can occur if we wrap around the generation
+	 * count without making progress. Hence in cases where the swapped
+	 * generation count is > 254, stop sending the BECN (and the NAK) so
+	 * the sender -> receiver pipeline is flushed with an error check and
+	 * things can sync up. This should be an extremely rare event.
+	 */
+
+	if_pf (tidrecvc->tidflow_nswap_gen >= 254)
+	  goto fail; /* Do not send a NAK. Let the error check kick in. */
+
+	if_pf ((tidrecvc->tidflow_nswap_gen > 4) &&
+	       (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+	  _IPATH_CCADBG("Generating BECN.
Number of swapped generations: %d.\n", tidrecvc->tidflow_nswap_gen); + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + + /* Update stats for congestion encountered */ + if (rcv_ev->ipsaddr) + rcv_ev->ipsaddr->stats.congestion_pkts++; + } + + /* Swap generation for the flow. */ + err = ips_protoexp_flow_newgen(tidrecvc); + if (err != PSM_OK) + goto fail; + + /* NAK the tid flow. Note: We can generate the latest NAK for this flow + * based on the tidrecvc->tidflow_{active|passive}_gen fields. */ + args[0] = send_descid; + args[1] = tidrecvc->tid_list.tsess_descid; + args[2].u16w0 = sequence_num.gen; /* Older Gen to NAK */ + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + OPCODE_NAK, + &tidrecvc->ctrl_msg_queued, args); + + /* Update stats for retransmit */ + tidrecvc->stats.nReXmit++; + } + } /* tidrecvc->state != DONE */ + } + + fail: + return; +} + +void +__fastpath +ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + int tid = IPS_HDR_TID(p_hdr); + struct ips_tid_recv_desc *tidrecvc; + psmi_assert(rcv_ev->p_hdr->data != NULL); + ptl_arg_t desc_id = rcv_ev->p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + + if (tid >= IPS_TID_MAX_TIDS || rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) { + _IPATH_ERROR("Unexpected tid value %d or ptype %d is not expected " + "in tid debugging\n", tid, rcv_ev->ptype); + return; + } + + /* For a generation error our NAK crossed on the wire or this is a stale + * packet. Error recovery should sync things up again. Just drop this + * packet. + */ + desc_tidrecvc.u64 = 0; + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc != NULL) { + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + if (desc_tidrecvc.u64 == desc_id.u64) { + tidrecvc->stats.nGenErr++; /* Update stats for generation errors */ + + /* TODO_CCA: If packet faced congestion we may want to generate a CN + * packet to rate control sender. + */ + } + + } + +} diff --git a/ptl_ips/ips_proto_header.h b/ptl_ips/ips_proto_header.h new file mode 100644 index 0000000..3e3ee90 --- /dev/null +++ b/ptl_ips/ips_proto_header.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_PROTO_HEADER_H
+#define _IPS_PROTO_HEADER_H
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    InfiniBand words contain the LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 7 by the ips protocol)
+ *    IPS header words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (variable sized, from 2 to 32)
+ *    Size depends on the target. The connect protocol always assumes 2
+ *    uwords, and post-connect communication will use a length determined
+ *    at connect time.
+ *
+ * The header message size is determined as IWORDS + HWORDS + UWORDS
+ */
+struct ips_message_header {
+  __be16 lrh[4];
+  __be32 bth[3];
+  /* fields below this point are in host byte order */
+  struct ipath_header iph;
+  __u8 sub_opcode;
+  __u8 flags;
+  __u16 commidx;
+  /* 24 bits. The upper 8 bits are available for other use */
+  union {
+    /* NOTE: always access src_context with the HEADER_SRCCONTEXT macros.
+     * The actual context value is split to preserve wire compatibility */
+    struct {
+      unsigned ack_seq_num:24;
+      unsigned src_context:4;
+      unsigned src_subcontext:2;
+      unsigned src_context_ext:2;
+    };
+    __u32 ack_seq_num_org;
+  };
+  __u8 flowid;
+  __u8 hdr_dlen; /* data length in header */
+
+  union {
+    struct {
+      __u16 mqhdr : 14;         /* PSM matched queues */
+      __u16 dst_subcontext : 2; /* Destination subcontext */
+    };
+    struct {                    /* for PSM Active Messages */
+      __u16 amhdr_hidx : 8;
+      __u16 amhdr_nargs : 3;
+      __u16 amhdr_flags : 3;    /* Reduced from 5 bits previously */
+    };
+    __u16 mqhdr_org;
+  };
+  /* Access to uwords */
+  union {
+    ptl_arg_t hdr_data[2];
+    ptl_arg_t data[0];
+    __u32 uwords[4];
+  };
+};
+
+#define IPS_HEADER_QUEUE_IWORDS 5 /* LRH+BTH (fixed) */
+
+/* These two define the same thing, but they exist in sizeof and as a constant
+ * for sanity checking */
+#define IPS_HEADER_QUEUE_IPS_PROTOCOL_WORDS 5
+#define IPS_HEADER_QUEUE_HWORDS 5
+
+/* Min is used by the connect protocol.
+ * Max bounds the size of the preallocated communication headers.
+ * Req is the current desired receive header queue size. The actual size is
+ * returned after userinit.
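+ *
+ * For example, the IPS_HEADER_MSGLEN macro below converts a receive header
+ * queue entry size (in 32-bit words, iwords excluded) back into bytes:
+ * IPS_HEADER_MSGLEN(12) = (5 + 12) << 2 = 68 bytes.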
*/ +#define IPS_HEADER_QUEUE_UWORDS_MIN 4 +#define IPS_HEADER_QUEUE_UWORDS_MAX 32 +#define IPS_HEADER_QUEUE_UWORDS_REQ 12 + +#define IPS_HEADER_QUEUE_PBC_WORDS 2 + +/* Figure out "real" size of ips_message_header given the size of the receive + * header queue entry */ +/* Actual message length includes iwords */ +#define IPS_HEADER_MSGLEN(rcvhdrq_size) \ + ((IPS_HEADER_QUEUE_IWORDS+(rcvhdrq_size))<<2) + +/* Old define */ +#define IPS_HEADER_QUEUE_WORDS \ + ((sizeof(struct ips_message_header) - \ + offsetof(struct ips_message_header, iph)) >> 2) + +/* sub OpCodes - ips */ +#define OPCODE_SEQ_DATA 0x01 +#define OPCODE_SEQ_CTRL 0x02 + +#define OPCODE_SEQ_MQ_DATA 0x03 +#define OPCODE_SEQ_MQ_CTRL 0x04 +#define OPCODE_SEQ_MQ_HDR 0x05 +#define OPCODE_SEQ_MQ_EXPTID 0x06 +#define OPCODE_SEQ_MQ_EXPTID_UNALIGNED 0x07 + +#define OPCODE_ACK 0x10 +#define OPCODE_NAK 0x11 + +#define OPCODE_ERR_CHK_OLD 0x20 +#define OPCODE_ERR_CHK_PLS 0x21 +#define OPCODE_ERR_CHK 0x22 /* error check with ip + pid */ +#define OPCODE_ERR_CHK_BAD 0x23 /* error check out of context */ +#define OPCODE_ERR_CHK_GEN 0x24 /* TF protocol error check */ + +/* Pre-2.0 startup */ +#define OPCODE_STARTUP 0x30 +#define OPCODE_STARTUP_ACK 0x31 +#define OPCODE_STARTUP_NAK 0x32 +#define OPCODE_STARTUP_EXT 0x34 +#define OPCODE_STARTUP_ACK_EXT 0x35 +#define OPCODE_STARTUP_NAK_EXT 0x36 +/* 2.0+ startup */ +#define OPCODE_CONNECT_REQUEST 0x60 +#define OPCODE_CONNECT_REPLY 0x61 +#define OPCODE_DISCONNECT_REQUEST 0x62 +#define OPCODE_DISCONNECT_REPLY 0x63 + +#define OPCODE_AM_REQUEST 0x70 +#define OPCODE_AM_REPLY 0x71 +#define OPCODE_AM_REQUEST_NOREPLY 0x72 + +#define OPCODE_TIDS_RELEASE 0x40 +#define OPCODE_TIDS_RELEASE_CONFIRM 0x41 +#define OPCODE_TIDS_GRANT 0x42 +#define OPCODE_TIDS_GRANT_ACK 0x43 + +#define OPCODE_CLOSE 0x50 +#define OPCODE_CLOSE_ACK 0x51 + +/* Explicit CCA related messages */ +#define OPCODE_FLOW_CCA_BECN 0x80 + +/* + * like OPCODE_CLOSE, but no complaint if other side has already closed. + * Used when doing abort(), MPI_Abort(), etc. + */ +#define OPCODE_ABORT 0x52 + +#endif /* _IPS_PROTO_HEADER_H */ diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h new file mode 100644 index 0000000..96aa509 --- /dev/null +++ b/ptl_ips/ips_proto_help.h @@ -0,0 +1,759 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_PROTO_HELP_H
+#define _IPS_PROTO_HELP_H
+
+#include "ips_recvhdrq.h"
+#include "ips_proto.h"
+#include "ipserror.h"
+#include "psm_mq_internal.h" // psmi_mq_handle_tiny_envelope
+#include "ptl_ips.h"
+#include "ips_epstate.h"
+
+/* Some tunable compile-time options */
+#define IPS_TINY_PROCESS_MQTINY 1 /* whether mq processing of tiny pkts is
+				     done separately from non-tiny packets */
+
+PSMI_ALWAYS_INLINE(
+uint8_t
+ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow))
+{
+  uint32_t diff = (flow->protocol == PSM_PROTOCOL_TIDFLOW) ?
+    (flow->xmit_seq_num.seq - flow->xmit_ack_num.seq) :
+    (flow->xmit_seq_num.pkt - flow->xmit_ack_num.pkt);
+
+  /*
+   * This is currently disabled pending more experimentation. The goal
+   * is to eventually use the FLAG_INTR to tighten the control loop
+   * between two endpoints.
+   */
+#if 0
+  /* At every 64, request an ack w/ interrupt */
+  if ((diff & 0x3f) == 0)
+    scb->flags |= IPS_SEND_FLAG_ACK_REQ |
+		  ((flow->ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) ?
+		   IPS_SEND_FLAG_INTR : 0);
+  /* At every 16, request an ack */
+  else
+#endif
+  if (((diff & flow->ack_interval) == 0) || (flow->credits == 1))
+    scb->flags |= IPS_SEND_FLAG_ACK_REQ;
+
+  /* The bottom 8 bits wind up in protocol header fields, other bits
+   * control other aspects of packet composition */
+  return (uint8_t) (scb->flags & IPS_SEND_FLAG_PROTO_OPTS);
+}
+
+PSMI_ALWAYS_INLINE(
+ptl_epaddr_flow_t ips_proto_flowid(struct ips_message_header *p_hdr))
+{
+  ptl_epaddr_flow_t flowidx = IPS_FLOWID2INDEX(p_hdr->flowid);
+  psmi_assert(flowidx < EP_FLOW_LAST);
+  return flowidx;
+}
+
+PSMI_ALWAYS_INLINE(
+void ips_kdeth_cksum(struct ips_message_header *p_hdr))
+{
+  /* Compute the KDETH checksum */
+  p_hdr->iph.chksum = __cpu_to_le16(
+    (uint16_t) IPATH_LRH_BTH +
+    (uint16_t) (__be16_to_cpu(p_hdr->lrh[2])) -
+    (uint16_t) ((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset)>>16) &
+		LOWER_16_BITS) -
+    (uint16_t) (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) &
+		LOWER_16_BITS) -
+    (uint16_t) __le16_to_cpu(p_hdr->iph.pkt_flags));
+}
+
+PSMI_ALWAYS_INLINE(
+int ips_do_cksum(struct ips_proto *proto,
+		 struct ips_message_header *p_hdr,
+		 void *payload,
+		 uint32_t paylen,
+		 uint32_t *cksum))
+{
+
+  if_pf ((proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+	 (((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) == IPATH_EAGER_TID_ID) &&
+	 (p_hdr->mqhdr != MQ_MSG_DATA_BLK) &&
+	 (p_hdr->mqhdr != MQ_MSG_DATA_REQ_BLK)) {
+
+    uint16_t paywords;
+
+    /* Update the payload words in the header */
+    paywords = (sizeof(struct ips_message_header) +
+		paylen + PSM_CRC_SIZE_IN_BYTES) >> BYTE2WORD_SHIFT;
+    p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC);
+
+    /* Need to regenerate the KDETH checksum after updating the payload
+     * length */
+    ips_kdeth_cksum(p_hdr);
+
+    *cksum = 0xffffffff;
+
+    /* Checksum the header */
+    *cksum = ips_crc_calculate(sizeof(struct ips_message_header),
+			       (uint8_t*) p_hdr, *cksum);
+
+    /* Checksum the payload (if any) */
+    if (paylen) {
+      psmi_assert_always(payload);
+      *cksum = ips_crc_calculate(paylen, (uint8_t*) payload,
+				 *cksum);
+    }
+  }
+
+  return 0;
+}
+
+/* Get the pbc static rate value for a flow for a given message length */
+PSMI_ALWAYS_INLINE(
+uint32_t ips_proto_pbc_static_rate(struct ips_flow *flow,
+				   uint32_t msgLen))
+{
+  uint32_t rate = 0;
+
+  /* The PBC rate depends on the HCA type, as QLE73XX/QLE72XX have different
+   * mechanisms for static rate control. QLE71XX does not even have static
+   * rate control capability.
+   */
+
+  switch(flow->epinfo->ep_hca_type) {
+  case PSMI_HCA_TYPE_QLE73XX:
+    {
+
+      /* Rate = IPD * Time to transmit the packet. The rate value is
+       * programmed into the PBC, which counts down at 500 MHz, the TXE to
+       * IBC interface speed (Section 7.8.1). Since the time to transmit
+       * depends on our local link speed, we need to convert it into the
+       * clock frequency of the TXE in 500 MHz units. To transfer a message
+       * of MsgLen bytes at various local link rates we obtain:
+       *
+       * Link Rate (LinkWidth * LinkSpeed)    Cycle Count
+       *   SDR     (10 Gbit/sec)              (MsgLen >> 1)
+       *   DDR     (20 Gbit/sec)              (MsgLen >> 2)
+       *   QDR     (40 Gbit/sec)              (MsgLen >> 3)
+       *
+       * e.g. a 4096-byte message at QDR takes 4096 >> 3 = 512 cycles,
+       * i.e. about 1.02 usec at 500 MHz.
+       */
+      static uint8_t qle73xx_rate_divisor[IBTA_RATE_120_GBPS + 1] = {
+	[IBTA_RATE_2_5_GBPS] = 0,
+	[IBTA_RATE_5_GBPS] = 0,
+	[IBTA_RATE_10_GBPS] = 1,
+	[IBTA_RATE_20_GBPS] = 2,
+	[IBTA_RATE_30_GBPS] = 2,
+	[IBTA_RATE_40_GBPS] = 3
+      };
+
+      uint32_t time_to_send = (msgLen >>
+	  qle73xx_rate_divisor[flow->epinfo->ep_link_rate]);
+      /* IBTA CCA additionally has a shift field for finer grained control
+       * of the IPD (bits [14:15] in the CCT entry; for static rate control
+       * this value is always 0).
+       */
+      rate = (time_to_send >> flow->path->epr_cca_divisor) *
+	     (flow->path->epr_active_ipd);
+
+      /* For QLE73XX the rate is clamped to 0x3FFF */
+      rate = min(rate, 0x3FFF);
+    }
+    break;
+  case PSMI_HCA_TYPE_QLE72XX:
+    /* TODO_CCA: Implement for QLE72XX to take into account the PREVIOUS
+     * message's IPD for this flow/path.
+     */
+    rate = 0;
+    break;
+  default:
+    rate = 0;
+  }
+
+  return rate;
+}
+
+/* This is only used for SDMA cases; pbc is really a pointer to
+ * struct ips_pbc_header * or the equivalent un-named structure
+ * in ips_scb */
+PSMI_ALWAYS_INLINE(
+void ips_proto_pbc_update(struct ips_proto *proto,
+			  struct ips_flow *flow, uint32_t isCtrlMsg,
+			  union ipath_pbc *pbc, uint32_t hdrlen,
+			  void *payload, uint32_t paylen))
+{
+  struct ips_spio *ctrl = proto->spioc;
+  struct ips_message_header *p_hdr = (struct ips_message_header*) &pbc[1];
+  int vl = (__be16_to_cpu(p_hdr->lrh[0]) >> LRH_VL_SHIFT) & 0xf;
+  uint32_t static_rate = 0;
+
+  if_pf (!isCtrlMsg && flow->path->epr_active_ipd)
+    static_rate = ips_proto_pbc_static_rate(flow, hdrlen + paylen);
+
+  pbc->qword = 0ULL;
+  pbc->length = __cpu_to_le16( ((hdrlen + paylen) >> 2) + 1);
+  if (ctrl->portnum > 1)
+    pbc->pbcflags |= __cpu_to_le32(vl << __PBC_VLSHIFT |
+				   __PBC_IBPORT |
+				   static_rate);
+  else
+    pbc->pbcflags |= __cpu_to_le32(vl << __PBC_VLSHIFT |
+				   static_rate);
+
+  return;
+}
+
+/*
+ * Helpers to extract header information
+ */
+/* With QLE73XX/QLE72XX, we put context 16 in src_context_ext */
+#define IPS_HEADER_SRCCONTEXT_GET(msg_hdr) \
+	(((msg_hdr)->src_context) | ((msg_hdr)->src_context_ext<<4))
+
+#define IPS_HEADER_SRCCONTEXT_SET(msg_hdr,context) do { \
+	    (msg_hdr)->src_context = (context) & 0xf; \
+	    (msg_hdr)->src_context_ext = (context>>4) & 0x3; \
+	} while (0)
+
+PSMI_ALWAYS_INLINE(
+uint32_t ips_proto_dest_context_from_header(struct ips_proto *proto,
+					    struct ips_message_header *p_hdr))
+{
+  uint16_t hca_type;
+  uint32_t dest_context;
+
+  hca_type = PSMI_EPID_GET_HCATYPE(proto->ep->epid);
+
+  dest_context =
+    (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) >> INFINIPATH_I_CONTEXT_SHIFT) & INFINIPATH_I_CONTEXT_MASK;
+  switch(hca_type) {
+  case PSMI_HCA_TYPE_QLE73XX:
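+    /* Contexts above 15 don't fit in the 4-bit iph context field, so the
+     * fifth context bit travels in the low bit of BTH[1] and is folded
+     * back in here (cf. the IPS_HEADER_SRCCONTEXT macros above). */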
dest_context |= ((__be32_to_cpu(p_hdr->bth[1]) & 1) << 4); + break; + case PSMI_HCA_TYPE_QLE72XX: + /* Context 16 is special cased on QLE72XX */ + dest_context |= ((__be32_to_cpu(p_hdr->bth[1]) & 1) << 4); + if (dest_context == 0x1f) + dest_context = 16; + break; + case PSMI_HCA_TYPE_QLE71XX: + default: + /* This is a no-op. */ + break; + } + + return dest_context; +} + +PSMI_ALWAYS_INLINE( +void ips_proto_hdr(ips_scb_t *scb, + struct ips_epinfo *epinfo, + struct ips_epinfo_remote *epr, + struct ips_flow *flow, + uint32_t paywords, + uint32_t extra_bytes, + uint16_t kpf_flags, + uint8_t flags)) +{ + struct ips_message_header *p_hdr = &scb->ips_lrh; + + /* + * This scb has been used by this connection last time, + * so some of the header fields are already set. + */ + if (scb->flow == flow && scb->epaddr == flow->ipsaddr) { + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn); + p_hdr->flags = flags; + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + + /* check if extra bytes is changed */ + if (scb->extra_bytes != extra_bytes) { + p_hdr->bth[0] = + __cpu_to_be32((IPATH_OPCODE_USER1 << BTH_OPCODE_SHIFT) + + (extra_bytes << BTH_EXTRA_BYTE_SHIFT) + + flow->path->epr_pkey); + scb->extra_bytes = extra_bytes; + } + + /* If header is exactly the same */ + if (scb->tid == IPATH_EAGER_TID_ID && + scb->pkt_flags == kpf_flags && + scb->payload_bytes == scb->payload_size) { + return; + } + + /* context, version, and TID are already known to be in range, no + * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */ + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (scb->tid << INFINIPATH_I_TID_SHIFT) + + (scb->offset >> 2)); // convert from byte to word offset + + p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC); + p_hdr->iph.pkt_flags = __cpu_to_le16(kpf_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + scb->pkt_flags = kpf_flags; + scb->payload_bytes = scb->payload_size; + + return; + } + + p_hdr->lrh[0] = + __cpu_to_be16(IPATH_LRH_BTH | + (flow->sl << 4) | /* SL for flow */ + /* VL for flow */ (flow->path->proto->sl2vl[flow->sl] << LRH_VL_SHIFT)); + p_hdr->lrh[1] = flow->path->epr_dlid; + p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC); + p_hdr->lrh[3] = flow->path->epr_slid; + + p_hdr->bth[0] = + __cpu_to_be32((IPATH_OPCODE_USER1 << BTH_OPCODE_SHIFT) + + (extra_bytes << BTH_EXTRA_BYTE_SHIFT) + + flow->path->epr_pkey); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp); + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn); + p_hdr->commidx = (uint16_t) epr->epr_commidx_to; + + /* context, version, and TID are already known to be in range, no + * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */ + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (scb->tid << INFINIPATH_I_TID_SHIFT) + + (scb->offset >> 2)); // convert from byte to word offset + p_hdr->iph.pkt_flags = __cpu_to_le16(kpf_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + p_hdr->flags = flags; + p_hdr->flowid = flow->flowid; + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + IPS_HEADER_SRCCONTEXT_SET(p_hdr, epinfo->ep_context); + p_hdr->src_subcontext = epinfo->ep_subcontext; + p_hdr->dst_subcontext = epr->epr_subcontext; + + scb->extra_bytes = extra_bytes; + scb->pkt_flags = kpf_flags; + scb->payload_bytes = scb->payload_size; + scb->flow = flow; + 
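+	/* Caching the (flow, epaddr) pairing arms the fast path at the top
+	 * of ips_proto_hdr: the next send on this same flow can skip
+	 * rebuilding the header fields that have not changed. */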
scb->epaddr = flow->ipsaddr; + + return; +} + +/* + * Assumes that the following fields are already set in scb: + * payload + * payload_size + * flags + */ +PSMI_INLINE( +void +ips_scb_prepare_flow_inner(ips_scb_t *scb, + struct ips_epinfo *epinfo, + struct ips_epinfo_remote *epr, + struct ips_flow *flow)) +{ + uint32_t extra_bytes; + uint32_t tot_paywords; + uint16_t pkt_flags = IPS_EPSTATE_COMMIDX_PACK(epr->epr_commidx_to); + + extra_bytes = scb->payload_size & 3; + if (extra_bytes) { + extra_bytes = 4 - extra_bytes; + scb->payload_size += extra_bytes; + } + tot_paywords = (sizeof(struct ips_message_header) + scb->payload_size) + >> BYTE2WORD_SHIFT; + pkt_flags |= (scb->flags & IPS_SEND_FLAG_INTR) ? INFINIPATH_KPF_INTR : 0; + pkt_flags |= (scb->flags & IPS_SEND_FLAG_HDR_SUPPRESS) ? + INFINIPATH_KPF_HDRSUPP : 0; + + ips_proto_hdr(scb, epinfo, epr, flow, + tot_paywords, extra_bytes, + pkt_flags, ips_flow_gen_ackflags(scb, flow)); + + scb->ack_timeout = flow->path->epr_timeout_ack; + scb->abs_timeout = TIMEOUT_INFINITE; + scb->flags |= IPS_SEND_FLAG_PENDING; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + flow->xmit_seq_num.seq += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.seq--; + } else { + flow->xmit_seq_num.pkt += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.pkt--; + } + + return; +} + +PSMI_ALWAYS_INLINE( +psm_epid_t +ips_epid_from_phdr(const uint16_t lmc_mask, + const struct ips_message_header *p_hdr)) +{ + uint16_t lid = __be16_to_cpu(p_hdr->lrh[3]) & lmc_mask; + uint16_t context = (uint16_t) IPS_HEADER_SRCCONTEXT_GET(p_hdr); + uint16_t subcontext = (uint16_t) p_hdr->src_subcontext; + + return PSMI_EPID_PACK(lid, context, subcontext); +} + +PSMI_ALWAYS_INLINE( +void +ips_epaddr_stats_send(struct ptl_epaddr *ptladdr, uint8_t msgtype)) +{ + switch (msgtype) { + case OPCODE_ACK: + break; + case OPCODE_TIDS_GRANT: + ptladdr->stats.tids_grant_send++; + break; + case OPCODE_ERR_CHK: + case OPCODE_ERR_CHK_GEN: + ptladdr->stats.err_chk_send++; + break; + case OPCODE_NAK: + ptladdr->stats.nak_send++; + break; + case OPCODE_CONNECT_REQUEST: + ptladdr->stats.connect_req++; + break; + case OPCODE_DISCONNECT_REQUEST: + ptladdr->stats.disconnect_req++; + break; + default: + break; + } + return; +} + +/* + * Exported there solely for inlining is_expected_or_nak and mq_tiny handling + */ +extern +psm_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, + uint8_t message_type, + uint32_t *msg_queue_mask, + void *payload); + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; /* ACK clears NAK */ + } + else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_ACK; + } + else { + /* Coalesced ACKs disabled. 
Send ACK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; /* NAK clears ACK */ + } + else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_NAK; + } + else { + /* Coalesced ACKs disabled. Send NAK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } +} + +/* return 1 if packet is next expected in flow + * return 0 if packet is not next expected in flow (and nak packet). + */ +PSMI_ALWAYS_INLINE( +int +ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + psmi_seqnum_t sequence_num; + + psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) || + (flowid == EP_FLOW_GO_BACK_N_DMA) || + (flowid == EP_FLOW_GO_BACK_N_AM_REQ) || + (flowid == EP_FLOW_GO_BACK_N_AM_RSP) + ); + + /* If packet faced congestion generate BECN in NAK. */ + if_pf ((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) && + ((flow->cca_ooo_pkts & 0xf) == 0)) { + /* Generate a BECN for every 16th OOO packet marked with a FECN. */ + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + flow->cca_ooo_pkts++; + ipsaddr->stats.congestion_pkts++; + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Clear FECN event */ + } + + sequence_num.val = __be32_to_cpu(p_hdr->bth[2]); + if_pf (flow->recv_seq_num.pkt != sequence_num.pkt) { + int16_t diff = (int16_t) (sequence_num.pkt - flow->last_seq_num.pkt); + + if (diff < 0) + return 0; + + flow->cca_ooo_pkts = diff; + if (flow->cca_ooo_pkts > flow->ack_interval) { + ipsaddr->stats.congestion_pkts++; + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + _IPATH_CCADBG("BECN Generation. Expected: %d, Got: %d.\n", flow->recv_seq_num.pkt, sequence_num.pkt); + } + flow->last_seq_num = sequence_num; + + if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Queue/Send NAK to peer */ + ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + flow->cca_ooo_pkts = 0; + } + else if (flow->flags & IPS_FLOW_FLAG_GEN_BECN) { + /* Send Control message to throttle flow. Will clear flow flag and + * reset cca_ooo_pkts. + */ + ips_proto_send_ctrl_message(flow, OPCODE_FLOW_CCA_BECN, + &flow->ipsaddr->ctrl_msg_queued, + NULL); + } + + return 0; + } + else { + flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND; + + flow->last_seq_num = sequence_num; + flow->recv_seq_num.pkt += 1; + flow->cca_ooo_pkts = 0; + return 1; + } +} + +/* + * Return value: + * 1: in order message; + * 0: out of order, no touch; + * -1: out of order, buffered in outoforder queue. 
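+ *
+ * The 16-bit per-peer message sequence number travels split across the
+ * wire (ips_mq_send_envelope splits mctxt_send_seqnum into the flow's
+ * xmit and recv .msg bytes); the first line of the function below
+ * reassembles it from the flow state and the high bits of ack_seq_num
+ * before comparing it against the master context's mctxt_recv_seqnum.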
+ */ +PSMI_ALWAYS_INLINE( +int +ips_proto_check_msg_order(psm_epaddr_t epaddr, + struct ips_flow *flow, struct ips_message_header *p_hdr)) +{ + uint16_t msg_seqnum = (uint16_t)(flow->last_seq_num.msg + + ((p_hdr->ack_seq_num>>8)&0xff00)); + + if (msg_seqnum != epaddr->mctxt_master->mctxt_recv_seqnum) { + flow->msg_ooo_toggle = !flow->msg_ooo_toggle; + + if (flow->msg_ooo_toggle) { + flow->recv_seq_num.pkt -= 1; + flow->msg_ooo_seqnum = msg_seqnum; + return 0; + } + + psmi_assert(msg_seqnum == flow->msg_ooo_seqnum); + return -1; + } + + flow->msg_ooo_toggle = 0; + epaddr->mctxt_master->mctxt_recv_seqnum++; + return 1; +} + +#if IPS_TINY_PROCESS_MQTINY +PSMI_ALWAYS_INLINE( +int +ips_proto_process_mq_tiny(const struct ips_recvhdrq_event *rcv_ev)) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + psm_epaddr_t epaddr = ipsaddr->epaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + + if (ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) { + ret = ips_proto_check_msg_order(epaddr, flow, p_hdr); + if (ret == 0) return IPS_RECVHDRQ_OOO; + if (ret == -1) { + psmi_mq_handle_envelope_outoforder(ipsaddr->proto->mq, + (uint16_t) p_hdr->mqhdr, + epaddr, flow->msg_ooo_seqnum, + p_hdr->data[0].u64, /* tag */ + epaddr->xmit_egrlong, /* place hold only */ + (uint32_t) p_hdr->hdr_dlen, + (void *) &p_hdr->data[1], + (uint32_t) p_hdr->hdr_dlen); + ret = IPS_RECVHDRQ_BREAK; + } else { + psmi_mq_handle_tiny_envelope( + ipsaddr->proto->mq, + epaddr, p_hdr->data[0].u64, /* tag */ + (void *) &p_hdr->data[1], + (uint32_t) p_hdr->hdr_dlen); + if (epaddr->mctxt_master->outoforder_c) { + psmi_mq_handle_outoforder_queue(epaddr->mctxt_master); + } + ret = IPS_RECVHDRQ_CONTINUE; + } + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + ips_proto_process_ack((struct ips_recvhdrq_event *) rcv_ev); + return ret; +} +#endif + +PSMI_INLINE( +int +ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev)) +{ +#if IPS_TINY_PROCESS_MQTINY + if (rcv_ev->p_hdr->sub_opcode == OPCODE_SEQ_MQ_HDR) { + psmi_assert(rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER); + return ips_proto_process_mq_tiny(rcv_ev); + } + else +#endif + return ips_proto_process_packet_inner((struct ips_recvhdrq_event *) rcv_ev); +} + +#if PSMI_PLOCK_DISABLED + #define ips_ptladdr_lock(ipsaddr) \ + if (((ipsaddr)->flags & SESS_FLAG_LOCK_SESS)) \ + pthread_mutex_lock(&(ipsaddr)->sesslock) + + #define ips_ptladdr_unlock(ipsaddr) \ + if (((ipsaddr)->flags & SESS_FLAG_LOCK_SESS)) \ + pthread_mutex_unlock(&(ipsaddr)->sesslock) +#else + #define ips_ptladdr_lock(ipsaddr) + #define ips_ptladdr_unlock(ipsaddr) +#endif + +/* + * Breaks header encapsulation but needed in mq sends so we can pay + * "near-equal" attention to putting sends on the wire and servicing the + * receive queue. 
+ */
+
+PSMI_ALWAYS_INLINE(
+psm_error_t
+ips_recv_progress_if_busy(ptl_t *ptl, psm_error_t err))
+{
+  if (err == PSM_EP_NO_RESOURCES) {
+    ptl->ctl->ep_poll(ptl, 0);
+    return PSM_OK;
+  }
+  else
+    return err;
+}
+
+/* Find the next lowest power of two for a 32-bit number, e.g. 100 -> 64 */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+  const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+  const unsigned int S[] = {1, 2, 4, 8, 16};
+  register unsigned int r = 1;
+  int i;
+
+  for (i = 4; i >= 0; i--)
+  {
+    if (v & b[i])
+    {
+      v >>= S[i];
+      r <<= S[i];
+    }
+  }
+
+  return r;
+}
+
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *ips_select_path(struct ips_proto *proto,
+				ips_path_type_t path_type,
+				ips_epaddr_t *ipsaddr))
+{
+  uint32_t path_idx;
+
+  if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+    /* If dispersive routes are configured then select the routes in round
+     * robin order. We may want to use congestion information to select the
+     * least loaded path.
+     */
+    path_idx = ipsaddr->epr.epr_next_path[path_type];
+    if (++ipsaddr->epr.epr_next_path[path_type] >=
+	ipsaddr->epr.epr_num_paths[path_type])
+      ipsaddr->epr.epr_next_path[path_type] = 0;
+  }
+  else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+    path_idx = /* Key on the destination context */
+      ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[path_type];
+  else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+    path_idx = /* Key off the src context */
+      ipsaddr->proto->ep->context.base_info.spi_context % ipsaddr->epr.epr_num_paths[path_type];
+  else /* Base LID routed - Default in Infinipath 2.5 (Oct 09). */
+    path_idx = 0;
+
+  return ipsaddr->epr.epr_path[path_type][path_idx];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/ptl_ips/ips_proto_internal.h b/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000..8954ff3
--- /dev/null
+++ b/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#ifndef _IPS_PROTO_INTERNAL_H +#define _IPS_PROTO_INTERNAL_H + +#include "ips_proto_header.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +/* + * Connect protocol. + * + * On receive, handled by upcalling into the connect interface. + * On send, handled by ips_proto by having connect compose the message. + */ +psm_error_t ips_proto_process_connect(struct ips_proto *proto, psm_epid_t epid, + uint8_t opcode, + struct ips_message_header *p_hdr, + void *payload, uint32_t paylen); +int ips_proto_build_connect_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ptladdr, uint8_t opcode, + void *payload); + +psm_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment); +psm_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current); +void +ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); + +psm_error_t ips_proto_recv_init(struct ips_proto *proto); +psm_error_t ips_proto_recv_fini(struct ips_proto *proto); + +#define IPS_PROTO_MQ_CTS_MSGSIZE 64 + +#endif /* _IPS_PROTO_INTERNAL_H */ diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c new file mode 100644 index 0000000..3297753 --- /dev/null +++ b/ptl_ips/ips_proto_mq.c @@ -0,0 +1,964 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define MQ_NUM_MTUS(size,mtu) (((size) + (mtu) - 1) / (mtu)) +#define MQ_EGRLONG_ENABLE_MULTIFLOW 0 + +PSMI_NEVER_INLINE( +ips_scb_t * __sendpath +ips_poll_scb(struct ips_proto *proto, + int npkts, int len, uint32_t flags, int istiny)) +{ + ips_scb_t *scb = NULL; + psmi_assert(npkts > 0); + psm_error_t err; + + proto->stats.scb_egr_unavail_cnt++; + + PSMI_BLOCKUNTIL(proto->ep,err, + ((scb = istiny ? + ips_scbctrl_alloc_tiny(&proto->scbc_egr) : + ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags)) != NULL)); + psmi_assert(scb != NULL); + return scb; +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_tiny(struct ips_proto *proto)) +{ + ips_scb_t* scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr); + // common case should branch right through + if_pt (scb != NULL) + return scb; + else + return ips_poll_scb(proto, 1, 0, 0, 1); +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags)) +{ + psmi_assert(npkts > 0); + ips_scb_t* scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags); + if_pt (scb != NULL) { + return scb; + } + else { + return ips_poll_scb(proto, npkts, len, flags, 0 /* not tiny scb */); + } +} + +static +int __recvpath +ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) +{ + psm_mq_req_t req = (psm_mq_req_t)reqp; + + req->send_msgoff += nbytes; + if (req->send_msgoff == req->send_msglen) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&req->mq->completed_q, req); + } + return IPS_RECVHDRQ_CONTINUE; +} + +static +int __recvpath +ips_proto_mq_rv_complete(void *reqp) +{ + psm_mq_req_t req = (psm_mq_req_t) reqp; + psmi_mq_handle_rts_complete(req); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +void __recvpath +ips_proto_mq_rv_complete_exp(void *reqp) +{ + ips_proto_mq_rv_complete(reqp); + return; +} + +extern psm_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); + +/* + * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope + * + * Recoverable errors: + * PSM_OK: If PIO, envelope is sent. + * If DMA, all queued up packets on flow were flushed. + * + * Recoverable errors converted to PSM_OK just before return: + * PSM_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets. + * PSM_EP_NO_RESOURCES: + * If PIO, no pio available or cable currently pulled. + * If DMA, can be that no scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma queue). + * + * Unrecoverable errors (PIO or DMA). + * PSM_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... 
+ */ +PSMI_ALWAYS_INLINE( +psm_error_t +ips_mq_send_envelope(struct ips_proto *proto, psm_epaddr_t mepaddr, + ips_epaddr_t *ipsaddr, struct ips_scb *scb, int do_flush)) +{ + psm_error_t err = PSM_OK; + struct ips_flow *flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + + if_pf (proto->flags & IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA) { + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; + + if_pt (ips_scb_length(scb)) /* For DMA envelope need local completion */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + } + + flow->xmit_seq_num.msg = mepaddr->mctxt_send_seqnum&0xff; + flow->recv_seq_num.msg = (mepaddr->mctxt_send_seqnum>>8)&0xff; + mepaddr->mctxt_send_seqnum++; + + flow->fn.xfer.enqueue(flow, scb); + + if ((flow->transfer == PSM_TRANSFER_PIO) || + (flow->transfer == PSM_TRANSFER_DMA && do_flush)) + err = flow->fn.xfer.flush(flow, NULL); + + if (do_flush) + err = ips_recv_progress_if_busy(ipsaddr->ptl, err); + + PSMI_BLOCKUNTIL(proto->ep,err, (scb->flags&IPS_SEND_FLAG_PENDING) == 0); + + /* As per the PSM error model (or lack thereof), PSM clients expect to see + * only PSM_OK as a recoverable error */ + if (err == PSM_EP_NO_RESOURCES || err == PSM_OK_NO_PROGRESS) + err = PSM_OK; + return err; +} + +/* + * We don't use message striping for middle message protocol, + * Tests on sandy-bridge two HCAs show lower bandwidth if + * message striping is used. + */ +void __sendpath +ips_mq_send_payload(psm_epaddr_t epaddr, psmi_egrid_t egrid, + void *ubuf, uint32_t len, uint32_t offset, + psm_mq_req_t req, uint32_t flags) +{ + psm_error_t err; + + ips_scb_t *scb; + uintptr_t buf = (uintptr_t) ubuf; + uint32_t nbytes_left = len; + uint32_t pktlen, frag_size; + ips_epaddr_t *ipsaddr; + struct ips_proto *proto; + int is_blocking = !!(req == NULL); + ptl_epaddr_flow_t flowid = + (flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA) ? + EP_FLOW_GO_BACK_N_DMA : EP_FLOW_GO_BACK_N_PIO; + struct ips_flow *flow; + + psmi_assert(len > 0); + ipsaddr = epaddr->ptladdr; + proto = ipsaddr->proto; + flow = &ipsaddr->flows[flowid]; + frag_size = flow->frag_size; + + if (!(flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA)) goto spio; + + psmi_assert(req != NULL); + pktlen = len; + /* The payload size is limited by the pbc.length field which is 16 bits in + * DWORD, including both message header and payload. This translates to + * less than 256K payload. So 128K is used. 
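+ * (Worked numbers, assuming the 16-bit field counts dwords: 0xFFFF dwords
+ * x 4 bytes = 262140 bytes for header plus payload, i.e. just under 256K;
+ * the 131072-byte cap below stays at roughly half that limit.)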
*/ + if (pktlen > 131072) pktlen = 131072; + + do { + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb != NULL); + +#if 0 + /* turn on to use single frag-size packet */ + pktlen = min(frag_size, nbytes_left); +#else + pktlen = min(pktlen, nbytes_left); +#endif + ips_scb_length(scb) = pktlen; + ips_scb_mqhdr(scb) = MQ_MSG_DATA_BLK; + ips_scb_mqparam(scb).u32w0 = egrid.egr_data; + ips_scb_mqparam(scb).u32w1 = offset; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_buffer(scb) = (void *) buf; + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + if (nbytes_left == 0) { + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + } else { + req->send_msgoff += pktlen; + } + + scb->nfrag = (pktlen + frag_size - 1) / frag_size; + scb->frag_size = frag_size; + + /* attach checksum if enabled, this matches what is done for tid-sdma */ + if (proto->flags & IPS_PROTO_FLAG_CKSUM && !nbytes_left) { + uint32_t cksum = 0xffffffff; + cksum = ips_crc_calculate(len, (uint8_t *)(buf-len), cksum); + scb->ips_lrh.data[0].u32w0 = cksum; + scb->ips_lrh.data[0].u32w1 = offset - len; + } + + flow->fn.xfer.enqueue(flow, scb); + + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + + if (nbytes_left == 0) { + err = flow->fn.xfer.flush(flow, NULL); + if (err == PSM_EP_NO_RESOURCES || err == PSM_OK_NO_PROGRESS) { + err = ips_recv_progress_if_busy + (ipsaddr->ptl, PSM_EP_NO_RESOURCES); + } + } + + } while (nbytes_left); + + return; + +spio: + do { +/* + * Each flow/proto uses its own scb. If a scb from one proto is + * used by another proto, there is a teardown problem, where + * a proto deallocates the scb still in use by another proto. + */ + pktlen = min(frag_size, nbytes_left); + scb = mq_alloc_pkts(proto, 1, pktlen, is_blocking ? 
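+                        /* blocking sends get an scb bounce buffer so the
+                         * caller's buffer can be reused on return, while
+                         * non-blocking sends transmit straight from the user
+                         * buffer and signal completion via callback later */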
IPS_SCB_FLAG_ADD_BUFFER : 0); + psmi_assert(scb != NULL); + + ips_scb_length(scb) = pktlen; + ips_scb_mqhdr(scb) = MQ_MSG_DATA; + ips_scb_mqparam(scb).u32w0 = egrid.egr_data; + ips_scb_mqparam(scb).u32w1 = offset; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + + _IPATH_VDBG("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n", + (void *) buf, pktlen, frag_size, nbytes_left); + if (!is_blocking) /* non-blocking, send from user's buffer */ + ips_scb_buffer(scb) = (void *) buf; + else /* blocking, copy to bounce buffer */ + psmi_mq_mtucpy(ips_scb_buffer(scb), (void *) buf, pktlen); + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + if (nbytes_left == 0) { /* last packet */ + if (!is_blocking) { + /* non-blocking mode, need completion */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + } else { + if (!is_blocking) { + req->send_msgoff += pktlen; + } + } + + flow->fn.xfer.enqueue(flow, scb); + + /* we need to flush the pending queue */ + err = flow->fn.xfer.flush(flow, NULL); + err = ips_recv_progress_if_busy(ipsaddr->ptl, err); + + } while (nbytes_left); + + return; +} + + +PSMI_ALWAYS_INLINE( +void +ips_shortcpy(void* vdest, const void* vsrc, uint32_t nchars) +) +{ +#ifdef __MIC__ + memcpy(vdest, vsrc, nchars); +#else + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + + if(nchars>>2) + ipath_dwordcpy((uint32_t*)dest, (uint32_t*)src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +#endif + return; +} + +static __sendpath +psm_error_t +ips_ptl_mq_rndv(psm_mq_req_t req, psm_epaddr_t mepaddr, ips_epaddr_t *ipsaddr, + const void *buf, uint32_t len) +{ + ips_scb_t *scb; + psm_error_t err = PSM_OK; + struct ips_proto *proto = ipsaddr->proto; + + req->buf = (void *) buf; + req->buf_len = len; + req->send_msglen = len; + req->send_msgoff = 0; + req->recv_msgoff = 0; + req->rts_peer = ipsaddr->epaddr; + + scb = mq_alloc_tiny(proto); + + /* If the expected tid protocol is active, use it or else resort to + * eager-based r-v. */ + if (proto->protoexp != NULL) + ips_scb_mqhdr(scb) = req->type & MQE_TYPE_WAITING ? + MQ_MSG_RTS_WAIT : MQ_MSG_RTS; + else + ips_scb_mqhdr(scb) = MQ_MSG_RTS_EGR; + + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + + ips_scb_uwords(scb)[0].u64 = req->tag; + ips_scb_uwords(scb)[1].u32w0 = psmi_mpool_get_obj_index(req); + ips_scb_uwords(scb)[1].u32w1 = len; + + memset(&req->tid_grant, 0, sizeof(req->tid_grant)); + if ((err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE))) + goto fail; + + /* Assume that we already put a few rndv requests in flight. 
This helps + * for bibw microbenchmarks and doesn't hurt the 'blocking' case since + * we're going to poll anyway */ + psmi_poll_internal(ipsaddr->epaddr->ep, 1); + +fail: + _IPATH_VDBG("[rndv][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p/%d]: %s\n", + psmi_epaddr_get_name(proto->ep->epid), + psmi_epaddr_get_name(ipsaddr->epaddr->epid), buf, len, req->tag, req, + psmi_mpool_get_obj_index(req), + psm_error_get_string(err)); + + return err; +} + +psm_error_t __sendpath +ips_proto_mq_isend(psm_mq_t mq, psm_epaddr_t mepaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len, void *context, + psm_mq_req_t *req_o) +{ + uint8_t *buf = (uint8_t *) ubuf; + uint32_t pktlen = 0; + ips_scb_t *scb; + psm_epaddr_t epaddr = mepaddr->mctxt_current; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + struct ips_proto *proto = ipsaddr->proto; + uint32_t pad_write_bytes; + psm_error_t err = PSM_OK; + psm_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf (req == NULL) + return PSM_NO_MEMORY; + + mepaddr->mctxt_current = epaddr->mctxt_next; + req->send_msglen = len; + req->tag = tag; + req->context = context; + + if (!flags && len <= MQ_IPATH_THRESH_TINY) { + scb = mq_alloc_tiny(proto); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_HDR; + ips_scb_hdr_dlen(scb) = len; + ips_scb_mqhdr(scb) = MQ_MSG_TINY; + ips_scb_mqtag(scb) = tag; + mq_copy_tiny((uint32_t *)&ips_scb_mqparam(scb), (uint32_t *)buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + /* We can mark this op complete since all the data is now copied + * into an SCB that remains live until it is remotely acked */ + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + _IPATH_VDBG("[itiny][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag, req); + *req_o = req; + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + return err; + } + else if (flags & PSM_MQ_FLAG_SENDSYNC) {/* skip eager accounting below */ + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + *req_o = req; + return err; + } + else if (len <= ipsaddr->epr.epr_piosize) { + uint32_t cksum_len = (proto->flags & IPS_PROTO_FLAG_CKSUM) ? 
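+        /* when send-side checksums are enabled the CRC bytes count toward
+         * the cache-line padding computed below, e.g. len 100 with no
+         * checksum gives pad = (64 - (100 & 63)) & 63 = 28 bytes */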
+ PSM_CRC_SIZE_IN_BYTES : 0; + + pad_write_bytes = ((PSM_CACHE_LINE_BYTES - + ((len + cksum_len) & (PSM_CACHE_LINE_BYTES-1))) & + (PSM_CACHE_LINE_BYTES-1)); + + if_pf ((pad_write_bytes + len) > ipsaddr->epr.epr_piosize) + pad_write_bytes = 0; + scb = mq_alloc_pkts(proto, 1, (len + pad_write_bytes), + IPS_SCB_FLAG_ADD_BUFFER); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_hdr_dlen(scb) = pad_write_bytes; + ips_scb_length(scb) = len + pad_write_bytes; + ips_scb_mqhdr(scb) = MQ_MSG_SHORT; + ips_scb_mqtag(scb) = tag; + ips_shortcpy (ips_scb_buffer(scb), buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + _IPATH_VDBG("[ishrt][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag, req); + } + else if (len <= mq->ipath_thresh_rv) { + uint32_t proto_flags = proto->flags & IPS_PROTO_FLAG_MQ_MASK; + psmi_egrid_t egrid; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + /* directly send from user's buffer */ + ips_scb_buffer(scb) = buf; + + if (len < proto->iovec_thresh_eager) { + if (len <= 2 * ipsaddr->epr.epr_piosize) { + // split into 2 packets and round second down to dword multiple + pktlen = len - (((len >> 1) + 3) & ~0x3); + } + else { + pktlen = min(len, ipsaddr->epr.epr_piosize); + } + proto_flags &= ~IPS_PROTO_FLAG_MQ_EAGER_SDMA; + + /* + * since following packets are sent on the same flow, + * we only wait for completion for the last packet + */ + req->send_msgoff = pktlen; + } + else { + psmi_assert(proto_flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA); + /* send the unaligned bytes only, this is required by sdma. */ + pktlen = (uint32_t)((uintptr_t)buf & 0x3); + if (pktlen) pktlen = 4 - pktlen; + + /* send from user buffer, need completion */ + req->send_msgoff = 0; + if (pktlen) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + } + psmi_assert(pktlen <= ipsaddr->epr.epr_piosize); + + ips_scb_length(scb) = pktlen; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr(scb) = MQ_MSG_LONG; + ips_scb_mqtag(scb) = tag; + ips_scb_mqparam(scb).u32w1 = len; + + /* We need a new eager long message number */ + egrid.egr_data = ips_scb_mqparam(scb).u32w0 = + mepaddr->xmit_egrlong.egr_data; + mepaddr->xmit_egrlong.egr_msgno++; + + /* Send the envelope but don't flush if writev is enabled */ + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_FALSE); + ips_mq_send_payload(epaddr, egrid, + buf+pktlen, len-pktlen, pktlen, req, + proto_flags); + + _IPATH_VDBG("[ilong][%s->%s][b=%p][l=%d][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, pktlen, len, tag, req); + } + else { /* skip eager accounting below */ + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + *req_o = req; + return err; + } + + *req_o = req; + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +__sendpath +psm_error_t +ips_proto_mq_send(psm_mq_t mq, psm_epaddr_t mepaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len) +{ + uint8_t *buf = (uint8_t *) ubuf; + uint32_t pktlen; + ips_scb_t *scb; + psm_epaddr_t epaddr = mepaddr->mctxt_current; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + uint32_t pad_write_bytes; + psm_error_t err = PSM_OK; + struct ips_proto *proto = ipsaddr->proto; + + mepaddr->mctxt_current = 
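+        /* rotate to the next context of this multi-context endpoint so
+         * successive sends are spread round-robin across contexts */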
epaddr->mctxt_next; + + if (flags == 0 && len <= MQ_IPATH_THRESH_TINY) { + scb = mq_alloc_tiny(proto); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_HDR; + ips_scb_hdr_dlen(scb) = len; + ips_scb_mqhdr(scb) = MQ_MSG_TINY; + ips_scb_mqtag(scb) = tag; + + mq_copy_tiny((uint32_t *)&ips_scb_mqparam(scb), (uint32_t *)buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + _IPATH_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag); + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + return err; + } + else if ((flags & PSM_MQ_FLAG_SENDSYNC)) { + goto do_rendezvous; + } + else if (len <= ipsaddr->epr.epr_piosize) { + uint32_t cksum_len = (proto->flags & IPS_PROTO_FLAG_CKSUM) ? + PSM_CRC_SIZE_IN_BYTES : 0; + + pad_write_bytes = ((PSM_CACHE_LINE_BYTES - + ((len + cksum_len) & (PSM_CACHE_LINE_BYTES-1))) & + (PSM_CACHE_LINE_BYTES-1)); + + if_pf ((pad_write_bytes + len) > ipsaddr->epr.epr_piosize) + pad_write_bytes = 0; + + scb = mq_alloc_pkts(proto, 1, (len + pad_write_bytes), + IPS_SCB_FLAG_ADD_BUFFER); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_hdr_dlen(scb) = pad_write_bytes; + ips_scb_length(scb) = len + pad_write_bytes; + ips_scb_mqhdr(scb) = MQ_MSG_SHORT; + ips_scb_mqtag(scb) = tag; + + ips_shortcpy (ips_scb_buffer(scb), buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + _IPATH_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag); + } + else if (len <= mq->ipath_thresh_rv) { + uint32_t proto_flags = proto->flags & IPS_PROTO_FLAG_MQ_MASK; + psmi_egrid_t egrid; + psm_mq_req_t req = NULL; + + if (len < proto->iovec_thresh_eager_blocking) { + if (len <= 2 * ipsaddr->epr.epr_piosize) { + // split into 2 packets and round second down to dword multiple + pktlen = len - (((len >> 1) + 3) & ~0x3); + } + else { + pktlen = min(len, ipsaddr->epr.epr_piosize); + } + proto_flags &= ~IPS_PROTO_FLAG_MQ_EAGER_SDMA; + + scb = mq_alloc_pkts(proto, 1, pktlen, IPS_SCB_FLAG_ADD_BUFFER); + /* In blocking mode, copy to scb bounce buffer */ + ips_shortcpy (ips_scb_buffer(scb), buf, pktlen); + } + else { + psmi_assert(proto_flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA); + /* send the unaligned bytes only, this is required by sdma. 
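+ * (e.g. a buffer whose address ends in ...03 has buf & 0x3 == 3, so
+ * pktlen = 4 - 3 = 1: that lone byte goes out first and the remaining
+ * payload starts dword-aligned for the sdma engine)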
*/ + pktlen = (uint32_t)((uintptr_t)buf & 0x3); + if (pktlen) pktlen = 4 - pktlen; + + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + req->type |= MQE_TYPE_WAITING; + req->send_msglen = len; + req->tag = tag; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + /* directly send from user's buffer */ + ips_scb_buffer(scb) = buf; + + /* send from user buffer, need completion */ + req->send_msgoff = 0; + if (pktlen) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + } + psmi_assert(pktlen <= ipsaddr->epr.epr_piosize); + + ips_scb_length(scb) = pktlen; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr(scb) = MQ_MSG_LONG; + ips_scb_mqtag(scb) = tag; + ips_scb_mqparam(scb).u32w1 = len; + + /* We need a new eager long message number */ + egrid.egr_data = ips_scb_mqparam(scb).u32w0 = + mepaddr->xmit_egrlong.egr_data; + mepaddr->xmit_egrlong.egr_msgno++; + + /* Send the envelope but don't flush if writev is enabled */ + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_FALSE); + ips_mq_send_payload(epaddr, egrid, + buf+pktlen, len-pktlen, pktlen, req, + proto_flags); + if (req) psmi_mq_wait_internal(&req); + + _IPATH_VDBG("[long][%s->%s][b=%p][l=%d][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, pktlen, len, tag); + } + else { + psm_mq_req_t req; +do_rendezvous: + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + req->type |= MQE_TYPE_WAITING; + req->tag = tag; + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + if (err != PSM_OK) + return err; + psmi_mq_wait_internal(&req); + return err; /* skip accounting, done separately at completion time */ + } + + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +static +psm_error_t __recvpath +ips_proto_mq_rts_match_callback(psm_mq_req_t req, int was_posted) +{ + psm_epaddr_t epaddr = req->rts_peer; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + struct ips_proto *proto = ipsaddr->proto; + + /* We have a match. + * + * If we're doing eager-based r-v, just send back the sreq and length and + * have the sender complete the send. + * + */ + if (proto->protoexp == NULL) { /* only eager-based r-v so far */ + struct ips_pend_sends *pends = &proto->pend_sends; + struct ips_pend_sreq *sreq = psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + if (sreq == NULL) return PSM_NO_MEMORY; + sreq->type = IPS_PENDSEND_EAGER_REQ; + sreq->req = req; + + STAILQ_INSERT_TAIL(&pends->pendq, sreq, next); + psmi_timer_request(proto->timerq, &pends->timer, PSMI_TIMER_PRIO_1); + } + else { + ips_protoexp_tid_get_from_token( + proto->protoexp, req->buf, req->recv_msglen, epaddr, + req->rts_reqidx_peer, + req->type & MQE_TYPE_WAITING_PEER ? IPS_PROTOEXP_TIDGET_PEERWAIT : 0, + ips_proto_mq_rv_complete_exp, req); + } + + _IPATH_VDBG("req=%p, dest=%p, len=%d, recv_msglen=%d, stok=%p, expected=%s\n", + req, req->buf, req->buf_len, req->recv_msglen, + req->ptl_req_ptr, was_posted ? 
"YES" : "NO"); + + return PSM_OK; +} + +psm_error_t __recvpath +ips_proto_mq_push_eager_req(struct ips_proto *proto, psm_mq_req_t req) +{ + ips_scb_t *scb; + ptl_arg_t *args; + ips_epaddr_t *ipsaddr; + struct ips_flow *flow; + + scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + + args = (ptl_arg_t *) ips_scb_uwords(scb); + + args[0].u32w0 = req->rts_reqidx_peer; + args[0].u32w1 = psmi_mpool_get_obj_index(req); + args[1].u32w0 = req->recv_msglen; + req->egrid.egr_data = args[0].u32w1; + + ipsaddr = req->rts_peer->ptladdr; + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_CTS_EGR; + + if (req->recv_msglen == 0) { + ips_proto_mq_rv_complete(req); + } + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + return PSM_OK; +} + +psm_error_t __recvpath +ips_proto_mq_push_eager_data(struct ips_proto *proto, psm_mq_req_t req) +{ + uintptr_t buf = (uintptr_t) req->buf; + ips_epaddr_t *ipsaddr = req->rts_peer->ptladdr; + uint32_t nbytes_this; + uint32_t nbytes_left = req->send_msglen - req->recv_msgoff; + uint16_t frag_size; + struct ips_flow *flow; + ips_scb_t *scb; + + psmi_assert(nbytes_left > 0); + + if (!(proto->flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA)) goto spio; + + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; + frag_size = flow->frag_size; + nbytes_this = 131072/8; + while (nbytes_left > 0) { + scb = ips_scbctrl_alloc(proto->scbc_rv, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + +#if 0 + /* turn on to use single frag-size packet */ + nbytes_this = min(frag_size, nbytes_left); +#else + nbytes_this = min(nbytes_this, nbytes_left); +#endif + + ips_scb_length(scb) = nbytes_this; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_DATA_REQ_BLK; + ips_scb_buffer(scb) = (void *)(buf + req->recv_msgoff); + ips_scb_mqparam(scb).u32w0 = req->rts_reqidx_peer; + ips_scb_mqparam(scb).u32w1 = req->recv_msgoff; + + if (nbytes_left == nbytes_this) { + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } else { + req->send_msgoff += nbytes_this; + } + + scb->nfrag = (nbytes_this + frag_size - 1) / frag_size; + scb->frag_size = frag_size; + + /* attach checksum if enabled, this matches what is done for tid-sdma */ + if (proto->flags&IPS_PROTO_FLAG_CKSUM && nbytes_left==nbytes_this) { + uint32_t cksum = 0xffffffff; + cksum = ips_crc_calculate(req->send_msglen, req->buf, cksum); + scb->ips_lrh.data[0].u32w0 = cksum; + } + + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + SLIST_NEXT(scb, next) = NULL; + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + nbytes_left -= nbytes_this; + req->recv_msgoff += nbytes_this; + } + + return PSM_OK; + +spio: + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + frag_size = flow->frag_size; + while (nbytes_left > 0) { + scb = ips_scbctrl_alloc(proto->scbc_rv, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + + nbytes_this = min(nbytes_left, frag_size); + ips_scb_length(scb) = nbytes_this; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_DATA_REQ; + ips_scb_buffer(scb) = (void *)(buf + req->recv_msgoff); + ips_scb_mqparam(scb).u32w0 = req->rts_reqidx_peer; + ips_scb_mqparam(scb).u32w1 = req->recv_msgoff; + + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + if (nbytes_left == nbytes_this) { + ips_scb_flags(scb) |= 
IPS_SEND_FLAG_ACK_REQ; + } +#if 0 + _IPATH_INFO("send req %p, off %d/%d, len %d, last=%s\n", + req, req->send_msgoff, req->send_msglen, nbytes_this, + nbytes_left == nbytes_this ? "YES" : "NO"); +#endif + SLIST_NEXT(scb, next) = NULL; + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + nbytes_left -= nbytes_this; + req->recv_msgoff += nbytes_this; + } + + return PSM_OK; +} + +int __recvpath +ips_proto_mq_handle_cts(struct ips_proto *proto, ptl_arg_t *args) +{ + psm_mq_req_t req; + psm_mq_t mq = proto->ep->mq; + uint32_t reqidx, reqidx_peer; + struct ips_pend_sreq *sreq; + uint32_t msglen; + + reqidx = args[0].u32w0; + reqidx_peer = args[0].u32w1; + msglen = args[1].u32w0; + + req = psmi_mpool_find_obj_by_index(mq->sreq_pool, reqidx); + psmi_assert(req != NULL); + if (req == NULL) return IPS_RECVHDRQ_BREAK; + + if (msglen == 0) { + ips_proto_mq_rv_complete(req); + return IPS_RECVHDRQ_CONTINUE; + } + + sreq = psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + if (sreq == NULL) return IPS_RECVHDRQ_BREAK; + sreq->type = IPS_PENDSEND_EAGER_DATA; + sreq->req = req; + req->rts_reqidx_peer = reqidx_peer; + req->send_msglen = msglen; + req->send_msgoff = 0; + STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next); + /* Make sure it's processed by timer */ + psmi_timer_request(proto->timerq, &proto->pend_sends.timer, + PSMI_TIMER_PRIO_1); + + /* XXX Optimization here: If the 'req' is blocking in the MPI sense, we + * could choose to break out of the progress loop and make progress on it + * ASAP instead of continuing to process the receive queue */ + return IPS_RECVHDRQ_CONTINUE; +} + +int __recvpath +ips_proto_mq_handle_rts_envelope(psm_mq_t mq, int mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen) +{ + psm_mq_req_t req; + _IPATH_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long) tag, reqidx_peer, msglen); + int rc = psmi_mq_handle_rts(mq, tag, 0, msglen, epaddr, + ips_proto_mq_rts_match_callback, &req); + req->rts_reqidx_peer = reqidx_peer; + if (mode == MQ_MSG_RTS_WAIT) + req->type |= MQE_TYPE_WAITING_PEER; + + if (rc == MQ_RET_MATCH_OK) { + ips_proto_mq_rts_match_callback(req, 1); + /* XXX if blocking, break out of progress loop */ + } + + /* If no match, will be called when send actually matches */ + return IPS_RECVHDRQ_CONTINUE; +} + +int __recvpath +ips_proto_mq_handle_rts_envelope_outoforder(psm_mq_t mq, int mode, + psm_epaddr_t peer, uint16_t msg_seqnum, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen) +{ + psm_mq_req_t req; + _IPATH_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long) tag, reqidx_peer, msglen); + psmi_mq_handle_rts_outoforder(mq, tag, 0, msglen, + peer, msg_seqnum, + ips_proto_mq_rts_match_callback, &req); + req->rts_reqidx_peer = reqidx_peer; + if (mode == MQ_MSG_RTS_WAIT) + req->type |= MQE_TYPE_WAITING_PEER; + + /* If no match, will be called when send actually matches */ + return IPS_RECVHDRQ_CONTINUE; +} + diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h new file mode 100644 index 0000000..62a4e0a --- /dev/null +++ b/ptl_ips/ips_proto_params.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PROTO_PARAMS_H +#define _IPS_PROTO_PARAMS_H + +/* Packet header formats */ +#define CRC_SIZE_IN_BYTES 4 +#define PCB_SIZE_IN_BYTES 8 +#define LRH_VL_SHIFT 12 +#define BTH_OPCODE_SHIFT 24 +#define BTH_EXTRA_BYTE_SHIFT 20 +#define BTH_BECN_SHIFT 30 +#define BTH_FECN_SHIFT 31 +#define BYTE2WORD_SHIFT 2 +#define LOWER_24_BITS 0xFFFFFF +#define LOWER_16_BITS 0xFFFF +#define LOWER_8_BITS 0xFF +#define MAX_VL_SUPPORTED 8 +#define PSM_CRC_SIZE_IN_BYTES 8 /* Change in ipath_user.h as well */ +#define PSM_CACHE_LINE_BYTES 64 +#define PSM_FLOW_CREDITS 64 + +#ifndef BITS_PER_BYTE +# define BITS_PER_BYTE 8 +#endif + +/* Send retransmission */ +#define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ + +#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 8 /* in millisecs */ +#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 32 /* in millisecs */ +#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2 +#define PSM_TID_TIMEOUT_DEFAULT "8:32:2" /* update from above params */ + +#define IPS_HDR_TID(p_hdr) \ + ((__le32_to_cpu((p_hdr)->iph.ver_context_tid_offset) >> \ + INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) + +/* time conversion macros */ +#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us)) +#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms)) +#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec)) + +/* Per-flow flags */ +#define IPS_FLOW_FLAG_NAK_SEND 0x01 +#define IPS_FLOW_FLAG_WRITEV 0x02 +#define IPS_FLOW_FLAG_PENDING_ACK 0x04 +#define IPS_FLOW_FLAG_GEN_BECN 0x08 +#define IPS_FLOW_FLAG_CONGESTED 0x10 +#define IPS_FLOW_FLAG_PENDING_NAK 0x20 + +/* per-ipsaddr Flags (sess is ipsaddr) */ +#define SESS_FLAG_HAS_RCVTHREAD 0x2 +#define SESS_FLAG_LOCK_SESS 0x4 +#define SESS_FLAG_HAS_FLOWID 0x8 + +/* tid session expected send flags */ +#define EXP_SEND_FLAG_CLEAR_ALL 0x00 +#define EXP_SEND_FLAG_FREE_TIDS 0x01 + +#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */ + +/* ips_scb_t flags, powers of 2, and disjoint from SEND_FLAG_* values. + * Only the lower 8 bytes are wire-protocol options */ +#define IPS_SEND_FLAG_NONE 0x00 +// Unused -- future use maybe. 
+//#define IPS_SEND_FLAG_ACK_REQ_INTR 0x02 /* request ack with intr */ +#define IPS_SEND_FLAG_ACK_REQ 0x04 /* request ack (normal) */ +#define IPS_SEND_FLAG_UNALIGNED_DATA 0x08 /* unaligned data in hdr */ +#define IPS_SEND_FLAG_HAS_CKSUM 0x10 /* Has checksum */ +#define IPS_SEND_FLAG_EXPECTED_DONE 0x20 /* Last expected packet */ +#define IPS_SEND_FLAG_CCA_BECN 0x40 /* BECN bit for congestion */ +#define IPS_SEND_FLAG_PROTO_OPTS 0xff + +#define IPS_SEND_FLAG_PENDING 0x0100 +#define IPS_SEND_FLAG_PERSISTENT 0x0200 +#define IPS_SEND_FLAG_INTR 0x0400 +#define IPS_SEND_FLAG_WAIT_SDMA 0x0800 +#define IPS_SEND_FLAG_HDR_SUPPRESS 0x1000 + +#define IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA 0x01 +#define IPS_PROTO_FLAG_MQ_EAGER_SDMA 0x02 +#define IPS_PROTO_FLAG_MQ_EXPECTED_SDMA 0x04 +#define IPS_PROTO_FLAG_MQ_MASK 0x0f /* contains all MQ proto flags */ +#define IPS_PROTO_FLAG_CTRL_SDMA 0x10 + +/* Alias for use send dma for everything */ +#define IPS_PROTO_FLAGS_ALL_SDMA 0x17 + +#define IPS_PROTO_FLAG_CKSUM 0x20 +/* Coalesced ACKs (On by default) */ +#define IPS_PROTO_FLAG_COALESCE_ACKS 0x80 + +/* Use Path Record query (off by default) */ +#define IPS_PROTO_FLAG_QUERY_PATH_REC 0x100 + +/* Path selection policies: + * + * (a) Adaptive - Dynamically determine the least loaded paths using various + * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs. + * + * (b) Static schemes - + * (i) static_src - Use path keyed off source context + * (ii) static_dest - Use path keyed off destination context + * (iii) static_base - Use only the base lid path - default till Oct'09. + * + * The default is adaptive. If a zero lmc network is used then there exists + * just one path between endpoints the (b)(iii) case above. + * + */ + +#define IPS_PROTO_FLAG_PPOLICY_ADAPTIVE 0x200 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_SRC 0x400 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_DST 0x800 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_BASE 0x1000 + +/* All static policies */ +#define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00 + +/* IBTA CCA Protocol support */ +#define IPS_PROTO_FLAG_CCA 0x2000 + +/* By default, we use dma in eager (based on PSM_MQ_EAGER_SDMA_SZ) and + * always use it in expected. + */ +#define IPS_PROTO_FLAGS_DEFAULT (IPS_PROTO_FLAG_MQ_EAGER_SDMA | \ + IPS_PROTO_FLAG_MQ_EXPECTED_SDMA | \ + IPS_PROTO_FLAG_COALESCE_ACKS) + +#define IPS_PROTOEXP_FLAG_ENABLED 0x01 /* default */ +//#define IPS_PROTOEXP_FLAG_NAKOPT 0x02 /* *not* default, broken */ +#define IPS_PROTOEXP_FLAG_TID_DEBUG 0x04 /* *not* default */ +#define IPS_PROTOEXP_FLAG_HDR_SUPP 0x08 /* Header suppression enabled */ + +#define IPS_PROTOEXP_FLAGS_DEFAULT (IPS_PROTOEXP_FLAG_ENABLED | \ + IPS_PROTOEXP_FLAG_HDR_SUPP) + +/* We have to get an MTU of at least 2K, or else this breaks some assumptions + * in the packets that handle tid descriptors + */ +#define IPS_PROTOEXP_MIN_MTU 2048 + +/* Bound on the number of packets to feed to send dma at a time. This ensures + * we don't "disappear" in the kernel for too long. + */ +#define IPS_SDMA_MAX_SCB 32 + +/* Fault injection, becomes parameters to psmi_faultinj_getspec so + * a comma-delimited list of + * "spec_name", num, denom + * Where num/denom means fault num out of every denom. + * The defines set 'denum' and assume that num is set to 1 + * + * These values are all defaults, each is overridable via + * PSM_FI_ in the environment (and yes, spec_name is in lowercase + * *in the environment* just to minimize it appearing in the wild). 
The format + * there is so the same thing except that one can set + * a specific seed to the random number generator. + */ +#if 1 +#define IPS_FAULTINJ_DMALOST 20 /* 1 every 20 dma writev get lost */ +#define IPS_FAULTINJ_PIOLOST 100 /* 1 every 100 pio writes get lost */ +#define IPS_FAULTINJ_PIOBUSY 10 /* 1 every 10 pio sends get busy */ +#define IPS_FAULTINJ_RECVLOST 200 /* 1 every 200 pkts dropped at recv */ +#else +#define IPS_FAULTINJ_DMALOST 500 /* 1 every 500 dma writev get lost */ +#define IPS_FAULTINJ_PIOLOST 3000 /* 1 every 3000 pio writes get lost */ +#define IPS_FAULTINJ_PIOBUSY 100 /* 1 every 100 pio sends get busy */ +#define IPS_FAULTINJ_RECVLOST 500 /* 1 every 500 pkts dropped at recv */ +#endif + +#endif /* _IPS_PROTO_PARAMS_H */ diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c new file mode 100644 index 0000000..572c522 --- /dev/null +++ b/ptl_ips/ips_proto_recv.c @@ -0,0 +1,1547 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30 +static void ips_report_strays(struct ips_proto *proto); + +#define INC_TIME_SPEND(timer) + +#define _desc_idx u32w0 +#define _desc_genc u32w1 + +psm_error_t +ips_proto_recv_init(struct ips_proto *proto) +{ + uint32_t interval_secs; + union psmi_envvar_val env_stray; + + psmi_getenv("PSM_STRAY_WARNINTERVAL", + "min secs between stray process warnings", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS, + &env_stray); + interval_secs = env_stray.e_uint; + if (interval_secs > 0) + proto->stray_warn_interval = sec_2_cycles(interval_secs); + else + proto->stray_warn_interval = 0; + + return PSM_OK; +} + +psm_error_t +ips_proto_recv_fini(struct ips_proto *proto) +{ + ips_report_strays(proto); + return PSM_OK; +} + +#define cycles_to_sec_f(cycles) \ + (((double)cycles_to_nanosecs(cycles)) / 1000000000.0) + +struct ips_stray_epid { + psm_epid_t epid; + uint32_t err_check_bad_sent; + uint32_t ipv4_addr; + uint32_t pid; + uint32_t num_messages; + uint64_t t_warn_next; + uint64_t t_first; + uint64_t t_last; +}; + +static +void +ips_report_strays(struct ips_proto *proto) +{ + struct ips_stray_epid *sepid; + struct psmi_eptab_iterator itor; + psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK); + double t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init); + + while ((sepid = psmi_epid_itor_next(&itor))) { + char ipbuf[INET_ADDRSTRLEN], *ip = NULL; + char bufpid[32]; + uint32_t lid = psm_epid_nid(sepid->epid); + double t_first = cycles_to_sec_f(sepid->t_first - proto->t_init); + double t_last = cycles_to_sec_f(sepid->t_last - proto->t_init); + if (sepid->ipv4_addr) + ip = (char *) + inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, sizeof ipbuf); + if (!ip) + snprintf(ipbuf, sizeof ipbuf, "%d (%x)", lid, lid); + + if (sepid->pid) + snprintf(bufpid, sizeof bufpid, "PID=%d", sepid->pid); + else + snprintf(bufpid, sizeof bufpid, "PID unknown"); + + _IPATH_INFO("Process %s on host %s=%s sent %d stray message(s) and " + "was told so %d time(s) (first stray message at %.1fs " + "(%d%%), last at %.1fs (%d%%) into application run)\n", + bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages, + sepid->err_check_bad_sent, t_first, + (int) (t_first * 100.0 / t_runtime), t_last, + (int) (t_last * 100.0 / t_runtime)); + + psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid); + psmi_free(sepid); + } + psmi_epid_itor_fini(&itor); + return; +} + +/* New scbs now available. If we have pending sends because we were out of + * scbs, put the pendq on the timerq so it can be processed. 
*/ +void +ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_proto *proto = (struct ips_proto *) context; + struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq); + if (sreq != NULL) + psmi_timer_request(proto->timerq, + &proto->pend_sends.timer, PSMI_TIMER_PRIO_1); + return; +} + +psm_error_t __recvpath +ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current) +{ + psm_error_t err = PSM_OK; + struct ips_pend_sends *pend_sends = + (struct ips_pend_sends *) timer->context; + struct ips_pendsendq *phead = &pend_sends->pendq; + struct ips_proto *proto = (struct ips_proto *) pend_sends->proto; + struct ips_pend_sreq *sreq; + + while (!STAILQ_EMPTY(phead)) { + sreq = STAILQ_FIRST(phead); + switch (sreq->type) { + case IPS_PENDSEND_EAGER_REQ: + err = ips_proto_mq_push_eager_req(proto, sreq->req); + break; + case IPS_PENDSEND_EAGER_DATA: + err = ips_proto_mq_push_eager_data(proto, sreq->req); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown pendq state %d\n", sreq->type); + } + + if (err == PSM_OK) { + STAILQ_REMOVE_HEAD(phead, next); + psmi_mpool_put(sreq); + } + else { /* out of scbs. wait for the next scb_avail callback */ + //printf("!!!!! breaking out of pendq progress\n"); + break; + } + } + + return err; +} + +static +int __recvpath +_process_mq(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + uint32_t msglen = paylen; + uint16_t mode = p_hdr->mqhdr; + psm_mq_req_t req; + psmi_egrid_t egrid; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + psm_epaddr_t epaddr = ipsaddr->epaddr; + psm_mq_t mq = rcv_ev->proto->mq; + ptl_arg_t *args; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) + goto skip_ack_req; + + _IPATH_VDBG("Rcvd ctrl packet %s length = %i, mode=%d, arg0=%llx arg1=%llx\n", + psmi_epaddr_get_name(epaddr->epid), + paylen, p_hdr->mqhdr, + (long long) p_hdr->data[0].u64, (long long) p_hdr->data[1].u64); + + if (mode <= MQ_MSG_RTS_WAIT) { + ret = ips_proto_check_msg_order(epaddr, flow, p_hdr); + if (ret == 0) return IPS_RECVHDRQ_OOO; + + if (mode <= MQ_MSG_LONG) { + egrid.egr_data = 0; + if (mode == MQ_MSG_SHORT) { + /* May have padded writes, account for it */ + paylen -= p_hdr->hdr_dlen; + msglen = paylen; + } + else if (mode == MQ_MSG_TINY) { + payload = (void *) &p_hdr->data[1]; + msglen = paylen = p_hdr->hdr_dlen; + } + else if (mode == MQ_MSG_LONG) { + msglen = p_hdr->data[1].u32w1; + if (ipsaddr->flags & SESS_FLAG_HAS_FLOWID) { + egrid.egr_data = p_hdr->data[1].u32w0; + _IPATH_VDBG("egrid-msglong is 0x%x\n", egrid.egr_data); + } + } + + if (ret == 1) + psmi_mq_handle_envelope( + mq, mode, epaddr, p_hdr->data[0].u64, /* tag */ + egrid, msglen, (void *) payload, paylen); + else + psmi_mq_handle_envelope_outoforder( + mq, mode, epaddr, flow->msg_ooo_seqnum, + p_hdr->data[0].u64, /* tag */ + egrid, msglen, (void *) payload, paylen); + } else { + args = (ptl_arg_t *) p_hdr->data; + if (ret == 1) + ips_proto_mq_handle_rts_envelope(mq, mode, epaddr, + args[0].u64, args[1].u32w0, args[1].u32w1); + else + ips_proto_mq_handle_rts_envelope_outoforder(mq, mode, + epaddr, flow->msg_ooo_seqnum, + args[0].u64, args[1].u32w0, args[1].u32w1); + } + + if (ret == 1) { + if 
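+            /* in-order delivery has caught up, so drain any envelopes that
+             * were queued out of order on the master context */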
(epaddr->mctxt_master->outoforder_c) { + psmi_mq_handle_outoforder_queue(epaddr->mctxt_master); + } + ret = IPS_RECVHDRQ_CONTINUE; + } else { + ret = IPS_RECVHDRQ_BREAK; + } + } else if (mode == MQ_MSG_DATA || mode == MQ_MSG_DATA_BLK) { + psm_mq_req_t req; + + req = STAILQ_FIRST(&epaddr->mctxt_master->egrlong); + while (req) { + if (req->egrid.egr_data == p_hdr->data[1].u32w0) break; + req = STAILQ_NEXT(req, nextq); + } + +/* + * Even with single context, since the header is sent via pio-flow, + * and data is sent via sdma-flow, data could be received first, + * thus causes req=NULL. + */ + if (req == NULL) { + flow->msg_ooo_toggle = !flow->msg_ooo_toggle; + if (flow->msg_ooo_toggle) { + flow->recv_seq_num.pkt -= 1; + return IPS_RECVHDRQ_OOO; + } + } else { + flow->msg_ooo_toggle = 0; + } + + psmi_mq_handle_data(req, epaddr, p_hdr->data[1].u32w0, + p_hdr->data[1].u32w1, payload, paylen); + + /* If checksum is enabled, this matches what is done for tid-sdma */ + /* if OOO and req is NULL, header is not received and we ignore chksum */ + if (rcv_ev->proto->flags & IPS_PROTO_FLAG_CKSUM && + mode == MQ_MSG_DATA_BLK && + req && req->state == MQ_STATE_COMPLETE) { + uint32_t cksum = ips_crc_calculate( + req->recv_msglen - p_hdr->data[0].u32w1, + (uint8_t *)req->buf + p_hdr->data[0].u32w1, + 0xffffffff); + if (p_hdr->data[0].u32w0 != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Aborting! \n", p_hdr->data[0].u32w0, cksum, __be16_to_cpu(flow->path->epr_dlid)); + ips_proto_dump_data(req->buf, req->recv_msglen); + } + } + + } else if (mode == MQ_MSG_DATA_REQ || mode == MQ_MSG_DATA_REQ_BLK) { + req = psmi_mpool_find_obj_by_index(mq->rreq_pool, + p_hdr->data[1].u32w0); + if (!req) goto skip_ack_req; + psmi_mq_handle_data(req, epaddr, p_hdr->data[1].u32w0, + p_hdr->data[1].u32w1, (void *) payload, paylen); + + /* If checksum is enabled, this matches what is done for tid-sdma */ + if (rcv_ev->proto->flags & IPS_PROTO_FLAG_CKSUM && + mode == MQ_MSG_DATA_REQ_BLK && + req->state == MQ_STATE_COMPLETE) { + uint32_t cksum = ips_crc_calculate( + req->recv_msglen, req->buf, 0xffffffff); + if (p_hdr->data[0].u32w0 != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Aborting! 
\n", p_hdr->data[0].u32w0, cksum, __be16_to_cpu(flow->path->epr_dlid)); + ips_proto_dump_data(req->buf, req->recv_msglen); + } + } + + } else if (mode == MQ_MSG_CTS_EGR) { + args = p_hdr->data; + ips_proto_mq_handle_cts(rcv_ev->proto, args); + } else { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown frame mode %x", mode); + } + + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + +skip_ack_req: + ips_proto_process_ack(rcv_ev); + + return ret; // skip +} + +PSMI_INLINE( +int between(int first_seq, int last_seq, int seq)) +{ + if (last_seq >= first_seq) { + if (seq < first_seq || seq > last_seq) { + return 0; + } + } else { + if (seq > last_seq && seq < first_seq) { + return 0; + } + } + return 1; +} + +PSMI_INLINE( +int pio_dma_ack_valid(struct ips_flow *flow, psmi_seqnum_t ack_seq_num, + uint32_t ack_window)) +{ + uint32_t first_pkt, last_pkt; + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; + + if (STAILQ_EMPTY(unackedq)) + return 0; + + first_pkt = flow->xmit_ack_num.pkt + 1; + last_pkt = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.pkt; + return between(first_pkt, last_pkt, ack_seq_num.pkt); +} + +PSMI_INLINE( +struct ips_flow* get_tidflow(ips_epaddr_t *ipsaddr, + struct ips_message_header *p_hdr, + psmi_seqnum_t ack_seq_num, + uint32_t ack_window)) +{ + struct ips_flow *flow; + struct ips_protoexp *protoexp = ipsaddr->proto->protoexp; + struct ips_tid_send_desc *tidsendc; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidsendc; + uint32_t first_seq, last_seq; + struct ips_scb_unackedq *unackedq; + + tidsendc = (struct ips_tid_send_desc*) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, + desc_id._desc_idx); + if (tidsendc == NULL) { + _IPATH_ERROR("OPCODE_ACK: Index %d is out of range in tidflow ack\n", desc_id._desc_idx); + return NULL; + } + + /* Ensure generation matches */ + psmi_mpool_get_obj_index_gen_count(tidsendc, + &desc_tidsendc._desc_idx, + &desc_tidsendc._desc_genc); + if (desc_tidsendc.u64 != desc_id.u64) + return NULL; + + /* Ensure ack is within window */ + flow = &tidsendc->tidflow; + unackedq = &flow->scb_unacked; + + /* No unacked scbs */ + if (STAILQ_EMPTY(unackedq)) + return NULL; + + first_seq = flow->xmit_ack_num.seq + 1; + last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.seq; + if (between(first_seq, last_seq, ack_seq_num.seq) == 0) { + return NULL; + } + + /* Generation for ack should match */ + if (STAILQ_FIRST(unackedq)->seq_num.gen != ack_seq_num.gen) + return NULL; + + return flow; +} + +/* NAK post process for tid flow */ +void ips_tidflow_nak_post_process(struct ips_flow *flow, + struct ips_message_header *p_hdr) +{ + + ips_scb_t *scb; + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; +#ifdef PSM_DEBUG + psmi_seqnum_t new_flowgenseq; + + new_flowgenseq.val = p_hdr->data[1].u32w0; + /* Update any pending scb's to the new generation count. + * Note: flow->xmit_seq_num was updated to the new generation when the + * NAK was received. 
+ */
+    psmi_assert(STAILQ_FIRST(unackedq)->seq_num.flow==new_flowgenseq.flow);
+    psmi_assert(STAILQ_FIRST(unackedq)->seq_num.gen != new_flowgenseq.gen);
+    psmi_assert(STAILQ_FIRST(unackedq)->
+        seq_num.seq-STAILQ_FIRST(unackedq)->nfrag+1 == new_flowgenseq.seq);
+#endif
+
+    /* Update unacked scb's to use the new flowgenseq */
+    scb = STAILQ_FIRST(unackedq);
+    while (scb) {
+        scb->ips_lrh.bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn);
+        flow->xmit_seq_num.seq += scb->nfrag;
+        scb->seq_num = flow->xmit_seq_num;
+        scb->seq_num.seq--;
+        scb = SLIST_NEXT(scb, next);
+    }
+
+}
+
+// process an incoming ack message. Separate function to allow
+// for better optimization by the compiler
+void __recvpath
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+    ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+    struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+    psmi_seqnum_t ack_seq_num, last_seq_num;
+    ips_scb_t *scb;
+    struct ips_proto *proto = ipsaddr->proto;
+    struct ips_flow *flow = NULL;
+    struct ips_scb_unackedq *unackedq;
+    struct ips_scb_pendlist *scb_pend;
+    psm_protocol_type_t protocol;
+    ptl_epaddr_flow_t flowid;
+
+    ips_ptladdr_lock(ipsaddr);
+
+    protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid);
+    flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid);
+    ack_seq_num.psn = p_hdr->ack_seq_num;
+
+    switch(protocol){
+    case PSM_PROTOCOL_GO_BACK_N:
+        flow = &ipsaddr->flows[flowid];
+        ack_seq_num.pkt -= 1;
+        if (!pio_dma_ack_valid(flow, ack_seq_num, proto->scb_max_inflight))
+            goto ret;
+        flow->xmit_ack_num = ack_seq_num;
+        break;
+    case PSM_PROTOCOL_TIDFLOW:
+        ack_seq_num.seq -= 1;
+        flow = get_tidflow(ipsaddr, p_hdr, ack_seq_num, proto->scb_max_inflight);
+        if (!flow) /* Invalid ack for flow */
+            goto ret;
+        flow->xmit_ack_num = ack_seq_num;
+        break;
+    default:
+        _IPATH_ERROR("OPCODE_ACK: Unknown flow type %d in ACK\n", flowid);
+        goto ret;
+    }
+
+    unackedq = &flow->scb_unacked;
+    scb_pend = &flow->scb_pend;
+    if (STAILQ_EMPTY(unackedq)) goto ret; // only for Klocwork scan.
+    last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+    INC_TIME_SPEND(TIME_SPEND_USER2);
+
+    /* For tidflow we want to match the full flow/gen/seq; for go-back-n we
+       only match pkt#, since msg# is not known. msg# is the message envelope
+       number in the stream; you don't know whether the next packet carries
+       the old msg# or starts a new one.
+     */
+    /* first release all xmit buffers that have been received */
+    while ((protocol==PSM_PROTOCOL_GO_BACK_N) ?
+           between(STAILQ_FIRST(unackedq)->seq_num.pkt,
+                   last_seq_num.pkt, ack_seq_num.pkt) :
+           between(STAILQ_FIRST(unackedq)->seq_num.psn,
+                   last_seq_num.psn, ack_seq_num.psn)
+           ) {
+
+        /* take it out of the xmit queue and ..
*/ + scb = STAILQ_FIRST(unackedq); + STAILQ_REMOVE_HEAD(unackedq, nextq); + flow->scb_num_unacked--; + flow->credits++; + + if (scb == SLIST_FIRST(scb_pend)) { + flow->scb_num_pending--; + SLIST_REMOVE_HEAD(scb_pend, next); + } + + if (scb->flags & IPS_SEND_FLAG_WAIT_SDMA) + ips_proto_dma_wait_until(proto, scb->dma_ctr); + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->payload_size-scb->extra_bytes); + + if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames have been + * acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_cancel(proto->timerq, &flow->timer_send); + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window - all packets ACK'd */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + goto ret; + } + } + + /* CCA: If flow is congested adjust rate */ + if_pf (rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + else { + /* Increase congestion window if flow is not congested */ + if_pf (flow->cwin < proto->flow_credits) { + flow->credits += + min(flow->cwin << 1, proto->flow_credits) - flow->cwin; + flow->cwin = min(flow->cwin << 1, proto->flow_credits); + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + } + } + + /* Reclaimed some credits - attempt to flush flow */ + flow->fn.xfer.flush(flow, NULL); + + /* + * If the next packet has not even been put on the wire, cancel the + * retransmission timer since we're still presumably waiting on free + * pio bufs + */ + if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE) + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + +ret: + ips_ptladdr_unlock(ipsaddr); + return; +} + +// process an incoming nack message. 
Separate function to allow +// for better optimization by compiler +static void +_process_nak(struct ips_recvhdrq_event *rcv_ev) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + psmi_seqnum_t ack_seq_num, last_seq_num; + ips_scb_t *scb; + struct ips_proto *proto = ipsaddr->proto; + struct ips_flow *flow = NULL; + struct ips_scb_unackedq *unackedq; + struct ips_scb_pendlist *scb_pend; + psm_protocol_type_t protocol; + ptl_epaddr_flow_t flowid; + int num_resent = 0; + + ips_ptladdr_lock(ipsaddr); + + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + + INC_TIME_SPEND(TIME_SPEND_USER3); + + ack_seq_num.psn = p_hdr->ack_seq_num; + + switch(protocol){ + case PSM_PROTOCOL_GO_BACK_N: + flow = &ipsaddr->flows[flowid]; + if (!pio_dma_ack_valid(flow, ack_seq_num, proto->scb_max_inflight)) + goto ret; + ack_seq_num.pkt--; + flow->xmit_ack_num = ack_seq_num; + break; + case PSM_PROTOCOL_TIDFLOW: + flow = get_tidflow(ipsaddr, p_hdr, ack_seq_num, proto->scb_max_inflight); + if (!flow) + goto ret; /* Invalid ack for flow */ + ack_seq_num.seq--; + /* Update xmit seq num to the new flowgenseq */ + flow->xmit_seq_num = (psmi_seqnum_t)p_hdr->data[1].u32w0; + flow->xmit_ack_num = flow->xmit_seq_num; + flow->xmit_ack_num.seq--; + break; + default: + _IPATH_ERROR("OPCODE_NAK: Unknown flow type %d in ACK\n", flowid); + goto ret; + } + + unackedq = &flow->scb_unacked; + scb_pend = &flow->scb_pend; + if (STAILQ_EMPTY(unackedq)) goto ret; // only for Klockwork scan. + last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; + + ipsaddr->stats.nak_recv++; + + _IPATH_VDBG("got a nack %d on flow %d, " + "first is %d, last is %d\n", ack_seq_num.psn, + flowid, STAILQ_EMPTY(unackedq)?-1:STAILQ_FIRST(unackedq)->seq_num.psn, + STAILQ_EMPTY(unackedq)?-1:STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn); + + /* For tidflow, we want to match all flow/gen/seq, + for gobackn, we only match pkt#, msg# is not known. + msg# is the message envelope number in the stream, + you don't know if the next packet has the old msg# + or starts a new msg#. + */ + /* first release all xmit buffer that has been receveid */ + while ((protocol==PSM_PROTOCOL_GO_BACK_N) ? + between(STAILQ_FIRST(unackedq)->seq_num.pkt, + last_seq_num.pkt, ack_seq_num.pkt) : + between(STAILQ_FIRST(unackedq)->seq_num.psn, + last_seq_num.psn, ack_seq_num.psn) + ) { + /* take it out of the xmit queue and .. 
*/ + scb = STAILQ_FIRST(unackedq); + STAILQ_REMOVE_HEAD(unackedq, nextq); + flow->scb_num_unacked--; + + if (scb->flags & IPS_SEND_FLAG_WAIT_SDMA) + ips_proto_dma_wait_until(proto, scb->dma_ctr); + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->payload_size-scb->extra_bytes); + + if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames has been acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_cancel(proto->timerq, &flow->timer_send); + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window if all packets acknowledged */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + goto ret; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + if (flow->fn.protocol.nak_post_process) + flow->fn.protocol.nak_post_process(flow, p_hdr); + + /* Always cancel ACK timer as we are going to restart the flow */ + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + + /* What's now pending is all that was unacked */ + SLIST_FIRST(scb_pend) = STAILQ_FIRST(unackedq); + flow->scb_num_pending = flow->scb_num_unacked; + + /* If NAK with congestion bit set - delay re-transmitting and THEN adjust + * CCA rate. + */ + if_pf (rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + uint64_t offset; + + /* Clear congestion event and mark flow as congested */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + flow->flags |= IPS_FLOW_FLAG_CONGESTED; + + /* For congested flow use slow start i.e. reduce congestion window. + * For TIDFLOW we cannot reduce congestion window as peer expects + * header packets at regular intervals (protoexp->hdr_pkt_interval). 
+ */ + if (flow->protocol != PSM_PROTOCOL_TIDFLOW) + flow->credits = flow->cwin = 1; + else + flow->credits = flow->cwin; + + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* During congestion cancel send timer and delay retransmission by a + * random interval: half the scb's ack timeout scaled by a factor in + * [1, 2) drawn from rand() (dividing by RAND_MAX + 1.0 keeps the + * division in floating point) + */ + psmi_timer_cancel(proto->timerq, &flow->timer_send); + if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE) + offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1); + else + offset = 0; + psmi_timer_request(proto->timerq, &flow->timer_send, + (get_cycles() + + (uint64_t)(offset * (rand() / (RAND_MAX + 1.0) + 1.0)))); + } + else { + /* Reclaim all credits up to the congestion window only */ + flow->credits = flow->cwin; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* Flush pending scb's */ + flow->fn.xfer.flush(flow, &num_resent); + ipsaddr->stats.send_rexmit += num_resent; + } + +ret: + ips_ptladdr_unlock(ipsaddr); + return; +} + +static void +_process_err_chk(struct ips_recvhdrq *recvq, ips_epaddr_t *ipsaddr, + struct ips_message_header *p_hdr) +{ + psmi_seqnum_t seq_num; + int16_t seq_off; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + + INC_TIME_SPEND(TIME_SPEND_USER4); + + ipsaddr->stats.err_chk_recv++; + + seq_num.val = __be32_to_cpu(p_hdr->bth[2]); + seq_off = (int16_t)(ipsaddr->flows[flowid].recv_seq_num.pkt - seq_num.pkt); + + if_pf (seq_off <= 0) { + _IPATH_VDBG("naking for seq=%d, off=%d on flowid %d\n", + seq_num.pkt, seq_off, flowid); + + if (seq_off < -flow->ack_interval) + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + + ips_proto_send_nak(recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + } + else { + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &ipsaddr->ctrl_msg_queued, NULL); + } +} + +static void +_process_err_chk_gen(ips_epaddr_t *ipsaddr, struct ips_message_header *p_hdr) +{ + struct ips_protoexp *protoexp = ipsaddr->proto->protoexp; + struct ips_tid_recv_desc *tidrecvc; + psmi_seqnum_t err_seqnum; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t send_desc_id = p_hdr->data[1]; + ptl_arg_t desc_tidrecvc; + ptl_arg_t args[3] = {}; + int16_t seq_off; + uint8_t ack_type; + + INC_TIME_SPEND(TIME_SPEND_USER4); + + ipsaddr->stats.err_chk_recv++; + + /* Get the flowgenseq for err chk gen */ + err_seqnum.val = __be32_to_cpu(p_hdr->bth[2]); + + ips_ptladdr_lock(ipsaddr); + + /* Get receive descriptor */ + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) { + _IPATH_DBG("ERR_CHK_GEN: invalid rendezvous handle\n"); + ips_ptladdr_unlock(ipsaddr); + return; + } + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + if (desc_id.u64 != desc_tidrecvc.u64) { + /* Receive descriptor mismatch in time and space. + * Stale err chk gen, drop packet + */ + _IPATH_DBG("ERR_CHK_GEN: rendezvous handle generation mismatch. Pkt: 0x%08x, Current: 0x%08x\n", desc_id._desc_genc, desc_tidrecvc._desc_genc); + ips_ptladdr_unlock(ipsaddr); + return; + } + + psmi_assert(tidrecvc->tidflow_idx == err_seqnum.flow); + + /* Note: Do not read the tidflow table to determine the sequence to restart + * from. Always respond with the last known "good" packet that we received + * which is updated in protoexp_data(). 
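+ * In the code below, seq_off = tidflow_genseq.seq - err_seqnum.seq: + * seq_off <= 0 means the sender is asking about packets we never + * received (lost packets), so we NAK and, when the generation still + * matches the active one, swap to a freshly allocated generation; + * seq_off > 0 means we already received past that point (lost ACK), + * so we ACK unless the generation number is stale, in which case we + * NAK without allocating a new generation.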
+ */ + + /* Either lost packets or lost ack */ + seq_off = (int16_t) (tidrecvc->tidflow_genseq.seq - err_seqnum.seq); + + if (seq_off <= 0) { + ack_type = OPCODE_NAK; + + if (err_seqnum.gen == tidrecvc->tidflow_active_gen) { + /* Swap generations */ + psm_error_t err; + + /* Allocate new generation for the flow. */ + err = ips_protoexp_flow_newgen(tidrecvc); + if (err != PSM_OK) { + /* Out of generation. Drop packet and we will recover later. + * Release the ptladdr lock taken above before bailing out. */ + ips_ptladdr_unlock(ipsaddr); + return; + } + } + } + else { + ack_type = OPCODE_ACK; + + if (err_seqnum.gen != tidrecvc->tidflow_genseq.gen) + ack_type = OPCODE_NAK; /* NAK without allocating a new generation */ + } + + args[0] = send_desc_id; + args[1] = tidrecvc->tid_list.tsess_descid; + args[2].u16w0 = err_seqnum.gen; /* If NAK, generation number */ + + ips_ptladdr_unlock(ipsaddr); + + /* May want to generate a BECN if a lot of swapped generations */ + if_pf ((tidrecvc->tidflow_nswap_gen > 4) && + (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { + _IPATH_CCADBG("ERR_CHK_GEN: Generating BECN. Number of swapped generations: %d.\n", tidrecvc->tidflow_nswap_gen); + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + + /* Update stats for congestion encountered */ + ipsaddr->stats.congestion_pkts++; + } + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + ack_type, &tidrecvc->ctrl_msg_queued, args); + + /* Update stats for expected window */ + tidrecvc->stats.nErrChkReceived++; + if (ack_type == OPCODE_NAK) + tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */ +} + +static void +parse_ip_or_lid(char *buf, size_t len, uint32_t ip, psm_epid_t epid) +{ + char ipbuf[INET_ADDRSTRLEN], *p; + in_addr_t in_loop = inet_addr("127.0.0.1"); + in_addr_t in_any = inet_addr("0.0.0.0"); + + p = (char *) inet_ntop(AF_INET, (const void *) &ip, ipbuf, sizeof ipbuf); + if (ip != in_loop && ip != in_any && p) + snprintf(buf, len-1, "IP %s", p); + else + snprintf(buf, len-1, "LID 0x%x", (int) psm_epid_nid(epid)); + buf[len-1] = '\0'; +} + +#define IPS_MAX_BOGUS_ERR_CHK_BAD 15 + +static void +_process_err_chk_bad(ips_epaddr_t *ipsaddr, struct ips_message_header *p_hdr) +{ + uint32_t ipv4_addr = p_hdr->data[0].u32w0; + uint32_t pid = __be32_to_cpu(p_hdr->data[0].u32w1); + union psmi_envvar_val env_stray; + char buf[32]; + psm_epid_t epid = ipsaddr->epaddr->epid; + + parse_ip_or_lid(buf, sizeof buf, ipv4_addr, epid); + + /* First make sure that we actually do have a connection to this lid+context, + * if not, we just ignore the err_chk_bad message since it might be some + * oddly timed packet */ + if (!ips_proto_isconnected(ipsaddr)) { + int lid = (int) psm_epid_nid(epid); + int context = (int) psm_epid_context(epid); + if (++ipsaddr->proto->num_bogus_warnings <= IPS_MAX_BOGUS_ERR_CHK_BAD) + psmi_syslog(ipsaddr->proto->ep, 1, LOG_INFO, + "PSM pid %d on host %s complains that I am a stray process but " + "I'm not even connected to LID %d context %d (ignoring %s\n", + pid, buf, lid, context, + ipsaddr->proto->num_bogus_warnings == IPS_MAX_BOGUS_ERR_CHK_BAD ? + "all future stray warning checks from unknown endpoints)." : + ")."); + return; + } + + /* At this point the bad error check is a real one, from a host we thought + * we were connected to. We only go through this path once. If + * PSM_STRAY_ENABLED=0, we'll print this warning once, if it's 1 we'll die. 
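+ * (PSM_STRAY_ENABLED is read through psmi_getenv below and defaults + * to yes; running with PSM_STRAY_ENABLED=0 in the environment + * downgrades the fatal PSM_EPID_NETWORK_ERROR into a one-time syslog + * notice.)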
+ */ + if (ipsaddr->proto->done_once++) + return; + + psmi_getenv("PSM_STRAY_ENABLED", "Enable stray process detection", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val) 1, /* yes by default */ + &env_stray); + + if (env_stray.e_uint) + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, "PSM pid " + "%d on host %s has detected that I am a stray process, exiting.", + pid, buf); + else + psmi_syslog(ipsaddr->proto->ep, 1, LOG_INFO, "PSM pid " + "%d on host %s has detected that I am a stray process, " + "PSM_STRAY_ENABLED is off and future messages are ignored.", + pid, buf); + return; +} + +static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto) +{ + _IPATH_DBG("Discarding message with bad opcode 0x%x\n", + op_code); + + if (infinipath_debug & __IPATH_DBG) { + ips_proto_show_header(proto, "received bad opcode"); + ips_proto_dump_frame(proto, sizeof(struct ips_message_header), + "Opcode error protocol header dump"); + } +} + +static void +_process_unknown_opcode(struct ips_proto *proto, + struct ips_message_header *protocol_header) +{ + proto->stats.unknown_packets++; + + switch (protocol_header->sub_opcode) { + /* A bunch of pre-PSM packets that we don't handle any more */ + case OPCODE_SEQ_DATA: + case OPCODE_SEQ_CTRL: + case OPCODE_STARTUP: + case OPCODE_STARTUP_EXT: + case OPCODE_STARTUP_ACK: + case OPCODE_STARTUP_ACK_EXT: + case OPCODE_STARTUP_NAK: + case OPCODE_STARTUP_NAK_EXT: + case OPCODE_CLOSE: + case OPCODE_ABORT: + case OPCODE_CLOSE_ACK: + break; + default: + ips_bad_opcode(protocol_header->sub_opcode, protocol_header); + break; + } +} + +PSMI_NEVER_INLINE( +int +_process_connect(const struct ips_recvhdrq_event *rcv_ev)) +{ + const uint16_t lmc_mask = ~((1 << rcv_ev->proto->epinfo.ep_lmc) - 1); + + return ips_proto_process_connect(rcv_ev->proto, + ips_epid_from_phdr(lmc_mask, rcv_ev->p_hdr), + rcv_ev->p_hdr->sub_opcode, + rcv_ev->p_hdr, + ips_recvhdrq_event_payload(rcv_ev), + ips_recvhdrq_event_paylen(rcv_ev)); +} + +// Return 1 if packet is ok. +// Return 0 if packet should be skipped +int +ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + uint8_t ptype = rcv_ev->ptype; + const uint64_t current_count = get_cycles(); + struct ips_stray_epid *sepid; + struct ips_proto *proto = rcv_ev->proto; + psm_ep_t ep_err; + psm_epid_t epid; + char *pkt_type; + int opcode = (int) p_hdr->sub_opcode; + double t_elapsed; + ptl_epaddr_flow_t flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + const uint16_t lmc_mask = ~((1 << rcv_ev->proto->epinfo.ep_lmc) - 1); + + /* + * If the protocol is disabled or not yet enabled, no processing happens. + * We set t_init to 0 when disabling the protocol + */ + if (proto->t_init == 0) + return IPS_RECVHDRQ_CONTINUE; + + /* + * If lid is 0, something bad happened in queue processing + */ + epid = ips_epid_from_phdr(lmc_mask, p_hdr); + if (psm_epid_nid(epid) == 0ULL) { + proto->stats.lid_zero_errs++; + _IPATH_DBG("Skipping stray packet processing with LID=0\n"); + return IPS_RECVHDRQ_CONTINUE; + } + + /* Connect messages don't have to be from a known epaddr */ + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + _process_connect(rcv_ev); + return IPS_RECVHDRQ_CONTINUE; + case OPCODE_ERR_CHK_BAD: /* ignore, old opcode */ + return IPS_RECVHDRQ_CONTINUE; + default: + break; + } + + /* Packet from "unknown" peer. 
Log the packet and payload if at an appropriate + * verbose level. + */ + { + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + ips_proto_dump_err_stats(proto); + + if (infinipath_debug & __IPATH_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, IPATH_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + } + + /* Other messages are definitely crosstalk. */ + /* out-of-context expected messages are always fatal */ + if (ptype == RCVHQ_RCV_TYPE_EXPECTED) { + ep_err = PSMI_EP_NORETURN; + pkt_type = "expected"; + } + else if (ptype == RCVHQ_RCV_TYPE_EAGER) { + ep_err = PSMI_EP_LOGEVENT; + pkt_type = "eager"; + } + else { + ep_err = PSMI_EP_NORETURN; + pkt_type = "unknown"; + } + + proto->stats.stray_packets++; + + /* If we have debug mode, print the complete packet every time */ + if (infinipath_debug & __IPATH_PKTDBG) + ips_proto_show_header(p_hdr, "invalid commidx"); + t_elapsed = (double) + cycles_to_nanosecs(get_cycles()-proto->t_init) / 1.0e9; + + sepid = (struct ips_stray_epid *) + psmi_epid_lookup(PSMI_EP_CROSSTALK, epid); + if (sepid == NULL) { /* Never seen crosstalk from this node, log it */ + sepid = (struct ips_stray_epid *) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(struct ips_stray_epid)); + if (sepid == NULL) return 0; /* skip packet if no memory */ + psmi_epid_add(PSMI_EP_CROSSTALK, epid, (void *) sepid); + sepid->epid = epid; + if (proto->stray_warn_interval) + sepid->t_first = sepid->t_warn_next = current_count; + } + sepid->num_messages++; + sepid->t_last = current_count; + + /* If we're not going to warn the user and this is not a fatal + * packet, just skip it */ + if (sepid->t_warn_next > current_count && ep_err != PSMI_EP_NORETURN) + return 0; + + sepid->t_warn_next = current_count + proto->stray_warn_interval; + + if (p_hdr->sub_opcode == OPCODE_ERR_CHK) { + /* With the new err_check, we can print out extra information */ + char ipbuf[INET_ADDRSTRLEN], *ip = NULL; + sepid->ipv4_addr = p_hdr->data[0].u32w0; + sepid->pid = __be32_to_cpu(p_hdr->data[0].u32w1); + ip = (char *) inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, sizeof ipbuf); + + /* If the IP and PID make sense, go ahead and print useful info and + * even reply with ERR_CHK_BAD. 
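+ * (The reply is sent on a throw-away, stack-allocated ipsaddr/flow + * assembled below from the stray packet's own header fields, since + * we hold no connection state for this sender.)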
If not, fall through and print the + * generic bad error message + */ + if (ip != NULL && sepid->pid) { + /* Make up a fake ipsaddr and reply */ + ips_epaddr_t ipsaddr_f; + psm_error_t err; + + /* debugging sanity, and catch bugs */ + memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); + ipsaddr_f.epr.epr_context = IPS_HEADER_SRCCONTEXT_GET(p_hdr); + ipsaddr_f.epr.epr_subcontext = p_hdr->dst_subcontext; + ipsaddr_f.epr.epr_pkt_context = + ipsaddr_f.epr.epr_context & 0xf; + + /* Get path record for peer */ + err = proto->ibta.get_path_rec(proto, + proto->epinfo.ep_base_lid, + p_hdr->lrh[3], /* SLID */ + PSMI_HCA_TYPE_QLE73XX, + 3000, &ipsaddr_f); + if (err != PSM_OK) + goto fail; + + ipsaddr_f.epr.epr_qp = __be32_to_cpu(p_hdr->bth[1]); + ipsaddr_f.epr.epr_qp &= 0xffffff; /* QP is 24 bits */ + ipsaddr_f.ptl = (ptl_t *) -1; + ipsaddr_f.proto = proto; + /* Pretend the ctrlmsg is already queued, so it doesn't get queued + * in this fake (stack-allocated) ptladdr */ + ipsaddr_f.ctrl_msg_queued = ~0; + flowid = EP_FLOW_GO_BACK_N_PIO; + ips_flow_init(&ipsaddr_f.flows[flowid], NULL, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, flowid); + + if (!ips_proto_send_ctrl_message(&ipsaddr_f.flows[flowid], + OPCODE_ERR_CHK_BAD, + &ipsaddr_f.ctrl_msg_queued, NULL)) { + sepid->err_check_bad_sent++; + _IPATH_VDBG("did reply with ERR_CHK_BAD\n"); + } + else + _IPATH_VDBG("did *NOT* reply with ERR_CHK_BAD\n"); + + fail: + psmi_handle_error(ep_err, PSM_EPID_NETWORK_ERROR, + "Received %d out-of-context %s message(s) from stray process " + "PID=%d running on host %s (LID 0x%x, ptype=0x%x, subop=0x%x, " + "elapsed=%.3fs) %s", + sepid->num_messages, pkt_type, sepid->pid, ip, + (int) psm_epid_nid(epid), ptype, opcode, t_elapsed, + (ep_err == PSMI_EP_NORETURN) ? "Aborting." : ""); + return 0; + } + } + + /* At this point we either have an OPCODE_ERR_CHK where we couldn't + * extract a valid ip and pid OR some other opcode */ + psmi_handle_error(ep_err, PSM_EPID_NETWORK_ERROR, + "Received out-of-context %s message(s) from a stray process " + "running on LID 0x%x ptype=0x%x subop=0x%x elapsed=%.3fs", + pkt_type, (int) psm_epid_nid(epid), ptype, opcode, t_elapsed); + + return 0; /* Always skip this packet unless the above call was a noreturn + * call */ +} + +/* get the error string as a number and a string */ +static void rhf_errnum_string(char *msg, size_t msglen, long err) +{ + int len; + char *errmsg; + + len = snprintf(msg, msglen, "RHFerror %lx: ", err); + if (len > 0 && len < msglen) { + errmsg = msg + len; + msglen -= len; + } + else + errmsg = msg; + *errmsg = 0; + ips_proto_get_rhf_errstring(err, errmsg, msglen); +} + +/* + * Error handling + */ +int __recvpath +ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + int pkt_verbose_err = infinipath_debug & __IPATH_PKTDBG; + int tiderr = rcv_ev->error_flags & INFINIPATH_RHF_H_TIDERR; + int tf_seqerr = rcv_ev->error_flags & INFINIPATH_RHF_H_TFSEQERR; + int tf_generr = rcv_ev->error_flags & INFINIPATH_RHF_H_TFGENERR; + int data_err = rcv_ev->error_flags & + (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR | + INFINIPATH_RHF_H_PARITYERR | INFINIPATH_RHF_H_LENERR | + INFINIPATH_RHF_H_MTUERR | INFINIPATH_RHF_H_IHDRERR | + INFINIPATH_RHF_H_IBERR); + char pktmsg[128]; + + *pktmsg = 0; + /* + * Tid errors on eager pkts mean we get a headerq overflow, perfectly + * safe. Tid errors on expected or other packets mean trouble. 
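+ * (An eager TID error means the payload had nowhere to land but the + * header is still intact: the code below counts a header-queue + * overflow and, for MQ/AM packets at or beyond the expected sequence + * number, marks the flow congested and sends a NAK so the sender + * retransmits.)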
+ */ + if (tiderr && rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) { + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + + /* Payload dropped - Determine flow for this header and see if + * we need to generate a NAK. + * + * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO + * CONGESTION AS THE EAGER BUFFER IS FULL. + * + * Possible eager packet type: + * + * Ctrl Message - ignore + * MQ message - Can get flow and see if we need to NAK. + * AM message - Can get flow and see if we need to NAK. + */ + + proto->stats.hdr_overflow++; + if (data_err) + return 0; + + switch(p_hdr->sub_opcode) { + case OPCODE_SEQ_MQ_HDR: + case OPCODE_SEQ_MQ_CTRL: + case OPCODE_AM_REQUEST: + case OPCODE_AM_REQUEST_NOREPLY: + case OPCODE_AM_REPLY: + { + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_epstate_entry *epstaddr; + struct ips_flow *flow; + psmi_seqnum_t sequence_num; + int16_t diff; + + /* Obtain ipsaddr for packet */ + epstaddr = ips_epstate_lookup(rcv_ev->recvq->epstate, + rcv_ev->p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(p_hdr->iph.pkt_flags)); + if_pf (epstaddr == NULL || epstaddr->epid != rcv_ev->epid) + return 0; /* Unknown packet - drop */ + + rcv_ev->ipsaddr = epstaddr->ipsaddr; + flow = &rcv_ev->ipsaddr->flows[flowid]; + sequence_num.val = __be32_to_cpu(p_hdr->bth[2]); + diff = (int16_t) (sequence_num.pkt - flow->recv_seq_num.pkt); + + if (diff >= 0 && !(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Mark flow as congested and attempt to generate NAK */ + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + rcv_ev->ipsaddr->stats.congestion_pkts++; + flow->last_seq_num = sequence_num; + + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + flow->cca_ooo_pkts = 0; + ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + /* Safe to process ACKs from header */ + ips_proto_process_ack(rcv_ev); + } + break; + default: + break; + } + } + else if (tiderr) /* tid error, but not on an eager pkt */ + { + psm_ep_t ep_err = PSMI_EP_LOGEVENT; + int many_tids = 0, many_epids = 0; + uint32_t context_tid_off = + __le32_to_cpu(rcv_ev->p_hdr->iph.ver_context_tid_offset); + uint16_t tid, offset; + uint64_t t_now = get_cycles(); + + proto->tiderr_cnt++; + + /* Whether and how we will be logging this event */ + if (proto->tiderr_max > 0 && proto->tiderr_cnt >= proto->tiderr_max) + ep_err = PSMI_EP_NORETURN; + else if (proto->tiderr_warn_interval != UINT64_MAX && + proto->tiderr_tnext <= t_now) + proto->tiderr_tnext = get_cycles() + proto->tiderr_warn_interval; + else + ep_err = NULL; + + if (ep_err != NULL) { + if (proto->tiderr_context_tid_off != context_tid_off) { /* many tids */ + if (proto->tiderr_context_tid_off != 0) + many_tids = 1; + proto->tiderr_context_tid_off = context_tid_off; + } + + if (proto->tiderr_epid != rcv_ev->epid) { /* many epids */ + if (proto->tiderr_epid != 0) + many_epids = 1; + proto->tiderr_epid = rcv_ev->epid; + } + + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + + tid = (context_tid_off >> INFINIPATH_I_TID_SHIFT) & + INFINIPATH_I_TID_MASK; + offset = (context_tid_off>>INFINIPATH_I_OFFSET_SHIFT) & + INFINIPATH_I_OFFSET_MASK; + + psmi_handle_error(ep_err, PSM_EP_DEVICE_FAILURE, + "%s with tid=%d,offset=%d,count=%d " + "from %s%s %s %s", + many_tids ? "Multiple TID Errors" : "TID Error", + tid, offset, proto->tiderr_cnt, + psmi_epaddr_get_name(rcv_ev->epid), + many_epids ? " (and other hosts)" : "", + pktmsg, ep_err == PSMI_EP_NORETURN ? 
+ "(Terminating...)" : ""); + } + + if (proto->protoexp && rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) + ips_protoexp_handle_tiderr(rcv_ev); + } + else if (tf_generr) + ips_protoexp_handle_tf_generr(rcv_ev); + else if (tf_seqerr) + ips_protoexp_handle_tf_seqerr(rcv_ev); + else if (data_err) { + uint8_t op_code = __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; + + if (!pkt_verbose_err) { + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + _IPATH_DBG("Error %s pkt type opcode 0x%x at hd=0x%x %s\n", + (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "eager" : + (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "expected" : + (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "non-kd" : + "", + op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); + } + + if (proto->protoexp && rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) + ips_protoexp_handle_data_err(rcv_ev); + } + else { /* not a tid or data error -- some other error */ + uint8_t op_code = __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; + + if (!pkt_verbose_err) + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + + /* else RHFerr decode printed below */ + _IPATH_DBG("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n", + rcv_ev->ptype, op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); + } + if (pkt_verbose_err) { + if(!*pktmsg) + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + ips_proto_show_header(rcv_ev->p_hdr, pktmsg); + } + + return 0; +} + +/* + * Only valid packets make it to this point. + */ +int __recvpath +ips_proto_process_packet_inner(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + int ret = IPS_RECVHDRQ_CONTINUE; + + /* NOTE: Fault injection will currently not work with hardware suppression + * on QLE73XX. See TODO below for reason why as we currently do not update + * the hardware tidflow table if FI is dropping the packet. + * + * TODO: We need to look into the packet before dropping it and + * if it's an expected packet AND we have hardware suppression then we + * need to update the hardware tidflow table and the associated tidrecvc + * state to fake having received a packet uptil some point in the window + * defined by the loss rate. This way the subsequent err chk will be NAKd + * and we can resync the flow with the sender. + * + * Note: For real errors the hardware generates seq/gen errors which are + * handled appropriately by the protocol. 
+ */ + + if_pf (PSMI_FAULTINJ_ENABLED()) { + PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1, IPS_FAULTINJ_RECVLOST); + if (psmi_faultinj_is_fault(fi_recv)) + return ret; + } + + switch (rcv_ev->ptype) { + case RCVHQ_RCV_TYPE_EAGER: + #if 0 + _IPATH_VDBG("got packet from %d with opcode=%x, seqno=%d\n", + p_hdr->commidx, + p_hdr->sub_opcode, + __be32_to_cpu(p_hdr->bth[2])); + #endif + + switch ( p_hdr->sub_opcode ) { + case OPCODE_SEQ_MQ_HDR: + case OPCODE_SEQ_MQ_CTRL: + ret = _process_mq(rcv_ev); + break; + + case OPCODE_ACK: + ips_proto_process_ack(rcv_ev); + break; + + case OPCODE_NAK: + _process_nak(rcv_ev); + break; + + case OPCODE_AM_REQUEST: + case OPCODE_AM_REQUEST_NOREPLY: + case OPCODE_AM_REPLY: + ret = ips_proto_am(rcv_ev); + break; + case OPCODE_FLOW_CCA_BECN: + { + struct ips_proto *proto = ipsaddr->proto; + struct ips_flow *flow = NULL; + psm_protocol_type_t protocol; + ptl_epaddr_flow_t flowid; + + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + psmi_assert_always(protocol == PSM_PROTOCOL_GO_BACK_N); + flow = &ipsaddr->flows[flowid]; + + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + break; + + case OPCODE_ERR_CHK: + case OPCODE_ERR_CHK_OLD: + _process_err_chk((struct ips_recvhdrq *) rcv_ev->recvq, + ipsaddr, p_hdr); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_ERR_CHK_GEN: + _process_err_chk_gen(ipsaddr, p_hdr); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_ERR_CHK_PLS: /* skip for now */ + break; + + case OPCODE_ERR_CHK_BAD: + _process_err_chk_bad(ipsaddr, p_hdr); + break; + + case OPCODE_TIDS_GRANT: + ips_protoexp_tid_grant(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_GRANT_ACK: + ips_protoexp_tid_grant_ack(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_RELEASE: + ret = ips_protoexp_tid_release(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_RELEASE_CONFIRM: + ips_protoexp_tid_release_ack(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_SEQ_MQ_EXPTID: + ips_protoexp_data(rcv_ev); + break; + + case OPCODE_SEQ_MQ_EXPTID_UNALIGNED: + ips_protoexp_recv_unaligned_data(rcv_ev); + break; + + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + _process_connect(rcv_ev); + break; + + default: /* skip unsupported opcodes */ + _process_unknown_opcode(rcv_ev->proto, p_hdr); + break; + } /* switch (op_code) */ + break; + + case RCVHQ_RCV_TYPE_EXPECTED: + ips_protoexp_data(rcv_ev); + break; + + default: /* unknown frame type */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown frame type %x", rcv_ev->ptype); + break; + } /* switch (ptype) */ + + return ret; +} diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c new file mode 100644 index 0000000..861b66c --- /dev/null +++ 
b/ptl_ips/ips_recvhdrq.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_recvhdrq.h" + +/* + * TUNABLES TUNABLES TUNABLES + */ + +/* + * Receive Queue progress optimizations + * + * The recvhdrq_progress function supports 2 chip features, so can be written + * to support 4 possible combinations in chip features (although only 3/4 are + * currently implemented in our chips). + * + * We can either support recvhdrq_progress by implementing the function in 4 + * ways and calling it through a function pointer + * (IPS_RCVHDRQ_THRU_FUNCTION_POINTER=1) or having one implementation that + * covers all possible combinations (IPS_RCVHDRQ_THRU_FUNCTION_POINTER=0). 
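+ * (Trade-off, as wired up below: with the function-pointer scheme the + * compiler can specialize ips_recvhdrq_progress_inner() so that the + * has_no_rtail test folds to a compile-time constant inside the hot + * loop, at the cost of one indirect call per progress pass; the single + * implementation instead keeps a runtime branch on the flag inside + * the loop.)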
+ */ +#define IPS_RCVHDRQ_THRU_FUNCTION_POINTER 1 + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +static psm_error_t ips_recvhdrq_progress_none(struct ips_recvhdrq *recvq); +static psm_error_t ips_recvhdrq_progress_nortail(struct ips_recvhdrq *recvq); +#endif + + +psm_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvq_params *hdrq_params, + const struct ips_recvq_params *egrq_params, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t runtime_flags, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state) +{ + const struct ipath_base_info *base_info = &context->base_info; + psm_error_t err = PSM_OK; + + memset(recvq, 0, sizeof(*recvq)); + recvq->proto = (struct ips_proto *) proto; + recvq->state = recvq_state; + recvq->context = context; + recvq->subcontext = subcontext; + /* These runtime flags may be different from the context's runtime flags since + * a receive queue may be initialised to represent a "software" receive + * queue (shared contexts) or a hardware receive queue */ + recvq->runtime_flags = runtime_flags; + recvq->hdrq = *hdrq_params; /* deep copy */ + pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); + recvq->hdrq_rhf_off = base_info->spi_rhf_offset; + + if (recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) { + recvq->hdrq_rhf_notail = 1; + recvq->state->hdrq_rhf_seq = 1; + } + else { + recvq->hdrq_rhf_notail = 0; + recvq->state->hdrq_rhf_seq = 0; /* _seq is ignored */ + } + recvq->hdrq_elemlast = ((recvq->hdrq.elemcnt - 1) * recvq->hdrq.elemsz); + + recvq->egrq = *egrq_params; /* deep copy */ + recvq->egrq_buftable = + ips_recvq_egrbuf_table_alloc(context->ep, recvq->egrq.base_addr, + base_info->spi_rcv_egrchunksize, + recvq->egrq.elemcnt, recvq->egrq.elemsz); + if (recvq->egrq_buftable == NULL) { + err = psmi_handle_error(proto->ep, PSM_NO_MEMORY, + "Couldn't allocate memory for eager buffer index table"); + goto fail; + } + + recvq->epstate = epstate; + + /* NOTE: We should document PSM_RCVHDRCOPY is not available with QIB? */ + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER + /* Only either have NODMA RTAIL (for QLE73XX/QLE72XX) or just the vanilla + version for QLE71XX where RTAIL is DMA'd */ + if (recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) + recvq->progress_fn = ips_recvhdrq_progress_nortail; + else + recvq->progress_fn = ips_recvhdrq_progress_none; +#endif + + recvq->recvq_callbacks = *callbacks; /* deep copy */ + SLIST_INIT(&recvq->pending_acks); + + recvq->state->hdrq_head = 0; + recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; + recvq->state->num_hdrq_done = 0; + recvq->state->hdr_countdown = 0; + + { + union psmi_envvar_val env_hdr_update; + psmi_getenv("PSM_HEAD_UPDATE", + "header queue update interval (0 to update after all entries are processed). Default is 16", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 16, &env_hdr_update); + + /* Cap max header update interval to size of header/eager queue */ + recvq->state->head_update_interval = + min(env_hdr_update.e_uint, + min(recvq->hdrq.elemcnt-1, recvq->egrq.elemcnt-1)); + } + +fail: + return err; +} + +psm_error_t +ips_recvhdrq_fini(struct ips_recvhdrq *recvq) +{ + ips_recvq_egrbuf_table_free(recvq->egrq_buftable); + return PSM_OK; +} + +// flush the eager buffers, by setting the eager index head to the eager index tail +// if the eager buffer queue is full. 
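+// (Note: the helper below is compiled out with #if 0; the live recovery +// path is the hdr_countdown / false-egr-full tracking inside the main +// progress loop, which flushes the eager queue only after confirming the +// overflow was real.)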
+// +// Called when we had eager buffer overflows (ERR_TID/INFINIPATH_RHF_H_TIDERR +// was set in RHF errors), and no good eager packets were received, so +// that eager head wasn't advanced. +// + +#if 0 +static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq) +{ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + if ((head % egr_cnt) == ((tail+1)%egr_cnt)) { + _IPATH_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } + return; +} +#endif + +/* + * Helpers for ips_recvhdrq_progress. + */ + +static __inline__ int +_get_proto_subcontext(const struct ips_message_header *p_hdr) +{ + return p_hdr->dst_subcontext; +} + +/* ipath_opcode is not the ips-level opcode. */ +static __inline__ uint8_t +_get_proto_ipath_opcode(const struct ips_message_header *p_hdr) +{ + return __be32_to_cpu(p_hdr->bth[0]) >> BTH_OPCODE_SHIFT & 0xFF; +} + +/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A */ +static __inline__ uint8_t +_is_cca_fecn_set(const struct ips_message_header *p_hdr) +{ + return (__be32_to_cpu(p_hdr->bth[1]) >> BTH_FECN_SHIFT); +} + +/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A */ +static __inline__ uint8_t +_is_cca_becn_set(const struct ips_message_header *p_hdr) +{ + return (__be32_to_cpu(p_hdr->bth[1]) >> BTH_BECN_SHIFT) & 0x1; +} + +static __inline__ struct ips_message_header * +_get_proto_hdr_from_rhf(const uint32_t *rcv_hdr, const __le32 *rhf) +{ + return (struct ips_message_header *) (rcv_hdr + ipath_hdrget_offset(rhf)); +} + +static __inline__ struct ips_message_header * +_get_proto_hdr(const uint32_t *rcv_hdr) +{ + return (struct ips_message_header *) &rcv_hdr[2]; +} + +static __inline__ uint32_t +_get_rhf_seq(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr) +{ + return ipath_hdrget_seq((const __le32 *) rcv_hdr + recvq->hdrq_rhf_off); +} + +static __inline__ uint32_t +_get_rhf_len_in_bytes(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr) +{ + return ipath_hdrget_length_in_bytes((const __le32*) rcv_hdr + recvq->hdrq_rhf_off); +} + +static __inline__ void +_dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) +{ + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + if (infinipath_debug & __IPATH_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, IPATH_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + +} + +static __inline__ void +_update_error_stats(struct ips_proto *proto, uint32_t err) +{ + + if (err & INFINIPATH_RHF_H_ICRCERR) + proto->error_stats.num_icrc_err++; + if (err & INFINIPATH_RHF_H_VCRCERR) + proto->error_stats.num_vcrc_err++; + if (err & INFINIPATH_RHF_H_PARITYERR) + proto->error_stats.num_ecc_err++; + if (err & INFINIPATH_RHF_H_LENERR) + proto->error_stats.num_len_err++; + if (err & INFINIPATH_RHF_H_MTUERR) + proto->error_stats.num_mtu_err++; + if (err & INFINIPATH_RHF_H_IHDRERR) + proto->error_stats.num_khdr_err++; + if (err & INFINIPATH_RHF_H_TIDERR) + proto->error_stats.num_tid_err++; + if (err & INFINIPATH_RHF_H_MKERR) + proto->error_stats.num_mk_err++; + if (err & INFINIPATH_RHF_H_IBERR) + proto->error_stats.num_ib_err++; +} + +static int +_check_headers(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_recvhdrq *recvq = (struct ips_recvhdrq*) rcv_ev->recvq; + struct ips_proto 
*proto = rcv_ev->proto; + uint32_t *lrh = (uint32_t*) rcv_ev->p_hdr; + const uint32_t *rcv_hdr = rcv_ev->rcv_hdr; + uint32_t dest_context; + const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]); + const uint16_t base_dlid = __be16_to_cpu(recvq->proto->epinfo.ep_base_lid); + + /* Check that the receive header queue entry has a sane sequence number */ + if (_get_rhf_seq(recvq, rcv_hdr) > LAST_RHF_SEQNO) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n", _get_rhf_seq(recvq, rcv_hdr), recvq->state->hdrq_rhf_seq, lrh[0], lrh[1]); + return -1; + } + + /* Verify that the packet was destined for our context */ + dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr); + if_pf (dest_context != recvq->proto->epinfo.ep_context) { + + struct ips_recvhdrq_state *state = recvq->state; + + /* Packet not targeted at us. Drop packet and continue */ + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. Exiting.\n", dest_context, recvq->proto->epinfo.ep_context, state->hdrq_head); + + return -1; + } + + + if_pf (rcv_ev->error_flags || + (_get_proto_ipath_opcode(rcv_ev->p_hdr) != IPATH_OPCODE_USER1)) { + + return 0; /* Error flags are a special case. Let main receive loop handle + * packet processing after we account for it. + */ + } + + /* Verify that rhf packet length matches the length in LRH */ + if_pf (_get_rhf_len_in_bytes(recvq, rcv_hdr) != + (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << 2)) { + _IPATH_EPDBG("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n", _get_rhf_len_in_bytes(recvq, rcv_hdr) >> 2, __be16_to_cpu(rcv_ev->p_hdr->lrh[2])); + + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + /* Verify that the DLID matches our local LID. */ + if_pf (!((base_dlid <= pkt_dlid) && + (pkt_dlid <= (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) { + _IPATH_EPDBG("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n", rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid); + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + return 0; +} + +static __inline__ +int +do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev) +{ + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + uint32_t *ckptr; + uint32_t recv_cksum, cksum, dest_subcontext; + + /* With checksum every packet has a payload */ + psmi_assert_always(payload); + + ckptr = (uint32_t*) (payload + paylen); + recv_cksum = ckptr[0]; + + /* Calculate checksum hdr + payload (includes any padding words) */ + cksum = 0xffffffff; + cksum = ips_crc_calculate(IPATH_MESSAGE_HDR_SIZE, + (uint8_t*) rcv_ev->p_hdr, cksum); + if (paylen) + cksum = ips_crc_calculate(paylen, (uint8_t*) payload, cksum); + + if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) { + struct ips_epstate_entry *epstaddr; + uint32_t lcontext; + uint32_t hd, tl; + + epstaddr = + ips_epstate_lookup(rcv_ev->recvq->epstate, rcv_ev->p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(rcv_ev->p_hdr->iph.pkt_flags)); + epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL; + + lcontext = + epstaddr ? 
epstaddr->ipsaddr->proto->epinfo.ep_context : -1; + + hd = rcv_ev->recvq->context->ctrl->__ipath_rcvhdrhead[0]; + tl = rcv_ev->recvq->context->ctrl->__ipath_rcvhdrhead[-2]; + + dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr); + + _IPATH_ERROR("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%x,%x, rhfseq 0x%x\n", (dest_subcontext != rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext, epstaddr ? __be16_to_cpu(epstaddr->ipsaddr->epr.epr_base_lid) : -1, cksum, ckptr[0], ckptr[1], rcv_ev->p_hdr->sub_opcode, rcv_ev->error_flags,hd, tl, rcv_ev->rhf[0], rcv_ev->rhf[1], _get_rhf_seq((struct ips_recvhdrq *) rcv_ev->recvq, rcv_ev->rcv_hdr)); + + /* Dump packet */ + _dump_invalid_pkt(rcv_ev); + return 0; /* Packet checksum error */ + } + + return 1; +} + +PSMI_ALWAYS_INLINE( +void +process_pending_acks(struct ips_recvhdrq *recvq)) +{ + /* If any pending acks, dispatch them now */ + while (!SLIST_EMPTY(&recvq->pending_acks)) { + struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks); + + SLIST_REMOVE_HEAD(&recvq->pending_acks, next); + SLIST_NEXT(flow, next) = NULL; + + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + psmi_assert_always((flow->flags & IPS_FLOW_FLAG_PENDING_NAK) == 0); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } + else { + psmi_assert_always(flow->flags & IPS_FLOW_FLAG_PENDING_NAK); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } + + } + +} + +/* + * Core receive progress function + * + * recvhdrq_progress is the core function that services the receive header + * queue and optionally, the eager queue. At the lowest level, it identifies + * packets marked with errors by the chip and also detects and corrects when + * eager overflow conditions occur. At the highest level, it queries the + * 'epstate' interface to classify packets from "known" and "unknown" + * endpoints. In order to support shared contexts, it can also handle packets + * destined for other contexts (or "subcontexts"). + */ + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +PSMI_ALWAYS_INLINE( +psm_error_t +ips_recvhdrq_progress_inner(struct ips_recvhdrq *recvq, + const int has_no_rtail)) +#else +psm_error_t __recvpath +ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +#endif +{ + struct ips_recvhdrq_state *state = recvq->state; + const __le32 *rhf; + PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = { .proto = recvq->proto, + .recvq = recvq }; + + uint32_t num_hdrq_done = 0; + const int num_hdrq_todo = recvq->hdrq.elemcnt; + const uint32_t hdrq_elemsz = recvq->hdrq.elemsz; + uint32_t dest_subcontext; + + int ret = IPS_RECVHDRQ_CONTINUE; + int done = 0; + int do_hdr_update = 0; + const uint16_t lmc_mask = ~((1 << recvq->proto->epinfo.ep_lmc) - 1); + + /* Chip features */ +#if !IPS_RCVHDRQ_THRU_FUNCTION_POINTER + const int has_no_rtail = recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL; +#endif + + /* Both optional_eager and no_rtail features are in the same chip rev */ +#define has_optional_eagerbuf recvq->hdrq_rhf_off + + /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */ +#define next_hdrq_is_ready() \ + (has_no_rtail ? \ + recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, rcv_hdr) \ + : state->hdrq_head != hdrq_tail) + + const uint32_t hdrq_tail = has_no_rtail ? 
0 + : ips_recvq_tail_get(&recvq->hdrq); + const uint32_t *rcv_hdr = + (const uint32_t *) recvq->hdrq.base_addr + state->hdrq_head; + uint32_t tmp_hdrq_head; + + done = !next_hdrq_is_ready(); + + while (!done) + { + + rhf = (const __le32 *) rcv_hdr + recvq->hdrq_rhf_off; + rcv_ev.error_flags = ipath_hdrget_err_flags(rhf); + rcv_ev.ptype = ipath_hdrget_rcv_type(rhf); + rcv_ev.rhf = rhf; + rcv_ev.rcv_hdr = rcv_hdr; + rcv_ev.p_hdr = recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(rcv_hdr, rhf) + : _get_proto_hdr(rcv_hdr); + rcv_ev.epid = ips_epid_from_phdr(lmc_mask, rcv_ev.p_hdr); + rcv_ev.has_cksum = + ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && + (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER) && + (rcv_ev.p_hdr->mqhdr != MQ_MSG_DATA_BLK) && + (rcv_ev.p_hdr->mqhdr != MQ_MSG_DATA_REQ_BLK)); + + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_CCA) { + /* IBTA CCA handling: + * If FECN bit set handle IBTA CCA protocol. For the flow that + * suffered congestion we flag it to generate a control packet with + * the BECN bit set - This is currently an unsolicited ACK. + * + * For all MQ packets the FECN processing/BECN generation is done + * in the is_expected_or_nak function as each eager packet is + * inspected there. + * + * For TIDFLOW/Expected data transfers the FECN bit/BECN generation + * is done in protoexp_data. Since header suppression can result + * in even FECN packets being suppressed, the expected protocol + * generates additional BECN packets if a "large" number of generations + * are swapped without progress being made for receive. "Large" is + * set empirically to 4. + * + * FECN packets are ignored for all control messages (except ACKs + * and NAKs) since they indicate congestion on the control path which + * is not rate controlled. The CCA specification allows FECN on + * ACKs to be disregarded as well. + */ + rcv_ev.is_congested = + _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN; + rcv_ev.is_congested |= + (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1)); + } + else + rcv_ev.is_congested = 0; + + dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr); + + if_pf (_check_headers(&rcv_ev)) + goto skip_packet; + + if_pf (rcv_ev.error_flags || + (_get_proto_ipath_opcode(rcv_ev.p_hdr) != IPATH_OPCODE_USER1)) + { + + _update_error_stats(recvq->proto, rcv_ev.error_flags); + + if ((rcv_ev.error_flags & INFINIPATH_RHF_H_TIDERR) || + (rcv_ev.error_flags & INFINIPATH_RHF_H_TFSEQERR) || + (rcv_ev.error_flags & INFINIPATH_RHF_H_TFGENERR)) { + /* Subcontexts need to see expected tid errors */ + if (rcv_ev.ptype == RCVHQ_RCV_TYPE_EXPECTED && + dest_subcontext != recvq->subcontext) + goto subcontext_packet; + + recvq->recvq_callbacks.callback_error(&rcv_ev); + + if (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER) { + /* tiderr and eager, don't consider updating egr head */ + if (state->hdr_countdown == 0 && + state->rcv_egr_index_head == NO_EAGER_UPDATE) { + /* eager-full is not currently under tracing. */ + uint32_t egr_cnt = recvq->egrq.elemcnt; + const uint32_t etail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t ehead = ips_recvq_head_get(&recvq->egrq); + + if (ehead == ((etail+1)%egr_cnt)) { + /* eager is full, trace existing header entries */ + uint32_t hdr_size = recvq->hdrq_elemlast + hdrq_elemsz; + const uint32_t htail = ips_recvq_tail_get(&recvq->hdrq); + const uint32_t hhead = state->hdrq_head; + + state->hdr_countdown = (htail > hhead) ? 
+ (htail - hhead) : (htail + hdr_size - hhead); + } + } + goto skip_packet_no_egr_update; + } + } + else + recvq->recvq_callbacks.callback_error(&rcv_ev); + goto skip_packet; + } + + /* If checksum is enabled, verify that it is valid */ + if_pf (rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev)) + goto skip_packet; + + if (dest_subcontext == recvq->subcontext) { + /* Classify packet from a known or unknown endpoint */ + struct ips_epstate_entry *epstaddr; + + epstaddr = + ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(rcv_ev.p_hdr->iph.pkt_flags)); + if_pf (epstaddr == NULL || epstaddr->epid != rcv_ev.epid) { + rcv_ev.ipsaddr = NULL; + recvq->recvq_callbacks.callback_packet_unknown(&rcv_ev); + } + else { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + ret = ips_proto_process_packet(&rcv_ev); + if (ret == IPS_RECVHDRQ_OOO) return PSM_OK_NO_PROGRESS; + } + } + else { +subcontext_packet: + /* If the destination is not our subcontext, process message + * as a subcontext message (shared contexts) */ + rcv_ev.ipsaddr = NULL; + + ret = recvq->recvq_callbacks.callback_subcontext(&rcv_ev, + dest_subcontext); + } + +skip_packet: + /* + * important to update rcv_egr_index_head iff + * 1. Packet was of type eager + * 2. Packet actually consumed an eagerbuf (post QLE72XX) + * 3. Packet was *not* an eager header with RHF_H_TIDERR to mark + * an eager overflow + */ + if (has_optional_eagerbuf ? ipath_hdrget_use_egr_buf(rhf) + : (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER)) { + state->rcv_egr_index_head = ipath_hdrget_index(rhf); + /* a header entry is using an eager entry, stop tracing. */ + state->hdr_countdown = 0; + } + +skip_packet_no_egr_update: + /* Note that state->hdrq_head is sampled speculatively by the code + * in ips_ptl_shared_poll() when context sharing, so it is not safe + * for this shared variable to temporarily exceed the last element. */ + tmp_hdrq_head = state->hdrq_head + hdrq_elemsz; + if_pt (tmp_hdrq_head <= recvq->hdrq_elemlast) + state->hdrq_head = tmp_hdrq_head; + else + state->hdrq_head = 0; + + if_pf (has_no_rtail && ++recvq->state->hdrq_rhf_seq > LAST_RHF_SEQNO) + recvq->state->hdrq_rhf_seq = 1; + + state->num_hdrq_done++; + num_hdrq_done++; + rcv_hdr = (const uint32_t *) recvq->hdrq.base_addr + state->hdrq_head; + done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK) || + (num_hdrq_done == num_hdrq_todo)); + + do_hdr_update = (state->head_update_interval ? + (state->num_hdrq_done == state->head_update_interval) : done); + if (do_hdr_update) { + ips_recvq_head_update(&recvq->hdrq, state->hdrq_head); + + /* Lazy update of egrq */ + if (state->rcv_egr_index_head != NO_EAGER_UPDATE) { + ips_recvq_head_update(&recvq->egrq, state->rcv_egr_index_head); + state->rcv_egr_index_head = NO_EAGER_UPDATE; + } + + /* Process any pending acks now that the eager/header queues are updated */ + process_pending_acks(recvq); + + /* Reset header queue entries processed */ + state->num_hdrq_done = 0; + } + + if (state->hdr_countdown > 0) { + /* a header entry is consumed. */ + state->hdr_countdown -= hdrq_elemsz; + if (state->hdr_countdown == 0) { + /* header entry count reaches zero. */ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + /* Checks eager-full again. 
This is a real false-egr-full */ + if (head == ((tail+1)%egr_cnt)) { + ips_recvq_head_update(&recvq->egrq, tail); + _IPATH_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } else + _IPATH_ERROR("PSM BUG: EgrOverflow: eager queue is not full\n"); + } + } + } + /* while (hdrq_entries_to_read) */ + + /* Process any pending acks before exiting */ + process_pending_acks(recvq); + + return num_hdrq_done ? PSM_OK : PSM_OK_NO_PROGRESS; +} + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +/* + * QLE71XX + */ +static +psm_error_t __recvpath +ips_recvhdrq_progress_none(struct ips_recvhdrq *recvq) +{ + const int has_no_rtail = 0; + return ips_recvhdrq_progress_inner(recvq, has_no_rtail); +} + +/* + * QLE72XX+ + */ +static +psm_error_t __recvpath +ips_recvhdrq_progress_nortail(struct ips_recvhdrq *recvq) +{ + const int has_no_rtail = 1; + return ips_recvhdrq_progress_inner(recvq, has_no_rtail); +} + +psm_error_t __recvpath +ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +{ + /* Call the progress function with the right chip features. */ + return recvq->progress_fn(recvq); +} +#endif diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h new file mode 100644 index 0000000..1e45f57 --- /dev/null +++ b/ptl_ips/ips_recvhdrq.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_proto_header.h" +#include "ips_proto_params.h" +#include "ips_recvq.h" + +#ifndef _IPS_RECVHDRQ_H +#define _IPS_RECVHDRQ_H + +struct ips_recvhdrq; +struct ips_recvhdrq_state; +struct ips_epstate; + +#define IPS_RECVHDRQ_CONTINUE 0 +#define IPS_RECVHDRQ_BREAK 1 +#define IPS_RECVHDRQ_OOO 2 /* out of order */ +#define IPS_RECVHDRQ_ELEMSZ_MAX 32 /* 128 bytes */ +#define LAST_RHF_SEQNO 13 + +/* CCA related receive events */ +#define IPS_RECV_EVENT_FECN 0x1 +#define IPS_RECV_EVENT_BECN 0x2 + +struct ips_recvhdrq_event { + struct ips_proto *proto; + const struct ips_recvhdrq *recvq; /* where message received */ + const uint32_t *rcv_hdr; /* rcv_hdr ptr */ + const __le32 *rhf; /* receive header flags */ + struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */ + struct ptl_epaddr *ipsaddr; /* peer ipsaddr, if available */ + psm_epid_t epid; /* peer epid */ + uint32_t error_flags; /* error flags */ + uint8_t has_cksum; /* payload has cksum */ + uint8_t is_congested;/* Packet faced congestion */ + uint16_t ptype; /* packet type */ +}; + +struct ips_recvhdrq_callbacks { + int (*callback_packet_unknown)(const struct ips_recvhdrq_event *); + int (*callback_subcontext)(const struct ips_recvhdrq_event *, uint32_t subcontext); + int (*callback_error)(struct ips_recvhdrq_event *); +}; + +psm_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvq_params *hdrq_params, + const struct ips_recvq_params *egrq_params, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t flags, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state); + +psm_error_t +ips_recvhdrq_progress(struct ips_recvhdrq *recvq); + +psm_error_t +ips_recvhdrq_fini(struct ips_recvhdrq *recvq); + +/* + * Structure containing state for recvhdrq reading. This is logically + * part of ips_recvhdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the context. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +#define NO_EAGER_UPDATE ~0U +struct ips_recvhdrq_state +{ + uint32_t hdrq_head; /* software copy of head */ + uint32_t rcv_egr_index_head; /* software copy of eager index head*/ + uint32_t hdrq_rhf_seq; /* QLE73XX/QLE72XX last seq */ + uint32_t head_update_interval; /* Header update interval */ + uint32_t num_hdrq_done; /* Num header queue done */ + uint32_t hdr_countdown; /* for false-egr-full tracing */ +}; + +/* + * Structure to read from recvhdrq + */ +typedef psm_error_t (*ips_recvhdrq_progress_fn_t)(struct ips_recvhdrq *recvq); + +struct ips_recvhdrq +{ + struct ips_proto *proto; + const psmi_context_t *context; /* error handling, epid id, etc. 
*/ + ips_recvhdrq_progress_fn_t progress_fn; + struct ips_recvhdrq_state *state; + uint32_t context_flags; /* derived from base_info.spi_runtime_flags */ + uint32_t subcontext; /* messages that don't match subcontext call + * recv_callback_subcontext */ + + /* Header queue handling */ + pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */ + uint32_t hdrq_rhf_off; /* QLE73XX/QLE72XX rhf offset */ + int hdrq_rhf_notail; /* rhf notail enabled */ + uint32_t hdrq_elemlast; /* last element precomputed */ + struct ips_recvq_params hdrq; + + /* Eager queue handling */ + void **egrq_buftable; /* table of eager idx-to-ptr */ + struct ips_recvq_params egrq; + + /* Lookup endpoints epid -> ptladdr (rank) */ + const struct ips_epstate *epstate; + + /* Callbacks to handle recvq events */ + struct ips_recvhdrq_callbacks recvq_callbacks; + + /* List of flows with pending acks for receive queue */ + SLIST_HEAD(pending_flows, ips_flow) pending_acks; + + uint32_t runtime_flags; + volatile __u64 *spi_status; +}; + +PSMI_INLINE( +int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq)) +{ + if (recvq->hdrq_rhf_notail) /* use rhf-based reads */ + return recvq->state->hdrq_rhf_seq != + ipath_hdrget_seq( + recvq->hdrq.base_addr + recvq->state->hdrq_head + + recvq->hdrq_rhf_off); + else + return ips_recvq_tail_get(&recvq->hdrq) == recvq->state->hdrq_head; +} + +PSMI_INLINE( +void *ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev)) +{ + /* XXX return NULL if no eager buffer allocated */ + return ips_recvq_egr_index_2_ptr(rcv_ev->recvq->egrq_buftable, + ipath_hdrget_index(rcv_ev->rhf)); +} + +PSMI_INLINE( +int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_trylock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +int ips_recvhdrq_lock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_lock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_unlock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +uint32_t ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev)) +{ + uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0; + + return ipath_hdrget_length_in_bytes(rcv_ev->rhf) - + (sizeof(struct ips_message_header) + CRC_SIZE_IN_BYTES + cksum_len + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3)); /* padding */ +} + +#endif /* _IPS_RECVHDRQ_H */ + diff --git a/ptl_ips/ips_recvq.c b/ptl_ips/ips_recvq.c new file mode 100644 index 0000000..710320d --- /dev/null +++ b/ptl_ips/ips_recvq.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_recvq.h"
+
+/* We return a table of pointer indexes.
+ *
+ * From the point of view of the returned pointer, index -1 always points to
+ * the address to call psmi_free on (since we force page-alignment).
+ */
+void **
+ips_recvq_egrbuf_table_alloc(psm_ep_t ep, void *baseptr,
+			     uint32_t chunksize,
+			     uint32_t bufnum, uint32_t bufsize)
+{
+    unsigned i;
+    uint32_t bufperchunk = chunksize / bufsize;
+    void *ptr_alloc;
+    uintptr_t *buft;
+    uintptr_t base = (uintptr_t) baseptr;
+
+    ptr_alloc = psmi_malloc(ep, UNDEFINED,
+			    PSMI_PAGESIZE + sizeof(uintptr_t)*(bufnum+1));
+    if (ptr_alloc == NULL)
+	return NULL;
+    /* First pointer is to the actual allocated address, so we can free it but
+     * buft[1] is first on the page boundary
+     */
+    buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc+1, PSMI_PAGESIZE);
+    buft[-1] = (uintptr_t) ptr_alloc;
+    for (i = 0; i < bufnum; i++)
+	buft[i] = base + chunksize * (i / bufperchunk)
+		       + bufsize * (i % bufperchunk);
+    return (void **) buft;
+}
+
+void
+ips_recvq_egrbuf_table_free(void **buftable)
+{
+    uintptr_t *buft = (uintptr_t *) buftable;
+    void *ptr_alloc = (void *) buft[-1];
+    psmi_free(ptr_alloc);
+}
diff --git a/ptl_ips/ips_recvq.h b/ptl_ips/ips_recvq.h
new file mode 100644
--- /dev/null
+++ b/ptl_ips/ips_recvq.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_RECVQ_H
+#define _IPS_RECVQ_H
+
+#include "psm_user.h"
+
+struct ips_recvq_params {
+    volatile __le32 *tail_register; /* location of tail */
+    volatile __le32 *head_register; /* location of head */
+    uint32_t *base_addr;	    /* base address of the queue */
+    uint32_t elemsz;		    /* element size, in 32-bit words */
+    uint32_t elemcnt;		    /* number of elements */
+};
+
+/* Helpers to manage the eager index-to-buffer-address table */
+void **ips_recvq_egrbuf_table_alloc(psm_ep_t ep, void *baseptr,
+				    uint32_t chunksize,
+				    uint32_t bufnum, uint32_t bufsize);
+void ips_recvq_egrbuf_table_free(void **buftable);
+
+PSMI_INLINE(
+void *ips_recvq_egr_index_2_ptr(void **egrq_buftable, int index))
+{
+    return egrq_buftable[index];
+}
+
+PSMI_INLINE(
+void ips_recvq_head_update(const struct ips_recvq_params *recvq, uint32_t newhead))
+{
+    *recvq->head_register = __cpu_to_le32(newhead);
+    return;
+}
+
+PSMI_INLINE(
+uint32_t ips_recvq_head_get(const struct ips_recvq_params *recvq))
+{
+    uint32_t res = __le32_to_cpu(*recvq->head_register);
+    ips_rmb();
+    return res;
+}
+
+PSMI_INLINE(
+void ips_recvq_tail_update(const struct ips_recvq_params *recvq, uint32_t newtail))
+{
+    *recvq->tail_register = __cpu_to_le32(newtail);
+    return;
+}
+
+PSMI_INLINE(
+uint32_t ips_recvq_tail_get(const struct ips_recvq_params *recvq))
+{
+    uint32_t res = __le32_to_cpu(*recvq->tail_register);
+    ips_rmb();
+    return res;
+}
+
+#endif /* _IPS_RECVQ_H */
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
new file mode 100644
index 0000000..452e752
--- /dev/null
+++ b/ptl_ips/ips_scb.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_scb.h" + +psm_error_t +ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t scb_avail_callback, + void *scb_avail_context, + struct ips_scbctrl *scbc) +{ + int i; + struct ips_scb *scb; + size_t scb_size; + size_t alloc_sz; + uintptr_t base, imm_base; + psm_ep_t ep = context->ep; + //scbc->context = context; + psm_error_t err = PSM_OK; + + psmi_assert_always(numscb > 0); + scbc->sbuf_num = scbc->sbuf_num_cur = numbufs; + SLIST_INIT(&scbc->sbuf_free); + scbc->sbuf_buf_size = bufsize; + scbc->sbuf_buf_base = NULL; + scbc->sbuf_buf_alloc = NULL; + scbc->sbuf_buf_last = NULL; + + /* send buffers are not mandatory but when allocating them, make sure they + * are on a page boundary */ + if (numbufs > 0) { + struct ips_scbbuf *sbuf; + int redzone = PSM_VALGRIND_REDZONE_SZ; + + /* If the allocation requested is a page and we have redzones we have + * to allocate 2 pages so we end up using a redzone of 2048 bytes. + * + * if the allocation is not 4096, we relax that requirement and keep + * the redzones PSM_VALGRIND_REDZONE_SZ + */ + if (redzone > 0 && bufsize % PSMI_PAGESIZE == 0) + redzone = PSMI_PAGESIZE / 2; + bufsize += 2 * redzone; + bufsize = PSMI_ALIGNUP(bufsize, 64); + + alloc_sz = numbufs * bufsize + redzone + PSMI_PAGESIZE; + scbc->sbuf_buf_alloc = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->sbuf_buf_alloc == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + base = (uintptr_t)scbc->sbuf_buf_alloc; + base = PSMI_ALIGNUP(base + redzone, PSMI_PAGESIZE); + scbc->sbuf_buf_base = (void *)base; + scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs-1)); + _IPATH_VDBG("sendbufs=%d, (redzone=%d|size=%d|redzone=%d),base=[%p..%p)\n", + numbufs, redzone, bufsize-2*redzone, redzone, + (void *) scbc->sbuf_buf_base, (void *) scbc->sbuf_buf_last); + + for (i = 0; i < numbufs; i++) { + sbuf = (struct ips_scbbuf *) (base + bufsize * i); + SLIST_NEXT(sbuf, next) = NULL; + SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next); + } + + VALGRIND_CREATE_MEMPOOL(scbc->sbuf_buf_alloc, + 0, + /* Should be undefined but we stuff a next + * pointer in the buffer */ + PSM_VALGRIND_MEM_DEFINED); + } + + imm_base = 0; + scbc->scb_imm_size = imm_size; + if (scbc->scb_imm_size) { + scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64); + alloc_sz = numscb * scbc->scb_imm_size + 64; + scbc->scb_imm_buf = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->scb_imm_buf == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64); + } + else + scbc->scb_imm_buf = NULL; + + scbc->scb_num = scbc->scb_num_cur = numscb; + SLIST_INIT(&scbc->scb_free); + scb_size = sizeof(struct ips_scb) + 2*PSM_VALGRIND_REDZONE_SZ; + scb_size = PSMI_ALIGNUP(scb_size, 64); + alloc_sz = numscb * scb_size + PSM_VALGRIND_REDZONE_SZ + 64; + scbc->scb_base = (void *) + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->scb_base 
== NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + base = (uintptr_t)scbc->scb_base; + base = PSMI_ALIGNUP(base + PSM_VALGRIND_REDZONE_SZ, 64); + for (i = 0; i < numscb; i++) { + scb = (struct ips_scb *)(base + i * scb_size); + scb->scbc = scbc; + if (scbc->scb_imm_buf) + scb->imm_payload = (void*)(imm_base + (i * scbc->scb_imm_size)); + else + scb->imm_payload = NULL; + + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + } + scbc->scb_avail_callback = scb_avail_callback; + scbc->scb_avail_context = scb_avail_context; + + /* It would be nice to mark the scb as undefined but we pre-initialize the + * "next" pointer and valgrind would see this as a violation. + */ + VALGRIND_CREATE_MEMPOOL(scbc, PSM_VALGRIND_REDZONE_SZ, + PSM_VALGRIND_MEM_DEFINED); + +fail: + return err; +} + +psm_error_t +ips_scbctrl_fini(struct ips_scbctrl *scbc) +{ + if (scbc->scb_base != NULL) { + psmi_free(scbc->scb_base); + VALGRIND_DESTROY_MEMPOOL(scbc); + } + if (scbc->sbuf_buf_alloc) { + VALGRIND_DESTROY_MEMPOOL(scbc->sbuf_buf_alloc); + psmi_free(scbc->sbuf_buf_alloc); + } + return PSM_OK; +} + +int +ips_scbctrl_bufalloc(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + + psmi_assert_always(scbc->sbuf_num > 0); + psmi_assert_always(!((scb->payload >= scbc->sbuf_buf_base) && + (scb->payload <= scbc->sbuf_buf_last))); + if (SLIST_EMPTY(&scbc->sbuf_free)) + return 0; + else { + psmi_assert(scbc->sbuf_num_cur); + scb->payload = SLIST_FIRST(&scbc->sbuf_free); + scb->payload_size = scbc->sbuf_buf_size; + scbc->sbuf_num_cur--; + + /* If under memory pressure request ACK for packet to reclaim + * credits. + */ + if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + + VALGRIND_MEMPOOL_ALLOC(scbc->sbuf_buf_alloc, scb->payload, + scb->payload_size); + SLIST_REMOVE_HEAD(&scbc->sbuf_free, next); + return 1; + } +} + +int +ips_scbctrl_avail(struct ips_scbctrl *scbc) +{ + return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0); +} + +ips_scb_t * +ips_scbctrl_alloc(struct ips_scbctrl *scbc, int scbnum, int len, uint32_t flags) +{ + ips_scb_t *scb, *scb_head = NULL; + + psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? 
(scbc->sbuf_num>0) : 1); + + while (scbnum--) { + if (SLIST_EMPTY(&scbc->scb_free)) + break; + scb = SLIST_FIRST(&scbc->scb_free); + scb->flags = 0; /* Need to set this here as bufalloc may request + * an ACK under memory pressure + */ + VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb)); + + if (flags & IPS_SCB_FLAG_ADD_BUFFER) { + if (len > scbc->scb_imm_size) { + if (!ips_scbctrl_bufalloc(scb)) + break; + } + else { /* Attach immediate buffer */ + scb->payload = scb->imm_payload; + scb->payload_size = scbc->scb_imm_size; + psmi_assert(scb->payload); + } + } + else { + scb->payload = NULL; + scb->payload_size = 0; + } + + scb->tid = IPATH_EAGER_TID_ID; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->ips_lrh.mqhdr = 0; + scb->offset = 0; + scb->nfrag = 1; + scb->frag_size = 0; + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = scb_head; + scb_head = scb; + } + return scb_head; +} + +void +ips_scbctrl_free(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + if (scbc->sbuf_num && (scb->payload >= scbc->sbuf_buf_base) && + (scb->payload <= scbc->sbuf_buf_last)) { + scbc->sbuf_num_cur++; + SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next); + VALGRIND_MEMPOOL_FREE(scbc->sbuf_buf_alloc, scb->payload); + } + + scb->payload = NULL; + scb->tidsendc = NULL; + scb->payload_size = 0; + scbc->scb_num_cur++; + if (SLIST_EMPTY(&scbc->scb_free)) { + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + if (scbc->scb_avail_callback != NULL) + scbc->scb_avail_callback(scbc, scbc->scb_avail_context); + } + else + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + + VALGRIND_MEMPOOL_FREE(scbc, scb); + return; +} + +ips_scb_t * +ips_scbctrl_alloc_tiny(struct ips_scbctrl *scbc) +{ + ips_scb_t *scb; + if (SLIST_EMPTY(&scbc->scb_free)) + return NULL; + scb = SLIST_FIRST(&scbc->scb_free); + + VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb)); + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = NULL; + + scb->payload = NULL; + scb->payload_size = 0; + scb->flags = 0; + scb->tid = IPATH_EAGER_TID_ID; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->nfrag = 1; + scb->frag_size = 0; + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + return scb; +} + diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h new file mode 100644 index 0000000..f7fb148 --- /dev/null +++ b/ptl_ips/ips_scb.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_SCB_H +#define _IPS_SCB_H + +#include "psm_user.h" +#include "ips_proto_header.h" + +/* ips_alloc_scb flags */ +#define IPS_SCB_FLAG_NONE 0x0 +#define IPS_SCB_FLAG_ADD_BUFFER 0x1 + +/* macros to update scb */ +#define ips_scb_mqhdr(scb) scb->ips_lrh.mqhdr +#define ips_scb_mqtag(scb) scb->ips_lrh.data[0].u64w0 +#define ips_scb_mqparam(scb) scb->ips_lrh.data[1] +#define ips_scb_uwords(scb) scb->ips_lrh.data +#define ips_scb_subopcode(scb) scb->ips_lrh.sub_opcode +#define ips_scb_buffer(scb) scb->payload +#define ips_scb_length(scb) scb->payload_size +#define ips_scb_flags(scb) scb->flags +#define ips_scb_dma_ctr(scb) scb->dma_ctr +#define ips_scb_epaddr(scb) scb->epaddr +#define ips_scb_cb(scb) scb->callback +#define ips_scb_cb_param(scb) scb->cb_param +#define ips_scb_hdr_dlen(scb) scb->ips_lrh.hdr_dlen + +struct ips_scbbuf; +struct ips_scb; +struct ips_scbctrl; +struct ips_tid_send_desc; + +typedef void (*ips_scbctrl_avail_callback_fn_t)(struct ips_scbctrl *, + void *context); + +STAILQ_HEAD(ips_scb_stailq, ips_scb); +SLIST_HEAD(ips_scb_slist, ips_scb); + +struct ips_scbctrl { + //const psmi_context_t *context; + + /* Send control blocks for each send */ + uint32_t scb_num; + uint32_t scb_num_cur; + SLIST_HEAD(scb_free, ips_scb) scb_free; + void *scb_base; + ips_scbctrl_avail_callback_fn_t scb_avail_callback; + void *scb_avail_context; + + /* Immediate data for send buffers */ + uint32_t scb_imm_size; + void *scb_imm_buf; + + /* + * Send buffers (or bounce buffers) to keep user data if we need to + * retransmit. 
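+ * A send's payload lives either in the scb's immediate-data area (for
+ * lengths up to scb_imm_size) or in one of these bounce buffers, so the
+ * data stays valid until the peer acknowledges it; see
+ * ips_scbctrl_alloc() in ips_scb.c.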
+ */ + uint32_t sbuf_num; + uint32_t sbuf_num_cur; + SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; + void *sbuf_buf_alloc; + uint32_t sbuf_buf_size; + void *sbuf_buf_base; + void *sbuf_buf_last; +}; + +struct ips_scbbuf { + SLIST_ENTRY(ips_scbbuf) next; +}; + +typedef struct ips_scb ips_scb_t; + +struct ips_scb { + union { + SLIST_ENTRY(ips_scb) next; + STAILQ_ENTRY(ips_scb) nextq; + }; + union { + void *payload; + struct ips_scbbuf *sbuf; + }; + uint64_t ack_timeout; /* in cycles */ + uint64_t abs_timeout; /* in cycles */ + + /* Used when composing packet */ + psmi_seqnum_t seq_num; + uint32_t payload_size; + uint32_t extra_bytes; + uint32_t cksum; + uint32_t flags; + uint32_t dma_ctr; + uint32_t payload_bytes; + uint16_t pkt_flags; + uint16_t tid; + uint16_t offset; + uint16_t nfrag; + uint16_t frag_size; + + struct ips_flow *flow; + struct ptl_epaddr *epaddr; + struct ips_tid_send_desc *tidsendc; + void *tsess; + uint16_t tsess_length; + + + struct ips_scbctrl *scbc; + void *imm_payload; + + union { + int (*callback) (void *, uint32_t); + psm_am_completion_fn_t completion_am; + }; + void *cb_param; + + struct { + union ipath_pbc pbc; + struct ips_message_header ips_lrh; + } PSMI_CACHEALIGN; +}; + +void ips_scbctrl_free(ips_scb_t *scb); +int ips_scbctrl_bufalloc(ips_scb_t *scb); +int ips_scbctrl_avail(struct ips_scbctrl *scbc); +ips_scb_t * ips_scbctrl_alloc(struct ips_scbctrl *scbc, + int scbnum, int len, uint32_t flags); +ips_scb_t * ips_scbctrl_alloc_tiny(struct ips_scbctrl *scbc); + +psm_error_t ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t, void *avail_context, + struct ips_scbctrl *); +psm_error_t ips_scbctrl_fini(struct ips_scbctrl *); + +psm_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd); + +#endif /* _IPS_SCB_H */ diff --git a/ptl_ips/ips_spio.c b/ptl_ips/ips_spio.c new file mode 100644 index 0000000..2c3c985 --- /dev/null +++ b/ptl_ips/ips_spio.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* included header files */ +#include +#include +#include +#include +#include + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_spio.h" +#include "ipserror.h" /* ips error codes */ +#include "ips_proto_params.h" +#include "ipath_byteorder.h" + + +#define SPIO_INUSE_MASK 0xAAAAAAAAAAAAAAAAULL +#define SPIO_CHECK_MASK 0x5555555555555555ULL + +/* Report PIO stalls every 20 seconds at the least */ +#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) +#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ +/* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ +#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ + +static void spio_report_stall(struct ips_spio *ctrl, + uint64_t t_cyc_now, + uint64_t send_failures); + +static void spio_handle_stall(struct ips_spio *ctrl, + uint64_t send_failures); + +static inline +uint64_t +ips_spio_read_avail_index(struct ips_spio *ctrl, int index) +{ + if (ctrl->runtime_flags & IPATH_RUNTIME_PIO_REGSWAPPED && index > 3) { + return __le64_to_cpu(ctrl->spio_avail_addr[index ^ 1]); + } + else + return __le64_to_cpu(ctrl->spio_avail_addr[index]); +} + +psm_error_t +ips_spio_init(const struct psmi_context *context, const struct ptl *ptl, + struct ips_spio *ctrl) +{ + psm_error_t err = PSM_OK; + const struct ipath_base_info *base_info = &context->base_info; + unsigned wc_unordered; + char *order_str = "undefined"; + int i, last_shadow_index; + int num_shadow_index = sizeof(ctrl->spio_avail_shadow) / + sizeof(ctrl->spio_avail_shadow[0]); + + ctrl->ptl = ptl; + ctrl->context = context; + /* Copy runtime flags */ + ctrl->runtime_flags = ptl->runtime_flags; + ctrl->unit_id = context->ep->unit_id; + ctrl->portnum = context->ep->portnum; + pthread_spin_init(&ctrl->spio_lock, 0); + ctrl->spio_avail_addr = + (__le64 *)(ptrdiff_t)base_info->spi_pioavailaddr; + ctrl->spio_buffer_base = + (uint32_t *)(ptrdiff_t)base_info->spi_piobufbase; + ctrl->spio_sendbuf_status = + (unsigned long *)(ptrdiff_t)base_info->spi_sendbuf_status; + + ctrl->spio_buffer_spacing = base_info->spi_pioalign >> 2; + ctrl->spio_first_buffer = ctrl->spio_current_buffer = + base_info->spi_pioindex; + ctrl->spio_last_buffer = + ctrl->spio_first_buffer + base_info->spi_piocnt - 1; + ctrl->spio_num_of_buffer = base_info->spi_piocnt; + + ctrl->spio_consecutive_failures = 0; + ctrl->spio_num_stall = 0ULL; + ctrl->spio_next_stall_warning = 0ULL; + ctrl->spio_last_stall_cyc = 0ULL; + ctrl->spio_init_cyc = get_cycles(); + + last_shadow_index = ctrl->spio_last_buffer / 32; + last_shadow_index += (ctrl->spio_last_buffer % 32) ? 
1 : 0; + if (last_shadow_index > num_shadow_index) + { + err = psmi_handle_error(ctrl->context->ep, PSM_EP_DEVICE_FAILURE, + "Number of buffer avail registers is wrong; " + "have %u, expected %u (1st %u, piocnt %u, last %u)", + last_shadow_index, + (uint32_t)(sizeof(ctrl->spio_avail_shadow) / + sizeof(ctrl->spio_avail_shadow[0])), + base_info->spi_pioindex, ctrl->spio_last_buffer, + base_info->spi_piocnt); + goto fail; + } + + /* update the shadow copy with the current contents of hardware + * available registers */ + for (i = 0; i < num_shadow_index; i++) + ctrl->spio_avail_shadow[i] = ips_spio_read_avail_index(ctrl, i); + + /* Figure out the type of ordering we require for pio writes. Update the + * routine we use for copies according to the type of pio write required */ + wc_unordered = base_info->spi_runtime_flags; + wc_unordered &= IPATH_RUNTIME_FORCE_WC_ORDER; + + if (base_info->spi_runtime_flags & IPATH_RUNTIME_SPECIAL_TRIGGER) { + /* For now all PIO packets are < 2K and use the 2K trigger function. */ + ctrl->spio_copy_fn = ipath_write_pio_special_trigger2k; + order_str = "natural CPU (w/ 2k special trigger)"; + } + else { + switch ( wc_unordered ) { + case 0: +#ifdef __MIC__ + ctrl->spio_copy_fn = getenv("IPATH_MIC_DWORD_PIO")? + ipath_write_pio:ipath_write_pio_vector; +#else + ctrl->spio_copy_fn = ipath_write_pio; +#endif + order_str = "natural CPU"; + break; + + case IPATH_RUNTIME_FORCE_WC_ORDER: + default: // any other non-zero + ctrl->spio_copy_fn = ipath_write_pio_force_order; + order_str = "forced"; + break; + } + } + + _IPATH_PRDBG("PIO copy uses %s ordering\n", order_str); + +fail: + return err; +} + +psm_error_t +ips_spio_fini(struct ips_spio *ctrl) +{ + spio_report_stall(ctrl, get_cycles(), 0ULL); + return PSM_OK; +} + +static +void +spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now, + uint64_t send_failures) +{ + int last, i; + size_t off = 0; + char buf[1024]; + + if (ctrl->spio_num_stall == 0) + return; + + last = ctrl->spio_last_buffer/32; + + if (send_failures > 0) { + char bufctr[128]; + uint64_t tx_stat, rx_stat; + int ret; + + off = snprintf(buf, sizeof buf - 1, + "PIO Send Bufs context %d with %d bufs from %d to %d. PIO avail regs: ", + (int) psm_epid_context(ctrl->context->epid), + ctrl->spio_num_of_buffer, ctrl->spio_first_buffer, + ctrl->spio_last_buffer); + + for (i = 0; i < 8; i++) { + uint64_t avail = ips_spio_read_avail_index(ctrl, i); + off += snprintf(buf+off, sizeof buf - off - 1, " <%d>=(%llx) ", + i, (long long) avail); + } + off += snprintf(buf+off, sizeof buf - off - 1, ". 
PIO shadow regs: "); + for (i = ctrl->spio_first_buffer/32; i <= last; i++) { + off += snprintf(buf+off, sizeof buf - off - 1, " <%d>=(%llx) ", + i, (long long)ctrl->spio_avail_shadow[i]); + } + buf[off] = '\0'; + + /* In case ipathfs isn't running */ + ret = infinipath_get_single_portctr(ctrl->unit_id, ctrl->portnum, + "TxPkt", &tx_stat); + if (ret != -1) { + ret = infinipath_get_single_portctr(ctrl->unit_id, + ctrl->portnum, "RxPkt", + &rx_stat); + if (ret != -1) { + snprintf(bufctr, sizeof bufctr - 1, + "(TxPktCnt=%llu,RxPktCnt=%llu)", + (unsigned long long) tx_stat, + (unsigned long long) rx_stat); + bufctr[sizeof bufctr - 1] = '\0'; + } else + bufctr[0] = '\0'; + } else + bufctr[0] = '\0'; + _IPATH_DBG("PIO Send Stall after at least %.2fM failed send attempts " + "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n", + send_failures / 1e6, + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc), + (unsigned long long) ctrl->spio_num_stall, + bufctr[0] != '\0' ? bufctr : "", buf); + } + else { + _IPATH_DBG( + "PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs", + (unsigned long long) ctrl->spio_num_stall, + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc)); + } + + return; +} + +static void +spio_handle_stall(struct ips_spio *ctrl, + uint64_t send_failures) +{ + uint64_t t_cyc_now = get_cycles(); + int i, last; + + /* We handle the pio-stall every time but only report something every 20 + * seconds. We print a summary at the end while closing the device */ + ctrl->spio_num_stall++; + ctrl->spio_num_stall_total++; + + if (ctrl->spio_next_stall_warning <= t_cyc_now) { + /* If context status is ok (i.e. no cables pulled or anything) */ + if (psmi_context_check_status(ctrl->context) == PSM_OK) + spio_report_stall(ctrl, t_cyc_now, send_failures); + ctrl->spio_next_stall_warning = + get_cycles() + SPIO_STALL_WARNING_INTERVAL; + } + + /* re-initialize our shadow from the real registers; by this time, + * we know the hardware has to have done the update. + * Also, kernel check may have changed things. 
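+ * Where a shadow generation (check) bit agrees with the hardware copy,
+ * the hardware's busy bit is adopted below; where the two disagree, an
+ * update is still in flight and the shadow's busy bit is kept.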
+ */ + last = ctrl->spio_last_buffer/32; + for (i = 0; i <= last; i++) { + uint64_t mask, avail, shadow_avail; + + avail = ips_spio_read_avail_index(ctrl, i); + shadow_avail = ctrl->spio_avail_shadow[i]; + mask = (~(avail ^ shadow_avail) & SPIO_CHECK_MASK) << 1; + shadow_avail &= ~mask; /* clear all possible in-use bits */ + shadow_avail |= (avail & mask); + ctrl->spio_avail_shadow[i] = shadow_avail; + } + + ctrl->spio_last_stall_cyc = t_cyc_now; + + return; +} + +/* + * Update our shadow of the PIO available bitfield at index 'index' + */ +static void __sendpath +spio_update_shadow(struct ips_spio *ctrl, int index) +{ + register uint64_t mask, avail, shadow_avail; + + if_pf (*ctrl->spio_sendbuf_status) { + __u64 event_mask; + struct ips_proto *proto = (struct ips_proto*) &ctrl->ptl->proto; + + /* Get event mask for PSM to process */ + event_mask = (uint64_t) *ctrl->spio_sendbuf_status; + + /* First ack the driver the receipt of the events */ + _IPATH_VDBG("Acking event(s) 0x%"PRIx64" to qib driver.\n", (uint64_t) event_mask); + ipath_event_ack(ctrl->context->ctrl, event_mask); + + if (event_mask & IPATH_EVENT_DISARM_BUFS) { + /* Just acking event has disarmed all buffers */ + _IPATH_VDBG("Disarm of send buffers completed.\n"); + } + + if (event_mask & IPATH_EVENT_LINKDOWN) { + /* A link down event can clear the LMC and SL2VL change as those + * events are implicitly handled in the link up/down event handler. + */ + event_mask &= ~(IPATH_EVENT_LMC_CHANGE | IPATH_EVENT_SL2VL_CHANGE); + ips_ibta_link_updown_event(proto); + _IPATH_VDBG("Link down detected.\n"); + } + + if (event_mask & IPATH_EVENT_LID_CHANGE) { + /* Display a warning that LID change has occurred during the run. This + * is not supported in the current implementation and in general is + * bad for the SM to re-assign LIDs during a run. + */ + int lid, olid; + + lid = + ipath_get_port_lid(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port); + olid = PSMI_EPID_GET_LID(ctrl->context->epid); + + _IPATH_INFO("Warning! LID change detected during run. Old LID: %x, New Lid: %x\n", olid, lid); + } + + if (event_mask & IPATH_EVENT_LMC_CHANGE) { + _IPATH_INFO("Fabric LMC changed.\n"); + } + + if (event_mask & IPATH_EVENT_SL2VL_CHANGE) { + _IPATH_INFO("SL2VL mapping changed for port.\n"); + ips_ibta_init_sl2vl_table(proto); + } + } + + index &= 0x7; // max spio_avail_shadow[] index. + avail = ips_spio_read_avail_index(ctrl, index); + + do { + shadow_avail = ctrl->spio_avail_shadow[index]; + mask = (~(avail ^ shadow_avail) & SPIO_CHECK_MASK) << 1; + shadow_avail &= ~mask; /* clear all possible in-use bits */ + shadow_avail |= (avail & mask); + } +#ifndef PSMI_USE_THREADS + while (0); + ctrl->spio_avail_shadow[index] = shadow_avail; +#else + while (ips_cswap(...)); +#endif +} + +static void +spio_handle_resync(struct ips_spio *ctrl, + uint64_t consecutive_send_failed) +{ + if (ctrl->runtime_flags & IPATH_RUNTIME_FORCE_PIOAVAIL) + ipath_force_pio_avail_update(ctrl->context->ctrl); + if (!(consecutive_send_failed & (SPIO_MAX_CONSECUTIVE_SEND_FAIL - 1))) + spio_handle_stall(ctrl, consecutive_send_failed); +} + +/* + * This function attempts to write a packet to a PIO. + * + * Recoverable errors: + * PSM_OK: Packet triggered through PIO. + * PSM_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM_EP_NO_NETWORK: No network, no lid, ... + * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. 
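+ *
+ * PSM_EP_NO_RESOURCES is reported only after every PIO send buffer has
+ * been scanned once without finding a free one (the 'tries' loop below),
+ * so the caller may simply retry the frame later.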
+ */
+psm_error_t __sendpath
+ips_spio_transfer_frame(struct ips_spio *ctrl, struct ips_flow *flow,
+			void *header, void *payload, int length,
+			uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum)
+{
+    uint32_t *current_pio_buffer;
+    const uint64_t toggle_bits = 3ULL;
+    psm_error_t err = PSM_OK;
+    int tries;
+    int do_lock = (ctrl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+    struct ipath_pio_params pio_params;
+    struct ips_message_header *p_hdr = (struct ips_message_header*) header;
+
+    if (do_lock)
+	pthread_spin_lock(&ctrl->spio_lock);
+
+    if_pf (PSMI_FAULTINJ_ENABLED()) {
+	PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, IPS_FAULTINJ_PIOLOST);
+	PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1, IPS_FAULTINJ_PIOBUSY);
+	if (psmi_faultinj_is_fault(fi_lost)) {
+	    if (do_lock)
+		pthread_spin_unlock(&ctrl->spio_lock);
+	    return PSM_OK;
+	}
+	else if (psmi_faultinj_is_fault(fi_busy))
+	    goto fi_busy;
+	/* else fall through normal processing path, i.e. no faults */
+    }
+
+    if (ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] &
+	(1ULL << ((ctrl->spio_current_buffer % 32) * 2 + 1)))
+    {
+	/*
+	 * If the busy bit was already set, we couldn't get the pio buf.
+	 * Update our shadow copy.
+	 */
+	spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+
+	tries = ctrl->spio_num_of_buffer;
+
+	while (tries && (ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] &
+	       (1ULL << ((ctrl->spio_current_buffer % 32) * 2 + 1))))
+	{
+	    /* advance spio_current_buffer to next buffer */
+	    if (++ctrl->spio_current_buffer > ctrl->spio_last_buffer) {
+		ctrl->spio_current_buffer = ctrl->spio_first_buffer;
+		spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+	    }
+	    else if ((ctrl->spio_current_buffer % 32) == 0)
+		spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+	    tries--;
+	}
+
+	if_pf (!tries) {
+	    /* Check unit status */
+fi_busy:
+	    if ((err = psmi_context_check_status(ctrl->context)) == PSM_OK) {
+		if (0 == (++ctrl->spio_consecutive_failures &
+			  (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL-1)))
+		    spio_handle_resync(ctrl, ctrl->spio_consecutive_failures);
+		err = PSM_EP_NO_RESOURCES;
+	    }
+	    /* If the cable is pulled we don't count it as a consecutive
+	     * failure, we just treat it as though no send pio was available */
+	    else if (err == PSM_OK_NO_PROGRESS)
+		err = PSM_EP_NO_RESOURCES;
+	    /* else something bad happened in check_status */
+	    if (do_lock)
+		pthread_spin_unlock(&ctrl->spio_lock);
+	    return err;
+	}
+    }
+    if (ctrl->spio_num_stall)	/* now able to send, so clear if set */
+	ctrl->spio_num_stall = 0;
+
+    /* Claim the buffer: XORing 0x3 into this buffer's two-bit slot flips
+     * its generation bit and sets its busy bit (which the scan above
+     * found clear). */
+    ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] ^=
+	(toggle_bits << ((ctrl->spio_current_buffer % 32) * 2));
+
+    current_pio_buffer = (uint32_t *) ctrl->spio_buffer_base +
+	(ctrl->spio_buffer_spacing *
+	 (ctrl->spio_current_buffer - ctrl->spio_first_buffer));
+
+    /* advance spio_current_buffer to next buffer */
+    if (++ctrl->spio_current_buffer > ctrl->spio_last_buffer)
+	ctrl->spio_current_buffer = ctrl->spio_first_buffer;
+
+    ctrl->spio_consecutive_failures = 0;
+
+    if (do_lock)
+	pthread_spin_unlock(&ctrl->spio_lock);
+
+    pio_params.length = length;
+    pio_params.vl = (__be16_to_cpu(p_hdr->lrh[0]) >> LRH_VL_SHIFT) & 0xf;
+    pio_params.port = ctrl->portnum;
+    pio_params.cksum_is_valid = cksum_valid;
+    pio_params.cksum = cksum;
+
+    /* For matched send/receive rates and control messages IPD is not
+     * required.
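+     * (IPD is the InfiniBand inter-packet delay: when the path carries
+     * an active IPD value, a static rate is programmed into the PBC for
+     * bulk data below, while control messages go out unthrottled.)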
+ */ + if_pf (!isCtrlMsg && flow->path->epr_active_ipd) + pio_params.rate = + ips_proto_pbc_static_rate(flow, + (length + sizeof(struct ips_message_header))); + else + pio_params.rate = 0; + + /* Copy buffer using PIO */ + ctrl->spio_copy_fn(current_pio_buffer, &pio_params, header, payload); + + return PSM_OK; +} // ips_spio_transfer_frame() + diff --git a/ptl_ips/ips_spio.h b/ptl_ips/ips_spio.h new file mode 100644 index 0000000..2ba7cea --- /dev/null +++ b/ptl_ips/ips_spio.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IPS_SPIO_H +#define IPS_SPIO_H + +#include "psm_user.h" + +struct ips_spio; +struct ptl; + +psm_error_t ips_spio_init(const psmi_context_t *context, + const struct ptl *ptl, + struct ips_spio *ctrl); +psm_error_t ips_spio_transfer_frame(struct ips_spio *ctrl,struct ips_flow *flow, + void *header, void *payload, int length, + uint32_t isCtrlMsg, + uint32_t cksum_valid, uint32_t cksum); +psm_error_t ips_spio_fini(struct ips_spio *ctrl); + +struct ips_spio +{ + const struct ptl *ptl; + const psmi_context_t *context; + uint32_t runtime_flags; + int unit_id; + uint16_t portnum; + pthread_spinlock_t spio_lock; + + /* pio copy routine */ + void (*spio_copy_fn)(volatile uint32_t *, + const struct ipath_pio_params *pioparm, void *, void *); + + volatile __le64 *spio_avail_addr __attribute__((aligned(64))); + volatile uint32_t *spio_buffer_base; + volatile unsigned long *spio_sendbuf_status; + + uint32_t spio_buffer_spacing; + uint32_t spio_first_buffer; + uint32_t spio_last_buffer; + uint32_t spio_current_buffer; + uint32_t spio_num_of_buffer; + + uint64_t spio_avail_shadow[8] __attribute__((aligned(64))); + + uint32_t spio_consecutive_failures; + uint64_t spio_num_stall; + uint64_t spio_num_stall_total; + uint64_t spio_next_stall_warning; + uint64_t spio_last_stall_cyc; + uint64_t spio_init_cyc; + +}; + +#endif /* IPS_SPIO_H */ diff --git a/ptl_ips/ips_stats.h b/ptl_ips/ips_stats.h new file mode 100644 index 0000000..2bc4afd --- /dev/null +++ b/ptl_ips/ips_stats.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. 
All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_STATS_H +#define _IPS_STATS_H + +struct psm_epaddr; /* for non-PSM clients */ + +/* Old stats */ +typedef +struct { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t send_failed; + uint64_t recv_dropped; + union { + uint64_t recv_copied; /* obsolete */ + uint64_t nak_sent; + }; + uint64_t nak_recv; + uint64_t total_send_eager; + uint64_t total_send_exp; + uint64_t acks_sent; + uint64_t retransmits; + uint64_t recv_matched; + uint64_t recv_unmatched; + uint64_t scb_alloc_yields; +} ips_sess_stat; + +int ips_get_stat(struct psm_epaddr *epaddr, ips_sess_stat *stats); + +#endif /* _IPS_STATS_H */ diff --git a/ptl_ips/ips_subcontext.c b/ptl_ips/ips_subcontext.c new file mode 100644 index 0000000..7299d39 --- /dev/null +++ b/ptl_ips/ips_subcontext.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_subcontext.h" +#include "ptl_ips.h" + +psm_error_t +ips_subcontext_ureg_get(ptl_t *ptl, const psmi_context_t *context, + struct ips_subcontext_ureg **uregp, + uint32_t subcontext_cnt) +{ + psm_error_t err = PSM_OK; + const struct ipath_base_info *base_info = &context->base_info; + uint64_t *all_subcontext_uregbase = (uint64_t *) (uintptr_t) + base_info->spi_subctxt_uregbase; + unsigned pagesize = getpagesize(); + int i; + psmi_assert_always(all_subcontext_uregbase != NULL); + for (i = 0; i < INFINIPATH_MAX_SUBCONTEXT; i++) { + struct ips_subcontext_ureg *subcontext_ureg = + (struct ips_subcontext_ureg *) &all_subcontext_uregbase[_IPATH_UregMax*8]; + *uregp++ = (i < subcontext_cnt) ? subcontext_ureg : NULL; + all_subcontext_uregbase += pagesize / sizeof(uint64_t); + } + return err; +} + +psm_error_t +ips_subcontext_ureg_initialize(ptl_t *ptl, uint32_t subcontext, + struct ips_subcontext_ureg *uregp) +{ + psm_error_t err = PSM_OK; + memset(uregp, 0, sizeof(*uregp)); + if (subcontext == 0) { + if (pthread_spin_init(&uregp->context_lock, + PTHREAD_PROCESS_SHARED) != 0) { + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Couldn't initialize process-shared spin lock"); + } + } + return err; +} diff --git a/ptl_ips/ips_subcontext.h b/ptl_ips/ips_subcontext.h new file mode 100644 index 0000000..d69f6e9 --- /dev/null +++ b/ptl_ips/ips_subcontext.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __IPS_SUBCONTEXT_H +#define __IPS_SUBCONTEXT_H + +#include "psm_user.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" + +/* This data structure is allocated in ureg page of each subcontext process */ + +struct ips_subcontext_ureg { + pthread_spinlock_t context_lock; /* only used in master ureg */ + struct ips_recvhdrq_state recvq_state; /* only used in master ureg */ + struct ips_writehdrq_state writeq_state; /* used in all ureg pages */ +}; + +psm_error_t +ips_subcontext_ureg_get(ptl_t *ptl, const psmi_context_t *context, + struct ips_subcontext_ureg **uregp, + uint32_t subcontext_cnt); + +psm_error_t +ips_subcontext_ureg_initialize(ptl_t *ptl, uint32_t subcontext, + struct ips_subcontext_ureg *uregp); + +#endif diff --git a/ptl_ips/ips_tid.c b/ptl_ips/ips_tid.c new file mode 100644 index 0000000..eb77ed8 --- /dev/null +++ b/ptl_ips/ips_tid.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ips_tid.h" + +psm_error_t ips_ptl_handle_check_unit_status(psm_ep_t ep, int ips_rc); + +psm_error_t +ips_tid_init(struct ips_tid *tidc, const psmi_context_t *context) +{ + const struct ipath_base_info *base_info = &context->base_info; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL, + NULL, &tidc->tid_num_total), + }; + + tidc->context = context; + tidc->tid_num_max = base_info->spi_tidcnt; + tidc->tid_num_avail = base_info->spi_tidcnt; + tidc->tid_pagesz = base_info->spi_tid_maxsize; + + tidc->tid_num_total = 0; + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_TIDS, + entries, + PSMI_STATS_HOWMANY(entries), + tidc); +} + +psm_error_t +ips_tid_fini(struct ips_tid *tidc) +{ + return PSM_OK; +} + +psm_error_t +ips_tid_acquire(struct ips_tid *tidc, const void *buf, + int ntids, ips_tidmap_t tid_map, + uint16_t *tid_array) +{ + psm_error_t err = PSM_OK; + int rc; + + psmi_assert((uintptr_t)buf % tidc->tid_pagesz == 0); + psmi_assert(ntids <= tidc->tid_num_avail); + + rc = ipath_update_tid(tidc->context->ctrl, ntids, + (uint64_t)(uintptr_t) tid_array, + (uint64_t)(uintptr_t) buf, + (uint64_t)(uintptr_t) tid_map); + + if (rc != 0) { + /* We're still going to fail but check unit status */ + err = psmi_err_only(psmi_context_check_status(tidc->context)); + if (err == PSM_OK) /* okay, but something else is still wrong */ + err = psmi_handle_error(tidc->context->ep, PSM_EP_DEVICE_FAILURE, + "Failed to update %d tids", + ntids); + goto fail; + } + + tidc->tid_num_total += ntids; + tidc->tid_num_avail -= ntids; + +fail: + return err; +} + +psm_error_t +ips_tid_release(struct ips_tid *tidc, ips_tidmap_t tidmap, int ntids) +{ + psm_error_t err = PSM_OK; + + if (ipath_free_tid(tidc->context->ctrl, ntids, + (uint64_t) (uintptr_t) tidmap)) { + err = PSM_EP_DEVICE_FAILURE; + goto fail; + } + + tidc->tid_num_avail += ntids; + +fail: + return err; +} + diff --git a/ptl_ips/ips_tid.h b/ptl_ips/ips_tid.h new file mode 100644 index 0000000..92170c5 --- /dev/null +++ b/ptl_ips/ips_tid.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* included header files */ + +#ifndef _IPS_TID_H +#define _IPS_TID_H + +#include "psm_user.h" + +#define IPS_TID_MAX_TIDS 512 +#define IPS_TID_ALIGNMENT 4 + +typedef uint64_t ips_tidmap_t[IPS_TID_MAX_TIDS/64]; + +struct ips_tid { + const psmi_context_t *context; + + uint32_t tid_num_max; + uint32_t tid_num_avail; + uint32_t tid_pagesz; + + uint64_t tid_num_total; +}; + +psm_error_t ips_tid_init(struct ips_tid *tidc, const psmi_context_t *context); +psm_error_t ips_tid_fini(struct ips_tid *tidc); + +/* Acquiring tids. + * Buffer base has to be aligned on ips_tid_page_size() boundary + * Buffer base+length has to be aligned on IPS_TID_ALIGNMENT boundary + */ +psm_error_t +ips_tid_acquire(struct ips_tid *tidc, + const void *buf, /* input buffer, aligned to page_size */ + int ntids, /* input number of tids */ + ips_tidmap_t tidmap, /* output tidmap */ + uint16_t *tid_array); /* output tidarray, */ + +psm_error_t +ips_tid_release(struct ips_tid *tidc, + ips_tidmap_t tidmap, /* input tidmap */ + int ntids); /* intput number of tids to release */ +PSMI_INLINE( +psm_error_t +ips_tid_num_available(struct ips_tid *tidc)) +{ + return tidc->tid_num_avail; +} + +PSMI_INLINE( +int +ips_tid_num_required(struct ips_tid *tidc, void *bufi, uint32_t length)) +{ + uintptr_t buf = (uintptr_t) bufi; + const uint32_t page_size = tidc->tid_pagesz; + + return (PSMI_ALIGNUP(buf + length, page_size) - + PSMI_ALIGNDOWN(buf, page_size)) / page_size; +} + +PSMI_INLINE( +uint32_t +ips_tid_page_size(struct ips_tid *tidc)) +{ + return tidc->tid_pagesz; +} + +#endif /* _IPS_TID_H */ diff --git a/ptl_ips/ips_tidflow.c b/ptl_ips/ips_tidflow.c new file mode 100644 index 0000000..d769233 --- /dev/null +++ b/ptl_ips/ips_tidflow.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_tidflow.h"
+
+psm_error_t ips_tf_init(const psmi_context_t *context,
+			struct ips_tfctrl *tfctrl,
+			int start_flowidx,
+			int end_flowidx,
+			ips_tf_avail_cb_fn_t cb,
+			void *cb_context)
+{
+    int tf_idx;
+    int num_flows = end_flowidx - start_flowidx;
+
+#if TF_ADD
+    struct psmi_stats_entry entries[] = {
+	PSMI_STATS_DECL("tidflow update count", MPSPAWN_STATS_REDUCTION_ALL,
+			NULL, &tfctrl->tf_num_total),
+    };
+#endif
+
+    psmi_assert_always(num_flows > 0);
+
+    tfctrl->context = context;
+    tfctrl->tf_start_idx = start_flowidx;
+    tfctrl->tf_end_idx = end_flowidx;
+    tfctrl->tf_num_max = num_flows;
+    tfctrl->tf_num_avail = num_flows;
+    tfctrl->tf_num_total = 0;
+    tfctrl->tf_avail_cb = cb;
+    tfctrl->tf_avail_context = cb_context;
+
+    SLIST_INIT(&tfctrl->tf_avail);
+
+    for (tf_idx = start_flowidx; tf_idx < end_flowidx; tf_idx++) {
+	/* Update flow state */
+	tfctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED;
+	tfctrl->tf[tf_idx].tf_idx = tf_idx;
+	tfctrl->tf[tf_idx].next_gen = IPS_TF_INVALID_GENERATION + 1;
+
+	SLIST_NEXT(&tfctrl->tf[tf_idx], next) = NULL;
+	SLIST_INSERT_HEAD(&tfctrl->tf_avail, &tfctrl->tf[tf_idx], next);
+
+	/* Use tidflow reset here because we may want to emulate hardware
+	 * suppression on the QLE73XX: tidflow_set_entry enables the header
+	 * suppression engine, while reset does not.
+	 */
+	ipath_tidflow_reset(context->ctrl, tf_idx);
+    }
+
+#if TF_ADD
+    /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */
+    return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+				    PSMI_STATSTYPE_TIDS,
+				    entries,
+				    PSMI_STATS_HOWMANY(entries),
+				    tfctrl);
+#else
+    return PSM_OK;
+#endif
+}
+
+psm_error_t ips_tf_fini(struct ips_tfctrl *tfctrl)
+{
+    return PSM_OK;
+}
+
+/* Allocate a tidflow */
+psm_error_t ips_tf_allocate(struct ips_tfctrl *tfctrl,
+			    uint32_t *tf_idx,
+			    uint32_t *tf_gen)
+{
+    struct ips_tf *tf;
+
+    if (!tfctrl->tf_num_avail) {
+	*tf_idx = IPS_TF_INVALID;
+	*tf_gen = IPS_TF_INVALID_GENERATION;
+	return PSM_EP_NO_RESOURCES;
+    }
+
+    psmi_assert(!SLIST_EMPTY(&tfctrl->tf_avail));
+
+    tf = SLIST_FIRST(&tfctrl->tf_avail);
+    SLIST_REMOVE_HEAD(&tfctrl->tf_avail, next);
+
+    psmi_assert(tf->state == TF_STATE_DEALLOCATED);
+
+    tf->state = TF_STATE_ALLOCATED;
+
+    tfctrl->tf_num_avail--;
+    tfctrl->tf_num_total++;
+
+    *tf_idx = tf->tf_idx;
+    *tf_gen = tf->next_gen;
+
+    tf->next_gen++;
+    if (tf->next_gen == IPS_TF_INVALID_GENERATION)
+	tf->next_gen++;
+
+    psmi_assert(*tf_gen != IPS_TF_INVALID_GENERATION);
+    psmi_assert_always(*tf_gen <= IPS_TF_MAX_GENERATION);
+
+    return PSM_OK;
+}
+
+/* Deallocate a tidflow */
+psm_error_t ips_tf_deallocate(struct ips_tfctrl *tfctrl, uint32_t tf_idx)
+{
+    struct ips_tf *tf;
+
+    psmi_assert_always(tf_idx < tfctrl->tf_end_idx);
+
+    tf = &tfctrl->tf[tf_idx];
+    psmi_assert(tf->state == TF_STATE_ALLOCATED);
+    tf->state = TF_STATE_DEALLOCATED;
+
+    /* Mark invalid generation for flow (stale packets will be dropped) */
+    ipath_tidflow_set_entry(tfctrl->context->ctrl,
+			    tf_idx, IPS_TF_INVALID_GENERATION, 0);
+
+    SLIST_NEXT(tf, next) = NULL;
+    SLIST_INSERT_HEAD(&tfctrl->tf_avail, tf, next);
+
+    /* If an available callback is registered invoke it */
+    if ((tfctrl->tf_num_avail++ == 0) && tfctrl->tf_avail_cb)
+	tfctrl->tf_avail_cb(tfctrl, tfctrl->tf_avail_context);
+
+    return PSM_OK;
+}
+
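+/*
+ * Illustrative sketch (not part of the imported sources): the
+ * generation-recycling rule used by ips_tf_allocate() and
+ * ips_tfgen_allocate(), reduced to a self-contained program.  All
+ * "demo_" names are hypothetical.
+ *
+ *	#include <assert.h>
+ *	#include <stdint.h>
+ *
+ *	#define DEMO_INVALID_GEN 0	// mirrors IPS_TF_INVALID_GENERATION
+ *
+ *	// Hand out the current generation, then advance it, skipping the
+ *	// reserved invalid value exactly as the functions above do.
+ *	static uint8_t demo_next_gen(uint8_t *next_gen)
+ *	{
+ *	    uint8_t gen = (*next_gen)++;
+ *	    if (*next_gen == DEMO_INVALID_GEN)
+ *		(*next_gen)++;
+ *	    return gen;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *	    uint8_t ng = DEMO_INVALID_GEN + 1;
+ *	    int i;
+ *	    // Even across 8-bit wraparound, a handed-out generation is
+ *	    // never the invalid value, so stale packets for a reused
+ *	    // flow index can always be recognized and dropped.
+ *	    for (i = 0; i < 1000; i++)
+ *		assert(demo_next_gen(&ng) != DEMO_INVALID_GEN);
+ *	    return 0;
+ *	}
+ */
+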
+/* Allocate a generation for a flow */ +psm_error_t ips_tfgen_allocate(struct ips_tfctrl *tfctrl, + uint32_t tf_idx, + uint32_t *tfgen) +{ + struct ips_tf *tf; + int ret = PSM_OK; + + psmi_assert_always(tf_idx < tfctrl->tf_end_idx); + + tf = &tfctrl->tf[tf_idx]; + psmi_assert(tf->state == TF_STATE_ALLOCATED); + + *tfgen = tf->next_gen; + + tf->next_gen++; + if (tf->next_gen == IPS_TF_INVALID_GENERATION) + tf->next_gen++; + + return ret; +} + diff --git a/ptl_ips/ips_tidflow.h b/ptl_ips/ips_tidflow.h new file mode 100644 index 0000000..ac8e737 --- /dev/null +++ b/ptl_ips/ips_tidflow.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef _IPS_TIDFLOW_H
+#define _IPS_TIDFLOW_H
+
+#include "psm_user.h"
+
+#define IPS_TF_MAX_GENERATION     256
+#define IPS_TF_INVALID            (~0U)
+#define IPS_TF_INVALID_GENERATION 0
+
+#define IPS_TF_PSN_PACK(flow,gen,seq) \
+    ( ((((uint64_t)flow)&0x1f)<<19) | \
+      ((((uint64_t)gen)&INFINIPATH_TF_GENVAL_MASK)<<INFINIPATH_TF_GENVAL_SHIFT) | \
+      ((((uint64_t)seq)&INFINIPATH_TF_SEQNUM_MASK)<<INFINIPATH_TF_SEQNUM_SHIFT) )
+
+#define IPS_TF_PSN_UNPACK(tfval,flow,gen,seq) do { \
+      (flow) = ((tfval)>>19) & 0x1f; \
+      (gen) = ((tfval)>>INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK; \
+      (seq) = ((tfval)>>INFINIPATH_TF_SEQNUM_SHIFT) & INFINIPATH_TF_SEQNUM_MASK; \
+    } while (0)
+
+#define IPS_TF_INC_SEQ(tfval) \
+    tfval = (tfval & ~INFINIPATH_TF_SEQNUM_MASK) | ((ipath_tidflow_get_seqnum(tfval) + 1) & INFINIPATH_TF_SEQNUM_MASK)
+
+struct ips_tfctrl;
+
+typedef void (*ips_tf_avail_cb_fn_t)(struct ips_tfctrl *,
+                                     void *context);
+typedef enum {
+    TF_STATE_INVALID     = 0,
+    TF_STATE_ALLOCATED   = 1,
+    TF_STATE_DEALLOCATED = 2
+} tf_state_t;
+
+struct ips_tf {
+    SLIST_ENTRY(ips_tf) next;
+
+    tf_state_t state;
+
+    uint32_t tf_idx;
+
+    uint32_t next_gen:8;
+    uint32_t pad:24;
+};
+
+struct ips_tfctrl {
+    const psmi_context_t *context;
+
+    uint32_t tf_start_idx;
+    uint32_t tf_end_idx;
+
+    uint32_t tf_num_max;
+    uint32_t tf_num_avail;
+
+    uint32_t tf_num_total;
+
+    ips_tf_avail_cb_fn_t tf_avail_cb;
+    void *tf_avail_context;
+
+    SLIST_HEAD(tf_free, ips_tf) tf_avail;
+
+    struct ips_tf tf[INFINIPATH_TF_NFLOWS];
+};
+
+PSMI_ALWAYS_INLINE(
+int
+ips_tf_available(struct ips_tfctrl *tfctrl))
+{
+    return tfctrl->tf_num_avail;
+}
+
+psm_error_t ips_tf_init(const psmi_context_t *context,
+                        struct ips_tfctrl *tfctrl,
+                        int start_flowidx,
+                        int end_flowidx,
+                        ips_tf_avail_cb_fn_t cb,
+                        void *cb_context);
+psm_error_t ips_tf_fini(struct ips_tfctrl *tfctrl);
+
+/* Allocate a tidflow */
+psm_error_t ips_tf_allocate(struct ips_tfctrl *tfctrl,
+                            uint32_t *tf_idx,
+                            uint32_t *tf_gen);
+
+/* Deallocate a tidflow */
+psm_error_t ips_tf_deallocate(struct ips_tfctrl *tfctrl, uint32_t tf_idx);
+
+/* Allocate a generation for a flow */
+psm_error_t ips_tfgen_allocate(struct ips_tfctrl *tfctrl,
+                               uint32_t tf_idx,
+                               uint32_t *tfgen);
+
+#endif
diff --git a/ptl_ips/ips_writehdrq.c b/ptl_ips/ips_writehdrq.c
new file mode 100644
index 0000000..2fc097b
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_writehdrq.h"
+
+psm_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+                   const struct ips_recvq_params *hdrq_params,
+                   const struct ips_recvq_params *egrq_params,
+                   struct ips_writehdrq *writeq,
+                   struct ips_writehdrq_state *state,
+                   uint32_t runtime_flags)
+{
+    const struct ipath_base_info *base_info = &context->base_info;
+
+    memset(writeq, 0, sizeof(*writeq));
+    writeq->context = context;
+    writeq->state = state;
+    writeq->hdrq = *hdrq_params;        /* deep copy */
+    writeq->hdrq_elemlast = ((writeq->hdrq.elemcnt - 1) * writeq->hdrq.elemsz);
+    writeq->egrq = *egrq_params;        /* deep copy */
+    writeq->egrq_buftable =
+        ips_recvq_egrbuf_table_alloc(context->ep, writeq->egrq.base_addr,
+                                     base_info->spi_rcv_egrchunksize,
+                                     writeq->egrq.elemcnt,
+                                     writeq->egrq.elemsz);
+    writeq->runtime_flags = runtime_flags;
+    writeq->hdrq_rhf_off = base_info->spi_rhf_offset;
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        writeq->state->hdrq_rhf_seq = 1;
+        /*
+         * Copy the header without its RHF (the last two words): readers
+         * must not see the RHF until the writer can atomically write an
+         * updated RHF.
+         */
+        writeq->hdrq_hdr_copysz = (writeq->hdrq.elemsz - 2) * sizeof(uint32_t);
+        /*
+         * Check that the RHF is 8-byte aligned, as required for atomic RHF
+         * updates, by looking at the RHF of the second header.
+         */
+        psmi_assert_always(
+            !((uintptr_t)(writeq->hdrq.base_addr +
+                          writeq->hdrq.elemsz + writeq->hdrq_rhf_off) & 0x7));
+    }
+    else {
+        writeq->hdrq_hdr_copysz = writeq->hdrq.elemsz * sizeof(uint32_t);
+        writeq->state->hdrq_rhf_seq = 0;    /* _seq is ignored */
+    }
+    writeq->state->enabled = 1;
+    return PSM_OK;
+}
+
+psm_error_t
+ips_writehdrq_fini(struct ips_writehdrq *writeq)
+{
+    ips_recvq_egrbuf_table_free(writeq->egrq_buftable);
+    return PSM_OK;
+}
diff --git a/ptl_ips/ips_writehdrq.h b/ptl_ips/ips_writehdrq.h
new file mode 100644
index 0000000..25e91d7
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_WRITEHDRQ_H
+#define _IPS_WRITEHDRQ_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_recvq.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Structure containing state for writehdrq writing. This is logically
+ * part of ips_writehdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the port. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+struct ips_writehdrq_state
+{
+    uint32_t hdrq_rhf_seq;      /* last seq */
+    uint32_t enabled;           /* enables writing */
+};
+
+struct ips_writehdrq
+{
+    const psmi_context_t *context;
+    struct ips_writehdrq_state *state;
+    struct ips_recvq_params hdrq;
+    uint32_t hdrq_elemlast;
+    uint32_t hdrq_rhf_off;      /* rhf offset */
+    uint32_t hdrq_hdr_copysz;
+    struct ips_recvq_params egrq;
+    void **egrq_buftable;       /* table of eager idx-to-ptr */
+    uint32_t runtime_flags;
+};
+
+psm_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+                   const struct ips_recvq_params *hdrq_params,
+                   const struct ips_recvq_params *egrq_params,
+                   struct ips_writehdrq *writeq,
+                   struct ips_writehdrq_state *state,
+                   uint32_t runtime_flags);
+
+psm_error_t
+ips_writehdrq_fini(struct ips_writehdrq *writeq);
+
+PSMI_ALWAYS_INLINE(
+void
+ips_writehdrq_write_rhf_atomic(uint32_t *rhf_dest, uint32_t *rhf_src))
+{
+#if WORDSIZE == 64
+    /*
+     * In 64-bit mode, we check in init that the rhf will always be 8-byte
+     * aligned
+     */
+    *((uint64_t *)rhf_dest) = *((uint64_t *)rhf_src);
+#else
+    /*
+     * In 32-bit mode, we ensure that word 0 always gets written before word 1
+     */
+    rhf_dest[0] = rhf_src[0];
+    ips_wmb();
+    rhf_dest[1] = rhf_src[1];
+#endif
+    return;
+}
+
+PSMI_INLINE(
+int
+ips_writehdrq_append(struct ips_writehdrq *writeq,
+                     const struct ips_recvhdrq_event *rcv_ev))
+{
+    const uint32_t *rcv_hdr = rcv_ev->rcv_hdr;
+    uint32_t write_hdr_head;
+    uint32_t write_hdr_tail;
+    uint32_t *write_hdr;
+    uint32_t *write_rhf;
+    char *write_payload = NULL;
+    uint32_t next_write_hdr_tail;
+    uint32_t rcv_paylen;
+    union {
+        uint32_t u32[2];
+        uint64_t u64;
+    } rhf;
+    int result = IPS_RECVHDRQ_CONTINUE;
+
+    /* Drop packet if write header queue is disabled */
+    if (!writeq->state->enabled) {
+        result = IPS_RECVHDRQ_BREAK;
+        goto done;
+    }
+
+    write_hdr_head = ips_recvq_head_get(&writeq->hdrq);
+    write_hdr_tail = ips_recvq_tail_get(&writeq->hdrq);
+    write_hdr = writeq->hdrq.base_addr + write_hdr_tail;
+    write_rhf = write_hdr + writeq->hdrq_rhf_off;
+
+    /* Drop packet if write header queue is full */
+    next_write_hdr_tail = write_hdr_tail + writeq->hdrq.elemsz;
+    if (next_write_hdr_tail > writeq->hdrq_elemlast)
+        next_write_hdr_tail = 0;
+    if (next_write_hdr_tail == write_hdr_head) {
+        result = IPS_RECVHDRQ_BREAK;
+        goto done;
+    }
+
+    /*
+     * If NODMA_RTAIL, don't let the consumer see the RHF until it's ready.
+     * We copy the source rhf and operate on it until we are ready to
+     * atomically update it for the reader.
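+     *
+     * (Sketch of the resulting update order, for illustration only:
+     *     rhf.u64 = *(uint64_t *) rcv_ev->rhf;        take a private copy
+     *     ...patch eager index / error flags in the copy...
+     *     ipath_hdrset_seq(write_rhf, rhf_seq);       stamp the expected seq
+     *     ips_writehdrq_write_rhf_atomic(dest, write_rhf);
+     * so a reader polling the queue never observes a half-written RHF.
+     * Here dest stands for write_hdr + writeq->hdrq_rhf_off.)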
+ */
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        write_rhf = &rhf.u32[0];
+        rhf.u64 = *((uint64_t *) rcv_ev->rhf);
+    }
+
+    /* Copy the data if this is an eager packet */
+    rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev);
+    rcv_paylen += (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0);
+
+    if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER && rcv_paylen > 0)
+    {
+        uint32_t write_egr_tail = ips_recvq_tail_get(&writeq->egrq);
+        uint32_t next_write_egr_tail;
+
+        /* Check whether the write eager queue is full */
+        next_write_egr_tail = write_egr_tail + 1;
+        if (next_write_egr_tail >= writeq->egrq.elemcnt)
+            next_write_egr_tail = 0;
+        if (next_write_egr_tail == ips_recvq_head_get(&writeq->egrq)) {
+            /* Eager queue is full: drop the payload, but still deliver the
+             * header to the subcontext's header queue, marked with TIDERR
+             * (eager overflow), so the reader stays in sync. */
+            psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+            ipath_hdrset_err_flags(write_rhf, INFINIPATH_RHF_H_TIDERR);
+
+            /* Fix up the header with the current subcontext eager index */
+            ipath_hdrset_index(write_rhf, write_egr_tail);
+
+            result = IPS_RECVHDRQ_BREAK;
+        }
+        else {
+            if (rcv_paylen) {
+                const char *rcv_payload = ips_recvhdrq_event_payload(rcv_ev);
+
+                /* Use the pre-calculated address from the look-up table */
+                write_payload = ips_recvq_egr_index_2_ptr(
+                    writeq->egrq_buftable, write_egr_tail);
+
+                psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen);
+            }
+
+            /* Copy the header to the subcontext's header queue */
+            psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+            /* Fix up the header with the subcontext's eager index */
+            ipath_hdrset_index((uint32_t *) write_rhf, write_egr_tail);
+
+            /* Update the eager buffer tail pointer */
+            ips_recvq_tail_update(&writeq->egrq, next_write_egr_tail);
+        }
+    }
+    else {
+        /* Copy the header to the subcontext's header queue */
+        psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+        /* Record the current eager tail in the header; this also handles
+         * the eager-with-no-payload case */
+        if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER)
+            ipath_hdrset_index((uint32_t *) write_rhf,
+                               ips_recvq_tail_get(&writeq->egrq));
+    }
+
+    /* Ensure previous writes are visible before writing rhf seq or tail */
+    ips_wmb();
+
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        /* We accumulated a few changes to the RHF and now want to make it
+         * atomically visible for the reader.
+         */
+        uint32_t rhf_seq = writeq->state->hdrq_rhf_seq;
+        ipath_hdrset_seq((uint32_t *) write_rhf, rhf_seq);
+        if (rhf_seq >= LAST_RHF_SEQNO)
+            writeq->state->hdrq_rhf_seq = 1;
+        else
+            writeq->state->hdrq_rhf_seq = rhf_seq + 1;
+
+        /* Now write the new rhf */
+        ips_writehdrq_write_rhf_atomic(write_hdr + writeq->hdrq_rhf_off, write_rhf);
+    }
+
+    /* The tail must be updated regardless of IPATH_RUNTIME_NODMA_RTAIL
+     * since this tail is also used to keep track of where
+     * ips_writehdrq_append will write to next. For subcontexts there is
+     * no separate shadow copy of the tail. */
+    ips_recvq_tail_update(&writeq->hdrq, next_write_hdr_tail);
+
+done:
+    return result;
+}
+
+#endif /* _IPS_WRITEHDRQ_H */
diff --git a/ptl_ips/ipserror.c b/ptl_ips/ipserror.c
new file mode 100644
index 0000000..1c84d47
--- /dev/null
+++ b/ptl_ips/ipserror.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* IPS - Interconnect Protocol Stack */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include "ipserror.h"
+
+char *ips_err_str(int ips_error)
+{
+    static char err_str[128];
+
+    switch (ips_error) {
+    case IPS_RC_OK:
+        return "OK!";
+
+    case IPS_RC_ERROR:
+        return "general error";
+
+    case IPS_RC_PENDING:
+        return "request pending";
+
+    case IPS_RC_EXIST:
+        return "entry exists";
+
+    case IPS_RC_MAX_ENTRIES_EXCEEDED:
+        return "maximum number of entries has been exceeded";
+
+    case IPS_RC_NOT_ENOUGH_BUFFERS:
+        return "not enough buffers to complete request";
+
+    case IPS_RC_NO_FREE_MEM:
+        return "no free memory";
+
+    case IPS_RC_NAME_LOOKUP_FAILED:
+        return "name lookup failed";
+
+    case IPS_RC_PARAM_ERROR:
+        return "invalid parameter";
+
+    case IPS_RC_UNKNOWN_DEVICE:
+        return "unknown device";
+
+    case IPS_RC_DEVICE_INIT_FAILED:
+        return "device init failed";
+
+    case IPS_RC_DATA_TRUNCATED:
+        return "data truncated";
+
+    case IPS_RC_INVALID_RANK:
+        return "invalid rank";
+
+    case IPS_RC_INVALID_OPCODE:
+        return "invalid op code";
+
+    case IPS_RC_PEER_NOT_READY:
+        return "peer is not ready";
+
+    case IPS_RC_PEER_CLOSED:
+        return "peer is closed";
+
+    case IPS_RC_DEST_EQUAL_LOCAL_RANK:
+        return "src and dest ranks are equal";
+
+    case IPS_RC_DEVICE_ERROR:
+        return "InfiniPath hardware not found, hardware problem, or disabled";
+
+    case IPS_RC_NETWORK_DOWN:
+        return "The link is down";
+
+    case IPS_RC_NOT_ENOUGH_FREE_TIDS:
+        return "Not enough free TIDs to complete request";
+
+    case IPS_RC_NO_RESOURCE_AVAILABLE:
+        return "Internal resources exhausted";
+
+    case IPS_RC_HW_UPDATE_FAILED:
+        return "Failed TID update for rendezvous, allocation problem";
+
+    case IPS_RC_PARTITION_ERROR:
+        return "One or more nodes are on a different partition";
+
+    case IPS_RC_RUN_ERROR:
+        return "One or more nodes are still running the previous job";
+
+    case IPS_RC_ALREADY_OPEN:
+        return "Open/init has already been called";
+
+    case IPS_RC_WAS_CLOSED:
+        return "Close has already been called";
+
+    case IPS_RC_DEST_EQUAL_LOCAL_LID:
+        return "src and dest LIDs are equal";
+
+    case IPS_RC_BUFFER_ALIGMENT_ERROR:
+        return "Buffer start address is not 32-bit aligned";
+
+    case IPS_RC_LENGTH_ALIGMENT_ERROR:
+        return "Buffer length is not a whole number of 32-bit words";
+
+    case IPS_RC_INVALID_DATA_LENGTH:
+        return "invalid data length";
+
+ case IPS_RC_BUSY: + return "Device is busy"; + + case IPS_RC_INIT_TIMEOUT_EXPIRED: + return "Could not connect to other nodes"; + + case IPS_RC_NO_PORTS_AVAILABLE: + return "All InfiniPath ports are in use."; + + /* Performance Counters codes */ + case IPS_RCPERF_INIT_FAILED: + return "Initialization of performance counters failed"; + + case IPS_RCPERF_EVENT_SETUP_FAILED: + return "Setting performance counter events failed"; + + case IPS_RCPERF_REG_DEFAULT_SET: + return "Default event set for one of the counters"; + + case IPS_RCPERF_UNSUPPORTED_CPU: + return "This CPU type is not supported"; + + case IPS_RCPERF_REG_GET_FAILED: + return "Failed to get register value for event"; + + case IPS_RCPERF_SET_EVENT_STR_FAILED: + return "Failed to find event description"; + + case IPS_RCPERF_INVALID_REGISTER: + return "Register index out of range of available counters"; + + case IPS_RC_SYSERR: // we hope errno hasn't changed since this was set... + snprintf(err_str, sizeof err_str, "System error: %s", strerror(errno)); + return err_str; + + default: + snprintf(err_str, sizeof err_str, "Error code %i: ", ips_error); + return err_str; + } +} diff --git a/ptl_ips/ipserror.h b/ptl_ips/ipserror.h new file mode 100644 index 0000000..57f35de --- /dev/null +++ b/ptl_ips/ipserror.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * interface to InfiniPath Interconnect Protocol Stack + * + * This file contains the function prototypes of the interconnect protocol + * stack. It should be included in all the clients of the stack, such as MPI. 
+ */ + +#ifndef ipserror_h +#define ipserror_h + +#ifdef __cplusplus +extern "C" { +#endif + +/* Return codes */ +#define IPS_RC_OK 0 +#define IPS_RC_ERROR (-1) +#define IPS_RC_PENDING (-2) +#define IPS_RC_EXIST (-3) +#define IPS_RC_MAX_ENTRIES_EXCEEDED (-4) +#define IPS_RC_NOT_ENOUGH_BUFFERS (-100) +#define IPS_RC_NO_FREE_MEM (-101) +#define IPS_RC_NAME_LOOKUP_FAILED (-102) +#define IPS_RC_PARAM_ERROR (-103) +#define IPS_RC_UNKNOWN_DEVICE (-104) +#define IPS_RC_DEVICE_INIT_FAILED (-105) +#define IPS_RC_DATA_TRUNCATED (-106) +#define IPS_RC_INVALID_RANK (-107) +#define IPS_RC_INVALID_OPCODE (-108) +#define IPS_RC_PEER_NOT_READY (-109) +#define IPS_RC_PEER_CLOSED (-110) +#define IPS_RC_DEST_EQUAL_LOCAL_RANK (-111) +#define IPS_RC_DEVICE_ERROR (-112) +#define IPS_RC_NETWORK_DOWN (-113) +#define IPS_RC_NOT_ENOUGH_FREE_TIDS (-114) +#define IPS_RC_NO_RESOURCE_AVAILABLE (-115) +#define IPS_RC_HW_UPDATE_FAILED (-116) +#define IPS_RC_PARTITION_ERROR (-117) +#define IPS_RC_RUN_ERROR (-118) +#define IPS_RC_ALREADY_OPEN (-119) +#define IPS_RC_WAS_CLOSED (-120) +#define IPS_RC_DEST_EQUAL_LOCAL_LID (-121) +#define IPS_RC_BUFFER_ALIGMENT_ERROR (-122) +#define IPS_RC_LENGTH_ALIGMENT_ERROR (-123) +#define IPS_RC_INVALID_DATA_LENGTH (-124) +#define IPS_RC_BUSY (-125) +#define IPS_RC_INIT_TIMEOUT_EXPIRED (-126) +#define IPS_RC_NO_PORTS_AVAILABLE (-127) +#define IPS_RC_TRANSFER_INCOMPLETE (-128) +#define IPS_RC_SYSERR (-129) // errno has meaning, if no further errors since this error +#define IPS_RC_STARTUP_ERR (-130) + +/* Performance Counters Error Codes */ +#define IPS_RCPERF_INIT_FAILED (-200) +#define IPS_RCPERF_EVENT_SETUP_FAILED (-201) +#define IPS_RCPERF_REG_DEFAULT_SET (-202) +#define IPS_RCPERF_UNSUPPORTED_CPU (-203) +#define IPS_RCPERF_REG_GET_FAILED (-204) +#define IPS_RCPERF_SET_EVENT_STR_FAILED (-205) +#define IPS_RCPERF_INVALID_REGISTER (-206) + + char *ips_err_str(int); + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c new file mode 100644 index 0000000..0c874d8 --- /dev/null +++ b/ptl_ips/ptl.c @@ -0,0 +1,860 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file implements the PSM PTL for ips */ +#include "psm_user.h" +#include "ptl_ips.h" +#include "ipserror.h" + +int ips_ptl_recvq_isempty(const struct ptl *ptl); + +#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250 + +static +int +ips_subcontext_ignore(const struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) +{ + return IPS_RECVHDRQ_CONTINUE; +} + +static +int +ips_subcontext_process(const struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) +{ + struct ptl_shared *recvshc = rcv_ev->proto->ptl->recvshc; + if_pt (subcontext != recvshc->subcontext && + subcontext < recvshc->subcontext_cnt) { + return ips_writehdrq_append(&recvshc->writeq[subcontext], rcv_ev); + } + else { + _IPATH_VDBG("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n", + (int) subcontext, (int) recvshc->subcontext_cnt, + (int) recvshc->subcontext, (unsigned) rcv_ev->error_flags); + return IPS_RECVHDRQ_BREAK; + } +} + +static +void +recvhdrq_hw_params(const psmi_context_t *context, + struct ips_recvq_params *hdrq, + struct ips_recvq_params *egrq, + int is_shared_context, int subcontext) +{ + const struct ipath_base_info *base_info = &context->base_info; + + hdrq->elemcnt = base_info->spi_rcvhdr_cnt; + hdrq->elemsz = base_info->spi_rcvhdrent_size; + + egrq->elemsz = base_info->spi_rcv_egrbufsize; /* bytes */ + egrq->elemcnt = base_info->spi_tidegrcnt; /* words */ + + if (!is_shared_context) { + volatile uint64_t *uregbase = /* HW registers */ + (volatile uint64_t *) (uintptr_t) base_info->spi_uregbase; + hdrq->base_addr = (uint32_t *)(uintptr_t) base_info->spi_rcvhdr_base; + hdrq->head_register = (volatile __le32 *) &uregbase[ur_rcvhdrhead]; + hdrq->tail_register = (volatile __le32 *) (uintptr_t) + base_info->spi_rcvhdr_tailaddr; + egrq->base_addr = (void *) (uintptr_t) base_info->spi_rcv_egrbufs; + egrq->head_register = (volatile __le32 *) + &uregbase[ur_rcvegrindexhead]; + egrq->tail_register = (volatile __le32 *) + &uregbase[ur_rcvegrindextail]; + } + else { + /* Subcontexts mimic the HW registers but use different addresses + * to avoid cache contention. 
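+ * (Resulting layout, as computed by the arithmetic below: subcontext i
+ * gets one page of mirrored uregs at spi_subctxt_uregbase + i * pagesize,
+ * a page-aligned header-queue slice at spi_subctxt_rcvhdr_base +
+ * i * hdrsize, and an eager-buffer slice at spi_subctxt_rcvegrbuf +
+ * i * egrsize.)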
*/ + volatile uint64_t *subcontext_uregbase; + uint32_t *rcv_hdr; + void *rcv_egr; + unsigned pagesize = getpagesize(); + unsigned hdrsize, egrsize; + unsigned i = pagesize - 1; + hdrsize = (base_info->spi_rcvhdr_cnt * sizeof(uint32_t) * + base_info->spi_rcvhdrent_size + i) & ~i; + egrsize = base_info->spi_rcv_egrbuftotlen; + subcontext_uregbase = (uint64_t *) + (((uintptr_t) base_info->spi_subctxt_uregbase) + + (pagesize * subcontext)); + rcv_hdr = (uint32_t *) + (((uintptr_t) base_info->spi_subctxt_rcvhdr_base + + (hdrsize * subcontext))); + rcv_egr = (void *) + (((uintptr_t) base_info->spi_subctxt_rcvegrbuf + + (egrsize * subcontext))); + hdrq->base_addr = (uint32_t *) rcv_hdr; + hdrq->head_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvhdrhead * 8]; + hdrq->tail_register = (volatile __le32 *) (uintptr_t) + &subcontext_uregbase[ur_rcvhdrtail * 8]; + egrq->base_addr = rcv_egr; + egrq->head_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvegrindexhead * 8]; + egrq->tail_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvegrindextail * 8]; + } +} + +static psm_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context); +static psm_error_t shrecvq_fini(ptl_t *ptl); + +static +size_t +ips_ptl_sizeof(void) +{ + return sizeof(ptl_t); +} + +static +int +ips_ptl_epaddr_stats_num(void) +{ + return sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); +} + +static +int +ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags) +{ + int num_stats = sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); + int i; + + /* All stats are uint64_t */ + for (i = 0; i < num_stats; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + + desc[0] = "errchecks sent"; + desc[1] = "errchecks recv"; + desc[2] = "naks sent"; + desc[3] = "naks recv"; + desc[4] = "connect reqs sent"; + desc[5] = "disconnect reqs sent"; + desc[6] = "tid grants sent"; + desc[7] = "tid grants recv"; + desc[8] = "send rexmit"; + desc[9] = "congestion packets"; + + return num_stats; +} + +int +ips_ptl_epaddr_stats_get(psm_epaddr_t epaddr, uint64_t *stats_o) +{ + struct ptl_epaddr *ipsaddr = epaddr->ptladdr; + int i, num_stats = sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); + uint64_t *stats_i = (uint64_t *) &ipsaddr->stats; + + for (i = 0; i < num_stats; i++) + stats_o[i] = stats_i[i]; + + return num_stats; +} + +static psm_error_t +psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current) +{ + struct ptl *ptl = (struct ptl *) t->context; + const uint64_t current_count = get_cycles(); + psm_error_t err; + + err = psmi_context_check_status(ptl->context); + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + return err; +} + +static +psm_error_t +ips_ptl_init(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + psm_error_t err = PSM_OK; + uint32_t num_of_send_bufs = ep->ipath_num_sendbufs; + uint32_t num_of_send_desc = ep->ipath_num_descriptors; + uint32_t imm_size = ep->ipath_imm_size; + const psmi_context_t *context = &ep->context; + const struct ipath_user_info *user_info = &context->user_info; + const int enable_shcontexts = (user_info->spu_subcontext_cnt > 0); + const uint64_t current_count = get_cycles(); + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + psmi_assert_always(ep->ipath_num_sendbufs > 0); + + memset(ptl, 0, sizeof(struct ptl)); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache 
epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->context = context; + ptl->runtime_flags = context->runtime_flags; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll; + ctl->ep_connect = ips_ptl_connect; + ctl->ep_disconnect = ips_ptl_disconnect; + ctl->mq_send = ips_proto_mq_send; + ctl->mq_isend = ips_proto_mq_isend; + + ctl->am_short_request = ips_am_short_request; + ctl->am_short_reply = ips_am_short_reply; + + ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num; + ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init; + ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get; + + /* + * Runtime flags in 'ptl' are different from runtime flags in 'context'. + * In 'context', runtime flags reflect what the driver is capable of. + * In 'ptl', runtime flags reflect the features we can or want to use in + * the driver's supported runtime flags. + */ + + /* + * This timer is to be used to check the context's status at every + * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when + * the link transitions from the DOWN state to the UP state. We can thus + * stop aggregating link failure messages once we detect that the link is + * up. + */ + psmi_timer_entry_init(&ptl->status_timer, + psmi_context_check_status_callback, ptl); + + /* cache the context's status timeout in cycles */ + ptl->status_cyc_timeout = + ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS); + + /* + * Retransmissions and pending operations are kept in a timer structure + * (queue). The timerq is shared to various internal IPS interfaces so + * that they too may schedule events on the timer queue. The timerq is + * drained in the progress function. + */ + if ((err = psmi_timer_init(&ptl->timerq))) + goto fail; + + /* start the context's status timer */ + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + /* + * Hardware send pio used by eager and control messages. + */ + if ((err = ips_spio_init(context, ptl, &ptl->spioc))) + goto fail; + + /* + * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings + * are added/removed by the connect portion of the ips protocol and lookup + * is made by the receive queue processing component. + */ + if ((err = ips_epstate_init(&ptl->epstate, context))) + goto fail; + + /* + * Actual ips protocol handling. + */ + if ((err = ips_proto_init(context, ptl, num_of_send_bufs, num_of_send_desc, + imm_size, &ptl->timerq, &ptl->epstate, + &ptl->spioc, &ptl->proto))) + goto fail; + + /* + * Hardware receive hdr/egr queue, services incoming packets and issues + * callbacks for protocol handling in proto_recv. It uses the epstate + * interface to determine if a packet is known or unknown. + */ + if (!enable_shcontexts) { + struct ips_recvhdrq_callbacks recvq_callbacks; + struct ips_recvq_params hdrq, egrq; + recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0); + recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; + recvq_callbacks.callback_subcontext = ips_subcontext_ignore; + recvq_callbacks.callback_error = ips_proto_process_packet_error; + if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &hdrq, &egrq, &recvq_callbacks, + ptl->runtime_flags, 0, + &ptl->recvq, &ptl->recvq_state))) + goto fail; + } + + /* + * Software receive hdr/egr queue, used in shared contexts. 
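+ * (Each process polls the shared hardware queue and forwards packets that
+ * belong to sibling subcontexts through ips_writehdrq_append(); see
+ * ips_subcontext_process() above and the ptl_shared notes in ptl_ips.h.)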
+ */
+    if (enable_shcontexts && (err = shrecvq_init(ptl, context)))
+        goto fail;
+
+    /*
+     * Receive thread, always initialized but does not necessarily create a
+     * pthread.
+     */
+    if ((err = ips_ptl_rcvthread_init(ptl, &ptl->recvq)))
+        goto fail;
+fail:
+    return err;
+}
+
+static
+psm_error_t
+ips_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_in)
+{
+    const struct ipath_user_info *user_info = &ptl->context->user_info;
+    const int enable_shcontexts = (user_info->spu_subcontext_cnt > 0);
+    psm_error_t err = PSM_OK;
+
+    if ((err = ips_proto_fini(&ptl->proto, force, timeout_in)))
+        goto fail;
+
+    /* We have to cancel the thread after terminating the protocol because
+     * connect/disconnect packets use interrupts and the kernel doesn't
+     * like to have no pollers waiting */
+    if ((err = ips_ptl_rcvthread_fini(ptl)))
+        goto fail;
+
+    if ((err = ips_epstate_fini(&ptl->epstate)))
+        goto fail;
+
+    if ((err = ips_spio_fini(&ptl->spioc)))
+        goto fail;
+
+    if ((err = psmi_timer_fini(&ptl->timerq)))
+        goto fail;
+
+    if (!enable_shcontexts && (err = ips_recvhdrq_fini(&ptl->recvq)))
+        goto fail;
+
+    if (enable_shcontexts && (err = shrecvq_fini(ptl)))
+        goto fail;
+
+fail:
+    return err;
+}
+
+static
+psm_error_t
+ips_ptl_optctl(const void *core_obj, int optname,
+               void *optval, uint64_t *optlen, int get)
+{
+    psm_error_t err = PSM_OK;
+
+    switch (optname) {
+    case PSM_IB_OPT_EP_SL:
+    {
+        /* Core object is psm_epaddr */
+        psm_epaddr_t epaddr = (psm_epaddr_t) core_obj;
+        ips_epaddr_t *ipsaddr = epaddr->ptladdr;
+
+        /* If the endpoint does not use IB, ignore for set, complain for get */
+        if (epaddr->ptlctl->ep_connect != ips_ptl_connect) {
+            if (get)
+                err = psmi_handle_error(PSMI_EP_LOGEVENT,
+                                        PSM_PARAM_ERR, "Invalid EP transport");
+            goto exit_fn;
+        }
+
+        /* Sanity check option length */
+        if (*optlen < sizeof(uint8_t)) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Option value length error");
+            *optlen = sizeof(uint8_t);
+            goto exit_fn;
+        }
+
+        if (get) {
+            /* Get returns the SL for the PIO flow */
+            *((uint8_t *) optval) =
+                (uint8_t) ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].sl;
+        }
+        else {
+            uint16_t new_sl;
+
+            /* Sanity check that the SL is within range */
+            new_sl = (uint16_t) *(uint8_t *) optval;
+            if (new_sl > 15) {
+                err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                        "Invalid SL value %u. 0 <= SL <= 15.", new_sl);
+                goto exit_fn;
+            }
+
+            /* Set new SL for all flows */
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP].sl = new_sl;
+        }
+    }
+    break;
+    case PSM_IB_OPT_DF_SL:
+    {
+        /* Set default SL to be used by an endpoint for all communication */
+        /* Core object is psm_ep */
+        psm_ep_t ep = (psm_ep_t) core_obj;
+
+        /* Make sure ep is specified */
+        if (!ep) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Invalid PSM Endpoint");
+            goto exit_fn;
+        }
+
+        /* Sanity check option length */
+        if (*optlen < sizeof(uint8_t)) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Option value length error");
+            *optlen = sizeof(uint8_t);
+            goto exit_fn;
+        }
+
+        if (get) {
+            *((uint8_t *) optval) = ep->ptl_ips.ptl->proto.epinfo.ep_sl;
+        }
+        else {
+            uint16_t new_sl;
+
+            /* Sanity check that the SL is within range */
+            new_sl = (uint16_t) *(uint8_t *) optval;
+            if (new_sl > 15) {
+                err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                        "Invalid SL value %u.
0 <= SL <= 15.",new_sl); + goto exit_fn; + } + + ep->ptl_ips.ptl->proto.epinfo.ep_sl = (uint8_t) new_sl; + } + } + break; + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown PSM_IB option %u.", optname); + } + + exit_fn: + return err; +} + +static +psm_error_t +ips_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + return ips_ptl_optctl(component_obj, optname, (void*) optval, &optlen, 0); +} + +static +psm_error_t +ips_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + return ips_ptl_optctl(component_obj, optname, optval, optlen, 1); +} + +psm_error_t __recvpath +ips_ptl_poll(ptl_t *ptl, int _ignored) +{ + const uint64_t current_count = get_cycles(); + const int do_lock = PSMI_PLOCK_DISABLED && + (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD); + psm_error_t err = PSM_OK_NO_PROGRESS; + psm_error_t err2; + + if (!ips_recvhdrq_isempty(&ptl->recvq)) { + if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) + return err; + err = ips_recvhdrq_progress(&ptl->recvq); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM_OK_NO_PROGRESS) + return err2; + else + return err; + } + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + * + * It's safe to look at the timer without holding the lock because it's not + * incorrect to be wrong some of the time. + */ + if (psmi_timer_is_expired(&(ptl->timerq), current_count)) { + if (do_lock) + ips_recvhdrq_lock(&ptl->recvq); + err = psmi_timer_process_expired(&(ptl->timerq), current_count); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + } + + return err; +} + +PSMI_INLINE( +int +ips_try_lock_shared_context (struct ptl_shared *recvshc)) +{ + return pthread_spin_trylock(recvshc->context_lock); +} + +PSMI_INLINE( +void +ips_lock_shared_context (struct ptl_shared *recvshc)) +{ + pthread_spin_lock(recvshc->context_lock); +} + +PSMI_INLINE( +void +ips_unlock_shared_context (struct ptl_shared *recvshc)) +{ + pthread_spin_unlock(recvshc->context_lock); +} + +psm_error_t __recvpath +ips_ptl_shared_poll(ptl_t *ptl, int _ignored) +{ + const uint64_t current_count = get_cycles(); + psm_error_t err = PSM_OK_NO_PROGRESS; + psm_error_t err2; + struct ptl_shared *recvshc = ptl->recvshc; + psmi_assert(recvshc != NULL); + + /* The following header queue checks are speculative (but safe) + * until this process has acquired the lock. The idea is to + * minimize lock contention due to processes spinning on the + * shared context. */ + if (ips_recvhdrq_isempty(&recvshc->recvq)) { + if (!ips_recvhdrq_isempty(&ptl->recvq) && + ips_try_lock_shared_context(recvshc) == 0) { + /* check that subcontext is empty while under lock to avoid + * re-ordering of incoming packets (since packets from + * hardware context will be processed immediately). 
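+ * (The lock-free emptiness tests above only decide whether to try the
+ * lock; the deciding re-check of recvshc->recvq happens here, under
+ * context_lock, so a packet forwarded by another process cannot be
+ * overtaken by one pulled directly from the hardware queue.)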
*/ + if_pt (ips_recvhdrq_isempty(&recvshc->recvq)) { + err = ips_recvhdrq_progress(&ptl->recvq); + } + ips_unlock_shared_context(recvshc); + } + } + + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + + if (!ips_recvhdrq_isempty(&recvshc->recvq)) { + err2 = ips_recvhdrq_progress(&recvshc->recvq); + if (err2 != PSM_OK_NO_PROGRESS) { + err = err2; + } + } + + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + */ + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM_OK_NO_PROGRESS) + err = err2; + + return err; +} + +int __recvpath +ips_ptl_recvq_isempty(const ptl_t *ptl) +{ + struct ptl_shared *recvshc = ptl->recvshc; + + if (recvshc != NULL && !ips_recvhdrq_isempty(&recvshc->recvq)) + return 0; + return ips_recvhdrq_isempty(&ptl->recvq); +} + +/* + * Legacy ips_get_stat -- do nothing. + */ +int ips_get_stat(psm_epaddr_t epaddr, ips_sess_stat * stats) +{ + memset(stats, 0, sizeof (ips_sess_stat)); + return 0; +} + +static +psm_error_t +shrecvq_init(ptl_t *ptl, const psmi_context_t *context) +{ + const struct ipath_base_info *base_info = &context->base_info; + const struct ipath_user_info *user_info = &context->user_info; + struct ips_recvhdrq_callbacks recvq_callbacks; + struct ips_recvq_params hdrq, egrq; + psm_error_t err = PSM_OK; + struct ptl_shared *recvshc; + int i; + + psmi_assert_always(user_info->spu_subcontext_cnt > 0); + + recvshc = (struct ptl_shared *) + psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ptl_shared)); + if (recvshc == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + + ptl->recvshc = recvshc; + recvshc->ptl = ptl; + + /* Initialize recvshc fields */ + recvshc->subcontext = base_info->spi_subcontext; + recvshc->subcontext_cnt = user_info->spu_subcontext_cnt; + psmi_assert_always(recvshc->subcontext_cnt <= INFINIPATH_MAX_SUBCONTEXT); + psmi_assert_always(recvshc->subcontext < recvshc->subcontext_cnt); + + if ((err = ips_subcontext_ureg_get(ptl, context, recvshc->subcontext_ureg, + recvshc->subcontext_cnt))) + goto fail; + if ((err = ips_subcontext_ureg_initialize( + ptl, recvshc->subcontext, recvshc->subcontext_ureg[recvshc->subcontext]))) + goto fail; + recvshc->context_lock = &recvshc->subcontext_ureg[0]->context_lock; + + /* Initialize (shared) hardware context recvq (ptl->recvq) */ + /* NOTE: uses recvq in ptl structure for shared h/w context */ + recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0); + recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; + recvq_callbacks.callback_subcontext = ips_subcontext_process; + recvq_callbacks.callback_error = ips_proto_process_packet_error; + if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &hdrq, &egrq, &recvq_callbacks, + ptl->runtime_flags, recvshc->subcontext, + &ptl->recvq, + &recvshc->subcontext_ureg[0]->recvq_state))) { + goto fail; + } + + /* Initialize software subcontext (recvshc->recvq). Subcontexts do */ + /* not require the rcvhdr copy feature. 
*/
+    recvhdrq_hw_params(context, &hdrq, &egrq, 1, recvshc->subcontext);
+    recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+    if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+                                 &hdrq, &egrq, &recvq_callbacks,
+                                 ptl->runtime_flags & ~IPATH_RUNTIME_RCVHDR_COPY,
+                                 recvshc->subcontext,
+                                 &recvshc->recvq,
+                                 &recvshc->recvq_state))) {
+        goto fail;
+    }
+
+    /* Initialize each recvshc->writeq for shared contexts */
+    for (i = 0; i < recvshc->subcontext_cnt; i++) {
+        recvhdrq_hw_params(context, &hdrq, &egrq, 1, i);
+        if ((err = ips_writehdrq_init(context, &hdrq, &egrq,
+                                      &recvshc->writeq[i],
+                                      &recvshc->subcontext_ureg[i]->writeq_state,
+                                      ptl->runtime_flags & ~IPATH_RUNTIME_RCVHDR_COPY))) {
+            goto fail;
+        }
+    }
+
+    if (err == PSM_OK)
+        _IPATH_DBG("Context sharing in use: lid %d, context %d, sub-context %d\n",
+                   (int) psm_epid_nid(ptl->epid), base_info->spi_context,
+                   recvshc->subcontext);
+fail:
+    return err;
+}
+
+static
+psm_error_t
+shrecvq_fini(ptl_t *ptl)
+{
+    psm_error_t err = PSM_OK;
+    int i;
+
+    /* disable my write header queue before deallocation */
+    i = ptl->recvshc->subcontext;
+    ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0;
+
+    if ((err = ips_recvhdrq_fini(&ptl->recvq)))
+        goto fail;
+
+    if ((err = ips_recvhdrq_fini(&ptl->recvshc->recvq)))
+        goto fail;
+
+    for (i = 0; i < ptl->recvshc->subcontext_cnt; i++) {
+        if ((err = ips_writehdrq_fini(&ptl->recvshc->writeq[i]))) {
+            goto fail;
+        }
+    }
+
+    psmi_free(ptl->recvshc);
+
+fail:
+    return err;
+}
+
+psm_error_t
+ips_ptl_connect(ptl_t *ptl, int numep, const psm_epid_t *array_of_epid,
+                const int *array_of_epid_mask, psm_error_t *array_of_errors,
+                psm_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+    psm_error_t err;
+    psm_ep_t ep;
+    psm_epid_t *epid_array = NULL;
+    psm_error_t *error_array = NULL;
+    psm_epaddr_t *epaddr_array = NULL;
+    int *mask_array = NULL;
+    int i, count;
+
+    PSMI_PLOCK_ASSERT();
+    err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+                            array_of_epid_mask, array_of_errors,
+                            array_of_epaddr, timeout_in);
+    if (err) return err;
+
+    psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+    if (ptl->ep->mctxt_next == ptl->ep) return err;
+
+    /* Make the additional multi-context connections. */
+    epid_array = (psm_epid_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_epid_t)*numep);
+    mask_array = (int *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(int)*numep);
+    error_array = (psm_error_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_error_t)*numep);
+    epaddr_array = (psm_epaddr_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_epaddr_t)*numep);
+    if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+
+    count = 0;
+    ep = ptl->ep->mctxt_next;
+    while (ep != ep->mctxt_master) {
+
+        /* Set up the mask array and epid array. */
+        for (i = 0; i < numep; i++) {
+            if (array_of_epid_mask[i]
+                && array_of_errors[i] == PSM_OK
+                && count < array_of_epaddr[i]->mctxt_epcount) {
+                if (ep->gid_hi != array_of_epaddr[i]->mctxt_gidhi[count]) {
+                    mask_array[i] = 0;
+                    _IPATH_INFO("Subnet ID mismatch, ignore...\n");
+                } else {
+                    mask_array[i] = 1;
+                    epid_array[i] = array_of_epaddr[i]->mctxt_epid[count];
+                }
+            } else {
+                mask_array[i] = 0;
+            }
+        }
+
+        /* Make the real protocol connections. */
+        err = ips_proto_connect(&ep->ptl_ips.ptl->proto, numep, epid_array,
+                                mask_array, error_array,
+                                epaddr_array, timeout_in);
+        if (err) goto fail;
+
+        /* Make the epaddr linklist for this peer.
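+         * (Illustrative note: PSM_MCTXT_APPEND is assumed to link the
+         * per-rail epaddrs into a ring headed by the master epaddr;
+         * mctxt_current then picks the rail on which new traffic starts,
+         * and the randomization below spreads that starting rail across
+         * peers.)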
*/
+        for (i = 0; i < numep; i++) {
+            if (!mask_array[i]) continue;
+
+            /* In a rare case, the peer may exit psm_ep_connect() and send a
+             * message before we link this epaddr below.  The message is then
+             * received by this epaddr while its mctxt_master is still itself
+             * (the PSM_MCTXT_APPEND macro below links it and changes the
+             * master), so this epaddr's mctxt_recv_seqnum is incremented
+             * instead of the master's.  When that happens we must fold this
+             * mctxt_recv_seqnum into the master's; otherwise the message
+             * sequence number does not match the master's mctxt_recv_seqnum
+             * and the code hangs.  This only happens on the last rail of a
+             * multi-rail setup.
+             */
+            if (epaddr_array[i]->mctxt_recv_seqnum) {
+                array_of_epaddr[i]->mctxt_recv_seqnum +=
+                    epaddr_array[i]->mctxt_recv_seqnum;
+                epaddr_array[i]->mctxt_recv_seqnum = 0;
+            }
+
+            PSM_MCTXT_APPEND(array_of_epaddr[i], epaddr_array[i]);
+
+            /* Randomize the rail on which traffic starts */
+            if ((random() % (count + 2)) == 0) {
+                array_of_epaddr[i]->mctxt_current = epaddr_array[i];
+            }
+
+            /* Bump the number of slave connections made so far */
+            array_of_epaddr[i]->mctxt_nsconn++;
+        }
+
+        count++;
+        ep = ep->mctxt_next;
+    }
+
+fail:
+    if (epid_array) psmi_free(epid_array);
+    if (mask_array) psmi_free(mask_array);
+    if (error_array) psmi_free(error_array);
+    if (epaddr_array) psmi_free(epaddr_array);
+
+    return err;
+}
+
+psm_error_t
+ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+                   const psm_epaddr_t array_of_epaddr[],
+                   const int array_of_epaddr_mask[],
+                   psm_error_t array_of_errors[], uint64_t timeout_in)
+{
+    psm_error_t err;
+
+    fprintf(stderr, "Aiee! ips_proto_disconnect() called.\n");
+    PSMI_PLOCK_ASSERT();
+    err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr,
+                               array_of_epaddr_mask, array_of_errors,
+                               timeout_in);
+    return err;
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_ips = {
+    ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, ips_ptl_getopt
+};
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
new file mode 100644
index 0000000..08d4c53
--- /dev/null
+++ b/ptl_ips/ptl_fwd.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PTL_FWD_IPS_H +#define _PTL_FWD_IPS_H +#include "ptl.h" + +typedef struct ptl_epaddr ips_epaddr_t; + +/* Symbol in ips ptl */ +struct ptl_ctl_init psmi_ptl_ips; +#endif /* _PTL_FWD_IPS_H */ diff --git a/ptl_ips/ptl_ips.h b/ptl_ips/ptl_ips.h new file mode 100644 index 0000000..5643064 --- /dev/null +++ b/ptl_ips/ptl_ips.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PTL_H +#define _IPS_PTL_H + +#include "psm_user.h" +#include "psm_mq_internal.h" + +#include "ips_proto_params.h" +#include "ips_proto.h" +#include "ips_spio.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" +#include "ips_epstate.h" +#include "ips_stats.h" +#include "ips_subcontext.h" + +struct ptl_shared; + +/* + * PTL at the ips level (for InfiniPath) + * + * This PTL structure glues all the ips components together. + * + * * ips timer, shared by various components, allows each component to + * schedule time-based expiration callbacks on the timerq. 
+ *   * HW receive queue
+ *   * send control block to handle eager messages
+ *   * instantiation of the ips protocol
+ *   * endpoint state, to map endpoint indexes into structures
+ *
+ * Receive-side
+ *
+ *           ----[ proto ]
+ *          /     ^     ^
+ *         |      |     |
+ *         |   packet packet
+ *         |   known  unknown
+ *  add_endpt \      /
+ *         |   |    |
+ *         `----> [epstate]
+ *                  ^
+ *                  |
+ *            lookup_endpt
+ *                  |
+ *               [recvq]
+ *                  |
+ *                poll
+ *
+ */
+/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */
+/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */
+struct ptl {
+    psm_ep_t ep;                    /* back ptr */
+    psm_epid_t epid;                /* cached from ep */
+    psm_epaddr_t epaddr;            /* cached from ep */
+    ips_epaddr_t *ipsaddr;          /* cached from epaddr */
+    ptl_ctl_t *ctl;                 /* cached from init */
+    const psmi_context_t *context;  /* cached from init */
+
+    struct ips_spio spioc;          /* PIO send control */
+    struct ips_proto proto;         /* protocol instance: timerq, epstate, spio */
+
+    /* Receive header queue and receive queue processing */
+    uint32_t runtime_flags;
+    struct psmi_timer_ctrl timerq;
+    struct ips_epstate epstate;     /* map incoming packets */
+    struct ips_recvhdrq_state recvq_state;
+    struct ips_recvhdrq recvq;      /* HW recvq: epstate, proto */
+
+    /* timer to check the context's status */
+    struct psmi_timer status_timer;
+
+    /* context's status check timeout in cycles -- cached */
+    uint64_t status_cyc_timeout;
+
+    /* Shared-context state */
+    struct ptl_shared *recvshc;
+
+    /* Rcv thread context */
+    struct ptl_rcvthread *rcvthread;
+};
+
+/*
+ * State for shared contexts.
+ *
+ * In shared mode, the hardware queue is serviced by more than one process.
+ * Each process also mirrors the hardware queue in software (represented by
+ * an ips_recvhdrq).  For packets we service in the hardware queue that are
+ * not destined for us, we write them into the other processes' receive
+ * queues (represented by an ips_writehdrq).
+ */
+struct ptl_shared {
+    ptl_t *ptl;                 /* backptr to main ptl */
+    uint32_t subcontext;
+    uint32_t subcontext_cnt;
+
+    pthread_spinlock_t *context_lock;
+    struct ips_subcontext_ureg *subcontext_ureg[INFINIPATH_MAX_SUBCONTEXT];
+    struct ips_recvhdrq recvq;  /* subcontext receive queue */
+    struct ips_recvhdrq_state recvq_state; /* subcontext receive queue state */
+    struct ips_writehdrq writeq[INFINIPATH_MAX_SUBCONTEXT]; /* peer subcontexts */
+};
+
+/*
+ * Connect/disconnect are wrappers around psm proto's connect/disconnect,
+ * mostly to abstract away PSM-specific stuff from ips internal structures
+ */
+psm_error_t ips_ptl_connect(ptl_t *ptl, int numep,
+                            const psm_epid_t *array_of_epid,
+                            const int *array_of_epid_mask,
+                            psm_error_t *array_of_errors,
+                            psm_epaddr_t *array_of_epaddr,
+                            uint64_t timeout_in);
+
+psm_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+                               const psm_epaddr_t array_of_epaddr[],
+                               const int array_of_epaddr_mask[],
+                               psm_error_t array_of_errors[],
+                               uint64_t timeout_in);
+
+/*
+ * Generic Poll function for ips-level ptl
+ */
+psm_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+psm_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Support for receive thread
+ */
+psm_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq);
+psm_error_t ips_ptl_rcvthread_fini(ptl_t *ptl);
+
+#endif /* _IPS_PTL_H */
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
new file mode 100644
index 0000000..5c30c7a
--- /dev/null
+++ b/ptl_ips/ptl_rcvthread.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation.
All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/poll.h>
+
+#include "ptl_ips.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+
+/* All frequencies are in polls per second */
+#define RCVTHREAD_TO_MIN_FREQ   10      /* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ   100     /* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT      1
+
+struct ptl_rcvthread;
+
+static void *ips_ptl_pollintr(void *recvthreadc);
+static psm_error_t rcvthread_initstats(ptl_t *ptl);
+static psm_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc);
+
+struct ptl_rcvthread {
+    const psmi_context_t *context;
+    const ptl_t *ptl;
+    struct ips_recvhdrq *recvq;
+
+    pthread_t hdrq_threadid;
+    uint64_t t_start_cyc;
+    int pipefd[2];
+
+    /* stats and some for scheduling */
+    uint64_t pollcnt;
+    uint64_t pollcnt_to;
+    uint64_t pollcyc;
+    uint64_t pollok;
+
+    /* For scheduling interrupt thread */
+    int timeout_period_min;
+    int timeout_period_max;
+    int timeout_shift;
+    uint64_t pollok_last;
+    uint64_t pollcnt_last;
+    uint32_t last_timeout;
+};
+
+/*
+ * The receive thread knows about the ptl interface, so it can muck with it
+ * directly.
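+ *
+ * (Scheduling summary: the timeout fields above implement an adaptive poll
+ * period; rcvthread_next_timeout() below shrinks last_timeout by
+ * timeout_shift bits after a productive interval and grows it after an
+ * idle one, roughly bounded by timeout_period_min and timeout_period_max.)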
+ */ +psm_error_t +ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq) +{ + psm_error_t err = PSM_OK; + struct ptl_rcvthread *rcvc; + + ptl->rcvthread = + psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread)); + if (ptl->rcvthread == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + rcvc = ptl->rcvthread; + + rcvc->recvq = recvq; + rcvc->ptl = ptl; + rcvc->context = ptl->context; + rcvc->t_start_cyc = get_cycles(); + + if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + + if ((err = rcvthread_initsched(rcvc))) + goto fail; + + /* Create a pipe so we can synchronously terminate the thread */ + if (pipe(rcvc->pipefd) != 0) { + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Cannot create a pipe for receive thread: %s\n", + strerror(errno)); + goto fail; + } + + if (pthread_create(&rcvc->hdrq_threadid, NULL, + ips_ptl_pollintr, ptl->rcvthread)) + { + close(rcvc->pipefd[0]); + close(rcvc->pipefd[1]); + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Cannot start receive thread: %s\n", strerror(errno)); + goto fail; + } + + } + + if ((err = rcvthread_initstats(ptl))) + goto fail; + +fail: + return err; +} + +psm_error_t +ips_ptl_rcvthread_fini(ptl_t *ptl) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) ptl->rcvthread; + uint64_t t_now; + double t_cancel_us; + psm_error_t err = PSM_OK; + + PSMI_PLOCK_ASSERT(); + + if (ptl->rcvthread == NULL) + return err; + + if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + t_now = get_cycles(); + + /* Disable interrupts then kill the receive thread */ + if (psmi_context_interrupt_isenabled((psmi_context_t *) ptl->context)) + if ((err = psmi_context_interrupt_set((psmi_context_t *) ptl->context, 0))) + goto fail; + + /* Close the pipe so we can have the thread synchronously exit. + On Linux just closing the pipe does not wake up the receive + thread. 
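The shutdown handshake in the code above and below is worth seeing in isolation: the fini path disables interrupts, writes a token into the pipe, closes the write end, and only then joins the thread, so the thread exits at a known point in its poll loop rather than being cancelled mid-update. The following is a minimal standalone sketch of that idiom; all names are illustrative and none of this is code from this source tree:

    /* Pipe-based synchronous thread shutdown, reduced to its essentials. */
    #include <poll.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int pipefd[2];

    static void *rcv_worker(void *arg)
    {
        struct pollfd pfd = { .fd = pipefd[0], .events = POLLIN, .revents = 0 };
        for (;;) {
            int ret = poll(&pfd, 1, 55 /* ms; adapted at runtime in the real thread */);
            if (ret > 0 && pfd.revents) {   /* token or POLLHUP: exit request */
                close(pipefd[0]);
                return NULL;
            }
            /* ret == 0: timed out; the real thread would service the recvq here */
        }
    }

    int main(void)
    {
        pthread_t tid;
        uint64_t token = 1;

        if (pipe(pipefd) != 0 || pthread_create(&tid, NULL, rcv_worker, NULL) != 0)
            return 1;
        /* Write a token first -- as the comment above notes, closing the pipe
         * alone may not wake a thread blocked in poll(). */
        if (write(pipefd[1], &token, sizeof token) == -1)
            perror("write");
        close(pipefd[1]);
        pthread_join(tid, NULL);
        puts("receive thread exited synchronously");
        return 0;
    }

This is presumably why the code prefers a pipe over pthread_cancel(): the worker only ever leaves its loop between poll iterations, so no lock or queue is left half-updated.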
+ */ + if (write(rcvc->pipefd[1], (const void*) &t_now, + sizeof(uint64_t)) == -1 || + close(rcvc->pipefd[1]) == -1) { + _IPATH_VDBG("unable to close pipe to receive thread cleanly\n"); + } + pthread_join(rcvc->hdrq_threadid, NULL); + t_cancel_us = (double) cycles_to_nanosecs(get_cycles() - t_now) / 1e3; + + _IPATH_PRDBG("rcvthread poll success %lld/%lld times, " + "thread cancelled in %.3f us\n", (long long) rcvc->pollok, + (long long) rcvc->pollcnt, t_cancel_us); + + } + + psmi_free(ptl->rcvthread); + +fail: + return err; +} + +psm_error_t +rcvthread_initsched(struct ptl_rcvthread *rcvc) +{ + union psmi_envvar_val env_to; + char buf[192]; + char *rcv_freq = buf; + int no_timeout = 0; + int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT }; + snprintf(buf, sizeof buf - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + buf[sizeof buf - 1] = '\0'; + + if (!psmi_getenv("PSM_RCVTHREAD_FREQ", + "Thread timeouts (per sec) ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) rcv_freq, &env_to)) + { + /* not using default values */ + int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals); + int invalid = 0; + + if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) || + (nparsed > 1 && tvals[1] == 0)) + { + no_timeout = 1; + } + else { + if (nparsed > 0 && tvals[0] > 1000) + invalid = 1; + if (nparsed > 1 && (tvals[1] > 1000 || tvals[1] < tvals[0])) + invalid = 1; + if (nparsed > 2 && tvals[2] > 10) + invalid = 1; + } + + if (invalid) { + _IPATH_INFO("Overriding invalid request for RcvThread frequency" + " settings of %s to be <%d:%d:%d>\n", + env_to.e_str, RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + tvals[0] = RCVTHREAD_TO_MIN_FREQ; + tvals[1] = RCVTHREAD_TO_MAX_FREQ; + tvals[2] = RCVTHREAD_TO_SHIFT; + } + } + + if (no_timeout) { + rcvc->last_timeout = -1; + _IPATH_PRDBG("PSM_RCVTHREAD_FREQ set to only interrupt " + "(no timeouts)\n"); + } + else { + /* Convert freq to period in microseconds (for poll()) */ + rcvc->timeout_period_max = 1000 / tvals[0]; + rcvc->timeout_period_min = 1000 / tvals[1]; + rcvc->timeout_shift = tvals[2]; + /* Start in the middle of min and max */ + rcvc->last_timeout = (rcvc->timeout_period_min + + rcvc->timeout_period_max) / 2; + _IPATH_PRDBG("PSM_RCVTHREAD_FREQ converted to period " + "min=%dms,max=%dms,shift=%d\n", + rcvc->timeout_period_min, rcvc->timeout_period_max, + rcvc->timeout_shift); + } + return PSM_OK; +} + +static +int +rcvthread_next_timeout(struct ptl_rcvthread *rcvc) +{ + uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last; + + if (pollok_diff > 0) { + if (rcvc->last_timeout > rcvc->timeout_period_min) + /* By default, be less aggressive, but there's a more aggressive + * alternative if need be */ +#if 1 + rcvc->last_timeout >>= rcvc->timeout_shift; +#else + rcvc->last_timeout = rcvc->timeout_period_min; +#endif + } + else { /* we had less progress */ + if (rcvc->last_timeout < rcvc->timeout_period_max) + rcvc->last_timeout <<= rcvc->timeout_shift; + } + + rcvc->pollok_last = rcvc->pollok; + rcvc->pollcnt_last = rcvc->pollcnt; + return (int) rcvc->last_timeout; +} + +extern int ips_in_rcvthread; + +/* + * Receiver thread support. + * + * By default, polling in the driver asks the chip to generate an interrupt on + * every packet. 
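The rcvthread_next_timeout() logic above is a simple multiplicative backoff: while timed-out polls keep finding work, the period is shifted down toward timeout_period_min; while they find nothing, it is shifted up toward timeout_period_max, with the bounds and shift taken from the PSM_RCVTHREAD_FREQ min:max:shift tuple. A hedged standalone sketch of just that policy (hypothetical names, not the library's API):

    #include <stdint.h>
    #include <stdio.h>

    struct rcv_sched {
        int min_ms, max_ms, shift;   /* derived from PSM_RCVTHREAD_FREQ */
        uint32_t timeout_ms;         /* current poll() timeout */
        uint64_t ok, ok_last;        /* successful-progress counters */
    };

    static int next_timeout(struct rcv_sched *s)
    {
        if (s->ok > s->ok_last) {            /* progress since last check: poll sooner */
            if (s->timeout_ms > (uint32_t) s->min_ms)
                s->timeout_ms >>= s->shift;
        } else {                             /* idle: back off */
            if (s->timeout_ms < (uint32_t) s->max_ms)
                s->timeout_ms <<= s->shift;
        }
        s->ok_last = s->ok;
        return (int) s->timeout_ms;
    }

    int main(void)
    {
        /* Defaults of 10..100 polls/sec give periods of 100..10 ms; start mid-way. */
        struct rcv_sched s = { 10, 100, 1, (10 + 100) / 2, 0, 0 };
        for (int i = 0; i < 6; i++) {
            if (i < 2)
                s.ok++;                      /* pretend the first polls made progress */
            printf("poll timeout -> %d ms\n", next_timeout(&s));
        }
        return 0;
    }

Note that, as in the original, the bound is checked before the shift is applied, so a single step can land slightly outside the [min,max] window; with the default shift of 1 this is harmless.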
When the driver supports POLLURG we can switch the poll mode + * to one that requests interrupts only for packets that contain an urgent bit + * (and optionally enable interrupts for hdrq overflow events). When poll + * returns an event, we *try* to make progress on the receive queue but simply + * go back to sleep if we notice that the main thread is already making + * progress. + */ +static +void * +ips_ptl_pollintr(void *rcvthreadc) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) rcvthreadc; + struct ips_recvhdrq *recvq = rcvc->recvq; + psmi_context_t *context = (psmi_context_t *) rcvc->context; + int fd_dev = context->fd; + int fd_pipe = rcvc->pipefd[0]; + psm_ep_t ep = context->ep; + struct pollfd pfd[2]; + int ret; + int next_timeout = rcvc->last_timeout; + uint64_t t_cyc; + psm_error_t err; + + /* No reason to have many of these, keep this as a backup in case the + * recvhdrq init function is misused */ + psmi_assert_always((recvq->runtime_flags & PSMI_RUNTIME_RCVTHREAD)); + + /* Switch driver to a mode where it can interrupt on urgent packets */ + if (psmi_context_interrupt_set((psmi_context_t *) + rcvc->context, 1) == PSM_EP_NO_RESOURCES) { + _IPATH_PRDBG("ipath_poll_type feature not present in driver, turning " + "off internal progress thread\n"); + return NULL; + } + + _IPATH_PRDBG("Enabled communication thread on URG packets\n"); + + while (1) { + pfd[0].fd = fd_dev; + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = fd_pipe; + pfd[1].events = POLLIN; + pfd[1].revents = 0; + + ret = poll(pfd, 2, next_timeout); + t_cyc = get_cycles(); + + if_pf (ret < 0) { + if (errno == EINTR) + _IPATH_DBG("got signal, keep polling\n"); + else + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Receive thread poll() error: %s", strerror(errno)); + } + else if (pfd[1].revents) { + /* Any type of event on this fd means exit, should be POLLHUP */ + _IPATH_DBG("close thread: revents=0x%x\n", pfd[1].revents); + close(fd_pipe); + break; + } + else { + rcvc->pollcnt++; + + if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { + if (PSMI_PLOCK_DISABLED) { + /* We do this check without acquiring the lock, no sense to + * adding the overhead and it doesn't matter if we're + * wrong. */ + if (ips_recvhdrq_isempty(recvq)) + continue; + if (!ips_recvhdrq_trylock(recvq)) + continue; + err = ips_recvhdrq_progress(recvq); + if (err == PSM_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + ips_recvhdrq_unlock(recvq); + } + else if (!PSMI_PLOCK_TRY()) { + /* If we time out, we service shm and ipath. If not, we + * assume to have received an ipath interrupt and service + * only ipath. + */ + err = psmi_poll_internal(ep, + ret == 0 ? 
PSMI_TRUE : PSMI_FALSE); + + if (err == PSM_OK) { + rcvc->pollok++; + /* + if (rcvc->pollok % 1000 == 0 && rcvc->pollok >= 1000) + _IPATH_INFO("pollok = %lld\n", (unsigned long long)rcvc->pollok); + */ + } + else + rcvc->pollcyc += get_cycles() - t_cyc; + PSMI_PUNLOCK(); + } + } + + if (ret == 0) { /* change timeout only on timed out poll */ + rcvc->pollcnt_to++; + next_timeout = rcvthread_next_timeout(rcvc); + } + } + } + + return NULL; +} + +static uint64_t +rcvthread_stats_pollok(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) context; + double ratio = 0.0; + uint64_t ratio_u; + if (rcvc->pollcnt > 0) + ratio = (double) rcvc->pollok * 100.0 / rcvc->pollcnt; + memcpy(&ratio_u, &ratio, sizeof(uint64_t)); + return ratio_u; +} + +static uint64_t +rcvthread_stats_pollcyc(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) context; + /* log in milliseconds */ + return (uint64_t) ((double) cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6); +} + +static psm_error_t +rcvthread_initstats(ptl_t *ptl) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) ptl->rcvthread; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("intrthread schedule count", + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt), + PSMI_STATS_DECL("intrthread schedule success (%)", + MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_TYPE_DOUBLE, + rcvthread_stats_pollok, NULL), + PSMI_STATS_DECL("intrthread timeout count", + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt_to), + PSMI_STATS_DECL("intrthread wasted time (ms)", + MPSPAWN_STATS_REDUCTION_ALL, + rcvthread_stats_pollcyc, NULL) + }; + + /* If we don't want a thread, make sure we still initialize the counters + * but set them to NaN instead */ + if (!(ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD)) { + int i; + static uint64_t ctr_nan = MPSPAWN_NAN; + for (i = 0; i < (int) PSMI_STATS_HOWMANY(entries); i++) { + entries[i].getfn = NULL; + entries[i].u.val = &ctr_nan; + } + } + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_RCVTHREAD, + entries, + PSMI_STATS_HOWMANY(entries), + rcvc); +} diff --git a/ptl_self/Makefile b/ptl_self/Makefile new file mode 100644 index 0000000..3b41f54 --- /dev/null +++ b/ptl_self/Makefile @@ -0,0 +1,45 @@ +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +include $(top_srcdir)/buildflags.mak +INCLUDES += -I$(top_srcdir) + +${TARGLIB}-objs := ptl.o + +all: ${${TARGLIB}-objs} + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +clean: + rm -f *.o + diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c new file mode 100644 index 0000000..bac2d58 --- /dev/null +++ b/ptl_self/ptl.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file implements the PSM PTL for self (loopback) + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct ptl { + psm_ep_t ep; + psm_epid_t epid; + psm_epaddr_t epaddr; + ptl_ctl_t *ctl; +}; + +static +psm_error_t __fastpath +ptl_handle_rtsmatch(psm_mq_req_t recv_req, int was_posted) +{ + psm_mq_req_t send_req = (psm_mq_req_t) recv_req->ptl_req_ptr; + + if (recv_req->recv_msglen > 0) { + PSM_VALGRIND_DEFINE_MQ_RECV(recv_req->buf, recv_req->buf_len, + recv_req->recv_msglen); + VALGRIND_MAKE_MEM_DEFINED(send_req->buf, send_req->buf_len); + VALGRIND_MAKE_MEM_DEFINED(send_req->buf, recv_req->recv_msglen); + + psmi_mq_mtucpy(recv_req->buf, send_req->buf, recv_req->recv_msglen); + } + + psmi_mq_handle_rts_complete(recv_req); + + /* If the send is already marked complete, that's because it was internally + * buffered. 
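A send can be "already complete" here only because self_mq_send_testwait() (below) bought the blocked sender out of the deadlock: it copied the payload into a system buffer and marked the request complete, while the request object itself stays alive until this match fires. The trick in miniature, with hypothetical names and plain malloc/free standing in for the psmi_* system-buffer calls of the real code:

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    struct self_send {
        void  *buf;        /* user buffer, later the system copy */
        size_t len;
        int    complete;   /* what a blocking send waits on */
        int    buffered;   /* buf now points at a copy we own */
    };

    /* Receive not posted yet and the sender is blocking: buffer and complete. */
    static int buffer_send(struct self_send *s)
    {
        if (s->buf != NULL && s->len > 0) {
            void *copy = malloc(s->len);
            if (copy == NULL)
                return -1;                 /* PSM_NO_MEMORY upstream */
            memcpy(copy, s->buf, s->len);
            s->buf = copy;
            s->buffered = 1;
        }
        s->complete = 1;                   /* sender may return; request stays live */
        return 0;
    }

    /* The matching receive finally arrives: drain and free the copy. */
    static void match_recv(struct self_send *s, void *dst)
    {
        if (s->len > 0)
            memcpy(dst, s->buf, s->len);
        if (s->buffered)
            free(s->buf);                  /* psmi_mq_sysbuf_free() upstream */
    }

    int main(void)
    {
        char payload[] = "hello", out[8] = { 0 };
        struct self_send s = { payload, sizeof payload, 0, 0 };

        assert(buffer_send(&s) == 0 && s.complete);
        payload[0] = 'X';                  /* sender reuses its buffer... */
        match_recv(&s, out);
        assert(strcmp(out, "hello") == 0); /* ...receiver still sees the copy */
        return 0;
    }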
*/
+    if (send_req->state == MQ_STATE_COMPLETE) {
+        psmi_mq_stats_rts_account(send_req);
+        if (send_req->buf != NULL && send_req->send_msglen > 0)
+            psmi_mq_sysbuf_free(send_req->mq, send_req->buf);
+        psmi_mq_req_free(send_req); /* req was left "live" even though the
+                                     * sender was told that the send was done */
+    }
+    else
+        psmi_mq_handle_rts_complete(send_req);
+
+    _IPATH_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+                recv_req->buf, send_req, recv_req);
+    return PSM_OK;
+}
+
+static
+psm_error_t
+self_mq_send_testwait(psm_mq_req_t *ireq, int istest, psm_mq_status_t *status)
+{
+    uint8_t *ubuf;
+    psm_mq_req_t req = *ireq;
+
+    PSMI_PLOCK_ASSERT();
+
+    /* We're waiting on a send request, and the matching receive has not been
+     * posted yet. This is a deadlock condition in MPI but we accommodate it
+     * here in the "self ptl" by using system-allocated memory.
+     */
+    req->testwait_callback = NULL; /* no more calls here */
+
+    ubuf = req->buf;
+    if (ubuf != NULL && req->send_msglen > 0) {
+        req->buf = psmi_mq_sysbuf_alloc(req->mq, req->send_msglen);
+        if (req->buf == NULL)
+            return PSM_NO_MEMORY;
+        psmi_mq_mtucpy(req->buf, ubuf, req->send_msglen);
+    }
+
+    /* Mark it complete but don't free the req, it's freed when the receiver
+     * does the match */
+    req->state = MQ_STATE_COMPLETE;
+    *ireq = PSM_MQ_REQINVALID;
+
+    if (status != NULL)
+        mq_status_copy(req, status);
+    return PSM_OK;
+}
+
+/* Self is different. We do everything as rendezvous. */
+static
+psm_error_t __fastpath
+self_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+              uint64_t tag, const void *ubuf, uint32_t len, void *context,
+              psm_mq_req_t *req_o)
+{
+    psm_mq_req_t send_req;
+    psm_mq_req_t recv_req;
+    int rc;
+
+    send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+    if_pf (send_req == NULL)
+        return PSM_NO_MEMORY;
+
+    rc = psmi_mq_handle_rts(mq, tag, (uintptr_t) ubuf, len, epaddr,
+                            ptl_handle_rtsmatch, &recv_req);
+    send_req->buf = (void *) ubuf;
+    send_req->send_msglen = len;
+    send_req->context = context;
+    recv_req->ptl_req_ptr = (void *) send_req;
+    if (rc == MQ_RET_MATCH_OK)
+        ptl_handle_rtsmatch(recv_req, 1);
+    else
+        send_req->testwait_callback = self_mq_send_testwait;
+
+    _IPATH_VDBG("[self][b=%p][m=%d][t=%"PRIx64"][match=%s][req=%p]\n",
+                ubuf, len, tag, rc == MQ_RET_MATCH_OK ?
"YES" : "NO", send_req); + *req_o = send_req; + return PSM_OK; +} + +static __fastpath +psm_error_t +self_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len) +{ + psm_error_t err; + psm_mq_req_t req; + err = self_mq_isend(mq,epaddr,flags,tag,ubuf,len,NULL,&req); + psmi_mq_wait_internal(&req); + return err; +} + +static +psm_error_t +self_connect(ptl_t *ptl, + int numep, + const psm_epid_t array_of_epid[], + const int array_of_epid_mask[], + psm_error_t array_of_errors[], + psm_epaddr_t array_of_epaddr[], + uint64_t timeout_ns) +{ + psmi_assert_always(ptl->epaddr != NULL); + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + int i; + + PSMI_PLOCK_ASSERT(); + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + if (array_of_epid[i] == ptl->epid) { + epaddr = psmi_epid_lookup(ptl->ep, ptl->epid); + psmi_assert_always(epaddr == NULL); + array_of_epaddr[i] = ptl->epaddr; + array_of_epaddr[i]->ptl = ptl; + array_of_epaddr[i]->ptlctl = ptl->ctl; + array_of_epaddr[i]->epid = ptl->epid; + array_of_epaddr[i]->ep = ptl->ep; + if (psmi_epid_set_hostname(psm_epid_nid(ptl->epid), + psmi_gethostname(), 0)) { + err = PSM_NO_MEMORY; + goto fail; + } + psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr); + array_of_errors[i] = PSM_OK; + } + else { + array_of_epaddr[i] = NULL; + array_of_errors[i] = PSM_EPID_UNREACHABLE; + } + } + +fail: + return err; +} + +#if 0 +static +psm_error_t +self_disconnect(ptl_t *ptl, int numep, + const psm_epaddr_t array_of_epaddr[], + int array_of_epaddr_mask[], + int force, uint64_t timeout_ns) +{ + int i; + for (i = 0; i < numep; i++) { + if (array_of_epaddr_mask[i] == 0) + continue; + + if (array_of_epaddr[i] == ptl->epaddr) + array_of_epaddr_mask[i] = 1; + else + array_of_epaddr_mask[i] = 0; + } + return PSM_OK; +} +#endif + +static +size_t +self_ptl_sizeof(void) +{ + return sizeof(ptl_t); +} + +static +psm_error_t +self_ptl_init(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; + ptl->epid = ep->epid; + ptl->epaddr = ep->epaddr; + ptl->ctl = ctl; + ep->epaddr->mctxt_prev = ep->epaddr; + ep->epaddr->mctxt_next = ep->epaddr; + ep->epaddr->mctxt_master = ep->epaddr; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = NULL; + ctl->ep_connect = self_connect; + ctl->ep_disconnect = NULL; + + ctl->mq_send = self_mq_send; + ctl->mq_isend = self_mq_isend; + + /* No stats in self */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; + + return PSM_OK; +} + +static +psm_error_t +self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns) +{ + return PSM_OK; /* nothing to do */ +} + +static +psm_error_t +self_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown SELF ptl option %u.", optname); +} + +static +psm_error_t +self_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown SELF ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_self = { + self_ptl_sizeof, self_ptl_init, self_ptl_fini,self_ptl_setopt,self_ptl_getopt +}; diff --git a/ptl_self/ptl_fwd.h 
b/ptl_self/ptl_fwd.h new file mode 100644 index 0000000..ff79c7e --- /dev/null +++ b/ptl_self/ptl_fwd.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PTL_FWD_SELF_H +#define _PTL_FWD_SELF_H + +/* Symbol in am ptl */ +struct ptl_ctl_init psmi_ptl_self; + +#endif + -- cgit v1.2.3 From 0f8c608a1b128c42b75aa8fac35d3df4a3d57e57 Mon Sep 17 00:00:00 2001 From: Christoph Biedl Date: Sun, 16 Oct 2022 04:18:17 -0700 Subject: Import infinipath-psm_3.3+20.604758e7-6.2.debian.tar.xz [dgit import tarball infinipath-psm 3.3+20.604758e7-6.2 infinipath-psm_3.3+20.604758e7-6.2.debian.tar.xz] --- README.source | 58 +++ changelog | 110 ++++++ compat | 1 + control | 56 +++ copyright | 79 ++++ gbp.conf | 4 + libpsm-infinipath1-dev.install | 2 + libpsm-infinipath1.install | 2 + libpsm-infinipath1.postinst.in | 10 + libpsm-infinipath1.postrm.in | 13 + libpsm-infinipath1.prerm.in | 13 + libpsm-infinipath1.symbols | 432 +++++++++++++++++++++ .../0001-Fix-truncation-warnings-with-gcc7.patch | 45 +++ ...-sysmacros.h-to-avoid-warning-about-minor.patch | 31 ++ patches/0003-gcc8.patch | 29 ++ patches/0004-gcc-11-warning.patch | 22 ++ patches/series | 4 + rules | 61 +++ source/format | 1 + source/options | 1 + watch | 3 + 21 files changed, 977 insertions(+) create mode 100644 README.source create mode 100644 changelog create mode 100644 compat create mode 100644 control create mode 100644 copyright create mode 100644 gbp.conf create mode 100644 libpsm-infinipath1-dev.install create mode 100644 libpsm-infinipath1.install create mode 100644 libpsm-infinipath1.postinst.in create mode 100644 libpsm-infinipath1.postrm.in create mode 100644 libpsm-infinipath1.prerm.in create mode 100644 libpsm-infinipath1.symbols create mode 100644 patches/0001-Fix-truncation-warnings-with-gcc7.patch create mode 100644 patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch create mode 100644 patches/0003-gcc8.patch create mode 100644 patches/0004-gcc-11-warning.patch create mode 100644 patches/series create mode 100755 rules create mode 100644 source/format create mode 100644 source/options create 
mode 100644 watch

diff --git a/README.source b/README.source
new file mode 100644
index 0000000..fd5b4ef
--- /dev/null
+++ b/README.source
@@ -0,0 +1,58 @@
+This package is maintained from the upstream git repository located at
+https://github.com/intel/psm.git
+using the DEP-14 (http://dep.debian.net/deps/dep14/) layout/workflow.
+
+New versions should usually be built from the debian/master branch.
+There is a d/gbp.conf, so building with 'gbp buildpackage' is supported.
+The upstream branch is master (there are old upstream branches called upstream
+and upstream-branch that were used for intermediate package versions but are now
+obsolete).
+
+Patches are managed within the patch-queue/debian/master branch. This branch
+should always be based on the (upstream) master branch. The contents of the
+d/patches directory are then auto-generated using:
+$ gbp pq export
+
+To clone this repo use:
+$ gbp clone --pristine-tar git@salsa.debian.org:hpc-team/infinipath-psm.git
+
+To build the package after cloning:
+$ gbp buildpackage
+
+To be able to receive new upstream releases/commits, after cloning, you need to
+add the upstream repo address as a 'git remote' as follows:
+$ git remote add upstream https://github.com/intel/psm.git
+Then you can pull upstream changes as follows:
+$ git branch master
+$ git pull upstream
+
+Handling new upstream releases (e.g. for new release 3.3+20.604758e7):
+- Set an upstream tag (Until upstream tags their releases, use the latest
+  upstream commit and tag it according to the scheme 3.3+<n>.<sha1>,
+  where n should be incremented by 1 for a new debian release
+  with upstream changes and sha1 is the short form of the SHA-1 object name of
+  that commit):
+  $ git tag upstream/3.3+20.604758e7
+- Rebase patch-queue branch (if patches currently exist)
+  $ git checkout patch-queue/debian/master
+  $ git rebase -i upstream/3.3+20.604758e7
+  Review patches, possibly fix conflicts, when done:
+  $ gbp pq export
+  This puts you into the debian/master branch automatically. Review the changes
+  and commit.
+- Merge the new upstream release into the debian/master branch:
+  $ git checkout debian/master (probably you're already there)
+  $ git merge upstream/3.3+20.604758e7
+- Adjust debian files for new release ...
+- Set debian release tag when done (assuming debian version 3.3+20.604758e7-1)
+  $ git tag debian/3.3+20.604758e7-1
+- Build new package including new pristine-tar generation:
+  $ gbp buildpackage --git-pristine-tar --git-pristine-tar-commit \
+    --git-compression=xz
+- When all is fine, push the new version
+  $ git push --all
+  $ git push --tags
+
+Roland Fehrenbacher
+
+ -- Roland Fehrenbacher , Wed, 27 Dec 2017 17:44:26 +0000

diff --git a/changelog b/changelog
new file mode 100644
index 0000000..7e666a1
--- /dev/null
+++ b/changelog
@@ -0,0 +1,110 @@
+infinipath-psm (3.3+20.604758e7-6.2) unstable; urgency=medium
+
+  * Non-maintainer upload
+  * Work around FTBFS with gcc-12. Closes: #984057
+
+ -- Christoph Biedl  Sun, 16 Oct 2022 13:18:17 +0200
+
+infinipath-psm (3.3+20.604758e7-6.1) unstable; urgency=medium
+
+  * Non-maintainer upload.
+  * Work around FTBFS with gcc-10 (Closes: #957359)
+
+ -- Paul Gevers  Sun, 03 Jan 2021 08:42:58 +0100
+
+infinipath-psm (3.3+20.604758e7-6) unstable; urgency=medium
+
+  * Fix ftbfs with GCC-8 (Closes: #897774). Thanks to Reiner Herrmann
+    for the patch.
+ - add patch 0003-gcc8.patch + + -- Mehdi Dogguy Thu, 29 Nov 2018 23:53:33 +0100 + +infinipath-psm (3.3+20.604758e7-5) unstable; urgency=medium + + * Fix postrm maintainer script to avoid leaving an unowned file + (Closes: #886925) and add a postrm script to handle other cases. + Thanks to Andreas Beckmann for filing the bugreport and putting + relevant references in the bugreport. + + -- Mehdi Dogguy Sun, 14 Jan 2018 11:29:47 +0100 + +infinipath-psm (3.3+20.604758e7-4) unstable; urgency=medium + + * Add myself to Uploaders + * Run wrap-and-sort + * Mark symbol ipath_dwordcpy_safe@Base as amd64 specific (Closes: #886359) + * Provide libpsm_infinipath.so.1 as an alternative + + -- Mehdi Dogguy Thu, 04 Jan 2018 22:43:15 +0100 + +infinipath-psm (3.3+20.604758e7-3) unstable; urgency=medium + + * Make 'Debian HPC Team' new maintainer + * Fix lintian warnings (copyright + Priority) + * Add i386 specific symbols file + + -- Roland Fehrenbacher Thu, 04 Jan 2018 13:12:09 +0100 + +infinipath-psm (3.3+20.604758e7-1) unstable; urgency=medium + + * New upstream release (Add upstream fixes up to git commit 604758e). + * Add patch for gcc 7 compilation (Closes: #853451). + * Remove patches applied upstream. + * Update Vcs-Git/Browser to point to new salsa.debian.org repo. + * Update Standards-Version to 4.1.3, no changes required. + * Update copyright file. + + -- Roland Fehrenbacher Fri, 29 Dec 2017 11:47:37 +0100 + +infinipath-psm (3.3+19.g67c0807.open-2) UNRELEASED; urgency=medium + + * Improve architecture detection. (Closes: #807149) + + -- Ana Beatriz Guerrero Lopez Mon, 04 Apr 2016 10:18:09 +0200 + +infinipath-psm (3.3+19.g67c0807.open-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + * Add 0001-Initialize-variables.patch to fix -Werror=maybe-uninitialized + errors. + * Update symbols file. + * Update Standards-Version to 3.9.7, no changes required. + * Update Vcs-Browser to use HTTPS. + * Add 0002-Fix-a-few-typos.patch fixing a few typo reported by lintian. + + -- Ana Beatriz Guerrero Lopez Fri, 01 Apr 2016 00:54:00 +0200 + +infinipath-psm (3.3+7.gec1d6d2-3) unstable; urgency=low + + * d/control: Make libpsm-infinipath1-dev conflict with libion-dev + + * Bug fix: "libpsm-infinipath1-dev and libion-dev: error when trying to + install together", thanks to Ralf Treinen (Closes: #807300). + + -- Roland Fehrenbacher Tue, 08 Dec 2015 15:36:24 +0100 + +infinipath-psm (3.3+7.gec1d6d2-2) unstable; urgency=medium + + * Add symbols file d/libpsm-infinipath1.symbols + + -- Roland Fehrenbacher Tue, 08 Dec 2015 13:20:59 +0000 + +infinipath-psm (3.3+7.gec1d6d2-1) unstable; urgency=low + + * Upstream integrated previous Debian patches + + -- Roland Fehrenbacher Mon, 07 Dec 2015 19:29:57 +0100 + +infinipath-psm (3.3+7.g05f6f14.open-2) unstable; urgency=low + + * Revert to xz compression + + -- Roland Fehrenbacher Tue, 01 Dec 2015 12:38:18 +0100 + +infinipath-psm (3.3+7.g05f6f14.open-1) unstable; urgency=medium + + * Initial release. 
(Closes: #806524) + + -- Roland Fehrenbacher Sat, 28 Nov 2015 13:49:53 +0100 diff --git a/compat b/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/compat @@ -0,0 +1 @@ +9 diff --git a/control b/control new file mode 100644 index 0000000..1f5653f --- /dev/null +++ b/control @@ -0,0 +1,56 @@ +Source: infinipath-psm +Section: libs +Priority: optional +Maintainer: Debian HPC Team +Uploaders: Roland Fehrenbacher , + Mehdi Dogguy +Build-Depends: debhelper (>= 9), + dpkg-dev (>= 1.13.19), + uuid-dev +Standards-Version: 4.1.3 +Homepage: https://github.com/intel/psm +Vcs-Git: https://salsa.debian.org/hpc-team/infinipath-psm +Vcs-Browser: https://salsa.debian.org/hpc-team/infinipath-psm + +Package: libpsm-infinipath1 +Architecture: amd64 i386 +Depends: ${misc:Depends}, + ${shlibs:Depends} +Description: PSM Messaging library for Intel Truescale adapters + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package contains the shared libraries. + +Package: libpsm-infinipath1-dev +Section: libdevel +Architecture: amd64 i386 +Depends: libpsm-infinipath1 (= ${binary:Version}), + ${misc:Depends} +Conflicts: libion-dev +Description: Development files for libpsm-infinipath1 + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package is needed to compile programs against libpsm-infinipath1. + It contains the header files and links needed for compiling. + +Package: libpsm-infinipath1-dbg +Section: debug +Priority: optional +Architecture: amd64 i386 +Depends: libpsm-infinipath1 (= ${binary:Version}), + ${misc:Depends} +Description: Debugging symbols for libpsm-infinipath1 + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package contains the debugging symbols associated with + libpsm-infinipath1. They will automatically be used by gdb for debugging + libpsm-infinipath1-related issues. diff --git a/copyright b/copyright new file mode 100644 index 0000000..d97bebf --- /dev/null +++ b/copyright @@ -0,0 +1,79 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: infinipath-psm +Upstream-Contact: Mike Marciniszyn +Source: https://github.com/intel/psm + +Files: * +Copyright: Copyright (c) 2012, 2017. Intel Corporation. + Copyright (c) 2005, 2006. QLogic Corporation. +License: BSD-2-clause or GPL-2 + +Files: debian/* +Copyright: 2015-2017 Q-Leap Networks GmbH + 2016 Ana Beatriz Guerrero Lopez +License: GPL-2+ + This package is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + . + This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + On Debian systems, the complete text of the GNU General + Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". + +License: GPL-2 + This program is free software; you can redistribute it + and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + . + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more + details. + . + You should have received a copy of the GNU General Public + License along with this package; if not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, + Boston, MA 02110-1301 USA + . + On Debian systems, the full text of the GNU General Public + License version 2 can be found in the file + /usr/share/common-licenses/GPL-2. + +License: BSD-2-clause + The BSD 2-Clause License + . + Copyright (c) 2007 Cisco, Inc. All rights reserved. + . + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + . + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. diff --git a/gbp.conf b/gbp.conf new file mode 100644 index 0000000..5b6c22c --- /dev/null +++ b/gbp.conf @@ -0,0 +1,4 @@ +[DEFAULT] +upstream-branch = master +debian-branch = debian/master +ignore-branch = False diff --git a/libpsm-infinipath1-dev.install b/libpsm-infinipath1-dev.install new file mode 100644 index 0000000..f3800aa --- /dev/null +++ b/libpsm-infinipath1-dev.install @@ -0,0 +1,2 @@ +usr/include/* +usr/lib/*/lib*.so diff --git a/libpsm-infinipath1.install b/libpsm-infinipath1.install new file mode 100644 index 0000000..6e62d0c --- /dev/null +++ b/libpsm-infinipath1.install @@ -0,0 +1,2 @@ +usr/lib/*/libinfinipath*.so.* +usr/lib/libpsm1/* diff --git a/libpsm-infinipath1.postinst.in b/libpsm-infinipath1.postinst.in new file mode 100644 index 0000000..8a2e3a3 --- /dev/null +++ b/libpsm-infinipath1.postinst.in @@ -0,0 +1,10 @@ +#! 
/bin/sh
+
+set -e
+
+update-alternatives --install /usr/lib/@DEB_HOST_MULTIARCH@/libpsm_infinipath.so.@PSM_LIB_MAJOR@ libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+    /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@ 40
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.postrm.in b/libpsm-infinipath1.postrm.in
new file mode 100644
index 0000000..d7ec507
--- /dev/null
+++ b/libpsm-infinipath1.postrm.in
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" = "remove" ] || [ "$1" = "disappear" ]
+then
+    update-alternatives --remove libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+        /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@
+fi
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.prerm.in b/libpsm-infinipath1.prerm.in
new file mode 100644
index 0000000..47d57e0
--- /dev/null
+++ b/libpsm-infinipath1.prerm.in
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" != "remove" ]
+then
+    update-alternatives --remove libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+        /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@
+fi
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.symbols b/libpsm-infinipath1.symbols
new file mode 100644
index 0000000..505543e
--- /dev/null
+++ b/libpsm-infinipath1.symbols
@@ -0,0 +1,432 @@
+libinfinipath.so.4 libpsm-infinipath1 #MINVER#
+ __ipath_dbgout@Base 3.3+7.gec1d6d2
+ __ipath_malloc_no_mmap@Base 3.3+7.gec1d6d2
+ __ipath_mylabel@Base 3.3+7.gec1d6d2
+ __ipath_pico_per_cycle@Base 3.3+7.gec1d6d2
+ infinipath_debug@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port_names@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit_names@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_next_name@Base 3.3+7.gec1d6d2
+ infinipath_get_single_portctr@Base 3.3+7.gec1d6d2
+ infinipath_get_single_stat@Base 3.3+7.gec1d6d2
+ infinipath_get_single_unitctr@Base 3.3+7.gec1d6d2
+ infinipath_get_stats@Base 3.3+7.gec1d6d2
+ infinipath_get_stats_names@Base 3.3+7.gec1d6d2
+ infinipath_get_stats_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_unit_flash@Base 3.3+7.gec1d6d2
+ infinipath_lookup_stat@Base 3.3+7.gec1d6d2
+ infinipath_put_unit_flash@Base 3.3+7.gec1d6d2
+ infinipath_release_names@Base 3.3+7.gec1d6d2
+ ipath_armlaunch_ctrl@Base 3.3+7.gec1d6d2
+ ipath_check_unit_status@Base 3.3+7.gec1d6d2
+ ipath_cmd_assign_context@Base 3.3+7.gec1d6d2
+ ipath_cmd_user_init@Base 3.3+7.gec1d6d2
+ ipath_cmd_wait_for_packet@Base 3.3+7.gec1d6d2
+ ipath_cmd_write@Base 3.3+7.gec1d6d2
+ ipath_cmd_writev@Base 3.3+7.gec1d6d2
+ ipath_context_close@Base 3.3+7.gec1d6d2
+ ipath_context_open@Base 3.3+7.gec1d6d2
+ ipath_disarm_bufs@Base 3.3+7.gec1d6d2
+ ipath_dwordcpy@Base 3.3+7.gec1d6d2
+ (arch=amd64)ipath_dwordcpy_safe@Base 3.3+7.gec1d6d2
+ ipath_event_ack@Base 3.3+7.gec1d6d2
+ ipath_flash_csum@Base 3.3+7.gec1d6d2
+ ipath_flush_egr_bufs@Base 3.3+7.gec1d6d2
+ ipath_force_pio_avail_update@Base 3.3+7.gec1d6d2
+ ipath_free_tid_err@Base 3.3+7.gec1d6d2
+ ipath_get_cc_settings_bin@Base 3.3+7.gec1d6d2
+ ipath_get_cc_table_bin@Base 3.3+7.gec1d6d2
+ ipath_get_mylabel@Base 3.3+7.gec1d6d2
+ ipath_get_num_contexts@Base 3.3+7.gec1d6d2
+ ipath_get_num_units@Base 3.3+7.gec1d6d2
+ ipath_get_port_gid@Base 3.3+7.gec1d6d2
+ ipath_get_port_lid@Base 3.3+7.gec1d6d2
+ ipath_get_port_lmc@Base 3.3+7.gec1d6d2
+ ipath_get_port_rate@Base 3.3+7.gec1d6d2
+ ipath_get_port_sl2vl@Base 3.3+7.gec1d6d2
+ ipath_hideous_ioctl_emulator@Base 3.3+7.gec1d6d2
+ 
ipath_ipathfs_open@Base 3.3+7.gec1d6d2 + ipath_ipathfs_path@Base 3.3+7.gec1d6d2 + ipath_ipathfs_rd@Base 3.3+7.gec1d6d2 + ipath_ipathfs_read@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_open@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_rd@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_read@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_write@Base 3.3+7.gec1d6d2 + ipath_manage_rcvq@Base 3.3+7.gec1d6d2 + ipath_mmap64@Base 3.3+7.gec1d6d2 + ipath_poll_type@Base 3.3+7.gec1d6d2 + ipath_sdma_complete@Base 3.3+7.gec1d6d2 + ipath_sdma_inflight@Base 3.3+7.gec1d6d2 + ipath_set_mylabel@Base 3.3+7.gec1d6d2 + ipath_set_pkey@Base 3.3+7.gec1d6d2 + ipath_sysfs_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_path@Base 3.3+7.gec1d6d2 + ipath_sysfs_path_len@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_write@Base 3.3+7.gec1d6d2 + ipath_syslog@Base 3.3+7.gec1d6d2 + ipath_touch_mmap@Base 3.3+7.gec1d6d2 + ipath_update_tid_err@Base 3.3+7.gec1d6d2 + ipath_userinit@Base 3.3+7.gec1d6d2 + ipath_vsyslog@Base 3.3+7.gec1d6d2 + ipath_wait_for_device@Base 3.3+7.gec1d6d2 + ipath_wait_for_packet@Base 3.3+7.gec1d6d2 + ipath_write_pio@Base 3.3+7.gec1d6d2 + ipath_write_pio_force_order@Base 3.3+7.gec1d6d2 + ipath_write_pio_special_trigger2k@Base 3.3+7.gec1d6d2 + ipath_write_pio_special_trigger4k@Base 3.3+7.gec1d6d2 +libpsm_infinipath.so.1 libpsm-infinipath1 #MINVER# + __psm_am_get_parameters@Base 3.3+7.gec1d6d2 + __psm_am_register_handlers@Base 3.3+7.gec1d6d2 + __psm_am_reply_short@Base 3.3+7.gec1d6d2 + __psm_am_request_short@Base 3.3+7.gec1d6d2 + __psm_ep_close@Base 3.3+7.gec1d6d2 + __psm_ep_connect@Base 3.3+7.gec1d6d2 + __psm_ep_epid_lookup@Base 3.3+7.gec1d6d2 + __psm_ep_epid_share_memory@Base 3.3+7.gec1d6d2 + __psm_ep_num_devunits@Base 3.3+7.gec1d6d2 + __psm_ep_open@Base 3.3+7.gec1d6d2 + __psm_ep_open_internal@Base 3.3+7.gec1d6d2 + __psm_ep_open_opts_get_defaults@Base 3.3+7.gec1d6d2 + __psm_ep_query@Base 3.3+7.gec1d6d2 + __psm_epaddr_getctxt@Base 3.3+7.gec1d6d2 + __psm_epaddr_setctxt@Base 3.3+7.gec1d6d2 + __psm_epaddr_setlabel@Base 3.3+7.gec1d6d2 + __psm_epid_context@Base 3.3+7.gec1d6d2 + __psm_epid_nid@Base 3.3+7.gec1d6d2 + __psm_epid_port@Base 3.3+7.gec1d6d2 + __psm_error_defer@Base 3.3+7.gec1d6d2 + __psm_error_get_string@Base 3.3+7.gec1d6d2 + __psm_error_register_handler@Base 3.3+7.gec1d6d2 + __psm_finalize@Base 3.3+7.gec1d6d2 + __psm_getopt@Base 3.3+7.gec1d6d2 + __psm_init@Base 3.3+7.gec1d6d2 + __psm_map_nid_hostname@Base 3.3+7.gec1d6d2 + __psm_mq_cancel@Base 3.3+7.gec1d6d2 + __psm_mq_finalize@Base 3.3+7.gec1d6d2 + __psm_mq_get_stats@Base 3.3+7.gec1d6d2 + __psm_mq_getopt@Base 3.3+7.gec1d6d2 + __psm_mq_init@Base 3.3+7.gec1d6d2 + __psm_mq_ipeek@Base 3.3+7.gec1d6d2 + __psm_mq_iprobe@Base 3.3+7.gec1d6d2 + __psm_mq_irecv@Base 3.3+7.gec1d6d2 + __psm_mq_isend@Base 3.3+7.gec1d6d2 + __psm_mq_send@Base 3.3+7.gec1d6d2 + __psm_mq_setopt@Base 3.3+7.gec1d6d2 + __psm_mq_test@Base 3.3+7.gec1d6d2 + __psm_mq_wait@Base 3.3+7.gec1d6d2 + __psm_poll@Base 3.3+7.gec1d6d2 + __psm_setopt@Base 3.3+7.gec1d6d2 + __psm_uuid_generate@Base 3.3+7.gec1d6d2 + __psmi_poll_internal@Base 3.3+7.gec1d6d2 + 
__psmi_poll_noop@Base 3.3+7.gec1d6d2 + ips_am_short_reply@Base 3.3+7.gec1d6d2 + ips_am_short_request@Base 3.3+7.gec1d6d2 + ips_cca_adjust_rate@Base 3.3+7.gec1d6d2 + ips_cca_timer_callback@Base 3.3+7.gec1d6d2 + ips_crc_calculate@Base 3.3+7.gec1d6d2 + ips_dma_transfer_frame@Base 3.3+7.gec1d6d2 + ips_epstate_add@Base 3.3+7.gec1d6d2 + ips_epstate_del@Base 3.3+7.gec1d6d2 + ips_epstate_fini@Base 3.3+7.gec1d6d2 + ips_epstate_init@Base 3.3+7.gec1d6d2 + ips_err_str@Base 3.3+7.gec1d6d2 + ips_flow_init@Base 3.3+7.gec1d6d2 + ips_get_stat@Base 3.3+7.gec1d6d2 + ips_ibta_fini@Base 3.3+7.gec1d6d2 + ips_ibta_init@Base 3.3+7.gec1d6d2 + ips_ibta_init_sl2vl_table@Base 3.3+7.gec1d6d2 + ips_ibta_link_updown_event@Base 3.3+7.gec1d6d2 + ips_mq_send_payload@Base 3.3+7.gec1d6d2 + ips_opp_init@Base 3.3+7.gec1d6d2 + ips_proto_am@Base 3.3+7.gec1d6d2 + ips_proto_am_fini@Base 3.3+7.gec1d6d2 + ips_proto_am_init@Base 3.3+7.gec1d6d2 + ips_proto_build_connect_message@Base 3.3+7.gec1d6d2 + ips_proto_connect@Base 3.3+7.gec1d6d2 + ips_proto_disconnect@Base 3.3+7.gec1d6d2 + ips_proto_dma_wait_until@Base 3.3+7.gec1d6d2 + ips_proto_dump_data@Base 3.3+7.gec1d6d2 + ips_proto_dump_err_stats@Base 3.3+7.gec1d6d2 + ips_proto_dump_frame@Base 3.3+7.gec1d6d2 + ips_proto_fini@Base 3.3+7.gec1d6d2 + ips_proto_flow_enqueue@Base 3.3+7.gec1d6d2 + ips_proto_flow_flush_dma@Base 3.3+7.gec1d6d2 + ips_proto_flow_flush_pio@Base 3.3+7.gec1d6d2 + ips_proto_get_rhf_errstring@Base 3.3+7.gec1d6d2 + ips_proto_init@Base 3.3+7.gec1d6d2 + ips_proto_isconnected@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_cts@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_rts_envelope@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_rts_envelope_outoforder@Base 3.3+7.gec1d6d2 + ips_proto_mq_isend@Base 3.3+7.gec1d6d2 + ips_proto_mq_push_eager_data@Base 3.3+7.gec1d6d2 + ips_proto_mq_push_eager_req@Base 3.3+7.gec1d6d2 + ips_proto_mq_send@Base 3.3+7.gec1d6d2 + ips_proto_process_ack@Base 3.3+7.gec1d6d2 + ips_proto_process_connect@Base 3.3+7.gec1d6d2 + ips_proto_process_packet_error@Base 3.3+7.gec1d6d2 + ips_proto_process_packet_inner@Base 3.3+7.gec1d6d2 + ips_proto_process_unknown@Base 3.3+7.gec1d6d2 + ips_proto_recv_fini@Base 3.3+7.gec1d6d2 + ips_proto_recv_init@Base 3.3+7.gec1d6d2 + ips_proto_rv_scbavail_callback@Base 3.3+7.gec1d6d2 + ips_proto_send_ctrl_message@Base 3.3+7.gec1d6d2 + ips_proto_show_header@Base 3.3+7.gec1d6d2 + ips_proto_timer_ack_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_ctrlq_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_pendq_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_send_callback@Base 3.3+7.gec1d6d2 + ips_protoexp_build_ctrl_message@Base 3.3+7.gec1d6d2 + ips_protoexp_data@Base 3.3+7.gec1d6d2 + ips_protoexp_fini@Base 3.3+7.gec1d6d2 + ips_protoexp_flow_newgen@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_data_err@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tf_generr@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tf_seqerr@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tiderr@Base 3.3+7.gec1d6d2 + ips_protoexp_init@Base 3.3+7.gec1d6d2 + ips_protoexp_recv_unaligned_data@Base 3.3+7.gec1d6d2 + ips_protoexp_scb_inflight@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_get_from_token@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_grant@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_grant_ack@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_release@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_release_ack@Base 3.3+7.gec1d6d2 + ips_ptl_connect@Base 3.3+7.gec1d6d2 + ips_ptl_disconnect@Base 3.3+7.gec1d6d2 + ips_ptl_epaddr_stats_get@Base 3.3+7.gec1d6d2 + ips_ptl_poll@Base 3.3+7.gec1d6d2 + ips_ptl_rcvthread_fini@Base 3.3+7.gec1d6d2 + 
ips_ptl_rcvthread_init@Base 3.3+7.gec1d6d2 + ips_ptl_recvq_isempty@Base 3.3+7.gec1d6d2 + ips_ptl_shared_poll@Base 3.3+7.gec1d6d2 + ips_recvhdrq_fini@Base 3.3+7.gec1d6d2 + ips_recvhdrq_init@Base 3.3+7.gec1d6d2 + ips_recvhdrq_progress@Base 3.3+7.gec1d6d2 + ips_recvq_egrbuf_table_alloc@Base 3.3+7.gec1d6d2 + ips_recvq_egrbuf_table_free@Base 3.3+7.gec1d6d2 + ips_scbctrl_alloc@Base 3.3+7.gec1d6d2 + ips_scbctrl_alloc_tiny@Base 3.3+7.gec1d6d2 + ips_scbctrl_avail@Base 3.3+7.gec1d6d2 + ips_scbctrl_bufalloc@Base 3.3+7.gec1d6d2 + ips_scbctrl_fini@Base 3.3+7.gec1d6d2 + ips_scbctrl_free@Base 3.3+7.gec1d6d2 + ips_scbctrl_init@Base 3.3+7.gec1d6d2 + ips_spio_fini@Base 3.3+7.gec1d6d2 + ips_spio_init@Base 3.3+7.gec1d6d2 + ips_spio_transfer_frame@Base 3.3+7.gec1d6d2 + ips_subcontext_ureg_get@Base 3.3+7.gec1d6d2 + ips_subcontext_ureg_initialize@Base 3.3+7.gec1d6d2 + ips_tf_allocate@Base 3.3+7.gec1d6d2 + ips_tf_deallocate@Base 3.3+7.gec1d6d2 + ips_tf_fini@Base 3.3+7.gec1d6d2 + ips_tf_init@Base 3.3+7.gec1d6d2 + ips_tfgen_allocate@Base 3.3+7.gec1d6d2 + ips_tid_acquire@Base 3.3+7.gec1d6d2 + ips_tid_fini@Base 3.3+7.gec1d6d2 + ips_tid_init@Base 3.3+7.gec1d6d2 + ips_tid_release@Base 3.3+7.gec1d6d2 + ips_tid_send_exp@Base 3.3+7.gec1d6d2 + ips_tidflow_nak_post_process@Base 3.3+7.gec1d6d2 + ips_writehdrq_fini@Base 3.3+7.gec1d6d2 + ips_writehdrq_init@Base 3.3+7.gec1d6d2 + kcopy_abi@Base 3.3+7.gec1d6d2 + kcopy_get@Base 3.3+7.gec1d6d2 + kcopy_put@Base 3.3+7.gec1d6d2 + knem_get@Base 3.3+7.gec1d6d2 + knem_open_device@Base 3.3+7.gec1d6d2 + knem_put@Base 3.3+7.gec1d6d2 + knem_register_region@Base 3.3+7.gec1d6d2 + psm_am_get_parameters@Base 3.3+7.gec1d6d2 + psm_am_register_handlers@Base 3.3+7.gec1d6d2 + psm_am_reply_short@Base 3.3+7.gec1d6d2 + psm_am_request_short@Base 3.3+7.gec1d6d2 + psm_ep_close@Base 3.3+7.gec1d6d2 + psm_ep_connect@Base 3.3+7.gec1d6d2 + psm_ep_epid_lookup@Base 3.3+7.gec1d6d2 + psm_ep_epid_share_memory@Base 3.3+7.gec1d6d2 + psm_ep_num_devunits@Base 3.3+7.gec1d6d2 + psm_ep_open@Base 3.3+7.gec1d6d2 + psm_ep_open_opts_get_defaults@Base 3.3+7.gec1d6d2 + psm_ep_query@Base 3.3+7.gec1d6d2 + psm_epaddr_getctxt@Base 3.3+7.gec1d6d2 + psm_epaddr_setctxt@Base 3.3+7.gec1d6d2 + psm_epaddr_setlabel@Base 3.3+7.gec1d6d2 + psm_epid_context@Base 3.3+7.gec1d6d2 + psm_epid_nid@Base 3.3+7.gec1d6d2 + psm_epid_port@Base 3.3+7.gec1d6d2 + psm_error_defer@Base 3.3+7.gec1d6d2 + psm_error_get_string@Base 3.3+7.gec1d6d2 + psm_error_register_handler@Base 3.3+7.gec1d6d2 + psm_finalize@Base 3.3+7.gec1d6d2 + psm_getopt@Base 3.3+7.gec1d6d2 + psm_init@Base 3.3+7.gec1d6d2 + psm_map_nid_hostname@Base 3.3+7.gec1d6d2 + psm_mq_cancel@Base 3.3+7.gec1d6d2 + psm_mq_finalize@Base 3.3+7.gec1d6d2 + psm_mq_get_stats@Base 3.3+7.gec1d6d2 + psm_mq_getopt@Base 3.3+7.gec1d6d2 + psm_mq_init@Base 3.3+7.gec1d6d2 + psm_mq_ipeek@Base 3.3+7.gec1d6d2 + psm_mq_iprobe@Base 3.3+7.gec1d6d2 + psm_mq_irecv@Base 3.3+7.gec1d6d2 + psm_mq_isend@Base 3.3+7.gec1d6d2 + psm_mq_send@Base 3.3+7.gec1d6d2 + psm_mq_setopt@Base 3.3+7.gec1d6d2 + psm_mq_test@Base 3.3+7.gec1d6d2 + psm_mq_wait@Base 3.3+7.gec1d6d2 + psm_poll@Base 3.3+7.gec1d6d2 + psm_setopt@Base 3.3+7.gec1d6d2 + psm_uuid_generate@Base 3.3+7.gec1d6d2 + psmi_abort_handler@Base 3.3+7.gec1d6d2 + psmi_allhandlers@Base 3.3+7.gec1d6d2 + psmi_am_getopt@Base 3.3+7.gec1d6d2 + psmi_am_handler@Base 3.3+7.gec1d6d2 + psmi_am_init_internal@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_data@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_rtsdone@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_rtsmatch@Base 
3.3+7.gec1d6d2 + psmi_am_reqq_add@Base 3.3+7.gec1d6d2 + psmi_am_reqq_drain@Base 3.3+7.gec1d6d2 + psmi_am_reqq_init@Base 3.3+7.gec1d6d2 + psmi_am_setopt@Base 3.3+7.gec1d6d2 + psmi_amsh_am_short_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_am_short_request@Base 3.3+7.gec1d6d2 + psmi_amsh_generic@Base 3.3+7.gec1d6d2 + psmi_amsh_long_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_long_request@Base 3.3+7.gec1d6d2 + psmi_amsh_short_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_short_request@Base 3.3+7.gec1d6d2 + psmi_calloc_internal@Base 3.3+7.gec1d6d2 + psmi_context_check_status@Base 3.3+7.gec1d6d2 + psmi_context_close@Base 3.3+7.gec1d6d2 + psmi_context_interrupt_isenabled@Base 3.3+7.gec1d6d2 + psmi_context_interrupt_set@Base 3.3+7.gec1d6d2 + psmi_context_open@Base 3.3+7.gec1d6d2 + psmi_core_getopt@Base 3.3+7.gec1d6d2 + psmi_core_setopt@Base 3.3+7.gec1d6d2 + psmi_crc@Base 3.3+7.gec1d6d2 + psmi_cycles_left@Base 3.3+7.gec1d6d2 + psmi_diags@Base 3.3+7.gec1d6d2 + psmi_ep_device_is_enabled@Base 3.3+7.gec1d6d2 + psmi_epaddr_get_hostname@Base 3.3+7.gec1d6d2 + psmi_epaddr_get_name@Base 3.3+7.gec1d6d2 + psmi_epaddr_kcopy_pid@Base 3.3+7.gec1d6d2 + psmi_epid_add@Base 3.3+7.gec1d6d2 + psmi_epid_fini@Base 3.3+7.gec1d6d2 + psmi_epid_hca_type@Base 3.3+7.gec1d6d2 + psmi_epid_init@Base 3.3+7.gec1d6d2 + psmi_epid_itor_fini@Base 3.3+7.gec1d6d2 + psmi_epid_itor_init@Base 3.3+7.gec1d6d2 + psmi_epid_itor_next@Base 3.3+7.gec1d6d2 + psmi_epid_lookup@Base 3.3+7.gec1d6d2 + psmi_epid_remove@Base 3.3+7.gec1d6d2 + psmi_epid_set_hostname@Base 3.3+7.gec1d6d2 + psmi_epid_sl@Base 3.3+7.gec1d6d2 + psmi_epid_subcontext@Base 3.3+7.gec1d6d2 + psmi_epid_table@Base 3.3+7.gec1d6d2 + psmi_errhandler_global@Base 3.3+7.gec1d6d2 + psmi_error_cmp@Base 3.3+7.gec1d6d2 + psmi_error_syslog_level@Base 3.3+7.gec1d6d2 + psmi_faultinj_enabled@Base 3.3+7.gec1d6d2 + psmi_faultinj_fini@Base 3.3+7.gec1d6d2 + psmi_faultinj_getspec@Base 3.3+7.gec1d6d2 + psmi_faultinj_init@Base 3.3+7.gec1d6d2 + psmi_faultinj_is_fault@Base 3.3+7.gec1d6d2 + psmi_faultinj_outfile@Base 3.3+7.gec1d6d2 + psmi_faultinj_verbose@Base 3.3+7.gec1d6d2 + psmi_free_internal@Base 3.3+7.gec1d6d2 + psmi_get_hca_type@Base 3.3+7.gec1d6d2 + psmi_get_ipv4addr@Base 3.3+7.gec1d6d2 + psmi_getenv@Base 3.3+7.gec1d6d2 + psmi_gethostname@Base 3.3+7.gec1d6d2 + psmi_getpagesize@Base 3.3+7.gec1d6d2 + psmi_handle_error@Base 3.3+7.gec1d6d2 + psmi_infinipath_revision@Base 3.3+7.gec1d6d2 + psmi_isinitialized@Base 3.3+7.gec1d6d2 + psmi_log_memstats@Base 3.3+7.gec1d6d2 + psmi_malloc_internal@Base 3.3+7.gec1d6d2 + psmi_memcpyo@Base 3.3+7.gec1d6d2 + psmi_mpool_create@Base 3.3+7.gec1d6d2 + psmi_mpool_destroy@Base 3.3+7.gec1d6d2 + psmi_mpool_find_obj_by_index@Base 3.3+7.gec1d6d2 + psmi_mpool_get@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_gen_count@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_index@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_index_gen_count@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_info@Base 3.3+7.gec1d6d2 + psmi_mpool_put@Base 3.3+7.gec1d6d2 + psmi_mq_free@Base 3.3+7.gec1d6d2 + psmi_mq_handle_data@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope_outoforder@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope_unexpected@Base 3.3+7.gec1d6d2 + psmi_mq_handle_outoforder_queue@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts_complete@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts_outoforder@Base 3.3+7.gec1d6d2 + psmi_mq_initialize_defaults@Base 3.3+7.gec1d6d2 + psmi_mq_malloc@Base 3.3+7.gec1d6d2 + psmi_mq_mtucpy@Base 3.3+7.gec1d6d2 + 
psmi_mq_register_unexpected_callback@Base 3.3+7.gec1d6d2 + psmi_mq_req_alloc@Base 3.3+7.gec1d6d2 + psmi_mq_req_fini@Base 3.3+7.gec1d6d2 + psmi_mq_req_init@Base 3.3+7.gec1d6d2 + psmi_mq_stats_register@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_alloc@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_fini@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_free@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_getinfo@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_init@Base 3.3+7.gec1d6d2 + psmi_mq_wait_internal@Base 3.3+7.gec1d6d2 + psmi_opened_endpoint@Base 3.3+7.gec1d6d2 + psmi_opened_endpoint_count@Base 3.3+7.gec1d6d2 + psmi_parse_memmode@Base 3.3+7.gec1d6d2 + psmi_parse_mpool_env@Base 3.3+7.gec1d6d2 + psmi_parse_str_tuples@Base 3.3+7.gec1d6d2 + psmi_poll_internal@Base 3.3+7.gec1d6d2 + psmi_poll_noop@Base 3.3+7.gec1d6d2 + psmi_progress_lock@Base 3.3+7.gec1d6d2 + psmi_protocol_fn@Base 3.3+7.gec1d6d2 + psmi_ptl_amsh@Base 3.3+7.gec1d6d2 + psmi_ptl_ips@Base 3.3+7.gec1d6d2 + psmi_ptl_self@Base 3.3+7.gec1d6d2 + psmi_sharedcontext_params@Base 3.3+19.g67c0807.open + psmi_shm_attach@Base 3.3+7.gec1d6d2 + psmi_shm_detach@Base 3.3+7.gec1d6d2 + psmi_shm_mq_rv_thresh@Base 3.3+7.gec1d6d2 + psmi_stats_deregister_all@Base 3.3+7.gec1d6d2 + psmi_stats_memory@Base 3.3+7.gec1d6d2 + psmi_stats_register@Base 3.3+7.gec1d6d2 + psmi_stats_register_type@Base 3.3+7.gec1d6d2 + psmi_strdup_internal@Base 3.3+7.gec1d6d2 + psmi_syslog@Base 3.3+7.gec1d6d2 + psmi_timer_cancel_inner@Base 3.3+7.gec1d6d2 + psmi_timer_entry_init@Base 3.3+7.gec1d6d2 + psmi_timer_fini@Base 3.3+7.gec1d6d2 + psmi_timer_init@Base 3.3+7.gec1d6d2 + psmi_timer_process_expired@Base 3.3+7.gec1d6d2 + psmi_timer_request_always@Base 3.3+7.gec1d6d2 + psmi_uuid_compare@Base 3.3+7.gec1d6d2 + psmi_uuid_parse@Base 3.3+7.gec1d6d2 + psmi_uuid_unparse@Base 3.3+7.gec1d6d2 + psmi_verno_client@Base 3.3+7.gec1d6d2 + psmi_verno_isinteroperable@Base 3.3+7.gec1d6d2 + psmi_xfer_fn@Base 3.3+7.gec1d6d2 diff --git a/patches/0001-Fix-truncation-warnings-with-gcc7.patch b/patches/0001-Fix-truncation-warnings-with-gcc7.patch new file mode 100644 index 0000000..6cb6b32 --- /dev/null +++ b/patches/0001-Fix-truncation-warnings-with-gcc7.patch @@ -0,0 +1,45 @@ +From: Roland Fehrenbacher +Date: Thu, 28 Dec 2017 19:56:31 +0100 +Subject: Fix truncation warnings with gcc7 +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +This patch was originally created by +Johannes Brandstätter see +https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=853451 +--- + psm_ep.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/psm_ep.c b/psm_ep.c +index 6857895..2bd0ed4 100644 +--- a/psm_ep.c ++++ b/psm_ep.c +@@ -978,7 +978,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + int i, num_rails = 0; + char *uname = "IPATH_UNIT"; + char *pname = "IPATH_PORT"; +- char uvalue[4], pvalue[4]; ++ char uvalue[4], pvalue[6]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + +@@ -1010,7 +1010,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 4, "%1d", units[0]); +- snprintf(pvalue, 4, "%1d", ports[0]); ++ snprintf(pvalue, 6, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } +@@ -1038,7 +1038,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + for (i = 1; i < num_rails; i++) { + 
+ snprintf(uvalue, 4, "%1d", units[i]);
+- snprintf(pvalue, 4, "%1d", ports[i]);
++ snprintf(pvalue, 6, "%1d", ports[i]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+
diff --git a/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch b/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
new file mode 100644
index 0000000..b0a69bb
--- /dev/null
+++ b/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
@@ -0,0 +1,31 @@
+From: Roland Fehrenbacher
+Date: Fri, 29 Dec 2017 12:19:43 +0100
+Subject: Include <sys/sysmacros.h> to avoid warning about minor
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+
+ipath_proto.c: In function ‘ipath_userinit’:
+ipath_proto.c:539:13: error: In the GNU C Library, "minor" is defined
+ by <sys/sysmacros.h>. For historical compatibility, it is
+ currently defined by <sys/types.h> as well, but we plan to
+ remove this soon. To use "minor", include <sys/sysmacros.h>
+ directly. If you did not intend to use a system-defined macro
+ "minor", you should undefine it after including <sys/types.h>. [-Werror]
+ spctrl->spc_dev.spd_type = minor(st.st_rdev);
+---
+ ipath/ipath_proto.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c
+index 5f9365f..a943c1d 100644
+--- a/ipath/ipath_proto.c
++++ b/ipath/ipath_proto.c
+@@ -37,6 +37,7 @@
+ // level infinipath protocol code.
+
+ #include <sys/types.h>
++#include <sys/sysmacros.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <stdio.h>
diff --git a/patches/0003-gcc8.patch b/patches/0003-gcc8.patch
new file mode 100644
index 0000000..b405d18
--- /dev/null
+++ b/patches/0003-gcc8.patch
@@ -0,0 +1,29 @@
+Author: Reiner Herrmann
+Description: Fix build with gcc 8
+ - psm_utils.c: reserve enough memory for both input strings and the fixed part
+ - psm_ep.c: e has sufficient space to copy including the NULL terminator,
+   which fixes a warning about truncation of the input string
+Bug-Debian: https://bugs.debian.org/897774
+
+--- a/psm_ep.c
++++ b/psm_ep.c
+@@ -1349,7 +1349,7 @@
+
+ b_new = (char *) devstr;
+ e = b_new + len;
+- strncpy(e, devstring, len-1);
++ strncpy(e, devstring, len);
+ e[len-1] = '\0';
+ ee = e + len;
+ i = 0;
+--- a/psm_utils.c
++++ b/psm_utils.c
+@@ -955,7 +955,7 @@
+
+ union psmi_envvar_val env_fi;
+ char fvals_str[128];
+ char fname[128];
+- char fdesc[256];
++ char fdesc[300];
+
+ snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom);
+ fvals_str[sizeof fvals_str - 1] = '\0';
diff --git a/patches/0004-gcc-11-warning.patch b/patches/0004-gcc-11-warning.patch
new file mode 100644
index 0000000..da556f5
--- /dev/null
+++ b/patches/0004-gcc-11-warning.patch
@@ -0,0 +1,22 @@
+Description: Disable warning in the cmpxchgl wrapper
+Author: Christoph Biedl
+Origin: no # upstream is dead
+Bug-Debian: https://bugs.debian.org/984057
+Last-Update: 2022-10-16
+
+--- a/include/linux-i386/sysdep.h
++++ b/include/linux-i386/sysdep.h
+@@ -106,10 +106,13 @@
+ uint32_t prev;
+ struct xchg_dummy { uint32_t a[100]; };
+
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Warray-bounds"
+ asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old)
+ : "memory");
++#pragma GCC diagnostic pop
+
+ return prev;
+ }
diff --git a/patches/series b/patches/series
new file mode 100644
index 0000000..df69493
--- /dev/null
+++ b/patches/series
@@ -0,0 +1,4 @@
+0001-Fix-truncation-warnings-with-gcc7.patch
+0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
+0003-gcc8.patch
+0004-gcc-11-warning.patch
diff --git a/rules b/rules
new file mode 100755
index 0000000..d110e9a
--- /dev/null
+++ b/rules
@@ -0,0 +1,61 @@
+#!/usr/bin/make -f
+
+include /usr/share/dpkg/buildflags.mk
+include /usr/share/dpkg/pkg-info.mk
+include /usr/share/dpkg/architecture.mk
+
+export DEB_CFLAGS_MAINT_APPEND = -fcommon
+
+ifeq ($(DEB_HOST_ARCH),amd64)
+  ARCH := x86_64
+else
+  ARCH := $(DEB_HOST_ARCH)
+endif
+
+PSM_LIB_MAJOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(CURDIR)/psm.h`)
+PSM_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(CURDIR)/psm.h`)
+PSM_LIB_VERSION := ${PSM_LIB_MAJOR}.${PSM_LIB_MINOR}
+
+
+MAKE_OPTIONS := INSTALL_PREFIX=/usr libdir=/usr/lib/$(DEB_HOST_MULTIARCH) \
+	PSM_HAVE_SCIF=0 USE_PSM_UUID=0 arch=$(ARCH)
+
+%:
+	dh $@ --parallel
+
+debian/%.postinst: debian/%.postinst.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+debian/%.prerm: debian/%.prerm.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+debian/%.postrm: debian/%.postrm.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+override_dh_auto_build: debian/libpsm-infinipath1.postinst debian/libpsm-infinipath1.prerm debian/libpsm-infinipath1.postrm
+	$(MAKE) $(MAKE_OPTIONS)
+
+override_dh_strip:
+	dh_strip --dbg-package=libpsm-infinipath1-dbg
+
+override_dh_auto_install:
+	$(MAKE) install $(MAKE_OPTIONS) DESTDIR=$$PWD/debian/tmp
+	mkdir debian/tmp/usr/lib/libpsm1/
+	mv debian/tmp/usr/lib/*/libpsm_infinipath.so.${PSM_LIB_VERSION} debian/tmp/usr/lib/libpsm1/
+
+override_dh_auto_test:
+
+override_dh_auto_clean:
+	$(MAKE) $(MAKE_OPTIONS) distclean
+	-rm -f include/linux-i386/linux-i386 include/linux-ppc/linux-ppc
+	-[ ! -f debian/libpsm-infinipath1.postinst ] || rm debian/libpsm-infinipath1.postinst
+	-[ ! -f debian/libpsm-infinipath1.prerm ] || rm debian/libpsm-infinipath1.prerm
diff --git a/source/format b/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/source/options b/source/options
new file mode 100644
index 0000000..b7bc1f2
--- /dev/null
+++ b/source/options
@@ -0,0 +1 @@
+compression = "xz"
diff --git a/watch b/watch
new file mode 100644
index 0000000..4755a19
--- /dev/null
+++ b/watch
@@ -0,0 +1,3 @@
+version=3
+
+https://www.openfabrics.org/downloads/infinipath-psm/infinipath-psm-(.*)\.(?:tar.gz|tar.bz2|tar.xz)
--
cgit v1.2.3

From a581310d680f034f10b43e84008e449056cdca8e Mon Sep 17 00:00:00 2001
From: Roland Fehrenbacher
Date: Thu, 28 Dec 2017 19:56:31 +0100
Subject: Fix truncation warnings with gcc7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch was originally created by
Johannes Brandstätter see
https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=853451

Gbp-Pq: Name 0001-Fix-truncation-warnings-with-gcc7.patch
---
 psm_ep.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/psm_ep.c b/psm_ep.c
index 6857895..2bd0ed4 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -978,7 +978,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 int i, num_rails = 0;
 char *uname = "IPATH_UNIT";
 char *pname = "IPATH_PORT";
- char uvalue[4], pvalue[4];
+ char uvalue[4], pvalue[6];
 int devid_enabled[PTL_MAX_INIT];
 union psmi_envvar_val devs;

@@ -1010,7 +1010,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 /* If multi-rail is used, set the first ep unit/port */
 if (num_rails > 0) {
 snprintf(uvalue, 4, "%1d", units[0]);
- snprintf(pvalue, 4, "%1d", ports[0]);
+ snprintf(pvalue, 6, "%1d", ports[0]);
 setenv(uname, uvalue, 1);
 setenv(pname, pvalue, 1);
 }

@@ -1038,7 +1038,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
 for (i = 1; i < num_rails; i++) {
 snprintf(uvalue, 4, "%1d", units[i]);
- snprintf(pvalue, 4, "%1d", ports[i]);
+ snprintf(pvalue, 6, "%1d", ports[i]);
 setenv(uname, uvalue, 1);
 setenv(pname, pvalue, 1);
--
cgit v1.2.3

From 4bf9edab66c99f9a3e0b411ff739ee56bb0b99df Mon Sep 17 00:00:00 2001
From: Roland Fehrenbacher
Date: Fri, 29 Dec 2017 12:19:43 +0100
Subject: Include <sys/sysmacros.h> to avoid warning about minor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ipath_proto.c: In function ‘ipath_userinit’:
ipath_proto.c:539:13: error: In the GNU C Library, "minor" is defined
 by <sys/sysmacros.h>. For historical compatibility, it is
 currently defined by <sys/types.h> as well, but we plan to
 remove this soon. To use "minor", include <sys/sysmacros.h>
 directly. If you did not intend to use a system-defined macro
 "minor", you should undefine it after including <sys/types.h>. [-Werror]
 spctrl->spc_dev.spd_type = minor(st.st_rdev);

Gbp-Pq: Name 0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
---
 ipath/ipath_proto.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c
index 5f9365f..a943c1d 100644
--- a/ipath/ipath_proto.c
+++ b/ipath/ipath_proto.c
@@ -37,6 +37,7 @@
 // level infinipath protocol code.

 #include <sys/types.h>
+#include <sys/sysmacros.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdio.h>
--
cgit v1.2.3

From 5e44e7f07b0e7932e681cdea081f3bf73bd97efd Mon Sep 17 00:00:00 2001
From: Reiner Herrmann
Date: Sun, 16 Oct 2022 04:18:17 -0700
Subject: Fix build with gcc 8

Bug-Debian: https://bugs.debian.org/897774

- psm_utils.c: reserve enough memory for both input strings and the fixed part
- psm_ep.c: e has sufficient space to copy including the NULL terminator,
  which fixes a warning about truncation of the input string

Gbp-Pq: Name 0003-gcc8.patch
---
 psm_ep.c    | 2 +-
 psm_utils.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/psm_ep.c b/psm_ep.c
index 2bd0ed4..113a4d3 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -1349,7 +1349,7 @@ psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)

 b_new = (char *) devstr;
 e = b_new + len;
- strncpy(e, devstring, len-1);
+ strncpy(e, devstring, len);
 e[len-1] = '\0';
 ee = e + len;
 i = 0;
diff --git a/psm_utils.c b/psm_utils.c
index c8651fe..ebaeda6 100644
--- a/psm_utils.c
+++ b/psm_utils.c
@@ -955,7 +955,7 @@ psmi_faultinj_getspec(char *spec_name, int num, int denom)
 union psmi_envvar_val env_fi;
 char fvals_str[128];
 char fname[128];
- char fdesc[256];
+ char fdesc[300];

 snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom);
 fvals_str[sizeof fvals_str - 1] = '\0';
--
cgit v1.2.3

From 77e6e6932c73bceef03452d80e1e6d9d835083b1 Mon Sep 17 00:00:00 2001
From: Christoph Biedl
Date: Sun, 16 Oct 2022 04:18:17 -0700
Subject: Disable warning in the cmpxchgl wrapper

Origin: no # upstream is dead
Bug-Debian: https://bugs.debian.org/984057
Last-Update: 2022-10-16

Gbp-Pq: Name 0004-gcc-11-warning.patch
---
 include/linux-i386/sysdep.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
index ef99d1d..55ce91e 100644
--- a/include/linux-i386/sysdep.h
+++ b/include/linux-i386/sysdep.h
@@ -106,10 +106,13 @@ static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
 uint32_t prev;
 struct xchg_dummy { uint32_t a[100]; };

+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
 : "=a"(prev)
 : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old)
 : "memory");
+#pragma GCC diagnostic pop

 return prev;
 }
--
cgit v1.2.3
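
The four Debian patches above each silence one diagnostic introduced by a newer gcc. The short sketches that follow are illustrative only: they reproduce each warning and its fix outside the PSM sources, and every identifier and value in them is a stand-in rather than code from this package.

Patch 0001 deals with gcc 7's -Wformat-truncation: formatted with "%1d", an int can need up to eleven characters plus the terminating NUL, so gcc cannot prove that the original pvalue[4] buffer is large enough and flags the snprintf calls. Widening the buffer and the matching bound to 6 leaves room for any port number actually produced. A minimal reproduction, assuming a hypothetical worst-case port value:

    /* sketch: demonstrates -Wformat-truncation on gcc >= 7;
     * not PSM code, the values are made up */
    #include <stdio.h>

    int main(void)
    {
        int port = 65535;            /* hypothetical worst case */
        char small[4], wide[6];

        /* gcc 7 warns here: the result may need more than
         * 3 characters plus the terminating NUL */
        snprintf(small, sizeof small, "%1d", port);

        /* 6 bytes hold any value up to 99999 untruncated */
        snprintf(wide, sizeof wide, "%1d", port);

        printf("truncated: \"%s\"  widened: \"%s\"\n", small, wide);
        return 0;
    }

Compiled with gcc -Wformat-truncation, only the first snprintf is reported; the program prints "655" for the truncated copy and "65535" for the widened one.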
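
Patch 0002 follows a glibc 2.25 header change: major() and minor() now live in <sys/sysmacros.h>, and relying on the historical definitions pulled in via <sys/types.h> produces the deprecation message quoted in the commit, which -Werror turns fatal. A minimal sketch of the corrected usage (not the PSM code; /dev/null stands in for the InfiniPath device node):

    #include <sys/types.h>
    #include <sys/sysmacros.h>   /* the include the patch adds */
    #include <sys/stat.h>
    #include <stdio.h>

    int main(void)
    {
        struct stat st;

        /* minor() extracts the minor device number, as in the
         * ipath_userinit() use on st.st_rdev quoted above */
        if (stat("/dev/null", &st) == 0)
            printf("major=%u minor=%u\n",
                   major(st.st_rdev), minor(st.st_rdev));
        return 0;
    }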
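
Patch 0003 quiets gcc 8's -Wstringop-truncation in psm_ep.c: a strncpy bound of len-1 looks to gcc like the source string's terminator may be dropped silently. Since the destination really holds len bytes, copying len bytes and then terminating explicitly expresses the intent without the warning; the fdesc change in psm_utils.c is the sibling fix, growing a buffer so that formatting the two 128-byte input strings plus the fixed text provably fits. A hypothetical helper showing the strncpy pattern (copy_devstring and its shape are illustrative, not the psmi_parse_devices() code):

    #include <stdlib.h>
    #include <string.h>

    static char *copy_devstring(const char *devstring)
    {
        size_t len = strlen(devstring) + 1;   /* room for the NUL */
        char *e = malloc(len);

        if (e != NULL) {
            strncpy(e, devstring, len);       /* copies the NUL too */
            e[len - 1] = '\0';                /* defensive termination */
        }
        return e;
    }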
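
Patch 0004 handles a gcc 11 false positive: the cmpxchgl wrapper casts ptr to struct xchg_dummy * purely so the "m" constraint tells the compiler the asm may touch more than four bytes, and -Warray-bounds objects to that oversized access. With upstream dead, the packaging fix is a suppression scoped by push/pop to the one statement that needs it. A self-contained x86 sketch of the same pattern (names differ from sysdep.h, and LOCK_PREFIX is spelled out):

    #include <stdint.h>

    static inline uint32_t cmpxchg32(volatile uint32_t *ptr,
                                     uint32_t oldval, uint32_t newval)
    {
        uint32_t prev;
        /* oversized dummy so the "m" constraint covers the memory
         * the asm may inspect; this is what trips -Warray-bounds */
        struct xchg_dummy { uint32_t a[100]; };

    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Warray-bounds"
        __asm__ __volatile__("lock; cmpxchgl %1,%2"
                             : "=a"(prev)
                             : "q"(newval), "m"(*(struct xchg_dummy *)ptr),
                               "0"(oldval)
                             : "memory");
    #pragma GCC diagnostic pop

        return prev;   /* equals oldval iff the swap succeeded */
    }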
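
One subtlety in the rules file above: the soname version is not hard-coded but scraped from psm.h at build time. The sed expressions pull the hexadecimal values out of the PSM_VERNO_MAJOR and PSM_VERNO_MINOR defines, and printf "%d" converts them to decimal. A hypothetical pair of defines showing what the expressions match (stand-in values; the real ones live in psm.h):

    /* stand-ins for the psm.h lines debian/rules scrapes */
    #define PSM_VERNO_MAJOR 0x01   /* printf "%d" -> PSM_LIB_MAJOR = 1  */
    #define PSM_VERNO_MINOR 0x10   /* printf "%d" -> PSM_LIB_MINOR = 16 */

With these stand-in values PSM_LIB_VERSION expands to 1.16, the suffix override_dh_auto_install expects on libpsm_infinipath.so.${PSM_LIB_VERSION} when it moves the library into /usr/lib/libpsm1/.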