From e0e66b73d16846dd5b4f8b9278a9b2f5474a7456 Mon Sep 17 00:00:00 2001 From: Roland Fehrenbacher Date: Fri, 29 Dec 2017 02:47:37 -0800 Subject: Import infinipath-psm_3.3+20.604758e7.orig.tar.xz [dgit import orig infinipath-psm_3.3+20.604758e7.orig.tar.xz] --- .gitignore | 6 + COPYING | 378 ++++ Makefile | 285 +++ README | 155 ++ buildflags.mak | 98 + doc/Makefile | 40 + include/ipath_byteorder.h | 257 +++ include/ipath_common.h | 892 ++++++++ include/ipath_debug.h | 86 + include/ipath_intf.h | 95 + include/ipath_queue.h | 512 +++++ include/ipath_service.h | 160 ++ include/ipath_udebug.h | 130 ++ include/ipath_user.h | 529 +++++ include/linux-i386/bit_ops.h | 76 + include/linux-i386/sysdep.h | 135 ++ include/linux-ppc/bit_ops.h | 145 ++ include/linux-ppc/sysdep.h | 104 + include/valgrind/memcheck.h | 279 +++ include/valgrind/valgrind.h | 3914 +++++++++++++++++++++++++++++++++ infinipath-psm.spec.in | 163 ++ intel-mic-psm-card.spec.in | 112 + intel-mic-psm.spec.in | 207 ++ ipath-psm-devel.srclist.in | 4 + ipath-psm.srclist.in | 4 + ipath/Makefile | 98 + ipath/ipath_debug.c | 256 +++ ipath/ipath_dwordcpy-generic.c | 78 + ipath/ipath_dwordcpy-i386.S | 62 + ipath/ipath_dwordcpy-ppc64.c | 78 + ipath/ipath_dwordcpy-x86_64-fast.S | 55 + ipath/ipath_dwordcpy-x86_64.c | 78 + ipath/ipath_i2cflash.c | 67 + ipath/ipath_proto.c | 547 +++++ ipath/ipath_protomic.c | 616 ++++++ ipath/ipath_service.c | 1377 ++++++++++++ ipath/ipath_sysfs.c | 752 +++++++ ipath/ipath_syslog.c | 92 + ipath/ipath_time.c | 300 +++ ipath/ipath_utils.c | 597 +++++ ipath/ipath_write_pio-i386.c | 276 +++ ipath/ipath_write_pio-ppc.c | 279 +++ ipath/ipath_write_pio-ppc64.c | 283 +++ ipath/ipath_write_pio-x86_64.c | 325 +++ libuuid/COPYING | 25 + libuuid/ChangeLog | 556 +++++ libuuid/Makefile | 45 + libuuid/clear.c | 44 + libuuid/compare.c | 56 + libuuid/copy.c | 46 + libuuid/gen_uuid.c | 322 +++ libuuid/isnull.c | 49 + libuuid/pack.c | 70 + libuuid/parse.c | 80 + libuuid/psm_uuid.c | 214 ++ libuuid/psm_uuid.h | 39 + libuuid/tst_uuid.c | 168 ++ libuuid/unpack.c | 64 + libuuid/unparse.c | 79 + libuuid/uuid.h | 108 + libuuid/uuidP.h | 77 + libuuid/uuid_time.c | 161 ++ mic-psm-card-devel.srclist.in | 2 + mic-psm-card.srclist.in | 6 + mic-psm-devel.srclist.in | 4 + mic-psm.srclist.in | 5 + mic/etc/sysconfig/mic/conf.d/psm.conf | 2 + mic/opt/intel/mic/psm/psm.filelist.in | 7 + mpspawn/mpspawn_stats.h | 115 + psm.c | 522 +++++ psm.h | 1045 +++++++++ psm.supp | 58 + psm_am.c | 170 ++ psm_am.h | 290 +++ psm_am_internal.h | 66 + psm_context.c | 686 ++++++ psm_context.h | 91 + psm_diags.c | 325 +++ psm_ep.c | 1423 ++++++++++++ psm_ep.h | 273 +++ psm_ep_connect.c | 292 +++ psm_error.c | 316 +++ psm_error.h | 54 + psm_help.h | 143 ++ psm_lock.h | 94 + psm_memcpy.c | 340 +++ psm_mpool.c | 469 ++++ psm_mpool.h | 72 + psm_mq.c | 729 ++++++ psm_mq.h | 600 +++++ psm_mq_internal.h | 484 ++++ psm_mq_recv.c | 546 +++++ psm_mq_utils.c | 402 ++++ psm_noship.h | 57 + psm_stats.c | 649 ++++++ psm_stats.h | 101 + psm_timer.c | 193 ++ psm_timer.h | 133 ++ psm_user.h | 214 ++ psm_utils.c | 1278 +++++++++++ psm_utils.h | 292 +++ psmd/Makefile | 82 + psmd/psmd.c | 758 +++++++ ptl.h | 182 ++ ptl_am/Makefile | 45 + ptl_am/am_reqrep.c | 96 + ptl_am/am_reqrep_shmem.c | 3513 +++++++++++++++++++++++++++++ ptl_am/kcopyrw.h | 50 + ptl_am/kcopyrwu.c | 105 + ptl_am/knemrw.h | 58 + ptl_am/knemrwu.c | 154 ++ ptl_am/psm_am_internal.h | 524 +++++ ptl_am/ptl.c | 375 ++++ ptl_am/ptl_fwd.h | 58 + ptl_am/scifrw.h | 50 + ptl_am/scifrwu.c | 97 + ptl_ips/Makefile | 55 + 
ptl_ips/ips_crc32.c | 91 + ptl_ips/ips_epstate.c | 137 ++ ptl_ips/ips_epstate.h | 83 + ptl_ips/ips_expected_proto.h | 280 +++ ptl_ips/ips_opp_path_rec.c | 444 ++++ ptl_ips/ips_path_rec.c | 660 ++++++ ptl_ips/ips_path_rec.h | 149 ++ ptl_ips/ips_proto.c | 2061 +++++++++++++++++ ptl_ips/ips_proto.h | 701 ++++++ ptl_ips/ips_proto_am.c | 355 +++ ptl_ips/ips_proto_am.h | 71 + ptl_ips/ips_proto_connect.c | 1639 ++++++++++++++ ptl_ips/ips_proto_dump.c | 259 +++ ptl_ips/ips_proto_expected.c | 2489 +++++++++++++++++++++ ptl_ips/ips_proto_header.h | 174 ++ ptl_ips/ips_proto_help.h | 759 +++++++ ptl_ips/ips_proto_internal.h | 70 + ptl_ips/ips_proto_mq.c | 964 ++++++++ ptl_ips/ips_proto_params.h | 204 ++ ptl_ips/ips_proto_recv.c | 1547 +++++++++++++ ptl_ips/ips_recvhdrq.c | 717 ++++++ ptl_ips/ips_recvhdrq.h | 206 ++ ptl_ips/ips_recvq.c | 74 + ptl_ips/ips_recvq.h | 97 + ptl_ips/ips_scb.c | 314 +++ ptl_ips/ips_scb.h | 169 ++ ptl_ips/ips_spio.c | 504 +++++ ptl_ips/ips_spio.h | 85 + ptl_ips/ips_stats.h | 62 + ptl_ips/ips_subcontext.c | 72 + ptl_ips/ips_subcontext.h | 58 + ptl_ips/ips_tid.c | 116 + ptl_ips/ips_tid.h | 99 + ptl_ips/ips_tidflow.c | 184 ++ ptl_ips/ips_tidflow.h | 127 ++ ptl_ips/ips_writehdrq.c | 86 + ptl_ips/ips_writehdrq.h | 236 ++ ptl_ips/ipserror.c | 175 ++ ptl_ips/ipserror.h | 100 + ptl_ips/ptl.c | 860 ++++++++ ptl_ips/ptl_fwd.h | 42 + ptl_ips/ptl_ips.h | 166 ++ ptl_ips/ptl_rcvthread.c | 444 ++++ ptl_self/Makefile | 45 + ptl_self/ptl.c | 299 +++ ptl_self/ptl_fwd.h | 41 + 163 files changed, 54450 insertions(+) create mode 100644 .gitignore create mode 100644 COPYING create mode 100644 Makefile create mode 100644 README create mode 100644 buildflags.mak create mode 100644 doc/Makefile create mode 100644 include/ipath_byteorder.h create mode 100644 include/ipath_common.h create mode 100644 include/ipath_debug.h create mode 100644 include/ipath_intf.h create mode 100644 include/ipath_queue.h create mode 100644 include/ipath_service.h create mode 100644 include/ipath_udebug.h create mode 100644 include/ipath_user.h create mode 100644 include/linux-i386/bit_ops.h create mode 100644 include/linux-i386/sysdep.h create mode 100644 include/linux-ppc/bit_ops.h create mode 100644 include/linux-ppc/sysdep.h create mode 100644 include/valgrind/memcheck.h create mode 100644 include/valgrind/valgrind.h create mode 100644 infinipath-psm.spec.in create mode 100644 intel-mic-psm-card.spec.in create mode 100644 intel-mic-psm.spec.in create mode 100644 ipath-psm-devel.srclist.in create mode 100644 ipath-psm.srclist.in create mode 100644 ipath/Makefile create mode 100644 ipath/ipath_debug.c create mode 100644 ipath/ipath_dwordcpy-generic.c create mode 100644 ipath/ipath_dwordcpy-i386.S create mode 100644 ipath/ipath_dwordcpy-ppc64.c create mode 100644 ipath/ipath_dwordcpy-x86_64-fast.S create mode 100644 ipath/ipath_dwordcpy-x86_64.c create mode 100644 ipath/ipath_i2cflash.c create mode 100644 ipath/ipath_proto.c create mode 100644 ipath/ipath_protomic.c create mode 100644 ipath/ipath_service.c create mode 100644 ipath/ipath_sysfs.c create mode 100644 ipath/ipath_syslog.c create mode 100644 ipath/ipath_time.c create mode 100644 ipath/ipath_utils.c create mode 100644 ipath/ipath_write_pio-i386.c create mode 100644 ipath/ipath_write_pio-ppc.c create mode 100644 ipath/ipath_write_pio-ppc64.c create mode 100644 ipath/ipath_write_pio-x86_64.c create mode 100644 libuuid/COPYING create mode 100644 libuuid/ChangeLog create mode 100644 libuuid/Makefile create mode 100644 libuuid/clear.c create mode 100644 
libuuid/compare.c create mode 100644 libuuid/copy.c create mode 100644 libuuid/gen_uuid.c create mode 100644 libuuid/isnull.c create mode 100644 libuuid/pack.c create mode 100644 libuuid/parse.c create mode 100644 libuuid/psm_uuid.c create mode 100644 libuuid/psm_uuid.h create mode 100644 libuuid/tst_uuid.c create mode 100644 libuuid/unpack.c create mode 100644 libuuid/unparse.c create mode 100644 libuuid/uuid.h create mode 100644 libuuid/uuidP.h create mode 100644 libuuid/uuid_time.c create mode 100644 mic-psm-card-devel.srclist.in create mode 100644 mic-psm-card.srclist.in create mode 100644 mic-psm-devel.srclist.in create mode 100644 mic-psm.srclist.in create mode 100644 mic/etc/sysconfig/mic/conf.d/psm.conf create mode 100644 mic/opt/intel/mic/psm/psm.filelist.in create mode 100644 mpspawn/mpspawn_stats.h create mode 100644 psm.c create mode 100644 psm.h create mode 100644 psm.supp create mode 100644 psm_am.c create mode 100644 psm_am.h create mode 100644 psm_am_internal.h create mode 100644 psm_context.c create mode 100644 psm_context.h create mode 100644 psm_diags.c create mode 100644 psm_ep.c create mode 100644 psm_ep.h create mode 100644 psm_ep_connect.c create mode 100644 psm_error.c create mode 100644 psm_error.h create mode 100644 psm_help.h create mode 100644 psm_lock.h create mode 100644 psm_memcpy.c create mode 100644 psm_mpool.c create mode 100644 psm_mpool.h create mode 100644 psm_mq.c create mode 100644 psm_mq.h create mode 100644 psm_mq_internal.h create mode 100644 psm_mq_recv.c create mode 100644 psm_mq_utils.c create mode 100644 psm_noship.h create mode 100644 psm_stats.c create mode 100644 psm_stats.h create mode 100644 psm_timer.c create mode 100644 psm_timer.h create mode 100644 psm_user.h create mode 100644 psm_utils.c create mode 100644 psm_utils.h create mode 100644 psmd/Makefile create mode 100644 psmd/psmd.c create mode 100644 ptl.h create mode 100644 ptl_am/Makefile create mode 100644 ptl_am/am_reqrep.c create mode 100644 ptl_am/am_reqrep_shmem.c create mode 100644 ptl_am/kcopyrw.h create mode 100644 ptl_am/kcopyrwu.c create mode 100644 ptl_am/knemrw.h create mode 100644 ptl_am/knemrwu.c create mode 100644 ptl_am/psm_am_internal.h create mode 100644 ptl_am/ptl.c create mode 100644 ptl_am/ptl_fwd.h create mode 100644 ptl_am/scifrw.h create mode 100644 ptl_am/scifrwu.c create mode 100644 ptl_ips/Makefile create mode 100644 ptl_ips/ips_crc32.c create mode 100644 ptl_ips/ips_epstate.c create mode 100644 ptl_ips/ips_epstate.h create mode 100644 ptl_ips/ips_expected_proto.h create mode 100644 ptl_ips/ips_opp_path_rec.c create mode 100644 ptl_ips/ips_path_rec.c create mode 100644 ptl_ips/ips_path_rec.h create mode 100644 ptl_ips/ips_proto.c create mode 100644 ptl_ips/ips_proto.h create mode 100644 ptl_ips/ips_proto_am.c create mode 100644 ptl_ips/ips_proto_am.h create mode 100644 ptl_ips/ips_proto_connect.c create mode 100644 ptl_ips/ips_proto_dump.c create mode 100644 ptl_ips/ips_proto_expected.c create mode 100644 ptl_ips/ips_proto_header.h create mode 100644 ptl_ips/ips_proto_help.h create mode 100644 ptl_ips/ips_proto_internal.h create mode 100644 ptl_ips/ips_proto_mq.c create mode 100644 ptl_ips/ips_proto_params.h create mode 100644 ptl_ips/ips_proto_recv.c create mode 100644 ptl_ips/ips_recvhdrq.c create mode 100644 ptl_ips/ips_recvhdrq.h create mode 100644 ptl_ips/ips_recvq.c create mode 100644 ptl_ips/ips_recvq.h create mode 100644 ptl_ips/ips_scb.c create mode 100644 ptl_ips/ips_scb.h create mode 100644 ptl_ips/ips_spio.c create mode 100644 
ptl_ips/ips_spio.h create mode 100644 ptl_ips/ips_stats.h create mode 100644 ptl_ips/ips_subcontext.c create mode 100644 ptl_ips/ips_subcontext.h create mode 100644 ptl_ips/ips_tid.c create mode 100644 ptl_ips/ips_tid.h create mode 100644 ptl_ips/ips_tidflow.c create mode 100644 ptl_ips/ips_tidflow.h create mode 100644 ptl_ips/ips_writehdrq.c create mode 100644 ptl_ips/ips_writehdrq.h create mode 100644 ptl_ips/ipserror.c create mode 100644 ptl_ips/ipserror.h create mode 100644 ptl_ips/ptl.c create mode 100644 ptl_ips/ptl_fwd.h create mode 100644 ptl_ips/ptl_ips.h create mode 100644 ptl_ips/ptl_rcvthread.c create mode 100644 ptl_self/Makefile create mode 100644 ptl_self/ptl.c create mode 100644 ptl_self/ptl_fwd.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5f61dda --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +infinipath-psm.spec +infinipath-psm-*.tar.gz +*.o +*.d +*.so* +_revision.c diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..560cf3a --- /dev/null +++ b/COPYING @@ -0,0 +1,378 @@ +This software is available to you under a choice of one of two +licenses. You may choose to be licensed under the terms of the the +OpenIB.org BSD license or the GNU General Public License (GPL) Version +2, both included below. + +Copyright (c) 2007 Cisco, Inc. All rights reserved. + +================================================================== + + OpenIB.org BSD license + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +================================================================== + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary.
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d79c4bd --- /dev/null +++ b/Makefile @@ -0,0 +1,285 @@ +# Copyright (c) 2013 Intel Corporation. All rights reserved. +# Copyright (c) 2006-2011. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE.
+# + +top_srcdir := $(shell pwd) +build_dir ?= $(top_srcdir) +include $(top_srcdir)/buildflags.mak +lib_build_dir := $(build_dir) +ifdef LOCAL_PREFIX + INSTALL_PREFIX := $(LOCAL_PREFIX) +else + INSTALL_PREFIX := /usr +endif +libdir ?= $(INSTALL_PREFIX)/lib64 +sbindir ?= $(INSTALL_PREFIX)/sbin + +INSTALL_LIB_TARG = $(libdir) +INSTALL_SBIN_TARG = $(sbindir) +RPM_BUILD_DIR=${top_srcdir}/rpmbuild +TARG_DIR ?= $(top_srcdir) + +TMI_NAME := tmi-2009-11-20 +TMI_DIR := $(top_srcdir)/contrib/$(TMI_NAME) +TARGLIB := libpsm_infinipath + +SUBDIRS:= ptl_self ptl_ips ptl_am libuuid ipath + +LDLIBS := -linfinipath $(SCIF_LINK_FLAGS) -lrt -lpthread -ldl ${EXTRA_LIBS} + +# Library version information +PSM_VERNO_MAJOR := $(shell sed -n 's/^\#define.*PSM_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h) +PSM_VERNO_MINOR := $(shell sed -n 's/^\#define.*PSM_VERNO_MINOR.*0x\([0-9]\?[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h) +PSM_LIB_MAJOR := $(shell printf "%d" ${PSM_VERNO_MAJOR}) +PSM_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(build_dir)/psm.h`) +IPATH_LIB_MAJOR := 4 +IPATH_LIB_MINOR := 0 +MAJOR := $(PSM_LIB_MAJOR) +MINOR := $(PSM_LIB_MINOR) + +# The desired version number comes from the most recent tag starting with "v" +VERSION := $(shell if [ -d .git ] ; then git describe --tags --abbrev=0 --match='v*' | sed -e 's/^v//' -e 's/-/_/'; else echo "version" ; fi) + +# The desired release number comes from the git describe following the version, which +# is the number of commits since the version tag was planted, suffixed by the g-prefixed commit hash +RELEASE := $(shell if [ -d .git ] ; then git describe --tags --long --match='v*' | sed -e 's/v[0-9.]*-\(.*\)/\1_open/' -e 's/-/_/'; else echo "release" ; fi) + +VERSION_RELEASE := $(VERSION)-$(RELEASE) + +# Try to figure out which libuuid to use. This needs to be +# done before we include buildflags.mak +PSM_USE_SYS_UUID=0 +ifneq (1,${USE_PSM_UUID}) + # Check whether the uuid header file is present. The header file is + # installed by the -devel package, which should have a dependency + # on the package which installs the library.
+ PSM_HAVE_UUID_H=$(shell if [ -f /usr/include/uuid/uuid.h ]; then echo 1; else echo 0; fi) + ifeq (1,${PSM_HAVE_UUID_H}) + SYS_UUID_RPM_NAME=$(shell rpm -qf --qf "%{NAME} = %{VERSION}-%{RELEASE}" /usr/include/uuid/uuid.h) + PSM_USE_SYS_UUID=1 + endif +endif + +# Build the daemon only if SCIF headers are found and we are building for the host +SUBDIRS += $(and $(MIC:1=),$(PSM_HAVE_SCIF:0=),psmd) + +ifneq (x86_64,$(arch)) + ifneq (i386,$(arch)) + $(error Unsupported architecture $(arch)) + endif +endif + +export top_srcdir build_srcdir TMI_NAME TMI_DIR PSM_VERNO_MAJOR PSM_LIB_MAJOR \ + PSM_VERNO_MINOR PSM_LIB_MINOR IPATH_LIB_MAJOR IPATH_LIB_MINOR PSM_USE_SYS_UUID \ + DESTDIR INSTALL_SBIN_TARG INSTALL_LIB_TARG PSM_HAVE_SCIF + +${TARGLIB}-objs := ptl_am/am_reqrep_shmem.o \ + ptl_am/am_reqrep.o \ + ptl_am/ptl.o \ + ptl_am/kcopyrwu.o \ + ptl_am/knemrwu.o \ + ptl_am/scifrwu.o \ + psm_context.o \ + psm_ep.o \ + psm_ep_connect.o \ + psm_error.o \ + psm_utils.o \ + psm_timer.o \ + psm_am.o \ + psm_mq.o \ + psm_mq_utils.o \ + psm_mq_recv.o \ + psm_mpool.o \ + psm_stats.o \ + psm_memcpy.o \ + psm.o \ + libuuid/psm_uuid.o \ + ptl_ips/ptl.o \ + ptl_ips/ptl_rcvthread.o \ + ptl_ips/ipserror.o \ + ptl_ips/ips_scb.o \ + ptl_ips/ips_epstate.o \ + ptl_ips/ips_recvq.o \ + ptl_ips/ips_recvhdrq.o \ + ptl_ips/ips_spio.o \ + ptl_ips/ips_proto.o \ + ptl_ips/ips_proto_recv.o \ + ptl_ips/ips_proto_connect.o \ + ptl_ips/ips_proto_expected.o \ + ptl_ips/ips_tid.o \ + ptl_ips/ips_crc32.o \ + ptl_ips/ips_tidflow.o \ + ptl_ips/ips_proto_dump.o \ + ptl_ips/ips_proto_mq.o \ + ptl_ips/ips_proto_am.o \ + ptl_ips/ips_subcontext.o \ + ptl_ips/ips_path_rec.o \ + ptl_ips/ips_opp_path_rec.o \ + ptl_ips/ips_writehdrq.o \ + ptl_self/ptl.o \ + psm_diags.o + +all: libs + +libs: symlinks + for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir ;\ + done + $(MAKE) ${TARGLIB}.so + +clean: + rm -f _revision.c + for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@ ;\ + done + rm -f *.o ${TARGLIB}.* + +distclean: cleanlinks clean + rm -f *.spec *.srclist + rm -f *.tar.gz + +.PHONY: symlinks +symlinks: + @[[ -L $(build_dir)/include/linux-ppc64 ]] || \ + ln -sf linux-ppc $(build_dir)/include/linux-ppc64 + @[[ -L $(build_dir)/include/linux-x86_64 ]] || \ + ln -sf linux-i386 $(build_dir)/include/linux-x86_64 + +cleanlinks: + rm -f $(build_dir)/include/linux-ppc64 + rm -f $(build_dir)/include/linux-x86_64 + +install: all + for subdir in $(SUBDIRS); do \ + $(MAKE) -i -C $$subdir $@ ;\ + done + install -D ${TARGLIB}.so.${MAJOR}.${MINOR} \ + $(DESTDIR)${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR} + (cd $(DESTDIR)${INSTALL_LIB_TARG} ; \ + ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \ + ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so) ; \ + if [ X$(MIC) != X1 ]; then \ + install -D psm.h ${DESTDIR}/usr/include/psm.h ; \ + install -D psm_mq.h ${DESTDIR}/usr/include/psm_mq.h ; \ + else \ + filelist=/opt/intel/mic/psm/psm.filelist ; \ + sed -e 's!%IPATHMAJOR%!$(IPATH_LIB_MAJOR)!g' \ + -e 's!%IPATHMINOR%!$(IPATH_LIB_MINOR)!g' \ + -e 's!%PSMMAJOR%!$(MAJOR)!g' \ + -e 's!%PSMMINOR%!$(MINOR)!g' \ + mic$$filelist.in > mic$$filelist ; \ + install -D mic/$$filelist ${DESTDIR}$$filelist ; \ + rm -f mic$$filelist ; \ + fi + +tmi: libs + $(MAKE) -C contrib/$(TMI_NAME) verbs=PSM +tmiclean: + $(MAKE) -C contrib/$(TMI_NAME) verbs=PSM clean + + +.PHONY: infinipath-psm.spec +infinipath-psm.spec: infinipath-psm.spec.in + sed -e 's/@VERSION@/'${VERSION}'/g' -e 's/@RELEASE@/'${RELEASE}'/g' $< > $@ + if [ X$(MIC) != X1 ]; then \ + if [ 
X$(PSM_USE_SYS_UUID) = X1 ]; then \ + REQUIRES="Requires: $(shell echo $(SYS_UUID_RPM_NAME) | sed -e 's/-devel//')" ; \ + REQUIRESDEVEL="Requires: $(SYS_UUID_RPM_NAME)" ; \ + fi ; \ + [ -n "$${REQUIRES}" ] && \ + sed -i -e 's%@REQUIRES@%'"$${REQUIRES}"'%g' -e 's/@PSM_UUID@//g' $@ || \ + sed -i -e '/@REQUIRES@/d' -e 's/@PSM_UUID@/USE_PSM_UUID=1/g' $@ ; \ + [ -n "$${REQUIRESDEVEL}" ] && \ + sed -i -e 's%@REQUIRES-DEVEL@%'"$$REQUIRESDEVEL"'%g' $@ || \ + sed -i -e '/@REQUIRES-DEVEL@/d' $@ ; \ + else \ + sed -i -e '/@REQUIRES@/d' \ + -e '/@REQUIRES-DEVEL@/d' \ + -e 's/@PSM_UUID@/USE_PSM_UUID=1/g' $@ ; \ + fi +dist: distclean infinipath-psm.spec + rm -rf $(RPM_BUILD_DIR) + mkdir -p infinipath-psm-${VERSION_RELEASE} + for x in $$(/usr/bin/find . -name ".git" -prune -o \ + -name "cscope*" -prune -o \ + -name "*.spec.in" -prune -o \ + -name "infinipath-psm-${VERSION_RELEASE}" -prune -o \ + -name "*.orig" -prune -o \ + -name "*~" -prune -o \ + -name "#*" -prune -o \ + -name "*.rpm" -prune -o \ + -name "build" -prune -o \ + -name ".gitignore" -prune -o \ + -print); do \ + dir=$$(dirname $$x); \ + mkdir -p infinipath-psm-${VERSION_RELEASE}/$$dir; \ + [ ! -d $$x ] && cp $$x infinipath-psm-${VERSION_RELEASE}/$$dir; \ + done ; \ + if [ -d .git ] ; then git log -n1 --pretty=format:%H > \ + infinipath-psm-${VERSION_RELEASE}/COMMIT ; fi + tar czvf infinipath-psm-${VERSION_RELEASE}.tar.gz infinipath-psm-${VERSION_RELEASE} + rm -rf infinipath-psm-${VERSION_RELEASE} + +ofeddist: + USE_PSM_UUID=1 $(MAKE) dist + + +# rebuild the cscope database, skipping sccs files, done once for +# top level +cscope: + find * -type f ! -name '[ps].*' \( -iname '*.[cfhs]' -o \ + -iname \\*.cc -o -name \\*.cpp -o -name \\*.f90 \) -print | cscope -bqu -i - + +${TARGLIB}.so: ${TARGLIB}.so.${MAJOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +${TARGLIB}.so.${MAJOR}: ${TARGLIB}.so.${MAJOR}.${MINOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +# when we build the shared library, generate a revision and date +# string in it, for easier id'ing when people may have copied the +# file around. Generate it such that the ident command can find it +# and strings -a | grep InfiniPath does a reasonable job as well. +${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} + date +'char psmi_infinipath_revision[] ="$$""Date: %F %R ${rpm_extra_description}InfiniPath $$";' > ${lib_build_dir}/_revision.c + $(CC) -c $(BASECFLAGS) $(INCLUDES) _revision.c -o _revision.o + $(CC) $(LDFLAGS) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared -Wl,--unique='*fastpath*' \ + ${${TARGLIB}-objs} _revision.o -L$(build_dir)/ipath $(LDLIBS) + @leaks=`nm $@ | grep ' [DT] ' | \ + grep -v -e ' [DT] \(_edata\|_fini\|_init\|infinipath_\|ips_\|psmi\|__psm_\|__psmi_\|_rest.pr\|_save.pr\|kcopy\|knem\|scif\)'`; \ + if test -n "$$leaks"; then echo "Build failed, leaking symbols:"; echo "$$leaks"; exit 1; fi + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +.PHONY: $(SUBDIRS) + diff --git a/README b/README new file mode 100644 index 0000000..505a973 --- /dev/null +++ b/README @@ -0,0 +1,155 @@ + + Copyright (c) 2013-2014, Intel Corporation. All rights reserved. + Copyright (c) 2006-2011. QLogic Corporation. All rights reserved. + Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + + This software is available to you under a choice of one of two + licenses. 
You may choose to be licensed under the terms of the GNU + General Public License (GPL) Version 2, available from the file + COPYING in the main directory of this source tree, or the + OpenIB.org BSD license below: + + Redistribution and use in source and binary forms, with or + without modification, are permitted provided that the following + conditions are met: + + - Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + - Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + +================================================================================ + +OFED Support +------------ +OFED 3.5 or above should be installed on the node. Prior versions of OFED +have an older QLogic IB driver (ib_qib) and do not fully support all the PSM +features in this release. + +Building PSM +------------ +Building PSM requires that the GNU GCC compiler is installed on the machine +doing the build. If compiling code for MIC, SCIF must be present. Root privileges +are required to install the runtime libraries and development header files +into the standard system location. + +Building from Makefile +---------------------- +1. Untar the tarball: + $ tar zxvf infinipath-psm-$PRODUCT-$RELEASE.tar.gz +2. Change directory into the untarred location: + $ cd infinipath-psm-$PRODUCT-$RELEASE +3. Run make on the command line. This will build the libraries. By default, + the Makefile will auto-detect whether libuuid and the uuid.h header file + are installed. If so, it will use the system's libuuid. Otherwise, PSM + will be compiled with the libuuid included with PSM. + $ make + + The Makefile will attempt to detect if SCIF is present, and if found, it will + build the SCIF-enabled variant by default. Auto-detection of SCIF can be + overridden by setting the PSM_HAVE_SCIF variable explicitly. + + To specify the SCIF-enabled version, set the PSM_HAVE_SCIF variable: + $ make PSM_HAVE_SCIF=1 + To specify the non-SCIF version, even if SCIF is present, clear the variable: + $ make PSM_HAVE_SCIF=0 + + To force compiling with the included libuuid, use the USE_PSM_UUID variable: + $ make USE_PSM_UUID=1 + + (PSM_HAVE_SCIF and USE_PSM_UUID may be used in conjunction) + +4. Install the libraries and header files on the system (as root): + $ make install + +The libraries will be installed in either /usr/lib or /usr/lib64, depending on +the architecture of the machine, and the header files will be installed in +/usr/include. +This behavior can be altered by using the "DESTDIR" and "LIBDIR" variables on +the "make install" command line. "DESTDIR" will add a leading path component +to the overall install path and "LIBDIR" will change the path where libraries +will be installed.
For example, "make DESTDIR=/tmp/psm-install install" will +install all files (libraries and headers) into "/tmp/psm-install/usr/...", +"make DESTDIR=/tmp/psm-install LIBDIR=/libraries install" will install the +libraries in "/tmp/psm-install/libraries" and the headers in +"/tmp/psm-install/usr/include", and "make LIBDIR=/tmp/libs install" will +install the libraries in "/tmp/libs" and the headers in "/usr/include". + +MPI Libraries supported +----------------------- +A large number of open source (OpenMPI, MVAPICH, MVAPICH2) and Vendor MPI +implementations support PSM for optimized communication on QLogic Truescale +Infiniband HCAs. Vendor MPI implementations (HP-MPI, Intel MPI 4.0 with PMI, +Platform/Scali MPI) require that the PSM runtime libraries be installed and +available on each node. Usually a configuration file or a command line switch +to mpirun needs to be specified to utilize the PSM transport. + +OpenMPI support +--------------- +It is recommended to use the OpenMPI v1.5 development branch. Prior versions +of OpenMPI have an issue with support PSM network transports mixed with standard +Verbs transport (BTL openib). This prevents an OpenMPI installation with +network modules available for PSM and Verbs to work correctly on nodes with +no QLogic IB hardware. This has been fixed in the latest development branch +allowing a single OpenMPI installation to target IB hardware via PSM or Verbs +as well as alternate transports seamlessly. + +PSM header and runtime files need to be installed on a node where the OpenMPI +build is performed. All compute nodes additionally should have the PSM runtime +libraries available on them. OpenMPI provides a standard configure, make and +make install mechanism which will detect and build the relevant PSM network +modules for OpenMPI once the header and runtime files are detected. Further +information on compiling and running MPI applications with OpenMPI on PSM is +available in the QLogic OFED User guide available at: + +http://driverdownloads.qlogic.com/QLogicDriverDownloads_UI/SearchByProduct.aspx?ProductCategory=301&Product=1116&Os=65 + +MVAPICH and MVAPICH2 support +---------------------------- +Both MVAPICH and MVAPICH2 support PSM transport for optimized communication on +QLogic Truescale IB hardware. MVAPICH2 1.4 and MVAPICH 1.2 versions are +recommended. PSM header and runtime files need to be installed on a node where +MVAPICH builds are performed. All compute nodes additionally should have the +PSM runtime libraries available on them. + +MVAPICH provides a shell script in it's top level directory called +make.mvapich.psm to configur, make and install MVAPICH with PSM support. + +MVAPICH2 provides a standard configure and make infrastructure. In order to +MVAPICH2 for PSM the following should be performed from the top level directory: + + - ./configure --prefix= --with-device=ch3:psm + - make + - make install + +Further information on compiling and running MPI applications with MVAPICH on +PSM is available in the QLogic OFED User guide available at: + +http://driverdownloads.qlogic.com/QLogicDriverDownloads_UI/SearchByProduct.aspx?ProductCategory=301&Product=1116&Os=65 + +Submitting comments, bugs, questions +------------------------------------ +The best way to report bugs, send comments or ask questions is to sign up to the +developers mailing list: psm-devel@qlogic.com. Because of spam only subscribers +are allowed to post to the list. 
Please ensure that you subscribe with and post +from the same email address, or else posts will be blocked as spam. To subscribe, +send the following in the BODY of an email to majordomo@qlogic.com: + + subscribe psm-devel + +Majordomo will reply with instructions on how to confirm your subscription. +The mailing list can be used to report bugs, send comments and ask questions. + diff --git a/buildflags.mak b/buildflags.mak new file mode 100644 index 0000000..34fdf1c --- /dev/null +++ b/buildflags.mak @@ -0,0 +1,98 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. +# Copyright (c) 2003, 2004, 2005 PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +# set top_srcdir and include this file + +ifeq (,$(top_srcdir)) +$(error top_srcdir must be set to include makefile fragment) +endif + +export os ?= $(shell uname -s | tr '[A-Z]' '[a-z]') +export arch := $(shell uname -p | sed -e 's,\(i[456]86\|athlon$$\),i386,') + +CC ?= gcc + +SCIF_LINK_FLAGS := +SCIF_INCLUDE_FLAGS := + +compiler_arch := $(shell $(CC) -dumpmachine || echo "none") +ifeq ($(compiler_arch),none) +$(error Could not determine compiler arch for $(CC)) +endif +MIC := $(if $(findstring k1om,$(compiler_arch)),1,0) + +# If SCIF_ROOT_DIR is set, we should assume using SCIF +# If SCIF_INCLUDE_FLAGS is set, we should assume using SCIF +# If /usr/include/scif.h exists, we should assume using SCIF + +ifdef SCIF_ROOT_DIR + SCIF_LINK_FLAGS := -L$(SCIF_ROOT_DIR)/source-root/k1om-hybrid/$(if $(MIC:0=),card,host)/scif_lib #-lscif + SCIF_INCLUDE_FLAGS := -I$(SCIF_ROOT_DIR)/source-root/k1om-hybrid/include +endif + +PSM_HAVE_SCIF ?= $(shell printf '\#include <scif.h>\nint main(void){return(0);}\n' | \ + $(CC) $(CFLAGS) $(LDFLAGS) -x c - -o /dev/null &> /dev/null && echo 1 || echo 0) + +ifeq (1,$(PSM_HAVE_SCIF)) + SCIF_INCLUDE_FLAGS += -DPSM_HAVE_SCIF=1 + SCIF_LINK_FLAGS += -lscif +endif + +WERROR := -Werror +INCLUDES := -I.
-I$(top_srcdir)/include -I$(top_srcdir)/mpspawn \ + -I$(top_srcdir)/include/$(os)-$(arch) $(SCIF_INCLUDE_FLAGS) +BASECFLAGS += $(BASE_FLAGS) $(if $(MIC:0=),$(if $(filter $(CC),icc),-mmic,-D__MIC__)) \ + -Wall $(WERROR) $(if $(MIC:0=),-Wno-unused) -fpic -fPIC -D_GNU_SOURCE \ + $(if $(filter $(CC),icc),,-funwind-tables) $(if $(PSM_PROFILE:0=),-DPSM_PROFILE) \ + ${IPATH_CFLAGS} +ASFLAGS += $(BASE_FLAGS) $(if $(MIC:0=),$(if $(filter $(CC),icc),-mmic,-D__MIC__)) -g3 -fpic + +LDFLAGS += $(SCIF_LINK_FLAGS) + +# If linker flags are needed, uncomment the line below and set flags +#LDFLAGS += + +ifneq (,${PSM_DEBUG}) + BASECFLAGS += -O -g3 -DPSM_DEBUG $(if $(filter $(CC),icc),,-funit-at-a-time) \ + -Wp,-D_FORTIFY_SOURCE=2 +else + BASECFLAGS += -O3 -g3 +endif +ifeq (1,${PSM_USE_SYS_UUID}) + BASECFLAGS += -DPSM_USE_SYS_UUID + EXTRA_LIBS = -luuid +endif + +CFLAGS += $(BASECFLAGS) $(if $(filter $(CC),gcc),-Wno-strict-aliasing) \ + $(if $(PSM_VALGRIND:0=),-DPSM_VALGRIND,-DNVALGRIND) + diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..dba53ee --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,40 @@ +# Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +ifeq (,$(build_dir)) +$(error build_dir must be set) +endif + +top_srcdir := .. + + + diff --git a/include/ipath_byteorder.h b/include/ipath_byteorder.h new file mode 100644 index 0000000..d5cd40d --- /dev/null +++ b/include/ipath_byteorder.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ipath_byteorder_h +#define ipath_byteorder_h + +#ifdef __cplusplus + extern "C" { +#endif + +#include <endian.h> +#include <linux/types.h> + +#ifndef __BYTE_ORDER +# error "BYTE_ORDER undefined" +#endif + +typedef __u16 __le16; +typedef __u16 __be16; +typedef __u32 __le32; +typedef __u32 __be32; +typedef __u64 __le64; +typedef __u64 __be64; + +static __inline__ __u16 __ipath_fswab16(__u16) __attribute__ ((always_inline)); +static __inline__ __u32 __ipath_fswab32(__u32) __attribute__ ((always_inline)); +static __inline__ __u64 __ipath_fswab64(__u64) __attribute__ ((always_inline)); + +static __inline__ __u16 __ipath_fswab16(__u16 x) +{ + return ((x & (__u16)0x00ffU) << 8) + | ((x & (__u16)0xff00U) >> 8); +} + +static __inline__ __u32 __ipath_fswab32(__u32 x) +{ + return ((x & (__u32)0x000000ffUL) << 24) + | ((x & (__u32)0x0000ff00UL) << 8) + | ((x & (__u32)0x00ff0000UL) >> 8) + | ((x & (__u32)0xff000000UL) >> 24); +} + +static __inline__ __u64 __ipath_fswab64(__u64 x) +{ + return ((x & (__u64)0x00000000000000ffULL) << 56) + | ((x & (__u64)0x000000000000ff00ULL) << 40) + | ((x & (__u64)0x0000000000ff0000ULL) << 24) + | ((x & (__u64)0x00000000ff000000ULL) << 8) + | ((x & (__u64)0x000000ff00000000ULL) >> 8) + | ((x & (__u64)0x0000ff0000000000ULL) >> 24) + | ((x & (__u64)0x00ff000000000000ULL) >> 40) + | ((x & (__u64)0xff00000000000000ULL) >> 56); +} + +static __inline__ __u16 __cpu_to_le16(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_le32(__le32) __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_le64(__le64) __attribute__ ((always_inline)); + +static __inline__ __u16 __le16_to_cpu(__le16) __attribute__ ((always_inline)); +static __inline__ __u32 __le32_to_cpu(__le32) __attribute__ ((always_inline)); +static __inline__ __u64 __le64_to_cpu(__le64) __attribute__ ((always_inline)); + +static __inline__ __u16 __cpu_to_be16(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __cpu_to_be32(__be32) __attribute__ ((always_inline)); +static __inline__ __u64 __cpu_to_be64(__be64) __attribute__ ((always_inline)); + +static __inline__ __u16 __be16_to_cpu(__be16) __attribute__ ((always_inline)); +static __inline__ __u32 __be32_to_cpu(__be32) __attribute__ ((always_inline)); +static __inline__ __u64 __be64_to_cpu(__be64) __attribute__
((always_inline)); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) +{ + return x; +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) +{ + return x; +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) +{ + return x; +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) +{ + return x; +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) +{ + return x; +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) +{ + return x; +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) +{ + return __ipath_fswab64(x); +} + +#elif __BYTE_ORDER == __BIG_ENDIAN + +/* + * __cpu_to_le* routines + */ +static __inline__ __le16 __cpu_to_le16(__u16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __le32 __cpu_to_le32(__u32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __le64 __cpu_to_le64(__u64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __le*_to_cpu routines + */ +static __inline__ __u16 __le16_to_cpu(__le16 x) +{ + return __ipath_fswab16(x); +} + +static __inline__ __u32 __le32_to_cpu(__le32 x) +{ + return __ipath_fswab32(x); +} + +static __inline__ __u64 __le64_to_cpu(__le64 x) +{ + return __ipath_fswab64(x); +} + +/* + * __cpu_to_be* routines + */ +static __inline__ __be16 __cpu_to_be16(__u16 x) +{ + return x; +} + +static __inline__ __be32 __cpu_to_be32(__u32 x) +{ + return x; +} + +static __inline__ __be64 __cpu_to_be64(__u64 x) +{ + return x; +} + +/* + * __be*_to_cpu routines + */ +static __inline__ __u16 __be16_to_cpu(__be16 x) +{ + return x; +} + +static __inline__ __u32 __be32_to_cpu(__be32 x) +{ + return x; +} + +static __inline__ __u64 __be64_to_cpu(__be64 x) +{ + return x; +} + +#else +# error "unsupported BYTE_ORDER: " #BYTE_ORDER +#endif + +#ifdef __cplusplus + } // extern "C" +#endif + +#endif // ipath_byteorder_h diff --git a/include/ipath_common.h b/include/ipath_common.h new file mode 100644 index 0000000..8bf9986 --- /dev/null +++ b/include/ipath_common.h @@ -0,0 +1,892 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+#include <linux/types.h>
+#ifndef __KERNEL__
+// Pointer annotations used by the "sparse" checker tool.
+#define __iomem
+#include "ipath_byteorder.h"
+#endif
+/* END_NOSHIP_TO_OPENIB */
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN 0
+#define IPATH_IB_LINKARM 1
+#define IPATH_IB_LINKACTIVE 2
+#define IPATH_IB_LINKINIT 3
+#define IPATH_IB_LINKDOWN_SLEEP 4
+#define IPATH_IB_LINKDOWN_DISABLE 5
+#define IPATH_IB_LINK_LOOPBACK 6    /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL 7    /* normal, disable local loopback */
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED 0x1    /* basic initialization done */
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY 0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF 0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE 0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR 0x200
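+
+/*
+ * Example (illustrative): user code that has read the 64-bit status
+ * qword at spi_status can test these bits with plain mask operations:
+ *
+ *   __u64 ready = IPATH_STATUS_INITTED | IPATH_STATUS_IB_READY;
+ *   if ((status & ready) == ready)
+ *       ... chip is initialized and the IB link is usable ...
+ */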
+
+/*
+ * The list of usermode accessible registers. Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+    /* (RO) DMA RcvHdr to be used next. */
+    ur_rcvhdrtail = 0,
+    /* (RW) RcvHdr entry to be processed next by host. */
+    ur_rcvhdrhead = 1,
+    /* (RO) Index of next Eager index to use. */
+    ur_rcvegrindextail = 2,
+    /* (RW) Eager TID to be processed next */
+    ur_rcvegrindexhead = 3,
+    /* For internal use only; max register number (Shared contexts). */
+    _IPATH_UregMax = 4,
+    /* (RW) RcvTIDFlow table for expected sends in QLE73XX */
+    ur_rcvtidflow = 512
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_PCIE 0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4
+#define IPATH_RUNTIME_RCVHDR_COPY 0x8
+#define IPATH_RUNTIME_MASTER 0x10
+#define IPATH_RUNTIME_RCHK 0x20
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SPECIAL_TRIGGER 0x100
+#define IPATH_RUNTIME_SDMA 0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+/*
+ * MEA: below means chip expects 7322-style context/qp mapping,
+ * not 7220-style. This needs work, because we actually care what
+ * the remote chip uses, not what the local chip uses, other
+ * than to somehow tell the remote endpoint.
+ */
+#define IPATH_RUNTIME_CTXT_MSB_IN_QP 0x1000
+#define IPATH_RUNTIME_CTXT_REDIRECT 0x2000
+#define IPATH_RUNTIME_HDRSUPP 0x4000
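+
+/*
+ * Example (illustrative): these bits are tested against the
+ * spi_runtime_flags word of struct ipath_base_info, defined below
+ * (binfo is an assumed pointer to the filled-in structure):
+ *
+ *   if (binfo->spi_runtime_flags & IPATH_RUNTIME_SDMA)
+ *       ... this context supports send DMA ...
+ */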
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+    /* version of hardware, for feature checking. */
+    __u32 spi_hw_version;
+    /* version of software, for feature checking. */
+    __u32 spi_sw_version;
+    /* InfiniPath context assigned, goes into sent packets */
+    __u16 spi_context;
+    __u16 spi_subcontext;
+    /*
+     * IB MTU, packets IB data must be less than this.
+     * The MTU is in bytes, and will be a multiple of 4 bytes.
+     */
+    __u32 spi_mtu;
+    /*
+     * Size of a PIO buffer in bytes. Any given packet's total size must
+     * be less than this. Included is the starting control word, so
+     * if 2052 is returned, then total pkt size is 2048 bytes or less.
+     */
+    __u32 spi_piosize;
+    /* size of the TID cache in infinipath, in entries */
+    __u32 spi_tidcnt;
+    /* size of the TID Eager list in infinipath, in entries */
+    __u32 spi_tidegrcnt;
+    /* size of a single receive header queue entry in words. */
+    __u32 spi_rcvhdrent_size;
+    /*
+     * Count of receive header queue entries allocated.
+     * This may be less than the spu_rcvhdrcnt passed in!
+     */
+    __u32 spi_rcvhdr_cnt;
+
+    /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+    __u32 spi_runtime_flags;
+
+    /* address where receive buffer queue is mapped into */
+    __u64 spi_rcvhdr_base;
+
+    /* user program. */
+
+    /* base address of eager TID receive buffers. */
+    __u64 spi_rcv_egrbufs;
+
+    /* Allocated by initialization code, not by protocol. */
+
+    /*
+     * Size of each TID buffer in host memory, starting at
+     * spi_rcv_egrbufs. The buffers are virtually contiguous.
+     */
+    __u32 spi_rcv_egrbufsize;
+    /*
+     * The special QP (queue pair) value that identifies an infinipath
+     * protocol packet from standard IB packets. More, probably much
+     * more, to be added.
+     */
+    __u32 spi_qpair;
+
+    /*
+     * User register base for init code, not to be used directly by
+     * protocol or applications. Always maps real chip register space.
+     */
+    __u64 spi_uregbase;
+
+    /*
+     * Maximum buffer size in bytes that can be used in a single TID
+     * entry (assuming the buffer is aligned to this boundary). This is
+     * the minimum of what the hardware and software support. Guaranteed
+     * to be a power of 2.
+     */
+    __u32 spi_tid_maxsize;
+    /*
+     * alignment of each pio send buffer (byte count
+     * to add to spi_piobufbase to get to second buffer)
+     */
+    __u32 spi_pioalign;
+    /*
+     * The index of the first pio buffer available to this process;
+     * needed to do lookup in spi_pioavailaddr; not added to
+     * spi_piobufbase.
+     */
+    __u32 spi_pioindex;
+    /* number of buffers mapped for this process */
+    __u32 spi_piocnt;
+
+    /*
+     * Base address of writeonly pio buffers for this process.
+     * Each buffer has spi_piosize bytes, and is aligned on spi_pioalign
+     * boundaries. spi_piocnt buffers are mapped from this address
+     */
+    __u64 spi_piobufbase;
+
+    /*
+     * Base address of readonly memory copy of the pioavail registers.
+     * There are 2 bits for each buffer.
+     */
+    __u64 spi_pioavailaddr;
+
+    /*
+     * Address where driver updates a copy of the interface and driver
+     * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a
+     * link status qword (formerly combined with driver status), then a
+     * string indicating hardware error, if there was one.
+     */
+    __u64 spi_status;
+
+    /* number of chip contexts available to user processes */
+    __u32 spi_ncontexts;
+    __u16 spi_unit;    /* unit number of chip we are using; */
+    __u16 spi_port;    /* IB port number we are using for send */
+    /* num bufs in each contiguous set */
+    __u32 spi_rcv_egrperchunk;
+    /* size in bytes of each contiguous set */
+    __u32 spi_rcv_egrchunksize;
+    /* total size of mmap to cover full rcvegrbuffers */
+    __u32 spi_rcv_egrbuftotlen;
+    __u32 spi_rhf_offset;    /* dword offset in hdrqent for rcvhdr flags */
+    /* address of readonly memory copy of the rcvhdrq tail register. */
+    __u64 spi_rcvhdr_tailaddr;
+
+    /*
+     * shared memory pages for subctxts if ctxt is shared; these cover
+     * all the processes in the group sharing a single context.
+     * all have enough space for the num_subcontexts value on this job.
+     */
+    __u64 spi_subctxt_uregbase;
+    __u64 spi_subctxt_rcvegrbuf;
+    __u64 spi_subctxt_rcvhdr_base;
+
+    /* shared memory page for send buffer disarm status */
+    __u64 spi_sendbuf_status;
+} __attribute__ ((aligned(8)));
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way. The driver must be the same or higher
+ * for initialization to succeed. In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version; however, if the user software is newer
+ * than the driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define IPATH_USER_SWMINOR 13
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
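+
+/*
+ * Example (illustrative): with the values above, IPATH_USER_SWVERSION is
+ * 0x1000d -- major in the high 16 bits, minor in the low 16 bits:
+ *
+ *   major = IPATH_USER_SWVERSION >> 16;      == 1
+ *   minor = IPATH_USER_SWVERSION & 0xffff;   == 13
+ */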
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+#ifndef IPATH_KERN_TYPE
+/* END_NOSHIP_TO_OPENIB */
+#define IPATH_KERN_TYPE 0
+/* BEGIN_NOSHIP_TO_OPENIB */
+#endif
+/* END_NOSHIP_TO_OPENIB */
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or is the driver from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+ */
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
+
+/*
+ * If the unit is specified via open, HCA choice is fixed. If port is
+ * specified, it's also fixed. Otherwise we try to spread contexts
+ * across ports and HCAs, using different algorithms. WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define IPATH_PORT_ALG_ACROSS 0    /* round robin contexts across HCAs, then
+                                    * ports; this is the default */
+#define IPATH_PORT_ALG_WITHIN 1    /* use all contexts on an HCA (round robin
+                                    * active ports within), then next HCA */
+#define IPATH_PORT_ALG_COUNT 2    /* number of algorithm choices */
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc. The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility. It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+    /*
+     * version of user software, to detect compatibility issues.
+     * Should be set to IPATH_USER_SWVERSION.
+     */
+    __u32 spu_userversion;
+
+    __u32 _spu_scif_nodeid;    /* used for mic processes */
+
+    /* size of struct base_info to write to */
+    __u32 spu_base_info_size;
+
+    __u32 spu_port_alg;    /* which IPATH_PORT_ALG_*; unused user minor < 11 */
+
+    /*
+     * If two or more processes wish to share a context, each process
+     * must set the spu_subcontext_cnt and spu_subcontext_id to the same
+     * values. The only restriction on the spu_subcontext_id is that
+     * it be unique for a given node.
+     */
+    __u16 spu_subcontext_cnt;
+    __u16 spu_subcontext_id;
+
+    __u32 spu_port;    /* IB port requested by user if > 0 */
+
+    /*
+     * address of struct base_info to write to
+     */
+    __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
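+
+/*
+ * Sketch (illustrative, not from the driver sources): two processes that
+ * want to share one context would each initialize the struct the same way:
+ *
+ *   struct ipath_user_info ui = { 0 };
+ *   ui.spu_userversion = IPATH_USER_SWVERSION;
+ *   ui.spu_subcontext_cnt = 2;    same value in both processes
+ *   ui.spu_subcontext_id = 7;     same id, unique on this node
+ */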
+
+/* User commands. */
+
+#define __IPATH_CMD_USER_INIT 16    /* old set up userspace */
+#define IPATH_CMD_CTXT_INFO 17    /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL 18    /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE 19    /* update expected TID entries */
+#define IPATH_CMD_TID_FREE 20    /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21    /* add partition key */
+#define __IPATH_CMD_SLAVE_INFO 22    /* return info on slave processes */
+#define IPATH_CMD_ASSIGN_CONTEXT 23    /* allocate HCA and context (or port, historically) */
+#define IPATH_CMD_USER_INIT 24    /* set up userspace */
+#define IPATH_CMD_PIOAVAILCHK 25    /* check if pio send stuck */
+#define IPATH_CMD_TIDCHKFIX 26    /* check expected tid, and fixup */
+#define IPATH_CMD_PIOAVAILUPD 27    /* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE 28    /* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL 29    /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31    /* latest sdma inflight count */
+#define IPATH_CMD_SDMA_COMPLETE 32    /* try to complete pending sdma */
+/* CMD 33 is available (used to be to enable backpressure). Removed in IFS 5.1 */
+#define IPATH_CMD_DISARM_BUFS 34    /* disarm send buffers w/ errors */
+#define IPATH_CMD_ACK_EVENT 35    /* ack & clear bits *spi_sendbuf_status */
+/* MIC to set up memory with mic driver */
+#define IPATH_CMD_MIC_MEM_INFO 41    /* mic memory setup operation */
+
+/*
+ * IPATH_CMD_ACK_EVENT obsoletes IPATH_CMD_DISARM_BUFS, but we keep it for
+ * compatibility with libraries from previous release. The ACK_EVENT
+ * will take appropriate driver action (if any, just DISARM for now),
+ * then clear the bits passed in as part of the mask. These bits are
+ * in the first 64bit word at spi_sendbuf_status, and are passed to
+ * the driver in the event_mask member of struct ipath_cmd below.
+ */
+#define IPATH_EVENT_DISARM_BUFS (1ULL << 0)
+#define IPATH_EVENT_LINKDOWN (1ULL << 1)
+#define IPATH_EVENT_LID_CHANGE (1ULL << 2)
+#define IPATH_EVENT_LMC_CHANGE (1ULL << 3)
+#define IPATH_EVENT_SL2VL_CHANGE (1ULL << 4)
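+
+/*
+ * Example (illustrative): acknowledging a disarm event through the
+ * struct ipath_cmd defined later in this file (written to the device,
+ * e.g. with ipath_cmd_write() from ipath_service.h):
+ *
+ *   struct ipath_cmd c;
+ *   c.type = IPATH_CMD_ACK_EVENT;
+ *   c.cmd.event_mask = IPATH_EVENT_DISARM_BUFS;
+ */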
+
+/*
+ * The following ipath commands are only used by the mic system to send
+ * commands to the host daemon. All commands above are also used by mic.
+ */
+#define IPATH_CMD_CONTEXT_OPEN 51    /* open a context */
+#define IPATH_CMD_CONTEXT_CLOSE 52    /* close a context */
+
+#define IPATH_CMD_GET_NUM_UNITS 61    /* number of hca units */
+#define IPATH_CMD_GET_NUM_CTXTS 62    /* number of contexts */
+#define IPATH_CMD_GET_PORT_LID 63    /* port lid */
+#define IPATH_CMD_GET_PORT_GID 64    /* port gid */
+#define IPATH_CMD_GET_PORT_LMC 65    /* port lmc */
+#define IPATH_CMD_GET_PORT_RATE 66    /* port rate */
+#define IPATH_CMD_GET_PORT_S2V 67    /* port sl2vl */
+
+#define IPATH_CMD_GET_STATS_NAMES 68    /* stats names */
+#define IPATH_CMD_GET_STATS 69    /* stats */
+#define IPATH_CMD_GET_CTRS_UNAMES 70    /* counters unit names */
+#define IPATH_CMD_GET_CTRS_UNIT 71    /* counters unit */
+#define IPATH_CMD_GET_CTRS_PNAMES 72    /* counters port names */
+#define IPATH_CMD_GET_CTRS_PORT 73    /* counters port */
+
+#define IPATH_CMD_GET_CC_SETTINGS 74    /* get cc settings */
+#define IPATH_CMD_GET_CC_TABLE 75    /* get cc table */
+
+/* cmd for diag code */
+#define IPATH_CMD_WAIT_FOR_PACKET 76
+#define IPATH_CMD_GET_UNIT_FLASH 77
+#define IPATH_CMD_PUT_UNIT_FLASH 78
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_ANYRCV 0
+#define IPATH_POLL_TYPE_URGENT 0x01
+
+struct ipath_ctxt_info {
+    __u16 num_active;    /* number of active units */
+    __u16 unit;    /* unit (chip) assigned to caller */
+    __u16 port;    /* IB port assigned to caller */
+    __u16 context;    /* context on unit assigned to caller */
+    __u16 subcontext;    /* subcontext on unit assigned to caller */
+    __u16 num_contexts;    /* number of contexts available on unit */
+    __u16 num_subcontexts;    /* number of subcontexts opened on context */
+    __u16 rec_cpu;    /* cpu # for affinity (ffff if none) */
+};
+
+struct ipath_tid_info {
+    __u32 tidcnt;
+    /* make structure same size in 32 and 64 bit */
+    __u32 tid__unused;
+    /* virtual address of first page in transfer */
+    __u64 tidvaddr;
+    /* pointer (same size 32/64 bit) to __u16 tid array */
+    __u64 tidlist;
+
+    /*
+     * pointer (same size 32/64 bit) to bitmap of TIDs used
+     * for this call; checked for being large enough at open
+     */
+    __u64 tidmap;
+};
+
+/*
+ * To send general info between PSM on mic and psmd on host;
+ * this structure should be no larger than struct ipath_user_info.
+ */
+struct ipath_mic_info {
+    int unit;    /* unit number */
+    int port;    /* port number */
+    int data1;    /* return data or -1 */
+    int data2;    /* errno if data1=-1 */
+    __u64 data3;    /* other data */
+    __u64 data4;    /* other data */
+} __attribute__ ((aligned(8)));
+
+/*
+ * PSM tells the mic driver how to operate on memory. flags:
+ * 0x1: map remote host buffer, offset is the SCIF offset
+ * 0x2: allocate knx memory in kernel.
+ * 0x4: allocate physically contiguous knx memory in kernel.
+ * 0x8: SCIF register knx memory, and copy offset to first 8 bytes.
+ */
+struct ipath_mem_info {
+    uint32_t key;    /* key to match mmap offset */
+    uint32_t flags;    /* flags indicate what to do */
+    size_t length;    /* buffer length in bytes */
+    off_t offset;    /* remotely registered offset */
+};
+
+struct ipath_cmd {
+    __u32 type;    /* command type */
+    union {
+        struct ipath_mem_info mem_info;    /* mic memory */
+        struct ipath_mic_info mic_info;
+        struct ipath_tid_info tid_info;
+        struct ipath_user_info user_info;
+        /* send dma inflight/completion counter */
+        __u64 sdma_cntr;
+        /* address in userspace of struct ipath_ctxt_info to
+           write result to */
+        __u64 ctxt_info;
+        /* enable/disable receipt of packets */
+        __u32 recv_ctrl;
+        /* enable/disable armlaunch errors (non-zero to enable) */
+        __u32 armlaunch_ctrl;
+        /* partition key to set */
+        __u16 part_key;
+        /* user address of __u32 bitmask of active slaves */
+        __u64 slave_mask_addr;
+        /* type of polling we want */
+        __u16 poll_type;
+        /* back pressure enable bit for one particular context */
+        __u8 ctxt_bp;
+        /* ipath_event_ack(), IPATH_EVENT_* bits */
+        __u64 event_mask;
+    } cmd;
+};
+
+struct ipath_iovec {
+    /* Pointer to data, but same size 32 and 64 bit */
+    __u64 iov_base;
+
+    /*
+     * Length of data; don't need 64 bits, but want
+     * ipath_sendpkt to remain same size as before 32 bit changes, so...
+     */
+    __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send. Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+    __u32 sps_flags;    /* flags for packet (TBD) */
+    __u32 sps_cnt;    /* number of entries to use in sps_iov */
+    /* array of iov's describing packet. TEMPORARY */
+    struct ipath_iovec sps_iov[4];
+};
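+
+/*
+ * Example (illustrative): describing a header + payload send with two
+ * iovec entries (hdr, hdrlen, payload and paylen are assumed caller
+ * variables):
+ *
+ *   struct __ipath_sendpkt pkt;
+ *   pkt.sps_flags = 0;
+ *   pkt.sps_cnt = 2;
+ *   pkt.sps_iov[0].iov_base = (__u64)(uintptr_t)hdr;
+ *   pkt.sps_iov[0].iov_len = hdrlen;
+ *   pkt.sps_iov[1].iov_base = (__u64)(uintptr_t)payload;
+ *   pkt.sps_iov[1].iov_len = paylen;
+ */
+
+/* Passed into diag data special file's ->write method. */
+struct ipath_diag_pkt {
+    __u32 unit;
+    __u64 data;
+    __u32 len;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)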
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+    /* flash layout version (IPATH_FLASH_VERSION) */
+    __u8 if_fversion;
+    /* checksum protecting if_length bytes */
+    __u8 if_csum;
+    /*
+     * valid length (in use, protected by if_csum), including
+     * if_fversion and if_csum themselves
+     */
+    __u8 if_length;
+    /* the GUID, in network order */
+    __u8 if_guid[8];
+    /* number of GUIDs to use, starting from if_guid */
+    __u8 if_numguid;
+    /* the (last 10 characters of) board serial number, in ASCII */
+    char if_serial[12];
+    /* board mfg date (YYYYMMDD ASCII) */
+    char if_mfgdate[8];
+    /* last board rework/test date (YYYYMMDD ASCII) */
+    char if_testdate[8];
+    /* logging of error counts, TBD */
+    __u8 if_errcntp[4];
+    /* powered on hours, updated at driver unload */
+    __u8 if_powerhour[2];
+    /* ASCII free-form comment field */
+    char if_comment[32];
+    /* Backwards compatible prefix for longer QLogic Serial Numbers */
+    char if_sprefix[4];
+    /* 82 bytes used, min flash size is 128 bytes */
+    __u8 if_future[46];
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR 0x80000000
+#define INFINIPATH_RHF_H_VCRCERR 0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR 0x10000000
+#define INFINIPATH_RHF_H_MTUERR 0x08000000
+#define INFINIPATH_RHF_H_IHDRERR 0x04000000
+#define INFINIPATH_RHF_H_TIDERR 0x02000000
+#define INFINIPATH_RHF_H_MKERR 0x01000000
+#define INFINIPATH_RHF_H_IBERR 0x00800000
+#define INFINIPATH_RHF_H_TFGENERR 0x00400000
+#define INFINIPATH_RHF_H_TFSEQERR 0x00200000
+#define INFINIPATH_RHF_H_ERR_MASK 0xFFE00000
+#define INFINIPATH_RHF_L_USE_EGR 0x80000000
+#define INFINIPATH_RHF_L_SWA 0x00008000
+#define INFINIPATH_RHF_L_SWB 0x00004000
+
+/* TidFlow related bits */
+#define INFINIPATH_TF_SEQNUM_SHIFT 0
+#define INFINIPATH_TF_SEQNUM_MASK 0x7ff
+#define INFINIPATH_TF_GENVAL_SHIFT 11
+#define INFINIPATH_TF_GENVAL_MASK 0xff
+#define INFINIPATH_TF_ISVALID_SHIFT 19
+#define INFINIPATH_TF_ISVALID_MASK 0x1
+#define INFINIPATH_TF_ENABLED_SHIFT 20
+#define INFINIPATH_TF_ENABLED_MASK 0x1
+#define INFINIPATH_TF_KEEP_AFTER_SEQERR_SHIFT 21
+#define INFINIPATH_TF_KEEP_AFTER_SEQERR_MASK 0x1
+#define INFINIPATH_TF_KEEP_AFTER_GENERR_SHIFT 22
+#define INFINIPATH_TF_KEEP_AFTER_GENERR_MASK 0x1
+#define INFINIPATH_TF_STATUS_SHIFT 27
+#define INFINIPATH_TF_STATUS_MASK 0x3
+#define INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT 27
+#define INFINIPATH_TF_STATUS_SEQMISMATCH_MASK 0x1
+#define INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT 28
+#define INFINIPATH_TF_STATUS_GENMISMATCH_MASK 0x1
+
+#define INFINIPATH_TF_FLOWID_SHIFT 19
+#define INFINIPATH_TF_NFLOWS 32
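+
+/*
+ * Example (illustrative): extracting the generation value from a TidFlow
+ * table entry tf:
+ *
+ *   gen = (tf >> INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK;
+ */
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_CONTEXT_MASK 0xF
+#define INFINIPATH_I_CONTEXT_SHIFT 24
+#define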
INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_HDRSUPP 0x2 +#define INFINIPATH_KPF_INTR_HDRSUPP_MASK 0x3 +#define INFINIPATH_KPF_COMMIDX_MASK 0x003C +#define INFINIPATH_KPF_COMMIDX_SHIFT 2 +#define INFINIPATH_KPF_RESERVED_BITS(pktflags) \ + ((__le16_to_cpu(pktflags) & INFINIPATH_KPF_COMMIDX_MASK) \ + << IPS_EPSTATE_COMMIDX_SHIFT) \ + +#define INFINIPATH_MAX_SUBCONTEXT 4 + +#define IPATH_MAX_UNIT 4 /* max units supported */ +#define IPATH_MAX_PORT 2 /* no boards have more than 2 IB ports */ + +/* SendPIO per-buffer control */ +/* BEGIN_NOSHIP_TO_OPENIB */ +// #define INFINIPATH_SP_LENGTHP1_MASK 0x3FF /* unused currently */ +// #define INFINIPATH_SP_LENGTHP1_SHIFT 0 /* unused currently */ +// #define INFINIPATH_SP_INTR 0x80 /* unused currently */ +/* END_NOSHIP_TO_OPENIB */ +#define INFINIPATH_SP_TEST 0x40 +#define INFINIPATH_SP_TESTEBP 0x20 + +/* these are currently used only on 7322 chips; they should be referenced + * only at the lowest level pio send buffer fill routines; they go into + * the pbcflags field. OLSON: need to clean this up. */ +#define __PBC_IBPORT (1U << 26) +#define __PBC_VLSHIFT (27) + +/* this portion only defines what we currently use */ +union ipath_pbc { + __u64 qword; + __u32 dword; + struct { + __u16 length; + __u16 fill1; + __u32 pbcflags; + }; +}; + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Context (or port, historically) - 4 bits, + * TID - 10 bits and Offset. + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 4, TID 11, offset 13. + */ + __le32 ver_context_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + +/* BEGIN_NOSHIP_TO_OPENIB */ +/* + * The PIO buffer used for sending infinipath messages must only be written + * in 32-bit words, all the data must be written, and no writes can occur + * after the last word is written (which transfers "ownership" of the buffer + * to the chip and triggers the message to be sent). + * Since the Linux sk_buff structure can be recursive, non-aligned, and + * any number of bytes in each segment, we use the following structure + * to keep information about the overall state of the copy operation. + * This is used to save the information needed to store the checksum + * in the right place before sending the last word to the hardware and + * to buffer the last 0-3 bytes of non-word sized segments. 
+ */
+struct copy_data_s {
+    struct ether_header *hdr;
+    /* addr of PIO buf to write csum to */
+    __u32 __iomem *csum_pio;
+    __u32 __iomem *to;    /* addr of PIO buf to write data to */
+    __u32 device;    /* which device to allocate PIO bufs from */
+    __s32 error;    /* set if there is an error. */
+    __s32 extra;    /* amount of data saved in u.buf below */
+    __u32 len;    /* total length to send in bytes */
+    __u32 flen;    /* fragment length in words */
+    __u32 csum;    /* partial IP checksum */
+    __u32 pos;    /* position for partial checksum */
+    __u32 offset;    /* offset to where data currently starts */
+    __s32 checksum_calc;    /* set to 1 when csum has been calculated */
+    struct sk_buff *skb;
+    union {
+        __u32 w;
+        __u8 buf[4];
+    } u;
+};
+/* END_NOSHIP_TO_OPENIB */
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003    /* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002    /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_SERVICE_ID 0x1000117500000000ULL
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+/* BEGIN_NOSHIP_TO_OPENIB */
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+/* END_NOSHIP_TO_OPENIB */
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+/* BEGIN_NOSHIP_TO_OPENIB */
+/* OpCodes */
+#define IPATH_OPCODE_USER1 0xC0
+#define IPATH_OPCODE_ITH4X 0xC1
+
+/* OpCode 30 is used by stand-alone test programs */
+#define IPATH_OPCODE_RAW_DATA 0xDE
+/* last OpCode (31) is reserved for test */
+#define IPATH_OPCODE_TEST 0xDF
+/* END_NOSHIP_TO_OPENIB */
+
+/* sub OpCodes - ith4x */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+/* Value set in ips_common.h for IPS_HEADER_QUEUE_WORDS */
+#define IPATH_HEADER_QUEUE_WORDS 9
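+
+/*
+ * Example (illustrative): the receive type of an rcvhdrq entry, extracted
+ * with ipath_hdrget_rcv_type() below, selects the handling path:
+ *
+ *   if (ipath_hdrget_rcv_type(rbuf) == RCVHQ_RCV_TYPE_EAGER)
+ *       ... payload is in the eager buffer ipath_hdrget_index(rbuf) ...
+ */
+
+/* functions for extracting fields from rcvhdrq entries for the driver.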
+ */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_seq(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* _IPATH_COMMON_H */ diff --git a/include/ipath_debug.h b/include/ipath_debug.h new file mode 100644 index 0000000..41ba098 --- /dev/null +++ b/include/ipath_debug.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. 
This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages and important env vars */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +/* low-level environment variables */ +#define __IPATH_ENVDBG 0x400 +#define __IPATH_EPKTDBG 0x800 /* print error packet data */ +#define __IPATH_CCADBG 0x1000 /* print CCA related events */ +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* print process startup (init)/exit messages */ +/* print mmap/nopage stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_CCADBG 0x0 /* print CCA related events */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/include/ipath_intf.h b/include/ipath_intf.h new file mode 100644 index 0000000..66506e9 --- /dev/null +++ b/include/ipath_intf.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef _IPATH_INTF_H
+#define _IPATH_INTF_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sched.h>
+
+#ifdef __inline__
+#undef __inline__
+#endif
+#define __inline__ inline __attribute__((always_inline,unused))
+#ifdef __unused__
+#undef __unused__
+#endif
+#define __unused__ __attribute__((unused))
+
+#include "sysdep.h"
+#include "bit_ops.h"
+
+/* these aren't implemented for user mode, which is OK until we multi-thread */
+typedef struct _atomic {
+    uint32_t counter;
+} atomic_t;    /* no atomic_t type in user-land */
+#define atomic_set(a,v) ((a)->counter = (v))
+#define atomic_inc_return(a) (++(a)->counter)
+
+#if defined(__PATHCC__) && __PATHCC__ < 3
+  #define likely(x) (x)
+  #define unlikely(x) (x)
+  #define if_pt(cond) if (cond)
+  #define if_pf(cond) if (cond)
+  #define _Pragma_unlikely _Pragma("mips_frequency_hint never")
+  #define _Pragma_likely _Pragma("mips_frequency_hint frequent")
+#elif defined(__GNUC__) || (defined(__PATHCC__) && __PATHCC__ >= 3)
+  #define likely(x) __builtin_expect(!!(x), 1L)
+  #define unlikely(x) __builtin_expect(!!(x), 0L)
+  #define if_pt(cond) if (likely(cond))
+  #define if_pf(cond) if (unlikely(cond))
+  #define _Pragma_unlikely
+  #define _Pragma_likely
+#else
+  #error "Unsupported compiler"
+#endif
+
+#define yield() sched_yield()
+
+/*
+ * __fastpath is used to group routines in the fastpath, to reduce cache
+ * misses and conflicts
+ */
+#define __fastpath __attribute__((section(".text.fastpath")))
+
+/*
+ * Move from using __fastpath to split __recvpath and __sendpath
+ */
+//#define __sendpath __attribute__((section(".text.sendpath")))
+//#define __recvpath __attribute__((section(".text.recvpath")))
+#define __sendpath __fastpath
+#define __recvpath __fastpath
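+
+/*
+ * Example (illustrative): annotating an error path as unlikely so the
+ * compiler lays out the hot path first (handle_err is an assumed helper):
+ *
+ *   if_pf (ret < 0)
+ *       return handle_err(ret);
+ */
+
+#endif /* _IPATH_INTF_H */
diff --git a/include/ipath_queue.h b/include/ipath_queue.h
new file mode 100644
index 0000000..d96610e
--- /dev/null
+++ b/include/ipath_queue.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.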
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD: src/sys/sys/queue.h,v 1.32.2.7 2002/04/17 14:21:02 des Exp $ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines five types of data structures: singly-linked lists, + * singly-linked tail queues, lists, tail queues, and circular queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may only be traversed in the forward direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * A circle queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or after + * an existing element, at the head of the list, or at the end of the list. 
+ * A circle queue may be traversed in either direction, but has a more + * complex end of list detection. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * + * SLIST LIST STAILQ TAILQ CIRCLEQ + * _HEAD + + + + + + * _HEAD_INITIALIZER + + + + + + * _ENTRY + + + + + + * _INIT + + + + + + * _EMPTY + + + + + + * _FIRST + + + + + + * _NEXT + + + + + + * _PREV - - - + + + * _LAST - - + + + + * _FOREACH + + + + + + * _FOREACH_REVERSE - - - + + + * _INSERT_HEAD + + + + + + * _INSERT_BEFORE - + - + + + * _INSERT_AFTER + + + + + + * _INSERT_TAIL - - + + + + * _REMOVE_HEAD + - + - - + * _REMOVE + + + + + + * + */ + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + struct type *curelm = SLIST_FIRST((head)); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = \ + SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + } \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. 
+ */ +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY(head) ? \ + NULL : \ + ((struct type *) \ + ((char *)((head)->stqh_last) - offsetof(struct type, field)))) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD(head, field); \ + } \ + else { \ + struct type *curelm = STAILQ_FIRST((head)); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + if ((STAILQ_NEXT(curelm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + } \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_REMOVE(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ +} + +/* + * Tail queue functions. 
+ */ +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ +} while (0) + +/* + * Circular queue declarations. + */ +#define CIRCLEQ_HEAD(name, type) \ +struct name { \ + struct type *cqh_first; /* first element */ \ + struct type *cqh_last; /* last element */ \ +} + +#define CIRCLEQ_HEAD_INITIALIZER(head) \ + { (void *)&(head), (void *)&(head) } + +#define CIRCLEQ_ENTRY(type) \ +struct { \ + struct type *cqe_next; /* next element */ \ + struct type *cqe_prev; /* previous element */ \ +} + +/* + * Circular queue functions. 
+ */ +#define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head)) + +#define CIRCLEQ_FIRST(head) ((head)->cqh_first) + +#define CIRCLEQ_FOREACH(var, head, field) \ + for ((var) = CIRCLEQ_FIRST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_NEXT((var), field)) + +#define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ + for ((var) = CIRCLEQ_LAST((head)); \ + (var) != (void *)(head) || ((var) = NULL); \ + (var) = CIRCLEQ_PREV((var), field)) + +#define CIRCLEQ_INIT(head) do { \ + CIRCLEQ_FIRST((head)) = (void *)(head); \ + CIRCLEQ_LAST((head)) = (void *)(head); \ +} while (0) + +#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_NEXT((listelm), field); \ + CIRCLEQ_PREV((elm), field) = (listelm); \ + if (CIRCLEQ_NEXT((listelm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((listelm), field), field) = (elm);\ + CIRCLEQ_NEXT((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (listelm); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_PREV((listelm), field); \ + if (CIRCLEQ_PREV((listelm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((listelm), field), field) = (elm);\ + CIRCLEQ_PREV((listelm), field) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_HEAD(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = CIRCLEQ_FIRST((head)); \ + CIRCLEQ_PREV((elm), field) = (void *)(head); \ + if (CIRCLEQ_LAST((head)) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = (elm); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_FIRST((head)), field) = (elm); \ + CIRCLEQ_FIRST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_INSERT_TAIL(head, elm, field) do { \ + CIRCLEQ_NEXT((elm), field) = (void *)(head); \ + CIRCLEQ_PREV((elm), field) = CIRCLEQ_LAST((head)); \ + if (CIRCLEQ_FIRST((head)) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = (elm); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_LAST((head)), field) = (elm); \ + CIRCLEQ_LAST((head)) = (elm); \ +} while (0) + +#define CIRCLEQ_LAST(head) ((head)->cqh_last) + +#define CIRCLEQ_NEXT(elm,field) ((elm)->field.cqe_next) + +#define CIRCLEQ_PREV(elm,field) ((elm)->field.cqe_prev) + +#define CIRCLEQ_REMOVE(head, elm, field) do { \ + if (CIRCLEQ_NEXT((elm), field) == (void *)(head)) \ + CIRCLEQ_LAST((head)) = CIRCLEQ_PREV((elm), field); \ + else \ + CIRCLEQ_PREV(CIRCLEQ_NEXT((elm), field), field) = \ + CIRCLEQ_PREV((elm), field); \ + if (CIRCLEQ_PREV((elm), field) == (void *)(head)) \ + CIRCLEQ_FIRST((head)) = CIRCLEQ_NEXT((elm), field); \ + else \ + CIRCLEQ_NEXT(CIRCLEQ_PREV((elm), field), field) = \ + CIRCLEQ_NEXT((elm), field); \ +} while (0) + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/include/ipath_service.h b/include/ipath_service.h new file mode 100644 index 0000000..72ac29e --- /dev/null +++ b/include/ipath_service.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_SERVICE_H +#define _IPATH_SERVICE_H + +// This file contains all the lowest level routines calling into sysfs +// and qib driver. All other calls are based on these routines. + +#include "ipath_intf.h" +#include "ipath_common.h" +#include "ipath_udebug.h" + +// any unit id to match. +#define IPATH_UNIT_ID_ANY ((long)-1) + +// Given the unit number and port, return an error, or the corresponding LID +// Returns an int, so -1 indicates an error. 0 indicates that +// the unit is valid, but no LID has been assigned. +int ipath_get_port_lid(int, int); + +// Given the unit number and port, return an error, or the corresponding GID +// Returns an int, so -1 indicates an error. +int ipath_get_port_gid(int, int, uint64_t *hi, uint64_t *lo); + +// Given the unit number, return an error, or the corresponding LMC value +// for the port +// Returns an int, so -1 indicates an error. 0 +int ipath_get_port_lmc(int unit, int port); + +// Given the unit number, return an error, or the corresponding link rate +// for the port +// Returns an int, so -1 indicates an error. +int ipath_get_port_rate(int unit, int port); + +// Given a unit, port and SL, return an error, or the corresponding VL for the +// SL as programmed by the SM +// Returns an int, so -1 indicates an error. +int ipath_get_port_sl2vl(int unit, int port, int sl); + +// get the number of units supported by the driver. Does not guarantee +// that a working chip has been found for each possible unit #. Returns +// -1 with errno set, or number of units >=0 (0 means none found). +int ipath_get_num_units(void); + +// get the number of contexts from the unit id. +// Returns 0 if no unit or no match. +int ipath_get_num_contexts(int unit); + +// Open ipath device file, return -1 on error. 
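+// For example (an illustrative sketch, not from the original source; the
+// unit/port/timeout values here are placeholders):
+//
+//	int fd = ipath_context_open(0, 0, 0);
+//	if (fd == -1)
+//		return -1;
+//	... issue ipath_cmd_assign_context()/ipath_cmd_write() on fd ...
+//	ipath_context_close(fd);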
+int ipath_context_open(int unit, int port, uint64_t open_timeout);
+void ipath_context_close(int fd);
+int ipath_cmd_write(int fd, struct ipath_cmd *, size_t count);
+int ipath_cmd_writev(int fd, const struct iovec *iov, int iovcnt);
+int ipath_cmd_assign_context(int fd, void *buf, size_t count);
+int ipath_cmd_user_init(int fd, void *buf, size_t count);
+
+int ipath_get_cc_settings_bin(int unit, int port, char *ccabuf);
+int ipath_get_cc_table_bin(int unit, int port, uint16_t **cctp);
+
+// we use mmap64() because we compile in both 32 and 64 bit mode,
+// and we have to map physical addresses that are > 32 bits long.
+// While Linux implements mmap64, it doesn't have a man page,
+// and isn't declared in any header file, so we declare it here ourselves.
+
+// We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and
+// redirect mmap to mmap64 for us, but at least through suse10 and fc4,
+// it doesn't work when the address being mapped is > 32 bits. It chips
+// off bits 32 and above. So we stay with mmap64.
+extern void *mmap64(void *, size_t, int, int, int, __off64_t);
+void *ipath_mmap64(void *, size_t, int, int, int, __off64_t);
+
+// Statistics maintained by the driver
+int infinipath_get_stats(uint64_t *, int);
+int infinipath_get_stats_names(char **namep);
+// Counters maintained in the chip, globally, and per-port
+int infinipath_get_ctrs_unit(int unitno, uint64_t *, int);
+int infinipath_get_ctrs_unit_names(int unitno, char **namep);
+int infinipath_get_ctrs_port(int unitno, int port, uint64_t *, int);
+int infinipath_get_ctrs_port_names(int unitno, char **namep);
+
+/* sysfs helper routines (only those currently used are exported;
+ * try to avoid using others) */
+
+/* base name of path (without unit #) for qib driver */
+#define QIB_CLASS_PATH "/sys/class/infiniband/qib"
+
+/* read a signed 64-bit quantity, in some arbitrary base */
+int ipath_sysfs_read_s64(const char *attr, int64_t *valp, int base);
+
+/* read a string value */
+int ipath_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr,
+	char **datap);
+
+/* open attribute in unit's sysfs directory via open(2) */
+int ipath_sysfs_unit_open(uint32_t unit, const char *attr, int flags);
+/* print to attribute in {unit,port} sysfs directory */
+int ipath_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr,
+	const char *fmt, ...)
+	__attribute__((format(printf, 4, 5)));
+int ipath_sysfs_unit_printf(uint32_t unit, const char *attr,
+	const char *fmt, ...)
+	__attribute__((format(printf, 3, 4)));
+
+int ipath_ipathfs_unit_write(uint32_t unit, const char *attr, const void *data,
+	size_t len);
+/* read up to one page of malloc'ed data (caller must free), returning
+   number of bytes read or -1 */
+int ipath_ipathfs_read(const char *attr, char **datap);
+int ipath_ipathfs_unit_read(uint32_t unit, const char *attr, char **data);
+/* read a signed 64-bit quantity, in some arbitrary base */
+int ipath_sysfs_unit_read_s64(uint32_t unit, const char *attr,
+	int64_t *valp, int base);
+int ipath_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr,
+	int64_t *valp, int base);
+/* these read directly into supplied buffer and take a count */
+int ipath_ipathfs_rd(const char *, void *, int);
+int ipath_ipathfs_unit_rd(uint32_t unit, const char *, void *, int);
+
+int ipath_ipathfs_open(const char *relname, int flags);
+
+/* wait for device special file to show up. timeout is in
+ * milliseconds, 0 is "callee knows best", < 0 is infinite.
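 *
 * For instance (an illustrative sketch; the device path and the
 * -1-on-error convention are assumptions in line with the rest of
 * this header):
 *
 *	if (ipath_wait_for_device("/dev/ipath", 5000) == -1)
 *		... the device did not appear within ~5 seconds ...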
*/ +int ipath_wait_for_device(const char *path, long timeout); + +int ipath_cmd_wait_for_packet(int fd); +int infinipath_get_unit_flash(int unit, char **datap); +int infinipath_put_unit_flash(int unit, char *data, int len); + +#endif // _IPATH_SERVICE_H diff --git a/include/ipath_udebug.h b/include/ipath_udebug.h new file mode 100644 index 0000000..bce2233 --- /dev/null +++ b/include/ipath_udebug.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_UDEBUG_H +#define _IPATH_UDEBUG_H + +#include +#include "ipath_debug.h" + +extern unsigned infinipath_debug; +const char *ipath_get_unit_name(int unit); +extern char *__progname; + +#if _IPATH_DEBUGGING + +extern char *__ipath_mylabel; +void ipath_set_mylabel(char *); +char *ipath_get_mylabel(); +extern FILE *__ipath_dbgout; + +#define _IPATH_UNIT_ERROR(unit,fmt,...) \ + do { \ + _Pragma_unlikely \ + printf("%s%s: " fmt, __ipath_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_ERROR(fmt,...) \ + do { \ + _Pragma_unlikely \ + printf("%s%s: " fmt, __ipath_mylabel, __progname, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_INFO(fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&__IPATH_INFO)) \ + printf("%s%s: " fmt, __ipath_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define __IPATH_PKTDBG_ON unlikely(infinipath_debug & __IPATH_PKTDBG) + +#define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&(which))) \ + fprintf(__ipath_dbgout, "%s%s: " fmt, __ipath_mylabel, __func__, \ + ##__VA_ARGS__); \ + } while(0) + +#define __IPATH_DBG_WHICH_NOFUNC(which,fmt,...) \ + do { \ + _Pragma_unlikely \ + if(unlikely(infinipath_debug&(which))) \ + fprintf(__ipath_dbgout, "%s" fmt, __ipath_mylabel, \ + ##__VA_ARGS__); \ + } while(0) + +#define _IPATH_DBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +#define _IPATH_VDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_VERBDBG,fmt,##__VA_ARGS__) +#define _IPATH_PDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_PKTDBG,fmt,##__VA_ARGS__) +#define _IPATH_EPDBG(fmt,...) 
__IPATH_DBG_WHICH(__IPATH_EPKTDBG,fmt,##__VA_ARGS__)
+#define _IPATH_PRDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_PROCDBG,fmt,##__VA_ARGS__)
+#define _IPATH_ENVDBG(lev,fmt,...) \
+	__IPATH_DBG_WHICH_NOFUNC( \
+		(lev==0) ? __IPATH_INFO : \
+		(lev>1?__IPATH_ENVDBG:(__IPATH_PROCDBG|__IPATH_ENVDBG)),\
+		"env " fmt,##__VA_ARGS__)
+#define _IPATH_MMDBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_MMDBG,fmt,##__VA_ARGS__)
+#define _IPATH_CCADBG(fmt,...) __IPATH_DBG_WHICH(__IPATH_CCADBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+#define _IPATH_UNIT_ERROR(unit,fmt,...) \
+	do { \
+		printf ("%s" fmt, "", ##__VA_ARGS__); \
+	} while(0)
+
+#define _IPATH_ERROR(fmt,...) \
+	do { \
+		printf ("%s" fmt, "", ##__VA_ARGS__); \
+	} while(0)
+
+#define _IPATH_INFO(fmt,...)
+
+#define __IPATH_PKTDBG_ON 0
+
+#define _IPATH_DBG(fmt,...)
+#define _IPATH_PDBG(fmt,...)
+#define _IPATH_EPDBG(fmt,...)
+#define _IPATH_PRDBG(fmt,...)
+#define _IPATH_VDBG(fmt,...)
+#define _IPATH_MMDBG(fmt,...)
+#define _IPATH_CCADBG(fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+#endif /* _IPATH_UDEBUG_H */
diff --git a/include/ipath_user.h b/include/ipath_user.h
new file mode 100644
index 0000000..3d120f0
--- /dev/null
+++ b/include/ipath_user.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_USER_H
+#define _IPATH_USER_H
+
+// This file contains all of the data structures and routines that are
+// publicly visible and usable (to low level infrastructure code; it is
+// not expected that any application, or even a normal application-level
+// library, will ever need to use any of this).
+
+// Additional entry points and data structures that are used by these routines
+// may be referenced in this file, but they should not be generally available;
+// they are visible here only to allow use in inlined functions. Any variable,
+// data structure, or function that starts with a leading "_" is in this
+// category.
+
+// Include header files we need that are unlikely to otherwise be needed by
+// programs.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "ipath_intf.h"
+#include "ipath_common.h"
+#include "ipath_byteorder.h"
+#include "ipath_udebug.h"
+#include "ipath_service.h"
+
+// interval timing routines
+// Convert a count of cycles to elapsed nanoseconds
+// this is only accurate for reasonably large numbers of cycles (at least tens)
+static __inline__ uint64_t cycles_to_nanosecs(uint64_t)
+	__attribute__ ((always_inline));
+// convert elapsed nanoseconds to elapsed cycles
+// this is only accurate for reasonably large numbers of nsecs (at least tens)
+static __inline__ uint64_t nanosecs_to_cycles(uint64_t)
+	__attribute__ ((always_inline));
+// get current count of nanoseconds from unspecified base value (only useful for
+// intervals)
+static __inline__ uint64_t get_nanoseconds() __attribute__ ((always_inline));
+
+// This block will eventually move to a separate file, but for now we'll leave
+// it here.
+typedef struct _ipath_dev {
+	int32_t spd_fd;
+	int32_t spd_type;	// ipath_type
+	volatile uint64_t *spd_uregbase; // mmap'ed to chip or virtual user regs
+	volatile uint64_t *spd_piobase; // mmap'ed access to chip PIO buffers
+	uint64_t __pad[8]; // placeholder for future binary compat expansion
+} ipath_dev;
+
+struct _ipath_ctrl {
+	ipath_dev spc_dev; // for use by "driver" code only; other code treats it as an opaque cookie.
+
+// some local storage used in certain conditions:
+// as storage for __ipath_rcvtidflow in ipath_userinit().
+	__le32 regs[INFINIPATH_TF_NFLOWS << 1];
+// as storage for __ipath_tidflow_wmb in ipath_userinit().
+	__le32 tidflow_wmb_location;
+// as storage for spi_sendbuf_status in ipath_userinit().
+	uint64_t sendbuf_status;
+// for ipath_check_unit_status(), ipath_proto.c
+	int lasterr;
+
+// location to which InfiniPath writes the rcvhdrtail
+// register whenever it changes, so that no chip registers are read in
+// the performance path.
+	volatile __le32 *__ipath_rcvtail;
+// address where ur_rcvhdrhead is written
+	volatile __le32 *__ipath_rcvhdrhead;
+// address where ur_rcvegrindexhead is written
+	volatile __le32 *__ipath_rcvegrhead;
+// address where ur_rcvegrindextail is read
+	volatile __le32 *__ipath_rcvegrtail;
+// number of eager buffers
+	uint32_t __ipath_tidegrcnt;
+// address where ur_rcvtidflow is written
+	volatile __le32 *__ipath_rcvtidflow;
+// Serialize writes to tidflow on QLE73XX
+	volatile __le32 *__ipath_tidflow_wmb;
+
+// save away spi_status for use in ipath_check_unit_status()
+	volatile __u64 *__ipath_spi_status;
+};
+
+// PIO write routines assume that the message header is always 56 bytes.
+#define IPATH_MESSAGE_HDR_SIZE 56
+// Usable bytes in header (hdrsize - lrh - bth)
+#define IPATH_MESSAGE_HDR_SIZE_IPATH (IPATH_MESSAGE_HDR_SIZE-20)
+// Must be same as PSM_CRC_SIZE_IN_BYTES in ips_proto_params.h
+#define IPATH_CRC_SIZE_IN_BYTES 8
+
+// After the device is opened, ipath_userinit() is called to give the driver the
+// parameters the user code wants to use, and to get the implementation values,
+// etc. back. 0 is returned on success, a positive value is a standard errno,
+// and a negative value is reserved for future use. The first argument is
+// the file descriptor returned by the device open.
+//
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this won't be fully
+// implemented initially. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned by this routine.
+struct _ipath_ctrl *ipath_userinit(int32_t, struct ipath_user_info *,
+	struct ipath_base_info *b);
+
+// don't inline these; it's all init code, and not inlining makes the
+// overall code shorter and easier to debug
+void ipath_touch_mmap(void *, size_t) __attribute__ ((noinline));
+
+int32_t ipath_update_tid_err(void);	// handle update tid errors out of line
+int32_t ipath_free_tid_err(void);	// handle free tid errors out of line
+
+// set the BTH pkey to check for this process.
+// This is for receive checks, not for sends. It isn't necessary
+// to set the default key, that's always allowed by the hardware.
+// If too many pkeys are in use for the hardware to support, this
+// will return EAGAIN, and the caller should then fail and exit,
+// or use the default key and check the pkey during received-packet
+// checking.
+int32_t ipath_set_pkey(struct _ipath_ctrl *, uint16_t);
+
+// flush the eager buffers by setting the
+// eager index head register == eager index tail, if queue is full
+void ipath_flush_egr_bufs(struct _ipath_ctrl *ctrl);
+
+int ipath_wait_for_packet(struct _ipath_ctrl *);
+
+// stop_start == 0 disables receive on the context, for use in queue overflow
+// conditions. stop_start == 1 re-enables, and returns the value of the tail
+// register, to be used to re-init the software copy of the head register
+int ipath_manage_rcvq(struct _ipath_ctrl *ctrl, uint32_t stop_start);
+
+// ctxt_bp == 0 disables fabric back pressure on the context.
+// ctxt_bp == 1 enables fabric back pressure on the context.
+int ipath_manage_bp(struct _ipath_ctrl *ctrl, uint8_t ctxt_bp);
+
+// enable == 1 enables armlaunch (normal), 0 disables (only used by
+// ipath_pkt_test -B at the moment, needed for linda).
+int ipath_armlaunch_ctrl(struct _ipath_ctrl *ctrl, uint32_t enable);
+
+// force an update of the PIOAvail register to memory
+int ipath_force_pio_avail_update(struct _ipath_ctrl *ctrl);
+
+// Disarm any send buffers which need disarming.
+int ipath_disarm_bufs(struct _ipath_ctrl *ctrl);
+
+// New user event mechanism, using spi_sendbuf_status IPATH_EVENT_* bits;
+// obsoletes ipath_disarm_bufs(), and extends it, although the old mechanism
+// remains for binary compatibility.
+int ipath_event_ack(struct _ipath_ctrl *ctrl, __u64 ackbits);
+
+// Return send dma's current "in flight" counter
+int ipath_sdma_inflight(struct _ipath_ctrl *ctrl, uint32_t *counter);
+
+// Return send dma's current "completion" counter
+int ipath_sdma_complete(struct _ipath_ctrl *ctrl, uint32_t *counter);
+
+// set whether we want an interrupt on all packets, or just urgent ones
+int ipath_poll_type(struct _ipath_ctrl *ctrl, uint16_t poll_type);
+
+static int32_t __inline__ ipath_free_tid(struct _ipath_ctrl *,
+	uint32_t, uint64_t)
+	__attribute__ ((always_inline));
+
+// check the unit status, and return an IPS_RC_* code if it is not in a
+// usable state. It will also print a message in that case.
+int ipath_check_unit_status(struct _ipath_ctrl *ctrl);
+
+// Statistics maintained by the driver
+const char * infinipath_get_next_name(char **names);
+uint64_t infinipath_get_single_stat(const char *attr, uint64_t *s);
+int infinipath_get_stats_names_count(void);
+// Counters maintained in the chip, globally, and per-port
+int infinipath_get_ctrs_unit_names_count(int unitno);
+int infinipath_get_ctrs_port_names_count(int unitno);
+
+uint64_t infinipath_get_single_unitctr(int unit, const char *attr, uint64_t *s);
+int infinipath_get_single_portctr(int unit, int port, const char *attr,
+	uint64_t *c);
+void infinipath_release_names(char *namep);
+
+// Syslog wrapper
+//
+// level is one of LOG_EMERG, LOG_ALERT, LOG_CRIT, LOG_ERR, LOG_WARNING,
+// LOG_NOTICE, LOG_INFO, LOG_DEBUG.
+//
+// prefix should be a short string describing which part of the software stack
+// is using syslog, e.g. "PSM", "mpi", "mpirun".
+//
+void ipath_syslog(const char *prefix, int to_console, int level,
+	const char *format, ...)
+	__attribute__((format(printf, 4, 5)));
+
+void ipath_vsyslog(const char *prefix, int to_console, int level,
+	const char *format, va_list ap);
+
+/* parameters for the PBC for the pio write routines; to avoid passing lots
+ * of args, we pass the structure pointer instead. */
+struct ipath_pio_params {
+	uint16_t length;
+	uint8_t vl;
+	uint8_t port;
+	uint32_t cksum_is_valid;
+	uint32_t cksum;
+	uint32_t rate;
+};
+
+// write pio buffers. The ipath_write_pio_force_order() version assumes
+// that the processor does not write store buffers to i/o devices in the
+// order in which they are written, and that when flushing partially
+// filled store buffers, the words are not ordered either. The ipath_write_pio()
+// form is used when the processor writes store buffers to i/o in the order
+// in which they are filled, and writes partially filled buffers in increasing
+// address order (assuming they are filled that way).
+// The arguments are pio buffer address, payload length, header, and payload.
+void ipath_write_pio_vector(volatile uint32_t *, const struct ipath_pio_params *,
+	void *, void *);
+void ipath_write_pio(volatile uint32_t *, const struct ipath_pio_params *,
+	void *, void *);
+void ipath_write_pio_force_order(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+
+#define IPATH_SPECIAL_TRIGGER_MAGIC 0xaebecede
+// IBA7220 can use a "Special" trigger. We write to the last dword
+// in the mapped SendBuf to trigger the launch.
+void ipath_write_pio_special_trigger2k(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+void ipath_write_pio_special_trigger4k(volatile uint32_t *,
+	const struct ipath_pio_params *, void *, void *);
+
+/*
+ * Copy routine that may copy a byte multiple times, but is optimized for
+ * throughput. This is not safe to use for PIO routines where we want a
+ * guarantee that a byte is only copied/moved across the bus once.
+ */
+void ipath_dwordcpy(volatile uint32_t *dest, const uint32_t * src, uint32_t ndwords);
+
+/*
+ * Safe version of ipath_dwordcpy that is guaranteed to copy each byte only once.
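 *
 * Illustrative contrast between the two (a sketch, not from the original
 * source; "piobuf", "dst", "src" and "ndwords" are placeholders):
 *
 *	uint32_t hdr[IPATH_MESSAGE_HDR_SIZE >> 2];
 *	ipath_dwordcpy_safe(piobuf, hdr, IPATH_MESSAGE_HDR_SIZE >> 2);
 *		-- PIO path: each byte must cross the bus exactly once
 *	ipath_dwordcpy(dst, src, ndwords);
 *		-- plain memory copy: bytes may be re-copied for throughput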
+*/ +#if defined(__x86_64__) +void ipath_dwordcpy_safe(volatile uint32_t *dest, const uint32_t * src, uint32_t ndwords); +#else +#define ipath_dwordcpy_safe ipath_dwordcpy +#endif + +// From here to the end of the file are implementation details that should not +// be used outside this file (other than to call the function), except in the +// one infrastructure file in which they are defined. + +// NOTE: doing paired 32 bit writes to the chip to store 64 bit values (as from +// 32 bit programs) will not work correctly, because there is no sub-qword address +// decode. Therefore 32 bit programs use only a single 32 bit store; the head +// register values are all less than 32 bits, anyway. Given that, we use +// only 32 bits even for 64 bit programs, for simplicity. These functions must +// not be called until after ipath_userinit() is called. +// The ctrl argument is currently unused, but remains useful for adding +// debug code. + +static __inline__ void ipath_put_rcvegrindexhead(struct _ipath_ctrl *ctrl, + uint32_t val) +{ + *ctrl->__ipath_rcvegrhead = __cpu_to_le32(val); +} + +static __inline__ void ipath_put_rcvhdrhead(struct _ipath_ctrl *ctrl, + uint32_t val) +{ + *ctrl->__ipath_rcvhdrhead = __cpu_to_le32(val); +} + +static __inline__ uint32_t ipath_get_rcvhdrtail(struct _ipath_ctrl *ctrl) +{ + uint32_t res = __le32_to_cpu(*ctrl->__ipath_rcvtail); + ips_rmb(); + return res; +} + +static __inline__ void ipath_tidflow_set_entry(struct _ipath_ctrl *ctrl, + uint32_t flowid, uint8_t genval, uint16_t seqnum) +{ + ctrl->__ipath_rcvtidflow[flowid << 1] = __cpu_to_le32( + (1 << INFINIPATH_TF_ISVALID_SHIFT) | + (1 << INFINIPATH_TF_ENABLED_SHIFT) | + (1 << INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) | + (1 << INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT) | + (genval << INFINIPATH_TF_GENVAL_SHIFT) | + ((seqnum & INFINIPATH_TF_SEQNUM_MASK) << INFINIPATH_TF_SEQNUM_SHIFT)); + /* Write a read-only register to act as a delay between tidflow writes */ + *ctrl->__ipath_tidflow_wmb = 0; +} + +static __inline__ void ipath_tidflow_reset(struct _ipath_ctrl *ctrl, + uint32_t flowid) +{ + ctrl->__ipath_rcvtidflow[flowid << 1] = __cpu_to_le32( + (1 << INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) | + (1 << INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT)); + /* Write a read-only register to act as a delay between tidflow writes */ + *ctrl->__ipath_tidflow_wmb = 0; +} + +/* + * This should only be used for debugging. + * Normally, we shouldn't read the chip. 
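 *
 * Debug-only decoding sketch (illustrative, not from the original source):
 *
 *	uint32_t v = ipath_tidflow_get(ctrl, flowid);
 *	if (ipath_tidflow_get_isvalid(v))
 *		printf("flow %u: seq %u gen %u\n", flowid,
 *		       ipath_tidflow_get_seqnum(v),
 *		       ipath_tidflow_get_genval(v));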
+ */ +static __inline__ uint32_t ipath_tidflow_get(struct _ipath_ctrl *ctrl, + uint32_t flowid) +{ + return __le32_to_cpu(ctrl->__ipath_rcvtidflow[flowid << 1]); +} + +static __inline__ uint32_t ipath_tidflow_get_seqmismatch(uint32_t val) +{ + return (val >> INFINIPATH_TF_STATUS_SEQMISMATCH_SHIFT) & + INFINIPATH_TF_STATUS_SEQMISMATCH_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_genmismatch(uint32_t val) +{ + return (val >> INFINIPATH_TF_STATUS_GENMISMATCH_SHIFT) & + INFINIPATH_TF_STATUS_GENMISMATCH_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_isvalid(uint32_t val) +{ + return (val >> INFINIPATH_TF_ISVALID_SHIFT) & INFINIPATH_TF_ISVALID_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_seqnum(uint32_t val) +{ + return (val >> INFINIPATH_TF_SEQNUM_SHIFT) & INFINIPATH_TF_SEQNUM_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_genval(uint32_t val) +{ + return (val >> INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_enabled(uint32_t val) +{ + return (val >> INFINIPATH_TF_ENABLED_SHIFT) & INFINIPATH_TF_ENABLED_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_keep_after_seqerr(uint32_t val) +{ + return (val >> INFINIPATH_TF_KEEP_AFTER_SEQERR_SHIFT) & + INFINIPATH_TF_KEEP_AFTER_SEQERR_MASK; +} + +static __inline__ uint32_t ipath_tidflow_get_keep_after_generr(uint32_t val) +{ + return (val >> INFINIPATH_TF_KEEP_AFTER_GENERR_SHIFT) & + INFINIPATH_TF_KEEP_AFTER_GENERR_MASK; +} + +/* + * This should only be used by a process to write the eager index into + * a subcontext's eager header entry. + */ +static __inline__ void ipath_hdrset_index(__le32 *rbuf, uint32_t val) +{ + rbuf[0] = + (rbuf[0] & + __cpu_to_le32(~(INFINIPATH_RHF_EGRINDEX_MASK << + INFINIPATH_RHF_EGRINDEX_SHIFT))) | + __cpu_to_le32((val & INFINIPATH_RHF_EGRINDEX_MASK) << + INFINIPATH_RHF_EGRINDEX_SHIFT); +} + +/* + * This should only be used by a process to update the receive header + * error flags. + */ +static __inline__ void ipath_hdrset_err_flags(__le32 *rbuf, uint32_t val) +{ + rbuf[1] |= __cpu_to_le32(val); +} + +/* + * This should only be used by a process to write the rhf seq number into + * a subcontext's eager header entry. + */ +static __inline__ void ipath_hdrset_seq(__le32 *rbuf, uint32_t val) +{ + rbuf[1] = + (rbuf[1] & + __cpu_to_le32(~(INFINIPATH_RHF_SEQ_MASK << + INFINIPATH_RHF_SEQ_SHIFT))) | + __cpu_to_le32((val & INFINIPATH_RHF_SEQ_MASK) << + INFINIPATH_RHF_SEQ_SHIFT); +} + +// Manage TID entries. It is possible that not all entries +// requested may be allocated. A matching ipath_free_tid() must be +// done for each ipath_update_tid(), because currently no caching or +// reuse of expected tid entries is allowed, to work around malloc/free +// and mmap/munmap issues. The driver decides which TID entries to allocate. +// If ipath_free_tid is called to free entries in use by a different +// send by the same process, data corruption will probably occur, +// but only within that process, not for other processes. + +// update tidcnt expected TID entries from the array pointed to by tidinfo. +// Returns 0 on success, else an errno. 
See full description at declaration +static int32_t __inline__ ipath_update_tid(struct _ipath_ctrl *ctrl, + uint32_t tidcnt, uint64_t tidlist, + uint64_t vaddr, uint64_t tidmap) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_TID_UPDATE; + + cmd.cmd.tid_info.tidcnt = tidcnt; // number of tid entries to do + cmd.cmd.tid_info.tidlist = tidlist; // driver copies tids back directly to this + cmd.cmd.tid_info.tidvaddr = vaddr; // base address for this send to map + cmd.cmd.tid_info.tidmap = tidmap; // driver copies directly to this + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) + return ipath_update_tid_err(); + return 0; +} + +static int32_t __inline__ ipath_free_tid(struct _ipath_ctrl *ctrl, + uint32_t tidcnt, uint64_t tidmap) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_TID_FREE; + + cmd.cmd.tid_info.tidcnt = tidcnt; + cmd.cmd.tid_info.tidmap = tidmap; // driver copies from this + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) + return ipath_free_tid_err(); + return 0; +} + +extern uint32_t __ipath_pico_per_cycle; // only for use in these functions + +// this is only accurate for reasonably large numbers of cycles (at least tens) +static __inline__ uint64_t cycles_to_nanosecs(uint64_t cycs) +{ + return (__ipath_pico_per_cycle * cycs) / 1000ULL; +} + +// this is only accurate for reasonably large numbers of nsecs (at least tens) +static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) +{ + return (ns * 1000ULL) / __ipath_pico_per_cycle; +} + +static __inline__ uint64_t get_nanoseconds() +{ + return cycles_to_nanosecs(get_cycles()); +} + +// open the diags device, if supported by driver. Returns 0 on +// success, errno on failure. Also tells driver that diags +// is active, which changes some driver behavior +int ipath_diag_open(unsigned); // unit +int ipath_diag_close(void); + +// diags chip read and write routines + +int ipathd_read32(uint64_t reg_offset, uint32_t * read_valp); +int ipathd_write32(uint64_t reg_offset, uint32_t write_val); + +int ipathd_readmult(uint64_t, unsigned, uint64_t *); // chip: offset, cnt, ptr +int ipathd_write(uint64_t, uint64_t); // chip: offset, value + +#define IPATH_READ_EEPROM 31337 +#define IPATH_WRITE_EEPROM 101 + +struct ipath_eeprom_req { + void *addr; + uint16_t len; + uint16_t offset; +}; + +int ipathd_send_pkt(const void *, unsigned); // send a packet for diags +int ipathd_read_i2c(struct ipath_eeprom_req *); // diags read i2c flash + +__u8 ipath_flash_csum(struct ipath_flash *, int); + +int ipathd_reset_hardware(uint32_t); + +int ipath_hideous_ioctl_emulator(int unit, int reqtype, + struct ipath_eeprom_req *req); + +#endif // _IPATH_USER_H diff --git a/include/linux-i386/bit_ops.h b/include/linux-i386/bit_ops.h new file mode 100644 index 0000000..ca8b80f --- /dev/null +++ b/include/linux-i386/bit_ops.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_i386_BIT_OPS_H +#define _IPATH_i386_BIT_OPS_H + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m" (*addr) : "dIr" (nr) : "memory"); + return oldbit; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btrl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btcl %1,%0" : "=m" (*addr) : "dIr"(nr)); +} + +static __inline__ int ips___test_and_set_bit(int nr, + volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btsl %2,%1\n\tsbbl %0,%0" : "=r" (oldbit), + "=m" (*addr) : "dIr" (nr) : "memory"); + return oldbit; +} + +#endif /* _IPATH_i386_BIT_OPS_H */ diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h new file mode 100644 index 0000000..ef99d1d --- /dev/null +++ b/include/linux-i386/sysdep.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_i386_SYSDEP_H +#define _IPATH_i386_SYSDEP_H + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + uint32_t a,d; + + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + v = ((uint64_t)a) | (((uint64_t)d)<<32); + + return v; +} + +#ifndef LOCK_PREFIX +#define LOCK_PREFIX "lock " +#endif + +static __inline__ void ips_mb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("mfence" : : : "memory"); +#endif +} + +/* gcc-3.4 has a bug with this function body at -O0 */ +static +#if defined(__GNUC__) && !defined(__PATHCC__) && __GNUC__==3 && __GNUC_MINOR__==4 +#else +__inline__ +#endif +void ips_rmb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("" : : : "memory"); +#endif +} + +static __inline__ void ips_wmb() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("sfence" : : : "memory"); +#endif +} + +static __inline__ void ips_sync_writes() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("sfence" : : : "memory"); +#endif +} + +static __inline__ void ips_sync_reads() +{ +#ifdef __MIC__ + asm volatile("lock; addl $0,0(%%rsp)" ::: "memory"); +#else + asm volatile("lfence" : : : "memory"); +#endif +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr, + uint32_t old, uint32_t new) +{ + uint32_t prev; + struct xchg_dummy { uint32_t a[100]; }; + + asm volatile(LOCK_PREFIX "cmpxchgl %1,%2" + : "=a"(prev) + : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old) + : "memory"); + + return prev; +} + +typedef struct { volatile int32_t counter; } ips_atomic_t; + +#define ips_atomic_set(v,i) (((v)->counter) = (i)) +#define ips_atomic_cmpxchg(p,oval,nval) \ + ips_cmpxchg((volatile uint32_t *) &((p)->counter),oval,nval) + +#if 0 +static __inline__ int32_t +ips_cmpxchg(volatile int32_t *p, int32_t old_value, int32_t new_value) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a" (old_value) : + "r" (new_value) : + "memory"); + return old_value; +} +#endif + +#endif /* _IPATH_i386_SYSDEP_H */ diff --git a/include/linux-ppc/bit_ops.h b/include/linux-ppc/bit_ops.h new file mode 100644 index 0000000..0326bb4 --- /dev/null +++ b/include/linux-ppc/bit_ops.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_ppc64_BIT_OPS_H +#define _IPATH_ppc64_BIT_OPS_H + +#if defined(__powerpc64__) +# define _NRMASK 63 +# define _NRSHIFT 6 +# define _NRSWIZZ 0 +# define _LLARX "ldarx " +# define _STLCX "stdcx. " +#else +# define _NRMASK 31 +# define _NRSHIFT 5 +# define _NRSWIZZ 1 +# define _LLARX "lwarx " +# define _STLCX "stwcx. " +#endif + +static __inline__ unsigned long ips___nrmask(int nr) +{ + return 1UL << (nr & _NRMASK); +} + +static __inline__ int ips___nroffset(int nr) +{ + return (nr >> _NRSHIFT) ^ _NRSWIZZ; +} + +static __inline__ void ips_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( +"1:" _LLARX "%0,0,%3 \n" + "andc %0,%0,%2 \n" + _STLCX "%0,0,%3 \n" + "bne- 1b" + : "=&r" (old), "=m" (*p) + : "r" (mask), "r" (p), "m" (*p) + : "cc"); +} + +static __inline__ void ips_change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( +"1:" _LLARX "%0,0,%3 \n" + "xor %0,%0,%2 \n" + _STLCX "%0,0,%3 \n" + "bne- 1b" + : "=&r" (old), "=m" (*p) + : "r" (mask), "r" (p), "m" (*p) + : "cc"); +} + +static __inline__ int ips_test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long old, t; + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + __asm__ __volatile__( + "eieio \n" +"1:" _LLARX "%0,0,%3 \n" + "or %1,%0,%2 \n" + _STLCX "%1,0,%3 \n" + "bne- 1b \n" + "sync" + : "=&r" (old), "=&r" (t) + : "r" (mask), "r" (p) + : "cc", "memory"); + + return (old & mask) != 0; +} + +static __inline__ void ips___clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + *p &= ~mask; +} + +static __inline__ void ips___change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr + ips___nroffset(nr); + + *p ^= mask; +} + +static __inline__ int ips___test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = ips___nrmask(nr); + volatile unsigned long *p = addr 
+ ips___nroffset(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +#undef _NRMASK +#undef _NRSHIFT +#undef _NRSWIZZ +#undef _LLARX +#undef _STLCX + +#endif /* _IPATH_ppc64_BIT_OPS_H */ diff --git a/include/linux-ppc/sysdep.h b/include/linux-ppc/sysdep.h new file mode 100644 index 0000000..604096e --- /dev/null +++ b/include/linux-ppc/sysdep.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_ppc64_SYSDEP_H +#define _IPATH_ppc64_SYSDEP_H + +static __inline__ uint64_t get_cycles(void) +{ + uint64_t v; + +#if __WORDSIZE == 64 + asm volatile("mftb %0" : "=r" (v) : ); +#else + uint32_t vu0, vu1, vl; + do { + asm volatile("mftbu %0" : "=r" (vu0) : ); + asm volatile("mftb %0" : "=r" (vl) : ); + asm volatile("mftbu %0" : "=r" (vu1) : ); + } while ( vu0 != vu1 ); + + v = vu1; + v <<= 32; + v |= vl; +#endif + + return v; +} + +static __inline__ void ips_mb() +{ + asm volatile ("sync" : : : "memory"); +} + +static __inline__ void ips_rmb() +{ + asm volatile ("lwsync" : : : "memory"); +} + +static __inline__ void ips_wmb() +{ + asm volatile ("eieio" : : : "memory"); +} + +static __inline__ void ips_sync_writes() +{ + asm volatile("lwsync" : : : "memory"); +} + +static __inline__ void ips_sync_reads() +{ + asm volatile("isync" : : : "memory"); +} + +static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *p, uint32_t old, + uint32_t new) +{ + uint32_t prev; + + __asm__ __volatile__ ("\n\ +1: lwarx %0,0,%2 \n\ + cmpw 0,%0,%3 \n\ + bne 2f \n\ + stwcx. %4,0,%2 \n\ + bne- 1b\n\ + sync\n\ +2:" + : "=&r" (prev), "=m" (*p) + : "r" (p), "r" (old), "r" (new), "m" (*p) + : "cc", "memory"); + + return prev; +} + +#endif /* _IPATH_ppc64_SYSDEP_H */ diff --git a/include/valgrind/memcheck.h b/include/valgrind/memcheck.h new file mode 100644 index 0000000..2cbb460 --- /dev/null +++ b/include/valgrind/memcheck.h @@ -0,0 +1,279 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (memcheck.h) only. 
The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of MemCheck, a heavyweight Valgrind tool for + detecting memory errors. + + Copyright (C) 2000-2007 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (memcheck.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __MEMCHECK_H +#define __MEMCHECK_H + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query memory permissions + inside your own programs. + + See comment near the top of valgrind.h on how to use them. +*/ + +#include "valgrind.h" + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/ +typedef + enum { + VG_USERREQ__MAKE_MEM_NOACCESS = VG_USERREQ_TOOL_BASE('M','C'), + VG_USERREQ__MAKE_MEM_UNDEFINED, + VG_USERREQ__MAKE_MEM_DEFINED, + VG_USERREQ__DISCARD, + VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE, + VG_USERREQ__CHECK_MEM_IS_DEFINED, + VG_USERREQ__DO_LEAK_CHECK, + VG_USERREQ__COUNT_LEAKS, + + VG_USERREQ__GET_VBITS, + VG_USERREQ__SET_VBITS, + + VG_USERREQ__CREATE_BLOCK, + + VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, + + /* This is just for memcheck's internal use - don't use it */ + _VG_USERREQ__MEMCHECK_RECORD_OVERLAP_ERROR + = VG_USERREQ_TOOL_BASE('M','C') + 256 + } Vg_MemCheckClientRequest; + + +/* Client-code macros to manipulate the state of memory. */ + +/* Mark memory at _qzz_addr as unaddressable for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_NOACCESS(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_NOACCESS, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similarly, mark memory at _qzz_addr as addressable but undefined + for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_UNDEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_UNDEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similarly, mark memory at _qzz_addr as addressable and defined + for _qzz_len bytes. */ +#define VALGRIND_MAKE_MEM_DEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_DEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Similar to VALGRIND_MAKE_MEM_DEFINED except that addressability is + not altered: bytes which are addressable are marked as defined, + but those which are not addressable are left unchanged. */ +#define VALGRIND_MAKE_MEM_DEFINED_IF_ADDRESSABLE(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__MAKE_MEM_DEFINED_IF_ADDRESSABLE, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Create a block-description handle. The description is an ascii + string which is included in any messages pertaining to addresses + within the specified memory range. Has no other effect on the + properties of the memory range. */ +#define VALGRIND_CREATE_BLOCK(_qzz_addr,_qzz_len, _qzz_desc) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__CREATE_BLOCK, \ + _qzz_addr, _qzz_len, _qzz_desc, \ + 0, 0); \ + _qzz_res; \ + })) + +/* Discard a block-description-handle. Returns 1 for an + invalid handle, 0 for a valid handle. */ +#define VALGRIND_DISCARD(_qzz_blkindex) \ + (__extension__ ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* default return */, \ + VG_USERREQ__DISCARD, \ + 0, _qzz_blkindex, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Client-code macros to check the state of memory. */ + +/* Check that memory at _qzz_addr is addressable for _qzz_len bytes. + If suitable addressibility is not established, Valgrind prints an + error message and returns the address of the first offending byte. + Otherwise it returns zero. 
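
   For example (an illustrative sketch, not part of the original header):

      char *p = malloc(64);
      VALGRIND_MAKE_MEM_NOACCESS(p + 32, 32);
      if (VALGRIND_CHECK_MEM_IS_ADDRESSABLE(p, 64) != 0)
         ... an error was reported; the first offending byte is p + 32 ...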
*/ +#define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CHECK_MEM_IS_ADDRESSABLE,\ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Check that memory at _qzz_addr is addressable and defined for + _qzz_len bytes. If suitable addressibility and definedness are not + established, Valgrind prints an error message and returns the + address of the first offending byte. Otherwise it returns zero. */ +#define VALGRIND_CHECK_MEM_IS_DEFINED(_qzz_addr,_qzz_len) \ + (__extension__({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CHECK_MEM_IS_DEFINED, \ + _qzz_addr, _qzz_len, 0, 0, 0); \ + _qzz_res; \ + })) + +/* Use this macro to force the definedness and addressibility of an + lvalue to be checked. If suitable addressibility and definedness + are not established, Valgrind prints an error message and returns + the address of the first offending byte. Otherwise it returns + zero. */ +#define VALGRIND_CHECK_VALUE_IS_DEFINED(__lvalue) \ + VALGRIND_CHECK_MEM_IS_DEFINED( \ + (volatile unsigned char *)&(__lvalue), \ + (unsigned int)(sizeof (__lvalue))) + +/* Do a memory leak check mid-execution. */ +#define VALGRIND_DO_LEAK_CHECK \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DO_LEAK_CHECK, \ + 0, 0, 0, 0, 0); \ + } + +/* Just display summaries of leaked memory, rather than all the + details */ +#define VALGRIND_DO_QUICK_LEAK_CHECK \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DO_LEAK_CHECK, \ + 1, 0, 0, 0, 0); \ + } + +/* Return number of leaked, dubious, reachable and suppressed bytes found by + all previous leak checks. They must be lvalues. */ +#define VALGRIND_COUNT_LEAKS(leaked, dubious, reachable, suppressed) \ + /* For safety on 64-bit platforms we assign the results to private + unsigned long variables, then assign these to the lvalues the user + specified, which works no matter what type 'leaked', 'dubious', etc + are. We also initialise '_qzz_leaked', etc because + VG_USERREQ__COUNT_LEAKS doesn't mark the values returned as + initialised. */ \ + {unsigned int _qzz_res; \ + unsigned long _qzz_leaked = 0, _qzz_dubious = 0; \ + unsigned long _qzz_reachable = 0, _qzz_suppressed = 0; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__COUNT_LEAKS, \ + &_qzz_leaked, &_qzz_dubious, \ + &_qzz_reachable, &_qzz_suppressed, 0); \ + leaked = _qzz_leaked; \ + dubious = _qzz_dubious; \ + reachable = _qzz_reachable; \ + suppressed = _qzz_suppressed; \ + } + +/* Get the validity data for addresses [zza..zza+zznbytes-1] and copy it + into the provided zzvbits array. Return values: + 0 if not running on valgrind + 1 success + 2 [previously indicated unaligned arrays; these are now allowed] + 3 if any parts of zzsrc/zzvbits are not addressable. + The metadata is not copied in cases 0, 2 or 3 so it should be + impossible to segfault your system by using this call. +*/ +#define VALGRIND_GET_VBITS(zza,zzvbits,zznbytes) \ + (__extension__({unsigned int _qzz_res; \ + char* czza = (char*)zza; \ + char* czzvbits = (char*)zzvbits; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__GET_VBITS, \ + czza, czzvbits, zznbytes, 0, 0 ); \ + _qzz_res; \ + })) + +/* Set the validity data for addresses [zza..zza+zznbytes-1], copying it + from the provided zzvbits array. 
Return values: + 0 if not running on valgrind + 1 success + 2 [previously indicated unaligned arrays; these are now allowed] + 3 if any parts of zza/zzvbits are not addressable. + The metadata is not copied in cases 0, 2 or 3 so it should be + impossible to segfault your system by using this call. +*/ +#define VALGRIND_SET_VBITS(zza,zzvbits,zznbytes) \ + (__extension__({unsigned int _qzz_res; \ + char* czza = (char*)zza; \ + char* czzvbits = (char*)zzvbits; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__SET_VBITS, \ + czza, czzvbits, zznbytes, 0, 0 ); \ + _qzz_res; \ + })) + +#endif + diff --git a/include/valgrind/valgrind.h b/include/valgrind/valgrind.h new file mode 100644 index 0000000..7b82f83 --- /dev/null +++ b/include/valgrind/valgrind.h @@ -0,0 +1,3914 @@ +/* -*- c -*- + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (valgrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 2, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2000-2007 Julian Seward. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (valgrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 2. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +/* This file is for inclusion into client (your!) code. + + You can use these macros to manipulate and query Valgrind's + execution inside your own programs. 
+
+ The resulting executables will still run without Valgrind, just a
+ little bit more slowly than they otherwise would, but otherwise
+ unchanged. When not running on valgrind, each client request
+ consumes very few (e.g. 7) instructions, so the resulting performance
+ loss is negligible unless you plan to execute client requests
+ millions of times per second. Nevertheless, if that is still a
+ problem, you can compile with the NVALGRIND symbol defined (gcc
+ -DNVALGRIND) so that client requests are not even compiled in. */
+
+#ifndef __VALGRIND_H
+#define __VALGRIND_H
+
+#include <stdarg.h>
+
+/* Nb: this file might be included in a file compiled with -ansi. So
+ we can't use C++ style "//" comments nor the "asm" keyword (instead
+ use "__asm__"). */
+
+/* Derive some tags indicating what the target platform is. Note
+ that in this file we're using the compiler's CPP symbols for
+ identifying architectures, which are different to the ones we use
+ within the rest of Valgrind. Note, __powerpc__ is active for both
+ 32 and 64-bit PPC, whereas __powerpc64__ is only active for the
+ latter (on Linux, that is). */
+#undef PLAT_x86_linux
+#undef PLAT_amd64_linux
+#undef PLAT_ppc32_linux
+#undef PLAT_ppc64_linux
+#undef PLAT_ppc32_aix5
+#undef PLAT_ppc64_aix5
+
+#if !defined(_AIX) && defined(__i386__)
+# define PLAT_x86_linux 1
+#elif !defined(_AIX) && defined(__x86_64__)
+# define PLAT_amd64_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && !defined(__powerpc64__)
+# define PLAT_ppc32_linux 1
+#elif !defined(_AIX) && defined(__powerpc__) && defined(__powerpc64__)
+# define PLAT_ppc64_linux 1
+#elif defined(_AIX) && defined(__64BIT__)
+# define PLAT_ppc64_aix5 1
+#elif defined(_AIX) && !defined(__64BIT__)
+# define PLAT_ppc32_aix5 1
+#endif
+
+/* If we're not compiling for our target platform, don't generate
+ any inline asms. */
+#if !defined(PLAT_x86_linux) && !defined(PLAT_amd64_linux) \
+ && !defined(PLAT_ppc32_linux) && !defined(PLAT_ppc64_linux) \
+ && !defined(PLAT_ppc32_aix5) && !defined(PLAT_ppc64_aix5)
+# if !defined(NVALGRIND)
+# define NVALGRIND 1
+# endif
+#endif
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */
+/* in here of use to end-users -- skip to the next section. */
+/* ------------------------------------------------------------------ */
+
+#if defined(NVALGRIND)
+
+/* Define NVALGRIND to completely remove the Valgrind magic sequence
+ from the compiled code (analogous to NDEBUG's effects on
+ assert()) */
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ { \
+ (_zzq_rlval) = (_zzq_default); \
+ }
+
+#else /* ! NVALGRIND */
+
+/* The following defines the magic code sequences which the JITter
+ spots and handles magically. Don't look too closely at them as
+ they will rot your brain.
+
+ The assembly code sequences for all architectures are in this one
+ file. This is because this file must be stand-alone, and we don't
+ want to have multiple files.
+
+ For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default
+ value gets put in the return slot, so that everything works when
+ this is executed not under Valgrind. Args are passed in a memory
+ block, and so there's no intrinsic limit to the number that could
+ be passed, but it's currently five.
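+
+   As a concrete sketch, this is in essence what the RUNNING_ON_VALGRIND
+   macro defined further down expands to:
+
+      unsigned int _qzz_res;
+      VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* returned on a real CPU */,
+                                 VG_USERREQ__RUNNING_ON_VALGRIND,
+                                 0, 0, 0, 0, 0);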
+ + The macro args are: + _zzq_rlval result lvalue + _zzq_default default value (result returned when running on real CPU) + _zzq_request request code + _zzq_arg1..5 request params + + The other two macros are used to support function wrapping, and are + a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the + guest's NRADDR pseudo-register and whatever other information is + needed to safely run the call original from the wrapper: on + ppc64-linux, the R2 value at the divert point is also needed. This + information is abstracted into a user-visible type, OrigFn. + + VALGRIND_CALL_NOREDIR_* behaves the same as the following on the + guest, but guarantees that the branch instruction will not be + redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: + branch-and-link-to-r11. VALGRIND_CALL_NOREDIR is just text, not a + complete inline asm, since it needs to be combined with more magic + inline asm stuff to be useful. +*/ + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "roll $3, %%edi ; roll $13, %%edi\n\t" \ + "roll $29, %%edi ; roll $19, %%edi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned int _zzq_args[6]; \ + volatile unsigned int _zzq_result; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EDX = client_request ( %EAX ) */ \ + "xchgl %%ebx,%%ebx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %EAX = guest_NRADDR */ \ + "xchgl %%ecx,%%ecx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_EAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%EAX */ \ + "xchgl %%edx,%%edx\n\t" +#endif /* PLAT_x86_linux */ + +/* ------------------------ amd64-linux ------------------------ */ + +#if defined(PLAT_amd64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ + "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + { volatile unsigned long long int _zzq_args[6]; \ + volatile unsigned long long int _zzq_result; \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RDX = client_request ( %RAX ) */ \ + "xchgq %%rbx,%%rbx" \ + : "=d" (_zzq_result) \ + : "a" (&_zzq_args[0]), "0" (_zzq_default) \ + : "cc", "memory" \ + ); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + volatile unsigned long long int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %RAX = guest_NRADDR */ \ + "xchgq %%rcx,%%rcx" \ + : "=a" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_CALL_NOREDIR_RAX \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* call-noredir *%RAX */ \ + "xchgq %%rdx,%%rdx\n\t" +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[6]; \ + unsigned int _zzq_result; \ + unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 3,%1\n\t" /*default*/ \ + "mr 4,%2\n\t" /*ptr*/ \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" /*result*/ \ + : "=b" (_zzq_result) \ + : "b" (_zzq_default), "b" (_zzq_ptr) \ + : "cc", "memory", "r3", "r4"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "cc", "memory", "r3" \ + ); \ + _zzq_orig->nraddr = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ + "rotldi 0,0,61 ; rotldi 0,0,51\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned long long int _zzq_args[6]; \ + register unsigned long long int _zzq_result __asm__("r3"); \ + register unsigned long long int* _zzq_ptr __asm__("r4"); \ + _zzq_args[0] = (unsigned long long int)(_zzq_request); \ + _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1" \ + : "=r" (_zzq_result) \ + : "0" (_zzq_default), "r" (_zzq_ptr) \ + : "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned long long int __addr __asm__("r3"); \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4" \ + : "=r" (__addr) \ + : \ + : "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +typedef + struct { + unsigned int nraddr; /* where's the code? */ + unsigned int r2; /* what tocptr do we need? 
*/ + } + OrigFn; + +#define __SPECIAL_INSTRUCTION_PREAMBLE \ + "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ + "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" + +#define VALGRIND_DO_CLIENT_REQUEST( \ + _zzq_rlval, _zzq_default, _zzq_request, \ + _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ + \ + { unsigned int _zzq_args[7]; \ + register unsigned int _zzq_result; \ + register unsigned int* _zzq_ptr; \ + _zzq_args[0] = (unsigned int)(_zzq_request); \ + _zzq_args[1] = (unsigned int)(_zzq_arg1); \ + _zzq_args[2] = (unsigned int)(_zzq_arg2); \ + _zzq_args[3] = (unsigned int)(_zzq_arg3); \ + _zzq_args[4] = (unsigned int)(_zzq_arg4); \ + _zzq_args[5] = (unsigned int)(_zzq_arg5); \ + _zzq_args[6] = (unsigned int)(_zzq_default); \ + _zzq_ptr = _zzq_args; \ + __asm__ volatile("mr 4,%1\n\t" \ + "lwz 3, 24(4)\n\t" \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = client_request ( %R4 ) */ \ + "or 1,1,1\n\t" \ + "mr %0,3" \ + : "=b" (_zzq_result) \ + : "b" (_zzq_ptr) \ + : "r3", "r4", "cc", "memory"); \ + _zzq_rlval = _zzq_result; \ + } + +#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ + { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ + register unsigned int __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR */ \ + "or 2,2,2\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->nraddr = __addr; \ + __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ + /* %R3 = guest_NRADDR_GPR2 */ \ + "or 4,4,4\n\t" \ + "mr %0,3" \ + : "=b" (__addr) \ + : \ + : "r3", "cc", "memory" \ + ); \ + _zzq_orig->r2 = __addr; \ + } + +#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + __SPECIAL_INSTRUCTION_PREAMBLE \ + /* branch-and-link-to-noredir *%R11 */ \ + "or 3,3,3\n\t" + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +typedef + struct { + unsigned long long int nraddr; /* where's the code? */ + unsigned long long int r2; /* what tocptr do we need? 
*/
+ }
+ OrigFn;
+
+#define __SPECIAL_INSTRUCTION_PREAMBLE \
+ "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \
+ "rotldi 0,0,61 ; rotldi 0,0,51\n\t"
+
+#define VALGRIND_DO_CLIENT_REQUEST( \
+ _zzq_rlval, _zzq_default, _zzq_request, \
+ _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \
+ \
+ { unsigned long long int _zzq_args[7]; \
+ register unsigned long long int _zzq_result; \
+ register unsigned long long int* _zzq_ptr; \
+ _zzq_args[0] = (unsigned long long int)(_zzq_request); \
+ _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \
+ _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \
+ _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \
+ _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \
+ _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \
+ _zzq_args[6] = (unsigned long long int)(_zzq_default); \
+ _zzq_ptr = _zzq_args; \
+ __asm__ volatile("mr 4,%1\n\t" \
+ "ld 3, 48(4)\n\t" \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = client_request ( %R4 ) */ \
+ "or 1,1,1\n\t" \
+ "mr %0,3" \
+ : "=b" (_zzq_result) \
+ : "b" (_zzq_ptr) \
+ : "r3", "r4", "cc", "memory"); \
+ _zzq_rlval = _zzq_result; \
+ }
+
+#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \
+ { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \
+ register unsigned long long int __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR */ \
+ "or 2,2,2\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->nraddr = __addr; \
+ __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \
+ /* %R3 = guest_NRADDR_GPR2 */ \
+ "or 4,4,4\n\t" \
+ "mr %0,3" \
+ : "=b" (__addr) \
+ : \
+ : "r3", "cc", "memory" \
+ ); \
+ _zzq_orig->r2 = __addr; \
+ }
+
+#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \
+ __SPECIAL_INSTRUCTION_PREAMBLE \
+ /* branch-and-link-to-noredir *%R11 */ \
+ "or 3,3,3\n\t"
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* Insert assembly code for other platforms here... */
+
+#endif /* NVALGRIND */
+
+/* ------------------------------------------------------------------ */
+/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */
+/* ugly. It's the least-worst tradeoff I can think of. */
+/* ------------------------------------------------------------------ */
+
+/* This section defines magic (a.k.a. appalling-hack) macros for doing
+ guaranteed-no-redirection calls, so as to get from function
+ wrappers to the functions they are wrapping. The whole point is to
+ construct standard call sequences, but to do the call itself with a
+ special no-redirect call pseudo-instruction that the JIT
+ understands and handles specially. This section is long and
+ repetitious, and I can't see a way to make it shorter.
+
+ The naming scheme is as follows:
+
+ CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc}
+
+ 'W' stands for "word" and 'v' for "void". Hence there are
+ different macros for calling arity 0, 1, 2, 3, 4, etc, functions,
+ and for each, the possibility of returning a word-typed result, or
+ no result.
+*/
+
+/* Use these to write the name of your wrapper. NOTE: duplicates
+ VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */
+
+#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \
+ _vgwZU_##soname##_##fnname
+
+#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \
+ _vgwZZ_##soname##_##fnname
+
+/* Use this macro from within a wrapper function to collect the
+ context (address and possibly other info) of the original function.
+ Once you have that you can then use it in one of the CALL_FN_
+ macros. The type of the argument _lval is OrigFn.
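+
+   As an illustrative sketch (libfoo.so.0 and foo are hypothetical;
+   "libfooZdsoZd0" is the Z-encoded soname, Zd standing for '.'), a
+   wrapper for int foo(int) could look like:
+
+      int I_WRAP_SONAME_FNNAME_ZU(libfooZdsoZd0, foo) ( int x )
+      {
+         int    r;
+         OrigFn fn;
+         VALGRIND_GET_ORIG_FN(fn);   /* collect the context */
+         CALL_FN_W_W(r, fn, x);      /* call foo without redirection */
+         return r;
+      }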
*/ +#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) + +/* Derivatives of the main macros below, for calling functions + returning void. */ + +#define CALL_FN_v_v(fnptr) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_v(_junk,fnptr); } while (0) + +#define CALL_FN_v_W(fnptr, arg1) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_W(_junk,fnptr,arg1); } while (0) + +#define CALL_FN_v_WW(fnptr, arg1,arg2) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) + +#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ + do { volatile unsigned long _junk; \ + CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) + +/* ------------------------- x86-linux ------------------------- */ + +#if defined(PLAT_x86_linux) + +/* These regs are trashed by the hidden call. No need to mention eax + as gcc can already see that, plus causes gcc to bomb. */ +#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" + +/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $4, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $8, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $12, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + 
volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $16, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + __asm__ volatile( \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $20, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + __asm__ volatile( \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $24, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + __asm__ volatile( \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $28, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ 
+ volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $32, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $36, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "pushl 40(%%eax)\n\t" \ + "pushl 36(%%eax)\n\t" \ + "pushl 32(%%eax)\n\t" \ + "pushl 28(%%eax)\n\t" \ + "pushl 24(%%eax)\n\t" \ + "pushl 20(%%eax)\n\t" \ + "pushl 16(%%eax)\n\t" \ + "pushl 12(%%eax)\n\t" \ + "pushl 8(%%eax)\n\t" \ + "pushl 4(%%eax)\n\t" \ + "movl (%%eax), %%eax\n\t" /* target->%eax */ \ + VALGRIND_CALL_NOREDIR_EAX \ + "addl $40, %%esp\n" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ + arg6,arg7,arg8,arg9,arg10, \ + arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ 
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ __asm__ volatile( \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $44, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \
+ arg6,arg7,arg8,arg9,arg10, \
+ arg11,arg12) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[13]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ _argvec[8] = (unsigned long)(arg8); \
+ _argvec[9] = (unsigned long)(arg9); \
+ _argvec[10] = (unsigned long)(arg10); \
+ _argvec[11] = (unsigned long)(arg11); \
+ _argvec[12] = (unsigned long)(arg12); \
+ __asm__ volatile( \
+ "pushl 48(%%eax)\n\t" \
+ "pushl 44(%%eax)\n\t" \
+ "pushl 40(%%eax)\n\t" \
+ "pushl 36(%%eax)\n\t" \
+ "pushl 32(%%eax)\n\t" \
+ "pushl 28(%%eax)\n\t" \
+ "pushl 24(%%eax)\n\t" \
+ "pushl 20(%%eax)\n\t" \
+ "pushl 16(%%eax)\n\t" \
+ "pushl 12(%%eax)\n\t" \
+ "pushl 8(%%eax)\n\t" \
+ "pushl 4(%%eax)\n\t" \
+ "movl (%%eax), %%eax\n\t" /* target->%eax */ \
+ VALGRIND_CALL_NOREDIR_EAX \
+ "addl $48, %%esp\n" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#endif /* PLAT_x86_linux */
+
+/* ------------------------ amd64-linux ------------------------ */
+
+#if defined(PLAT_amd64_linux)
+
+/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */
+
+/* These regs are trashed by the hidden call. */
+#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \
+ "rdi", "r8", "r9", "r10", "r11"
+
+/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned
+ long) == 8. */
+
+/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_
+ macros. In order not to trash the stack redzone, we need to drop
+ %rsp by 128 before the hidden call, and restore afterwards. The
+ nastiness is that it is only by luck that the stack still appears
+ to be unwindable during the hidden call - since then the behaviour
+ of any routine using this macro does not match what the CFI data
+ says. Sigh.
+
+ Why is this important? Imagine that a wrapper has a stack
+ allocated local, and passes a pointer to it to the hidden call.
+ Because gcc does not know about the hidden call, it may allocate
+ that local in the redzone. Unfortunately the hidden call may then
+ trash it before it comes to use it.
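+
+   A sketch of the hazard (wrapper and types hypothetical, for
+   illustration only):
+
+      unsigned long r;
+      char buf[8];        /* gcc may well put this in the redzone */
+      OrigFn fn;
+      VALGRIND_GET_ORIG_FN(fn);
+      /* without the %rsp adjustment, the hidden call below could
+         scribble over buf */
+      CALL_FN_W_W(r, fn, (unsigned long)buf);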
So we must step clear of the + redzone, for the duration of the hidden call, to make it safe. + + Probably the same problem afflicts the other redzone-style ABIs too + (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is + self describing (none of this CFI nonsense) so at least messing + with the stack pointer doesn't give a danger of non-unwindable + stack. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX 
\
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[6]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[7]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[8]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned long)(arg4); \
+ _argvec[5] = (unsigned long)(arg5); \
+ _argvec[6] = (unsigned long)(arg6); \
+ _argvec[7] = (unsigned long)(arg7); \
+ __asm__ volatile( \
+ "subq $128,%%rsp\n\t" \
+ "pushq 56(%%rax)\n\t" \
+ "movq 48(%%rax), %%r9\n\t" \
+ "movq 40(%%rax), %%r8\n\t" \
+ "movq 32(%%rax), %%rcx\n\t" \
+ "movq 24(%%rax), %%rdx\n\t" \
+ "movq 16(%%rax), %%rsi\n\t" \
+ "movq 8(%%rax), %%rdi\n\t" \
+ "movq (%%rax), %%rax\n\t" /* target->%rax */ \
+ VALGRIND_CALL_NOREDIR_RAX \
+ "addq $8, %%rsp\n" \
+ "addq $128,%%rsp\n\t" \
+ : /*out*/ "=a" (_res) \
+ : /*in*/ "a" (&_argvec[0]) \
+ : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \
+ ); \
+ lval = (__typeof__(lval)) _res; \
+ } while (0)
+
+#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \
+ arg7,arg8) \
+ do { \
+ volatile OrigFn _orig = (orig); \
+ volatile unsigned long _argvec[9]; \
+ volatile unsigned long _res; \
+ _argvec[0] = (unsigned long)_orig.nraddr; \
+ _argvec[1] = (unsigned long)(arg1); \
+ _argvec[2] = (unsigned long)(arg2); \
+ _argvec[3] = (unsigned long)(arg3); \
+ _argvec[4] = (unsigned
long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $16, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $24, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $32, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + 
_argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $40, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)(arg1); \ + _argvec[2] = (unsigned long)(arg2); \ + _argvec[3] = (unsigned long)(arg3); \ + _argvec[4] = (unsigned long)(arg4); \ + _argvec[5] = (unsigned long)(arg5); \ + _argvec[6] = (unsigned long)(arg6); \ + _argvec[7] = (unsigned long)(arg7); \ + _argvec[8] = (unsigned long)(arg8); \ + _argvec[9] = (unsigned long)(arg9); \ + _argvec[10] = (unsigned long)(arg10); \ + _argvec[11] = (unsigned long)(arg11); \ + _argvec[12] = (unsigned long)(arg12); \ + __asm__ volatile( \ + "subq $128,%%rsp\n\t" \ + "pushq 96(%%rax)\n\t" \ + "pushq 88(%%rax)\n\t" \ + "pushq 80(%%rax)\n\t" \ + "pushq 72(%%rax)\n\t" \ + "pushq 64(%%rax)\n\t" \ + "pushq 56(%%rax)\n\t" \ + "movq 48(%%rax), %%r9\n\t" \ + "movq 40(%%rax), %%r8\n\t" \ + "movq 32(%%rax), %%rcx\n\t" \ + "movq 24(%%rax), %%rdx\n\t" \ + "movq 16(%%rax), %%rsi\n\t" \ + "movq 8(%%rax), %%rdi\n\t" \ + "movq (%%rax), %%rax\n\t" /* target->%rax */ \ + VALGRIND_CALL_NOREDIR_RAX \ + "addq $48, %%rsp\n" \ + "addq $128,%%rsp\n\t" \ + : /*out*/ "=a" (_res) \ + : /*in*/ "a" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_amd64_linux */ + +/* ------------------------ ppc32-linux ------------------------ */ + +#if defined(PLAT_ppc32_linux) + +/* This is useful for finding out about the on-stack stuff: + + extern int f9 ( int,int,int,int,int,int,int,int,int ); + extern int f10 ( int,int,int,int,int,int,int,int,int,int ); + extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); + extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); + + int g9 ( void ) { + return f9(11,22,33,44,55,66,77,88,99); + } + int g10 ( void ) { + return f10(11,22,33,44,55,66,77,88,99,110); + } + int g11 ( void ) { + return f11(11,22,33,44,55,66,77,88,99,110,121); + } + int g12 ( void ) { + return f12(11,22,33,44,55,66,77,88,99,110,121,132); + } +*/ + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. 
*/ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc32-linux, + sizeof(unsigned long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[1]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[2]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[4]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[5]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } 
while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[6]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[7]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[8]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[9]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 
8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[10]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[11]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-16\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,16\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[12]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned 
long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[13]; \ + volatile unsigned long _res; \ + _argvec[0] = (unsigned long)_orig.nraddr; \ + _argvec[1] = (unsigned long)arg1; \ + _argvec[2] = (unsigned long)arg2; \ + _argvec[3] = (unsigned long)arg3; \ + _argvec[4] = (unsigned long)arg4; \ + _argvec[5] = (unsigned long)arg5; \ + _argvec[6] = (unsigned long)arg6; \ + _argvec[7] = (unsigned long)arg7; \ + _argvec[8] = (unsigned long)arg8; \ + _argvec[9] = (unsigned long)arg9; \ + _argvec[10] = (unsigned long)arg10; \ + _argvec[11] = (unsigned long)arg11; \ + _argvec[12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "addi 1,1,-32\n\t" \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,20(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,16(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,12(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,8(1)\n\t" \ + /* args1-8 */ \ + "lwz 3,4(11)\n\t" /* arg1->r3 */ \ + "lwz 4,8(11)\n\t" \ + "lwz 5,12(11)\n\t" \ + "lwz 6,16(11)\n\t" /* arg4->r6 */ \ + "lwz 7,20(11)\n\t" \ + "lwz 8,24(11)\n\t" \ + "lwz 9,28(11)\n\t" \ + "lwz 10,32(11)\n\t" /* arg8->r10 */ \ + "lwz 11,0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "addi 1,1,32\n\t" \ + "mr %0,3" \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[0]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_linux */ + +/* ------------------------ ppc64-linux ------------------------ */ + +#if defined(PLAT_ppc64_linux) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned + long) == 8. 
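+
+ Note on the _argvec layout used below: slot [0] is scratch space in
+ which the asm saves the caller's r2 (the TOC pointer), slot [1]
+ holds the callee's r2 and slot [2] its entry address, with the
+ arguments following. The asm is handed &_argvec[2], so the two TOC
+ slots are reached with negative offsets (-16 and -8) and the
+ arguments with positive ones.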
*/ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define 
CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, 
arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)" /* restore tocptr */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* 
use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-128\n\t" /* expand stack frame */ \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,128" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand 
stack frame */ \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "addi 1,1,-144\n\t" /* expand stack frame */ \ + /* arg12 */ \ + "ld 3,96(11)\n\t" \ + "std 3,136(1)\n\t" \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + "addi 1,1,144" /* restore frame */ \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc64_linux */ + +/* ------------------------ ppc32-aix5 ------------------------- */ + +#if defined(PLAT_ppc32_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. 
*/ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "lwz 3," #_n_fr "(1)\n\t" \ + "stw 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned + long) == 4. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + 
"stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; 
\ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" 
(_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(64) \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(64) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : 
/*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + _argvec[2+12] = (unsigned long)arg12; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "stw 2,-8(11)\n\t" /* save tocptr */ \ + "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(72) \ + /* arg12 */ \ + "lwz 3,48(11)\n\t" \ + "stw 3,68(1)\n\t" \ + /* arg11 */ \ + "lwz 3,44(11)\n\t" \ + "stw 3,64(1)\n\t" \ + /* arg10 */ \ + "lwz 3,40(11)\n\t" \ + "stw 3,60(1)\n\t" \ + /* arg9 */ \ + "lwz 3,36(11)\n\t" \ + "stw 3,56(1)\n\t" \ + /* args1-8 */ \ + "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ + "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ + "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ + "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ + "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ + "lwz 8, 
24(11)\n\t" /* arg6->r8 */ \ + "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ + "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ + "lwz 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "lwz 2,-8(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(72) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#endif /* PLAT_ppc32_aix5 */ + +/* ------------------------ ppc64-aix5 ------------------------- */ + +#if defined(PLAT_ppc64_aix5) + +/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ + +/* These regs are trashed by the hidden call. */ +#define __CALLER_SAVED_REGS \ + "lr", "ctr", "xer", \ + "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ + "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ + "r11", "r12", "r13" + +/* Expand the stack frame, copying enough info that unwinding + still works. Trashes r3. */ + +#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ + "addi 1,1,-" #_n_fr "\n\t" \ + "ld 3," #_n_fr "(1)\n\t" \ + "std 3,0(1)\n\t" + +#define VG_CONTRACT_FRAME_BY(_n_fr) \ + "addi 1,1," #_n_fr "\n\t" + +/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned + long) == 8. */ + +#define CALL_FN_W_v(lval, orig) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+0]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_W(lval, orig, arg1) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+1]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+2]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) 
\ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+3]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+4]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+5]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 
24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+6]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+7]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+8]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned 
long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+9]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+10]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + 
_argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(128) \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(128) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+11]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = (unsigned long)arg4; \ + _argvec[2+5] = (unsigned long)arg5; \ + _argvec[2+6] = (unsigned long)arg6; \ + _argvec[2+7] = (unsigned long)arg7; \ + _argvec[2+8] = (unsigned long)arg8; \ + _argvec[2+9] = (unsigned long)arg9; \ + _argvec[2+10] = (unsigned long)arg10; \ + _argvec[2+11] = (unsigned long)arg11; \ + __asm__ volatile( \ + "mr 11,%1\n\t" \ + VG_EXPAND_FRAME_BY_trashes_r3(512) \ + "std 2,-16(11)\n\t" /* save tocptr */ \ + "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ + VG_EXPAND_FRAME_BY_trashes_r3(144) \ + /* arg11 */ \ + "ld 3,88(11)\n\t" \ + "std 3,128(1)\n\t" \ + /* arg10 */ \ + "ld 3,80(11)\n\t" \ + "std 3,120(1)\n\t" \ + /* arg9 */ \ + "ld 3,72(11)\n\t" \ + "std 3,112(1)\n\t" \ + /* args1-8 */ \ + "ld 3, 8(11)\n\t" /* arg1->r3 */ \ + "ld 4, 16(11)\n\t" /* arg2->r4 */ \ + "ld 5, 24(11)\n\t" /* arg3->r5 */ \ + "ld 6, 32(11)\n\t" /* arg4->r6 */ \ + "ld 7, 40(11)\n\t" /* arg5->r7 */ \ + "ld 8, 48(11)\n\t" /* arg6->r8 */ \ + "ld 9, 56(11)\n\t" /* arg7->r9 */ \ + "ld 10, 64(11)\n\t" /* arg8->r10 */ \ + "ld 11, 0(11)\n\t" /* target->r11 */ \ + VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ + "mr 11,%1\n\t" \ + "mr %0,3\n\t" \ + "ld 2,-16(11)\n\t" /* restore tocptr */ \ + VG_CONTRACT_FRAME_BY(144) \ + VG_CONTRACT_FRAME_BY(512) \ + : /*out*/ "=r" (_res) \ + : /*in*/ "r" (&_argvec[2]) \ + : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ + ); \ + lval = (__typeof__(lval)) _res; \ + } while (0) + +#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ + arg7,arg8,arg9,arg10,arg11,arg12) \ + do { \ + volatile OrigFn _orig = (orig); \ + volatile unsigned long _argvec[3+12]; \ + volatile unsigned long _res; \ + /* _argvec[0] holds current r2 across the call */ \ + _argvec[1] = (unsigned long)_orig.r2; \ + _argvec[2] = (unsigned long)_orig.nraddr; \ + _argvec[2+1] = (unsigned long)arg1; \ + _argvec[2+2] = (unsigned long)arg2; \ + _argvec[2+3] = (unsigned long)arg3; \ + _argvec[2+4] = 
(unsigned long)arg4;                      \
+      _argvec[2+5]  = (unsigned long)arg5;                       \
+      _argvec[2+6]  = (unsigned long)arg6;                       \
+      _argvec[2+7]  = (unsigned long)arg7;                       \
+      _argvec[2+8]  = (unsigned long)arg8;                       \
+      _argvec[2+9]  = (unsigned long)arg9;                       \
+      _argvec[2+10] = (unsigned long)arg10;                      \
+      _argvec[2+11] = (unsigned long)arg11;                      \
+      _argvec[2+12] = (unsigned long)arg12;                      \
+      __asm__ volatile(                                          \
+         "mr 11,%1\n\t"                                          \
+         VG_EXPAND_FRAME_BY_trashes_r3(512)                      \
+         "std 2,-16(11)\n\t"  /* save tocptr */                  \
+         "ld   2,-8(11)\n\t"  /* use nraddr's tocptr */          \
+         VG_EXPAND_FRAME_BY_trashes_r3(144)                      \
+         /* arg12 */                                             \
+         "ld  3,96(11)\n\t"                                      \
+         "std 3,136(1)\n\t"                                      \
+         /* arg11 */                                             \
+         "ld  3,88(11)\n\t"                                      \
+         "std 3,128(1)\n\t"                                      \
+         /* arg10 */                                             \
+         "ld  3,80(11)\n\t"                                      \
+         "std 3,120(1)\n\t"                                      \
+         /* arg9 */                                              \
+         "ld  3,72(11)\n\t"                                      \
+         "std 3,112(1)\n\t"                                      \
+         /* args1-8 */                                           \
+         "ld   3, 8(11)\n\t"  /* arg1->r3 */                     \
+         "ld   4, 16(11)\n\t" /* arg2->r4 */                     \
+         "ld   5, 24(11)\n\t" /* arg3->r5 */                     \
+         "ld   6, 32(11)\n\t" /* arg4->r6 */                     \
+         "ld   7, 40(11)\n\t" /* arg5->r7 */                     \
+         "ld   8, 48(11)\n\t" /* arg6->r8 */                     \
+         "ld   9, 56(11)\n\t" /* arg7->r9 */                     \
+         "ld  10, 64(11)\n\t" /* arg8->r10 */                    \
+         "ld  11, 0(11)\n\t"  /* target->r11 */                  \
+         VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11                 \
+         "mr 11,%1\n\t"                                          \
+         "mr %0,3\n\t"                                           \
+         "ld 2,-16(11)\n\t"   /* restore tocptr */               \
+         VG_CONTRACT_FRAME_BY(144)                               \
+         VG_CONTRACT_FRAME_BY(512)                               \
+         : /*out*/   "=r" (_res)                                 \
+         : /*in*/    "r" (&_argvec[2])                           \
+         : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS         \
+      );                                                         \
+      lval = (__typeof__(lval)) _res;                            \
+   } while (0)
+
+#endif /* PLAT_ppc64_aix5 */
+
+/* ------------------------------------------------------------------ */
+/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS.                */
+/*                                                                      */
+/* ------------------------------------------------------------------ */
+
+/* Some request codes.  There are many more of these, but most are not
+   exposed to end-user view.  These are the public ones, all of the
+   form 0x1000 + small_number.
+
+   Core ones are in the range 0x00000000--0x0000ffff.  The non-public
+   ones start at 0x2000.
+*/
+
+/* These macros are used by tools -- they must be public, but don't
+   embed them into other programs. */
+#define VG_USERREQ_TOOL_BASE(a,b) \
+   ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16))
+#define VG_IS_TOOL_USERREQ(a, b, v) \
+   (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000))
+
+/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
+   This enum comprises an ABI exported by Valgrind to programs
+   which use client requests.  DO NOT CHANGE THE ORDER OF THESE
+   ENTRIES, NOR DELETE ANY -- add new ones at the end. */
+typedef
+   enum { VG_USERREQ__RUNNING_ON_VALGRIND  = 0x1001,
+          VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002,
+
+          /* These allow any function to be called from the simulated
+             CPU but run on the real CPU.  Nb: the first arg passed to
+             the function is always the ThreadId of the running
+             thread!  So CLIENT_CALL0 actually requires a 1 arg
+             function, etc. */
+          VG_USERREQ__CLIENT_CALL0 = 0x1101,
+          VG_USERREQ__CLIENT_CALL1 = 0x1102,
+          VG_USERREQ__CLIENT_CALL2 = 0x1103,
+          VG_USERREQ__CLIENT_CALL3 = 0x1104,
+
+          /* Can be useful in regression testing suites -- eg. can
+             send Valgrind's output to /dev/null and still count
+             errors. */
+          VG_USERREQ__COUNT_ERRORS = 0x1201,
+
+          /* These are useful and can be interpreted by any tool that
+             tracks malloc() et al, by using vg_replace_malloc.c. */
+          VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301,
+          VG_USERREQ__FREELIKE_BLOCK   = 0x1302,
+          /* Memory pool support. */
+          VG_USERREQ__CREATE_MEMPOOL   = 0x1303,
+          VG_USERREQ__DESTROY_MEMPOOL  = 0x1304,
+          VG_USERREQ__MEMPOOL_ALLOC    = 0x1305,
+          VG_USERREQ__MEMPOOL_FREE     = 0x1306,
+          VG_USERREQ__MEMPOOL_TRIM     = 0x1307,
+          VG_USERREQ__MOVE_MEMPOOL     = 0x1308,
+          VG_USERREQ__MEMPOOL_CHANGE   = 0x1309,
+          VG_USERREQ__MEMPOOL_EXISTS   = 0x130a,
+
+          /* Allow printfs to valgrind log. */
+          VG_USERREQ__PRINTF           = 0x1401,
+          VG_USERREQ__PRINTF_BACKTRACE = 0x1402,
+
+          /* Stack support. */
+          VG_USERREQ__STACK_REGISTER   = 0x1501,
+          VG_USERREQ__STACK_DEREGISTER = 0x1502,
+          VG_USERREQ__STACK_CHANGE     = 0x1503
+   } Vg_ClientRequest;
+
+#if !defined(__GNUC__)
+#  define __extension__ /* */
+#endif
+
+/* Returns the number of Valgrinds this code is running under.  That
+   is, 0 if running natively, 1 if running under Valgrind, 2 if
+   running under Valgrind which is running under another Valgrind,
+   etc. */
+#define RUNNING_ON_VALGRIND  __extension__                        \
+   ({unsigned int _qzz_res;                                       \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */,          \
+                               VG_USERREQ__RUNNING_ON_VALGRIND,   \
+                               0, 0, 0, 0, 0);                    \
+    _qzz_res;                                                     \
+   })
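[Editor's aside: RUNNING_ON_VALGRIND costs only a handful of instructions when running natively, so it is cheap enough to consult at run time. A minimal usage sketch, assuming the header is reachable as "valgrind/valgrind.h" as it is in this tree; the helper name and the slowdown factor are hypothetical.]

#include "valgrind/valgrind.h"

/* Hypothetical helper: stretch a timeout when the process is being
   simulated, since execution under Valgrind is many times slower.
   The factor of 20 is an arbitrary illustration, not a measurement. */
static unsigned scaled_timeout_ms(unsigned base_ms)
{
   /* RUNNING_ON_VALGRIND is 0 natively, >= 1 under Valgrind. */
   return RUNNING_ON_VALGRIND ? base_ms * 20 : base_ms;
}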
+/* Discard translation of code in the range [_qzz_addr .. _qzz_addr +
+   _qzz_len - 1].  Useful if you are debugging a JITter or some such,
+   since it provides a way to make sure valgrind will retranslate the
+   invalidated area.  Returns no value. */
+#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len)         \
+   {unsigned int _qzz_res;                                        \
+    VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0,                       \
+                               VG_USERREQ__DISCARD_TRANSLATIONS,  \
+                               _qzz_addr, _qzz_len, 0, 0, 0);     \
+   }
+
+/* These requests are for getting Valgrind itself to print something.
+   Possibly with a backtrace.  This is a really ugly hack. */
+
+#if defined(NVALGRIND)
+
+#  define VALGRIND_PRINTF(...)
+#  define VALGRIND_PRINTF_BACKTRACE(...)
+
+#else /* NVALGRIND */
+
+/* Modern GCC will optimize the static routine out if unused,
+   and the `unused' attribute will suppress warnings about it. */
+static int VALGRIND_PRINTF(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF,
+                              (unsigned long)format, (unsigned long)vargs,
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+   __attribute__((format(__printf__, 1, 2), __unused__));
+static int
+VALGRIND_PRINTF_BACKTRACE(const char *format, ...)
+{
+   unsigned long _qzz_res;
+   va_list vargs;
+   va_start(vargs, format);
+   VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, VG_USERREQ__PRINTF_BACKTRACE,
+                              (unsigned long)format, (unsigned long)vargs,
+                              0, 0, 0);
+   va_end(vargs);
+   return (int)_qzz_res;
+}
+
+#endif /* NVALGRIND */
+
+/* These requests allow control to move from the simulated CPU to the
+   real CPU, calling an arbitrary function.
+
+   Note that the current ThreadId is inserted as the first argument.
+   So this call:
+
+     VALGRIND_NON_SIMD_CALL2(f, arg1, arg2)
+
+   requires f to have this signature:
+
+     Word f(Word tid, Word arg1, Word arg2)
+
+   where "Word" is a word-sized type.
+
+   Note that these client requests are not entirely reliable.  For
+   example, if you call a function with them that subsequently calls
+   printf(), there's a high chance Valgrind will crash.  Generally,
+   your prospects of these working are better if the called function
+   does not refer to any global variables, and does not refer to any
+   libc or other functions (printf et al).  Any kind of entanglement
+   with libc or dynamic linking is likely to have a bad outcome, for
+   tricky reasons which we've grappled with a lot in the past.
+*/
+#define VALGRIND_NON_SIMD_CALL0(_qyy_fn)                          \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL0,          \
+                               _qyy_fn,                           \
+                               0, 0, 0, 0);                       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1)               \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL1,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, 0, 0, 0);               \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2)    \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL2,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2, 0, 0);       \
+    _qyy_res;                                                     \
+   })
+
+#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \
+   __extension__                                                  \
+   ({unsigned long _qyy_res;                                      \
+    VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */,  \
+                               VG_USERREQ__CLIENT_CALL3,          \
+                               _qyy_fn,                           \
+                               _qyy_arg1, _qyy_arg2,              \
+                               _qyy_arg3, 0);                     \
+    _qyy_res;                                                     \
+   })
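[Editor's aside: a minimal sketch of the signature contract just described. The callee name is hypothetical and `long' stands in for the word-sized "Word"; per the caveats above, the callee touches no libc functions and no globals.]

/* Runs on the real CPU; receives the simulated thread's ThreadId
   as an implicit argument 0. */
static long add_on_real_cpu(long tid, long a, long b)
{
   (void)tid;        /* unused in this sketch */
   return a + b;     /* no libc, no globals -- see caveats above */
}

static long demo(void)
{
   /* Two user arguments, so the callee takes three parameters. */
   return VALGRIND_NON_SIMD_CALL2(add_on_real_cpu, 40, 2);   /* 42 */
}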
Any kind of entanglement with libc or dynamic linking is + likely to have a bad outcome, for tricky reasons which we've grappled + with a lot in the past. +*/ +#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL0, \ + _qyy_fn, \ + 0, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL1, \ + _qyy_fn, \ + _qyy_arg1, 0, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL2, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, 0, 0); \ + _qyy_res; \ + }) + +#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ + __extension__ \ + ({unsigned long _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__CLIENT_CALL3, \ + _qyy_fn, \ + _qyy_arg1, _qyy_arg2, \ + _qyy_arg3, 0); \ + _qyy_res; \ + }) + +/* Counts the number of errors that have been recorded by a tool. Nb: + the tool must record the errors with VG_(maybe_record_error)() or + VG_(unique_error)() for them to be counted. */ +#define VALGRIND_COUNT_ERRORS \ + __extension__ \ + ({unsigned int _qyy_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ + VG_USERREQ__COUNT_ERRORS, \ + 0, 0, 0, 0, 0); \ + _qyy_res; \ + }) + +/* Mark a block of memory as having been allocated by a malloc()-like + function. `addr' is the start of the usable block (ie. after any + redzone) `rzB' is redzone size if the allocator can apply redzones; + use '0' if not. Adding redzones makes it more likely Valgrind will spot + block overruns. `is_zeroed' indicates if the memory is zeroed, as it is + for calloc(). Put it immediately after the point where a block is + allocated. + + If you're using Memcheck: If you're allocating memory via superblocks, + and then handing out small chunks of each superblock, if you don't have + redzones on your small blocks, it's worth marking the superblock with + VALGRIND_MAKE_MEM_NOACCESS when it's created, so that block overruns are + detected. But if you can put redzones on, it's probably better to not do + this, so that messages for small overruns are described in terms of the + small block rather than the superblock (but if you have a big overrun + that skips over a redzone, you could miss an error this way). See + memcheck/tests/custom_alloc.c for an example. + + WARNING: if your allocator uses malloc() or 'new' to allocate + superblocks, rather than mmap() or brk(), this will not work properly -- + you'll likely get assertion failures during leak detection. This is + because Valgrind doesn't like seeing overlapping heap blocks. Sorry. + + Nb: block must be freed via a free()-like function specified + with VALGRIND_FREELIKE_BLOCK or mismatch errors will occur. */ +#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MALLOCLIKE_BLOCK, \ + addr, sizeB, rzB, is_zeroed, 0); \ + } + +/* Mark a block of memory as having been freed by a free()-like function. + `rzB' is redzone size; it must match that given to + VALGRIND_MALLOCLIKE_BLOCK. Memory not freed will be detected by the leak + checker. 
Put it immediately after the point where the block is freed. */ +#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__FREELIKE_BLOCK, \ + addr, rzB, 0, 0, 0); \ + } + +/* Create a memory pool. */ +#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__CREATE_MEMPOOL, \ + pool, rzB, is_zeroed, 0, 0); \ + } + +/* Destroy a memory pool. */ +#define VALGRIND_DESTROY_MEMPOOL(pool) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__DESTROY_MEMPOOL, \ + pool, 0, 0, 0, 0); \ + } + +/* Associate a piece of memory with a memory pool. */ +#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_ALLOC, \ + pool, addr, size, 0, 0); \ + } + +/* Disassociate a piece of memory from a memory pool. */ +#define VALGRIND_MEMPOOL_FREE(pool, addr) \ + {unsigned int _qzz_res __unused__; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_FREE, \ + pool, addr, 0, 0, 0); \ + } + +/* Disassociate any pieces outside a particular range. */ +#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_TRIM, \ + pool, addr, size, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MOVE_MEMPOOL, \ + poolA, poolB, 0, 0, 0); \ + } + +/* Resize and/or move a piece associated with a memory pool. */ +#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_CHANGE, \ + pool, addrA, addrB, size, 0); \ + } + +/* Return 1 if a mempool exists, else 0. */ +#define VALGRIND_MEMPOOL_EXISTS(pool) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__MEMPOOL_EXISTS, \ + pool, 0, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Mark a piece of memory as being a stack. Returns a stack id. */ +#define VALGRIND_STACK_REGISTER(start, end) \ + ({unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_REGISTER, \ + start, end, 0, 0, 0); \ + _qzz_res; \ + }) + +/* Unmark the piece of memory associated with a stack id as being a + stack. */ +#define VALGRIND_STACK_DEREGISTER(id) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_DEREGISTER, \ + id, 0, 0, 0, 0); \ + } + +/* Change the start and end address of the stack id. */ +#define VALGRIND_STACK_CHANGE(id, start, end) \ + {unsigned int _qzz_res; \ + VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ + VG_USERREQ__STACK_CHANGE, \ + id, start, end, 0, 0); \ + } + +#undef PLAT_x86_linux +#undef PLAT_amd64_linux +#undef PLAT_ppc32_linux +#undef PLAT_ppc64_linux +#undef PLAT_ppc32_aix5 +#undef PLAT_ppc64_aix5 + +#endif /* __VALGRIND_H */ diff --git a/infinipath-psm.spec.in b/infinipath-psm.spec.in new file mode 100644 index 0000000..a84f81f --- /dev/null +++ b/infinipath-psm.spec.in @@ -0,0 +1,163 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2010. QLogic Corporation. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +Summary: Intel PSM Libraries +Name: infinipath-psm +Version: @VERSION@ +Release: @RELEASE@ +Epoch: 4 +License: GPL +Group: System Environment/Libraries +URL: http://www.intel.com/ +Source0: %{name}-%{version}-%{release}.tar.gz +Prefix: /usr +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root +Provides: infinipath-psm = %{version} +%if "%{PSM_HAVE_SCIF}" == "1" +Provides: intel-mic-psm = %{version} +%endif +# MIC package +Obsoletes: intel-mic-psm +# OFED package +Obsoletes: infinipath-libs <= %{version}-%{release} +Conflicts: infinipath-libs <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm <= %{version}-%{release} +Conflicts: mpss-psm <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package -n infinipath-psm-devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: infinipath-psm = %{version}-%{release} +Provides: infinipath-psm-devel = %{version} +%if "%{PSM_HAVE_SCIF}" == "1" +Provides: intel-mic-psm-devel = %{version} +%endif +# MIC package +Obsoletes: intel-mic-psm-devel +# OFED package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + +# %package card-devel +# Summary: Development files for Intel Xeon Phi +# Group: System Environment/Development +# Requires: %{name} = %{version}-%{release} +# Requires(post): /sbin/ldconfig +# Requires(postun): /sbin/ldconfig + + +%global debug_package %{nil} + +#PSM_HAVE_SCIF is one of: 0 1 +%{!?PSM_HAVE_SCIF: %global PSM_HAVE_SCIF 0} + +%define INFINIPATH_MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%define INTEL_MAKEARG PSM_HAVE_SCIF=1 MIC=0 +%define INTEL_CARD_MAKEARG PSM_HAVE_SCIF=1 MIC=1 LOCAL_PREFIX=/opt/intel/mic/psm +%define card_prefix /opt/intel/mic/psm + +%if "%{PSM_HAVE_SCIF}" == "0" + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%else + %if "%{PSM_HAVE_SCIF}" == "1" + %define MAKEARG PSM_HAVE_SCIF=1 MIC=0 + %else + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 + %define PSM_HAVE_SCIF "1" + %endif +%endif + +%description +The PSM Messaging API, or PSM API, is Intel's low-level +user-level 
communications interface for the True Scale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. + +%description -n infinipath-psm-devel +Development files for the libpsm_infinipath library + +%prep +%setup -q -n %{name}-%{version}-%{release} + +%build +%{__make} @PSM_UUID@ %{MAKEARG} + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT +export DESTDIR=$RPM_BUILD_ROOT +%{__make} install %{MAKEARG} + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + +%files +%defattr(-,root,root,-) +/usr/lib64/libpsm_infinipath.so.* +/usr/lib64/libinfinipath.so.* +%if "%{PSM_HAVE_SCIF}" == "1" +/usr/sbin/psmd +%endif + +%files -n infinipath-psm-devel +%defattr(-,root,root,-) +/usr/lib64/libpsm_infinipath.so +/usr/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h + + + +%changelog +* Fri Sep 25 2015 Henry Estela - @VERSION@-1 +- Always build infinipath-psm with different Provides names. +* Tue Nov 6 2012 Mitko Haralanov - @VERSION@-1 +- Add Intel Xeon Phi related changes +* Tue May 11 2010 Mitko Haralanov - @VERSION@-1 +- Initial build. + diff --git a/intel-mic-psm-card.spec.in b/intel-mic-psm-card.spec.in new file mode 100644 index 0000000..44a3123 --- /dev/null +++ b/intel-mic-psm-card.spec.in @@ -0,0 +1,112 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+#
+
+%define debug_package %{nil}
+%{!?install_prefix:%define install_prefix /usr}
+
+Summary: Intel PSM Libraries for Intel Xeon Phi
+Name: intel-mic-psm-card
+Version: @VERSION@
+Release: @RELEASE@
+License: GPL
+Group: System Environment/Daemon
+URL: http://www.intel.com/
+Source0: %{name}-%{version}-%{release}.tar.gz
+Prefix: /usr
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+@REQUIRES@
+
+%package devel
+Summary: Development files for Intel Xeon Phi
+Group: System Environment/Development
+Requires: %{name} = %{version}-%{release}
+Requires(post): /sbin/ldconfig
+Requires(postun): /sbin/ldconfig
+
+%description
+The PSM Messaging API, or PSM API, is Intel's low-level
+user-level communications interface for the True Scale
+family of products. PSM users are enabled with mechanisms
+necessary to implement higher level communications
+interfaces in parallel environments.
+
+%description devel
+Development files for the libpsm_infinipath library
+
+%prep
+%setup -q -n %{name}-%{version}-%{release}
+
+%build
+%{__make}
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT
+%{make_install}
+%if %(test "%{install_prefix}" = "/usr" && echo 0 || echo 1)
+  cp -a mic/* $RPM_BUILD_ROOT
+  find $RPM_BUILD_ROOT/ -name "*.in" -exec rm -f {} \;
+%endif
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+%post devel -p /sbin/ldconfig
+%postun devel -p /sbin/ldconfig
+
+%files
+%defattr(-,root,root,-)
+%{install_prefix}/lib64/libpsm_infinipath.so.*
+%{install_prefix}/lib64/libinfinipath.so.*
+%if %(test "%{install_prefix}" = "/usr" && echo 0 || echo 1)
+  %{install_prefix}/psm.filelist
+  %{_sysconfdir}/sysconfig/mic/conf.d/psm.conf
+%endif
+
+%files devel
+%defattr(-,root,root,-)
+%{install_prefix}/lib64/libpsm_infinipath.so
+%{install_prefix}/lib64/libinfinipath.so
+
+%changelog
+* Thu Apr 11 2013 Mitko Haralanov
+- Remove any unwanted files before packaging
+* Wed Nov 28 2012 Mitko Haralanov
+- Add Xeon Phi devel package
+* Thu Nov 9 2012 Mitko Haralanov
+- Add TMI to package
+* Mon Nov 5 2012 Mitko Haralanov
+- Initial build.
+
diff --git a/intel-mic-psm.spec.in b/intel-mic-psm.spec.in
new file mode 100644
index 0000000..71d9021
--- /dev/null
+++ b/intel-mic-psm.spec.in
@@ -0,0 +1,207 @@
+# Copyright (c) 2012. Intel Corporation. All rights reserved.
+# Copyright (c) 2010. QLogic Corporation. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#  - Redistributions of source code must retain the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer.
+#
+#  - Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials
+#    provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +Summary: Intel PSM Libraries +Name: intel-mic-psm +Version: @VERSION@ +Release: @RELEASE@ +License: GPL +Group: System Environment/Libraries +URL: http://www.intel.com/ +Source0: %{name}-%{version}-%{release}.tar.gz +Prefix: /usr +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root +Provides: %{name} = %{version} +# ifs package +Obsoletes: infinipath-libs <= %{version}-%{release} +Conflicts: infinipath-libs <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm <= %{version}-%{release} +Conflicts: mpss-psm <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: %{name} = %{version}-%{release} +Provides: %{name}-devel = %{version} +# ifs package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + + +%package -n infinipath-psm +Summary: QLogic PSM Libraries +Epoch: 4 +License: GPL +Group: System Environment/Libraries +URL: http://www.qlogic.com/ +Prefix: /usr +Provides: infinipath-psm = %{version} +Conflicts: infinipath-libs intel-mic-psm +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES@ + +%package -n infinipath-psm-devel +Summary: Development files for Intel PSM +Group: System Environment/Development +Requires: infinipath-psm = %{version}-%{release} +Provides: infinipath-psm-devel = %{version} +# ifs package +Obsoletes: infinipath-devel <= %{version}-%{release} +Conflicts: infinipath-devel <= %{version}-%{release} +# mpss package +Obsoletes: mpss-psm-dev <= %{version}-%{release} +Conflicts: mpss-psm-dev <= %{version}-%{release} +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +@REQUIRES-DEVEL@ + +# %package card-devel +# Summary: Development files for Intel Xeon Phi +# Group: System Environment/Development +# Requires: %{name} = %{version}-%{release} +# Requires(post): /sbin/ldconfig +# Requires(postun): /sbin/ldconfig + + +%global debug_package %{nil} + +#%{!?install_prefix:%define install_prefix /usr} +#PSM_HAVE_SCIF is one of: 0 1 +%{!?PSM_HAVE_SCIF: %global PSM_HAVE_SCIF 0} + +%define INFINIPATH_MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%define INTEL_MAKEARG PSM_HAVE_SCIF=1 MIC=0 +%define INTEL_CARD_MAKEARG PSM_HAVE_SCIF=1 MIC=1 LOCAL_PREFIX=/opt/intel/mic/psm +%define card_prefix /opt/intel/mic/psm + +%if "%{PSM_HAVE_SCIF}" == "0" + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 +%else + %if "%{PSM_HAVE_SCIF}" == "1" + %define MAKEARG PSM_HAVE_SCIF=1 MIC=0 + %else + %define MAKEARG PSM_HAVE_SCIF=0 MIC=0 + %define PSM_HAVE_SCIF "1" + %endif +%endif + +%description +The PSM Messaging API, or PSM API, is Intel's low-level +user-level communications interface for the True Scale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. 
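+# For readers coming to this tree cold: the "mechanisms" mentioned above are
+# the C entry points declared in psm.h and psm_mq.h. A minimal bring-up looks
+# roughly like the sketch below; this is an illustration against the headers
+# shipped in this tree, not a reference (error handling elided, and in a real
+# job the uuid key is generated once and distributed to every rank rather
+# than generated per process):
+#
+#   int ver_major = PSM_VERNO_MAJOR, ver_minor = PSM_VERNO_MINOR;
+#   psm_init(&ver_major, &ver_minor);         /* negotiate library version */
+#
+#   psm_uuid_t job_key;
+#   psm_uuid_generate(job_key);               /* job-wide key, shared by peers */
+#
+#   struct psm_ep_open_opts opts;
+#   psm_ep_open_opts_get_defaults(&opts);
+#   psm_ep_t ep; psm_epid_t epid;
+#   psm_ep_open(job_key, &opts, &ep, &epid);  /* open the local endpoint */
+#
+#   psm_mq_t mq;
+#   psm_mq_init(ep, PSM_MQ_ORDERMASK_ALL, NULL, 0, &mq); /* matched queues */
+#
+# Peers are then connected with psm_ep_connect() and exchange messages
+# through the psm_mq_* calls.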
+ +%description devel +Development files for the libpsm_infinipath library + +%description -n infinipath-psm +The PSM Messaging API, or PSM API, is QLogic's low-level +user-level communications interface for the Truescale +family of products. PSM users are enabled with mechanisms +necessary to implement higher level communications +interfaces in parallel environments. + +%description -n infinipath-psm-devel +Development files for the libpsm_infinipath library + +%prep +%setup -q -n %{name}-%{version}-%{release} + +%build +%{__make} @PSM_UUID@ %{MAKEARG} + +%install +rm -rf $RPM_BUILD_ROOT +mkdir -p $RPM_BUILD_ROOT +export DESTDIR=$RPM_BUILD_ROOT +%{__make} install %{MAKEARG} + + + +%clean +rm -rf $RPM_BUILD_ROOT + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + +%if "%{PSM_HAVE_SCIF}" == "1" +%files +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so.* +%{install_prefix}/lib64/libinfinipath.so.* +/usr/sbin/psmd + +%files devel +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so +%{install_prefix}/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h +%endif + + +%if "%{PSM_HAVE_SCIF}" == "0" +%files -n infinipath-psm +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so.* +%{install_prefix}/lib64/libinfinipath.so.* + +%files -n infinipath-psm-devel +%defattr(-,root,root,-) +%{install_prefix}/lib64/libpsm_infinipath.so +%{install_prefix}/lib64/libinfinipath.so +/usr/include/psm.h +/usr/include/psm_mq.h +%endif + + + +%changelog +* Tue Nov 6 2012 Mitko Haralanov - @VERSION@-1 +- Add Intel Xeon Phi related changes +* Tue May 11 2010 Mitko Haralanov - @VERSION@-1 +- Initial build. + diff --git a/ipath-psm-devel.srclist.in b/ipath-psm-devel.srclist.in new file mode 100644 index 0000000..a1dc132 --- /dev/null +++ b/ipath-psm-devel.srclist.in @@ -0,0 +1,4 @@ +/usr/include/psm.h +/usr/include/psm_mq.h +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/ipath-psm.srclist.in b/ipath-psm.srclist.in new file mode 100644 index 0000000..97a45ff --- /dev/null +++ b/ipath-psm.srclist.in @@ -0,0 +1,4 @@ +%LIBPREFIX%/libinfinipath.so.4 +%LIBPREFIX%/libinfinipath.so.4.0 +%LIBPREFIX%/libpsm_infinipath.so.1 +%LIBPREFIX%/libpsm_infinipath.so.1.15 diff --git a/ipath/Makefile b/ipath/Makefile new file mode 100644 index 0000000..8c2cc6e --- /dev/null +++ b/ipath/Makefile @@ -0,0 +1,98 @@ +# Copyright (c) 2012. Intel Corporation. All rights reserved. +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +TARGLIB := libinfinipath +MAJOR := $(IPATH_LIB_MAJOR) +MINOR := $(IPATH_LIB_MINOR) + +include $(top_srcdir)/buildflags.mak +BASECFLAGS += -D_GNU_SOURCE +INCLUDES += -I$(top_srcdir)/ptl_ips + +ifeq (${arch},x86_64) + PLATFORM_OBJ=ipath_dwordcpy-x86_64-fast.o +else + PLATFORM_OBJ= +endif + +${TARGLIB}-objs := ipath_debug.o ipath_time.o ipath_proto.o \ + ipath_utils.o ipath_service.o ipath_protomic.o \ + ipath_dwordcpy-$(arch).o ipath_i2cflash.o ipath_sysfs.o ipath_syslog.o \ + ipath_write_pio-$(arch).o $(PLATFORM_OBJ) + +all .DEFAULT: ${TARGLIB}.so + +install: all + install -D ${TARGLIB}.so.${MAJOR}.${MINOR} \ + ${DESTDIR}${INSTALL_LIB_TARG}/${TARGLIB}.so.${MAJOR}.${MINOR} + (cd ${DESTDIR}${INSTALL_LIB_TARG} ; \ + ln -sf ${TARGLIB}.so.${MAJOR}.${MINOR} ${TARGLIB}.so.${MAJOR} ; \ + ln -sf ${TARGLIB}.so.${MAJOR} ${TARGLIB}.so) + +${TARGLIB}.so: ${TARGLIB}.so.${MAJOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +${TARGLIB}.so.${MAJOR}: ${TARGLIB}.so.${MAJOR}.${MINOR} + ln -fs ${TARGLIB}.so.${MAJOR}.${MINOR} $@ + +# when we build the shared library, generate a revision and date +# string in it, for easier id'ing when people may have copied the +# file around. Generate it such that the ident command can find it +# and strings -a | grep InfiniPath does a reasonable job as well. +${TARGLIB}.so.${MAJOR}.${MINOR}: ${${TARGLIB}-objs} + date +'static __attribute__ ((unused)) char __psc_infinipath_revision[] ="$$""Date: %F %R ${rpm_extra_description}InfiniPath $$";' > _revision.c + $(CC) -c $(BASECFLAGS) $(INCLUDES) _revision.c -o _revision.o + $(CC) -o $@ -Wl,-soname=${TARGLIB}.so.${MAJOR} -shared \ + -Wl,--unique='*fastpath*' \ + ${${TARGLIB}-objs} _revision.o $(LDFLAGS) $(if $(MIC:0=),$(SCIF_LINK_FLAGS)) + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) $(if $(MIC:0=),$(SCIF_INCLUDE_FLAGS)) -c $< -o $@ + +%.o: %.S + $(CC) $(ASFLAGS) -c $< -o $@ + +ipath_debug.o: WERROR := +# This is temporarily necessary in order to get backtrace to work. Bug 3536 +ipath_debug.o: ipath_debug.c + $(CC) $(BASECFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_write_pio-ppc.o: ipath_write_pio-ppc.c + $(CC) $(CFLAGS) -maltivec $(INCLUDES) -c $< -o $@ + +ipath_write_pio-ppc64.o: ipath_write_pio-ppc64.c + $(CC) $(CFLAGS) -maltivec $(INCLUDES) -c $< -o $@ + +clean: + rm -f _revision.c + rm -f *.o ${TARGLIB}.* diff --git a/ipath/ipath_debug.c b/ipath/ipath_debug.c new file mode 100644 index 0000000..b89502f --- /dev/null +++ b/ipath/ipath_debug.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ipath_user.h" + +unsigned infinipath_debug = 1; +char* __ipath_mylabel = NULL; +FILE *__ipath_dbgout; +static void init_ipath_mylabel(void) __attribute__ ((constructor)); +static void init_ipath_backtrace(void) __attribute__ ((constructor)); +static void init_ipath_dbgfile(void) __attribute__ ((constructor)); +static void fini_ipath_backtrace(void) __attribute__ ((destructor)); + +static void init_ipath_mylabel(void) +{ + char lbl[1024]; + char hostname[80]; + char *e; + /* By default, try to come up with a decent default label, it will be + * overriden later. Try getting rank, if that's not available revert to + * pid. */ + gethostname(hostname, 80); + lbl[0] = '\0'; + hostname[sizeof hostname - 1] = '\0'; + if ((((e = getenv("PSC_MPI_RANK")) && *e)) || + (((e = getenv("MPI_RANKID")) && *e)) || + (((e = getenv("MPIRUN_RANK")) && *e))) + { + char *ep; + unsigned long val; + val = strtoul(e, &ep, 10); + if (ep != e) /* valid conversion */ + snprintf(lbl, 1024, "%s.%lu", hostname, val); + } + if (lbl[0] == '\0') + snprintf(lbl, 1024, "%s.%u", hostname, getpid()); + __ipath_mylabel = strdup(lbl); +} + +static void +ipath_sighdlr(int sig, siginfo_t *p1, void *ucv) +{ + // we make these static to try and avoid issues caused + // by stack overflow that might have gotten us here. + static void *backaddr[128]; // avoid stack usage + static char buf[150], hname[64], fname[128]; + static int i, j, fd, id; + static int write_result __unused__; + extern char *__progname; + + // If this is a SIGINT do not display backtrace. 
Just invoke exit handlers
+    if ((sig == SIGINT) || (sig == SIGTERM))
+        exit(1);
+
+    id = snprintf(buf, sizeof buf,
+        "\n%.60s:%u terminated with signal %d", __progname, getpid(), sig);
+    if(ucv) {
+        static ucontext_t *uc;
+        uc = (ucontext_t*)ucv;
+        id += snprintf(buf+id, sizeof buf-id, " at PC=%lx SP=%lx",
+#if defined(__x86_64__)
+            (unsigned long)uc->uc_mcontext.gregs[REG_RIP],
+            (unsigned long)uc->uc_mcontext.gregs[REG_RSP]);
+#elif defined(__i386__)
+            (unsigned long)uc->uc_mcontext.gregs[REG_EIP],
+            (unsigned long)uc->uc_mcontext.gregs[REG_ESP]);
+#else
+            0ul, 0ul);
+#warning No stack pointer or instruction pointer for this arch
+#endif
+    }
+    id += snprintf(buf+id, sizeof buf-id, ". Backtrace:\n");
+    write_result = write(2, buf, id);
+
+    i = backtrace(backaddr, sizeof(backaddr)/sizeof(backaddr[0]));
+    if(i>2) // skip ourselves and backtrace
+        j=2,i-=j;
+    else
+        j=0;
+    backtrace_symbols_fd(backaddr+j, i, 2);
+    (void)fsync(2);
+
+    // try to write it to a file as well, in case the rest doesn't make it out.
+    // Do it second, in case we get a second failure (more likely).
+    // We might eventually want to print some more of the registers to the
+    // btr file, to aid debugging, but not for now.
+    // Truncate the program name if overly long, so we always get pid and (at least part of)
+    // hostname.
+    (void)gethostname(hname, sizeof hname);
+    hname[sizeof(hname) - 1] = '\0';
+    snprintf(fname, sizeof fname, "%.80s-%u,%.32s.btr", __progname, getpid(), hname);
+    if((fd=open(fname, O_CREAT|O_WRONLY, 0644))>=0) {
+        write_result = write(fd, buf, id);
+        backtrace_symbols_fd(backaddr+j, i, fd);
+        (void)fsync(fd);
+        (void)close(fd);
+    }
+    exit(1); // not _exit(), want atexit handlers to get run
+}
+
+static struct sigaction sigsegv_act;
+static struct sigaction sigbus_act;
+static struct sigaction sigill_act;
+static struct sigaction sigabrt_act;
+static struct sigaction sigint_act;
+static struct sigaction sigterm_act;
+
+// we do this as a constructor so any user program that sets signal
+// handlers for these will override our settings, but we still
+// get backtraces if they don't
+static void init_ipath_backtrace(void)
+{
+    // we need to track memory corruption
+    static struct sigaction act; // easier than memset
+    act.sa_sigaction = ipath_sighdlr;
+    act.sa_flags = SA_SIGINFO;
+
+    if(!getenv("IPATH_NO_BACKTRACE")) {// permanent, although probably
+        // undocumented way to disable backtraces.
+        (void)sigaction(SIGSEGV, &act, &sigsegv_act);
+        (void)sigaction(SIGBUS, &act, &sigbus_act);
+        (void)sigaction(SIGILL, &act, &sigill_act);
+        (void)sigaction(SIGABRT, &act, &sigabrt_act);
+        (void)sigaction(SIGINT, &act, &sigint_act);
+        (void)sigaction(SIGTERM, &act, &sigterm_act);
+    }
+}
+
+static void fini_ipath_backtrace(void)
+{
+    if(!getenv("IPATH_NO_BACKTRACE")) {
+        (void)sigaction(SIGSEGV, &sigsegv_act, NULL);
+        (void)sigaction(SIGBUS, &sigbus_act, NULL);
+        (void)sigaction(SIGILL, &sigill_act, NULL);
+        (void)sigaction(SIGABRT, &sigabrt_act, NULL);
+        (void)sigaction(SIGINT, &sigint_act, NULL);
+        (void)sigaction(SIGTERM, &sigterm_act, NULL);
+    }
+}
+
+// if IPATH_DEBUG_FILENAME is set in the environment, then all the
+// debug prints (not info and error) will go to that file.
+// %h is expanded to the hostname, and %p to the pid, if present.
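+// For example (hypothetical values): with IPATH_DEBUG_FILENAME set to
+// "/tmp/psm.%h.%p", a process with pid 4321 on host "node01" appends its
+// debug output to "/tmp/psm.node01.4321"; if the file can't be opened,
+// the code below falls back to stdout.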
+static void init_ipath_dbgfile(void) +{ + char *fname = getenv("IPATH_DEBUG_FILENAME"); + char *exph, *expp, tbuf[1024]; + FILE *newf; + + if(!fname) { + __ipath_dbgout = stdout; + return; + } + exph = strstr(fname, "%h"); // hostname + expp = strstr(fname, "%p"); // pid + if(exph || expp) { + int baselen; + char hname[256], pid[12]; + if(exph) { + *hname = hname[sizeof(hname)-1] = 0; + gethostname(hname, sizeof(hname)-1); + if(!*hname) + strcpy(hname, "[unknown]"); + } + if(expp) + snprintf(pid, sizeof pid, "%d", getpid()); + if(exph && expp) { + if(exph < expp) { + baselen = exph - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%.*s%s%s", + baselen, fname, hname, + (int)(expp - (exph+2)), exph+2, pid, expp+2); + } + else { + baselen = expp - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%.*s%s%s", + baselen, fname, pid, + (int)(exph - (expp+2)), expp+2, hname, exph+2); + } + } + else if(exph) { + baselen = exph - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%s", + baselen, fname, hname, exph+2); + } + else { + baselen = expp - fname; + snprintf(tbuf, sizeof tbuf, "%.*s%s%s", + baselen, fname, pid, expp+2); + } + fname = tbuf; + } + newf = fopen(fname, "a"); + if(!newf) { + _IPATH_ERROR("Unable to open \"%s\" for debug output, using stdout: %s\n", + fname, strerror(errno)); + __ipath_dbgout = stdout; + } + else { + __ipath_dbgout = newf; + setlinebuf(__ipath_dbgout); + } +} + +void ipath_set_mylabel(char* label) +{ + __ipath_mylabel = label; +} + +char *ipath_get_mylabel() +{ + return __ipath_mylabel; +} diff --git a/ipath/ipath_dwordcpy-generic.c b/ipath/ipath_dwordcpy-generic.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-generic.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_dwordcpy-i386.S b/ipath/ipath_dwordcpy-i386.S new file mode 100644 index 0000000..970651c --- /dev/null +++ b/ipath/ipath_dwordcpy-i386.S @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + .globl ipath_dwordcpy + .file "ipath_dword32cpy.S" + .text + .p2align 4,,15 +ipath_dwordcpy: + // standard C calling convention, args on stack + // does not return any value + .type ipath_dwordcpy, @function + // save caller-saved regs + mov %edi,%eax + mov %esi,%edx + + // setup regs + mov 0xc(%esp,1),%ecx + mov 0x4(%esp,1),%edi + mov 0x8(%esp,1),%esi + // and do it + cld + rep + movsd + + // restore + mov %eax,%edi + mov %edx,%esi + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/ipath/ipath_dwordcpy-ppc64.c b/ipath/ipath_dwordcpy-ppc64.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-ppc64.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_dwordcpy-x86_64-fast.S b/ipath/ipath_dwordcpy-x86_64-fast.S new file mode 100644 index 0000000..6465aae --- /dev/null +++ b/ipath/ipath_dwordcpy-x86_64-fast.S @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + .globl ipath_dwordcpy + .file "ipath_dwordcpy-x86_64-fast.S" + .text + .p2align 4,,15 + // standard C calling convention, rdi is dest, rsi is source, rdx is count + // does not return any value +ipath_dwordcpy: + .type ipath_dwordcpy, @function + movl %edx,%ecx + shrl $1,%ecx + andl $1,%edx + cld + rep + movsq + movl %edx,%ecx + rep + movsd + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/ipath/ipath_dwordcpy-x86_64.c b/ipath/ipath_dwordcpy-x86_64.c new file mode 100644 index 0000000..33e7301 --- /dev/null +++ b/ipath/ipath_dwordcpy-x86_64.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#if defined(__x86_64__) +#define ipath_dwordcpy ipath_dwordcpy_safe +#endif + +void ipath_dwordcpy(uint32_t *dest, const uint32_t *src, uint32_t ndwords) +{ + uint_fast32_t ndw = ndwords; + uint64_t *src64[4]; + uint64_t *dst64[4]; + src64[0] = (uint64_t *)src; + dst64[0] = (uint64_t *)dest; + + while ( ndw >= 8 ) { + *dst64[0] = *src64[0]; + src64[1] = src64[0]+1; + src64[2] = src64[0]+2; + src64[3] = src64[0]+3; + ndw -= 8; + dst64[1] = dst64[0]+1; + dst64[2] = dst64[0]+2; + dst64[3] = dst64[0]+3; + *dst64[1] = *src64[1]; + *dst64[2] = *src64[2]; + *dst64[3] = *src64[3]; + src64[0] += 4; + dst64[0] += 4; + } + if ( ndw ) { + src = (uint32_t *)src64[0]; + dest = (uint32_t *)dst64[0]; + + switch ( ndw ) { + case 7: *dest++ = *src++; + case 6: *dest++ = *src++; + case 5: *dest++ = *src++; + case 4: *dest++ = *src++; + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } + + } +} diff --git a/ipath/ipath_i2cflash.c b/ipath/ipath_i2cflash.c new file mode 100644 index 0000000..e906895 --- /dev/null +++ b/ipath/ipath_i2cflash.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +uint8_t +ipath_flash_csum(struct ipath_flash *ifp, int adjust) +{ + uint8_t *ip = (uint8_t*)ifp; + uint8_t csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if(adjust) + ifp->if_csum = csum; + return csum; +} + diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c new file mode 100644 index 0000000..5f9365f --- /dev/null +++ b/ipath/ipath_proto.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. 
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MIC__ +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) + +// don't inline these; it's all init code, and not inlining makes the +// overall code shorter and easier to debug. +static void ipath_setaffinity(int) __attribute__ ((noinline)); + +// set the processor affinity based upon the assigned context. +// We want to do this early, before much memory is allocated +// (by user or kernel code) so that we get memory allocated on +// the node upon which we will be running. This was done in the +// MPI init code, but that's way too late... +// +// We need to know both the context, and the unit (chip) that we are +// using. If we have more than 2 cpus, and we have more than one +// chip, we use the unit number as part of the algorithm, so that +// we try to stay on a cpu close to the chip that we are using. +// +// This will need more work; it isn't really right yet for dual core, +// dual cpu. We may change the command to just return the cpu that +// should be used for affinity, eventually. +// Since user contextss start at 1, we subtract one. 
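+// (The recommendation applied below comes from the driver, via the rec_cpu
+// field of struct ipath_ctxt_info.)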
+// The "same" code is done as part of MPI_Init, if the job is only
+// using shared memory, no infinipath
+static void ipath_setaffinity(int fd)
+{
+    struct ipath_ctxt_info info;
+    struct ipath_cmd cmd;
+    cpu_set_t cpuset;
+
+    if(getenv("IPATH_NO_CPUAFFINITY")) {
+        _IPATH_PRDBG("Skipping processor affinity, $IPATH_NO_CPUAFFINITY set\n");
+        return;
+    }
+
+    memset(&cmd, 0, sizeof(struct ipath_cmd));
+    memset(&info, 0, sizeof(struct ipath_ctxt_info));
+    cmd.type = IPATH_CMD_CTXT_INFO;
+    cmd.cmd.ctxt_info = (uintptr_t) &info;
+    if(ipath_cmd_write(fd, &cmd, sizeof(cmd)) == -1) {
+        _IPATH_INFO("CTXT_INFO command failed: %s\n", strerror(errno));
+        return;
+    }
+    if(!info.num_active || !info.context) {
+        _IPATH_INFO("CTXT_INFO: %u active contexts unit %u:%u %u/%u, skip cpu affinity\n",
+            info.num_active, info.unit, info.port, info.context, info.subcontext);
+        return;
+    }
+
+    if(info.rec_cpu == (__u16)-1) {
+        _IPATH_PRDBG("Skipping processor affinity, set already or no "
+            "unallocated cpu\n");
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(info.rec_cpu, &cpuset);
+    if(sched_setaffinity(0,sizeof cpuset, &cpuset))
+        _IPATH_INFO("Couldn't set runon processor %u (unit:context %u:%u) (%u active chips): %s\n",
+            info.rec_cpu, info.unit, info.context, info.num_active, strerror(errno));
+    else
+        _IPATH_PRDBG("Set CPU affinity to %u, context %u:%u:%u (%u active chips)\n",
+            info.rec_cpu, info.unit, info.context, info.subcontext, info.num_active);
+}
+
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this (still! Oct 07) isn't
+// implemented. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned as part of ipath_base_info.
+struct _ipath_ctrl *ipath_userinit(int fd, struct ipath_user_info *u,
+    struct ipath_base_info *b)
+{
+    struct _ipath_ctrl *spctrl = NULL;
+    void *tmp;
+    uint64_t *tmp64;
+    struct stat st;
+    struct ipath_cmd c;
+    size_t usize;
+    uintptr_t pg_mask;
+    __u64 pioavailaddr;
+    uint64_t uregbase;
+    int __ipath_pg_sz;
+
+    /* First get the page size */
+    __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+    pg_mask = ~ (intptr_t) (__ipath_pg_sz - 1);
+
+    u->spu_base_info_size = sizeof(*b);
+    u->spu_base_info = (uint64_t)(uintptr_t) b;
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_ASSIGN_CONTEXT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_assign_context(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("assign_context command failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    ipath_setaffinity(fd); // prior to memory allocation in driver, etc.
+
+    c.type = IPATH_CMD_USER_INIT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_user_init(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("userinit command failed: %s\n", strerror(errno));
+        goto err;
+    }
+    /*
+     * If header redirection is enabled, there will be a shared subcontext
+     * with the kernel that we have to examine.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_CTXT_REDIRECT)
+        u->spu_subcontext_cnt = 1;
+
+    _IPATH_PRDBG("Driver is %sQLogic-built\n",
+        ((1<<31)&b->spi_sw_version) ?
"" : "not "); + if((0x7fff&(b->spi_sw_version >> 16)) != IPATH_USER_SWMAJOR) { + _IPATH_INFO + ("User major version 0x%x not same as driver major 0x%x\n", + IPATH_USER_SWMAJOR, b->spi_sw_version >> 16); + if((b->spi_sw_version >> 16) < IPATH_USER_SWMAJOR) + goto err; // else assume driver knows how to be compatible + } + else if ((b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_PRDBG("User minor version 0x%x not same as driver minor 0x%x\n", + IPATH_USER_SWMINOR, b->spi_sw_version & 0xffff); + if ((b->spi_sw_version & 0xffff) < IPATH_USER_SWMINOR) + b->spi_sendbuf_status = 0; + } + + if (u->spu_subcontext_cnt && + (b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_INFO("Mismatched user minor version (%d) and driver " + "minor version (%d) while context sharing. Ensure " + "that driver and library are from the same " + "release.\n", + IPATH_USER_SWMINOR, + (int) (b->spi_sw_version & 0xffff)); + } + +#ifdef PSM_DEBUG + _IPATH_PRDBG("spi_subcontext = %d\n", (int) b->spi_subcontext); + _IPATH_PRDBG("spi_subctxt_uregbase = 0x%llx\n", (unsigned long long) b->spi_subctxt_uregbase); + _IPATH_PRDBG("spi_subctxt_rcvegrbuf = 0x%llx\n", (unsigned long long) b->spi_subctxt_rcvegrbuf); + _IPATH_PRDBG("spi_subctxt_rcvhdr_base = 0x%llx\n", (unsigned long long) b->spi_subctxt_rcvhdr_base); + _IPATH_PRDBG("spu_subcontext_cnt = %d\n", (int) u->spu_subcontext_cnt); + _IPATH_PRDBG("spu_subcontext_id = %d\n", (int) u->spu_subcontext_id); +#endif + + if(!(spctrl = calloc(1, sizeof(struct _ipath_ctrl)))) { + _IPATH_INFO("can't allocate memory for ipath_ctrl: %s\n", + strerror(errno)); + goto err; + } + + /* Check if we need to turn off header suppression in hardware and + * emulate it in software. Since the driver disables all TID flow + * entries we don't need to do anything just fake it that this + * looks like Linda. + * Note: This will break the hardware detection heuristics where we + * determine that a card is QLE73XX by looking at the capability to + * support header suppression! Need the driver to provide the requisite + * information so we can move away from heuristics based on flags. + */ + { + const char *env; + + if ((env = getenv("IPATH_HW_HEADER_SUPPRESSION")) && (*env != '\0')) { + int hwsupp = (int) strtol(env, NULL, 0); + + if (!hwsupp && (b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) { + _IPATH_INFO("Disabling hardware suppresion!\n"); + b->spi_runtime_flags &= ~IPATH_RUNTIME_HDRSUPP; + } + } /* Env */ + + } + + + usize = b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP ? + 2 * __ipath_pg_sz : __ipath_pg_sz; + _IPATH_DBG("uregbase=%llx usize=%u context=%d\n", + (unsigned long long) b->spi_uregbase, + (unsigned) usize, (int) b->spi_context); + + // now mmap in the rcvhdrq, egr bufs, PIO buffers and user regs + // _ipath_uregbase is the user regs; not offset as it is in the kernel + uregbase = b->spi_uregbase; + if((tmp=ipath_mmap64(0, usize, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t)b->spi_uregbase)) == MAP_FAILED) { + _IPATH_INFO("mmap of user registers at %llx failed: %s\n", + (long long unsigned)b->spi_uregbase, + strerror(errno)); + goto err; + } + + _IPATH_MMDBG("mmap user regs from kernel %llx to %p (0x%lx bytes)\n", + (long long unsigned) b->spi_uregbase, tmp, + (unsigned long)usize); + + // we don't try to fault these in, no need + tmp64 = (uint64_t *)tmp; + b->spi_uregbase = (uint64_t)(uintptr_t)tmp; + spctrl->spc_dev.spd_uregbase = (volatile uint64_t*) tmp; + + /* + * Set up addresses for optimized register writeback routines. 
+     * This is for the real onchip registers, shared context or not
+     */
+    spctrl->__ipath_rcvhdrhead = (uint32_t*)&tmp64[ur_rcvhdrhead];
+    spctrl->__ipath_rcvegrhead = (uint32_t*)&tmp64[ur_rcvegrindexhead];
+    spctrl->__ipath_rcvegrtail = (uint32_t*)&tmp64[ur_rcvegrindextail];
+
+    if (!(b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) {
+        _IPATH_DBG("HdrSupp not available. Using virt tidflow table.\n");
+        spctrl->__ipath_rcvtidflow = spctrl->regs;
+        spctrl->__ipath_tidflow_wmb = &spctrl->tidflow_wmb_location;
+    }
+    else {
+        spctrl->__ipath_rcvtidflow = (uint32_t*)&tmp64[ur_rcvtidflow];
+        spctrl->__ipath_tidflow_wmb = (__le32*)spctrl->__ipath_rcvegrtail;
+    }
+
+    /* map the receive tidflow table in QLE73XX */
+    _IPATH_DBG("rcvtidflow=%p offset=0x%lx\n",
+        spctrl->__ipath_rcvtidflow,
+        (long) ((uintptr_t) spctrl->__ipath_rcvtidflow - (uintptr_t) tmp64));
+
+    { char *maxpio; uint32_t numpio;
+    maxpio = getenv("IPATH_MAXPIO");
+    if(maxpio && (numpio=strtoul(maxpio, NULL, 0))>0 &&
+        numpio < b->spi_piocnt) {
+        _IPATH_INFO("$IPATH_MAXPIO is %u, reducing PIO buffer count from %u\n",
+            numpio, b->spi_piocnt);
+        b->spi_piocnt = numpio;
+    }
+    }
+
+    // map in the PIO buffers, much like ureg, since it's
+    // in the chip address space
+    if((tmp=ipath_mmap64(0, b->spi_pioalign*b->spi_piocnt,
+            PROT_WRITE, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)b->spi_piobufbase)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of pio buffers at %llx failed: %s\n",
+            (long long unsigned)b->spi_piobufbase,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap PIO buffers from kernel %llx, %u pages to %p\n",
+            (unsigned long long)b->spi_piobufbase, b->spi_piocnt, tmp);
+        // Do not try to read the PIO buffers; they are mapped write
+        // only. We'll fault them in as we write to them.
+        b->spi_piobufbase = (uintptr_t)tmp;
+    }
+
+    if (b->spi_sendbuf_status) {
+        if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+                (__off64_t)b->spi_sendbuf_status)) == MAP_FAILED) {
+            _IPATH_INFO("mmap of send buffer status page at %llx failed: %s\n",
+                (long long unsigned)b->spi_sendbuf_status,
+                strerror(errno));
+            goto err;
+        }
+        else {
+            _IPATH_MMDBG("mmap send buffer status page from kernel %llx to %p\n",
+                (long long unsigned)b->spi_sendbuf_status, tmp);
+            // we don't try to fault these in; no need
+            b->spi_sendbuf_status = (uint64_t)(uintptr_t)tmp;
+        }
+    }
+    else {
+        b->spi_sendbuf_status = (uint64_t)(uintptr_t) &spctrl->sendbuf_status;
+    }
+
+    /*
+     * Removed reference to waldo.
+     * Also needs to be read/write when context sharing so process can update the TID.
+     */
+    if((tmp=ipath_mmap64(0, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t),
+            u->spu_subcontext_cnt ? PROT_READ | PROT_WRITE : PROT_READ,
+            MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)b->spi_rcvhdr_base)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of rcvhdrq failed: %s\n", strerror(errno));
+        goto err;
+    }
+    else {
+        // for use in protocol code
+        _IPATH_MMDBG("mmap rcvhdrq from kernel %llx, %lx bytes to %p\n",
+            (unsigned long long)b->spi_rcvhdr_base,
+            (unsigned long)(b->spi_rcvhdrent_size *
+                b->spi_rcvhdr_cnt*sizeof(uint32_t)), tmp);
+        ipath_touch_mmap(tmp, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t));
+        b->spi_rcvhdr_base = (uintptr_t)tmp; // set to mapped address
+    }
+
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        /* Don't mmap tail pointer if not using it.
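+         * (Illustrative note, not in the original: with NODMA_RTAIL the
+         * driver never DMAs the tail index to host memory, so the tail is
+         * read straight out of the ur_rcvhdrtail slot of the ureg page
+         * mapped above.  The fallback branches below share one page-offset
+         * idiom; as a hedged example with 4KB pages, pg_mask == ~0xfff,
+         * so a kernel tail address of 0x12345678 keeps its in-page offset
+         * 0x12345678 - 0x12345000 == 0x678 in whatever mapping already
+         * covers that page.)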
*/ + /* make tail address for false-eager-full recovery, CQ, Jul 15, 2013 */ + spctrl->__ipath_rcvtail = (volatile uint32_t*) + &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8]; + _IPATH_MMDBG("mmap rcvhdrq tail %p\n", spctrl->__ipath_rcvtail); + b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t)spctrl->__ipath_rcvtail; + } + else if ((b->spi_rcvhdr_tailaddr & pg_mask) == (uregbase & pg_mask)) { + uintptr_t s; + s = b->spi_rcvhdr_tailaddr - (b->spi_rcvhdr_tailaddr & pg_mask); + b->spi_rcvhdr_tailaddr = b->spi_uregbase + s; + spctrl->__ipath_rcvtail = (volatile uint32_t*)(uintptr_t)b->spi_rcvhdr_tailaddr; + } + else if (!b->spi_rcvhdr_tailaddr) { + /* If tailaddr is NULL, use the ureg page (for context sharing) */ + spctrl->__ipath_rcvtail = (volatile uint32_t*) + &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8]; + _IPATH_MMDBG("mmap rcvhdrq tail %p\n", spctrl->__ipath_rcvtail); + } + else if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_rcvhdr_tailaddr)) == MAP_FAILED) { + _IPATH_INFO("mmap of rcvhdrq tail failed: %s\n", strerror(errno)); + goto err; + } + else { + ipath_touch_mmap(tmp, __ipath_pg_sz); + spctrl->__ipath_rcvtail = (volatile uint32_t*)tmp; // for use in protocol code + _IPATH_MMDBG("mmap rcvhdrq tail from kernel %llx to %p\n", + (unsigned long long)b->spi_rcvhdr_tailaddr, tmp); + /* Update baseinfo with new value of tail address */ + b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t) tmp; + } + + spctrl->__ipath_tidegrcnt = b->spi_tidegrcnt; + if(!b->spi_rcv_egrbuftotlen) { + _IPATH_ERROR("new protocol against older driver, fall back to old\n"); + b->spi_rcv_egrbuftotlen = b->spi_rcv_egrbufsize*b->spi_tidegrcnt; + } + + if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen, + PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_rcv_egrbufs)) == MAP_FAILED) { + _IPATH_INFO("mmap of egr bufs from %llx failed: %s\n", + (long long)b->spi_rcv_egrbufs, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG("mmap egr bufs of 0x%x bytes (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbufsize, b->spi_rcv_egrbuftotlen, + (long long)b->spi_rcv_egrbufs, tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen); + b->spi_rcv_egrbufs = (uint64_t)(uintptr_t)tmp; + } + + pioavailaddr = b->spi_pioavailaddr; + if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_pioavailaddr)) == MAP_FAILED) { + _IPATH_INFO("mmap of pioavail registers (%llx) failed: %s\n", + (long long)b->spi_pioavailaddr, strerror(errno)); + goto err; + } + else { + volatile __le64 *pio; + _IPATH_MMDBG("mmap pioavail from kernel 0x%llx to %p\n", + (long long)b->spi_pioavailaddr, tmp); + b->spi_pioavailaddr = (uintptr_t)tmp; + pio = (volatile __le64 *)(uintptr_t)b->spi_pioavailaddr; + _IPATH_DBG("pioindex=0x%x, piocnt=0x%x " + "pioavailregs 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + b->spi_pioindex, b->spi_piocnt, + (unsigned long long)__le64_to_cpu(pio[0]), + (unsigned long long)__le64_to_cpu(pio[1]), + (unsigned long long)__le64_to_cpu(pio[2]), + (unsigned long long)__le64_to_cpu(pio[3])); + } + + if ((b->spi_status & pg_mask) == (pioavailaddr & pg_mask)) { + /* spi_status and spi_pioavailaddr are in the same page */ + uintptr_t s; + s = b->spi_status - pioavailaddr; + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + else if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)(b->spi_status & pg_mask))) == MAP_FAILED) { + 
_IPATH_INFO("mmap of spi_status (%llx) failed: %s\n", + (long long)b->spi_status, strerror(errno)); + goto err; + } + else { + /* spi_status and spi_pioavailaddr are in different pages */ + uintptr_t s; + _IPATH_MMDBG("mmap spi_status from kernel 0x%llx to %p\n", + (long long)b->spi_status, tmp); + s = b->spi_status - (b->spi_status & pg_mask); + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + _IPATH_DBG("chipstatus=0x%llx\n", + (unsigned long long)*spctrl->__ipath_spi_status); + + if(u->spu_subcontext_cnt > 0) { + unsigned num_subcontexts = u->spu_subcontext_cnt; + size_t size; + int i; + + size = __ipath_pg_sz * num_subcontexts; + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_uregbase)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext uregbase array (%llx) failed: %s\n", + (long long)b->spi_subctxt_uregbase, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext uregbase array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_uregbase, tmp); + ipath_touch_mmap(tmp, size); + + b->spi_subctxt_uregbase = (uint64_t)(uintptr_t)tmp; + + for (i = 0; i < num_subcontexts; i++) { + volatile uint64_t *uregp = (volatile uint64_t *)tmp; + if (i == u->spu_subcontext_id) { + * (volatile uint32_t *) &uregp[ur_rcvhdrtail * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvhdrhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindexhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindextail * 8] = 0; + } + tmp = (void *)((char*)tmp + __ipath_pg_sz); + } + } + size = ALIGN(b->spi_rcvhdr_cnt * b->spi_rcvhdrent_size * + sizeof(uint32_t), __ipath_pg_sz) * num_subcontexts; + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_rcvhdr_base)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvhdr_base, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvhdr_base array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_rcvhdr_base, tmp); + ipath_touch_mmap(tmp, size); + b->spi_subctxt_rcvhdr_base = (uint64_t)(uintptr_t)tmp; + } + if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen * num_subcontexts, + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)b->spi_subctxt_rcvegrbuf)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvegrbuf, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvegrbuf array (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbuftotlen, (long long)b->spi_subctxt_rcvegrbuf, + tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen * num_subcontexts); + b->spi_subctxt_rcvegrbuf = (uint64_t)(uintptr_t)tmp; + } + } + + spctrl->spc_dev.spd_fd = fd; + if(fstat(fd, &st)) { + _IPATH_INFO("can't stat infinipath device to determine type: %s\n", + strerror(errno)); + goto err; + } + else if(!S_ISCHR(st.st_mode)) { + // shouldn't ever happen, since the commands worked, but... 
+ _IPATH_INFO("file descriptor is not for a real device, failing\n"); + goto err; + } + spctrl->spc_dev.spd_type = minor(st.st_rdev); + return spctrl; +err: + if(spctrl) + free(spctrl); + return NULL; +} + +#endif //__MIC__ diff --git a/ipath/ipath_protomic.c b/ipath/ipath_protomic.c new file mode 100644 index 0000000..2c3afa3 --- /dev/null +++ b/ipath/ipath_protomic.c @@ -0,0 +1,616 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef __MIC__ +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +#include + +#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) + +/* + * unit : bit 1-3 + * context : bit 4-8 + * subcontext : bit 9-11 + * type : bit 12-16 + */ +#define MAKE_KEY(unit, context, subcontext, type, subctxtcnt) \ + (((unit)&0x7) | (((context)&0x1F)<<3) | \ + (((subcontext)&0x7)<<8) | (((type)&0x1F)<<11) | \ + (((subctxtcnt)&0x7)<<16)) + +#define GET_UNIT_FROM_KEY(key) \ + ((key)&0x7) + +#define GET_CONTEXT_FROM_KEY(key) \ + (((key)>>3)&0x1F) + +/* +flags in above structure has the following bits: +0x1: map remote host buffer, offset is the SCIF offset +0x2: allocate knx memory in kernel. +0x4: allocate physically contiguous knx memory in kernel. +0x8: SCIF register knx memory, and copy offset to first 8 bytes. +*/ +#define MIC_HOSTMEM_MAP 0x1 +#define MIC_KNXMEM_ALLOC 0x2 +#define MIC_KNXMEM_ALLOC_CONTG 0x4 +#define MIC_KNXMEM_REGISTER 0x8 + +/* + * Memory name to map into PSM process. 
+ */
+#define SPI_SENDBUF_STATUS 1
+#define SPI_RCVHDR_BASE 2
+#define SPI_RCVHDR_TAILADDR 3
+#define SPI_RCV_EGRBUFS 4
+#define SPI_UREGBASE 5
+#define SPI_PIOBUFBASE 6
+#define SPI_PIOAVAILADDR 7
+#define SPI_STATUS 8
+#define SPI_SUBCTXT_UREGBASE 9
+#define SPI_SUBCTXT_RCVHDR_BASE 10
+#define SPI_SUBCTXT_RCVEGRBUF 11
+
+static void ipath_setaffinity(int fd)
+{
+    cpu_set_t cpuset;
+    char *env;
+
+    if(getenv("IPATH_NO_CPUAFFINITY")) {
+        _IPATH_PRDBG("Skipping processor affinity, $IPATH_NO_CPUAFFINITY set\n");
+        return;
+    }
+
+    env = getenv("IPATH_SET_CPUAFFINITY");
+    if (!env) return;
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(atoi(env), &cpuset);
+    if(sched_setaffinity(0, sizeof cpuset, &cpuset)) {
+        _IPATH_INFO("sched_setaffinity() failed, cpu %d\n", atoi(env));
+    }
+
+    return;
+}
+
+// It is allowed to have multiple devices (and of different types)
+// simultaneously opened and initialized, although this is (still! Oct 07)
+// not implemented. This routine is used by the low level
+// infinipath protocol code (and any other code that has similar low level
+// functionality).
+// This is the only routine that takes a file descriptor, rather than a
+// struct _ipath_ctrl *. The struct _ipath_ctrl * used for everything
+// else is returned as part of ipath_base_info.
+struct _ipath_ctrl *ipath_userinit(int fd, struct ipath_user_info *u,
+                   struct ipath_base_info *b)
+{
+    struct _ipath_ctrl *spctrl = NULL;
+    void *tmp;
+    uint64_t *tmp64;
+    struct stat st;
+    struct ipath_cmd c;
+    size_t usize;
+    uintptr_t pg_mask;
+    __u64 pioavailaddr;
+    __u64 sendbuf_status, rcvhdr_base, rcv_egrbufs;
+    int __ipath_pg_sz;
+
+    /* First get the page size */
+    __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+    pg_mask = ~ (intptr_t) (__ipath_pg_sz - 1);
+
+    u->spu_base_info_size = sizeof(*b);
+    u->spu_base_info = (uint64_t)(uintptr_t) b;
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_ASSIGN_CONTEXT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_assign_context(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("assign_context command failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    ipath_setaffinity(fd); // prior to memory allocation in driver, etc.
+
+    /*
+     * Allocate b->spi_sendbuf_status, one page size.
+     */
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SENDBUF_STATUS, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = __ipath_pg_sz;
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, fd,
+            (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of send buffer status page at %llx failed: %s\n",
+            (long long unsigned)b->spi_sendbuf_status,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap send buffer status page from kernel %llx to %p\n",
+            (long long unsigned)b->spi_sendbuf_status, tmp);
+        // we don't try to fault these in; no need
+        sendbuf_status = (uint64_t)(uintptr_t)tmp;
+        if (b->spi_subcontext == 0) {
+            b->spi_sendbuf_status = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    /*
+     * Allocate b->spi_rcvhdr_base.
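+     * (Pattern note added here, not in the original: each of these
+     * IPATH_CMD_MIC_MEM_INFO requests asks the MIC driver to allocate
+     * and SCIF-register card memory under the given key, the region is
+     * then mmapped at offset key<<12, and because MIC_KNXMEM_REGISTER
+     * is set the first 8 bytes of the mapping carry the SCIF offset
+     * that subcontext 0 copies back into the base info, as done for
+     * the send buffer status page above.)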
+     */
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_RCVHDR_BASE, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC_CONTG|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t);
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t),
+            u->spu_subcontext_cnt ? PROT_READ | PROT_WRITE : PROT_READ,
+            MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of rcvhdrq failed: %s\n", strerror(errno));
+        goto err;
+    }
+    else {
+        // for use in protocol code
+        _IPATH_MMDBG("mmap rcvhdrq from kernel %llx, %lx bytes to %p\n",
+            (unsigned long long)b->spi_rcvhdr_base,
+            (unsigned long)(b->spi_rcvhdrent_size *
+                b->spi_rcvhdr_cnt*sizeof(uint32_t)), tmp);
+        ipath_touch_mmap(tmp, b->spi_rcvhdrent_size*b->spi_rcvhdr_cnt*sizeof(uint32_t));
+        rcvhdr_base = (uintptr_t)tmp; // set to mapped address
+        if (b->spi_subcontext == 0) {
+            b->spi_rcvhdr_base = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    /*
+     * Skip b->spi_rcvhdr_tailaddr.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL)
+        ; /* Don't mmap tail pointer if not using it. */
+    else {
+        _IPATH_INFO("IPATH_RUNTIME_NODMA_RTAIL not set by driver; "
+            "DMA'ed rcvhdrq tail is not supported on MIC\n");
+        goto err;
+    }
+
+    /*
+     * Allocate b->spi_rcv_egrbufs.
+     */
+    if(!b->spi_rcv_egrbuftotlen) {
+        _IPATH_ERROR("new protocol against older driver, cannot fall back on MIC\n");
+        goto err;
+    }
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_RCV_EGRBUFS, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC|MIC_KNXMEM_REGISTER;
+    c.cmd.mem_info.length = b->spi_rcv_egrbuftotlen;
+    c.cmd.mem_info.offset = 0;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+    if((tmp=ipath_mmap64(0, b->spi_rcv_egrbuftotlen,
+            PROT_READ, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of egr bufs from %llx failed: %s\n",
+            (long long)b->spi_rcv_egrbufs, strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap egr bufs of 0x%x bytes (0x%x) from kernel %llx to %p\n",
+            b->spi_rcv_egrbufsize, b->spi_rcv_egrbuftotlen,
+            (long long)b->spi_rcv_egrbufs, tmp);
+        ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen);
+        rcv_egrbufs = (uint64_t)(uintptr_t)tmp;
+        if (b->spi_subcontext == 0) {
+            b->spi_rcv_egrbufs = (uint64_t)(*((off_t*)tmp));
+            //*((off_t*)tmp) = 0;
+        }
+    }
+
+    memset(&c, 0, sizeof(struct ipath_cmd));
+    c.type = IPATH_CMD_USER_INIT;
+    memcpy(&c.cmd.user_info, u, sizeof(*u));
+
+    if(ipath_cmd_user_init(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("userinit command failed: %s\n", strerror(errno));
+        goto err;
+    }
+    /*
+     * If header redirection is enabled, there will be a shared subcontext
+     * with the kernel that we have to examine.
+     */
+    if (b->spi_runtime_flags & IPATH_RUNTIME_CTXT_REDIRECT)
+        u->spu_subcontext_cnt = 1;
+
+    _IPATH_PRDBG("Driver is %sQLogic-built\n",
+        ((1<<31)&b->spi_sw_version) ?
"" : "not "); + if((0x7fff&(b->spi_sw_version >> 16)) != IPATH_USER_SWMAJOR) { + _IPATH_INFO + ("User major version 0x%x not same as driver major 0x%x\n", + IPATH_USER_SWMAJOR, b->spi_sw_version >> 16); + if((b->spi_sw_version >> 16) < IPATH_USER_SWMAJOR) + goto err; // else assume driver knows how to be compatible + } + else if ((b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_PRDBG("User minor version 0x%x not same as driver minor 0x%x\n", + IPATH_USER_SWMINOR, b->spi_sw_version & 0xffff); + if ((b->spi_sw_version & 0xffff) < IPATH_USER_SWMINOR) + b->spi_sendbuf_status = 0; + } + + if (u->spu_subcontext_cnt && + (b->spi_sw_version & 0xffff) != IPATH_USER_SWMINOR) { + _IPATH_INFO("Mismatched user minor version (%d) and driver " + "minor version (%d) while context sharing. Ensure " + "that driver and library are from the same " + "release.\n", + IPATH_USER_SWMINOR, + (int) (b->spi_sw_version & 0xffff)); + } + + if(!(spctrl = calloc(1, sizeof(struct _ipath_ctrl)))) { + _IPATH_INFO("can't allocate memory for ipath_ctrl: %s\n", + strerror(errno)); + goto err; + } + + /* + * Setup KNC buffers mapped to host. + */ + b->spi_sendbuf_status = sendbuf_status; + b->spi_rcvhdr_base = rcvhdr_base; + b->spi_rcv_egrbufs = rcv_egrbufs; + + /* Check if we need to turn off header suppression in hardware and + * emulate it in software. Since the driver disables all TID flow + * entries we don't need to do anything just fake it that this + * looks like Linda. + * Note: This will break the hardware detection heuristics where we + * determine that a card is QLE73XX by looking at the capability to + * support header suppression! Need the driver to provide the requisite + * information so we can move away from heuristics based on flags. + */ + { + const char *env; + + if ((env = getenv("IPATH_HW_HEADER_SUPPRESSION")) && (*env != '\0')) { + int hwsupp = (int) strtol(env, NULL, 0); + + if (!hwsupp && (b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) { + _IPATH_INFO("Disabling hardware suppresion!\n"); + b->spi_runtime_flags &= ~IPATH_RUNTIME_HDRSUPP; + } + } /* Env */ + + } + + + usize = b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP ? + 2 * __ipath_pg_sz : __ipath_pg_sz; + _IPATH_DBG("uregbase=%llx usize=%u context=%d\n", + (unsigned long long) b->spi_uregbase, + (unsigned) usize, (int) b->spi_context); + + // now mmap in the rcvhdrq, egr bufs, PIO buffers and user regs + // _ipath_uregbase is the user regs; not offset as it is in the kernel + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_UREGBASE, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_HOSTMEM_MAP; + c.cmd.mem_info.length = usize; + c.cmd.mem_info.offset = b->spi_uregbase; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, usize, PROT_WRITE | PROT_READ, + MAP_SHARED | MAP_LOCKED, fd, + (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of user registers at %llx failed: %s\n", + (long long unsigned)b->spi_uregbase, + strerror(errno)); + goto err; + } + + _IPATH_MMDBG("mmap user regs from kernel %llx to %p (0x%lx bytes)\n", + (long long unsigned) b->spi_uregbase, tmp, + (unsigned long)usize); + + // we don't try to fault these in, no need + tmp64 = (uint64_t *)tmp; + b->spi_uregbase = (uint64_t)(uintptr_t)tmp; + spctrl->spc_dev.spd_uregbase = (volatile uint64_t*) tmp; + + /* + * Set up addresses for optimized register writeback routines. 
+     * This is for the real onchip registers, shared context or not
+     */
+    spctrl->__ipath_rcvhdrhead = (uint32_t*)&tmp64[ur_rcvhdrhead];
+    spctrl->__ipath_rcvegrhead = (uint32_t*)&tmp64[ur_rcvegrindexhead];
+    spctrl->__ipath_rcvegrtail = (uint32_t*)&tmp64[ur_rcvegrindextail];
+
+    if (b->spi_runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        spctrl->__ipath_rcvtail = (volatile uint32_t*)
+            &spctrl->spc_dev.spd_uregbase[ur_rcvhdrtail * 8];
+        b->spi_rcvhdr_tailaddr = (uint64_t) (uintptr_t)spctrl->__ipath_rcvtail;
+    } else {
+        _IPATH_INFO("IPATH_RUNTIME_NODMA_RTAIL not set by driver; "
+            "DMA'ed rcvhdrq tail is not supported on MIC\n");
+        goto err;
+    }
+
+    if (!(b->spi_runtime_flags & IPATH_RUNTIME_HDRSUPP)) {
+        static __le32 regs[INFINIPATH_TF_NFLOWS << 1];
+        _IPATH_DBG("HdrSupp not available. Using virt tidflow table.\n");
+        spctrl->__ipath_rcvtidflow = regs;
+        spctrl->__ipath_tidflow_wmb = &spctrl->tidflow_wmb_location;
+    }
+    else {
+        spctrl->__ipath_rcvtidflow = (uint32_t*)&tmp64[ur_rcvtidflow];
+        spctrl->__ipath_tidflow_wmb = (__le32*)spctrl->__ipath_rcvegrtail;
+    }
+
+    /* map the receive tidflow table in QLE73XX */
+    _IPATH_DBG("rcvtidflow=%p offset=0x%lx\n",
+        spctrl->__ipath_rcvtidflow,
+        (long) ((uintptr_t) spctrl->__ipath_rcvtidflow - (uintptr_t) tmp64));
+
+    { char *maxpio; uint32_t numpio;
+    maxpio = getenv("IPATH_MAXPIO");
+    if(maxpio && (numpio=strtoul(maxpio, NULL, 0))>0 &&
+        numpio < b->spi_piocnt) {
+        _IPATH_INFO("$IPATH_MAXPIO is %u, reducing PIO buffer count from %u\n",
+            numpio, b->spi_piocnt);
+        b->spi_piocnt = numpio;
+    }
+    }
+
+    // map in the PIO buffers, much like ureg, since it's
+    // in the chip address space
+    c.type = IPATH_CMD_MIC_MEM_INFO;
+    c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, b->spi_subcontext, SPI_PIOBUFBASE, u->spu_subcontext_cnt);
+    c.cmd.mem_info.flags = MIC_HOSTMEM_MAP;
+    c.cmd.mem_info.length = b->spi_pioalign*b->spi_piocnt;
+    c.cmd.mem_info.offset = b->spi_piobufbase;
+    if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) {
+        _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno));
+        goto err;
+    }
+
+    if((tmp=ipath_mmap64(0, b->spi_pioalign*b->spi_piocnt,
+            PROT_WRITE, MAP_SHARED | MAP_LOCKED,
+            fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) {
+        _IPATH_INFO("mmap of pio buffers at %llx failed: %s\n",
+            (long long unsigned)b->spi_piobufbase,
+            strerror(errno));
+        goto err;
+    }
+    else {
+        _IPATH_MMDBG("mmap PIO buffers from kernel %llx, %u pages to %p\n",
+            (unsigned long long)b->spi_piobufbase, b->spi_piocnt, tmp);
+        // Do not try to read the PIO buffers; they are mapped write
+        // only. We'll fault them in as we write to them.
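+        // (Hedged sketch, not from the original: the mapping length of
+        // spi_pioalign*spi_piocnt suggests the buffers sit at a fixed
+        // stride, i.e. PIO buffer i would start at
+        //     (char *)b->spi_piobufbase + (size_t)i * b->spi_pioalign;
+        // the first store into a buffer is what faults its page in.)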
+ b->spi_piobufbase = (uintptr_t)tmp; + } + + pioavailaddr = b->spi_pioavailaddr; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_PIOAVAILADDR, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_HOSTMEM_MAP; + c.cmd.mem_info.length = __ipath_pg_sz; + c.cmd.mem_info.offset = b->spi_pioavailaddr; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, __ipath_pg_sz, PROT_READ, MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of pioavail registers (%llx) failed: %s\n", + (long long)b->spi_pioavailaddr, strerror(errno)); + goto err; + } + else { + volatile __le64 *pio; + _IPATH_MMDBG("mmap pioavail from kernel 0x%llx to %p\n", + (long long)b->spi_pioavailaddr, tmp); + b->spi_pioavailaddr = (uintptr_t)tmp; + pio = (volatile __le64 *)(uintptr_t)b->spi_pioavailaddr; + _IPATH_DBG("pioindex=0x%x, piocnt=0x%x " + "pioavailregs 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + b->spi_pioindex, b->spi_piocnt, + (unsigned long long)__le64_to_cpu(pio[0]), + (unsigned long long)__le64_to_cpu(pio[1]), + (unsigned long long)__le64_to_cpu(pio[2]), + (unsigned long long)__le64_to_cpu(pio[3])); + } + + if ((b->spi_status & pg_mask) == (pioavailaddr & pg_mask)) { + /* spi_status and spi_pioavailaddr are in the same page */ + uintptr_t s; + s = b->spi_status - pioavailaddr; + b->spi_status = (uintptr_t)tmp + s; + spctrl->__ipath_spi_status = (__u64 volatile*)(uintptr_t)b->spi_status; + } + else { + _IPATH_INFO("mmap of spi_status (%llx) failed: %s\n", + (long long)b->spi_status, strerror(errno)); + goto err; + } + _IPATH_DBG("chipstatus=0x%llx\n", + (unsigned long long)*spctrl->__ipath_spi_status); + + if(u->spu_subcontext_cnt) { + unsigned num_subcontexts = u->spu_subcontext_cnt; + size_t size; + int i; + + size = __ipath_pg_sz * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_UREGBASE, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext uregbase array (%llx) failed: %s\n", + (long long)b->spi_subctxt_uregbase, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext uregbase array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_uregbase, tmp); + ipath_touch_mmap(tmp, size); + + b->spi_subctxt_uregbase = (uint64_t)(uintptr_t)tmp; + + for (i = 0; i < num_subcontexts; i++) { + volatile uint64_t *uregp = (volatile uint64_t *)tmp; + if (i == u->spu_subcontext_id) { + * (volatile uint32_t *) &uregp[ur_rcvhdrtail * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvhdrhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindexhead * 8] = 0; + * (volatile uint32_t *) &uregp[ur_rcvegrindextail * 8] = 0; + } + tmp = (void *)((char *)tmp + __ipath_pg_sz); + } + } + size = ALIGN(b->spi_rcvhdr_cnt * b->spi_rcvhdrent_size * + sizeof(uint32_t), __ipath_pg_sz) * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_RCVHDR_BASE, 
u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvhdr_base array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvhdr_base, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvhdr_base array (0x%zx) from kernel %llx to %p\n", + size, (long long)b->spi_subctxt_rcvhdr_base, tmp); + ipath_touch_mmap(tmp, size); + b->spi_subctxt_rcvhdr_base = (uint64_t)(uintptr_t)tmp; + } + + size = b->spi_rcv_egrbuftotlen * num_subcontexts; + c.type = IPATH_CMD_MIC_MEM_INFO; + c.cmd.mem_info.key = MAKE_KEY(b->spi_unit, b->spi_context, 0, SPI_SUBCTXT_RCVEGRBUF, u->spu_subcontext_cnt); + c.cmd.mem_info.flags = MIC_KNXMEM_ALLOC; + c.cmd.mem_info.length = size; + c.cmd.mem_info.offset = 0; + if (ipath_cmd_write(fd, &c, sizeof(c)) == -1) { + _IPATH_INFO("ipath_cmd_write() call failed: %s\n", strerror(errno)); + goto err; + } + + if((tmp=ipath_mmap64(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, + fd, (__off64_t)c.cmd.mem_info.key<<12)) == MAP_FAILED) { + _IPATH_INFO("mmap of subcontext rcvegrbuf array (%llx) failed: %s\n", + (long long)b->spi_subctxt_rcvegrbuf, strerror(errno)); + goto err; + } + else { + _IPATH_MMDBG( + "mmap subcontext rcvegrbuf array (0x%x) from kernel %llx to %p\n", + b->spi_rcv_egrbuftotlen, (long long)b->spi_subctxt_rcvegrbuf, + tmp); + ipath_touch_mmap(tmp, b->spi_rcv_egrbuftotlen * num_subcontexts); + b->spi_subctxt_rcvegrbuf = (uint64_t)(uintptr_t)tmp; + } + } + + spctrl->spc_dev.spd_fd = fd; + return spctrl; +err: + if(spctrl) + free(spctrl); + return NULL; +} + +#endif //__MIC__ diff --git a/ipath/ipath_service.c b/ipath/ipath_service.c new file mode 100644 index 0000000..f25b09b --- /dev/null +++ b/ipath/ipath_service.c @@ -0,0 +1,1377 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains ipath service routine interface used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +/* + * This function is necessary in a udev-based world. There can be an + * arbitrarily long (but typically less than one second) delay between + * a driver getting loaded and any dynamic special files turning up. + * + * The timeout is in milliseconds. A value of zero means "callee + * decides timeout". Negative is infinite. + * + * Returns 0 on success, -1 on error or timeout. Check errno to see + * whether there was a timeout (ETIMEDOUT) or an error (any other + * non-zero value). + */ +int +ipath_wait_for_device(const char *path, long timeout) +{ + int saved_errno; + struct stat st; + long elapsed; + int ret; + + if (timeout == 0) + timeout = 15000; + + elapsed = 0; + + while (1) { + static const long default_ms = 250; + struct timespec req = { 0 }; + long ms; + + ret = stat(path, &st); + saved_errno = errno; + + if (ret == 0 || (ret == -1 && errno != ENOENT)) + break; + + if (timeout - elapsed == 0) { + saved_errno = ETIMEDOUT; + break; + } + + if (elapsed == 0) { + if (timeout == -1) + _IPATH_DBG("Device file %s not present on first check; " + "waiting indefinitely...\n", path); + else + _IPATH_DBG("Device file %s not present on first check; " + "waiting up to %.1f seconds...\n", + path, timeout / 1e3); + } + + if (timeout < 0 || timeout - elapsed >= default_ms) + ms = default_ms; + else + ms = timeout; + + elapsed += ms; + req.tv_nsec = ms * 1000000; + + ret = nanosleep(&req, NULL); + saved_errno = errno; + + if (ret == -1) + break; + } + + if (ret == 0) + _IPATH_DBG("Found %s after %.1f seconds\n", path, elapsed / 1e3); + else + _IPATH_INFO("The %s device failed to appear after %.1f seconds: %s\n", + path, elapsed / 1e3, strerror(saved_errno)); + + errno = saved_errno; + return ret; +} + +#ifdef __MIC__ +#include +#define PSM_HOST_PORT SCIF_OFED_PORT_7 /* predefined port */ +#define PSM_HOST_NODE 0 /* host node is always 0 */ +scif_epd_t psmd_epd = -1; +int qibp_fd = -1; + +static scif_epd_t +ipath_psmd_connect(uint16_t node, uint16_t port) +{ + int conn_port, tries = 20; + struct scif_portID portID; + scif_epd_t epd; + uid_t uid; + gid_t gid; + + epd = scif_open(); + if (epd < 0) { + fprintf(stderr, "scif_open failed with error %d\n", errno); + return (scif_epd_t)-1; + } + + if ((conn_port = scif_bind(epd, 0)) < 0) { + fprintf(stderr, "scif_bind failed with error %d\n", errno); + scif_close(epd); + return (scif_epd_t)-1; + } + + portID.port = port; + portID.node = node; +retry: + if (scif_connect(epd, &portID) < 0) { + if ((errno == ECONNREFUSED) && (tries > 0)) { + tries--; + sleep(1); + goto retry; + } + fprintf(stderr, "scif_connect failed with error %d(%s)\n", errno, strerror(errno)); + fprintf(stderr, "Please check if /usr/sbin/psmd is running on host.\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + + uid = geteuid(); + if (scif_send(epd, &uid, sizeof(uid), SCIF_SEND_BLOCK) != sizeof(uid)) { + fprintf(stderr, "cannot send uid to psmd service\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + gid = getegid(); + 
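+	// (Clarifying note, not in the original: psmd authenticates this
+	// MIC-side client by the effective uid/gid pair sent first on the
+	// connection; the gid fetched above follows the uid just sent.)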
if (scif_send(epd, &gid, sizeof(gid), SCIF_SEND_BLOCK) != sizeof(gid)) { + fprintf(stderr, "cannot send gid to psmd service\n"); + scif_close(epd); + return (scif_epd_t)-1; + } + + return epd; +} + +static int +ipath_scif_send(void *buf, size_t len) +{ + int ret; + + if (psmd_epd == -1) { + psmd_epd = ipath_psmd_connect(PSM_HOST_NODE, PSM_HOST_PORT); + if (psmd_epd == -1) return -1; + } + + while (len) { + ret = scif_send(psmd_epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +ipath_scif_recv(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(psmd_epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +ipath_qibp_open(void) +{ + char dev_name[MAXPATHLEN]; + int fd; + + snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath"); + + if (ipath_wait_for_device(dev_name, 0) == -1) { + fprintf(stderr, "Could not find an InfiniPath qibp device %s\n", dev_name); + return -1; + } + + if ((fd = open(dev_name, O_RDWR)) == -1) { + fprintf(stderr, "mic:Can't open %s for reading and writing\n", dev_name); + return -1; + } + + if(fcntl(fd, F_SETFD, FD_CLOEXEC)) + fprintf(stdout, "Failed to set close on exec for device: %s\n", + strerror(errno)); + + return fd; +} + +#endif //__MIC + +int +ipath_context_open(int unit, int port, uint64_t open_timeout) +{ + int fd; + +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + /* + * Re-direct context open request to psmd on host. + */ + cmd.type = IPATH_CMD_CONTEXT_OPEN; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data3 = open_timeout; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + fd = cmd.cmd.mic_info.data1; + if (fd == -1) { + errno = cmd.cmd.mic_info.data2; + return -1; + } + + /* + * Open MIC side qibp before context is assigned. 
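+	 * (Clarifying sketch, not part of the original comment: the MIC
+	 * build splits the work across two channels.  Control operations
+	 * are proxied over SCIF to psmd on the host, while the local qibp
+	 * character device opened here is what later backs ipath_mmap64()
+	 * and the fast commands that ipath_cmd_write() routes to qibp_fd.)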
+ */ + if (qibp_fd != -1) { + fprintf(stderr, "ipath_context_open(): qibp already opened\n"); + return -1; + } + qibp_fd = ipath_qibp_open(); + if (qibp_fd == -1) return -1; + +#else + char dev_name[MAXPATHLEN]; + + if (unit != IPATH_UNIT_ID_ANY && unit >= 0) + snprintf(dev_name, sizeof(dev_name), "%s%u", "/dev/ipath", unit); + else + snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath"); + + if (ipath_wait_for_device(dev_name, (long)open_timeout) == -1) { + _IPATH_DBG("Could not find an InfiniPath Unit on device " + "%s (%lds elapsed)", dev_name, (long)open_timeout / 1000); + return -1; + } + + if ((fd = open(dev_name, O_RDWR)) == -1) { + _IPATH_DBG("(host:Can't open %s for reading and writing", + dev_name); + return -1; + } + + if(fcntl(fd, F_SETFD, FD_CLOEXEC)) + _IPATH_INFO("Failed to set close on exec for device: %s\n", + strerror(errno)); +#endif + + return fd; +} + +void +ipath_context_close(int fd) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_CONTEXT_CLOSE; + cmd.cmd.mic_info.data1 = fd; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return; + + if (qibp_fd >= 0) { + close(qibp_fd); + qibp_fd = -1; + } + if (psmd_epd >= 0) { + scif_close(psmd_epd); + psmd_epd = -1; + } +#else + (void) close(fd); +#endif +} + +int +ipath_cmd_writev(int fd, const struct iovec *iov, int iovcnt) +{ +#ifdef __MIC__ + return writev(qibp_fd, iov, iovcnt); +#else + return writev(fd, iov, iovcnt); +#endif +} + +int +ipath_cmd_assign_context(int fd, void *buf, size_t count) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd, *pcmd; + + ret = ipath_scif_send(buf, count); + if (ret) return ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + pcmd = (struct ipath_cmd *)buf; + ret = ipath_scif_recv( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + return ret; +#else + return write(fd, buf, count); +#endif +} + +int +ipath_cmd_user_init(int fd, void *buf, size_t count) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd, *pcmd; + + ret = ipath_scif_send(buf, count); + if (ret) return ret; + + pcmd = (struct ipath_cmd *)buf; + ret = ipath_scif_send( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + if (ret) return ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + ret = ipath_scif_recv( + (void*)(uintptr_t)pcmd->cmd.user_info.spu_base_info, + (int)pcmd->cmd.user_info.spu_base_info_size); + return ret; +#else + return write(fd, buf, count); +#endif +} + +int +ipath_cmd_write(int fd, struct ipath_cmd *cmd, size_t count) +{ +#ifdef __MIC__ +/* +following cmd are processed by mic driver: +IPATH_CMD_SDMA_COMPLETE +IPATH_CMD_SDMA_INFLIGHT +IPATH_CMD_TID_UPDATE +IPATH_CMD_TID_FREE +IPATH_CMD_MEM_INFO +*/ + int ret; + + if (cmd->type == IPATH_CMD_MIC_MEM_INFO || + cmd->type == IPATH_CMD_SDMA_COMPLETE || + cmd->type == IPATH_CMD_SDMA_INFLIGHT || + cmd->type == IPATH_CMD_TID_UPDATE || + cmd->type == IPATH_CMD_TID_FREE) { + return write(qibp_fd, cmd, count); + } + + ret = ipath_scif_send(cmd, count); + if (ret) return 
ret; + + ret = ipath_scif_send(&fd, sizeof(fd)); + if (ret) return ret; + + ret = ipath_scif_recv(cmd, count); + if (ret) return ret; + + ret = cmd->cmd.mic_info.data1; + if (ret) errno = cmd->cmd.mic_info.data2; + return ret; +#else + return write(fd, cmd, count); +#endif +} + +// we use mmap64() because we compile in both 32 and 64 bit mode, +// and we have to map physical addresses that are > 32 bits long. +// While linux implements mmap64, it doesn't have a man page, +// and isn't declared in any header file, so we declare it here ourselves. + +// We'd like to just use -D_LARGEFILE64_SOURCE, to make off_t 64 bits and +// redirects mmap to mmap64 for us, but at least through suse10 and fc4, +// it doesn't work when the address being mapped is > 32 bits. It chips +// off bits 32 and above. So we stay with mmap64. +void * +ipath_mmap64(void *addr, size_t length, int prot, int flags, int fd, __off64_t offset) +{ +#ifdef __MIC__ + if (qibp_fd == -1) { + fprintf(stderr, "ipath_mmap64(): qibp not opened, qibp_fd=-1\n"); + return MAP_FAILED; + } + fd = qibp_fd; +#endif + return mmap64(addr, length, prot, flags, fd, offset); +} + +// get the number of units supported by the driver. Does not guarantee +// that a working chip has been found for each possible unit #. +// number of units >=0 (0 means none found). +// formerly used sysfs file "num_units" +int +ipath_get_num_units(void) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_NUM_UNITS; + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + char pathname[128]; + struct stat st; + int i; + + ret = 0; + for(i=0; i 0) { + int64_t val; + if (unit_id == IPATH_UNIT_ID_ANY) { + uint32_t u, p; + for (u = 0; u < units; u++) { + for (p = 1; p <= IPATH_MAX_PORT; p++) + if (ipath_get_port_lid(u, p) != -1) + break; + if (p <= IPATH_MAX_PORT && + !ipath_sysfs_unit_read_s64(u, "nctxts", &val, 0)) + n += (uint32_t) val; + } + } + else { + uint32_t p; + for (p = 1; p <= IPATH_MAX_PORT; p++) + if (ipath_get_port_lid(unit_id, p) != -1) + break; + if (p <= IPATH_MAX_PORT && + !ipath_sysfs_unit_read_s64(unit_id, "nctxts", &val, 0)) + n += (uint32_t) val; + } + } +#endif + + return n; +} + +// Given the unit number, return an error, or the corresponding LID +// For now, it's used only so the MPI code can determine it's own +// LID, and which other LIDs (if any) are also assigned to this node +// Returns an int, so -1 indicates an error. 0 may indicate that +// the unit is valid, but no LID has been assigned. 
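+// (Illustrative usage, not from the original: callers typically probe
+// both potential ports, e.g.
+//     int p, lid = -1;
+//     for (p = 1; p <= IPATH_MAX_PORT; p++)
+//         if ((lid = ipath_get_port_lid(unit, p)) != -1)
+//             break;
+// as the context-counting code above does.)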
+// No error print because we call this for both potential +// ports without knowing if both ports exist (or are connected) +int +ipath_get_port_lid(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_LID; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + char *state; + + ret = ipath_sysfs_port_read(unit, port, "phys_state", &state); + if (ret == -1) { + if(errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get phys_state for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } else { + if (strncmp(state, "5: LinkUp", 9)) { + _IPATH_DBG("!LinkUp for unit %u:%u\n", unit, port); + ret = -1; + } + free(state); + } + if (ret == -1) return ret; + + ret = ipath_sysfs_port_read_s64(unit, port, "lid", &val, 0); + + if (ret == -1) { + if(errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get LID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else { + ret = val; + +// disable this feature since we don't have a way to provide +// file descriptor in multiple context case. +#if 0 + if(getenv("IPATH_DIAG_LID_LOOP")) { + // provides diagnostic ability to run MPI, etc. even + // on loopback, by claiming a different LID for each context + struct ipath_ctxt_info info; + struct ipath_cmd cmd; + cmd.type = IPATH_CMD_CTXT_INFO; + cmd.cmd.ctxt_info = (uintptr_t) &info; + if(__ipath_lastfd == -1) + _IPATH_INFO("Can't run CONTEXT_INFO for lid_loop, fd not set\n"); + else if(write(__ipath_lastfd, &cmd, sizeof(cmd)) == -1) + _IPATH_INFO("CONTEXT_INFO command failed: %s\n", strerror(errno)); + else if(!info.context) + _IPATH_INFO("CONTEXT_INFO returned context 0!\n"); + else { + _IPATH_PRDBG("Using lid 0x%x, base %x, context %x\n", + ret + info.context, ret, info.context); + ret += info.context; + } + } +#endif + } +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding GID +// For now, it's used only so the MPI code can determine its fabric ID. +// Returns an int, so -1 indicates an error. 
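+// (Worked example with hypothetical values: a sysfs gids/0 entry of
+// fe80:0000:0000:0000:0011:7500:005a:6eec parses into
+// hi = 0xfe80000000000000 and lo = 0x00117500005a6eec, i.e. the subnet
+// prefix half and the port GUID half of the 128-bit GID.)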
+// No error print because we call this for both potential +// ports without knowing if both ports exist (or are connected) +int +ipath_get_port_gid(int unit, int port, uint64_t *hi, uint64_t *lo) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_GID; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; + else { + *hi = cmd.cmd.mic_info.data3; + *lo = cmd.cmd.mic_info.data4; + } +#else + char *gid_str = NULL; + + ret = ipath_sysfs_port_read(unit, port, "gids/0", &gid_str); + + if (ret == -1) { + if (errno == ENODEV) + /* this is "normal" for port != 1, on single + * port chips */ + _IPATH_VDBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + else + _IPATH_DBG("Failed to get GID for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else { + unsigned int gid[8]; + if (sscanf(gid_str, "%4x:%4x:%4x:%4x:%4x:%4x:%4x:%4x", + &gid[0], &gid[1], &gid[2], &gid[3], + &gid[4], &gid[5], &gid[6], &gid[7]) != 8) { + _IPATH_DBG("Failed to parse GID for unit %u:%u: %s\n", + unit, port, gid_str); + ret = -1; + } + else { + *hi = (((uint64_t) gid[0]) << 48) | (((uint64_t) gid[1]) << 32) | + (((uint64_t) gid[2]) << 16) | (((uint64_t) gid[3]) << 0); + *lo = (((uint64_t) gid[4]) << 48) | (((uint64_t) gid[5]) << 32) | + (((uint64_t) gid[6]) << 16) | (((uint64_t) gid[7]) << 0); + } + free(gid_str); + } +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding LMC value +// for the port +// Returns an int, so -1 indicates an error. 0 +int +ipath_get_port_lmc(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_LMC; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + + ret = ipath_sysfs_port_read_s64(unit, port, "lid_mask_count", &val, 0); + + if (ret == -1) { + _IPATH_INFO("Failed to get LMC for unit %u:%u: %s\n", + unit, port, strerror(errno)); + } + else + ret = val; +#endif + + return ret; +} + +// Given the unit number, return an error, or the corresponding link rate +// for the port +// Returns an int, so -1 indicates an error. 
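+// (Arithmetic note added for clarity: the sysfs "rate" file holds text
+// such as "40 Gb/sec (4X QDR)" (the exact string is driver-dependent);
+// strtod() stops at the first non-numeric character, so that yields 40.
+// The ((int) (rate * 2) >> 1) conversion below truncates half-steps: a
+// 2.5 Gb/sec 1X SDR link reports 2.)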
+int +ipath_get_port_rate(int unit, int port) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_RATE; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + double rate; + char *data_rate = NULL, *newptr; + + ret = ipath_sysfs_port_read(unit, port, "rate", &data_rate); + if (ret == -1) + goto get_port_rate_error; + else { + rate = strtod(data_rate, &newptr); + if ((rate == 0) && (data_rate == newptr)) + goto get_port_rate_error; + } + + free(data_rate); + return ((int) (rate * 2) >> 1); + + get_port_rate_error: + _IPATH_INFO("Failed to get link rate for unit %u:%u: %s\n", + unit, port, strerror(errno)); +#endif + + return ret; +} + +// Given a unit, port and SL, return an error, or the corresponding VL for the +// SL as programmed by the SM +// Returns an int, so -1 indicates an error. 0 +int +ipath_get_port_sl2vl(int unit, int port, int sl) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_PORT_S2V; + cmd.cmd.mic_info.unit = unit; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data1 = sl; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret == -1) errno = cmd.cmd.mic_info.data2; +#else + int64_t val; + char sl2vlpath[16]; + + snprintf(sl2vlpath, sizeof(sl2vlpath), "sl2vl/%d", sl); + ret = ipath_sysfs_port_read_s64(unit, port, sl2vlpath, &val, 0); + + if (ret == -1) { + _IPATH_DBG("Failed to get SL2VL mapping for SL %d unit %u:%u: %s\n", + sl, unit, port, strerror(errno)); + } + else + ret = val; +#endif + + return ret; +} + +/* These have been fixed to read the values, but they are not + * compatible with the ipath driver, they return new info with + * the qib driver + */ +static int infinipath_count_names(const char *namep) +{ + int n = 0; + while (*namep != '\0') { + if (*namep == '\n') + n++; + namep++; + } + return n; +} + +int infinipath_get_stats_names(char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_STATS_NAMES; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_read("driver_stats_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_stats(uint64_t *s, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_STATS; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(s, n*sizeof(*s)); + if (ret) { + return ret; + } + return n; +#else + 
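+	// (Illustrative pairing, not in the original: a caller would size
+	// the stats array from the names list first, e.g.
+	//     char *names; uint64_t *vals;
+	//     int n = infinipath_get_stats_names(&names);
+	//     if (n > 0 && (vals = calloc(n, sizeof(*vals))) != NULL)
+	//         n = infinipath_get_stats(vals, n);
+	// where names then holds n newline-separated entries.)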
int i; + i = ipath_ipathfs_rd("driver_stats", s, nelem * sizeof(*s)); + if(i < 0) + return -1; + else + return i / sizeof(*s); +#endif +} + +int infinipath_get_ctrs_unit_names(int unitno, char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_UNAMES; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "counter_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_ctrs_unit(int unitno, uint64_t *c, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_UNIT; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(c, n*sizeof(*c)); + if (ret) { + return ret; + } + return n; +#else + int i; + i = ipath_ipathfs_unit_rd(unitno, "counters", c, + nelem * sizeof(*c)); + if(i < 0) + return -1; + else + return i / sizeof(*c); +#endif +} + +int infinipath_get_ctrs_port_names(int unitno, char **namep) +{ +#ifdef __MIC__ + int ret, size; + char *name; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_PNAMES; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + name = malloc(size); + if (!name) return -1; + + ret = ipath_scif_recv(name, size); + if (ret) { + free(name); + return ret; + } + + *namep = name; + return infinipath_count_names(*namep); +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "portcounter_names", namep); + if (i < 0) + return -1; + else + return infinipath_count_names(*namep); +#endif +} + +int infinipath_get_ctrs_port(int unitno, int port, uint64_t *c, int nelem) +{ +#ifdef __MIC__ + int ret, n; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_GET_CTRS_PORT; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.port = port; + cmd.cmd.mic_info.data1 = nelem; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret <= 0) { + if (ret == -1) errno = cmd.cmd.mic_info.data2; + return ret; + } + + n = ret; + ret = ipath_scif_recv(c, n*sizeof(*c)); + if (ret) { + return ret; + } + return n; +#else + int i; + char buf[32]; + snprintf(buf, sizeof buf, "port%dcounters", port); + i = ipath_ipathfs_unit_rd(unitno, buf, c, + nelem * sizeof(*c)); + if(i < 0) + return -1; + else + return i / sizeof(*c); +#endif +} + +int +ipath_get_cc_settings_bin(int unit, int port, char *ccabuf) +{ +#ifdef __MIC__ 
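+	// (Size note added for clarity: as the host path below spells out,
+	// the CCA settings blob is (16 + 16 + 640) bits = 84 bytes; a data1
+	// reply of 1 from psmd means a valid 84-byte blob follows, and any
+	// other value is passed through, mirroring the sysfs path's
+	// "use static CCA" return of 0.)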
+ int ret;
+ struct ipath_cmd cmd;
+
+ cmd.type = IPATH_CMD_GET_CC_SETTINGS;
+ cmd.cmd.mic_info.unit = unit;
+ cmd.cmd.mic_info.port = port;
+
+ ret = ipath_scif_send(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = ipath_scif_recv(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = cmd.cmd.mic_info.data1;
+ if (ret != 1) return ret;
+
+ ret = ipath_scif_recv(ccabuf, 84);
+ if (ret) return ret;
+#else
+ int fd;
+
+/*
+ * Check the qib driver CCA setting and try to use it if available.
+ * Fall back to the static CCA settings on error.
+ */
+ sprintf(ccabuf,
+ "/sys/class/infiniband/qib%d/ports/%d/CCMgtA/cc_settings_bin",
+ unit, port);
+ fd = open(ccabuf, O_RDONLY);
+ if (fd < 0) {
+ return 0;
+ }
+ /* (16+16+640)/8=84 */
+ if (read(fd, ccabuf, 84) != 84) {
+ _IPATH_CCADBG("Read cc_settings_bin failed, using static CCA\n");
+ close(fd);
+ return 0;
+ }
+
+ close(fd);
+#endif
+
+ return 1;
+}
+
+int
+ipath_get_cc_table_bin(int unit, int port, uint16_t **cctp)
+{
+ int i, ccti_limit;
+ uint16_t *cct;
+
+#ifdef __MIC__
+ int ret;
+ struct ipath_cmd cmd;
+
+ cmd.type = IPATH_CMD_GET_CC_TABLE;
+ cmd.cmd.mic_info.unit = unit;
+ cmd.cmd.mic_info.port = port;
+
+ ret = ipath_scif_send(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = ipath_scif_recv(&cmd, sizeof(cmd));
+ if (ret) return ret;
+
+ ret = cmd.cmd.mic_info.data1;
+ if (ret <= 0) return ret;
+
+ ccti_limit = ret;
+ i = (ccti_limit+1)*sizeof(uint16_t);
+ cct = malloc(i);
+ if (!cct) {
+ return -1;
+ }
+
+ ret = ipath_scif_recv(cct, i);
+ if (ret) {
+ free(cct);
+ return ret;
+ }
+#else
+ int fd;
+ char pathname[256];
+
+ *cctp = NULL;
+ sprintf(pathname,
+ "/sys/class/infiniband/qib%d/ports/%d/CCMgtA/cc_table_bin",
+ unit, port);
+ fd = open(pathname, O_RDONLY);
+ if (fd < 0) {
+ _IPATH_CCADBG("Open cc_table_bin failed, using static CCA\n");
+ return 0;
+ }
+ if (read(fd, &ccti_limit, 2) != 2) {
+ _IPATH_CCADBG("Read ccti_limit failed,
using static CCA\n"); + close(fd); + return 0; + } + if (ccti_limit < 63 || ccti_limit > 65535) { + _IPATH_CCADBG("Read ccti_limit %d not in range [63, 65535], " + "using static CCA.\n", ccti_limit); + close(fd); + return 0; + } + + i = (ccti_limit+1)*sizeof(uint16_t); + cct = malloc(i); + if (!cct) { + close(fd); + return -1; + } + if (read(fd, cct, i) != i) { + _IPATH_CCADBG("Read ccti_entry_list, using static CCA\n"); + free(cct); + close(fd); + return 0; + } + + close(fd); +#endif + + *cctp = cct; + return ccti_limit; +} + +/* + * This is for diag function ipath_wait_for_packet() only + */ +int +ipath_cmd_wait_for_packet(int fd) +{ + int ret; + +#ifdef __MIC__ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_WAIT_FOR_PACKET; + cmd.cmd.mic_info.data1 = fd; + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) errno = cmd.cmd.mic_info.data2; +#else + struct pollfd pfd; + + pfd.fd = fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, 500 /* ms */); +#endif + + return ret; +} + +/* + * This is for diag function ipath_hideous_ioctl_emulator() only + */ +int infinipath_get_unit_flash(int unitno, char **datap) +{ +#ifdef __MIC__ + int ret, size; + char *data; + struct ipath_cmd cmd; + + *datap = NULL; + cmd.type = IPATH_CMD_GET_UNIT_FLASH; + cmd.cmd.mic_info.unit = unitno; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) { + errno = cmd.cmd.mic_info.data2; + return ret; + } + + size = cmd.cmd.mic_info.data2 + 1; + data = malloc(size); + if (!data) return -1; + + ret = ipath_scif_recv(data, size); + if (ret) { + free(data); + return ret; + } + + *datap = data; + return 0; +#else + int i; + i = ipath_ipathfs_unit_read(unitno, "flash", datap); + if (i < 0) + return -1; + else + return 0; +#endif +} + +/* + * This is for diag function ipath_hideous_ioctl_emulator() only + */ +int infinipath_put_unit_flash(int unitno, char *data, int len) +{ +#ifdef __MIC__ + int ret; + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_PUT_UNIT_FLASH; + cmd.cmd.mic_info.unit = unitno; + cmd.cmd.mic_info.data1 = len; + + ret = ipath_scif_send(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = ipath_scif_send(data, len); + if (ret) return ret; + + ret = ipath_scif_recv(&cmd, sizeof(cmd)); + if (ret) return ret; + + ret = cmd.cmd.mic_info.data1; + if (ret < 0) errno = cmd.cmd.mic_info.data2; + return ret; +#else + int i; + i = ipath_ipathfs_unit_write(unitno, "flash", data, len); + if (i < 0) + return -1; + else + return 0; +#endif +} diff --git a/ipath/ipath_sysfs.c b/ipath/ipath_sysfs.c new file mode 100644 index 0000000..9065f8b --- /dev/null +++ b/ipath/ipath_sysfs.c @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __MIC__ +// This file contains a simple sysfs interface used by the low level +// infinipath protocol code. It also implements the interface to ipathfs. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +static char *sysfs_path; +static size_t sysfs_path_len; +static char *ipathfs_path; +static long sysfs_page_size; + +static void __attribute__((constructor)) sysfs_init(void) +{ + struct stat s; + if (sysfs_path == NULL) + sysfs_path = getenv("IPATH_SYSFS_PATH"); + if (sysfs_path == NULL) { + static char syspath[64]; + snprintf(syspath, sizeof(syspath), + "%s%d", QIB_CLASS_PATH, 0); + sysfs_path = syspath; + } + if(stat(sysfs_path, &s) || !S_ISDIR(s.st_mode)) + _IPATH_DBG("Did not find sysfs directory %s, using anyway\n", + sysfs_path); + sysfs_path_len = strlen(sysfs_path); + + if (ipathfs_path == NULL) + ipathfs_path = getenv("IPATH_IPATHFS_PATH"); + if (ipathfs_path == NULL) + ipathfs_path = "/ipathfs"; + + if (!sysfs_page_size) + sysfs_page_size = sysconf(_SC_PAGESIZE); +} + +const char *ipath_sysfs_path(void) +{ + return sysfs_path; +} + +size_t ipath_sysfs_path_len(void) +{ + return sysfs_path_len; +} + +const char *ipath_ipathfs_path(void) +{ + return ipathfs_path; +} + +int ipath_sysfs_open(const char *attr, int flags) +{ + char buf[1024]; + int saved_errno; + int fd; + + snprintf(buf, sizeof(buf), "%s/%s", ipath_sysfs_path(), attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open driver attribute '%s': %s\n", attr, + strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_ipathfs_open(const char *attr, int flags) +{ + char buf[1024]; + int saved_errno; + int fd; + + snprintf(buf, sizeof(buf), "%s/%s", ipath_ipathfs_path(), attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open driver attribute '%s': %s\n", attr, + strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +static int sysfs_vprintf(int fd, const char *fmt, va_list ap) +{ + char *buf; + int len, ret; + int saved_errno; + + buf = alloca(sysfs_page_size); + len = 
vsnprintf(buf, sysfs_page_size, fmt, ap); + + if (len > sysfs_page_size) { + _IPATH_DBG("Attempt to write more (%d) than %ld bytes\n", len, + sysfs_page_size); + saved_errno = EINVAL; + ret = -1; + goto bail; + } + + ret = write(fd, buf, len); + saved_errno = errno; + + if (ret != -1 && ret < len) { + _IPATH_DBG("Write ran short (%d < %d)\n", ret, len); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + errno = saved_errno; + return ret; +} + +int ipath_sysfs_printf(const char *attr, const char *fmt, ...) +{ + int fd = -1; + va_list ap; + int ret = -1; + int saved_errno; + + fd = ipath_sysfs_open(attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to driver attribute '%s': %s\n", attr, + strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + int len, l; + + snprintf(buf, sizeof(buf), "%s", ipath_sysfs_path()); + len = l = strlen(buf) - 1; + while(l > 0 && isdigit(buf[l])) + l--; + if(l) + buf[++l] = 0; + else + l = len; /* assume they know what they are doing */ + snprintf(buf+l, sizeof(buf)-l, "%u/%s", unit, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_sysfs_port_open(uint32_t unit, uint32_t port, const char *attr, + int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + int len, l; + + snprintf(buf, sizeof(buf), "%s", ipath_sysfs_path()); + len = l = strlen(buf) - 1; + while(l > 0 && isdigit(buf[l])) + l--; + if(l) + buf[++l] = 0; + else + l = len; /* assume they know what they are doing */ + snprintf(buf+l, sizeof(buf)-l, "%u/ports/%u/%s", unit, port, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d:%d: %s\n", attr, + unit, port, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_ipathfs_unit_open(uint32_t unit, const char *attr, int flags) +{ + int saved_errno; + char buf[1024]; + int fd; + + snprintf(buf, sizeof(buf), "%s/%u/%s", ipath_ipathfs_path(), unit, attr); + fd = open(buf, flags); + saved_errno = errno; + + if (fd == -1) { + _IPATH_DBG("Failed to open attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + _IPATH_DBG("Offending file name: %s\n", buf); + } + + errno = saved_errno; + return fd; +} + +int ipath_sysfs_port_printf(uint32_t unit, uint32_t port, const char *attr, + const char *fmt, ...) +{ + va_list ap; + int ret = -1; + int saved_errno; + int fd; + + fd = ipath_sysfs_port_open(unit, port, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_printf(uint32_t unit, const char *attr, + const char *fmt, ...) 
+{ + va_list ap; + int ret = -1; + int saved_errno; + int fd; + + fd = ipath_sysfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) { + goto bail; + } + + va_start(ap, fmt); + ret = sysfs_vprintf(fd, fmt, ap); + saved_errno = errno; + va_end(ap); + + if (ret == -1) { + _IPATH_DBG("Failed to write to attribute '%s' of unit %d: %s\n", attr, + unit, strerror(errno)); + } + +bail: + if (fd != -1) + close(fd); + + errno = saved_errno; + return ret; +} + +static int read_page(int fd, char **datap) +{ + char *data = NULL; + int saved_errno; + int ret = -1; + + data = malloc(sysfs_page_size); + saved_errno = errno; + + if (!data) { + _IPATH_DBG("Could not allocate memory: %s\n", strerror(errno)); + goto bail; + } + + ret = read(fd, data, sysfs_page_size); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Read of attribute failed: %s\n", strerror(errno)); + goto bail; + } + +bail: + if (ret == -1) { + free(data); + } else { + *datap = data; + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_read(const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_sysfs_port_read(uint32_t unit, uint32_t port, const char *attr, + char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_sysfs_port_open(unit, port, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_write(uint32_t unit, const char *attr, const void *data, + size_t len) +{ + int fd = -1, ret = -1; + int saved_errno; + + if (len > sysfs_page_size) { + _IPATH_DBG("Attempt to write more (%ld) than %ld bytes\n", (long) len, + sysfs_page_size); + saved_errno = EINVAL; + goto bail; + } + + fd = ipath_sysfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = write(fd, data, len); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Attempt to write %ld bytes failed: %s\n", + (long) len, strerror(errno)); + goto bail; + } + + if (ret < len) { // sysfs routines can routine count including null byte + // so don't return an error if it's > len + _IPATH_DBG("Attempt to write %ld bytes came up short (%ld bytes)\n", + (long) len, (long) ret); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. 
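+ *
+ * A minimal call sequence (illustrative; the attribute name is made up
+ * for the example):
+ *
+ *   char *data = NULL;
+ *   int n = ipath_ipathfs_read("some_attr", &data);
+ *   if (n >= 0) {
+ *       // n is the byte count read; at most one page is returned
+ *       free(data);
+ *   }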
+ */ +int ipath_ipathfs_read(const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * On return, caller must free *datap. + */ +int ipath_ipathfs_unit_read(uint32_t unit, const char *attr, char **datap) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read_page(fd, datap); + saved_errno = errno; + +bail: + if (ret == -1) + *datap = NULL; + + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +/* + * The _rd routines jread directly into a supplied buffer, + * unlike the _read routines. + */ +int ipath_ipathfs_rd(const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_open(attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_ipathfs_unit_rd(uint32_t unit, const char *attr, void *buf, int n) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_RDONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = read(fd, buf, n); + saved_errno = errno; + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_ipathfs_unit_write(uint32_t unit, const char *attr, const void *data, + size_t len) +{ + int fd = -1, ret = -1; + int saved_errno; + + fd = ipath_ipathfs_unit_open(unit, attr, O_WRONLY); + saved_errno = errno; + + if (fd == -1) + goto bail; + + ret = write(fd, data, len); + saved_errno = errno; + + if (ret == -1) { + _IPATH_DBG("Attempt to write %ld bytes failed: %s\n", + (long) len, strerror(errno)); + goto bail; + } + + if (ret != len) { + _IPATH_DBG("Attempt to write %ld bytes came up short (%ld bytes)\n", + (long) len, (long) ret); + saved_errno = EAGAIN; + ret = -1; + } + +bail: + if (fd != -1) { + close(fd); + } + + errno = saved_errno; + return ret; +} + +int ipath_sysfs_read_s64(const char *attr, int64_t *valp, int base) +{ + char *data, *end; + int ret; + int saved_errno; + long long val; + + ret = ipath_sysfs_read(attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +int ipath_sysfs_unit_read_s64(uint32_t unit, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = ipath_sysfs_unit_read(unit, attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +int ipath_sysfs_port_read_s64(uint32_t unit, uint32_t port, const char *attr, + int64_t *valp, int base) +{ + char *data, *end; + int saved_errno; + long long val; + int ret; + + ret = ipath_sysfs_port_read(unit, port, 
attr, &data); + saved_errno = errno; + + if (ret == -1) { + goto bail; + } + + val = strtoll(data, &end, base); + saved_errno = errno; + + if (!*data || !(*end == '\0' || isspace(*end))) { + ret = -1; + goto bail; + } + + *valp = val; + ret = 0; + +bail: + free(data); + errno = saved_errno; + return ret; +} + +#endif //__MIC__ diff --git a/ipath/ipath_syslog.c b/ipath/ipath_syslog.c new file mode 100644 index 0000000..b4ea8c4 --- /dev/null +++ b/ipath/ipath_syslog.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define __USE_GNU +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +#define SYSLOG_MAXLEN 512 + +extern char *__ipath_mylabel; + +void +ipath_vsyslog(const char *prefix, int to_console, int level, + const char *format, va_list ap) +{ + char logprefix[SYSLOG_MAXLEN]; + + if (to_console) { + char hostname[80]; + va_list ap_cons; + va_copy(ap_cons, ap); + size_t len = strlen(format); + gethostname(hostname, sizeof hostname); + hostname[sizeof hostname - 1] = '\0'; + + if (__ipath_mylabel) + fprintf(stderr, "%s", __ipath_mylabel); + else + fprintf(stderr, "%s: ", hostname); + + vfprintf(stderr, format, ap_cons); + if (format[len] != '\n') + fprintf(stderr, "\n"); + fflush(stderr); + va_end(ap_cons); + } + + (void)snprintf(logprefix, sizeof(logprefix), + "(ipath/%s)[%d]: %s", prefix ? prefix : "ipath", (int) getpid(), + format); + + vsyslog(level | LOG_USER, logprefix, ap); + + return; +} + +void +ipath_syslog(const char *prefix, int to_console, int level, + const char *format, ...) +{ + va_list ap; + va_start(ap, format); + ipath_vsyslog(prefix, to_console, level, format, ap); + va_end(ap); +} + diff --git a/ipath/ipath_time.c b/ipath/ipath_time.c new file mode 100644 index 0000000..ca3faa8 --- /dev/null +++ b/ipath/ipath_time.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define __USE_GNU +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_user.h" + +// init the cycle counter to picosecs/cycle conversion automatically +// at program startup, if it's using timing functions. +static void init_picos_per_cycle(void) __attribute__ ((constructor)); +static int ipath_timebase_isvalid(uint32_t pico_per_cycle); +static uint32_t ipath_timebase_from_cpuinfo(uint32_t old_pico_per_cycle); + +// in case two of our mechanisms fail +#ifdef __powerpc__ +#define SAFEDEFAULT_PICOS_PER_CYCLE 69000 +#else +#define SAFEDEFAULT_PICOS_PER_CYCLE 500 +#endif + +uint32_t __ipath_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + +// This isn't perfect, but it's close enough for rough timing. We want this +// to work on systems where the cycle counter isn't the same as the clock +// frequency. +// __ipath_pico_per_cycle isn't going to lead to completely accurate +// conversions from timestamps to nanoseconds, but it's close enough for +// our purposes, which is mainly to allow people to show events with nsecs +// or usecs if desired, rather than cycles. We use it in some performance +// analysis, but it has to be done with care, since cpuspeed can change, +// different cpu's can have different speeds, etc. +// +// Some architectures don't have their TSC-equivalent running at anything +// related to the the processor speed (e.g. G5 Power systems use a fixed +// 33 MHz frequency). + +#define MIN_TEST_TIME_IN_PICOS (100000000000LL) /* 100 milliseconds */ + +static int timebase_debug = 0; /* off by default */ + +#define timebase_warn_always(fmt,...) \ + ipath_syslog("timebase", 1, LOG_ERR, fmt, ##__VA_ARGS__) +#define timebase_warn(fmt,...) if (timebase_debug) \ + timebase_warn_always(fmt, ##__VA_ARGS__) + +static int ipath_timebase_isvalid(uint32_t pico_per_cycle) +{ +#if defined(__x86_64__) || defined(__i386__) + /* If pico-per-cycle is less than 200, the clock speed would be greater + * than 5 GHz. Similarly, we minimally support a 1GHz clock. 
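+ * (Worked example: a 2.5 GHz TSC ticks once every 10^12 / 2.5e9 = 400
+ * picoseconds, comfortably inside the accepted window.)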
+ * Allow some slop, because newer kernels with HPET can be a few + * units off, and we don't want to spend the startup time needlessly */ + if (pico_per_cycle >= 198 && pico_per_cycle <= 1005) + return 1; +#elif defined(__powerpc__) + /* If pico-per-cycle is not between 1MHz and 1GHz, complain */ + if (pico_per_cycle >= 9950 && pico_per_cycle <= 1005000) + return 1; +#endif + else + return 0; +} + +/* + * Method #1: + * + * Derive the pico-per-cycle by trying to correlate the difference between two + * reads of the tsc counter to gettimeofday. + */ +static void init_picos_per_cycle() +{ + struct timeval tvs, tve; + int64_t usec = 0; + uint64_t ts, te; + int64_t delta; + uint32_t picos = 0; + int trials = 0; + int retry = 0; + cpu_set_t cpuset, cpuset_saved; + int have_cpuset = 1; + + /* + * Make sure we try to calculate the cycle time without being migrated. + */ + CPU_ZERO(&cpuset_saved); + if (sched_getaffinity(0, sizeof cpuset, &cpuset_saved)) + have_cpuset = 0; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if(have_cpuset && sched_setaffinity(0,sizeof cpuset, &cpuset)) + have_cpuset = 0; + + /* + * If we set affinity correctly, give the scheduler another change to put + * us on processor 0 + */ + if (have_cpuset) + sched_yield(); + +retry_pico_test: + if (++retry == 10) { + __ipath_pico_per_cycle = + ipath_timebase_from_cpuinfo(picos); + goto reset_cpu_mask; /* Reset CPU mask before exiting */ + } + + usec = 0; + gettimeofday(&tvs, NULL); + ts = get_cycles(); + while (usec < MIN_TEST_TIME_IN_PICOS) { /* wait for at least 100 millisecs */ + trials++; + usleep(125); + gettimeofday(&tve, NULL); + usec = 1000000LL * (tve.tv_usec - tvs.tv_usec) + + 1000000000000LL * (tve.tv_sec - tvs.tv_sec); + if (usec < 0) { + timebase_warn("RTC timebase, gettimeofday is negative (!) %lld\n", + (long long) usec); + goto retry_pico_test; + } + } + te = get_cycles(); + delta = te - ts; + picos = (uint32_t)(usec / delta); + + if (!ipath_timebase_isvalid(picos)) { + cpu_set_t cpuget; + int affinity_valid = !sched_getaffinity(0, sizeof cpuget, &cpuget); + if (affinity_valid && !CPU_ISSET(0, &cpuget)) + affinity_valid = 0; + timebase_warn("Failed to get valid RTC timebase, gettimeofday delta=%lld, " + "rtc delta=%lld, picos_per_cycle=%d affinity_valid=%s (trial %d/10)\n", + (long long) usec, (long long) delta, picos, + affinity_valid ? "YES" : "NO", retry); + goto retry_pico_test; + } + + /* If we've had to retry even once, let that be known */ + if (retry > 1) + timebase_warn("Clock is %d picos/cycle found in %d trials and " + "%.3f seconds (retry=%d)\n", picos, trials, + (double) usec / 1.0e12, retry); + + __ipath_pico_per_cycle = picos; + + reset_cpu_mask: + /* Restore affinity */ + if (have_cpuset) { + sched_setaffinity(0, sizeof cpuset, &cpuset_saved); + /* + * Give a chance to other processes that also set affinity to 0 for + * doing this test. + */ + sched_yield(); + } +} + +/* + * Method #2: + * + * Derive the pico-per-cycle from /proc instead of using sleep trick + * that relies on scheduler. 
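+ *
+ * On x86 this amounts to inverting the "cpu MHz" line of /proc/cpuinfo
+ * (sketch with an assumed sample value):
+ *
+ *   cpu MHz : 2500.000   =>   1000000 / 2500.0 = 400 picos/cycle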
+ */ +static uint32_t +ipath_timebase_from_cpuinfo(uint32_t old_pico_per_cycle) +{ + /* we only validate once */ + uint32_t new_pico_per_cycle = old_pico_per_cycle; + + char hostname[80]; + gethostname(hostname, 80); + hostname[sizeof hostname - 1] = '\0'; + + if (getenv("IPATH_DEBUG_TIMEBASE")) + timebase_debug = 1; + + /* If the old one is valid, don't bother with this mechanism */ + if (ipath_timebase_isvalid(old_pico_per_cycle)) + return old_pico_per_cycle; + +#if defined(__x86_64__) || defined(__i386__) + { + FILE *fp = fopen("/proc/cpuinfo","r"); + char input[255]; + char *p = NULL; + + if (!fp) + goto fail; + + while (!feof(fp) && fgets(input, 255, fp)) { + if (strstr(input,"cpu MHz")) { + p = strchr(input,':'); + double MHz = 0.0; + if (p) MHz = atof(p+1); + new_pico_per_cycle = (uint32_t)(1000000. / MHz); + break; + } + } + fclose(fp); + if (!p) + goto fail; + } +#elif defined(__powerpc__) + #include + #include + { + DIR *dp = opendir("/proc/device-tree/cpus"); + uint32_t freq; + FILE *fp = NULL; + char buf[256]; + struct dirent *de = NULL; + int found = 0; + if (!dp) + goto fail; + do { + de = readdir(dp); + if (de && (de->d_name == strstr(de->d_name, "PowerPC,"))) { + found = 1; + break; + } + } while (de != NULL); + if (!found) + goto fail; + + snprintf(buf, sizeof buf, + "/proc/device-tree/cpus/%s/timebase-frequency", de->d_name); + if ((fp = fopen(buf, "r"))) { + if (fread((void *) &freq, sizeof(uint32_t), 1, fp) != 1) + goto fail; + /* freq is in Hz */ + new_pico_per_cycle = 1e6 / (freq / 1e6); + fclose(fp); + } + else + goto fail; + } +#endif + + /* If there's no change (within a small range), just return the old one */ + if (abs(new_pico_per_cycle - old_pico_per_cycle) < 5) + return old_pico_per_cycle; + + if (ipath_timebase_isvalid(new_pico_per_cycle)) { + timebase_warn_always("RTC timebase, using %d picos/cycle from /proc " + "instead of the detected %d picos/cycle\n", + new_pico_per_cycle, old_pico_per_cycle); + return new_pico_per_cycle; + } + +fail: + new_pico_per_cycle = SAFEDEFAULT_PICOS_PER_CYCLE; + timebase_warn_always( + "Problem obtaining CPU time base, detected to be %d " + "pico/cycle, adjusted to safe default %d picos/cycle", + old_pico_per_cycle, new_pico_per_cycle); + return new_pico_per_cycle; +} + diff --git a/ipath/ipath_utils.c b/ipath/ipath_utils.c new file mode 100644 index 0000000..4df8189 --- /dev/null +++ b/ipath/ipath_utils.c @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// This file contains the ipath service routine interface used by the
+// low-level infinipath protocol code.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "ipserror.h"
+#include "ipath_user.h"
+
+int __ipath_malloc_no_mmap = 0; // keep track of whether we disabled mmap in malloc
+
+// This exists as a separate routine called on (very rare)
+// ipath_update_tid() errors, so as to avoid pulling unnecessary code
+// into the instruction cache, keeping the fast path code as fast as possible.
+int ipath_update_tid_err(void)
+{
+ int ret = errno; // preserve errno for return
+
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return ret;
+}
+
+// This exists as a separate routine called on (very rare)
+// ipath_free_tid() errors, so as to avoid pulling unnecessary code
+// into the instruction cache, keeping the fast path code as fast as possible.
+int ipath_free_tid_err(void)
+{
+ int ret = errno; // preserve errno for return
+
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return ret;
+}
+
+// touch the pages, with a 32 bit read
+void ipath_touch_mmap(void *m, size_t bytes)
+{
+ volatile uint32_t *b = (volatile uint32_t *)m, c;
+ size_t i; // m is always page aligned, so pgcnt exact
+ int __ipath_pg_sz;
+
+ /* First get the page size */
+ __ipath_pg_sz = sysconf(_SC_PAGESIZE);
+
+ _IPATH_VDBG("Touch %lu mmap'ed pages starting at %p\n", (unsigned long) bytes/__ipath_pg_sz, m);
+ bytes /= sizeof c;
+ for(i=0; ispc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) {
+ if (errno != EINVAL)
+ _IPATH_INFO("failed: %s\n", strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+// flush the eager buffers, by setting the eager index head to eager index tail
+// if the eager buffer queue is full.
+//
+// Called when we had eager buffer overflows (ERR_TID/INFINIPATH_RHF_H_TIDERR
+// was set in RHF errors), and no good eager packets were received, so
+// that the eager head wasn't advanced.
+//
+
+void ipath_flush_egr_bufs(struct _ipath_ctrl *ctrl)
+{
+ uint32_t head = __le32_to_cpu(*ctrl->__ipath_rcvegrhead);
+ uint32_t tail = __le32_to_cpu(*ctrl->__ipath_rcvegrtail);
+
+ if((head%ctrl->__ipath_tidegrcnt) == ((tail+1)%ctrl->__ipath_tidegrcnt)) {
+ _IPATH_DBG("eager array full after overflow, flushing (head %llx, tail %llx)\n",
+ (long long)head, (long long)tail);
+ *ctrl->__ipath_rcvegrhead = __cpu_to_le32(tail);
+ }
+}
+
+// stop_start == 0 disables receive on the context, for use in queue
+// overflow conditions.
stop_start==1 re-enables, to be used to +// re-init the software copy of the head register +int ipath_manage_rcvq(struct _ipath_ctrl *ctrl, uint32_t stop_start) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_RECV_CTRL; + cmd.cmd.recv_ctrl = stop_start; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// enable == 1 enables armlaunch (normal), 0 disables (only used +// ipath_pkt_test -B at the moment, needed for linda). +int ipath_armlaunch_ctrl(struct _ipath_ctrl *ctrl, uint32_t enable) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_ARMLAUNCH_CTRL; + cmd.cmd.armlaunch_ctrl = enable; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// force PIOAvail register to be updated to memory +int ipath_force_pio_avail_update(struct _ipath_ctrl *ctrl) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_PIOAVAILUPD; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// ack event bits, and clear them. Usage is check *spi_sendbuf_status, +// pass bits you are prepared to handle to ipath_event_ack(), perform the +// appropriate actions for bits that were set, and then (if appropriate) +// check the bits again. +int ipath_event_ack(struct _ipath_ctrl *ctrl, __u64 ackbits) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_ACK_EVENT; + cmd.cmd.event_mask = ackbits; + + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver. */ + _IPATH_DBG("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// Disarm any send buffers which need disarming. +int ipath_disarm_bufs(struct _ipath_ctrl *ctrl) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_DISARM_BUFS; + + if (ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver. */ + _IPATH_DBG("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// Wait until send dma completion reaches at least 'completion_counter' +int ipath_sdma_complete(struct _ipath_ctrl *ctrl, uint32_t *counter) +{ + struct ipath_cmd cmd; + int ret; + + cmd.type = IPATH_CMD_SDMA_COMPLETE; + cmd.cmd.sdma_cntr = (uintptr_t) counter; + VALGRIND_MAKE_MEM_DEFINED(&cmd, sizeof(struct ipath_cmd)); + + *counter = 0; + if ((ret = ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd))) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + return 1; +} + +// Return send dma's current "in flight counter " +int ipath_sdma_inflight(struct _ipath_ctrl *ctrl, uint32_t *counter) +{ + struct ipath_cmd cmd; + int ret; + + cmd.type = IPATH_CMD_SDMA_INFLIGHT; + cmd.cmd.sdma_cntr = (uintptr_t) counter; + VALGRIND_MAKE_MEM_DEFINED(&cmd, sizeof(struct ipath_cmd)); + + *counter = 0; + if ((ret = ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd))) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s (errno=%d)\n", strerror(errno), errno); + return -1; + } + return 1; +} + +// Tell the driver to change the way packets can generate interrupts. 
+// +// IPATH_POLL_TYPE_URGENT: Generate interrupt only when packet sets +// INFINIPATH_KPF_INTR +// IPATH_POLL_TYPE_ANYRCV: wakeup on any rcv packet (when polled on). +// +// PSM: Uses TYPE_URGENT in ips protocol +// +int ipath_poll_type(struct _ipath_ctrl *ctrl, uint16_t poll_type) +{ + struct ipath_cmd cmd; + + cmd.type = IPATH_CMD_POLL_TYPE; + cmd.cmd.poll_type = poll_type; + + if(ipath_cmd_write(ctrl->spc_dev.spd_fd, &cmd, sizeof(cmd)) == -1) { + if (errno != EINVAL) /* not implemented in driver */ + _IPATH_INFO("failed: %s\n", strerror(errno)); + return -1; + } + return 0; +} + +// wait for a received packet for our context +// This allows us to not busy wait, if nothing has happened for a +// while, which allows better measurements of cpu utilization, and +// in some cases, slightly better performance. Called where we would +// otherwise call sched_yield(). It is not guaranteed that a packet +// has arrived, so the normal checking loop(s) should be done. +// +// PSM: not used as is, PSM has it's own use of polling for interrupt-only +// packets (sets ipath_poll_type to TYPE_URGENT) +int ipath_wait_for_packet(struct _ipath_ctrl *ctrl) +{ + return ipath_cmd_wait_for_packet(ctrl->spc_dev.spd_fd); +} + +int ipath_hideous_ioctl_emulator(int unit, int reqtype, struct ipath_eeprom_req *req) +{ + switch (reqtype) { + case IPATH_READ_EEPROM: + { + // Emulate a read of a byte range by doing a full read, then + // getting the bits we want. + char *data; + + if (infinipath_get_unit_flash(unit, &data) == -1) { + if (data) free(data); + return -1; + } + + memcpy((char *) (unsigned long) req->addr, data + req->offset, + req->len); + + free(data); + + break; + } + case IPATH_WRITE_EEPROM: + { + // Emulate a write to a byte range by doing a full read, + // modifying the bits we want, then a full write. + char *data; + int len; + + len = infinipath_get_unit_flash(unit, &data); + + if (len == -1) { + if (data) free(data); + return -1; + } + + memcpy(data + req->offset, (char *) (unsigned long) req->addr, + req->len); + + if (infinipath_put_unit_flash(unit, data, len) == -1) { + free(data); + return -1; + } + + free(data); + + break; + } + default: + fprintf(stderr, "invalid hideous emulated ioctl: %d\n", reqtype); + exit(1); + } + return 0; +} + +// check if the chip/board are in an OK state. If not, +// print a message and return an error code. Used at +// places where we are going to be in slow mode anyway, +// such as open, close, and out of pio buffers +// +// PSM: implemented in context abstraction psmi_context_check_status() +// As of 7322-ready driver, need to check port-specific qword for IB +// as well as older unit-only. For now, we don't have the port interface +// defined, so just check port 0 qword for spi_status +// Hard-code spmsg as 3rd qword until we have IB port +int ipath_check_unit_status(struct _ipath_ctrl *ctrl) +{ + char *spmsg = NULL, *msg = NULL, buf[80]; + int rc = IPS_RC_OK; + _Pragma_unlikely + + if(!ctrl->__ipath_spi_status) + return rc; + + if( !(ctrl->__ipath_spi_status[0] & IPATH_STATUS_CHIP_PRESENT) || + (ctrl->__ipath_spi_status[0] & (IPATH_STATUS_HWERROR))) { + rc = IPS_RC_DEVICE_ERROR; + if(ctrl->lasterr != rc) { // only report once + spmsg = (char*)&ctrl->__ipath_spi_status[2]; // string for hardware error, if any + if(!*spmsg) { + msg = buf; + snprintf(buf, sizeof buf, "%s\n", + (ctrl->__ipath_spi_status[0] & IPATH_STATUS_HWERROR) ? 
+ "Hardware error" : "Hardware not found"); + } + } + } + else if (!(ctrl->__ipath_spi_status[0] & IPATH_STATUS_IB_CONF) && + !(ctrl->__ipath_spi_status[1] & IPATH_STATUS_IB_CONF)) { + rc = IPS_RC_NETWORK_DOWN; + if(ctrl->lasterr != rc) // only report once + spmsg = (char*)&ctrl->__ipath_spi_status[2]; // string for hardware error, if any + } + else if (!(ctrl->__ipath_spi_status[0] & IPATH_STATUS_IB_READY) && + !(ctrl->__ipath_spi_status[1] & IPATH_STATUS_IB_READY)) { + // if only this error, probably cable pulled, switch rebooted, etc. + // report it the first time, and then treat it same as BUSY, since + // it could be recovered from within the quiescence period + rc = IPS_RC_BUSY; + if(ctrl->lasterr != rc) // only report once + msg = "IB Link is down"; + } + if(spmsg && *spmsg) { + _IPATH_ERROR("Hardware problem: %s\n", spmsg); + // and try to get it out to user before returning error so mpirun shows + // since mpi interface code will normally exit immediately on errors + fflush(stdout); + sleep(1); + } + else if(msg) + _IPATH_DBG("%s\n", msg); + if(ctrl->lasterr && rc==IPS_RC_OK) + ctrl->lasterr = 0; // cleared up, report if it happens again + else if(rc != IPS_RC_OK) + ctrl->lasterr = rc; + return rc; +} + +/* These have been fixed to read the values, but they are not + * compatible with the ipath driver, they return new info with + * the qib driver + */ +static int infinipath_count_names(const char *namep) +{ + int n = 0; + while (*namep != '\0') { + if (*namep == '\n') + n++; + namep++; + } + return n; +} + +const char * infinipath_get_next_name(char **names) +{ + char *p, *start; + + p = start = *names; + while (*p != '\0' && *p != '\n') { + p++; + } + if (*p == '\n') { + *p = '\0'; + p++; + *names = p; + return start; + } else + return NULL; +} + +void infinipath_release_names(char *namep) +{ + /* TODO: names were initialised in the data section before. Now + * they are allocated when ipath_ipathfs_read() is called. Allocation + * for names is done only once at init time. Should we eventually + * have an "stats_type_unregister" type of routine to explicitely + * deallocate memory and free resources ? 
+ */ +#if 0 + if (namep != NULL) + free(namep); +#endif +} + +int infinipath_get_stats_names_count() +{ + char *namep; + int c; + + c = infinipath_get_stats_names(&namep); + free(namep); + return c; +} + +int infinipath_get_ctrs_unit_names_count(int unitno) +{ + char *namep; + int c; + + c = infinipath_get_ctrs_unit_names(unitno, &namep); + free(namep); + return c; +} + +int infinipath_get_ctrs_port_names_count(int unitno) +{ + char *namep; + int c; + + c = infinipath_get_ctrs_port_names(unitno, &namep); + free(namep); + return c; +} + +int infinipath_lookup_stat(const char *attr, char *namep, uint64_t *stats, + uint64_t *s) +{ + const char *p; + int i, ret = -1, len = strlen(attr); + int nelem = infinipath_count_names(namep); + + for (i = 0; i < nelem; i++) { + p = infinipath_get_next_name(&namep); + if (p == NULL) break; + if (strncasecmp(p, attr, len+1) == 0) { + ret = i; + *s = stats[i]; + } + } + return ret; +} + +uint64_t infinipath_get_single_stat(const char *attr, uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_stats_names(&namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_stats(stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +uint64_t infinipath_get_single_unitctr(int unit, const char *attr, uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_ctrs_unit_names(unit, &namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_ctrs_unit(unit, stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +int infinipath_get_single_portctr(int unit, int port, const char *attr, + uint64_t *s) +{ + int nelem, n = 0, ret = -1; + char *namep = NULL; + uint64_t *stats = NULL; + + nelem = infinipath_get_ctrs_port_names(unit, &namep); + if (nelem == -1 || namep == NULL) + goto bail; + stats = calloc(nelem, sizeof(uint64_t)); + if (stats == NULL) + goto bail; + n = infinipath_get_ctrs_port(unit, port, stats, nelem); + if (n != nelem) + goto bail; + ret = infinipath_lookup_stat(attr, namep, stats, s); +bail: + if (namep != NULL) + free(namep); + if (stats != NULL) + free(stats); + return ret; +} + +/* + * Add a constructor function to disable mmap if asked to do so by the user + */ +static void init_mallopt_disable_mmap(void) __attribute__ ((constructor)); + +static void init_mallopt_disable_mmap(void) +{ + char *env = getenv("IPATH_DISABLE_MMAP_MALLOC"); + + if (env && *env) { + if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { + __ipath_malloc_no_mmap = 1; + } + } + + return; +} diff --git a/ipath/ipath_write_pio-i386.c b/ipath/ipath_write_pio-i386.c new file mode 100644 index 0000000..603edc1 --- /dev/null +++ b/ipath/ipath_write_pio-i386.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs require fence after first 32 bits of pbc write + // Can't do as uint64_t store, or compiler could reorder + ips_wmb(); + *piob++ = buf.pbcflags; + + if(!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (IPATH_MESSAGE_HDR_SIZE >> 2)-1; + ipath_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if(len>16) { + uint32_t pay_words = 16*((len-1)/16); + ipath_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + // now write the final chunk a word at a time, fence before trigger + for(j=0;j<(len-1);j++) + *piob++ = *pay2++; + ips_wmb(); // flush the buffer out now, so + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs needs compiler fence to prevent compiler reordering + // the two 32 bit stores in a uint64_t, but on inorder wc systems, does not + // need a memory fence. 
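+ // (Note: an empty asm with a "memory" clobber, as below, is a
+ // compiler-only barrier: it blocks compiler reordering but emits no
+ // fence instruction, which is the cheaper guarantee this in-order
+ // write-combining path relies on.)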
+ asm volatile("" : : : "memory"); + *piob++ = buf.pbcflags; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +static inline void ipath_write_pio_special_trigger(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata, + unsigned offset) +{ + union ipath_pbc buf = {0}; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *piob++ = buf.dword; + // 32 bit programs needs compiler fence to prevent compiler reordering + // the two 32 bit stores in a uint64_t, but on inorder wc systems, does not + // need a memory fence. + asm volatile("" : : : "memory"); + *piob++ = buf.pbcflags; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if (pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); + *(piobs + offset) = IPATH_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} + diff --git a/ipath/ipath_write_pio-ppc.c b/ipath/ipath_write_pio-ppc.c new file mode 100644 index 0000000..f6bda57 --- /dev/null +++ b/ipath/ipath_write_pio-ppc.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +union piovec { + vector unsigned int vec; + uint32_t dw[4]; +}; + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint32_t *dpiob = (volatile uint32_t *)piob; + uint32_t *dhdr = hdr; + uint32_t *ddata = bdata; + uint32_t dlen = pioparm->length >> 2; + union piovec vec; + volatile vector unsigned int *vpiob; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + vpiob = (volatile vector unsigned int *)dpiob; + + vec.dw[0] = buf.dword; + vec.dw[1] = 0; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + ips_wmb(); + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr++; + *vpiob++ = vec.vec; + + vec.dw[0] = *dhdr++; + vec.dw[1] = *dhdr++; + vec.dw[2] = *dhdr++; + vec.dw[3] = *dhdr; + + if ( !dlen ) { + ips_wmb(); + *vpiob++ = vec.vec; + dpiob = (volatile uint32_t *) vpiob; + } else { + *vpiob++ = vec.vec; + + while ( dlen > 4 ) { + vec.dw[0] = *ddata++; + vec.dw[1] = *ddata++; + vec.dw[2] = *ddata++; + vec.dw[3] = *ddata++; + *vpiob++ = vec.vec; + dlen -= 4; + } + + switch ( dlen ) { + + case 4: { + vec.dw[0] = *ddata++; + vec.dw[1] = *ddata++; + vec.dw[2] = *ddata++; + vec.dw[3] = *ddata; + ips_wmb(); + *vpiob++ = vec.vec; + dpiob = (volatile uint32_t *) vpiob; + } break; + + case 3: { + dpiob = (volatile uint32_t *)vpiob; + *dpiob++ = *ddata++; + *dpiob++ = *ddata++; + ips_wmb(); + *dpiob++ = *ddata; + } break; + + case 2: { + dpiob = (volatile uint32_t *)vpiob; + *dpiob++ = *ddata++; + ips_wmb(); + *dpiob++ = *ddata; + } break; + + case 1: { + dpiob = (volatile uint32_t *)vpiob; + ips_wmb(); + *dpiob++ = *ddata; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *dpiob = pioparm->cksum; + dpiob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *dpiob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. + */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint32_t *dpiob = piob; + uint32_t *dhdr = hdr; + uint32_t *ddata = bdata; + uint32_t dlen = pioparm->length >> 2; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *dpiob++ = buf.dword; + asm volatile("" : : : "memory"); + *dpiob++ = 0; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + *dpiob++ = *dhdr++; + if ( !dlen ) { + asm volatile("" : : : "memory"); + *dpiob++ = *dhdr; + } else { + *dpiob++ = *dhdr; + + while ( dlen > 1 ) { + *dpiob++ = *ddata++; + dlen -= 1; + } + + asm volatile("" : : : "memory"); + *dpiob++ = *ddata; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *dpiob = pioparm->cksum; + dpiob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *dpiob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 2k support for ppc\n"); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 4k support for ppc\n"); +} diff --git a/ipath/ipath_write_pio-ppc64.c b/ipath/ipath_write_pio-ppc64.c new file mode 100644 index 0000000..c7f8764 --- /dev/null +++ b/ipath/ipath_write_pio-ppc64.c @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. 
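/*
 * Editorial sketch, not part of the original tree: the Altivec routines in
 * this file batch scalar values into a 16-byte "piovec" union and emit each
 * chunk as a single vector store, so the write-combining PIO buffer is
 * always filled in whole, address-ordered 16-byte units. This assumes the
 * Altivec vector types and fixed-width integer types that the includes
 * below provide.
 */
static inline void piovec_store16_sketch(volatile vector unsigned int *dst,
                                         uint64_t q0, uint64_t q1)
{
	union {
		vector unsigned int vec;
		uint64_t qw[2];
	} chunk;

	chunk.qw[0] = q0;	/* assemble the 16-byte chunk first... */
	chunk.qw[1] = q1;
	*dst = chunk.vec;	/* ...then issue one vector store to WC space */
}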
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +#include + +union piovec { + vector unsigned int vec; + uint64_t qw[2]; +}; + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint64_t *qpiob = (volatile uint64_t *)piob; + uint64_t *qhdr = hdr; + uint64_t *qdata = bdata; + uint64_t dlen = pioparm->length >> 2; + union piovec vec; + volatile vector unsigned int *vpiob; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + vpiob = (volatile vector unsigned int *)qpiob; + + vec.qw[0] = buf.qword; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + ips_wmb(); + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr++; + *vpiob++ = vec.vec; + + vec.qw[0] = *qhdr++; + vec.qw[1] = *qhdr; + + if ( !dlen ) { + ips_wmb(); + *vpiob++ = vec.vec; + piob = (volatile uint32_t*) qpiob; + } else { + *vpiob++ = vec.vec; + + while ( dlen > 4 ) { + vec.qw[0] = *qdata++; + vec.qw[1] = *qdata++; + *vpiob++ = vec.vec; + dlen -= 4; + } + + switch ( dlen ) { + + case 4: { + vec.qw[0] = *qdata++; + vec.qw[1] = *qdata; + ips_wmb(); + *vpiob++ = vec.vec; + piob = (volatile uint32_t*) qpiob; + } break; + + case 3: { + volatile uint32_t *dpiob; + uint32_t *ddata; + qpiob = (volatile uint64_t *)vpiob; + *qpiob++ = *qdata++; + dpiob = (volatile uint32_t *)qpiob; + ddata = (uint32_t *)qdata; + ips_wmb(); + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + + case 2: { + qpiob = (volatile uint64_t *)vpiob; + ips_wmb(); + *qpiob++ = *qdata; + piob = (volatile uint32_t*) qpiob; + } break; + + case 1: { + volatile uint32_t *dpiob = (volatile uint32_t *)vpiob; + uint32_t *ddata = (uint32_t *)qdata; + ips_wmb(); + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + ips_wmb(); + + return; +} + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. 
+ */ +void ipath_write_pio(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + volatile uint64_t *qpiob = (volatile uint64_t *)piob; + uint64_t *qhdr = hdr; + uint64_t *qdata = bdata; + uint64_t dlen = pioparm->length >> 2; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len) >> 2) + dlen + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *qpiob++ = buf.qword; + asm volatile("" : : : "memory"); // prevent compiler reordering + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + *qpiob++ = *qhdr++; + if ( !dlen ) { + asm volatile("" : : : "memory"); // prevent compiler reordering + *qpiob++ = *qhdr; + piob = (volatile uint32_t*) qpiob; + } else { + *qpiob++ = *qhdr; + + while ( dlen > 2 ) { + *qpiob++ = *qdata++; + dlen -= 2; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + + switch ( dlen ) { + + case 2: { + *qpiob++ = *qdata; + piob = (volatile uint32_t*) qpiob; + } break; + + case 1: { + volatile uint32_t *dpiob = (volatile uint32_t *)qpiob; + uint32_t *ddata = (uint32_t *)qdata; + + *dpiob++ = *ddata; + piob = (volatile uint32_t*) dpiob; + } break; + } + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); + + return; +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 2k support for ppc64\n"); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + _IPATH_ERROR("no special trigger 4k support for ppc64\n"); +} diff --git a/ipath/ipath_write_pio-x86_64.c b/ipath/ipath_write_pio-x86_64.c new file mode 100644 index 0000000..a5d47d7 --- /dev/null +++ b/ipath/ipath_write_pio-x86_64.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains the initialization functions used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipserror.h" +#include "ipath_user.h" + +/* + * These pio copy routines are here so they can be used by test code, as well + * as by MPI, and can change independently of MPI +*/ + +/* + * for processors that may not write store buffers in the order filled, + * and when the store buffer is not completely filled (partial at end, or + * interrupted and flushed) may write the partial buffer in + * "random" order. requires additional serialization +*/ +void ipath_write_pio_force_order(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc buf = {.qword = 0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + ips_wmb(); // pbc must be forced to be first write to chip buffer + piob += 2; + + if(!pioparm->length) { + uint32_t *dhdr, dcpywords; + dcpywords = (IPATH_MESSAGE_HDR_SIZE >> 2)-1; + ipath_dwordcpy_safe(piob, hdr, dcpywords); + ips_wmb(); + dhdr = hdr; + piob += dcpywords; + dhdr += dcpywords; + *piob++ = *dhdr; + } else { + uint32_t *pay2 = bdata, j; + uint32_t len = pioparm->length; + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + len >>= 2; + if(len>16) { + uint32_t pay_words = 16*((len-1)/16); + ipath_dwordcpy_safe(piob, pay2, pay_words); + piob += pay_words; + pay2 += pay_words; + len -= pay_words; + } + // now write the final chunk a word at a time, fence before trigger + for(j=0;j<(len-1);j++) + *piob++ = *pay2++; + ips_wmb(); // flush the buffer out now, so + *piob++ = *pay2; + } + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + ips_wmb(); + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + + +/* + * for processors that always write store buffers in the order filled, + * and if store buffer not completely filled (partial at end, or + * interrupted and flushed) always write the partial buffer in + * address order. Avoids serializing and flush instructions + * where possible. 
+ */ +#ifdef __MIC__ +void ipath_write_pio_vector(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc *pbc; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + pbc = (union ipath_pbc *)((char *)hdr - 8); + pbc->qword = 0; + pbc->length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + pbc->pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + pbc->pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + +#ifdef PSM_DEBUG + if (((uint64_t)piob) & 63) { + _IPATH_ERROR("ipath_write_pio_vector(): piob not 64byte aligned\n"); + return; + } + if (((uint64_t)pbc) & 63) { + _IPATH_ERROR("ipath_write_pio_vector(): pbc not 64byte aligned\n"); + return; + } +#endif + memcpy((uint32_t *)piob, pbc, IPATH_MESSAGE_HDR_SIZE+8); + piob += (IPATH_MESSAGE_HDR_SIZE >> 2) + 2; + + if(pioparm->length) + memcpy((uint32_t *)piob, (uint32_t*)bdata, pioparm->length); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reorder + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + //ips_wmb(); +} +#endif //__MIC__ + +void ipath_write_pio(volatile uint32_t *piob, const struct ipath_pio_params *pioparm, + void *hdr, void *bdata) +{ + union ipath_pbc buf = {0}; + uint32_t cksum_len = pioparm->cksum_is_valid ? + IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if (pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT | + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + ipath_dwordcpy_safe(piob, hdr, IPATH_MESSAGE_HDR_SIZE >> 2); + + asm volatile("" : : : "memory"); // prevent compiler reordering + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reorder + *piob = pioparm->cksum; + } + + /* send it on it's way, now, rather than waiting for processor to + * get around to flushing it */ + ips_wmb(); +} + +/* + * here we trigger on a "special" address, so just bang it out + * as fast as possible... + */ +static inline void +ipath_write_pio_special_trigger(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, + void *hdr, void *bdata, unsigned offset) +{ + union ipath_pbc buf = {0}; + volatile uint32_t *piobs = piob; + uint32_t cksum_len = pioparm->cksum_is_valid ? 
+ IPATH_CRC_SIZE_IN_BYTES : 0; + + buf.length = + __cpu_to_le16(((IPATH_MESSAGE_HDR_SIZE + cksum_len + pioparm->length) >> 2) + 1); + if(pioparm->port > 1) + buf.pbcflags = __cpu_to_le32((pioparm->vl << __PBC_VLSHIFT) | + __PBC_IBPORT | + pioparm->rate); + else + buf.pbcflags = __cpu_to_le32(pioparm->vl << __PBC_VLSHIFT| + pioparm->rate); + + *(volatile uint64_t *)piob = buf.qword; + piob += 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + ipath_dwordcpy_safe(piob, hdr, + IPATH_MESSAGE_HDR_SIZE >> 2); + piob += IPATH_MESSAGE_HDR_SIZE >> 2; + asm volatile("" : : : "memory"); // prevent compiler reordering + + if(pioparm->length) + ipath_dwordcpy_safe(piob, (uint32_t*)bdata, pioparm->length>>2); + + /* If checksum is enabled insert CRC at end of packet */ + if_pf (pioparm->cksum_is_valid){ + int nCRCopies = IPATH_CRC_SIZE_IN_BYTES >> 2; + int nCRC = 0; + + piob += pioparm->length >> 2; + + while (nCRC < (nCRCopies-1)) { + *piob = pioparm->cksum; + piob++; + nCRC++; + } + + asm volatile("" : : : "memory"); // prevent compiler reordering + *piob = pioparm->cksum; + } + + /* + * flush then write "special" then flush... + */ + ips_wmb(); + *(piobs + offset) = IPATH_SPECIAL_TRIGGER_MAGIC; + ips_wmb(); +} + +void ipath_write_pio_special_trigger2k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 1023); +} + +void ipath_write_pio_special_trigger4k(volatile uint32_t *piob, + const struct ipath_pio_params *pioparm, void *hdr, void *bdata) +{ + ipath_write_pio_special_trigger(piob, pioparm, hdr, bdata, 2047); +} diff --git a/libuuid/COPYING b/libuuid/COPYING new file mode 100644 index 0000000..2f17068 --- /dev/null +++ b/libuuid/COPYING @@ -0,0 +1,25 @@ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, and the entire permission notice in its entirety, + including the disclaimer of warranties. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. The name of the author may not be used to endorse or promote + products derived from this software without specific prior + written permission. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF +WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. 
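All of the architecture variants above implement the same contract, which is easy to lose in the per-CPU detail: the two PBC dwords must reach the chip buffer before anything else, the header and payload are streamed in address order, and in most variants the final dword is held back behind a fence because it can trigger the send. The condensation below is an editorial sketch; pio_order_sketch and its parameters are hypothetical names, while ips_wmb() and the buffer layout come from the code above.

/*
 * Sketch of the shared PIO ordering discipline (in-order variant).
 * Assumes n >= 1 payload dwords and a buffer laid out as two PBC dwords,
 * then the header, then the payload.
 */
static void pio_order_sketch(volatile uint32_t *piob, const uint32_t *pbc2,
			     const uint32_t *hdr, unsigned hdr_dw,
			     const uint32_t *payload, unsigned n)
{
	unsigned i;

	*piob++ = pbc2[0];			/* PBC reaches the chip first */
	asm volatile("" : : : "memory");	/* compiler fence only: in-order WC */
	*piob++ = pbc2[1];

	for (i = 0; i < hdr_dw; i++)		/* header, in address order */
		*piob++ = hdr[i];

	for (i = 0; i + 1 < n; i++)		/* all payload but the last dword */
		*piob++ = payload[i];

	asm volatile("" : : : "memory");	/* fence before the dword that
						 * may trigger the send */
	*piob++ = payload[n - 1];

	ips_wmb();				/* flush the WC buffer now */
}

The force-order variants replace the first compiler fence with a full ips_wmb(), and the special-trigger variants additionally write IPATH_SPECIAL_TRIGGER_MAGIC at a fixed buffer offset between two ips_wmb() calls after the packet body.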
diff --git a/libuuid/ChangeLog b/libuuid/ChangeLog new file mode 100644 index 0000000..b90e063 --- /dev/null +++ b/libuuid/ChangeLog @@ -0,0 +1,556 @@ +2006-06-30 Theodore Ts'o + + * Release of E2fsprogs 1.38 + +2005-03-21 Theodore Ts'o + + * Release of E2fsprogs 1.37 + +2006-02-05 Theodore Ts'o + + * Release of E2fsprogs 1.36 + +2005-02-05 Theodore Ts'o + + * Makefile.in: Remove uuid.pc on a "make distclean" + +2005-01-26 Theodore Ts'o + + * uuid.pc.in: Add pkg-config files. + +2005-01-18 Theodore Ts'o + + * Makefile.in: Fix the kernel compile-time echo commands to be + consistent and portable + +2005-01-17 Theodore Ts'o + + * uuidP.h: Use inttypes.h in preference to stdint.h for + compatibility with older FreeBSD and Solaris systems. + +2004-12-14 Theodore Ts'o + + * Makefile.in: Use Linux-kernel-style makefile output for "make + install" + + * Makefile.in (installdirs): Use $(MKINSTALLDIRS) macro. + Update dependencies. + +2004-11-30 Theodore Ts'o + + * Makefile.in: Use Linux-kernel-style makefile output to make it + easier to see errors/warnings. + +2004-09-17 Theodore Ts'o + + * gen_uuid.c (get_node_id): glibc always defines AF_LINK, so only + try to use struct sockaddr_dl if HAVE_NET_IF_DL_H is + defined. (Addresses Debian Bug #256669) + +2004-05-27 Theodore Ts'o + + * uuid.h (UUID_DEFINE): Make the UUID defined as a static + variable, with __attribute__ ((unused)) if we are using GCC. + +2004-05-04 Theodore Ts'o + + * Update and clean up uuid man pages + + * gen_uuid.c (uuid_generate_time): Mask off the timestamp to avoid + a Y8.8888K problem. + +2004-04-03 Theodore Ts'o + + * Makefile.in: Update the modtime even if subst doesn't need to + update the libuuid man pages, to avoid always re-running + subst, especially since there are no dependencies on the + man page. + +2004-04-03 Theodore Ts'o + + * libuuid.3.in, uuid_clear.3.in, uuid_compare.3.in, uuid_copy.3.in, + uuid_generate.3.in, uuid_is_null.3.in, uuid_parse.3.in, + uuid_time.3.in, uuid_unparse.3.in: Change licensing of man + pages from GPL to 3-clause BSD-style. + + * uuid_parse.3.in, uuid_unparse.3.in: Change the use of the term + "internal format" to "binary representation". + + * gen_uuid.c, pack.c, unpack.c, uuid_time.c, uuidP.h, + uuid_types.h.in: Use ANSI C99 types if stdint.h exists. + +2004-03-30 Theodore Ts'o + + * gen_uuid.c (get_node_id): Clean up AF_LINK #ifdef's for Darwin. + +2004-03-22 Theodore Ts'o + + * unparse.c (uuid_unparse_lower, uuid_unparse_upper), + uuid_unparse.3.in, uuid.h: Add new functions. + +2004-03-19 Theodore Ts'o + + * Change the license to be the 3-clause BSD-style license + + * uuid.h (UUID_DEFINE): Add UUID type #define's, and add an CPP + macro to define UUID constants. + + * gen_uuid.c (get_clock): Use 14 bits for the clock sequence, + instead of just 13 bits. + + * gen_uuid.c (get_node_id): Fix so that Darwin will actually get + the ethernet address correctly. + +2004-02-29 Brian Bergstrand + + * Makefile.in: Use $(BSDLIB_PIC_FLAG) to determine whether to use + -fpic or -fPIC + +2004-02-28 Theodore Ts'o + + * Release of E2fsprogs 1.35 + +2004-01-30 Theodore Ts'o + + * gen_uuid.c (uuid_generate_time): Fix bug pointed out by Ralf + S. Engelshall; when generating a random ethernet address + because one is not available, set the least significant + bit of the first byte of the MAC address, since it is the + first bit to be transmitted, and is therefore the + multicast bit. 
+ +2003-07-25 Theodore Ts'o + + * Release of E2fsprogs 1.34 + +2003-04-21 Theodore Ts'o + + * Release of E2fsprogs 1.33 + +2003-04-21 Theodore Ts'o + + * Makefile.in: Use DYLD_LIBRAY_PATH so that "make check" works on + Darwin systems when building with shared libraries. + +2003-04-12 Theodore Ts'o + + * gen_uuid.c: Add #ifdef checks around #include and + . + +2003-04-03 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Always xor in a stream of bytes + from the system PRNG (i.e., random/srandom, seeded from + the time, pid, and uid) in case /dev/random isn't doing + the right thing on a particular system. It doesn't hurt, + and it can help, in the case of a buggy /dev/random. + +2003-03-14 Theodore Ts'o + + * Makefile.in: Add support for Apple Darwin + +2003-03-06 Theodore Tso + + * uuid_types.h.in: Don't redefine types if other e2fsprogs + *_types.h files have been included already. + + * Makefile.in (tst_uuid): Link against the static library instead + of all of the object files, so that we automatically pick + up -lsocket under Solaris. + +2003-03-02 Theodore Ts'o + + * Makefile.in, uuidP.h, uuid_types.h.in: Use uuid_types.h instead + of ext2_types.h + +2002-11-09 Theodore Ts'o + + * Release of E2fsprogs 1.32 + +2002-11-08 Theodore Ts'o + + * Release of E2fsprogs 1.31 + +2002-10-31 Theodore Ts'o + + * Release of E2fsprogs 1.30 + +2002-10-31 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Don't spin forever if read() + returns EINTR or EAGAIN, so that when /dev/random is + opened O_NONBLOCK, we don't end up spinning forever. + +2001-09-24 Theodore Tso + + * Release of E2fsprogs 1.29 + +2001-08-31 Theodore Tso + + * Release of E2fsprogs 1.28 + +2002-07-15 Theodore Ts'o + + * parse.c (uuid_parse): Fix uuid parsing bug which didn't complain + for certain types of invalid input text. (Addresses + Debian bug #152891). + + * tst_uuid.c: Add test cases for invalid text strings passed to + uuid_parse. + +2002-03-08 Theodore Tso + + * Release of E2fsprogs 1.27 + +2002-02-24 Theodore Tso + + * Makefile.in (install): Install hard links to man pages for + uuid_generate_random and uuid_generate_time. Remove + any compressed man pages before installing the man pages. + +2002-02-03 Theodore Tso + + * Release of E2fsprogs 1.26 + +2001-09-20 Theodore Tso + + * Release of E2fsprogs 1.25 + +2001-09-10 Theodore Tso + + * compare.c (uuid_compare), copy.c (uuid_copy), + isnull.c (uuid_is_null), pack.c (uuid_pack), + parse.c (uuid_parse), unpack.c (uuid_unpack), + unparse.c (uuid_unparse), uuid.h, uuidP.h, + uuid_time.c (uuid_time, uuid_type, uuid_variant): + Use const for pointer variables that we don't modify. Add + the appropriate ifdef's in uuid.h to make it be C++ friendly. + +2001-09-02 Theodore Tso + + * Release of E2fsprogs 1.24a + +2001-08-30 Theodore Tso + + * Release of E2fsprogs 1.24 + +2001-08-15 Theodore Tso + + * Release of E2fsprogs 1.23 + +2001-06-23 Theodore Tso + + * Release of E2fsprogs 1.22 + +2001-06-21 Theodore Tso + + * uuid.h: Add protection against multiple inclusion + +2001-06-15 Theodore Tso + + * Release of E2fsprogs 1.21 + +2001-06-01 Theodore Tso + + * Makefile.in, uuidP.h: Move include/asm/types.h.in to + lib/ext2fs/ext2_types.h.in. + +2001-06-01 Theodore Tso + + * unpack.c, unparse.c, uuid_time.c: Update files to be under the + LGPL (that somehow were missed when libuuid was converted + to use the LGPL). Whoops. 
+ +2001-05-25 Theodore Tso + + * Release of E2fsprogs 1.20 + +2001-05-14 Theodore Tso + + * tst_uuid.c, uuid_time.c: Remove unneeded #include of ext2_fs.h + +2001-05-12 Theodore Tso + + * libuuid.3.in, uuid_clear.3.in, uuid_compare.3.in, uuid_copy.3.in, + uuid_generate.3.in, uuid_is_null.3.in, uuid_parse.3.in, + uuid_time.3.in, uuid_unparse.3.in: Update URL location of + e2fsprogs package. + +2001-05-01 Theodore Tso + + * parse.c, compare.c: Include string.h to fix gcc -Wall + complaints. + + * gen_uuid.c: Define _SVID_SOURCE to avoid gcc -Wall errors + because some required structures wouldn't be otherwise + defined. Fix a minor gcc -Wall nit in the declaration of + get_random_fd(). + +2001-01-12 Theodore Ts'o + + * uuid_time.c (main), tst_uuid.c (main): Fix gcc -Wall complaints. + + * uuid.h, copy.c (uuid_copy): Change arguments to make it + clear which argument is the source and which is the + destination. + + * gen_uuid.c (get_random_fd): Use gettimeofday to seed the PRNG, + so we can take advantage of tv_usec to do (slightly) + better at seeding it. + +2000-07-13 + + * Release of E2fsprogs 1.19 + +2000-07-07 Theodore Ts'o + + * Makefile.in (uuid_time): Fix compilation rule so that + uuid_time.o doesn't get bashed in order to build the + command-line version of uuid_time. + +2000-07-04 Theodore Ts'o + + * Makefile.in: Remove explicit link of -lc in the shared library. + (It shouldn't be necessary, and is harmful in some cases). + +2000-06-12 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Use O_NONBLOCK when trying to + open /dev/random. Break out the /dev/random + initialization code into a get_random_fd() function, and + use that function in uuid_generate() to determine whether + to use uuid_generate_random() or uuid_generate_time(). + +2000-05-25 + + * Makefile: Add hack dependency rule so that parallel makes work + correctly. + +2000-04-07 Theodore Ts'o + + * clear.c, compare.c, copy.c, gen_uuid.c, isnull.c, pack.c, + parse.c, uuid.h, uuidP.h: Changed copyright to be the + LGPL. + +Thu Apr 6 17:38:58 2000 Theodore Y. Ts'o + + * Makefile.in (uuid_time): Compile uuid_time in two steps (first + create .o, then link it against the libraries) to work + around bug in a.out linker. + + * dll/jump.funcs, dll/jump.import, dll/jump.params: Update a.out + shared library control files to reflect new added files. + +2000-04-03 Theodore Ts'o + + * gen_uuid.c (get_clock): Fix bug where the last timeval wasn't + getting set, causing potentially duplicate UUID's to be + generated. + +2000-03-12 Theodore Ts'o + + * gen_uuid.c (get_random_bytes): Make more paranoid about + misbehaving /dev/urandom. If we get a return of zero + without an error more than 8 times in a row, we break out + and return an error. Also, if /dev/urandom doesn't exist, + try /dev/random. + +2000-01-18 Theodore Ts'o + + * Makefile.in: Since LIBUUID can sometimes include + "-lsocket" we need a separate DEPLIBUUID that can be used + in Makefile's dependency rules. + +1999-11-19 + + * Makefile.in (distclean): Remove TAGS and Makefile.in.old from + the source directory. + +1999-11-10 + + * Release of E2fsprogs 1.18 + +1999-10-26 + + * Release of E2fsprogs 1.17 + +1999-10-26 + + * uuid_time.c (variant_string): Declare to be static to avoid gcc + warnings. + + * uuid.h: Add function prototypes for uuid_generate_random() and + uuid_generate_time(). + +1999-10-25 + + * gen_uuid_nt.c (uuid_generate): W2K strikes again! An + incompatible interface change means we need to detect + whether the code is running on an NT4 or NT5 system. 
+ +1999-10-22 + + * Release of E2fsprogs 1.16 + +1999-10-21 + + * uuid_generate.8.in: Update man page to use a more standard + format (bold option flags and italicized variables), as + suggested by Andreas Dilger (adilger@enel.ucalgary.ca) + +1999-09-24 + + * gen_uuid_nt.c: New file which creates a UUID under Windows NT. + +1999-07-18 Theodore Ts'o + + * Release of E2fsprogs 1.15 + +1999-05-17 + + * gen_uuid.c (get_random_bytes): Use a while loop when reading + from /dev/urandom so that if we get interrupted while + reading the right thing happens. + (uuid_generate_random): Add new function which uses the + new UUID format which uses 122 random bits to form the + 128-bit UUID. + (uuid_generate): Rename the old uuid_generate to be + uuid_generate_time, and create a new uuid_generate + function which calls either uuid_generate_random or + uuid_genereate_time depending on whether /dev/urandom is + present. + + * uuid_generate.3.in: Update to reflect changesin uuid_generate + and its two new variants. + + * tst_uuid.c: Updated to test new uuid_generate functions, and to + reflect new semantics of uuid_compare. Added tests to + make sure the UUID type and variant created by UUID + generate is correct. + + * uuid_time.c (uuid_variant, uuid_type): Added new functions to + return the UUID variant and type information. The + debugging program now prints the UUID variant and type, + and warns if the unparsed time information is likely to be + incorrect. + + * uuid_parse.3.in, libuuid.3.in: Miscellaneous text cleanups. + +1999-05-03 + + * compare.c (uuid_compare): Change sense of uuid_compare so that + its return values match that of memcpy and the + uuid_compare() found in Paul Leach's internet-draft. + +1999-03-11 Andreas Dilger + + * Created man pages for libuuid functions. + +1999-01-09 Theodore Ts'o + + * Release of E2fsprogs 1.14 + +1998-12-15 Theodore Ts'o + + * Release of E2fsprogs 1.13 + +1998-12-04 Theodore Ts'o + + * Makefile.in: Update version numbers of the UUID shared library, + since we've added a new function (uuid_time()). + + * uuid_time.c: New file which returns the time field of a UUID. + (Good for debugging purposes) + +1998-07-09 Theodore Ts'o + + * Release of E2fsprogs 1.12 + +1998-06-25 Theodore Ts'o + + * tst_uuid.c (main): Fixed bogus declaration of the main's argv + parameter. + +1998-04-26 Theodore Ts'o + + * uuidP.h: Use asm/types.h instead of linux/types.h to avoid a + problem caused by glibc hack to prevent linux/types.h from + being included. + +1998-03-30 Theodore Ts'o + + * Makefile.in: Change to use new installation directory variables + convention. Fix uninstall rules to take $(DESTDIR) into + account. + +Sun Mar 8 22:17:59 1998 Theodore Ts'o + + * gen_uuid.c (get_node_id): Use char * instead of caddr_t, which + doesn't always exist for glibc. + +Tue Oct 14 21:48:16 1997 Theodore Ts'o + + * gen_uuid.c: Use clock_reg instead of clock, since clock + conflicts with a header file declaration. + +Tue Jun 17 01:33:20 1997 Theodore Ts'o + + * Release of E2fsprogs 1.11 + +Thu Apr 24 12:16:42 1997 Theodre Ts'o + + * Release of E2fsprogs version 1.10 + +Thu Apr 17 12:23:38 1997 Theodore Ts'o + + * Release of E2fsprogs version 1.09 + +Fri Apr 11 18:56:26 1997 Theodore Ts'o + + * Release of E2fsprogs version 1.08 + +Wed Mar 12 13:32:05 1997 Theodore Y. 
Ts'o
+
+	* Release of E2fsprogs version 1.07
+
+Sun Mar 2 16:45:36 1997 Theodore Ts'o
+
+	* Makefile.in (ELF_VERSION): Change version to be 1.1
+
+Thu Feb 6 23:08:07 1997 Theodore Ts'o
+
+	* gen_uuid.c (uuid_generate): Set Multicast bit when picking a
+		random node_id, to prevent conflicts with IEEE 802
+		addresses obtained from network cards.
+
+Wed Jan 1 23:51:09 1997 Theodore Ts'o
+
+	* unpack.c, pack.c: Include string.h, since we use memcpy().
+
+Tue Dec 3 13:05:11 1996 Theodore Ts'o
+
+	* parse.c: Add #include of ctype.h and stdlib.h, to pull in the
+		required prototypes.
+
+Fri Oct 11 17:15:10 1996 Theodore Ts'o
+
+	* Makefile.in (DLL_ADDRESS): Updated DLL address for libuuid.
+
+Tue Oct 8 02:02:03 1996 Theodore Ts'o
+
+	* Release of E2fsprogs version 1.06
+
+Thu Sep 12 15:23:07 1996 Theodore Ts'o
+
+	* Release of E2fsprogs version 1.05
+
+Tue Aug 27 16:50:43 1996 Miles Bader
+
+	* uuid/gen_uuid.c [HAVE_NET_IF_H]: Include guarded.
+	  [HAVE_NETINET_IN_H]: Include guarded.
+	  (get_node_id): Surround bulk of function with #ifdef HAVE_NET_IF_H.
+
+Tue Aug 27 16:50:16 1996 Theodore Ts'o
+
+	* gen_uuid.c (get_node_id): Add a specific ifdef for the HURD,
+		since it is broken w.r.t. getting hardware addresses.
diff --git a/libuuid/Makefile b/libuuid/Makefile
new file mode 100644
index 0000000..ebe3643
--- /dev/null
+++ b/libuuid/Makefile
@@ -0,0 +1,45 @@
+# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved.
+# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+#   - Redistributions of source code must retain the above
+#     copyright notice, this list of conditions and the following
+#     disclaimer.
+#
+#   - Redistributions in binary form must reproduce the above
+#     copyright notice, this list of conditions and the following
+#     disclaimer in the documentation and/or other materials
+#     provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+include $(top_srcdir)/buildflags.mak
+CFLAGS += -DPSM_UUID=1 -Wno-unused-function
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/libuuid
+
+${TARGLIB}-objs := psm_uuid.o
+
+all: ${${TARGLIB}-objs}
+
+%.o: %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	rm -f *.o
diff --git a/libuuid/clear.c b/libuuid/clear.c
new file mode 100644
index 0000000..bb52682
--- /dev/null
+++ b/libuuid/clear.c
@@ -0,0 +1,44 @@
+/*
+ * clear.c -- Clear a UUID
+ *
+ * Copyright (C) 1996, 1997 Theodore Ts'o.
+ *
+ * %Begin-Header%
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "string.h" + +#include "uuidP.h" + +UUID_STATIC +void uuid_clear(uuid_t uu) +{ + memset(uu, 0, 16); +} + diff --git a/libuuid/compare.c b/libuuid/compare.c new file mode 100644 index 0000000..0a7dc9c --- /dev/null +++ b/libuuid/compare.c @@ -0,0 +1,56 @@ +/* + * compare.c --- compare whether or not two UUID's are the same + * + * Returns 0 if the two UUID's are different, and 1 if they are the same. + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" +#include + +#define UUCMP(u1,u2) if (u1 != u2) return((u1 < u2) ? 
-1 : 1); + +UUID_STATIC +int uuid_compare(const uuid_t uu1, const uuid_t uu2) +{ + struct uuid uuid1, uuid2; + + uuid_unpack(uu1, &uuid1); + uuid_unpack(uu2, &uuid2); + + UUCMP(uuid1.time_low, uuid2.time_low); + UUCMP(uuid1.time_mid, uuid2.time_mid); + UUCMP(uuid1.time_hi_and_version, uuid2.time_hi_and_version); + UUCMP(uuid1.clock_seq, uuid2.clock_seq); + return memcmp(uuid1.node, uuid2.node, 6); +} + diff --git a/libuuid/copy.c b/libuuid/copy.c new file mode 100644 index 0000000..37b03b2 --- /dev/null +++ b/libuuid/copy.c @@ -0,0 +1,46 @@ +/* + * copy.c --- copy UUIDs + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" + +UUID_STATIC +void uuid_copy(uuid_t dst, const uuid_t src) +{ + unsigned char *cp1; + const unsigned char *cp2; + int i; + + for (i=0, cp1 = dst, cp2 = src; i < 16; i++) + *cp1++ = *cp2++; +} diff --git a/libuuid/gen_uuid.c b/libuuid/gen_uuid.c new file mode 100644 index 0000000..a946f79 --- /dev/null +++ b/libuuid/gen_uuid.c @@ -0,0 +1,322 @@ +/* + * gen_uuid.c --- generate a DCE-compatible uuid + * + * Copyright (C) 1996, 1997, 1998, 1999 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +/* + * Force inclusion of SVID stuff since we need it if we're compiling in + * gcc-wall wall mode + */ +#ifndef _SVID_SOURCE +# define _SVID_SOURCE +#endif + +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_STDLIB_H +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_IOCTL_H +#include +#endif +#ifdef HAVE_SYS_SOCKET_H +#include +#endif +#ifdef HAVE_SYS_SOCKIO_H +#include +#endif +#ifdef HAVE_NET_IF_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_NET_IF_DL_H +#include +#endif + +#include "psm.h" + +#include "uuidP.h" + +#ifdef HAVE_SRANDOM +#define srand(x) srandom(x) +#define rand() random() +#endif + +static int get_random_fd(void) +{ + struct timeval tv; + static int fd = -2; + int i; + + if (fd == -2) { + gettimeofday(&tv, 0); + fd = open("/dev/urandom", O_RDONLY); + if (fd == -1) + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + srand((getpid() << 16) ^ getuid() ^ tv.tv_sec ^ tv.tv_usec); + } + /* Crank the random number generator a few times */ + gettimeofday(&tv, 0); + for (i = (tv.tv_sec ^ tv.tv_usec) & 0x1F; i > 0; i--) + rand(); + return fd; +} + +/* + * Generate a series of random bytes. Use /dev/urandom if possible, + * and if not, use srandom/random. + */ +static void get_random_bytes(void *buf, int nbytes) +{ + int i, n = nbytes, fd = get_random_fd(); + int lose_counter = 0; + unsigned char *cp = (unsigned char *) buf; + + if (fd >= 0) { + while (n > 0) { + i = read(fd, cp, n); + if (i <= 0) { + if (lose_counter++ > 16) + break; + continue; + } + n -= i; + cp += i; + lose_counter = 0; + } + } + + /* + * We do this all the time, but this is the only source of + * randomness if /dev/random/urandom is out to lunch. + */ + for (cp = buf, i = 0; i < nbytes; i++) + *cp++ ^= (rand() >> 7) & 0xFF; + return; +} + +/* + * Get the ethernet hardware address, if we can find it... + */ +static int get_node_id(unsigned char *node_id) +{ +#ifdef HAVE_NET_IF_H + int sd; + struct ifreq ifr, *ifrp; + struct ifconf ifc; + char buf[1024]; + int n, i; + unsigned char *a; +#ifdef HAVE_NET_IF_DL_H + struct sockaddr_dl *sdlp; +#endif + +/* + * BSD 4.4 defines the size of an ifreq to be + * max(sizeof(ifreq), sizeof(ifreq.ifr_name)+ifreq.ifr_addr.sa_len + * However, under earlier systems, sa_len isn't present, so the size is + * just sizeof(struct ifreq) + */ +#ifdef HAVE_SA_LEN +#ifndef max +#define max(a,b) ((a) > (b) ? 
(a) : (b)) +#endif +#define ifreq_size(i) max(sizeof(struct ifreq),\ + sizeof((i).ifr_name)+(i).ifr_addr.sa_len) +#else +#define ifreq_size(i) sizeof(struct ifreq) +#endif /* HAVE_SA_LEN*/ + + sd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sd < 0) { + return -1; + } + memset(buf, 0, sizeof(buf)); + ifc.ifc_len = sizeof(buf); + ifc.ifc_buf = buf; + if (ioctl (sd, SIOCGIFCONF, (char *)&ifc) < 0) { + close(sd); + return -1; + } + n = ifc.ifc_len; + for (i = 0; i < n; i+= ifreq_size(*ifrp) ) { + ifrp = (struct ifreq *)((char *) ifc.ifc_buf+i); + strncpy(ifr.ifr_name, ifrp->ifr_name, IFNAMSIZ); +#ifdef SIOCGIFHWADDR + if (ioctl(sd, SIOCGIFHWADDR, &ifr) < 0) + continue; + a = (unsigned char *) &ifr.ifr_hwaddr.sa_data; +#else +#ifdef SIOCGENADDR + if (ioctl(sd, SIOCGENADDR, &ifr) < 0) + continue; + a = (unsigned char *) ifr.ifr_enaddr; +#else +#ifdef HAVE_NET_IF_DL_H + sdlp = (struct sockaddr_dl *) &ifrp->ifr_addr; + if ((sdlp->sdl_family != AF_LINK) || (sdlp->sdl_alen != 6)) + continue; + a = (unsigned char *) &sdlp->sdl_data[sdlp->sdl_nlen]; +#else + /* + * XXX we don't have a way of getting the hardware + * address + */ + close(sd); + return 0; +#endif /* HAVE_NET_IF_DL_H */ +#endif /* SIOCGENADDR */ +#endif /* SIOCGIFHWADDR */ + if (!a[0] && !a[1] && !a[2] && !a[3] && !a[4] && !a[5]) + continue; + if (node_id) { + memcpy(node_id, a, 6); + close(sd); + return 1; + } + } + close(sd); +#endif + return 0; +} + +/* Assume that the gettimeofday() has microsecond granularity */ +#define MAX_ADJUSTMENT 10 + +static int get_clock(uint32_t *clock_high, uint32_t *clock_low, uint16_t *ret_clock_seq) +{ + static int adjustment = 0; + static struct timeval last = {0, 0}; + static uint16_t clock_seq; + struct timeval tv; + unsigned long long clock_reg; + +try_again: + gettimeofday(&tv, 0); + if ((last.tv_sec == 0) && (last.tv_usec == 0)) { + get_random_bytes(&clock_seq, sizeof(clock_seq)); + clock_seq &= 0x3FFF; + last = tv; + last.tv_sec--; + } + if ((tv.tv_sec < last.tv_sec) || + ((tv.tv_sec == last.tv_sec) && + (tv.tv_usec < last.tv_usec))) { + clock_seq = (clock_seq+1) & 0x3FFF; + adjustment = 0; + last = tv; + } else if ((tv.tv_sec == last.tv_sec) && + (tv.tv_usec == last.tv_usec)) { + if (adjustment >= MAX_ADJUSTMENT) + goto try_again; + adjustment++; + } else { + adjustment = 0; + last = tv; + } + + clock_reg = tv.tv_usec*10 + adjustment; + clock_reg += ((unsigned long long) tv.tv_sec)*10000000; + clock_reg += (((unsigned long long) 0x01B21DD2) << 32) + 0x13814000; + + *clock_high = clock_reg >> 32; + *clock_low = clock_reg; + *ret_clock_seq = clock_seq; + return 0; +} + +UUID_STATIC +void uuid_generate_time(uuid_t out) +{ + static unsigned char node_id[6]; + static int has_init = 0; + struct uuid uu; + uint32_t clock_mid; + + if (!has_init) { + if (get_node_id(node_id) <= 0) { + get_random_bytes(node_id, 6); + /* + * Set multicast bit, to prevent conflicts + * with IEEE 802 addresses obtained from + * network cards + */ + node_id[0] |= 0x01; + } + has_init = 1; + } + get_clock(&clock_mid, &uu.time_low, &uu.clock_seq); + uu.clock_seq |= 0x8000; + uu.time_mid = (uint16_t) clock_mid; + uu.time_hi_and_version = ((clock_mid >> 16) & 0x0FFF) | 0x1000; + memcpy(uu.node, node_id, 6); + uuid_pack(&uu, out); +} + +UUID_STATIC +void uuid_generate_random(uuid_t out) +{ + uuid_t buf; + struct uuid uu; + + get_random_bytes(buf, sizeof(buf)); + uuid_unpack(buf, &uu); + + uu.clock_seq = (uu.clock_seq & 0x3FFF) | 0x8000; + uu.time_hi_and_version = (uu.time_hi_and_version & 0x0FFF) | 0x4000; + uuid_pack(&uu, 
out); +} + +/* + * This is the generic front-end to uuid_generate_random and + * uuid_generate_time. It uses uuid_generate_random only if + * /dev/urandom is available, since otherwise we won't have + * high-quality randomness. + */ +UUID_STATIC +void uuid_generate(uuid_t out) +{ + if (get_random_fd() >= 0) + uuid_generate_random(out); + else + uuid_generate_time(out); +} diff --git a/libuuid/isnull.c b/libuuid/isnull.c new file mode 100644 index 0000000..fb7fa3d --- /dev/null +++ b/libuuid/isnull.c @@ -0,0 +1,49 @@ +/* + * isnull.c --- Check whether or not the UUID is null + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include "uuidP.h" + +/* Returns 1 if the uuid is the NULL uuid */ +UUID_STATIC +int uuid_is_null(const uuid_t uu) +{ + const unsigned char *cp; + int i; + + for (i=0, cp = uu; i < 16; i++) + if (*cp++) + return 0; + return 1; +} + diff --git a/libuuid/pack.c b/libuuid/pack.c new file mode 100644 index 0000000..51c47ee --- /dev/null +++ b/libuuid/pack.c @@ -0,0 +1,70 @@ +/* + * Internal routine for packing UUID's + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include "uuidP.h" + +UUID_STATIC +void uuid_pack(const struct uuid *uu, uuid_t ptr) +{ + uint32_t tmp; + unsigned char *out = ptr; + + tmp = uu->time_low; + out[3] = (unsigned char) tmp; + tmp >>= 8; + out[2] = (unsigned char) tmp; + tmp >>= 8; + out[1] = (unsigned char) tmp; + tmp >>= 8; + out[0] = (unsigned char) tmp; + + tmp = uu->time_mid; + out[5] = (unsigned char) tmp; + tmp >>= 8; + out[4] = (unsigned char) tmp; + + tmp = uu->time_hi_and_version; + out[7] = (unsigned char) tmp; + tmp >>= 8; + out[6] = (unsigned char) tmp; + + tmp = uu->clock_seq; + out[9] = (unsigned char) tmp; + tmp >>= 8; + out[8] = (unsigned char) tmp; + + memcpy(out+10, uu->node, 6); +} + diff --git a/libuuid/parse.c b/libuuid/parse.c new file mode 100644 index 0000000..0773447 --- /dev/null +++ b/libuuid/parse.c @@ -0,0 +1,80 @@ +/* + * parse.c --- UUID parsing + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
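[Editorial example. uuid_pack() above serializes each multi-byte field most-significant byte first (network order), so the packed 16 bytes are identical on little- and big-endian hosts. A self-contained sketch of the same big-endian packing for one 32-bit field; pack_be32 and the test value are illustrative, not part of this tree:

#include <assert.h>
#include <stdint.h>

/* Big-endian packing of a 32-bit field, as uuid_pack() does for time_low. */
static void pack_be32(uint32_t v, unsigned char out[4])
{
    out[0] = (unsigned char)(v >> 24);
    out[1] = (unsigned char)(v >> 16);
    out[2] = (unsigned char)(v >> 8);
    out[3] = (unsigned char)v;
}

int main(void)
{
    unsigned char b[4];

    pack_be32(0x12345678, b);
    assert(b[0] == 0x12 && b[1] == 0x34 && b[2] == 0x56 && b[3] == 0x78);
    return 0;
}
]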
+ * %End-Header% + */ + +#include +#include +#include +#include + +#include "uuidP.h" + +UUID_STATIC +int uuid_parse(const char *in, uuid_t uu) +{ + struct uuid uuid; + int i; + const char *cp; + char buf[3]; + + if (strlen(in) != 36) + return -1; + for (i=0, cp = in; i <= 36; i++,cp++) { + if ((i == 8) || (i == 13) || (i == 18) || + (i == 23)) { + if (*cp == '-') + continue; + else + return -1; + } + if (i== 36) + if (*cp == 0) + continue; + if (!isxdigit(*cp)) + return -1; + } + uuid.time_low = strtoul(in, NULL, 16); + uuid.time_mid = strtoul(in+9, NULL, 16); + uuid.time_hi_and_version = strtoul(in+14, NULL, 16); + uuid.clock_seq = strtoul(in+19, NULL, 16); + cp = in+24; + buf[2] = 0; + for (i=0; i < 6; i++) { + buf[0] = *cp++; + buf[1] = *cp++; + uuid.node[i] = strtoul(buf, NULL, 16); + } + + uuid_pack(&uuid, uu); + return 0; +} diff --git a/libuuid/psm_uuid.c b/libuuid/psm_uuid.c new file mode 100644 index 0000000..fcfa94c --- /dev/null +++ b/libuuid/psm_uuid.c @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
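[Editorial example. uuid_parse() above accepts only the canonical 36-character 8-4-4-4-12 form: hyphens must sit exactly at offsets 8, 13, 18 and 23, and every other position must be a hex digit. A short driver showing both outcomes (the UUID strings are arbitrary sample values):

#include <stdio.h>
#include "uuid.h"

int main(void)
{
    uuid_t uu;

    /* Canonical form parses successfully... */
    if (uuid_parse("84949cc5-4701-4a84-895b-354c584a981b", uu) == 0)
        printf("parsed ok\n");

    /* ...a misplaced hyphen (offset 9 instead of 8) is rejected. */
    if (uuid_parse("84949cc54-701-4a84-895b-354c584a981b", uu) != 0)
        printf("rejected as expected\n");
    return 0;
}
]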
+ */ + +#if !defined(PSM_USE_SYS_UUID) + +#define STDC_HEADERS 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_UNISTD_H 1 +#define ENABLE_HTREE 1 +#define ENABLE_SWAPFS 1 +#define HAVE_LONG_LONG 1 +#define HAVE_LONG_DOUBLE 1 +#define HAVE_WCHAR_T 1 +#define HAVE_WINT_T 1 +#define HAVE_INTTYPES_H_WITH_UINTMAX 1 +#define HAVE_STDINT_H_WITH_UINTMAX 1 +#define HAVE_INTMAX_T 1 +#define HAVE_POSIX_PRINTF 1 +#define HAVE_ALLOCA_H 1 +#define HAVE_ALLOCA 1 +#define HAVE_STDLIB_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_GETPAGESIZE 1 +#define HAVE_MMAP 1 +#define INTDIV0_RAISES_SIGFPE 1 +#define HAVE_UNSIGNED_LONG_LONG 1 +#define HAVE_UINTMAX_T 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_ARGZ_H 1 +#define HAVE_LIMITS_H 1 +#define HAVE_LOCALE_H 1 +#define HAVE_NL_TYPES_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_STDDEF_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_SYS_PARAM_H 1 +#define HAVE_ASPRINTF 1 +#define HAVE_FWPRINTF 1 +#define HAVE_GETCWD 1 +#define HAVE_GETEGID 1 +#define HAVE_GETEUID 1 +#define HAVE_GETGID 1 +#define HAVE_GETUID 1 +#define HAVE_MEMPCPY 1 +#define HAVE_MUNMAP 1 +#define HAVE_PUTENV 1 +#define HAVE_SETENV 1 +#define HAVE_SETLOCALE 1 +#define HAVE_SNPRINTF 1 +#define HAVE_STPCPY 1 +#define HAVE_STRCASECMP 1 +#define HAVE_STRDUP 1 +#define HAVE_STRTOUL 1 +#define HAVE_TSEARCH 1 +#define HAVE_WCSLEN 1 +#define HAVE___ARGZ_COUNT 1 +#define HAVE___ARGZ_STRINGIFY 1 +#define HAVE___ARGZ_NEXT 1 +#define HAVE___FSETLOCKING 1 +#define HAVE_DECL__SNPRINTF 0 +#define HAVE_DECL__SNWPRINTF 0 +#define HAVE_DECL_FEOF_UNLOCKED 1 +#define HAVE_DECL_FGETS_UNLOCKED 0 +#define HAVE_DECL_GETC_UNLOCKED 1 +#define HAVE_ICONV 1 +#define ICONV_CONST +#define HAVE_LANGINFO_CODESET 1 +#define HAVE_LC_MESSAGES 1 +#define ENABLE_NLS 1 +#define HAVE_GETTEXT 1 +#define HAVE_DCGETTEXT 1 +#define HAVE_STDLIB_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_STDARG_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_ERRNO_H 1 +#define HAVE_MALLOC_H 1 +#define HAVE_MNTENT_H 1 +#define HAVE_PATHS_H 1 +#define HAVE_DIRENT_H 1 +#define HAVE_GETOPT_H 1 +#define HAVE_SETJMP_H 1 +#define HAVE_SIGNAL_H 1 +#define HAVE_TERMIOS_H 1 +#define HAVE_LINUX_FD_H 1 +#define HAVE_LINUX_MAJOR_H 1 +#define HAVE_SYS_IOCTL_H 1 +#define HAVE_SYS_PRCTL_H 1 +#define HAVE_SYS_QUEUE_H 1 +#define HAVE_SYS_SOCKET_H 1 +#define HAVE_SYS_SYSMACROS_H 1 +#define HAVE_SYS_TIME_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_WAIT_H 1 +#define HAVE_SYS_RESOURCE_H 1 +#define HAVE_NETINET_IN_H 1 +#define HAVE_SYS_MOUNT_H 1 +#define HAVE_NET_IF_H 1 +#define HAVE_VPRINTF 1 +#define HAVE_RECLEN_DIRENT 1 +#define HAVE_TYPE_SSIZE_T 1 +#define HAVE_LSEEK64_PROTOTYPE 1 +#define SIZEOF_SHORT 2 +#define SIZEOF_INT 4 +#define SIZEOF_LONG 8 +#define SIZEOF_LONG_LONG 8 +#define HAVE_INTTYPES_H 1 +#define HAVE_INTPTR_T 1 +#define HAVE_GETRUSAGE 1 +#define HAVE_LLSEEK 1 +#define HAVE_LSEEK64 1 +#define HAVE_OPEN64 1 +#define HAVE_STRTOULL 1 +#define HAVE_STRCASECMP 1 +#define HAVE_SRANDOM 1 +#define HAVE_FCHOWN 1 +#define HAVE_MALLINFO 1 +#define HAVE_FDATASYNC 1 +#define HAVE_STRNLEN 1 +#define HAVE_STRPTIME 1 +#define HAVE_SYSCONF 1 +#define HAVE_PATHCONF 1 +#define HAVE_POSIX_MEMALIGN 1 +#define HAVE_MEMALIGN 1 +#define HAVE_VALLOC 1 +#define HAVE___SECURE_GETENV 1 +#define 
HAVE_PRCTL 1 +#define HAVE_DLOPEN 1 +#define HAVE_EXT2_IOCTLS 1 + +#include "pack.c" +#include "unpack.c" +#include "clear.c" +#include "compare.c" +#include "copy.c" +#include "gen_uuid.c" +#include "isnull.c" +#include "parse.c" +#include "unparse.c" +#include "psm_help.h" + +#else /* PSM_USE_SYS_UUID */ +#include +#include "psm_user.h" +#endif + +void +__psm_uuid_generate(psm_uuid_t uuid_out) +{ + uuid_generate(uuid_out); + return; +} +PSMI_API_DECL(psm_uuid_generate) + +int +psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB) +{ + return uuid_compare(uuA, uuB); +} + +void +psmi_uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_lower(uu, out); +} + +int +psmi_uuid_parse(const char *in, uuid_t uu) +{ + return uuid_parse(in, uu); +} + diff --git a/libuuid/psm_uuid.h b/libuuid/psm_uuid.h new file mode 100644 index 0000000..5c2011f --- /dev/null +++ b/libuuid/psm_uuid.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSM_UUID_H +#define _PSM_UUID_H +int psmi_uuid_parse(const char *in, psm_uuid_t uu); +void psmi_uuid_unparse(const psm_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB); +#endif diff --git a/libuuid/tst_uuid.c b/libuuid/tst_uuid.c new file mode 100644 index 0000000..47ff06c --- /dev/null +++ b/libuuid/tst_uuid.c @@ -0,0 +1,168 @@ +/* + * tst_uuid.c --- test program from the UUID library + * + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include + +#include "uuid.h" + +static int test_uuid(const char * uuid, int isValid) +{ + static const char * validStr[2] = {"invalid", "valid"}; + uuid_t uuidBits; + int parsedOk; + + parsedOk = uuid_parse(uuid, uuidBits) == 0; + + printf("%s is %s", uuid, validStr[isValid]); + if (parsedOk != isValid) { + printf(" but uuid_parse says %s\n", validStr[parsedOk]); + return 1; + } + printf(", OK\n"); + return 0; +} + +int +main(int argc, char **argv) +{ + uuid_t buf, tst; + char str[100]; + struct timeval tv; + time_t time_reg; + unsigned char *cp; + int i; + int failed = 0; + int type, variant; + + uuid_generate(buf); + uuid_unparse(buf, str); + printf("UUID generate = %s\n", str); + printf("UUID: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + printf("\n"); + + uuid_generate_random(buf); + uuid_unparse(buf, str); + printf("UUID random string = %s\n", str); + printf("UUID: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + if (type != 4) { + printf("Incorrect UUID type; was expecting " + "4 (random type)!\n"); + failed++; + } + printf("\n"); + + uuid_generate_time(buf); + uuid_unparse(buf, str); + printf("UUID string = %s\n", str); + printf("UUID time: "); + for (i=0, cp = (unsigned char *) &buf; i < 16; i++) { + printf("%02x", *cp++); + } + printf("\n"); + type = uuid_type(buf); variant = uuid_variant(buf); + printf("UUID type = %d, UUID variant = %d\n", type, variant); + if (variant != UUID_VARIANT_DCE) { + printf("Incorrect UUID Variant; was expecting DCE!\n"); + failed++; + } + if (type != 1) { + printf("Incorrect UUID type; was expecting " + "1 (time-based type)!\\n"); + failed++; + } + tv.tv_sec = 0; + tv.tv_usec = 0; + time_reg = uuid_time(buf, &tv); + printf("UUID time is: (%ld, %ld): %s\n", tv.tv_sec, tv.tv_usec, + ctime(&time_reg)); + uuid_parse(str, tst); + if (!uuid_compare(buf, tst)) + printf("UUID parse and compare succeeded.\n"); + else { + printf("UUID parse and compare failed!\n"); + failed++; + } + uuid_clear(tst); + if (uuid_is_null(tst)) + printf("UUID clear and is null 
succeeded.\n"); + else { + printf("UUID clear and is null failed!\n"); + failed++; + } + uuid_copy(buf, tst); + if (!uuid_compare(buf, tst)) + printf("UUID copy and compare succeeded.\n"); + else { + printf("UUID copy and compare failed!\n"); + failed++; + } + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981b", 1); + failed += test_uuid("84949CC5-4701-4A84-895B-354C584A981B", 1); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981bc", 0); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981", 0); + failed += test_uuid("84949cc5x4701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc504701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-470104a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a840895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a84-895b0354c584a981b", 0); + failed += test_uuid("g4949cc5-4701-4a84-895b-354c584a981b", 0); + failed += test_uuid("84949cc5-4701-4a84-895b-354c584a981g", 0); + + if (failed) { + printf("%d failures.\n", failed); + exit(1); + } + return 0; +} diff --git a/libuuid/unpack.c b/libuuid/unpack.c new file mode 100644 index 0000000..a05d664 --- /dev/null +++ b/libuuid/unpack.c @@ -0,0 +1,64 @@ +/* + * Internal routine for unpacking UUID + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ * %End-Header% + */ + +#include +#include "uuidP.h" + +UUID_STATIC +void uuid_unpack(const uuid_t in, struct uuid *uu) +{ + const uint8_t *ptr = in; + uint32_t tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_low = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_mid = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->time_hi_and_version = tmp; + + tmp = *ptr++; + tmp = (tmp << 8) | *ptr++; + uu->clock_seq = tmp; + + memcpy(uu->node, ptr, 6); +} + diff --git a/libuuid/unparse.c b/libuuid/unparse.c new file mode 100644 index 0000000..0857f50 --- /dev/null +++ b/libuuid/unparse.c @@ -0,0 +1,79 @@ +/* + * unparse.c -- convert a UUID to string + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include + +#include "uuidP.h" + +static const char *fmt_lower = + "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x"; + +static const char *fmt_upper = + "%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X"; + +#ifdef UUID_UNPARSE_DEFAULT_UPPER +#define FMT_DEFAULT fmt_upper +#else +#define FMT_DEFAULT fmt_lower +#endif + +static void uuid_unparse_x(const uuid_t uu, char *out, const char *fmt) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + sprintf(out, fmt, + uuid.time_low, uuid.time_mid, uuid.time_hi_and_version, + uuid.clock_seq >> 8, uuid.clock_seq & 0xFF, + uuid.node[0], uuid.node[1], uuid.node[2], + uuid.node[3], uuid.node[4], uuid.node[5]); +} + +UUID_STATIC +void uuid_unparse_lower(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_lower); +} + +UUID_STATIC +void uuid_unparse_upper(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, fmt_upper); +} + +UUID_STATIC +void uuid_unparse(const uuid_t uu, char *out) +{ + uuid_unparse_x(uu, out, FMT_DEFAULT); +} diff --git a/libuuid/uuid.h b/libuuid/uuid.h new file mode 100644 index 0000000..54a9e96 --- /dev/null +++ b/libuuid/uuid.h @@ -0,0 +1,108 @@ + +/* + * Public include file for the UUID library + * + * Copyright (C) 1996, 1997, 1998 Theodore Ts'o. 
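[Editorial note. uuid_unparse_x() above writes through sprintf() with a fixed-width format, so every unparse variant emits exactly 36 characters plus the terminating NUL; callers must always supply at least 37 bytes. A defensive wrapper one might layer on top (uuid_to_buf and UUID_STR_LEN are hypothetical, not part of this tree):

#include <string.h>
#include "uuid.h"

#define UUID_STR_LEN 37  /* 8-4-4-4-12 digits, 4 hyphens, 1 NUL */

/* Copy the unparsed form into a caller-sized buffer, truncating safely. */
static void uuid_to_buf(const uuid_t uu, char *dst, size_t dstlen)
{
    char tmp[UUID_STR_LEN];

    uuid_unparse_lower(uu, tmp);
    strncpy(dst, tmp, dstlen);
    if (dstlen > 0)
        dst[dstlen - 1] = '\0';
}
]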
+ * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#ifndef _UUID_UUID_H +#define _UUID_UUID_H + +#include +#include +#include + +typedef unsigned char uuid_t[16]; + +/* UUID Variant definitions */ +#define UUID_VARIANT_NCS 0 +#define UUID_VARIANT_DCE 1 +#define UUID_VARIANT_MICROSOFT 2 +#define UUID_VARIANT_OTHER 3 + +/* UUID Type definitions */ +#define UUID_TYPE_DCE_TIME 1 +#define UUID_TYPE_DCE_RANDOM 4 + +/* Allow UUID constants to be defined */ +#ifdef __GNUC__ +#define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const uuid_t name __attribute__ ((unused)) = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#else +#define UUID_DEFINE(name,u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15) \ + static const uuid_t name = {u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15} +#endif + +#ifdef PSM_UUID +#define UUID_STATIC static +#else +#define UUID_STATIC +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* gen_uuid.c */ +UUID_STATIC void uuid_generate(uuid_t out); +UUID_STATIC void uuid_generate_random(uuid_t out); +UUID_STATIC void uuid_generate_time(uuid_t out); + +/* clear.c */ +UUID_STATIC void uuid_clear(uuid_t uu); + +/* compare.c */ +UUID_STATIC int uuid_compare(const uuid_t uu1, const uuid_t uu2); + +/* copy.c */ +UUID_STATIC void uuid_copy(uuid_t dst, const uuid_t src); + +/* isnull.c */ +UUID_STATIC int uuid_is_null(const uuid_t uu); + +/* parse.c */ +UUID_STATIC int uuid_parse(const char *in, uuid_t uu); + +/* unparse.c */ +UUID_STATIC void uuid_unparse(const uuid_t uu, char *out); +UUID_STATIC void uuid_unparse_lower(const uuid_t uu, char *out); +UUID_STATIC void uuid_unparse_upper(const uuid_t uu, char *out); + +/* uuid_time.c */ +UUID_STATIC time_t uuid_time(const uuid_t uu, struct timeval *ret_tv); +UUID_STATIC int uuid_type(const uuid_t uu); +UUID_STATIC int uuid_variant(const uuid_t uu); + +#ifdef __cplusplus +} +#endif + +#endif /* _UUID_UUID_H */ diff --git a/libuuid/uuidP.h b/libuuid/uuidP.h new file mode 100644 index 0000000..fa7e91b --- /dev/null +++ b/libuuid/uuidP.h 
@@ -0,0 +1,77 @@ +/* + * uuid.h -- private header file for uuids + * + * Copyright (C) 1996, 1997 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#ifndef _UUID_UUIDP_H +#define _UUID_UUIDP_H + +#ifndef UUID_STATIC +# ifdef PSM_UUID +# define UUID_STATIC static +# else +# define UUID_STATIC +# endif +#endif + +#ifdef HAVE_INTTYPES_H +#include +#else +#include +#endif +#include + +#include "uuid.h" + +/* + * Offset between 15-Oct-1582 and 1-Jan-70 + */ +#define TIME_OFFSET_HIGH 0x01B21DD2 +#define TIME_OFFSET_LOW 0x13814000 + +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint16_t clock_seq; + uint8_t node[6]; +}; + +/* + * prototypes + */ +UUID_STATIC +void uuid_pack(const struct uuid *uu, uuid_t ptr); +UUID_STATIC +void uuid_unpack(const uuid_t in, struct uuid *uu); + +#endif /* _UUID_UUIDP_H */ diff --git a/libuuid/uuid_time.c b/libuuid/uuid_time.c new file mode 100644 index 0000000..d5f992b --- /dev/null +++ b/libuuid/uuid_time.c @@ -0,0 +1,161 @@ +/* + * uuid_time.c --- Interpret the time field from a uuid. This program + * violates the UUID abstraction barrier by reaching into the guts + * of a UUID and interpreting it. + * + * Copyright (C) 1998, 1999 Theodore Ts'o. + * + * %Begin-Header% + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
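[Editorial note. TIME_OFFSET_HIGH and TIME_OFFSET_LOW in uuidP.h above encode the count of 100-nanosecond intervals between the UUID epoch (15 Oct 1582, the Gregorian calendar reform) and the Unix epoch (1 Jan 1970): (0x01B21DD2 << 32) + 0x13814000 = 122192928000000000, i.e. 12219292800 seconds, exactly 141427 days. The uuid_time() implementation that follows subtracts this constant to recover a Unix timestamp. The arithmetic can be checked in isolation:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    /* 100 ns intervals between 1582-10-15 and 1970-01-01 */
    unsigned long long off = ((unsigned long long)0x01B21DD2 << 32)
                           | 0x13814000ULL;

    assert(off == 122192928000000000ULL);
    printf("%llu days\n", off / 10000000ULL / 86400ULL);  /* prints 141427 */
    return 0;
}
]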
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * %End-Header% + */ + +#include +#include +#include +#include +#include +#include + +#include "uuidP.h" + +time_t uuid_time(const uuid_t uu, struct timeval *ret_tv) +{ + struct uuid uuid; + uint32_t high; + struct timeval tv; + unsigned long long clock_reg; + + uuid_unpack(uu, &uuid); + + high = uuid.time_mid | ((uuid.time_hi_and_version & 0xFFF) << 16); + clock_reg = uuid.time_low | ((unsigned long long) high << 32); + + clock_reg -= (((unsigned long long) 0x01B21DD2) << 32) + 0x13814000; + tv.tv_sec = clock_reg / 10000000; + tv.tv_usec = (clock_reg % 10000000) / 10; + + if (ret_tv) + *ret_tv = tv; + + return tv.tv_sec; +} + +int uuid_type(const uuid_t uu) +{ + struct uuid uuid; + + uuid_unpack(uu, &uuid); + return ((uuid.time_hi_and_version >> 12) & 0xF); +} + +int uuid_variant(const uuid_t uu) +{ + struct uuid uuid; + int var; + + uuid_unpack(uu, &uuid); + var = uuid.clock_seq; + + if ((var & 0x8000) == 0) + return UUID_VARIANT_NCS; + if ((var & 0x4000) == 0) + return UUID_VARIANT_DCE; + if ((var & 0x2000) == 0) + return UUID_VARIANT_MICROSOFT; + return UUID_VARIANT_OTHER; +} + +#ifdef DEBUG +static const char *variant_string(int variant) +{ + switch (variant) { + case UUID_VARIANT_NCS: + return "NCS"; + case UUID_VARIANT_DCE: + return "DCE"; + case UUID_VARIANT_MICROSOFT: + return "Microsoft"; + default: + return "Other"; + } +} + + +int +main(int argc, char **argv) +{ + uuid_t buf; + time_t time_reg; + struct timeval tv; + int type, variant; + + if (argc != 2) { + fprintf(stderr, "Usage: %s uuid\n", argv[0]); + exit(1); + } + if (uuid_parse(argv[1], buf)) { + fprintf(stderr, "Invalid UUID: %s\n", argv[1]); + exit(1); + } + variant = uuid_variant(buf); + type = uuid_type(buf); + time_reg = uuid_time(buf, &tv); + + printf("UUID variant is %d (%s)\n", variant, variant_string(variant)); + if (variant != UUID_VARIANT_DCE) { + printf("Warning: This program only knows how to interpret " + "DCE UUIDs.\n\tThe rest of the output is likely " + "to be incorrect!!\n"); + } + printf("UUID type is %d", type); + switch (type) { + case 1: + printf(" (time based)\n"); + break; + case 2: + printf(" (DCE)\n"); + break; + case 3: + printf(" (name-based)\n"); + break; + case 4: + printf(" (random)\n"); + break; + default: + printf("\n"); + } + if (type != 1) { + printf("Warning: not a time-based UUID, so UUID time " + "decoding will likely not work!\n"); + } + printf("UUID time is: (%ld, %ld): %s\n", tv.tv_sec, tv.tv_usec, + ctime(&time_reg)); + + return 0; +} +#endif diff --git a/mic-psm-card-devel.srclist.in b/mic-psm-card-devel.srclist.in new file mode 100644 index 0000000..7d6fd6c --- /dev/null +++ b/mic-psm-card-devel.srclist.in @@ -0,0 +1,2 @@ +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/mic-psm-card.srclist.in 
b/mic-psm-card.srclist.in new file mode 100644 index 0000000..beea15e --- /dev/null +++ b/mic-psm-card.srclist.in @@ -0,0 +1,6 @@ +/etc/sysconfig/mic/conf.d/psm.conf +%PREFIX%/psm.filelist +%LIBPREFIX%/libinfinipath.so.%IPATHMAJOR% +%LIBPREFIX%/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% +%LIBPREFIX%/libpsm_infinipath.so.%PSMMAJOR% +%LIBPREFIX%/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% diff --git a/mic-psm-devel.srclist.in b/mic-psm-devel.srclist.in new file mode 100644 index 0000000..a1dc132 --- /dev/null +++ b/mic-psm-devel.srclist.in @@ -0,0 +1,4 @@ +/usr/include/psm.h +/usr/include/psm_mq.h +%LIBPREFIX%/libinfinipath.so +%LIBPREFIX%/libpsm_infinipath.so diff --git a/mic-psm.srclist.in b/mic-psm.srclist.in new file mode 100644 index 0000000..d80350d --- /dev/null +++ b/mic-psm.srclist.in @@ -0,0 +1,5 @@ +%SBINPREFIX%/psmd +%LIBPREFIX%/libinfinipath.so.4 +%LIBPREFIX%/libinfinipath.so.4.0 +%LIBPREFIX%/libpsm_infinipath.so.1 +%LIBPREFIX%/libpsm_infinipath.so.1.15 diff --git a/mic/etc/sysconfig/mic/conf.d/psm.conf b/mic/etc/sysconfig/mic/conf.d/psm.conf new file mode 100644 index 0000000..deba040 --- /dev/null +++ b/mic/etc/sysconfig/mic/conf.d/psm.conf @@ -0,0 +1,2 @@ +# PSM download files +Overlay Filelist /opt/intel/mic/psm /opt/intel/mic/psm/psm.filelist on diff --git a/mic/opt/intel/mic/psm/psm.filelist.in b/mic/opt/intel/mic/psm/psm.filelist.in new file mode 100644 index 0000000..38c6add --- /dev/null +++ b/mic/opt/intel/mic/psm/psm.filelist.in @@ -0,0 +1,7 @@ +dir /lib64 755 0 0 +file /lib64/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% lib64/libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 755 0 0 +slink /lib64/libinfinipath.so.%IPATHMAJOR% libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 777 0 0 +slink /lib64/libinfinipath.so libinfinipath.so.%IPATHMAJOR%.%IPATHMINOR% 777 0 0 +file /lib64/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% lib64/libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 755 0 0 +slink /lib64/libpsm_infinipath.so.%PSMMAJOR% libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 777 0 0 +slink /lib64/libpsm_infinipath.so libpsm_infinipath.so.%PSMMAJOR%.%PSMMINOR% 777 0 0 diff --git a/mpspawn/mpspawn_stats.h b/mpspawn/mpspawn_stats.h new file mode 100644 index 0000000..3cc8bc7 --- /dev/null +++ b/mpspawn/mpspawn_stats.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MPSPAWN_STATS_H +#define _MPSPAWN_STATS_H + +#include + +#define MPSPAWN_STATS_VERSION 1 + +typedef enum +{ + MPSPAWN_STATS_TYPE_DOUBLE = 0x1, +#define MPSPAWN_STATS_TYPE_DOUBLE 0x1 + MPSPAWN_STATS_TYPE_HEADER = 0x2, +#define MPSPAWN_STATS_TYPE_HEADER 0x2 + MPSPAWN_STATS_REDUCTION_MAX = 0x1000, +#define MPSPAWN_STATS_REDUCTION_MAX 0x1000 + MPSPAWN_STATS_REDUCTION_MIN = 0x2000, +#define MPSPAWN_STATS_REDUCTION_MIN 0x2000 + MPSPAWN_STATS_REDUCTION_MEDIAN = 0x4000, +#define MPSPAWN_STATS_REDUCTION_MEDIAN 0x4000 + MPSPAWN_STATS_SKIP_IF_ZERO = 0x8000 +#define MPSPAWN_STATS_SKIP_IF_ZERO 0x8000 +} +mpspawn_stats_flags; + +#define MPSPAWN_STATS_REDUCTION_ALL (MPSPAWN_STATS_REDUCTION_MAX | \ + MPSPAWN_STATS_REDUCTION_MIN | MPSPAWN_STATS_REDUCTION_MEDIAN) + +#define MPSPAWN_STATS_DOUBLE_TO_U64(arg) (*((uint64_t *) &(arg))) +#define MPSPAWN_NAN_U64 ((uint64_t) ~0ULL) +#define MPSPAWN_ISNAN_U64(x) (((uint64_t)(x)) == MPSPAWN_NAN_U64) + +#define MPSPAWN_NAN ((uint64_t) ~0ULL) //NAN) +#define MPSPAWN_ISNAN(x) (isnan(x)) + +struct mpspawn_stats_add_args; /* client->mpspawn stats registration */ +struct mpspawn_stats_req_args; /* mpspawn->client fn callback stats request */ +struct mpspawn_stats_init_args; /* mpspawn->client "downcall" to register */ + +/* Clients implement this function to fill in mpspawn request for stats */ +typedef void (*mpspawn_stats_req_fn) (struct mpspawn_stats_req_args *); +/* mpspawn implements this function to allow clients to register new stats */ +typedef void (*mpspawn_stats_add_fn) (struct mpspawn_stats_add_args *); +/* mpspawn implements this function to map rank indexes into epaddr structs */ +struct psm_epaddr; +typedef struct psm_epaddr * (*mpspawn_map_epaddr_fn) (int rank); + +typedef struct mpspawn_stats_req_args { + int version; + int num; + uint64_t *stats; + uint16_t *flags; + void *context; +} +mpspawn_stats_req_args_t; + +typedef +struct mpspawn_stats_add_args { + int version; + int num; + char *header; + char **desc; + uint16_t *flags; + mpspawn_stats_req_fn req_fn; + void *context; +} mpspawn_stats_add_args_t; + +typedef +struct mpspawn_stats_init_args { + int version; + psm_mq_t mq; /* initialized mq endpoint */ + int num_epaddr; /* number of endpoints in job */ + mpspawn_stats_add_fn add_fn; /* function for client to add stats */ + mpspawn_map_epaddr_fn epaddr_map_fn; + const char *stats_types; /* stats type string mpirun -M */ +} +mpspawn_stats_init_args_t; + +/* Function in psm exposed to register stats */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args); + +#endif diff --git a/psm.c b/psm.c new file mode 100644 index 0000000..f8fa3d8 --- /dev/null +++ b/psm.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
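[Editorial example. The mpspawn_stats.h contract above is a two-way handshake: the client fills an mpspawn_stats_add_args_t describing a group of counters and hands it to mpspawn's add_fn, and mpspawn later invokes the registered req_fn with a stats array to harvest current values. A minimal sketch of a client registration, assuming the declarations from mpspawn/mpspawn_stats.h are in scope; my_bytes_sent, my_req_fn and register_my_stats are hypothetical names:

#include <string.h>
#include "mpspawn_stats.h"   /* mpspawn/mpspawn_stats.h from this tree */

static uint64_t my_bytes_sent;   /* hypothetical counter maintained elsewhere */

static void my_req_fn(struct mpspawn_stats_req_args *req)
{
    /* mpspawn asks for 'num' stats; report them in registration order */
    req->stats[0] = my_bytes_sent;
}

void register_my_stats(mpspawn_stats_add_fn add_fn)
{
    static char *descs[] = { "Bytes sent" };
    static uint16_t flags[] = { MPSPAWN_STATS_REDUCTION_ALL };
    mpspawn_stats_add_args_t args;

    memset(&args, 0, sizeof(args));
    args.version = MPSPAWN_STATS_VERSION;
    args.num = 1;
    args.header = "My transport statistics";
    args.desc = descs;
    args.flags = flags;
    args.req_fn = my_req_fn;
    add_fn(&args);
}
]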
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "psm_user.h" + +static int psmi_verno_major = PSM_VERNO_MAJOR; +static int psmi_verno_minor = PSM_VERNO_MINOR; +static int psmi_verno = PSMI_VERNO_MAKE(PSM_VERNO_MAJOR, PSM_VERNO_MINOR); +static int psmi_verno_client_val = 0; + +#define PSMI_NOT_INITIALIZED 0 +#define PSMI_INITIALIZED 1 +#define PSMI_FINALIZED -1 /* Prevent the user from calling psm_init + * once psm_finalize has been called. */ +static int psmi_isinit = PSMI_NOT_INITIALIZED; + +int +psmi_verno_client() +{ + return psmi_verno_client_val; +} + +#ifdef PSMI_PLOCK_IS_SPINLOCK +psmi_spinlock_t psmi_progress_lock; +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK) +pthread_mutex_t psmi_progress_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK_DEBUG) +pthread_mutex_t psmi_progress_lock = PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP; +pthread_t psmi_progress_lock_owner = PSMI_PLOCK_NO_OWNER; +#endif + +/* This function is used to determine whether the current library build can + * successfully communicate with another library that claims to be version + * 'verno'. + * + * PSM 1.x is always ABI compatible, but this checks to see if two different + * versions of the library can coexist. + */ +int +psmi_verno_isinteroperable(uint16_t verno) +{ + /* + * Up and including 1.03, all peers require to be 1.03 (or later). + */ + if (PSMI_VERNO_GET_MAJOR(verno) != PSM_VERNO_MAJOR) + return 0; + + /* This -1 tries to make sure that we always update this function for each + * new release of the library. There's an internal check to make sure that + * verno_iscompatible is always updated. Each new version should have an + * entry in the switch statement below. */ + int iscompat = -1; + + switch (psmi_verno) { + case 0x0110: + case 0x010f: + /* Multi-rail is supported in this version, since the packet header + * sequence number is shrunk from 24bits to 16bits, old version + * can not process such packet. The freed 8bits and another 8bits + * are used to form the message sequence number to keep message order + * in multi-rail case. + */ + iscompat = (verno >= 0x010f); + break; + case 0x010e: + /* Allow specification of send buffer descriptors in addition to send + * network buffers for IPS. 
Having a large number of send descriptors + * can be beneficial on large scale clusters with bursty network IO. + */ + case 0x010d: + /* Wire protocol is the same as QOFED 1.4.2. Added support to specify + * path record resolution mechanism as well as service ID to use + * for endpoint. Required to implement support for alternate + * network topologies. + */ + case 0x010c: + /* Added support for generic psm_set|getopt methods. Also exposed + * "some" internal implementation details via components that these + * methods operate on. Wire protocol remains the same but we need + * to bump the version number as the API changes so ULPs can detect + * if these methods are available. + */ + case 0x010b: + /* Removed VL specification per endpoint however it is wire level + * compatible with the 0x010a version. Use SL2VL mapping table coupled + * with the SL for endpoint to select VL. + */ + case 0x010a: + /* 0x010a updates wire protocol with support for AM requests with + * no replies (OPCODE_AM_REQUEST_NOREPLY). + */ + iscompat = (verno >= 0x010a); + break; + case 0x0109: + /* 0x0109 updates the wire protocol to pad writes up to cache line size + * to mitigate overhead of partial cache line writes on some processor + * architectures. Only MQ sends up to 2K bytes are padded. + */ + iscompat = (verno >= 0x0109); + break; + case 0x0108: + /* 0x0108 moved subcontext bits out of KPFlags and into ips header. + * This is incompatible with previous versions. */ + iscompat = (verno >= 0x0108); + break; + case 0x0107: + case 0x0106: + case 0x0105: + /* 0x0105 coincides with release 2.1 which introduced a new + * expected send protocol. Anything before that is incompatible */ + iscompat = (verno >= 0x0105); + break; + case 0x0104: + case 0x0103: + /* Nothing below 1.03 is supported by 1.03 */ + iscompat = (verno >= 0x0103); + break; + default: + iscompat = -1; + } + return iscompat; +} + +int +psmi_isinitialized() +{ + return (psmi_isinit == PSMI_INITIALIZED); +} + +extern char psmi_infinipath_revision[]; + +psm_error_t +__psm_init(int *major, int *minor) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val env_tmask; + + if (psmi_isinit == PSMI_INITIALIZED) + goto update; + + if (psmi_isinit == PSMI_FINALIZED) { + err = PSM_IS_FINALIZED; + goto fail; + } + + if (major == NULL || minor == NULL) { + err = PSM_PARAM_ERR; + goto fail; + } + +#ifdef PSM_DEBUG + if (!getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *DEBUG* build.\n"); +#endif + +#ifdef PSM_PROFILE + if (!getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! You are running an internal-only PSM *PROFILE* build.\n"); +#endif + + /* Make sure we complain if fault injection is enabled */ + if (getenv("PSM_FI") && !getenv("PSM_NO_WARN")) + fprintf(stderr, "!!! WARNING !!! 
You are running with fault injection enabled!\n"); + + /* Make sure, as an internal check, that this version knows how to detect + * compatibility with other library versions it may communicate with */ + if (psmi_verno_isinteroperable(psmi_verno) != 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "psmi_verno_isinteroperable() not updated for current version!"); + goto fail; + } + + /* The only way to not support a client is if the major number doesn't + * match */ + if (*major != PSM_VERNO_MAJOR) { + err = psmi_handle_error(NULL, PSM_INIT_BAD_API_VERSION, + "This library does not implement version %d.%d", + *major, *minor); + goto fail; + } + + /* Make sure we don't keep track of a client that claims a higher version + * number than we do */ + psmi_verno_client_val = min(PSMI_VERNO_MAKE(*major, *minor), psmi_verno); + + psmi_isinit = PSMI_INITIALIZED; + /* infinipath_debug lives in libinfinipath.so */ + psmi_getenv("PSM_TRACEMASK", + "Mask flags for tracing", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_ULONG_FLAGS, + (union psmi_envvar_val) infinipath_debug, + &env_tmask); + infinipath_debug = (long) env_tmask.e_ulong; + + /* The "real thing" is done in ipath_proto.c as a constructor function, but + * we getenv it here to report what we're doing with the setting */ + { + extern int __ipath_malloc_no_mmap; + union psmi_envvar_val env_mmap; + char *env = getenv("IPATH_DISABLE_MMAP_MALLOC"); + int broken = (env && *env && !__ipath_malloc_no_mmap); + psmi_getenv("IPATH_DISABLE_MMAP_MALLOC", + broken ? "Skipping mmap disable for malloc()" : + "Disable mmap for malloc()", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val) 0, + &env_mmap); + if (broken) + _IPATH_ERROR("Couldn't successfully disable mmap in mallocs " + "with mallopt()\n"); + } + + if (getenv("PSM_IDENTIFY")) { + Dl_info info_psm, info_ipath; + _IPATH_INFO("%s from %s:%s\n", psmi_infinipath_revision, + dladdr(psm_init, &info_psm) ? info_psm.dli_fname : + "libpsm not available", + dladdr(ipath_userinit, &info_ipath) ? 
info_ipath.dli_fname : + "libinfinipath not available"); + } + +#ifdef PSMI_PLOCK_IS_SPINLOCK + psmi_spin_init(&psmi_progress_lock); +#endif + + if (getenv("PSM_DIAGS")) { + _IPATH_INFO("Running diags...\n"); + psmi_diags(); + } + + psmi_faultinj_init(); + + psmi_epid_init(); + +update: + *major = (int) psmi_verno_major; + *minor = (int) psmi_verno_minor; +fail: + return err; +} +PSMI_API_DECL(psm_init) + +psm_error_t +__psm_finalize(void) +{ + struct psmi_eptab_iterator itor; + char *hostname; + psm_ep_t ep; + extern psm_ep_t psmi_opened_endpoint; /* in psm_endpoint.c */ + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + ep = psmi_opened_endpoint; + while (ep != NULL) { + psmi_opened_endpoint = ep->user_ep_next; + psm_ep_close(ep, PSM_EP_CLOSE_GRACEFUL, + 2*PSMI_MIN_EP_CLOSE_TIMEOUT); + ep = psmi_opened_endpoint; + } + + psmi_epid_fini(); + + psmi_faultinj_fini(); + + /* De-allocate any memory allocated to store hostnames */ + psmi_epid_itor_init(&itor, PSMI_EP_HOSTNAME); + while ((hostname = psmi_epid_itor_next(&itor))) + psmi_free(hostname); + psmi_epid_itor_fini(&itor); + + psmi_isinit = PSMI_FINALIZED; + return PSM_OK; +} +PSMI_API_DECL(psm_finalize) + +/* + * Function exposed in >= 1.05 + */ +psm_error_t +__psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames) +{ + int i; + psm_error_t err = PSM_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + PSMI_PLOCK(); + + if (nids == NULL || hostnames == NULL) { + err = PSM_PARAM_ERR; + goto fail; + } + + for (i = 0; i < num; i++) { + if ((err = psmi_epid_set_hostname(nids[i], hostnames[i], 1))) + break; + } + +fail: + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_map_nid_hostname) + +void +__psm_epaddr_setlabel(psm_epaddr_t epaddr, char const *epaddr_label) +{ + return; /* ignore this function */ +} +PSMI_API_DECL(psm_epaddr_setlabel) + +void +__psm_epaddr_setctxt(psm_epaddr_t epaddr, void *ctxt) +{ + + /* Eventually deprecate this API to use set/get opt as this is unsafe. */ + psm_setopt(PSM_COMPONENT_CORE, (const void*) epaddr, + PSM_CORE_OPT_EP_CTXT, (const void*) ctxt, sizeof(void*)); + +} +PSMI_API_DECL(psm_epaddr_setctxt) + +void * +__psm_epaddr_getctxt(psm_epaddr_t epaddr) +{ + psm_error_t err; + uint64_t optlen = sizeof(void*); + void *result = NULL; + + /* Eventually deprecate this API to use set/get opt as this is unsafe. 
*/ + err = psm_getopt(PSM_COMPONENT_CORE, (const void*) epaddr, + PSM_CORE_OPT_EP_CTXT, (void*) &result, &optlen); + + if (err == PSM_OK) + return result; + else + return NULL; +} +PSMI_API_DECL(psm_epaddr_getctxt) + +psm_error_t +__psm_setopt(psm_component_t component, const void *component_obj, + int optname, const void *optval, uint64_t optlen) +{ + switch(component) { + case PSM_COMPONENT_CORE: + return psmi_core_setopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + return psm_mq_setopt((psm_mq_t) component_obj, optname, optval); + break; + case PSM_COMPONENT_AM: + /* Hand off to active messages */ + return psmi_am_setopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + return psmi_ptl_ips.setopt(component_obj, optname, optval, optlen); + break; + } + + /* Unrecognized/unknown component */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown component %u", component); + +} + +PSMI_API_DECL(psm_setopt); + +psm_error_t +__psm_getopt(psm_component_t component, const void *component_obj, + int optname, void *optval, uint64_t *optlen) +{ + switch(component) { + case PSM_COMPONENT_CORE: + return psmi_core_getopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_MQ: + /* Use the deprecated MQ set/get opt for now which does not use optlen */ + return psm_mq_getopt((psm_mq_t) component_obj, optname, optval); + break; + case PSM_COMPONENT_AM: + /* Hand off to active messages */ + return psmi_am_getopt(component_obj, optname, optval, optlen); + break; + case PSM_COMPONENT_IB: + /* Hand off to IPS ptl to set option */ + return psmi_ptl_ips.getopt(component_obj, optname, optval, optlen); + break; + } + + /* Unrecognized/unknown component */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown component %u", component); +} +PSMI_API_DECL(psm_getopt); + +psm_error_t __recvpath +__psmi_poll_noop(ptl_t *ptl, int replyonly) +{ + return PSM_OK_NO_PROGRESS; +} +PSMI_API_DECL(psmi_poll_noop) + +psm_error_t __recvpath +__psm_poll(psm_ep_t ep) +{ + psm_error_t err1 = PSM_OK, err2 = PSM_OK; + psm_ep_t tmp; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + tmp = ep; + do { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_PUNLOCK(); + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM_OK_NO_PROGRESS) { /* some error unrelated to polling */ + PSMI_PUNLOCK(); + return err2; + } + ep = ep->mctxt_next; + } while (ep != tmp); + + /* This is valid because.. 
+ * PSM_OK & PSM_OK_NO_PROGRESS => PSM_OK + * PSM_OK & PSM_OK => PSM_OK + * PSM_OK_NO_PROGRESS & PSM_OK => PSM_OK + * PSM_OK_NO_PROGRESS & PSM_OK_NO_PROGRESS => PSM_OK_NO_PROGRESS */ + PSMI_PUNLOCK(); + return (err1 & err2); +} +PSMI_API_DECL(psm_poll) + +psm_error_t __recvpath +__psmi_poll_internal(psm_ep_t ep, int poll_amsh) +{ + psm_error_t err1 = PSM_OK_NO_PROGRESS; + psm_error_t err2; + psm_ep_t tmp; + + PSMI_PLOCK_ASSERT(); + + tmp = ep; + do { + if (poll_amsh) { + err1 = ep->ptl_amsh.ep_poll(ep->ptl_amsh.ptl, 0); /* poll reqs & reps */ + if (err1 > PSM_OK_NO_PROGRESS) /* some error unrelated to polling */ + return err1; + } + + err2 = ep->ptl_ips.ep_poll(ep->ptl_ips.ptl, 0); /* get into ips_do_work */ + if (err2 > PSM_OK_NO_PROGRESS) /* some error unrelated to polling */ + return err2; + + ep = ep->mctxt_next; + } while (ep != tmp); + + return (err1 & err2); +} +PSMI_API_DECL(psmi_poll_internal) + +#ifdef PSM_PROFILE +/* These functions each have weak symbols */ +void +psmi_profile_block() +{ + ; // empty for profiler +} + +void +psmi_profile_unblock() +{ + ; // empty for profiler +} + +void +psmi_profile_reblock(int did_no_progress) +{ + ; // empty for profiler +} +#endif + diff --git a/psm.h b/psm.h new file mode 100644 index 0000000..ca1200d --- /dev/null +++ b/psm.h @@ -0,0 +1,1045 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PSM_H +#define PSM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + + + + + +/* Local endpoint handle (opaque) + * + * + * Handle returned to the user when a new local endpoint is created. The + * handle is a local handle to be used in all communication functions and is + * not intended to globally identify the opened endpoint in any way. + * + * All open endpoint handles can be globally identified using the endpoint id + * integral type (psm_epid_t) and all communication must use an endpoint + * address (psm_epaddr_t) that can be obtained by connecting a local + * endpoint to one or more endpoint identifiers. 
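[Editorial note. The bitwise AND returned at the end of __psm_poll() and __psmi_poll_internal() in psm.c above works because PSM_OK is 0 and PSM_OK_NO_PROGRESS is 1 (see the enum in psm.h): the combined result stays "no progress" only when both the shared-memory and IPS transports reported no progress, and any error value above PSM_OK_NO_PROGRESS has already caused an early return. The truth table can be checked directly:

#include <assert.h>

enum { OK = 0, OK_NO_PROGRESS = 1 };  /* values from enum psm_error */

int main(void)
{
    assert((OK & OK) == OK);
    assert((OK & OK_NO_PROGRESS) == OK);
    assert((OK_NO_PROGRESS & OK) == OK);
    assert((OK_NO_PROGRESS & OK_NO_PROGRESS) == OK_NO_PROGRESS);
    return 0;
}
]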
+ * + * @remark The local endpoint handle is opaque to the user. */ +typedef struct psm_ep *psm_ep_t; + +/* MQ handle (opaque) + * + * + * Handle returned to the user when a new Matched queue is created (@ref + * psm_mq_init). */ +typedef struct psm_mq *psm_mq_t; + +#define PSM_VERNO 0x0110 +#define PSM_VERNO_MAJOR 0x01 +#define PSM_VERNO_MINOR 0x10 + +enum psm_error { + + PSM_OK = 0, + + PSM_OK_NO_PROGRESS = 1, + + PSM_PARAM_ERR = 3, + + PSM_NO_MEMORY = 4, + + PSM_INIT_NOT_INIT = 5, + + PSM_INIT_BAD_API_VERSION = 6, + + PSM_NO_AFFINITY = 7, + + PSM_INTERNAL_ERR = 8, + + PSM_SHMEM_SEGMENT_ERR = 9, + + PSM_OPT_READONLY = 10, + + PSM_TIMEOUT = 11, + + PSM_TOO_MANY_ENDPOINTS = 12, + + + PSM_IS_FINALIZED = 13, + + + PSM_EP_WAS_CLOSED = 20, + + PSM_EP_NO_DEVICE = 21, + + PSM_EP_UNIT_NOT_FOUND = 22, + + PSM_EP_DEVICE_FAILURE = 23, + + PSM_EP_CLOSE_TIMEOUT = 24, + + PSM_EP_NO_PORTS_AVAIL = 25, + + PSM_EP_NO_NETWORK = 26, + + PSM_EP_INVALID_UUID_KEY = 27, + + PSM_EP_NO_RESOURCES = 28, + + + PSM_EPID_UNKNOWN = 40, + + PSM_EPID_UNREACHABLE = 41, + + PSM_EPID_INVALID_NODE = 43, + + PSM_EPID_INVALID_MTU = 44, + + PSM_EPID_INVALID_UUID_KEY = 45, + + PSM_EPID_INVALID_VERSION = 46, + + PSM_EPID_INVALID_CONNECT = 47, + + PSM_EPID_ALREADY_CONNECTED = 48, + + PSM_EPID_NETWORK_ERROR = 49, + + PSM_EPID_INVALID_PKEY = 50, + + PSM_EPID_PATH_RESOLUTION = 51, + + + PSM_MQ_NO_COMPLETIONS = 60, + + PSM_MQ_TRUNCATION = 61, + + + PSM_AM_INVALID_REPLY = 70, + + PSM_ERROR_LAST = 80 +}; + +/* Backwards header compatibility for a confusing error return name */ +#define PSM_MQ_INCOMPLETE PSM_MQ_NO_COMPLETIONS + +typedef enum psm_error psm_error_t; + +enum psm_component { + + PSM_COMPONENT_CORE = 0, + + PSM_COMPONENT_MQ = 1, + + PSM_COMPONENT_AM = 2, + + PSM_COMPONENT_IB = 3 +}; + +typedef enum psm_component psm_component_t; + +enum psm_path_res { + + PSM_PATH_RES_NONE = 0, + + PSM_PATH_RES_OPP = 1, + + PSM_PATH_RES_UMAD = 2 +}; + +typedef enum psm_path_res psm_path_res_t; + +/* Initialize PSM interface + * + * Call to initialize the PSM library for a desired API revision number. + * + * [in,out] api_verno_major As input, a pointer to an integer that holds + * PSM_VERNO_MAJOR. As output, the pointer + * is updated with the major revision number of + * the loaded library. + * [in,out] api_verno_minor As input, a pointer to an integer that holds + * PSM_VERNO_MINOR. As output, the pointer + * is updated with the minor revision number of + * the loaded library. + * + * [pre] The user has not called any other PSM library call except @ref + * psm_error_register_handler to register a global error handler. + * + * [warning] PSM initialization is a precondition for all functions used in the + * PSM library. + * + * [returns] PSM_OK The PSM interface could be opened and the desired API + * revision can be provided. + * [returns] PSM_INIT_BAD_API_VERSION The PSM library cannot provide compatibility for + * the desired API version. + * + * @verbatim + * // In this example, we want to handle our own errors before doing init, + * // since we don't want a fatal error if InfiniPath is not found. 
+ * // Note that psm_error_register_handler (and psm_uuid_generate)
+ * // are the only functions that can be called before psm_init
+ *
+ * int try_to_initialize_psm() {
+ * int verno_major = PSM_VERNO_MAJOR;
+ * int verno_minor = PSM_VERNO_MINOR;
+ *
+ * int err = psm_error_register_handler(NULL, // Global handler
+ * PSM_ERRHANDLER_NO_HANDLER); // return errors
+ * if (err) {
+ * fprintf(stderr, "Couldn't register global handler: %s\n",
+ * psm_error_get_string(err));
+ * return -1;
+ * }
+ *
+ * err = psm_init(&verno_major, &verno_minor);
+ * if (err || verno_major > PSM_VERNO_MAJOR) {
+ * if (err)
+ * fprintf(stderr, "PSM initialization failure: %s\n",
+ * psm_error_get_string(err));
+ * else
+ * fprintf(stderr, "PSM loaded an unexpected/unsupported "
+ * "version (%d.%d)\n", verno_major, verno_minor);
+ * return -1;
+ * }
+ *
+ * // We were able to initialize PSM but will defer all further error
+ * // handling since most of the errors beyond this point will be fatal.
+ * err = psm_error_register_handler(NULL, // Global handler
+ * PSM_ERRHANDLER_PSM_HANDLER); // use the PSM-internal handler
+ * if (err) {
+ * fprintf(stderr, "Couldn't register global errhandler: %s\n",
+ * psm_error_get_string(err));
+ * return -1;
+ * }
+ * return 1;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_init(int *api_verno_major, int *api_verno_minor);
+
+/* Finalize PSM interface
+ *
+ * Single call to finalize PSM and close all unclosed endpoints.
+ *
+ * [post] The user guarantees not to make any further PSM calls, including @ref
+ * psm_init.
+ *
+ * [returns] PSM_OK Always returns PSM_OK */
+psm_error_t
+psm_finalize(void);
+
+/* Error handling opaque token
+ *
+ * A token is required for users that register their own handlers and wish to
+ * defer further error handling to PSM. */
+typedef struct psm_error_token *psm_error_token_t;
+
+/* Error handling function
+ *
+ * Users can handle errors explicitly instead of relying on PSM's own error
+ * handler. There is one global error handler and error handlers that can be
+ * individually set for each opened endpoint. By default, endpoints will
+ * inherit the global handler registered at the time of open.
+ *
+ * [in] ep Handle associated to the endpoint over which the error occurred
+ * or NULL if the error is being handled by the global error
+ * handler.
+ * [in] error PSM error identifier
+ * [in] error_string A descriptive error string of maximum length @ref
+ * PSM_ERRSTRING_MAXLEN.
+ * [in] token Opaque PSM token associated with the particular event that
+ * generated the error. The token can be used to extract the
+ * error string and can be passed to psm_error_defer to
+ * defer any remaining or unhandled error handling to PSM.
+ *
+ * [post] If the error handler returns, the error returned is propagated to the
+ * caller. */
+typedef psm_error_t (*psm_ep_errhandler_t)(psm_ep_t ep,
+ const psm_error_t error,
+ const char *error_string,
+ psm_error_token_t token);
+
+/* Obsolete names, only here for backwards compatibility */
+#define PSM_ERRHANDLER_DEFAULT ((psm_ep_errhandler_t)-1)
+#define PSM_ERRHANDLER_NOP ((psm_ep_errhandler_t)-2)
+
+#define PSM_ERRHANDLER_PSM_HANDLER ((psm_ep_errhandler_t)-1)
+/* PSM error handler as explained in error_handling */
+
+#define PSM_ERRHANDLER_NO_HANDLER ((psm_ep_errhandler_t)-2)
+/* Bypasses the default PSM error handler and returns all errors to the user
+ * (this is the default) */
+
+#define PSM_ERRSTRING_MAXLEN 512 /* Maximum error string length.
*/ + +/* PSM error handler registration + * + * Function to register error handlers on a global basis and on a per-endpoint + * basis. PSM_ERRHANDLER_PSM_HANDLER and PSM_ERRHANDLER_NO_HANDLER are special + * pre-defined handlers to respectively enable use of the default PSM-internal + * handler or the no-handler that disables registered error handling and + * returns all errors to the caller (both are documented in error_handling). + * + * [in] ep Handle of the endpoint over which the error handler should be + * registered. With ep set to NULL, the behavior of the + * global error handler can be controlled. + * [in] errhandler Handler to register. Can be a user-specific error + * handling function or PSM_ERRHANDLER_PSM_HANDLER or + * PSM_ERRHANDLER_NO_HANDLER. + * + * @remark When ep is set to NULL, this is the only function that can be + * called before psm_init + */ +psm_error_t +psm_error_register_handler(psm_ep_t ep, const psm_ep_errhandler_t errhandler); + +/* PSM deferred error handler + * + * Function to handle fatal PSM errors if no error handler is installed or if + * the user wishes to defer further error handling to PSM. Depending on the + * type of error, PSM may or may not return from the function call. + * + * [in] err_token Error token initially passed to error handler + * + * [pre] The user is calling into the function because it has decided that PSM + * should handle an error case. + * + * [post] The function may or may not return depending on the error + */ +psm_error_t +psm_error_defer(psm_error_token_t err_token); + +/* Get generic error string from error + * + * Function to return the default error string associated to a PSM error. + * + * While a more detailed and precise error string is usually available within + * error handlers, this function is available to obtain an error string out of + * an error handler context or when a no-op error handler is registered. + * + * [in] error PSM error + */ +const char * +psm_error_get_string(psm_error_t error); + +/* Option key/pair structure + * + * Currently only used in MQ. + */ +struct psm_optkey +{ + uint32_t key; /* Option key */ + void *value; /* Option value */ +}; + + + +/* Endpoint ID + * + * Integral type of size 8 bytes that can be used by the user to globally + * identify a successfully opened endpoint. Although the contents of the + * endpoint id integral type remains opaque to the user, unique network id and + * InfiniPath port number can be extracted using psm_epid_nid and @ref + * psm_epid_context. + */ +typedef uint64_t psm_epid_t; + +/* Endpoint Address (opaque) + * + * Remote endpoint addresses are created when the user binds an endpoint ID + * to a particular endpoint handle using psm_ep_connect. A given endpoint + * address is only guaranteed to be valid over a single endpoint. + */ +typedef struct psm_epaddr *psm_epaddr_t; + +/* PSM Unique UID + * + * PSM type equivalent to the DCE-1 uuid_t, used to uniquely identify an + * endpoint within a particular job. Since PSM does not participate in job + * allocation and management, users are expected to generate a unique ID to + * associate endpoints to a particular parallel or collective job. 
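+ *
+ * As an illustrative sketch, a launcher process might generate the key
+ * once and publish it to every rank through the environment; the string
+ * conversion below uses a libuuid-style uuid_unparse() helper, which is
+ * not part of the PSM API:
+ *
+ * @verbatim
+ * psm_uuid_t job_key;
+ * char str[37]; // 36 characters plus NUL terminator
+ *
+ * psm_uuid_generate(job_key);
+ * uuid_unparse(job_key, str);
+ * setenv("ENDPOINT_UUID", str, 1); // the spawner propagates this
+ * @endverbatim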
+ * [see] psm_uuid_generate + */ +typedef uint8_t psm_uuid_t[16]; + +/* Get Endpoint identifier's Unique Network ID */ +uint64_t +psm_epid_nid(psm_epid_t epid); + +/* Get Endpoint identifier's InfiniPath context number */ +uint64_t +psm_epid_context(psm_epid_t epid); + +/* Get Endpoint identifier's InfiniPath port (deprecated, use + * psm_epid_context instead) */ +uint64_t +psm_epid_port(psm_epid_t epid); + +/* List the number of available InfiniPath units + * + * Function used to determine the amount of locally available InfiniPath units. + * For N units, valid unit numbers in psm_ep_open are 0 to N-1. + * + * [returns] PSM_OK unless the user has not called psm_init + */ +psm_error_t +psm_ep_num_devunits(uint32_t *num_units); + +/* Utility to generate UUIDs for psm_ep_open + * + * This function is available as a utility for generating unique job-wide ids. + * See discussion in psm_ep_open for further information. + * + * @remark This function does not require PSM to be initialized. + */ +void +psm_uuid_generate(psm_uuid_t uuid_out); + +/* Affinity modes for the affinity member of struct psm_ep_open_opts */ +#define PSM_EP_OPEN_AFFINITY_SKIP 0 /* Disable setting affinity */ +#define PSM_EP_OPEN_AFFINITY_SET 1 /* Enable setting affinity unless + already set */ +#define PSM_EP_OPEN_AFFINITY_FORCE 2 /* Enable setting affinity regardless + of current affinity setting */ + +/* Default values for some constants */ +#define PSM_EP_OPEN_PKEY_DEFAULT 0xffffffffffffffffULL + /* Default protection key */ + +/* Endpoint Open Options + * + * These options are available for opening a PSM endpoint. Each is + * individually documented and setting each option to -1 or passing NULL as the + * options parameter in psm_ep_open instructs PSM to use + * implementation-defined defaults. + * + * Each option is documented in psm_ep_open */ +struct psm_ep_open_opts { + int64_t timeout; /* timeout in nanoseconds to open device */ + int unit; /* InfiniPath Unit ID to open on */ + int affinity; /* How PSM should set affinity */ + int shm_mbytes; /* Megabytes used for intra-node communication */ + int sendbufs_num; /* Preallocated send buffers */ +#if PSM_VERNO >= 0x0101 + uint64_t network_pkey; /* Network Protection Key (v1.01) */ +#endif +#if PSM_VERNO >= 0x0107 + int port; /* IB port to use (1 to N) */ +#if PSM_VERNO <= 0x010a + int outvl; /* IB VL to use when sending pkts */ +#endif + int outsl; /* IB SL to use when sending pkts */ +#endif +#if PSM_VERNO >= 0x010d + uint64_t service_id; /* IB Service ID to use for endpoint */ + psm_path_res_t path_res_type; /* Path resolution type */ +#endif +#if PSM_VERNO >= 0x010e + int senddesc_num; /* Preallocated send descriptors */ + int imm_size; /* Immediate data size for endpoint */ +#endif + +}; + +/* InfiniPath endpoint creation + * + * Function used to create a new local communication endpoint on an InfiniPath + * adapter. The returned endpoint handle is required in all PSM communication + * operations, as PSM can manage communication over multiple endpoints. An + * opened endpoint has no global context until the user connects the endpoint + * to other global endpoints by way of psm_ep_connect. All local endpoint + * handles are globally identified by endpoint IDs (psm_epid_t) which are + * also returned when an endpoint is opened. It is assumed that the user can + * provide an out-of-band mechanism to distribute the endpoint IDs in order to + * establish connections between endpoints (psm_ep_connect for more + * information). 
+ *
+ * [in] unique_job_key Endpoint key, to uniquely identify the endpoint in
+ * a parallel job. It is up to the user to ensure
+ * that the key is globally unique over a period long
+ * enough to prevent duplicate keys over the same set
+ * of endpoints (see comments below).
+ *
+ * [in] opts Open options of type psm_ep_open_opts
+ * (see psm_ep_open_opts_get_defaults).
+ *
+ * [out] ep User-supplied storage to return a pointer to the newly
+ * created endpoint. The returned pointer of type psm_ep_t
+ * is a local handle and cannot be used to globally identify the
+ * endpoint.
+ * [out] epid User-supplied storage to return the endpoint ID associated
+ * to the newly created local endpoint returned in the ep
+ * handle. The endpoint ID is an integral type suitable for
+ * uniquely identifying the local endpoint.
+ *
+ * PSM does not internally verify the consistency of the uuid; it is up to the
+ * user to ensure that the uuid is unique enough not to collide with other
+ * currently-running jobs. Users can employ three mechanisms to obtain a uuid.
+ *
+ * 1. Use the supplied psm_uuid_generate utility
+ *
+ * 2. Use an OS or library-specific uuid generation utility that complies with
+ * OSF DCE 1.1, such as uuid_generate on Linux or uuid_create on FreeBSD.
+ * (see http://www.opengroup.org/onlinepubs/009629399/uuid_create.htm)
+ *
+ * 3. Manually pack a 16-byte string using a utility such as /dev/random or
+ * other source with enough entropy and proper seeding to prevent two nodes
+ * from generating the same uuid_t.
+ *
+ * The following options are relevant when opening an endpoint:
+ * * timeout establishes the number of nanoseconds to wait before
+ * failing to open a port (with -1, defaults to 15 secs).
+ * * unit sets the InfiniPath unit number to use to open a port (with
+ * -1, PSM determines the best unit to open the port). If @c
+ * IPATH_UNIT is set in the environment, this setting is ignored.
+ * * affinity enables or disables PSM setting processor affinity. The
+ * option can be controlled to either disable (@ref
+ * PSM_EP_OPEN_AFFINITY_SKIP) or enable the affinity setting
+ * only if it is already unset (@ref
+ * PSM_EP_OPEN_AFFINITY_SET) or regardless of affinity being
+ * set or not (PSM_EP_OPEN_AFFINITY_FORCE).
+ * If IPATH_NO_CPUAFFINITY is set in the environment, this
+ * setting is ignored.
+ * * shm_mbytes sets a maximum amount of megabytes that can be allocated
+ * to each local endpoint ID connected through this
+ * endpoint (with -1, defaults to 10 MB).
+ * * sendbufs_num sets the number of send buffers that can be
+ * pre-allocated for communication (with -1, defaults to
+ * 512 buffers of MTU size).
+ * * network_pkey sets the protection key to employ for point-to-point
+ * PSM communication. Unless a specific value is used,
+ * this parameter should be set to
+ * PSM_EP_OPEN_PKEY_DEFAULT.
+ *
+ * [warning] Currently, PSM limits the user to calling psm_ep_open only once
+ * per process and subsequent calls will fail. Multiple endpoints per process
+ * will be enabled in a future release.
+ *
+ * @verbatim
+ * // In order to open an endpoint and participate in a job, each endpoint has
+ * // to be distributed a unique 16-byte UUID key from an out-of-band source.
+ * // Presumably this can come from the parallel spawning utility either
+ * // indirectly through an implementor's own spawning interface or as in this
+ * // example, the UUID is set as a string in an environment variable
+ * // propagated to all endpoints in the job.
+ *
+ * int try_to_open_psm_endpoint(psm_ep_t *ep, // output endpoint handle
+ * psm_epid_t *epid, // output endpoint identifier
+ * int unit, // unit of our choice
+ * int port) // port of our choice
+ * {
+ * struct psm_ep_open_opts epopts;
+ * psm_uuid_t job_uuid;
+ * char *c;
+ *
+ * // Let PSM assign its default values to the endpoint options.
+ * psm_ep_open_opts_get_defaults(&epopts);
+ *
+ * // We want a stricter timeout and a specific unit
+ * epopts.timeout = 15*1e9; // 15 second timeout
+ * epopts.unit = unit; // We want a specific unit, -1 would let PSM
+ * // choose the unit for us.
+ * epopts.port = port; // We want a specific port, <= 0 would let PSM
+ * // choose the port for us.
+ * // We've already set affinity, don't let PSM do so if it wants to.
+ * if (epopts.affinity == PSM_EP_OPEN_AFFINITY_SET)
+ * epopts.affinity = PSM_EP_OPEN_AFFINITY_SKIP;
+ *
+ * // ENDPOINT_UUID is set to the same value in the environment of all the
+ * // processes that wish to communicate over PSM and was generated by
+ * // the process spawning utility
+ * c = getenv("ENDPOINT_UUID");
+ * if (c && *c)
+ * implementor_string_to_16byte_packing(c, job_uuid);
+ * else {
+ * fprintf(stderr, "Can't find UUID for endpoint\n");
+ * return -1;
+ * }
+ *
+ * // Assume we don't want to handle errors here.
+ * psm_ep_open(job_uuid, &epopts, ep, epid);
+ * return 1;
+ * }
+ * @endverbatim */
+psm_error_t
+psm_ep_open(const psm_uuid_t unique_job_key, const struct psm_ep_open_opts *opts,
+ psm_ep_t *ep, psm_epid_t *epid);
+
+/* Endpoint open default options.
+ *
+ * Function used to initialize the set of endpoint options to their default
+ * values for use in psm_ep_open.
+ *
+ * [out] opts Endpoint Open options.
+ *
+ * [warning] For portable operation, users should always call this function
+ * prior to calling psm_ep_open.
+ *
+ * [return] PSM_OK If result could be updated
+ * [return] PSM_INIT_NOT_INIT If psm has not been initialized.
+ */
+psm_error_t
+psm_ep_open_opts_get_defaults(struct psm_ep_open_opts *opts);
+
+/* Endpoint shared memory query
+ *
+ * Function used to determine if a remote endpoint shares memory with a
+ * currently opened local endpoint.
+ *
+ * [in] ep Endpoint handle
+ * [in] epid Endpoint ID
+ *
+ * [out] result Result is non-zero if the remote endpoint shares memory with the local
+ * endpoint ep, or zero otherwise.
+ *
+ * [return] PSM_OK If result could be updated
+ * [return] PSM_EPID_UNKNOWN If the epid is not recognized
+ */
+psm_error_t
+psm_ep_epid_share_memory(psm_ep_t ep, psm_epid_t epid, int *result);
+
+/* Close endpoint
+ * [in] ep PSM endpoint handle
+ * [in] mode One of PSM_EP_CLOSE_GRACEFUL or PSM_EP_CLOSE_FORCE
+ * [in] timeout How long to wait in nanoseconds if mode is
+ * PSM_EP_CLOSE_GRACEFUL, 0 waits forever. If mode is
+ * PSM_EP_CLOSE_FORCE, this parameter is ignored.
+ *
+ * The following errors are returned, others are handled by the per-endpoint
+ * error handler:
+ *
+ * [return] PSM_OK Endpoint was successfully closed without force or
+ * successfully closed with force within the supplied timeout.
+ * [return] PSM_EP_CLOSE_TIMEOUT Endpoint could not be successfully closed
+ * within timeout.
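+ *
+ * As an illustrative sketch, a caller might first attempt a graceful
+ * close and fall back to a forced close (the function name and the
+ * 5-second budget are this example's choice; error handling elided):
+ *
+ * @verbatim
+ * void shutdown_endpoint(psm_ep_t ep)
+ * {
+ *     // Try a graceful close for up to 5 seconds...
+ *     if (psm_ep_close(ep, PSM_EP_CLOSE_GRACEFUL, 5*1e9)
+ *             == PSM_EP_CLOSE_TIMEOUT)
+ *         // ...then force the close; timeout is ignored in this mode.
+ *         psm_ep_close(ep, PSM_EP_CLOSE_FORCE, 0);
+ * }
+ * @endverbatim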
+ */
+psm_error_t
+psm_ep_close(psm_ep_t ep, int mode, int64_t timeout);
+
+#define PSM_EP_CLOSE_GRACEFUL 0 /* Graceful close mode in psm_ep_close */
+#define PSM_EP_CLOSE_FORCE 1 /* Forceful close mode in psm_ep_close */
+
+/* Provide mappings for network id to hostname
+ *
+ * Since PSM does not assume or rely on the availability of an external
+ * networkid-to-hostname mapping service, users can provide one or more of
+ * these mappings. The psm_map_nid_hostname function allows a list of
+ * network ids to be associated to hostnames.
+ *
+ * This function is not mandatory for correct operation but may allow PSM to
+ * provide better diagnostics when remote endpoints are unavailable and can
+ * otherwise only be identified by their network id.
+ *
+ * [in] num Number of elements in the nids and hostnames arrays
+ * [in] nids User-provided array of network ids (i.e. InfiniBand LIDs),
+ * should be obtained by calling psm_epid_nid on each
+ * epid.
+ * [in] hostnames User-provided array of hostnames (array of
+ * NUL-terminated strings) where each hostname index
+ * maps to the provided nid hostname.
+ *
+ * [warning] Duplicate nids may be provided in the input nids array; only
+ * the first corresponding hostname will be remembered.
+ *
+ * [pre] The user may or may not have already provided hostname mappings.
+ * [post] The user may free any dynamically allocated memory passed to the
+ * function.
+ *
+ */
+psm_error_t
+psm_map_nid_hostname(int num, const uint64_t *nids, const char **hostnames);
+
+/* Connect one or more remote endpoints to a local endpoint
+ *
+ * Function to non-collectively establish a connection to a set of endpoint IDs
+ * and translate endpoint IDs into endpoint addresses. Establishing a remote
+ * connection with a set of remote endpoint IDs does not imply a collective
+ * operation and the user is free to connect unequal sets on each process.
+ * Similarly, a given endpoint address does not imply that a pairwise
+ * communication context exists between the local endpoint and remote endpoint.
+ *
+ * [in] ep PSM endpoint handle
+ *
+ * [in] num_of_epid The number of endpoints to connect to, which
+ * also establishes the number of elements contained in
+ * all of the function's array-based parameters.
+ *
+ * [in] array_of_epid User-allocated array that contains num_of_epid
+ * valid endpoint identifiers. Each endpoint id (or
+ * epid) has been obtained through an out-of-band
+ * mechanism and each endpoint must have been opened
+ * with the same uuid key.
+ *
+ * [in] array_of_epid_mask User-allocated array that contains num_of_epid
+ * integers. This array of masks allows users to
+ * select which of the epids in array_of_epid
+ * should be connected. If the integer at index i is
+ * zero, psm does not attempt to connect to the epid
+ * at index i in array_of_epid. If this parameter
+ * is NULL, psm will try to connect to each epid.
+ *
+ * [out] array_of_errors User-allocated array of at least num_of_epid
+ * elements. If the function does not return
+ * PSM_OK, this array can be consulted for each
+ * endpoint not masked off by array_of_epid_mask
+ * to know why the endpoint could not be connected.
+ * Endpoints that could not be connected because of
+ * an unrelated failure will be marked as @ref
+ * PSM_EPID_UNKNOWN. If the function returns
+ * PSM_OK, the errors for all endpoints will also
+ * contain PSM_OK.
+ *
+ * [out] array_of_epaddr User-allocated array of at least num_of_epid
+ * elements of type psm_epaddr_t.
+ * Each successfully connected endpoint is updated with
+ * an endpoint address handle that corresponds to
+ * the endpoint id at the same index in @c
+ * array_of_epid. Handles are only updated if the
+ * endpoint could be connected and if its error in
+ * array_of_errors is PSM_OK.
+ *
+ * [in] timeout Timeout in nanoseconds after which connection attempts will
+ * be abandoned. Setting this value to 0 disables timeout
+ * and waits until all endpoints have been successfully
+ * connected or until an error is detected.
+ *
+ * [pre] The user has opened a local endpoint and obtained a list of endpoint
+ * IDs to connect to a given endpoint handle using an out-of-band
+ * mechanism not provided by PSM.
+ *
+ * [post] If the connect is successful, array_of_epaddr is updated with valid
+ * endpoint addresses.
+ *
+ * [post] If unsuccessful, the user can query the return status of each
+ * individual remote endpoint in array_of_errors.
+ *
+ * [post] The user can call into psm_ep_connect many times with the same
+ * endpoint ID and the function is guaranteed to return the same output
+ * parameters.
+ *
+ * [post] PSM does not keep any reference to the arrays passed into the
+ * function and the caller is free to deallocate them.
+ *
+ * The error value with the highest importance is returned by
+ * the function if some portion of the communication failed. Users should
+ * always refer to individual errors in array_of_errors whenever the
+ * function cannot return PSM_OK.
+ *
+ * [returns] PSM_OK The entire set of endpoint IDs were successfully connected
+ * and endpoint addresses are available for all endpoint IDs.
+ *
+ * @verbatim
+ * int connect_endpoints(psm_ep_t ep, int numep, const psm_epid_t *array_of_epid,
+ * psm_epaddr_t **array_of_epaddr_out)
+ * {
+ * psm_error_t *errors = (psm_error_t *)
+ * calloc(numep, sizeof(psm_error_t));
+ * if (errors == NULL)
+ * return -1;
+ *
+ * psm_epaddr_t *all_epaddrs =
+ * (psm_epaddr_t *) calloc(numep, sizeof(psm_epaddr_t));
+ *
+ * if (all_epaddrs == NULL) {
+ * free(errors); // don't leak the error array on failure
+ * return -1;
+ * }
+ *
+ * psm_ep_connect(ep, numep, array_of_epid,
+ * NULL, // We want to connect all epids, no mask needed
+ * errors,
+ * all_epaddrs,
+ * 30*1e9); // 30 second timeout, <1 ns is forever
+ * *array_of_epaddr_out = all_epaddrs;
+ * free(errors);
+ * return 1;
+ * }
+ * @endverbatim */
+psm_error_t
+psm_ep_connect(psm_ep_t ep, int num_of_epid, const psm_epid_t *array_of_epid,
+ const int *array_of_epid_mask, psm_error_t *array_of_errors,
+ psm_epaddr_t *array_of_epaddr, int64_t timeout);
+
+/* Ensure endpoint communication progress
+ *
+ * Function to ensure progress for all PSM components instantiated on an
+ * endpoint (currently, this only includes the MQ component). The function
+ * never blocks and is typically required in two cases:
+ *
+ * * Allowing all PSM components instantiated over a given endpoint to make
+ * communication progress. Refer to mq_progress for a detailed
+ * discussion on MQ-level progress issues.
+ *
+ * * Cases where users write their own synchronization primitives that
+ * depend on remote communication (such as spinning on a memory location
+ * whose new value depends on ongoing communication).
+ *
+ * The poll function doesn't block, but the user can rely on the @ref
+ * PSM_OK_NO_PROGRESS return value to control polling behaviour in terms of
+ * frequency (poll until an event happens) or execution environment (poll for a
+ * while but yield to other threads if CPUs are oversubscribed).
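+ *
+ * As an illustrative sketch, a caller might poll until progress occurs,
+ * yielding the processor between attempts (the sched_yield() policy is
+ * this example's choice, not mandated by PSM):
+ *
+ * @verbatim
+ * #include <sched.h>
+ *
+ * void wait_for_progress(psm_ep_t ep)
+ * {
+ *     // Spin until PSM reports that some communication progressed.
+ *     while (psm_poll(ep) == PSM_OK_NO_PROGRESS)
+ *         sched_yield();
+ * }
+ * @endverbatim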
+ * + * [returns] PSM_OK Some communication events were progressed + * [returns] PSM_OK_NO_PROGRESS Polling did not yield any communication progress + * + */ +psm_error_t +psm_poll(psm_ep_t ep); + +/* Set a user-determined ep address label. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect + * [in] epaddr_label_string User-allocated string to print when + * identifying endpoint in error handling or other verbose + * printing. The NULL-terminated string must be allocated by + * the user since PSM only keeps a pointer to the label. If + * users do not explicitly set a label for each endpoint, + * endpoints will identify themselves as hostname:port. + */ +void +psm_epaddr_setlabel(psm_epaddr_t epaddr, const char *epaddr_label_string); + +/* Set a user-determined ep address context. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect + * [in] ctxt Opaque user defined state to associate with an endpoint + * address. This state can be retrieved via + * psm_epaddr_getctxt. + */ +void +psm_epaddr_setctxt(psm_epaddr_t epaddr, void *ctxt); + +/* Get the user-determined ep address context. Users can associate an + * opaque context with each endpoint via psm_epaddr_setctxt. + * + * [in] epaddr Endpoint address, obtained from psm_ep_connect. + */ +void * +psm_epaddr_getctxt(psm_epaddr_t epaddr); + +/* Below are all component specific options. The component object for each of + * the options is also specified. + */ + +/* PSM_COMPONENT_CORE options */ +/* PSM debug level */ +#define PSM_CORE_OPT_DEBUG 0x101 + /* [uint32_t ] Set/Get the PSM debug level. This option can be set + * before initializing the PSM library. + * + * component object: (null) + * option value: PSM Debug mask to set or currently active debug level. + */ + +/* PSM endpoint address context */ +#define PSM_CORE_OPT_EP_CTXT 0x102 + /* [uint32_t ] Set/Get the context associated with a PSM endpoint + * address (psm_epaddr_t). + * + * component object: PSM endpoint (psm_epaddr_t) address. + * option value: Context associated with PSM endpoint address. + */ + +/* PSM_COMPONENT_IB options */ +/* Default service level to use to communicate with remote endpoints */ +#define PSM_IB_OPT_DF_SL 0x201 + /* [uint32_t ] Default Infiniband SL to use for all remote communication. + * If unset defaults to Service Level 0. + * + * component object: Opened PSM endpoint id (psm_ep_t). + * option value: Default IB SL to use for endpoint. (0 <= SL < 15) + */ + +/* Set IB service level to use for communication to an endpoint */ +#define PSM_IB_OPT_EP_SL 0x202 + /* [uint32_t ] Infiniband SL to use for communication to specified + * remote endpoint. + * + * component object: PSM endpoint (@ ref psm_epaddr_t) address. + * option value: SL used to communicate with remote endpoint. (0 <= SL < 15) + */ + +/* PSM_COMPONENT_MQ options (deprecates psm_mq_set|getopt) */ +/* MQ options that can be set in psm_mq_init and psm_{set,get}_opt */ +#define PSM_MQ_OPT_RNDV_IB_SZ 0x301 +#define PSM_MQ_RNDV_IPATH_SZ PSM_MQ_OPT_RNDV_IB_SZ + /* [uint32_t ] Size at which to start enabling rendezvous + * messaging for InfiniPath messages (if unset, defaults to values + * between 56000 and 72000 depending on the system configuration) + * + * component object: PSM Matched Queue (psm_mq_t). + * option value: Size at which to switch to rendezvous protocol. 
+ */
+
+#define PSM_MQ_OPT_RNDV_SHM_SZ 0x302
+#define PSM_MQ_RNDV_SHM_SZ PSM_MQ_OPT_RNDV_SHM_SZ
+ /* [uint32_t ] Size at which to start enabling
+ * rendezvous messaging for shared memory (intra-node) messages (if
+ * unset, defaults to 64000 bytes).
+ *
+ * component object: PSM Matched Queue (psm_mq_t).
+ * option value: Size at which to switch to rendezvous protocol.
+ */
+
+#define PSM_MQ_OPT_SYSBUF_MYBYTES 0x303
+#define PSM_MQ_MAX_SYSBUF_MBYTES PSM_MQ_OPT_SYSBUF_MYBYTES
+ /* [uint32_t ] Maximum number of bytes to allocate for unexpected
+ * messages.
+ *
+ * component object: PSM Matched Queue (psm_mq_t).
+ * option value: Maximum number of bytes to allocate for unexpected messages.
+ * Messages that would cause memory allocation to exceed this amount will be
+ * dropped.
+ */
+
+
+/* PSM_COMPONENT_AM options */
+#define PSM_AM_OPT_FRAG_SZ 0x401
+
+
+/* Set an option for a PSM component
+ *
+ * Function to set the value of a PSM component option
+ *
+ * [in] component Type of PSM component for which to set the option
+ * [in] component_obj Opaque component-specific object to apply the set
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * [in] optname Name of component option to set. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * [in] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a memory location with a
+ * correct size and format.
+ * [in] optlen Size of the memory region pointed to by optval.
+ *
+ * [returns] PSM_OK if option could be set.
+ * [returns] PSM_PARAM_ERR if the component or optname are not valid.
+ * [returns] PSM_OPT_READONLY if the option to be set is a read-only option.
+ *
+ */
+psm_error_t
+psm_setopt(psm_component_t component, const void *component_obj,
+ int optname, const void *optval, uint64_t optlen);
+
+/* Get an option for a PSM component
+ *
+ * Function to get the value of a PSM component option
+ *
+ * [in] component Type of PSM component for which to get the option
+ * [in] component_obj Opaque component-specific object to apply the get
+ * operation on. These are passed uninterpreted to the
+ * appropriate component for interpretation.
+ * [in] optname Name of component option to get. These are component
+ * specific and passed uninterpreted to the appropriate
+ * component for interpretation.
+ * [out] optval Pointer to storage that contains the value to be updated
+ * for the supplied option. It is up to the user to
+ * ensure that the pointer points to a valid memory region.
+ * [in,out] optlen This is a value-result parameter initially containing
+ * the size of the memory region pointed to by optval and
+ * modified to return the actual size of optval.
+ *
+ * [returns] PSM_OK if option value could be retrieved successfully.
+ * [returns] PSM_PARAM_ERR if the component or optname are not valid.
+ * [returns] PSM_NO_MEMORY if the memory region optval is of insufficient size.
+ * optlen contains the required memory region size for
+ * optname value.
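+ *
+ * As an illustrative sketch (mq is an MQ handle assumed to come from
+ * psm_mq_init; the threshold value is arbitrary and error handling is
+ * elided):
+ *
+ * @verbatim
+ * uint32_t rndv = 0;
+ * uint64_t len = sizeof(rndv);
+ *
+ * // Read the current shared-memory rendezvous threshold; len is a
+ * // value-result parameter and returns the actual value size.
+ * psm_getopt(PSM_COMPONENT_MQ, mq, PSM_MQ_OPT_RNDV_SHM_SZ, &rndv, &len);
+ *
+ * // Raise the threshold; for psm_setopt, optlen is passed by value.
+ * rndv = 128000;
+ * psm_setopt(PSM_COMPONENT_MQ, mq, PSM_MQ_OPT_RNDV_SHM_SZ,
+ *            &rndv, sizeof(rndv));
+ * @endverbatim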
+ *
+ */
+psm_error_t
+psm_getopt(psm_component_t component, const void *component_obj,
+ int optname, void *optval, uint64_t *optlen);
+
+/* Datatype for end-point information */
+typedef struct psm_epinfo {
+ psm_ep_t ep; /* The ep for this end-point */
+ psm_epid_t epid; /* The epid for this end-point */
+ psm_uuid_t uuid; /* The UUID for this end-point */
+ char uuid_str[64]; /* String representation of the UUID for this end-point */
+} psm_epinfo_t;
+
+/* Datatype for end-point connection */
+typedef struct psm_epconn {
+ psm_epaddr_t addr; /* The epaddr for this connection */
+ psm_ep_t ep; /* The ep for this connection */
+ psm_mq_t mq; /* The mq for this connection */
+} psm_epconn_t;
+
+/* Query PSM for end-point information.
+ *
+ * Function to query PSM for end-point information. This allows retrieval of end-point
+ * information in cases where the caller does not have access to the results of psm_ep_open().
+ * In single-rail mode PSM will use a single end-point. In multi-rail mode, PSM will use an
+ * end-point per rail.
+ *
+ * [in,out] num_of_epinfo On input, sizes the available number of entries in array_of_epinfo.
+ * On output, specifies the returned number of entries in array_of_epinfo.
+ * [out] array_of_epinfo Returns end-point information structures.
+ *
+ * [pre] PSM is initialized and the end-point has been opened.
+ *
+ * [returns] PSM_OK indicates success.
+ * [returns] PSM_PARAM_ERR if input num_of_epinfo is less than or equal to zero.
+ * [returns] PSM_EP_WAS_CLOSED if PSM end-point is closed or does not exist.
+ */
+psm_error_t
+psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo);
+
+/* Query PSM for end-point connections.
+ *
+ * Function to query PSM for end-point connections. This allows retrieval of end-point
+ * connections in cases where the caller does not have access to the results of psm_ep_connect().
+ * The epid values can be found using psm_ep_query() so that each PSM process can determine
+ * its own epid. These values can then be distributed across the PSM processes so that each PSM
+ * process knows the epid for all other PSM processes.
+ *
+ * [in] epid The epid of a PSM process.
+ * [out] epconn The connection information for that PSM process.
+ *
+ * [pre] PSM is initialized and the end-point has been connected to this epid.
+ *
+ * [returns] PSM_OK indicates success.
+ * [returns] PSM_EP_WAS_CLOSED if PSM end-point is closed or does not exist.
+ * [returns] PSM_EPID_UNKNOWN if the epid value is not known to PSM.
+ */
+psm_error_t
+psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm.supp b/psm.supp
new file mode 100644
index 0000000..3113ad0
--- /dev/null
+++ b/psm.supp
@@ -0,0 +1,58 @@
+
+# userinit
+{
+ syscall_ipath_userinit
+ Memcheck:Param
+ write(buf)
+ fun:__write_nocancel
+ fun:ipath_userinit
+}
+
+# syscall poll type
+{
+ syscall_poll_type
+ Memcheck:Param
+ write(buf)
+ obj:/lib64/libc*.so
+ fun:ipath_poll_type
+}
+
+# Tids de-allocation.
+{
+ syscall_tid_free
+ Memcheck:Param
+ write(buf)
+ obj:/lib64/libc*.so
+ fun:ips_tid_release
+}
+
+# Tids allocation.
+{ + syscall_tid_alloc + Memcheck:Param + write(buf) + obj:/lib64/libc*so + fun:ips_tid_acquire +} + +# really in QLogic MPI +{ + mpspawn_socket + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:psc_skt_sendN +} + +# gethostbyname on sles +{ + gethostbyname + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:get_mapping + fun:__nscd_get_map_ref + fun:nscd_gethst_r + fun:__nscd_gethostbyname_r + fun:gethostbyname_r@@GLIBC_2.2.5 +} diff --git a/psm_am.c b/psm_am.c new file mode 100644 index 0000000..d5db5c7 --- /dev/null +++ b/psm_am.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_am_internal.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +static int _ignore_handler(PSMI_AM_ARGS_DEFAULT) +{ + return 0; +} + +int psmi_abort_handler(PSMI_AM_ARGS_DEFAULT) +{ + abort(); + return 0; +} + +psm_error_t +psmi_am_init_internal(psm_ep_t ep) +{ + int i; + psm_am_handler_fn_t *am_htable; + + ep->am_htable = + psmi_malloc(ep, UNDEFINED, + sizeof(psm_am_handler_fn_t) * PSMI_AM_NUM_HANDLERS); + if (ep->am_htable == NULL) + return PSM_NO_MEMORY; + + am_htable = (psm_am_handler_fn_t *) ep->am_htable; + for (i = 0; i < PSMI_AM_NUM_HANDLERS; i++) + am_htable[i] = _ignore_handler; + + return PSM_OK; +} + +psm_error_t +__psm_am_register_handlers(psm_ep_t ep, + const psm_am_handler_fn_t *handlers, + int num_handlers, int *handlers_idx) +{ + int i, j; + + /* For now just assign any free one */ + for (i = 0, j = 0; i < PSMI_AM_NUM_HANDLERS; i++) { + if (ep->am_htable[i] == _ignore_handler) { + ep->am_htable[i] = handlers[j]; + handlers_idx[j] = i; + if (++j == num_handlers) /* all registered */ + break; + } + } + + if (j < num_handlers) { + /* Not enough free handlers, restore unused handlers */ + for (i = 0; i < j; i++) + ep->am_htable[handlers_idx[i]] = _ignore_handler; + + return psmi_handle_error(ep, PSM_EP_NO_RESOURCES, "Insufficient " + "available AM handlers: registered %d of %d requested handlers", + j, num_handlers); + } + else + return PSM_OK; +} +PSMI_API_DECL(psm_am_register_handlers) + +psm_error_t +__psm_am_request_short(psm_epaddr_t epaddr, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_error_t err; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + err = ptlc->am_short_request(epaddr, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_am_request_short) + +psm_error_t +__psm_am_reply_short(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, size_t len, + int flags, psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_error_t err; + struct psmi_am_token *tok = (struct psmi_am_token *)token; + psm_epaddr_t epaddr = tok->epaddr_from; + ptl_ctl_t *ptlc = epaddr->ptlctl; + + psmi_assert_always(token != NULL); + + /* No locking here since we are already within handler context and already + * locked */ + + PSMI_ASSERT_INITIALIZED(); + + err = ptlc->am_short_reply(token, handler, args, + nargs, src, len, flags, completion_fn, + completion_ctxt); + return err; +} +PSMI_API_DECL(psm_am_reply_short) + +psm_error_t +__psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out) +{ + struct psm_am_parameters params; + size_t s; + uint32_t frag_sz; + /* This is the same calculation as PSM_AM_OPT_FRAG_SZ in psm_utils.c */ + frag_sz = (ep && psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) ? 
+ (ep->context.base_info.spi_piosize -
+ IPATH_MESSAGE_HDR_SIZE) : 2048;
+ params.max_handlers = PSMI_AM_NUM_HANDLERS;
+ params.max_nargs = PSMI_AM_MAX_ARGS;
+ params.max_request_short = frag_sz;
+ params.max_reply_short = frag_sz;
+ memset(parameters, 0, sizeof_parameters_in);
+ s = min(sizeof(params), sizeof_parameters_in);
+ memcpy(parameters, &params, s);
+ *sizeof_parameters_out = s;
+ return PSM_OK;
+}
+PSMI_API_DECL(psm_am_get_parameters)
diff --git a/psm_am.h b/psm_am.h
new file mode 100644
index 0000000..c91c66e
--- /dev/null
+++ b/psm_am.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PSM_AM_H
+#define PSM_AM_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Datatype for an index number representing an active message handler */
+typedef uint32_t psm_handler_t;
+
+/* Datatype for a token for an active message handler. */
+typedef void *psm_am_token_t;
+
+/* PSM AM flags
+ * These flags may be combined using bitwise-or.
+ */
+#define PSM_AM_FLAG_NONE 0 /* This flag should be used when no other PSM AM flags are needed. */
+#define PSM_AM_FLAG_ASYNC 1 /* This flag indicates no need to copy source data. */
+#define PSM_AM_FLAG_NOREPLY 2 /* This flag indicates that the handler for this AM request is guaranteed not to generate a reply. */
+
+/* The psm_amarg type represents the type of an AM argument. This is
+ * a 64-bit type and is broken down into four 16-bit fields, two 32-bit
+ * fields or one 64-bit field for the convenience of code using the PSM AM
+ * interface.
+ */
+typedef
+struct psm_amarg {
+ union {
+ struct {
+ uint16_t u16w3;
+ uint16_t u16w2;
+ uint16_t u16w1;
+ uint16_t u16w0;
+ };
+ struct {
+ uint32_t u32w1;
+ uint32_t u32w0;
+ };
+ uint64_t u64w0;
+ uint64_t u64;
+ };
+}
+psm_amarg_t;
+
+/* The AM handler function type
+ *
+ * psm_am_handler_fn_t is the datatype for an AM handler. PSM AM will call-back
+ * into an AM handler using this function prototype. The parameters and result
+ * of these handler functions are described here.
+ *
+ * [in] token This is an opaque token value passed into a handler.
+ * A request handler may send at most one reply back to the original
+ * requestor, and must pass this value as the token parameter
+ * to the psm_am_reply_short() function. A reply handler is also
+ * passed a token value, but must not attempt to reply.
+ * [in] epaddr The end-point address of the other party in this AM transaction.
+ * [in] args A pointer to the arguments provided to this handler.
+ * [in] nargs The number of arguments.
+ * [in] src A pointer to the data payload provided to this handler.
+ * [in] len The length of the data payload in bytes.
+ *
+ * [returns] 0 The handler should always return a result of 0.
+ */
+typedef
+int (*psm_am_handler_fn_t)(psm_am_token_t token, psm_epaddr_t epaddr,
+ psm_amarg_t *args, int nargs,
+ void *src, uint32_t len);
+
+/* Type for a completion call-back handler.
+ *
+ * A completion handler can be specified to give a call-back on the initiation
+ * side that an AM request or reply has completed on the target side. The call-back
+ * has a context pointer which is provided along with the call-back function
+ * pointer when the initiator generates the request or reply. This approach will
+ * typically give higher performance than using an AM request or reply to achieve
+ * the same effect, though note that no additional information can be passed
+ * from the target side back to the initiator side with the completion handler
+ * approach.
+ *
+ * [in] context A context pointer.
+ * [returns] void This handler has no return result.
+ */
+typedef
+void (*psm_am_completion_fn_t)(void *context);
+
+/* Register AM call-back handlers at the specified end-point.
+ *
+ * This function is used to register an array of handlers, and may be called
+ * multiple times to register additional handlers. The maximum number of handlers
+ * that can be registered is limited to the max_handlers value returned by
+ * psm_am_get_parameters(). Handlers are associated with a PSM end-point. The
+ * handlers are allocated index numbers in the handler table for that end-point.
+ * The allocated index for the handler function in handlers[i] is returned in
+ * handlers_idx[i] for i in [0, num_handlers). These handler index values are
+ * used in the psm_am_request_short() and psm_am_reply_short() functions.
+ *
+ * [in] ep End-point value
+ * [in] handlers Array of handler functions
+ * [in] num_handlers Number of handlers (sizes the handlers and handlers_idx arrays)
+ * [out] handlers_idx Used to return handler index mapping table
+ *
+ * [returns] PSM_OK Indicates success
+ * [returns] PSM_EP_NO_RESOURCES Insufficient slots in the AM handler table
+ */
+psm_error_t psm_am_register_handlers(psm_ep_t ep,
+ const psm_am_handler_fn_t *handlers,
+ int num_handlers, int *handlers_idx);
+
+/* Generate an AM request.
+ *
+ * This function generates an AM request causing an AM handler function to be
+ * called in the PSM process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in bytes
+ * to max_request_short returned by the psm_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the argument
+ * pointer will not be dereferenced. If payload is not required, set the payload size
+ * to 0 and the payload pointer will not be dereferenced.
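+ *
+ * As an illustrative sketch of the request side (epaddr comes from
+ * psm_ep_connect and hidx from psm_am_register_handlers; both names are
+ * this example's, and error handling is elided):
+ *
+ * @verbatim
+ * psm_amarg_t arg;
+ * char payload[] = "hello";
+ *
+ * arg.u64w0 = 42; // a single 64-bit argument
+ *
+ * // Run remote handler hidx; without PSM_AM_FLAG_ASYNC the payload
+ * // may be copied internally, so it can be reused on return.
+ * psm_am_request_short(epaddr, hidx, &arg, 1, payload, sizeof(payload),
+ *                      PSM_AM_FLAG_NONE, NULL, NULL);
+ * @endverbatim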
+ *
+ * Optionally a completion function and completion context pointer can be provided,
+ * and a local call-back will be made to that function passing in that context
+ * pointer once remote execution of the handler has completed. If the completion
+ * call-back is not required, the handler should be specified as NULL and the
+ * pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with bitwise-or:
+ * PSM_AM_FLAG_NONE - No flags
+ * PSM_AM_FLAG_ASYNC - Indicates no need to copy source data
+ * PSM_AM_FLAG_NOREPLY - The handler for this AM request is guaranteed not to generate a reply
+ *
+ * The PSM AM implementation will not dereference the args pointer after return from
+ * this function. If PSM_AM_FLAG_ASYNC is not provided, the PSM AM implementation will
+ * not dereference the src pointer after return from this function. This may require the
+ * implementation to take a copy of the payload if the request cannot be issued immediately.
+ * However, if PSM_AM_FLAG_ASYNC is provided then a copy will not be taken and the PSM AM
+ * implementation retains ownership of the payload src memory until the request is locally
+ * complete. Local completion can be determined using the completion handler call-back, or
+ * through an AM handler associated with an AM reply.
+ *
+ * The PSM_AM_FLAG_NOREPLY flag indicates ahead of time to the AM handler that a reply will
+ * not be generated. Use of this flag is optional, but it may enable a performance optimization
+ * in this case by indicating that reply state is not required.
+ *
+ * [in] epaddr End-point address to run handler on
+ * [in] handler Index of handler to run
+ * [in] args Array of arguments to be provided to the handler
+ * [in] nargs Number of arguments to be provided to the handler
+ * [in] src Pointer to the payload to be delivered to the handler
+ * [in] len Length of the payload in bytes
+ * [in] flags These are PSM AM flags and may be combined together with bitwise-or
+ * [in] completion_fn The completion function to be called locally when the remote handler is complete
+ * [in] completion_ctxt User-provided context pointer to be passed to the completion handler
+ *
+ * [returns] PSM_OK indicates success.
+ */
+psm_error_t
+psm_am_request_short(psm_epaddr_t epaddr, psm_handler_t handler,
+ psm_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* Generate an AM reply.
+ *
+ * This function may only be called from an AM handler called due to an AM request.
+ * If the AM request uses the PSM_AM_FLAG_NOREPLY flag, the AM handler must not
+ * call this function. Otherwise, the AM request handler may call psm_am_reply_short()
+ * at most once, and must pass in the token value that it received in its own handler
+ * call-back.
+ *
+ * This function generates an AM reply causing an AM handler function to be
+ * called in the PSM process associated with the specified end-point address.
+ * The number of arguments is limited to max_nargs and the payload length in bytes
+ * to max_reply_short returned by the psm_am_get_parameters() function.
+ * If arguments are not required, set the number of arguments to 0 and the argument
+ * pointer will not be dereferenced. If payload is not required, set the payload size
+ * to 0 and the payload pointer will not be dereferenced.
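+ *
+ * For instance, a request handler might reply as follows (sketch only;
+ * reply_hidx is a hypothetical handler index registered on the
+ * requestor's side):
+ *
+ * @verbatim
+ * int my_request_handler(psm_am_token_t token, psm_epaddr_t epaddr,
+ *                        psm_amarg_t *args, int nargs,
+ *                        void *src, uint32_t len)
+ * {
+ *     psm_amarg_t status;
+ *     status.u32w0 = 0; // report success back to the requestor
+ *     psm_am_reply_short(token, reply_hidx, &status, 1, NULL, 0,
+ *                        PSM_AM_FLAG_NONE, NULL, NULL);
+ *     return 0;
+ * }
+ * @endverbatim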
+ *
+ * Optionally a completion function and completion context pointer can be provided,
+ * and a local call-back will be made to that function passing in that context
+ * pointer once remote execution of the handler has completed. If the completion
+ * call-back is not required, the handler should be specified as NULL and the
+ * pointer value will not be used.
+ *
+ * The allowed flags are any combination of the following combined with bitwise-or:
+ * PSM_AM_FLAG_NONE - No flags
+ * PSM_AM_FLAG_ASYNC - Indicates no need to copy source data
+ *
+ * The PSM AM implementation will not dereference the args pointer after return from
+ * this function. If PSM_AM_FLAG_ASYNC is not provided, the PSM AM implementation will
+ * not dereference the src pointer after return from this function. This may require the
+ * implementation to take a copy of the payload if the reply cannot be issued immediately.
+ * However, if PSM_AM_FLAG_ASYNC is provided then a copy will not be taken and the PSM AM
+ * implementation retains ownership of the payload src memory until the reply is locally
+ * complete. Local completion can be determined using the completion handler call-back.
+ *
+ * [in] token Token value provided to the AM handler that is generating the reply.
+ * [in] handler Index of handler to run
+ * [in] args Array of arguments to be provided to the handler
+ * [in] nargs Number of arguments to be provided to the handler
+ * [in] src Pointer to the payload to be delivered to the handler
+ * [in] len Length of the payload in bytes
+ * [in] flags These are PSM AM flags and may be combined together with bitwise-or
+ * [in] completion_fn The completion function to be called locally when the remote handler is complete
+ * [in] completion_ctxt User-provided context pointer to be passed to the completion handler
+ *
+ * [returns] PSM_OK indicates success.
+ */
+psm_error_t
+psm_am_reply_short(psm_am_token_t token, psm_handler_t handler,
+ psm_amarg_t *args, int nargs, void *src, size_t len,
+ int flags, psm_am_completion_fn_t completion_fn,
+ void *completion_ctxt);
+
+/* AM parameters
+ *
+ * This structure is used to return PSM AM implementation-specific parameter
+ * values back to the caller of the psm_am_get_parameters() function. This
+ * API also specifies the minimum values for these parameters that an
+ * implementation must at least provide:
+ * max_handlers >= 64,
+ * max_nargs >= 2,
+ * max_request_short >= 256 and
+ * max_reply_short >= 256.
+ */
+struct psm_am_parameters {
+ uint32_t max_handlers; /* Maximum number of handlers that can be registered. */
+ uint32_t max_nargs; /* Maximum number of arguments to an AM handler. */
+ uint32_t max_request_short; /* Maximum number of bytes in a request payload. */
+ uint32_t max_reply_short; /* Maximum number of bytes in a reply payload. */
+};
+
+/* Get the AM parameter values
+ *
+ * This function retrieves the implementation-specific AM parameter values for
+ * the specified end-point.
+ *
+ * [in] ep The end-point value returned by psm_ep_open().
+ * [out] parameters Pointer to the struct where the parameters will be returned.
+ * [in] sizeof_parameters_in The size in bytes of the struct provided by the caller.
+ * [out] sizeof_parameters_out The size in bytes of the struct returned by PSM.
+ *
+ * [returns] PSM_OK indicates success.
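+ *
+ * As an illustrative sketch (my_payload_len and the fallback routine
+ * are hypothetical; error handling elided):
+ *
+ * @verbatim
+ * struct psm_am_parameters params;
+ * size_t out_size;
+ *
+ * psm_am_get_parameters(ep, &params, sizeof(params), &out_size);
+ *
+ * // Size request payloads against the implementation's limit.
+ * if (my_payload_len > params.max_request_short)
+ *     fall_back_to_mq(); // e.g. send the data over the MQ instead
+ * @endverbatim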
+ */ +psm_error_t +psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters, + size_t sizeof_parameters_in, + size_t *sizeof_parameters_out); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/psm_am_internal.h b/psm_am_internal.h new file mode 100644 index 0000000..dbe1bbb --- /dev/null +++ b/psm_am_internal.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSM_AM_INTERNAL_H +#define _PSM_AM_INTERNAL_H + +#define PSMI_AM_MAX_ARGS 8 +#define PSMI_AM_NUM_HANDLERS 256 /* must be power of 2 */ + +#define PSMI_AM_ARGS_DEFAULT psm_am_token_t token, psm_epaddr_t epaddr, \ + psm_amarg_t *args, int nargs, \ + void *src, uint32_t len + +struct psmi_am_token { + psm_epaddr_t epaddr_from; + uint32_t flags; + /* Can handler reply? i.e. Not OPCODE_AM_REQUEST_NOREPLY request */ + uint32_t can_reply; + + /* PTLs may add other stuff here */ +}; + +PSMI_ALWAYS_INLINE( +psm_am_handler_fn_t +psm_am_get_handler_function(psm_ep_t ep, psm_handler_t handler_idx)) +{ + int hidx = handler_idx & (PSMI_AM_NUM_HANDLERS-1); + psm_am_handler_fn_t fn = (psm_am_handler_fn_t) ep->am_htable[hidx]; + psmi_assert_always(fn != NULL); + return fn; +} + +/* PSM internal initialization */ +psm_error_t psmi_am_init_internal(psm_ep_t ep); + +#endif diff --git a/psm_context.c b/psm_context.c new file mode 100644 index 0000000..390b49a --- /dev/null +++ b/psm_context.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include
+#include
+
+#include "psm_user.h"
+
+#ifdef __MIC__
+#include
+#endif
+
+#define PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT 1
+static int psmi_get_hca_selection_algorithm(void);
+static psm_error_t psmi_init_userinfo_params(psm_ep_t ep,
+ int unit_id, int port,
+ psm_uuid_t const unique_job_key,
+ struct ipath_user_info *user_info);
+
+psm_error_t
+psmi_context_interrupt_set(psmi_context_t *context, int enable)
+{
+ int poll_type;
+ int ret;
+
+ if (( enable && (context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)) ||
+ (!enable && !(context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED)))
+ return PSM_OK;
+
+ if (enable)
+ poll_type = IPATH_POLL_TYPE_URGENT;
+ else
+ poll_type = 0;
+
+ ret = ipath_poll_type(context->ctrl, poll_type);
+
+ if (ret != 0)
+ return PSM_EP_NO_RESOURCES;
+ else {
+ if (enable)
+ context->runtime_flags |= PSMI_RUNTIME_INTR_ENABLED;
+ else
+ context->runtime_flags &= ~PSMI_RUNTIME_INTR_ENABLED;
+
+ return PSM_OK;
+ }
+}
+
+int
+psmi_context_interrupt_isenabled(psmi_context_t *context)
+{
+ return context->runtime_flags & PSMI_RUNTIME_INTR_ENABLED;
+}
+
+static
+char *
+runtime_flags_string(char *buf, size_t len, uint32_t runtime_flags)
+{
+ size_t off = 0;
+ int flag = 0;
+ char *s;
+
+ psmi_assert(len > 0 && buf != NULL);
+ buf[0] = '\0';
+
+ for (flag = 0; off < len && flag < 32; flag++) {
+ /* Decode each set bit into a short name; only the flags referenced
+ * elsewhere in this file are decoded here */
+ switch ((1 << flag) & runtime_flags) {
+ case IPATH_RUNTIME_FORCE_PIOAVAIL: s = "force_pioavail"; break;
+ case IPATH_RUNTIME_PIO_REGSWAPPED: s = "pio_regswapped"; break;
+ default: s = NULL; break;
+ }
+ if (s != NULL)
+ off += snprintf(buf + off, len - off, "%s,", s);
+ }
+ /* Trim the trailing comma, if any */
+ if (off > 1) {
+ size_t c = strlen(buf);
+ buf[c - 1] = '\0';
+ }
+ return buf;
+}
+
+psm_error_t
+psmi_context_open(const psm_ep_t ep, long unit_id, long port,
+ psm_uuid_t const job_key,
+ int64_t timeout_ns, psmi_context_t *context)
+{
+ long open_timeout = 0;
+ int lid;
+ uint64_t gid_hi, gid_lo;
+ char dev_name[MAXPATHLEN];
+ psm_error_t err = PSM_OK;
+ uint32_t driver_verno, hca_type;
+ int retry_delay = 0;
+
+ /*
+ * If shared contexts are enabled, try our best to schedule processes
+ * across one or many devices
+ */
+
+ if (timeout_ns > 0)
+ open_timeout = (long)(timeout_ns/MSEC_ULL);
+ if (unit_id != IPATH_UNIT_ID_ANY && unit_id >= 0)
+ snprintf(dev_name, sizeof(dev_name), "%s%u", "/dev/ipath", (unsigned)unit_id);
+ else
+ snprintf(dev_name, sizeof(dev_name), "%s", "/dev/ipath");
+
+ context->fd = ipath_context_open(unit_id, port, open_timeout);
+ if (context->fd == -1) {
+ err = psmi_handle_error(NULL,
PSM_EP_DEVICE_FAILURE, + "PSM can't open %s for reading and writing", + dev_name); + goto bail; + } + + if ((err = psmi_init_userinfo_params(ep, (int) unit_id, (int)port, job_key, + &context->user_info))) + goto bail; + +retry_open: + context->ctrl = ipath_userinit(context->fd, &context->user_info, + &context->base_info); + + if (!context->ctrl) { + + /* ipath_userinit returns EBUSY on ipath and ENODEV on qib when + * no contexts are available. Handle both drivers. + */ + if ((errno != ENETDOWN) && (errno != EBUSY) && (errno != ENODEV)) + goto fail; + + if ((open_timeout == -1L) || (errno == EBUSY) || (errno == ENODEV)) { + if(!retry_delay) { + _IPATH_PRDBG("retrying open: %s, network down\n", dev_name); + retry_delay = 1; + } + else if(retry_delay<17) + retry_delay <<= 1; + + /* If device is still busy after 3 attempts give up. No contexts + * available. + */ + if (((errno == EBUSY) || (errno == ENODEV)) && retry_delay > 4) + goto fail; + + sleep(retry_delay); + goto retry_open; + } + + err = psmi_handle_error(NULL, PSM_EP_NO_NETWORK, + "can't open %s, network down", dev_name); + goto bail; + } + + if ((lid = ipath_get_port_lid(context->base_info.spi_unit, + context->base_info.spi_port)) == -1) { + err = psmi_handle_error(NULL, + PSM_EP_DEVICE_FAILURE, + "Can't get InfiniBand LID in psm_ep_open: is SMA running?"); + goto fail; + } + if (ipath_get_port_gid(context->base_info.spi_unit, + context->base_info.spi_port, + &gid_hi, &gid_lo) == -1) { + err = psmi_handle_error(NULL, + PSM_EP_DEVICE_FAILURE, + "Can't get InfiniBand GID in psm_ep_open: is SMA running?"); + goto fail; + } + ep->unit_id = context->base_info.spi_unit; + ep->portnum = context->base_info.spi_port; + ep->gid_hi = gid_hi; + ep->gid_lo = gid_lo; + + context->ep = (psm_ep_t) ep; + context->runtime_flags = context->base_info.spi_runtime_flags; + + /* Get type of hca assigned to context */ + hca_type = psmi_get_hca_type(context); + + /* Endpoint out_sl contains the default SL to use for this endpoint. */ + context->epid = + PSMI_EPID_PACK_EXT(lid, context->base_info.spi_context, + context->base_info.spi_subcontext, + hca_type, ep->out_sl); + + /* + * With driver 1.5 (release 2.1), assume we always need the force. + * Starting with 1.6, the flag is based on chip rev. + */ + driver_verno = context->base_info.spi_sw_version; + if (driver_verno == PSMI_MAKE_DRIVER_VERSION(1, 5)) + context->runtime_flags |= IPATH_RUNTIME_FORCE_PIOAVAIL; + + /* + * We only know of register-swapped pio bufs before driver 1.6 + * Starting with 1.6, the flag is based on chip rev. 
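
(An aside on the epid encoding used just above: PSMI_EPID_PACK_EXT folds the LID, context, subcontext, HCA type and service level into the single 64-bit epid. The real bit layout is defined elsewhere in this tree and is not part of this excerpt; the sketch below only illustrates the mask-and-shift technique, with made-up field widths.)

    /* Hypothetical epid layout, for illustration only -- not PSM's real format. */
    #define EPID_PACK(lid, ctxt, subctxt, hca, sl)            \
            (((uint64_t)(lid)      & 0xffff)        |         \
             (((uint64_t)(ctxt)    & 0xff)   << 16) |         \
             (((uint64_t)(subctxt) & 0xf)    << 24) |         \
             (((uint64_t)(hca)     & 0xf)    << 28) |         \
             (((uint64_t)(sl)      & 0xf)    << 32))
    #define EPID_GET_LID(epid) ((uint64_t)(epid) & 0xffff)
    #define EPID_GET_SL(epid)  (((uint64_t)(epid) >> 32) & 0xf)

(Each field is masked before shifting, so accessors like the PSMI_EPID_GET_* macros seen elsewhere in this patch can recover it with the inverse shift and mask.)
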
+ */
+    if (driver_verno < PSMI_MAKE_DRIVER_VERSION(1, 6))
+        context->runtime_flags |= IPATH_RUNTIME_PIO_REGSWAPPED;
+
+    /* We are overloading these runtime flags for PSM options, so make sure
+     * something can never go horribly bad */
+    psmi_assert_always(context->runtime_flags < _PSMI_RUNTIME_LAST);
+    context->spi_status = (volatile uint64_t *)
+        context->ctrl->__ipath_spi_status;
+
+    {
+        char buf[192];
+        _IPATH_PRDBG("Opened context %d.%d on device %s (LID=%d,epid=%llx), "
+             "runtime_flags=0x%x (%s), driver=%d.%d\n",
+             context->base_info.spi_context,
+             context->base_info.spi_subcontext, dev_name, lid,
+             (long long) context->epid, context->runtime_flags,
+             runtime_flags_string(buf, sizeof buf, context->runtime_flags),
+             context->base_info.spi_sw_version >> 16,
+             context->base_info.spi_sw_version & 0xffff);
+    }
+    goto ret;
+
+fail:
+    switch (errno) {
+    case ENOENT:
+    case ENODEV:
+        err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE,
+            "%s not found", dev_name);
+        break;
+    case ENXIO:
+        err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
+            "%s failure", dev_name);
+        break;
+    case EBUSY:
+        err = psmi_handle_error(NULL, PSM_EP_NO_PORTS_AVAIL,
+            "No free InfiniPath contexts available on %s", dev_name);
+        break;
+    default:
+        err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
+            "Driver initialization failure on %s", dev_name);
+        break;
+    }
+bail:
+    _IPATH_PRDBG("%s open failed: %d (%s)\n", dev_name, err, strerror(errno));
+    if (context->fd != -1) {
+        ipath_context_close(context->fd);
+        context->fd = -1;
+    }
+ret:
+    return err;
+}
+
+psm_error_t
+psmi_context_close(psmi_context_t *context)
+{
+    if (context->fd >= 0) {
+        ipath_context_close(context->fd);
+        context->fd = -1;
+    }
+    return PSM_OK;
+}
+
+/*
+ * This function works whether a context is initialized or not in a psm_ep.
+ *
+ * Returns one of
+ *
+ * PSM_OK: Port status is ok (or context not initialized yet but still "ok")
+ * PSM_OK_NO_PROGRESS: Cable pulled
+ * PSM_EP_NO_NETWORK: No network, no lid, ...
+ * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc.
+ * The message follows the per-port status
+ * As of 7322-ready driver, need to check port-specific qword for IB
+ * as well as older unit-only.
For now, we don't have the port interface + * defined, so just check port 0 qword for spi_status + */ + +#define STATUS_MASK (IPATH_STATUS_CHIP_PRESENT | \ + IPATH_STATUS_HWERROR | \ + IPATH_STATUS_IB_CONF | \ + IPATH_STATUS_IB_READY) + +#define STATUS_NO_ERROR_VAL (IPATH_STATUS_CHIP_PRESENT | \ + IPATH_STATUS_IB_CONF | \ + IPATH_STATUS_IB_READY) +psm_error_t +psmi_context_check_status(const psmi_context_t *contexti) +{ + psm_error_t err = PSM_OK; + uint64_t status, ibstatus; + char *errmsg = NULL; + psmi_context_t *context = (psmi_context_t *) contexti; + + if (context->spi_status == NULL) + goto ret; + + status = context->spi_status[0]; + ibstatus = context->spi_status[1]; + + /* Fatal chip-related errors */ + if ( !(status & IPATH_STATUS_CHIP_PRESENT) || + (status & (IPATH_STATUS_HWERROR))) { + + err = PSM_EP_DEVICE_FAILURE; + if (err != context->spi_status_lasterr) { /* report once */ + volatile char *errmsg_sp = (volatile char *)&context->spi_status[2]; + if (*errmsg_sp) + psmi_handle_error(context->ep, err, + "Hardware problem: %s", errmsg_sp); + else { + if (status & IPATH_STATUS_HWERROR) + errmsg = "Hardware error"; + else + errmsg = "Hardware not found"; + + psmi_handle_error(context->ep, err, errmsg, "%s"); + } + } + } + + /* Fatal network-related errors */ + else if (!(status & IPATH_STATUS_IB_CONF) && + !(ibstatus & IPATH_STATUS_IB_CONF)) { + err = PSM_EP_NO_NETWORK; + if (err != context->spi_status_lasterr) { /* report once */ + volatile char *errmsg_sp = (volatile char *)&context->spi_status[1]; + psmi_handle_error(context->ep, err, + "%s", *errmsg_sp ? errmsg_sp : "Network down"); + } + } + + /* These errors are not fatal, they are log only */ + else if (!(status & IPATH_STATUS_IB_READY) && + !(ibstatus & IPATH_STATUS_IB_READY)) { + err = PSM_OK_NO_PROGRESS; /* Cable pulled, switch rebooted, ... */ + if (err != context->spi_status_lasterr) { /* report once */ +#if 0 + psmi_handle_error(PSMI_EP_LOGEVENT, PSM_EP_NO_NETWORK, + "IB Link is down"); +#endif + } + } + + if (err == PSM_OK && context->spi_status_lasterr != PSM_OK) + context->spi_status_lasterr = PSM_OK; /* clear error */ + else if (err != PSM_OK) + context->spi_status_lasterr = err; /* record error */ + +ret: + return err; +} + +/* + * Prepare user_info params for driver open, used only in psmi_context_open + */ +static +psm_error_t +psmi_init_userinfo_params(psm_ep_t ep, int unit_id, int port, + psm_uuid_t const unique_job_key, + struct ipath_user_info *user_info) +{ + /* static variables, shared among rails */ + static int shcontexts_enabled = -1, rankid, nranks; + static int subcontext_id_start = -1; + + int avail_contexts = 0, max_contexts, ask_contexts, ranks_per_context = 0; + uint32_t job_key; + uint16_t *jkp; + psm_error_t err = PSM_OK; + union psmi_envvar_val env_maxctxt, env_ranks_per_context; + + memset(user_info, 0, sizeof *user_info); + user_info->spu_userversion = IPATH_USER_SWVERSION; + user_info->spu_subcontext_id = 0; + user_info->spu_subcontext_cnt = 0; + user_info->spu_port_alg = psmi_get_hca_selection_algorithm(); + + if (shcontexts_enabled == -1) { + shcontexts_enabled = psmi_sharedcontext_params(&nranks, &rankid); + } + + if (!shcontexts_enabled) + return err; + + avail_contexts = ipath_get_num_contexts(unit_id); + jkp = (uint16_t *) unique_job_key; + + /* Use a unique subcontext id based on uuid. 
This is just to optimistically + * prevent sharing a context across two unrelated jobs that would start at the + * same time */ + job_key = ((jkp[2] ^ jkp[3]) >> 8) | ((jkp[0] ^ jkp[1]) << 8); + job_key ^= ((jkp[6] ^ jkp[7]) >> 8) | ((jkp[4] ^ jkp[5]) << 8); + /* comment out, because it has more chance to generate the same job_key for + * two unrelated jobs that would start at the same time, and causes context + * allocation failure */ + //job_key &= ~0xff; /* just to make more readable */ + + if (avail_contexts == 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE, + "PSM found 0 available contexts on InfiniPath device(s)."); + goto fail; + } + + /* See if the user wants finer control over context assignments */ + if (!psmi_getenv("PSM_SHAREDCONTEXTS_MAX", + "Maximum number of contexts for this PSM job", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) avail_contexts, + &env_maxctxt)) { + max_contexts = max(env_maxctxt.e_int, 1); /* needs to be non-negative */ + ask_contexts = min(max_contexts, avail_contexts); /* needs to be available */ + } + else + ask_contexts = max_contexts = avail_contexts; + + if (!psmi_getenv("PSM_RANKS_PER_CONTEXT", + "Number of ranks per context", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) 1, + &env_ranks_per_context)) { + ranks_per_context = max(env_ranks_per_context.e_int, 1); + ranks_per_context = min(ranks_per_context, INFINIPATH_MAX_SUBCONTEXT); + } + + /* + * See if we could get a valid local rank. If not, pre-attach to the + * shm segment to obtain a unique shmidx. + */ + if (rankid == -1) { + if ((err = psmi_shm_attach(ep, &rankid))) + goto fail; + } + + /* + * See if we could get a valid ppn. If not, approximate it to be the + * number of cores. + */ + if (nranks == -1) { + long nproc = sysconf(_SC_NPROCESSORS_ONLN); + if (nproc < 1) + nranks = 1; + else + nranks = nproc; + } + + /* + * Make sure that our guesses are good educated guesses + */ + if (rankid >= nranks) { + _IPATH_PRDBG("PSM_SHAREDCONTEXTS disabled because lrank=%d,ppn=%d\n", + rankid, nranks); + goto fail; + } + + if (ranks_per_context) { + int contexts = (nranks + ranks_per_context - 1) / ranks_per_context; + if (contexts > ask_contexts) { + err = psmi_handle_error(NULL, PSM_EP_NO_DEVICE, + "Context required %d (nranks %d, ranks_per_context %d) " + "is less than allowed context %d which is either the " + "total avail_context %d or set by PSM_SHAREDCONTEXTS_MAX\n", + contexts, nranks, ranks_per_context, ask_contexts, avail_contexts); + goto fail; + } + ask_contexts = contexts; + } + + user_info->spu_port = port; /* requested IB port if > 0 */ + if (subcontext_id_start == -1) { +#ifdef __MIC__ + /* this query is moved from ipath_userinit() to here, + * it is also used there by ipath_cmd_assign_context() call. */ + if (scif_get_nodeIDs(NULL, 0, (uint16_t*)&user_info->_spu_scif_nodeid) < 0) { + _IPATH_INFO("scif_get_nodeIDs() call failed: %s\n", strerror(errno)); + goto fail; + } + /* + * When processes from different MICs to use the same HCA, and + * context sharing is enabled, we can't mix them, only processes + * from the same MIC node can share a context, so we need to + * generate a unique id. Here we use the queried nodeID to do it, + * avail_contexts is a constant for all MICs. 
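
(The context count computed above is the integer ceiling of nranks / ranks_per_context. As a minimal standalone illustration of the idiom, with our own names:)

    /* ceil(nranks / ranks_per_context) in integer arithmetic */
    static int contexts_needed(int nranks, int ranks_per_context)
    {
        return (nranks + ranks_per_context - 1) / ranks_per_context;
    }

(For example, 16 ranks at 3 ranks per context gives (16 + 2) / 3 = 6 contexts; adding ranks_per_context - 1 to the numerator rounds any remainder up.)
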
+ */
+        subcontext_id_start = avail_contexts * user_info->_spu_scif_nodeid;
+#else
+        subcontext_id_start = 0;
+#endif
+    }
+
+    /* "unique" id based on job key */
+    user_info->spu_subcontext_id = subcontext_id_start +
+        job_key + rankid % ask_contexts;
+    /* This is for multi-rail: when we set up a new rail, we cannot use
+     * the same subcontext ID as the previous rail, otherwise the driver
+     * will match the previous rail and fail.
+     */
+    subcontext_id_start += ask_contexts;
+
+    /* Need to compute how many *other* peers we will be sharing the
+     * context with */
+    if (nranks > ask_contexts) {
+        user_info->spu_subcontext_cnt = nranks / ask_contexts;
+        /* If ppn != multiple of contexts, some contexts get an uneven
+         * number of subcontexts */
+        if (nranks % ask_contexts > rankid % ask_contexts)
+            user_info->spu_subcontext_cnt++;
+        /* The case of 1 process "sharing" a context (giving 1 subcontext)
+         * is supported by the driver and PSM. However, there is no
+         * need to share in this case so disable context sharing. */
+        if (user_info->spu_subcontext_cnt == 1)
+            user_info->spu_subcontext_cnt = 0;
+    }
+    /* else spu_subcontext_cnt remains 0 and context sharing is disabled. */
+
+    _IPATH_PRDBG("PSM_SHAREDCONTEXTS lrank=%d,ppn=%d,avail_contexts=%d,"
+         "max_contexts=%d,ask_contexts=%d,"
+         "ranks_per_context=%d,id=%u,peers=%d,port=%d\n",
+         rankid, nranks, avail_contexts, max_contexts, ask_contexts,
+         ranks_per_context,
+         (int) user_info->spu_subcontext_id,
+         (int) user_info->spu_subcontext_cnt,
+         (int) user_info->spu_port);
+fail:
+    return err;
+}
+
+int
+psmi_sharedcontext_params(int *nranks, int *rankid)
+{
+    union psmi_envvar_val enable_shcontexts;
+    char *ppn_env = NULL, *lrank_env = NULL, *c;
+
+    *rankid = -1;
+    *nranks = -1;
+
+#if 0
+    /* DEBUG: Used to selectively test possible shared context and shm-only
+     * settings */
+    unsetenv("PSC_MPI_NODE_RANK");
+    unsetenv("PSC_MPI_PPN");
+    unsetenv("MPI_LOCALRANKID");
+    unsetenv("MPI_LOCALRANKS");
+#endif
+
+    /* New name in 2.0.1, keep observing old name */
+    if (psmi_getenv("PSM_SHAREDCONTEXTS", "Enable shared contexts",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO,
+            (union psmi_envvar_val)
+            PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
+            &enable_shcontexts))
+    {
+        psmi_getenv("PSM_SHAREDPORTS", "Enable shared contexts",
+            PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO,
+            (union psmi_envvar_val)
+            PSMI_SHARED_CONTEXTS_ENABLED_BY_DEFAULT,
+            &enable_shcontexts);
+    }
+
+    if (!enable_shcontexts.e_int)
+        return 0;
+
+    /* We support two types of syntaxes to let users give us a hint what
+     * our local rankid is.
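
(In shorthand, the environment probe this comment describes and the code below performs reduces to the following pattern; the helper name is ours, the variable names are the ones the code checks:)

    #include <stdlib.h>

    /* Return the launcher-provided local rank, or -1 if unknown. */
    static int guess_local_rankid(void)
    {
        const char *c;
        if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0')
            return atoi(c);
        if ((c = getenv("PSC_MPI_NODE_RANK")) && *c != '\0')
            return atoi(c);
        return -1;
    }
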
Moving towards MPI_, but still support PSC_ */ + if ((c = getenv("MPI_LOCALRANKID")) && *c != '\0') { + lrank_env = "MPI_LOCALRANKID"; + ppn_env = "MPI_LOCALNRANKS"; + } + else if ((c = getenv("PSC_MPI_PPN")) && *c != '\0') { + ppn_env = "PSC_MPI_PPN"; + lrank_env = "PSC_MPI_NODE_RANK"; + } + + if (ppn_env != NULL && lrank_env != NULL) { + union psmi_envvar_val env_rankid, env_nranks; + + psmi_getenv(lrank_env, "Shared context rankid", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_rankid); + + psmi_getenv(ppn_env, "Shared context numranks", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_nranks); + + *rankid = env_rankid.e_int; + *nranks = env_nranks.e_int; + } + return 1; +} + +static +int +psmi_get_hca_selection_algorithm(void) +{ + union psmi_envvar_val env_hca_alg; + int hca_alg = IPATH_PORT_ALG_ACROSS; + + /* If a specific unit is set in the environment, use that one. */ + psmi_getenv("IPATH_HCA_SELECTION_ALG", + "HCA Device Selection Algorithm to use. Round Robin (Default) " + "or Packed", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "Round Robin", + &env_hca_alg); + + if (!strcasecmp(env_hca_alg.e_str, "Round Robin")) + hca_alg = IPATH_PORT_ALG_ACROSS; + else if (!strcasecmp(env_hca_alg.e_str, "Packed")) + hca_alg = IPATH_PORT_ALG_WITHIN; + else { + _IPATH_ERROR("Unknown HCA selection algorithm %s. Defaulting to Round Robin " + "allocation of HCAs.\n", env_hca_alg.e_str); + hca_alg = IPATH_PORT_ALG_ACROSS; + } + + return hca_alg; +} diff --git a/psm_context.h b/psm_context.h new file mode 100644 index 0000000..635bb10 --- /dev/null +++ b/psm_context.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_IN_USER_H +#error psm_context.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_CONTEXT_H +#define _PSM_CONTEXT_H + +typedef +struct psmi_context { + int fd; /* driver fd */ + struct _ipath_ctrl *ctrl; /* driver opaque ipath_proto */ + psm_ep_t ep; /* psm ep handle */ + psm_epid_t epid; /* psm integral ep id */ + struct ipath_user_info user_info; + struct ipath_base_info base_info; + uint32_t runtime_flags; + uint32_t rcvthread_flags; + volatile uint64_t *spi_status; + psm_error_t spi_status_lasterr; +} +psmi_context_t; + +psm_error_t +psmi_context_open(const psm_ep_t ep, long unit_id, long port, + psm_uuid_t const job_key, + int64_t timeout_ns, psmi_context_t *context); + +psm_error_t +psmi_context_close(psmi_context_t *context); + +/* Check status of context */ +psm_error_t psmi_context_check_status(const psmi_context_t *context); + +psm_error_t psmi_context_interrupt_set(psmi_context_t *context, int enable); +int psmi_context_interrupt_isenabled(psmi_context_t *context); + +int psmi_sharedcontext_params(int *nranks, int *rankid); +/* Runtime flags describe what features are enabled in hw/sw and which + * corresponding PSM features are being used. + * + * Hi 16 bits are PSM options + * Lo 16 bits are IPATH_RUNTIME options copied from (ipath_common.h) + */ +#define PSMI_RUNTIME_RCVTHREAD 0x80000000 +#define PSMI_RUNTIME_INTR_ENABLED 0x40000000 +#define PSMI_RUNTIME_LOCKHDRQ PSMI_RUNTIME_RCVTHREAD /* alias */ +/* Update _PSMI_RUNTIME_LAST to be the lowest runtime flag */ +#define _PSMI_RUNTIME_LAST PSMI_RUNTIME_INTR_ENABLED + +/* + * The receive thread can be initialized with optional behaviour. + * + * Note: Currently there is no optional behaviour. + */ +#define PSMI_RCVTHREAD_FLAG_ENABLED 0x1 + + +#endif /* PSM_CONTEXT_H */ diff --git a/psm_diags.c b/psm_diags.c new file mode 100644 index 0000000..4502cf1 --- /dev/null +++ b/psm_diags.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +typedef void (*memcpy_fn_t)(void *dst, const void *src, size_t n); +static int psmi_test_memcpy(memcpy_fn_t, const char *name); +static int psmi_test_epid_table(int numelems); + +int psmi_diags(void); + +#define diags_assert(x) do { \ + if (!(x)) { \ + _IPATH_ERROR("Diags assertion failure: %s\n", \ + #x); \ + goto fail; \ + } \ + } while (0) + +#define DIAGS_RETURN_PASS(str) \ + do { _IPATH_INFO("%s: PASSED %s\n", __func__, str); return 0; } \ + while (0) +#define DIAGS_RETURN_FAIL(str) \ + do { _IPATH_INFO("%s: FAILED %s\n", __func__, str); return 1; } \ + while (0) + +int +psmi_diags(void) +{ + int ret = 0; + ret |= psmi_test_epid_table(2048); + ret |= psmi_test_memcpy((memcpy_fn_t) psmi_memcpyo, "psmi_memcpyo"); + //ret |= psmi_test_memcpy((memcpy_fn_t) psmi_mq_mtucpy, "psmi_mq_mtucpy"); + + if (ret) + DIAGS_RETURN_FAIL(""); + else + DIAGS_RETURN_PASS(""); +} + +/* + * Hash table test + */ +#define NALLOC 1024 +static int +psmi_test_epid_table(int numelems) +{ + psm_epaddr_t *ep_array, epaddr, ep_alloc; + psm_epid_t *epid_array, epid_tmp; + psm_ep_t ep = (psm_ep_t) (uintptr_t) 0xabcdef00; + struct psmi_epid_table *tab; + int i, j; + + ep_alloc = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm_epaddr)); + ep_array = (psm_epaddr_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(struct psm_epaddr *)); + epid_array = (psm_epid_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, numelems, sizeof(psm_epid_t)); + diags_assert(ep_alloc != NULL); + diags_assert(ep_array != NULL); + diags_assert(epid_array != NULL); + + srand(12345678); + + psmi_epid_init(); + tab = &psmi_epid_table; + + for (i = 0; i < numelems; i++) { + epid_array[i] = i; + ep_alloc[i].ep = ep; + ep_alloc[i].epid = epid_array[i]; + ep_array[i] = &ep_alloc[i]; + } + for (i = 0 ; i < numelems; i++) { + psmi_epid_add(ep, epid_array[i], ep_array[i]); + } + + /* Randomize epid_array */ + for (i = 0; i < numelems; i++) { + j = rand() % numelems; + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Lookup. */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + } + + /* Randomize epid_array again */ + for (i = 0; i < numelems; i++) { + j = rand() % numelems; + epid_tmp = epid_array[i]; + epid_array[i] = epid_array[j]; + epid_array[j] = epid_tmp; + } + /* Delete half */ + for (i = 0; i < numelems/2; i++) { + epaddr = psmi_epid_remove(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + } + /* Lookup other half -- expect non-NULL, then delete */ + for (i = numelems/2; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr != NULL); + diags_assert(epaddr->epid == epid_array[i]); + diags_assert(epaddr->ep == ep); + epaddr = psmi_epid_remove(ep, epid_array[i]); + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + /* Lookup whole thing, expect done */ + for (i = 0; i < numelems; i++) { + epaddr = psmi_epid_lookup(ep, epid_array[i]); + diags_assert(epaddr == NULL); + } + for (i = 0; i < tab->tabsize; i++) { + diags_assert(tab->table[i].entry == NULL || + tab->table[i].entry == EPADDR_DELETED); + } + + /* Make sure we're not leaking memory somewhere... 
*/ + diags_assert(tab->tabsize > tab->tabsize_used && + tab->tabsize * PSMI_EPID_TABLOAD_FACTOR > + tab->tabsize_used); + + /* Only free on success */ + psmi_epid_fini(); + psmi_free(epid_array); + psmi_free(ep_array); + psmi_free(ep_alloc); + DIAGS_RETURN_PASS(""); + +fail: + /* Klocwork scan report memory leak. */ + psmi_epid_fini(); + if (epid_array) psmi_free(epid_array); + if (ep_array) psmi_free(ep_array); + if (ep_alloc) psmi_free(ep_alloc); + DIAGS_RETURN_FAIL(""); +} + +/* + * Memcpy correctness test + */ +static int memcpy_check_size (memcpy_fn_t fn, int *p, int *f, size_t n); +static void *memcpy_check_one (memcpy_fn_t fn, void *dst, void *src, size_t n); + +static int +psmi_test_memcpy(memcpy_fn_t fn, const char *memcpy_name) +{ + const int CORNERS = 0; + const long long lo = 1; + const long long hi = 16 * 1024 * 1024; + const long long below = 32; + const long long above = 32; + long long n, m; + char buf[128]; + int ret = 0; + int memcpy_passed; + int memcpy_failed; + + memcpy_passed = 0; + memcpy_failed = 0; + + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, 0); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + + for (n = lo; n <= hi; n <<= 1) { + _IPATH_INFO("%s %d align=0..16\n", memcpy_name, (int) n); + for (m = n - below; m <= n + above; m++) { + if (m == n) { + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, n); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + else if (CORNERS && m >= lo && m <= hi && m > (n >> 1) && + m < max(n, ((n << 1) - below))) + { + ret = memcpy_check_size(fn, &memcpy_passed, &memcpy_failed, (size_t) m); + if (ret < 0) + DIAGS_RETURN_FAIL("no heap space"); + } + } + } + + int total = memcpy_passed + memcpy_failed; + if (total > 0) { + _IPATH_INFO("%d memcpy tests with %d passed (%.2f%%) " + "and %d failed (%.2f%%)\n", + total, memcpy_passed, (100.0 * memcpy_passed) / total, + memcpy_failed, (100.0 * memcpy_failed) / total); + } + if (memcpy_failed) { + snprintf(buf, sizeof buf, "%s %.2f%% of tests memcpy_failed", + memcpy_name, (100.0 * memcpy_failed) / total); + DIAGS_RETURN_FAIL(buf); + } + else { + DIAGS_RETURN_PASS(memcpy_name); + } +} + +void *memcpy_check_one (memcpy_fn_t fn, void *dst, void *src, size_t n) +{ + int ok = 1; + unsigned int seed = (unsigned int) + ((uintptr_t) dst ^ (uintptr_t) src ^ (uintptr_t) n); + unsigned int state; + size_t i; + psmi_assert_always(n > 0); + memset(src, 0x55, n); + memset(dst, 0xaa, n); + srand(seed); + state = seed; + for (i = 0; i < n; i++) { + ((uint8_t *) src)[i] = (rand_r(&state) >> 16) & 0xff; + } + + fn(dst, src, n); + memset(src, 0, n); + srand(seed); + state = seed; + for (i = 0; i < n; i++) { + int value = (int) (uint8_t) (rand_r(&state) >> 16); + int v = (int) ((uint8_t *) dst)[i]; + if (v != value) { + _IPATH_ERROR("Error on index %llu : got %d instead of %d\n", + (unsigned long long) i, v, value); + ok = 0; + } + } + return ok ? 
dst : NULL; +} + +int +memcpy_check_size (memcpy_fn_t fn, int *p, int *f, size_t n) +{ +#define num_aligns 16 +#define USE_MALLOC 0 +#define DEBUG 0 + uint8_t *src; + uint8_t *dst; + size_t size = n * 2 + num_aligns; + if (USE_MALLOC) { + src = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + dst = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); + if (src == NULL || dst == NULL) { + if (src) psmi_free(src); + if (dst) psmi_free(dst); + return -1; + } + } + else { + void *src_p = NULL, *dst_p = NULL; + if (posix_memalign(&src_p, 64, size) != 0 || + posix_memalign(&dst_p, 64, size) != 0) { + if (src_p) psmi_free(src_p); + if (dst_p) psmi_free(dst_p); + return -1; + } + else { + src = (uint8_t *) src_p; + dst = (uint8_t *) dst_p; + } + } + int src_align, dst_align; + for (src_align = 0; src_align < num_aligns; src_align++) { + for (dst_align = 0; dst_align < num_aligns; dst_align++) { + uint8_t *d = ((uint8_t *) dst) + dst_align; + uint8_t *s = ((uint8_t *) src) + src_align; + int ok = (memcpy_check_one(fn, d, s, n) != NULL); + if (DEBUG || !ok) { + _IPATH_INFO("memcpy(%p, %p, %llu) : %s\n", d, s, + (unsigned long long) n, + ok ? "passed" : "failed"); + } + if (ok) { + (*p)++; + } + else { + (*f)++; + } + } + } + psmi_free(src); + psmi_free(dst); + return 0; +} diff --git a/psm_ep.c b/psm_ep.c new file mode 100644 index 0000000..6857895 --- /dev/null +++ b/psm_ep.c @@ -0,0 +1,1423 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+/* NOTE: the <...> targets of these five includes were stripped when this
+ * patch was extracted; the last two are pinned down by their trailing
+ * comments, the first three are plausible reconstructions. */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h>  // cpu_set
+#include <ctype.h>  // isalpha
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+
+/*
+ * Endpoint management
+ */
+psm_ep_t psmi_opened_endpoint = NULL;
+int psmi_opened_endpoint_count = 0;
+
+static psm_error_t psmi_ep_open_device(const psm_ep_t ep,
+            const struct psm_ep_open_opts *opts,
+            const psm_uuid_t unique_job_key,
+            struct psmi_context *context,
+            psm_epid_t *epid);
+
+/*
+ * Device management
+ *
+ * PSM uses "devices" as components to manage communication to self, to peers
+ * reachable via shared memory and finally to peers reachable only through
+ * ipath.
+ *
+ * By default, PSMI_DEVICES_DEFAULT establishes the order in which each
+ * component is tested for reachability to each peer: first self, then shm
+ * and finally ipath. The order should really only affect endpoints that
+ * happen to be on the same node. PSM will correctly detect that two
+ * endpoints are on the same node even though they may be using different
+ * host interfaces.
+ */
+
+#define PSMI_DEVICES_DEFAULT "self,shm,ipath"
+static psm_error_t psmi_parse_devices(int devices[PTL_MAX_INIT],
+            const char *devstr);
+static int psmi_device_is_enabled(const int devices[PTL_MAX_INIT],
+            int devid);
+int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid);
+
+psm_error_t
+__psm_ep_num_devunits(uint32_t *num_units_o)
+{
+    static int num_units = -1;
+
+    PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+    if (num_units == -1) {
+        num_units = ipath_get_num_units();
+        if (num_units == -1)
+            num_units = 0;
+    }
+
+    *num_units_o = (uint32_t) num_units;
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_num_devunits)
+
+static int
+cmpfunc(const void *p1, const void *p2)
+{
+    uint64_t a = ((uint64_t *)p1)[0];
+    uint64_t b = ((uint64_t *)p2)[0];
+    if (a < b) return -1;
+    if (a == b) return 0;
+    return 1;
+}
+static psm_error_t
+psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
+{
+    uint32_t num_units;
+    uint64_t gid_hi, gid_lo;
+    int i, j, ret, count=0;
+    char *env;
+    psm_error_t err = PSM_OK;
+    uint64_t gidh[IPATH_MAX_UNIT][3];
+
+    env = getenv("PSM_MULTIRAIL");
+    if (!env || atoi(env) == 0) {
+        *num_rails = 0;
+        return err;
+    }
+#ifdef __MIC__
+    env = getenv("MPI_LOCALRANKID");
+    if (!env || atoi(env) == 0) {
+        _IPATH_INFO("PSM_MULTIRAIL is not supported and "
+            "ignored for this PSM mic version.\n");
+    }
+    *num_rails = 0;
+    return err;
+#endif
+
+/*
+ * map is in format: unit:port,unit:port,...
+ */
+    if ((env = getenv("PSM_MULTIRAIL_MAP"))) {
+        if (sscanf(env, "%d:%d", &i, &j) == 2) {
+            char *comma = strchr(env, ',');
+            unit[count] = i;
+            port[count] = j;
+            count++;
+            while (comma) {
+                if (sscanf(comma, ",%d:%d", &i, &j) != 2) {
+                    break;
+                }
+                unit[count] = i;
+                port[count] = j;
+                count++;
+                if (count == IPATH_MAX_UNIT) break;
+                comma = strchr(comma+1, ',');
+            }
+        }
+        *num_rails = count;
+
+/*
+ * Check whether any of the requested ports is unusable.
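
(Assuming, say, two units that each have an active port 1, a user would pin the rails explicitly along these lines; the values are illustrative:)

    PSM_MULTIRAIL=1 PSM_MULTIRAIL_MAP=0:1,1:1 ./my_mpi_app

(When PSM_MULTIRAIL_MAP is absent, the code that follows instead scans every unit/port pair for a valid LID and GID and sorts the candidates by GID.)
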
+ */ + for (i = 0; i < count; i++) { + ret = ipath_get_port_lid(unit[i], port[i]); + if (ret == -1) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get lid for unit %d:%d", + unit[i], port[i]); + return err; + } + ret = ipath_get_port_gid(unit[i], port[i], &gid_hi, &gid_lo); + if (ret == -1) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get gid for unit %d:%d", + unit[i], port[i]); + return err; + } + } + + return err; + } + + if ((err = psm_ep_num_devunits(&num_units))) { + return err; + } + if (num_units > IPATH_MAX_UNIT) { + _IPATH_INFO("Found %d units, max %d units are supported, use %d\n", + num_units, IPATH_MAX_UNIT, IPATH_MAX_UNIT); + num_units = IPATH_MAX_UNIT; + } + +/* + * Get all the ports with a valid lid and gid, one per unit. + * we don't know which number is a valid unit, we just loop + * over all supported numbers. + */ + for (i = 0; i < IPATH_MAX_UNIT; i++) { + for (j = 1; j <= IPATH_MAX_PORT; j++) { + ret = ipath_get_port_lid(i, j); + if (ret == -1) continue; + ret = ipath_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) continue; + + gidh[count][0] = gid_hi; + gidh[count][1] = i; + gidh[count][2] = j; + count++; + break; + } + if (count == num_units) break; + } + +/* + * Sort all the ports with gidh from small to big. + * This is for multiple fabrics, and we use fabric with the + * smallest gid to make the master connection. + */ + qsort(gidh, count, sizeof(uint64_t)*3, cmpfunc); + + for (i = 0; i < count; i++) { + unit[i] = (uint32_t)gidh[i][1]; + port[i] = (uint16_t)(uint32_t)gidh[i][2]; + } + *num_rails = count; + return err; +} + +static psm_error_t +psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, + uint64_t my_gid_hi, uint64_t my_gid_lo) +{ + static uint16_t *ipath_lids = NULL; + static uint32_t nlids; + uint32_t num_units; + int i; + psm_error_t err = PSM_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (ipath_lids == NULL) { + if ((err = psm_ep_num_devunits(&num_units))) + goto fail; + ipath_lids = (uint16_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_units*IPATH_MAX_PORT, + sizeof(uint16_t)); + if (ipath_lids == NULL) { + err = psmi_handle_error(NULL, PSM_NO_MEMORY, + "Couldn't allocate memory for dev_lids structure"); + goto fail; + } + + for (i = 0; i < IPATH_MAX_UNIT; i++) { + int j; + for (j = 1; j <= IPATH_MAX_PORT; j++) { + int lid = ipath_get_port_lid(i, j); + int ret; + uint64_t gid_hi = 0, gid_lo = 0; + + if (lid == -1) continue; + ret = ipath_get_port_gid(i, j, &gid_hi, &gid_lo); + if (ret == -1) + continue; + else if (my_gid_hi != gid_hi) { + _IPATH_VDBG("LID %d, unit %d, port %d, " + "mismatched GID %llx:%llx and " + "%llx:%llx\n", + lid, i, j, + (unsigned long long) gid_hi, + (unsigned long long) gid_lo, + (unsigned long long) my_gid_hi, + (unsigned long long) my_gid_lo); + continue; + } + _IPATH_VDBG("LID %d, unit %d, port %d, " + "matching GID %llx:%llx and " + "%llx:%llx\n", lid, i, j, + (unsigned long long) gid_hi, + (unsigned long long) gid_lo, + (unsigned long long) my_gid_hi, + (unsigned long long) my_gid_lo); + + ipath_lids[nlids++] = (uint16_t) lid; + } + } + if (nlids == 0) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't get lid&gid from any unit/port"); + goto fail; + } + } + *lids = ipath_lids; + *num_lids_o = nlids; + +fail: + return err; +} + +uint64_t +__psm_epid_nid(psm_epid_t epid) +{ + return PSMI_EPID_GET_LID(epid); +} +PSMI_API_DECL(psm_epid_nid) + +/* Currently not exposed to users, we don't acknowledge the existence of + * subcontexts */ +uint64_t 
+psmi_epid_subcontext(psm_epid_t epid) +{ + return PSMI_EPID_GET_SUBCONTEXT(epid); +} + +/* Currently not exposed to users, we don't acknowledge the existence of + * service levels and HCA types encoding within epids. This may require + * changing to expose SLs + */ +uint64_t +psmi_epid_hca_type(psm_epid_t epid) +{ + return PSMI_EPID_GET_HCATYPE(epid); +} + +uint64_t +psmi_epid_sl(psm_epid_t epid) +{ + return PSMI_EPID_GET_SL(epid); +} + +uint64_t +__psm_epid_context(psm_epid_t epid) +{ + return PSMI_EPID_GET_CONTEXT(epid); +} +PSMI_API_DECL(psm_epid_context) + +uint64_t +__psm_epid_port(psm_epid_t epid) +{ + return __psm_epid_context(epid); +} +PSMI_API_DECL(psm_epid_port) + +psm_error_t +__psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo) +{ + psm_error_t err = PSM_OK; + int i; + psm_ep_t ep; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (*num_of_epinfo <= 0) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid psm_ep_query parameters"); + return err; + } + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + ep = psmi_opened_endpoint; + for (i = 0; i < *num_of_epinfo; i++) { + if (ep == NULL) break; + array_of_epinfo[i].ep = ep; + array_of_epinfo[i].epid = ep->epid; + memcpy(array_of_epinfo[i].uuid, + (void *) ep->key, sizeof(psm_uuid_t)); + psmi_uuid_unparse(ep->key, array_of_epinfo[i].uuid_str); + ep = ep->user_ep_next; + } + *num_of_epinfo = i; + + return err; +} +PSMI_API_DECL(psm_ep_query) + +psm_error_t +__psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn) +{ + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + psm_ep_t ep; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + /* Need to have an opened endpoint before we can resolve epids */ + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + ep = psmi_opened_endpoint; + while (ep) { + epaddr = psmi_epid_lookup(ep, epid); + if (!epaddr) { + /* Search over SL values for bug 122239. Note that function + * ips_get_addr_from_epid() converts a base epid to an epaddr, + * which can then be used to get the correct epid for this flow. + * However, that function is at the IPS level and not accessible + * from here without breaking the layering. */ + uint64_t lid, context, subcontext, hca_type, sl, try_sl; + psm_epid_t try_epid; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + sl = PSMI_EPID_GET_SL(epid); + for (try_sl = 0; !epaddr && try_sl < 16; try_sl++) { + if (try_sl != sl) { + try_epid = PSMI_EPID_PACK_EXT(lid, context, subcontext, + hca_type, try_sl); + epaddr = psmi_epid_lookup(psmi_opened_endpoint, try_epid); + } + } + + if (!epaddr) { + ep = ep->user_ep_next; + continue; + } + } + + /* Found connection for epid. Return info about endpoint to caller. 
*/
+        psmi_assert_always(epaddr->ep == ep);
+        epconn->addr = epaddr;
+        epconn->ep = epaddr->ep;
+        epconn->mq = epaddr->ep->mq;
+        return err;
+    }
+
+    err = psmi_handle_error(NULL, PSM_EPID_UNKNOWN,
+        "Endpoint connection status unknown");
+    return err;
+}
+PSMI_API_DECL(psm_ep_epid_lookup);
+
+psm_error_t
+__psm_ep_epid_share_memory(psm_ep_t ep, psm_epid_t epid, int *result_o)
+{
+    uint32_t num_lids = 0;
+    uint16_t *lids = NULL;
+    int i;
+    uint16_t epid_lid;
+    int result = 0;
+    psm_error_t err;
+
+    psmi_assert_always(ep != NULL);
+    PSMI_ERR_UNLESS_INITIALIZED(ep);
+
+    epid_lid = (uint16_t) psm_epid_nid(epid);
+    /* If we're in non-ipath mode, don't bother listing lids */
+    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+        uint64_t mylid = (uint16_t) psm_epid_nid(ep->epid);
+        if (mylid == epid_lid)
+            result = 1;
+    }
+    else {
+        err = psmi_ep_devlids(&lids, &num_lids, ep->gid_hi, ep->gid_lo);
+        if (err)
+            return err;
+        for (i = 0; i < num_lids; i++) {
+            if (epid_lid == lids[i]) {
+                result = 1;
+                break;
+            }
+        }
+    }
+    *result_o = result;
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_epid_share_memory)
+
+#define PSMI_EP_OPEN_SHM_MBYTES_MIN 2
+#define PSMI_EP_OPEN_PKEY_MASK 0x7fffULL
+
+psm_error_t
+__psm_ep_open_opts_get_defaults(struct psm_ep_open_opts *opts)
+{
+    union psmi_envvar_val nSendBuf;
+    union psmi_envvar_val netPKey;
+#if (PSM_VERNO >= 0x010d)
+    union psmi_envvar_val env_path_service_id;
+    union psmi_envvar_val env_path_res_type;
+#endif
+#if (PSM_VERNO >= 0x010e)
+    union psmi_envvar_val nSendDesc;
+    union psmi_envvar_val immSize;
+#endif
+
+    PSMI_ERR_UNLESS_INITIALIZED(NULL);
+
+    /* Get number of default send buffers from environment */
+    psmi_getenv("PSM_NUM_SEND_BUFFERS",
+        "Number of send buffers to allocate [1024]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) 1024,
+        &nSendBuf);
+
+    /* Get network key from environment. MVAPICH and other vendor MPIs do not
+     * specify it on ep open and we may require it for vFabrics.
+     */
+    psmi_getenv("PSM_PKEY",
+        "InfiniBand PKey to use for endpoint",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_ULONG,
+        (union psmi_envvar_val) IPATH_DEFAULT_P_KEY,
+        &netPKey);
+
+#if (PSM_VERNO >= 0x010d)
+    /* Get Service ID from environment */
+    psmi_getenv("PSM_IB_SERVICE_ID",
+        "IB Service ID for path resolution",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_ULONG_ULONG,
+        (union psmi_envvar_val) IPATH_DEFAULT_SERVICE_ID,
+        &env_path_service_id);
+
+    /* Get path resolution type from environment. Possible choices are:
+     *
+     * NONE : Default same as previous instances. Utilizes static data.
+     * OPP : Use OFED Plus Plus library to do path record queries.
+     * UMAD : Use raw libibumad interface to form and process path records.
+     * ANY : Try all available path record mechanisms.
+     */
+    psmi_getenv("PSM_PATH_REC",
+        "Mechanism to query IB path record (default is no path query)",
+        PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+        (union psmi_envvar_val) "none", &env_path_res_type);
+#endif
+
+#if (PSM_VERNO >= 0x010e)
+    /* Get number of send descriptors - by default this is 4 times the number
+     * of send buffers - mainly used for short/inlined messages.
+     */
+    psmi_getenv("PSM_NUM_SEND_DESCRIPTORS",
+        "Number of send descriptors to allocate [4096]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) (nSendBuf.e_uint << 2),
+        &nSendDesc);
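
(All of these knobs are read from the environment, so typical tuning happens at launch time, e.g. with illustrative values; the variable names are exactly the ones queried above:)

    PSM_NUM_SEND_BUFFERS=2048 PSM_NUM_SEND_DESCRIPTORS=8192 \
    PSM_PATH_REC=opp PSM_IB_SERVICE_ID=0x1000117500000000 ./my_mpi_app
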
+
+    /* Get immediate data size - transfers less than immediate data size do
+     * not consume a send buffer and require just a send descriptor.
+     */
+    psmi_getenv("PSM_SEND_IMMEDIATE_SIZE",
+        "Immediate data send size not requiring a buffer [128]",
+        PSMI_ENVVAR_LEVEL_USER,
+        PSMI_ENVVAR_TYPE_UINT,
+        (union psmi_envvar_val) 128,
+        &immSize);
+#endif
+
+    opts->timeout = 30000000000LL;  /* 30 sec */
+    opts->unit = IPATH_UNIT_ID_ANY;
+    opts->port = 0;
+    opts->outsl = PSMI_SL_DEFAULT;
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    opts->outvl = 0;
+#endif
+    opts->affinity = PSM_EP_OPEN_AFFINITY_SET;
+    opts->shm_mbytes = 10;
+    opts->sendbufs_num = nSendBuf.e_uint;
+    opts->network_pkey = (uint64_t) netPKey.e_ulong;
+#if (PSM_VERNO >= 0x010d)
+    opts->service_id = (uint64_t) env_path_service_id.e_ulonglong;
+
+    if (!strcasecmp(env_path_res_type.e_str, "none"))
+        opts->path_res_type = PSM_PATH_RES_NONE;
+    else if (!strcasecmp(env_path_res_type.e_str, "opp"))
+        opts->path_res_type = PSM_PATH_RES_OPP;
+    else if (!strcasecmp(env_path_res_type.e_str, "umad"))
+        opts->path_res_type = PSM_PATH_RES_UMAD;
+    else {
+        _IPATH_ERROR("Unknown path resolution type %s. Disabling use of path record query.\n", env_path_res_type.e_str);
+        opts->path_res_type = PSM_PATH_RES_NONE;
+    }
+#endif
+#if (PSM_VERNO >= 0x010e)
+    opts->senddesc_num = nSendDesc.e_uint;
+    opts->imm_size = immSize.e_uint;
+#endif
+
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_ep_open_opts_get_defaults)
+
+psm_error_t psmi_poll_noop(ptl_t *ptl, int replyonly);
+
+psm_error_t
+__psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled,
+            struct psm_ep_open_opts const *opts_i, psm_mq_t mq,
+            psm_ep_t *epo, psm_epid_t *epido)
+{
+    psm_ep_t ep = NULL;
+    uint32_t num_units;
+    size_t len;
+    psm_error_t err;
+    psm_epaddr_t epaddr = NULL;
+    char buf[128], *p, *e;
+    char *old_cpuaff = NULL, *old_unit = NULL;
+    union psmi_envvar_val yield_cnt, no_cpuaff, env_unit_id,
+            env_port_id, env_sl;
+    size_t ptl_sizes;
+    int default_cpuaff;
+    struct psm_ep_open_opts opts;
+    ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
+    int i;
+
+    /* First get the set of default options, we overwrite with the user's
+     * desired values afterwards */
+    if ((err = psm_ep_open_opts_get_defaults(&opts)))
+        goto fail;
+
+    if (opts_i != NULL) {
+        if (opts_i->timeout != -1)
+            opts.timeout = opts_i->timeout;
+        if (opts_i->unit != -1)
+            opts.unit = opts_i->unit;
+        if (opts_i->affinity != -1)
+            opts.affinity = opts_i->affinity;
+        if (opts_i->shm_mbytes != -1)
+            opts.shm_mbytes = opts_i->shm_mbytes;
+        if (opts_i->sendbufs_num != -1)
+            opts.sendbufs_num = opts_i->sendbufs_num;
+        if (psmi_verno_client() >= PSMI_VERNO_MAKE(1,1)) {
+            if ((opts_i->network_pkey & PSMI_EP_OPEN_PKEY_MASK) !=
+                PSMI_EP_OPEN_PKEY_MASK)
+                opts.network_pkey = opts_i->network_pkey;
+        }
+        if (psmi_verno_client() >= PSMI_VERNO_MAKE(1,7)) {
+            /* these values are sanity checked below */
+            opts.port = opts_i->port;
+            opts.outsl = opts_i->outsl;
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+            opts.outvl = opts_i->outvl;
+#endif
+        }
+#if (PSM_VERNO >= 0x010d)
+        /* Note: Environment variable specification for service ID and
+         * path resolution type takes precedence over ep_open defaults.
+ */ + if (psmi_verno_client() >= 0x010d) { + if (opts_i->service_id) + opts.service_id = (uint64_t) opts_i->service_id; + if (opts.path_res_type == PSM_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; + } +#endif + +#if (PSM_VERNO >= 0x010e) + if (psmi_verno_client() >= 0x010e) { + if (opts_i->senddesc_num) + opts.senddesc_num = opts_i->senddesc_num; + if (opts_i->imm_size) + opts.imm_size = opts_i->imm_size; + } +#endif + } + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK) + goto fail; + } else num_units = 0; + + /* do some error checking */ + if (opts.timeout < -1) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid timeout value %lld", + (long long) opts.timeout); + goto fail; + } else if (num_units && (opts.unit < -1 || opts.unit >= IPATH_MAX_UNIT)) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid Device Unit ID %d (%d units found)", + opts.unit, num_units); + goto fail; + } else if (opts.affinity < 0 || opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid Affinity option: %d", opts.affinity); + goto fail; + } else if (opts.shm_mbytes < PSMI_EP_OPEN_SHM_MBYTES_MIN) { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Invalid shm_mbytes option at %d mbytes (minimum is %d)", + opts.shm_mbytes, PSMI_EP_OPEN_SHM_MBYTES_MIN); + goto fail; + } + + /* Advertise in verbose env the fact that we parse the no-affinity + * variable. */ + default_cpuaff = psmi_getenv("IPATH_NO_CPUAFFINITY", + "Prevent PSM from setting affinity", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, + &no_cpuaff); + + if (no_cpuaff.e_uint || + (default_cpuaff && opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP)) + { + old_cpuaff = getenv("IPATH_NO_CPUAFFINITY"); + setenv("IPATH_NO_CPUAFFINITY", "1", 1); + } + +#ifdef __MIC__ + /* + * On MIC, we always pick unit from /sys/class/qib/ipath/unit, + * but only do this if there is a HCA unit. + */ + if (num_units > 0) { + char pathname[128]; + struct stat st; + FILE *fp; + + snprintf(pathname, sizeof(pathname), + "/sys/class/qib/ipath/unit"); + fp = NULL; + if (stat(pathname, &st) || S_ISDIR(st.st_mode) || + !(fp = fopen(pathname, "r")) || (fscanf(fp, "%d", &opts.unit) != 1)) { + err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, + "Couldn't read from %s", pathname); + if (fp) fclose(fp); + goto fail; + } + fclose(fp); + psmi_assert(opts.unit != IPATH_UNIT_ID_ANY); + psmi_assert(opts.unit < IPATH_MAX_UNIT); + } +#else + /* If a specific unit is set in the environment, use that one. 
*/
+    if (!psmi_getenv("IPATH_UNIT", "Device Unit number (-1 autodetects)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val) IPATH_UNIT_ID_ANY,
+            &env_unit_id)) {
+        opts.unit = env_unit_id.e_long;
+        /* set mock UNIT *just* for setaffinity */
+        if (opts.unit != IPATH_UNIT_ID_ANY) {
+            char buf[32];
+            snprintf(buf, sizeof buf - 1, "%d", (int) opts.unit);
+            buf[sizeof buf - 1] = '\0';
+            old_unit = getenv("IPATH_UNIT");
+            setenv("IPATH_UNIT", buf, 1);
+        }
+        else
+            unsetenv("IPATH_UNIT");
+    }
+#endif
+
+    if (!psmi_getenv("IPATH_PORT", "IB Port number (<= 0 autodetects)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val)0,
+            &env_port_id)) {
+        opts.port = env_port_id.e_long;
+    }
+
+    if (!psmi_getenv("IPATH_SL", "IB outgoing ServiceLevel number (default 0)",
+            PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+            (union psmi_envvar_val) PSMI_SL_DEFAULT,
+            &env_sl)) {
+        opts.outsl = env_sl.e_long;
+    }
+
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    {
+        union psmi_envvar_val env_vl;
+        if (!psmi_getenv("IPATH_VL", "IB outgoing VirtualLane (default 0)",
+                PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
+                (union psmi_envvar_val)0,
+                &env_vl)) {
+            opts.outvl = env_vl.e_long;
+        }
+    }
+#endif
+
+    /* sanity check new capabilities, after both opts and env */
+    if (opts.port < 0 || opts.port > IPATH_MAX_PORT)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid Port number: %lld",
+            (unsigned long long) opts.port);
+    if (opts.outsl < 0 || opts.outsl > 15)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid SL number: %lld",
+            (unsigned long long) opts.outsl);
+
+#if (PSM_VERNO >= 0x0107) && (PSM_VERNO <= 0x010a)
+    if (opts.outvl < 0 || opts.outvl > 7)
+        err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+            "Invalid VL number: %lld",
+            (unsigned long long) opts.outvl);
+#endif
+
+    ptl_sizes =
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
+         psmi_ptl_self.sizeof_ptl() : 0) +
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
+         psmi_ptl_ips.sizeof_ptl() : 0) +
+        (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
+         psmi_ptl_amsh.sizeof_ptl() : 0);
+    if (ptl_sizes == 0) return PSM_EP_NO_DEVICE;
+
+    ep = (psm_ep_t) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1,
+            sizeof(struct psm_ep) + ptl_sizes);
+    epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
+            1, sizeof(struct psm_epaddr));
+    if (ep == NULL || epaddr == NULL) {
+        err = psmi_handle_error(NULL, PSM_NO_MEMORY,
+            "Couldn't allocate memory for %s structure",
+            ep == NULL ? "psm_ep" : "psm_epaddr");
+        goto fail;
+    }
+
+    /* Copy PTL enabled status */
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        ep->devid_enabled[i] = devid_enabled[i];
+
+    /* Matched Queue initialization. We do this early because we have to
+     * make sure ep->mq exists and is valid before calling ips_do_work.
+ */ + ep->mq = mq; + + /* Get ready for PTL initialization */ + memcpy(&ep->key, (void *) unique_job_key, sizeof(psm_uuid_t)); + ep->epaddr = epaddr; + ep->shm_mbytes = opts.shm_mbytes; + ep->memmode = mq->memmode; + ep->ipath_num_sendbufs = opts.sendbufs_num; + ep->network_pkey = (uint16_t) opts.network_pkey & PSMI_EP_OPEN_PKEY_MASK; +#if (PSM_VERNO >= 0x010d) + ep->service_id = opts.service_id; + ep->path_res_type = opts.path_res_type; +#else + /* Select sane defaults with older PSM header */ + ep->service_id = 0x1000117500000000ULL; /* Default service ID */ + ep->path_res_type = 0; /* No path resolution */ +#endif +#if (PSM_VERNO >= 0x010e) + ep->ipath_num_descriptors = opts.senddesc_num; + ep->ipath_imm_size = opts.imm_size; +#else + /* Default is 4 times more descriptors than buffers */ + ep->ipath_num_descriptors = ep->ipath_num_sendbufs << 2; + ep->ipath_imm_size = 128; +#endif + ep->errh = psmi_errhandler_global; /* by default use the global one */ + ep->ptl_amsh.ep_poll = psmi_poll_noop; + ep->ptl_ips.ep_poll = psmi_poll_noop; + ep->connections = 0; + + /* Active message fields, used by psmi_shm_attach() */ + ep->psmi_kassist_fd = -1; + ep->psmi_kassist_mode = 0; + ep->amsh_shmbase = 0; + ep->amsh_blockbase = 0; + ep->amsh_dirpage = NULL; + ep->amsh_keyname = NULL; + ep->amsh_shmfd = -1; + ep->amsh_shmidx = -1; + ep->amsh_max_idx = -1; + + /* See how many iterations we want to spin before yielding */ + psmi_getenv("PSM_YIELD_SPIN_COUNT", + "Spin poll iterations before yield", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, + &yield_cnt); + ep->yield_spin_cnt = yield_cnt.e_uint; + + ptl_sizes = 0; + amsh_ptl = ips_ptl = self_ptl = NULL; + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_ips.sizeof_ptl(); + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); + ptl_sizes += psmi_ptl_self.sizeof_ptl(); + } + + if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, + &(ep->context), &ep->epid))) + goto fail; + + /* Restore old cpuaffinity and unit settings. + * TODO: PSM should really just implement its own affinity + * setting function */ + if (old_cpuaff != NULL) + setenv("IPATH_NO_CPUAFFINITY", old_cpuaff, 1); + if (old_unit != NULL) + setenv("IPATH_UNIT", old_unit, 1); + + psmi_assert_always(ep->epid != 0); + ep->epaddr->epid = ep->epid; + + /* Set our new label as soon as we know what it is */ + strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); + buf[sizeof(buf) - 1] = '\0'; + + p = buf + strlen(buf); + + /* If our rank is set, use it. 
If not, use context.subcontext notation */ + if (((e = getenv("MPI_RANKID")) != NULL && *e) || + ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) + len = snprintf(p, sizeof buf - strlen(buf), ":%d.", atoi(e)); + else + len = snprintf(p, sizeof buf - strlen(buf), ":%d.%d.", + (uint32_t) psm_epid_context(ep->epid), + (uint32_t) psmi_epid_subcontext(ep->epid)); + *(p + len) = '\0'; + ep->context_mylabel = psmi_strdup(ep, buf); + if (ep->context_mylabel == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + //ipath_set_mylabel(ep->context_mylabel); + + if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0))) + goto fail; + + /* + * Active Message initialization + */ + if ((err = psmi_am_init_internal(ep))) + goto fail; + + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { + if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) + goto fail; + } + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) + goto fail; + } + /* If we're shm-only, this device is enabled above */ + if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) + goto fail; + } + else { + /* We may have pre-attached as part of getting our rank for enabling + * shared contexts. */ + psmi_shm_detach(ep); + } + + /* + * Keep only IPS since only IPS support multi-rail, other devices + * are only setup once. IPS device can come to this function again. + */ + for (i = 0; i < PTL_MAX_INIT; i++) { + if (devid_enabled[i] != PTL_DEVID_IPS) { + devid_enabled[i] = -1; + } + } + + *epido = ep->epid; + *epo = ep; + + return PSM_OK; + +fail: + if (ep != NULL) { + if (ep->context.fd != -1) close(ep->context.fd); + psmi_free(ep); + } + if (epaddr != NULL) + psmi_free(epaddr); + return err; +} + +psm_error_t +__psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *opts_i, + psm_ep_t *epo, psm_epid_t *epido) +{ + psm_error_t err; + psm_mq_t mq; + psm_epid_t epid; + psm_ep_t ep, tmp; + uint32_t units[IPATH_MAX_UNIT]; + uint16_t ports[IPATH_MAX_UNIT]; + int i, num_rails = 0; + char *uname = "IPATH_UNIT"; + char *pname = "IPATH_PORT"; + char uvalue[4], pvalue[4]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + PSMI_PLOCK(); + + /* Matched Queue initialization. We do this early because we have to + * make sure ep->mq exists and is valid before calling ips_do_work. 
+ */ + err = psmi_mq_malloc(&mq); + if (err != PSM_OK) goto fail; + + /* See which ptl devices we want to use for this ep to be opened */ + psmi_getenv("PSM_DEVICES", + "Ordered list of PSM-level devices", + PSMI_ENVVAR_LEVEL_USER, + PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) PSMI_DEVICES_DEFAULT, + &devs); + + if ((err = psmi_parse_devices(devid_enabled, devs.e_str))) + goto fail; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + err = psmi_ep_multirail(&num_rails, units, ports); + if (err != PSM_OK) goto fail; + + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 4, "%1d", units[0]); + snprintf(pvalue, 4, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } + } + + err = __psm_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &ep, &epid); + if (err != PSM_OK) goto fail; + + if (psmi_opened_endpoint == NULL) { + psmi_opened_endpoint = ep; + } else { + tmp = psmi_opened_endpoint; + while (tmp->user_ep_next) tmp = tmp->user_ep_next; + tmp->user_ep_next = ep; + } + psmi_opened_endpoint_count++; + ep->mctxt_prev = ep->mctxt_next = ep; + ep->mctxt_master = ep; + mq->ep = ep; + + *epo = ep; + *epido = epid; + + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + for (i = 1; i < num_rails; i++) { + snprintf(uvalue, 4, "%1d", units[i]); + snprintf(pvalue, 4, "%1d", ports[i]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + + /* Create slave EP */ + err = __psm_ep_open_internal(unique_job_key, + devid_enabled, opts_i, mq, &tmp, &epid); + if (err) goto fail; + + /* Link slave EP after master EP. */ + PSM_MCTXT_APPEND(ep, tmp); + } + } + + /* Once we've initialized all devices, we can update the MQ with its + * default values */ + if (err == PSM_OK) err = psmi_mq_initialize_defaults(mq); + +fail: + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_ep_open) + +psm_error_t +__psm_ep_close(psm_ep_t ep, int mode, int64_t timeout_in) +{ + psm_error_t err = PSM_OK; + uint64_t t_start = get_cycles(); + union psmi_envvar_val timeout_intval; + psm_ep_t tmp, mep; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + psmi_assert_always(ep->mctxt_master == ep); + + PSMI_PLOCK(); + + if (psmi_opened_endpoint == NULL) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + tmp = psmi_opened_endpoint; + while (tmp && tmp != ep) { + tmp = tmp->user_ep_next; + } + if (!tmp) { + err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED, + "PSM Endpoint is closed or does not exist"); + return err; + } + + psmi_getenv("PSM_CLOSE_TIMEOUT", + "End-point close timeout over-ride.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &timeout_intval); + + if (getenv("PSM_CLOSE_TIMEOUT")) { + timeout_in = timeout_intval.e_uint * SEC_ULL; + } + else if (timeout_in > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. */ + timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100); + } + + if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT) + timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* Infinite and excessive close time-out are limited here to a max. + * The "rationale" is that there is no point waiting around forever for + * graceful termination. Normal (or forced) process termination should clean + * up the context state correctly even if termination is not graceful. 
+ */
+    if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
+        timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
+    _IPATH_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
+                 "%d connections\n",
+                 ep, mode == PSM_EP_CLOSE_FORCE ? "YES" : "NO",
+                 (double) timeout_in / 1e9, (int) ep->connections);
+
+    /* XXX We currently cheat in the sense that we leave each PTL the allowed
+     * timeout. There's no good way to do this until we change the PTL
+     * interface to allow asynchronous finalization
+     */
+    mep = ep;
+    tmp = ep->mctxt_prev;
+    do {
+        ep = tmp;
+        tmp = ep->mctxt_prev;
+        PSM_MCTXT_REMOVE(ep);
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
+            err = psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode, timeout_in);
+
+        if ((err == PSM_OK || err == PSM_TIMEOUT) &&
+            psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+            err = psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode, timeout_in);
+
+        /* Even if the disconnect requests time out, make sure that we
+         * still get to close the endpoint and mark it closed */
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
+            psmi_context_close(&ep->context);
+
+        psmi_free(ep->epaddr);
+        psmi_free(ep->context_mylabel);
+        /*
+         * Before freeing the master ep itself, remove it from the global
+         * linklist. We do it here so that the atexit handler in the ptl_am
+         * directory can search the global linklist and free the shared
+         * memory file.
+         */
+        if (ep == mep) {
+            if (psmi_opened_endpoint == ep) {
+                psmi_opened_endpoint = ep->user_ep_next;
+            } else {
+                tmp = psmi_opened_endpoint;
+                while (tmp->user_ep_next != ep) {
+                    tmp = tmp->user_ep_next;
+                }
+                tmp->user_ep_next = ep->user_ep_next;
+            }
+            psmi_opened_endpoint_count--;
+        }
+        psmi_free(ep);
+
+    } while ((err == PSM_OK || err == PSM_TIMEOUT) && tmp != ep);
+
+    PSMI_PUNLOCK();
+
+    _IPATH_PRDBG("Closed endpoint in %.3f secs\n",
+                 (double) cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL);
+    return err;
+}
+PSMI_API_DECL(psm_ep_close)
+
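To make the timeout clamping above concrete, here is a minimal standalone sketch of the same arithmetic. It is an illustration added for clarity, not part of the imported sources: scale_close_timeout() is a hypothetical helper, and the local macros restate the constants that actually live in psm_ep.h and psm_help.h.

#include <stdint.h>

#define SEC_ULL   1000000000ULL        /* from psm_help.h */
#define MIN_CLOSE (2 * SEC_ULL)        /* PSMI_MIN_EP_CLOSE_TIMEOUT */
#define MAX_CLOSE (60 * SEC_ULL)       /* PSMI_MAX_EP_CLOSE_TIMEOUT */

static int64_t scale_close_timeout(int64_t timeout_in, uint32_t connections)
{
    /* One second of grace per 100 connected endpoints, at minimum. */
    if (timeout_in > 0 && timeout_in < (int64_t)((connections * SEC_ULL) / 100))
        timeout_in = (connections * SEC_ULL) / 100;
    /* Enforce the minimum close timeout. */
    if (timeout_in > 0 && timeout_in < (int64_t)MIN_CLOSE)
        timeout_in = MIN_CLOSE;
    /* Infinite (<= 0) or excessive timeouts are capped at the maximum. */
    if (timeout_in <= 0 || timeout_in > (int64_t)MAX_CLOSE)
        timeout_in = MAX_CLOSE;
    return timeout_in;  /* e.g. 5s with 1000 connections scales up to 10s */
}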
+static
+psm_error_t
+psmi_ep_open_device(const psm_ep_t ep,
+                    const struct psm_ep_open_opts *opts,
+                    const psm_uuid_t unique_job_key,
+                    struct psmi_context *context,
+                    psm_epid_t *epid)
+{
+    psm_error_t err = PSM_OK;
+
+    /* Skip affinity. No affinity if:
+     * 1. User explicitly sets no-affinity=YES in environment.
+     * 2. User doesn't set affinity in environment and PSM is opened with
+     *    option affinity skip.
+     */
+    if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
+        uint32_t lid;
+
+        ep->out_sl = opts->outsl;
+
+        if ((err = psmi_context_open(ep, opts->unit, opts->port, unique_job_key,
+                                     opts->timeout, context)) != PSM_OK)
+            goto fail;
+
+        _IPATH_DBG("[%d]use unit %d port %d\n", getpid(),
+                   context->base_info.spi_unit, context->base_info.spi_port);
+
+        if ((lid = ipath_get_port_lid(context->base_info.spi_unit,
+                                      context->base_info.spi_port)) == -1) {
+            err = psmi_handle_error(NULL,
+                                    PSM_EP_DEVICE_FAILURE,
+                                    "Can't get InfiniBand LID in psm_ep_open: is SMA running?");
+            goto fail;
+        }
+
+        if (context->base_info.spi_sw_version >= (1 << 16 | 5)) {
+            uint32_t rcvthread_flags;
+            union psmi_envvar_val env_rcvthread;
+            static int norcvthread = 0; /* only for first rail */
+
+            /* See if we want to activate support for receive thread */
+            psmi_getenv("PSM_RCVTHREAD", "Recv thread flags (0 disables thread)",
+                        PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                        (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS),
+                        &env_rcvthread);
+            rcvthread_flags = env_rcvthread.e_uint;
+
+            /* If enabled, use the pollurg capability to implement a receive
+             * interrupt thread that can handle urg packets */
+            if (rcvthread_flags) {
+                context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
+#ifdef PSMI_PLOCK_IS_NOLOCK
+                psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                                  "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
+                                  "with RCVTHREAD on");
+#endif
+            }
+            context->rcvthread_flags = rcvthread_flags;
+
+        }
+
+        *epid = context->epid;
+    }
+    else {
+        int rank, nranks;
+        char *e;
+        long nproc = sysconf(_SC_NPROCESSORS_ONLN);
+
+        if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
+            /* In shm-only mode, we need to derive a valid epid based on our
+             * rank. We try to get it from the environment if it's available,
+             * or resort to pre-attaching to the shared memory segment and use
+             * our shared memory rank (shmidx) as the rank.
+             */
+            union psmi_envvar_val env_rankid;
+
+            if (psmi_getenv("MPI_LOCALRANKID", "Shared context rankid",
+                            PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+                            (union psmi_envvar_val) -1,
+                            &env_rankid)) {
+                if (psmi_getenv("PSC_MPI_NODE_RANK", "Shared context rankid",
+                                PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
+                                (union psmi_envvar_val) -1,
+                                &env_rankid)) {
+                    if ((err = psmi_shm_attach(ep, &rank)))
+                        goto fail;
+                }
+                else
+                    rank = env_rankid.e_int;
+            }
+            else
+                rank = env_rankid.e_int;
+            nranks = (int) nproc;
+        }
+        else {
+            /* Self-only, meaning only 1 proc max */
+            rank = 0;
+            nranks = 1;
+        }
+
+        e = getenv("IPATH_NO_CPUAFFINITY");
+
+        /* Now that we have a rank, set our affinity based on this rank */
+        if (e == NULL || *e == '\0')
+        {
+            cpu_set_t cpuset;
+            CPU_ZERO(&cpuset);
+            /* First see if affinity is already set */
+            if (sched_getaffinity(0, sizeof cpuset, &cpuset)) {
+                _IPATH_PRDBG("Couldn't get processor affinity, assuming "
+                             "not set: %s\n", strerror(errno));
+            }
+            else {
+                int i, num_set = 0;
+                for (i = 0; i < CPU_SETSIZE; i++) {
+                    if (CPU_ISSET(i, &cpuset))
+                        num_set++;
+                }
+
+                if (num_set > 0 && num_set < nproc)
+                    _IPATH_PRDBG("CPU affinity already set, leaving as is\n");
+                else if (rank >= nranks || rank < 0)
+                    _IPATH_PRDBG("Skipping affinity, rank is %d and there are "
+                                 "only %d ranks.\n", rank, nranks);
+                else {
+                    CPU_ZERO(&cpuset);
+                    CPU_SET(rank, &cpuset);
+                    if (sched_setaffinity(0, sizeof cpuset, &cpuset))
+                        _IPATH_PRDBG("Couldn't set affinity to processor %d: %s\n",
+                                     rank, strerror(errno));
+                    else
+                        _IPATH_PRDBG("Set CPU affinity to %d out of %d processors\n",
+                                     rank, nranks);
+                }
+            }
+        }
+
+        /*
+         * We use an arbitrary lid 0xffff, which doesn't really matter since
+         * we're closing ourselves to the outside world by explicitly
+         * disabling the ipath device.
+         */
+        *epid = PSMI_EPID_PACK(0xffff, (rank>>2), rank);
+    }
+
+fail:
+    return err;
+}
+
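The affinity logic above reduces to "pin each local rank to its own core unless affinity has already been narrowed." A self-contained sketch under those assumptions (one process per densely numbered local rank, rank fits in the online processor count); pin_to_local_rank() is a hypothetical helper added for illustration, not part of this tree.

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

static int pin_to_local_rank(int rank)
{
    long nproc = sysconf(_SC_NPROCESSORS_ONLN);
    cpu_set_t cpuset;

    if (rank < 0 || rank >= nproc)
        return -1;              /* more ranks than processors: skip pinning */
    CPU_ZERO(&cpuset);
    CPU_SET(rank, &cpuset);     /* one core per local rank */
    return sched_setaffinity(0, sizeof cpuset, &cpuset);
}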
+/* Get a list of PTLs we want to use. The order is important, it affects
+ * whether node-local processes use shm or ips */
+static
+psm_error_t
+psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)
+{
+    char *devstr = NULL;
+    char *b_new, *e, *ee, *b;
+    psm_error_t err = PSM_OK;
+    int len;
+    int i = 0;
+
+    psmi_assert_always(devstring != NULL);
+    len = strlen(devstring)+1;
+
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        devices[i] = -1;
+
+    devstr = (char *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 2, len);
+    if (devstr == NULL)
+        goto fail;
+
+    b_new = (char *) devstr;
+    e = b_new + len;
+    strncpy(e, devstring, len-1);
+    e[len-1] = '\0';
+    ee = e + len;
+    i = 0;
+    while (e < ee && *e && i < PTL_MAX_INIT) {
+        while (*e && !isalpha(*e))
+            e++;
+        b = e;
+        while (*e && isalpha(*e))
+            e++;
+        *e = '\0';
+        if (*b) {
+            if (!strcasecmp(b, "self")) {
+                devices[i++] = PTL_DEVID_SELF;
+                b_new = strcpy(b_new, "self,");
+                b_new += 5;
+            } else if (!strcasecmp(b, "amsh")) {
+                devices[i++] = PTL_DEVID_AMSH;
+                strcpy(b_new, "amsh,");
+                b_new += 5;
+            } else if (!strcasecmp(b, "ips")) {
+                devices[i++] = PTL_DEVID_IPS;
+                strcpy(b_new, "ips,");
+                b_new += 4;
+            /* If shm or shmem is set, bind to amsh */
+            } else if (!strcasecmp(b, "shm") || !strcasecmp(b, "shmem")) {
+                devices[i++] = PTL_DEVID_AMSH;
+                strcpy(b_new, "amsh,");
+                b_new += 5;
+            /* If ipath or infinipath is set, bind to ips */
+            } else if (!strcasecmp(b, "ipath") || !(strcasecmp(b, "infinipath"))) {
+                devices[i++] = PTL_DEVID_IPS;
+                strcpy(b_new, "ips,");
+                b_new += 4;
+            } else {
+                err = psmi_handle_error(NULL, PSM_PARAM_ERR,
+                        "%s set in environment variable PSM_DEVICES=\"%s\" "
+                        "is not one of the recognized PTL devices (%s)",
+                        b, devstring, PSMI_DEVICES_DEFAULT);
+                goto fail;
+            }
+            e++;
+        }
+    }
+    if (b_new != devstr) /* we parsed something, remove trailing comma */
+        b_new[strlen(b_new) - 1] = '\0';
+
+    _IPATH_PRDBG("PSM Device allocation order: %s\n", devstr);
+fail:
+    if (devstr != NULL)
+        psmi_free(devstr);
+    return err;
+
+}
+
+static
+int
+psmi_device_is_enabled(const int devid_enabled[PTL_MAX_INIT], int devid)
+{
+    int i;
+    for (i = 0; i < PTL_MAX_INIT; i++)
+        if (devid_enabled[i] == devid)
+            return 1;
+    return 0;
+}
+
+int
+psmi_ep_device_is_enabled(const psm_ep_t ep, int devid)
+{
+    return psmi_device_is_enabled(ep->devid_enabled, devid);
+}
+
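For reference, a small usage sketch of how a launcher might constrain the device list that psmi_parse_devices() will see. The alias behaviour ("shm"/"shmem" map to amsh, "ipath"/"infinipath" map to ips) follows directly from the branches above; the helper name is illustrative only.

#include <stdlib.h>

static void restrict_to_node_local(void)
{
    /* Parses to { PTL_DEVID_SELF, PTL_DEVID_AMSH, -1, ... }, so an endpoint
     * opened afterwards never touches the InfiniPath hardware device. */
    setenv("PSM_DEVICES", "self,shm", 1);
}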
diff --git a/psm_ep.h b/psm_ep.h
new file mode 100644
index 0000000..6c5723f
--- /dev/null
+++ b/psm_ep.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_ep.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_EP_H
+#define _PSMI_EP_H
+
+#ifdef PSM_HAVE_SCIF
+#include <scif.h>
+#endif
+
+/*
+ * EPIDs encode the following information:
+ *
+ * LID:16 bits - LID for endpoint
+ * SUBCONTEXT:2 bits - Subcontext used for endpoint
+ * CONTEXT:6 bits - Context used for endpoint (up to 64 contexts)
+ * IBA_SL: 4 bits - Default SL to use for endpoint
+ * HCATYPE: 4 bits - QLE71XX, QLE72XX, QLE73XX ....
+ */
+
+#define PSMI_HCA_TYPE_UNKNOWN 0
+#define PSMI_HCA_TYPE_QLE71XX 1
+#define PSMI_HCA_TYPE_QLE72XX 2
+#define PSMI_HCA_TYPE_QLE73XX 3
+#define PSMI_HCA_TYPE_DEFAULT PSMI_HCA_TYPE_UNKNOWN
+
+#define PSMI_SL_DEFAULT 0
+#define PSMI_VL_DEFAULT 0
+
+#define PSMI_EPID_PACK_EXT(lid,context,subcontext,hca_type,sl) \
+    ( ((((uint64_t)lid)&0xffff)<<16) | \
+      ((((uint64_t)subcontext)&0x3)<<14) | \
+      ((((uint64_t)context)&0x3f)<<8) | \
+      ((((uint64_t)sl)&0xf)<<4) | \
+      (((uint64_t)hca_type)&0xf) )
+
+#define PSMI_EPID_PACK(lid,context,subcontext) \
+    PSMI_EPID_PACK_EXT(lid,context,subcontext,PSMI_HCA_TYPE_DEFAULT, PSMI_SL_DEFAULT)
+
+#define PSMI_EPID_GET_LID(epid) (((epid)>>16)&0xffff)
+#define PSMI_EPID_GET_SUBCONTEXT(epid) (((epid)>>14)&0x3)
+#define PSMI_EPID_GET_CONTEXT(epid) (((epid)>>8)&0x3f)
+#define PSMI_EPID_GET_SL(epid) (((epid)>>4)&0xf)
+#define PSMI_EPID_GET_HCATYPE(epid) (((epid)>>0)&0xf)
+
+#define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MIN_EP_CLOSE_TIMEOUT (2 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_TIMEOUT (60 * SEC_ULL)
+
+#define PSMI_MIN_EP_CLOSE_GRACE_INTERVAL (1 * SEC_ULL)
+#define PSMI_MAX_EP_CLOSE_GRACE_INTERVAL (10 * SEC_ULL)
+
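A worked example of the EPID bit layout just defined, written as a runnable sanity check. It assumes it is compiled somewhere these macros are visible (e.g. via psm_user.h). Packing lid 0x10, context 3, subcontext 1 with the default SL and HCA type gives 0x10<<16 | 1<<14 | 3<<8 = 0x104300, and the GET macros recover each field.

#include <assert.h>
#include <stdint.h>

static void epid_pack_example(void)
{
    uint64_t epid = PSMI_EPID_PACK(0x10, 3, 1);
    assert(epid == 0x104300);
    assert(PSMI_EPID_GET_LID(epid) == 0x10);
    assert(PSMI_EPID_GET_CONTEXT(epid) == 3);
    assert(PSMI_EPID_GET_SUBCONTEXT(epid) == 1);
    assert(PSMI_EPID_GET_SL(epid) == PSMI_SL_DEFAULT);
}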
+struct psm_ep {
+    psm_epid_t epid;              /**> This endpoint's Endpoint ID */
+    psm_epaddr_t epaddr;          /**> This ep's ep address */
+    psm_mq_t mq;                  /**> only 1 MQ */
+    int unit_id;
+    uint16_t portnum;
+    uint8_t out_sl;
+    uint8_t pad;
+    int did_syslog;
+    psm_uuid_t key;
+    uint16_t network_pkey;        /**> InfiniBand Pkey */
+    uint64_t service_id;          /* Infiniband service ID */
+    psm_path_res_t path_res_type; /* Path resolution for endpoint */
+    psm_ep_errhandler_t errh;
+    int devid_enabled[PTL_MAX_INIT];
+    int memmode;                  /**> min, normal, large memory mode */
+
+#ifdef PSM_HAVE_SCIF
+    scif_epd_t scif_epd;          /* scif listen endpoint */
+    int scif_dma_threshold;       /* DMA message size threshold */
+    int scif_mynodeid;            /* my scif node ID */
+    int scif_nnodes;              /* Number of scif nodes on system */
+    int scif_dma_mode;
+    pthread_t scif_thread;        /* Thread listening for SCIF connects */
+#endif
+
+    uint32_t ipath_num_sendbufs;    /**> Number of allocated send buffers */
+    uint32_t ipath_num_descriptors; /** Number of allocated scb descriptors */
+    uint32_t ipath_imm_size;        /** Immediate data size */
+    uint32_t shm_mbytes;            /**> Number of shared memory pages */
+    uint32_t connections;           /**> Number of connections */
+
+    psmi_context_t context;
+    char *context_mylabel;
+    uint32_t yield_spin_cnt;
+
+    /* EP link-lists */
+    struct psm_ep *user_ep_next;
+
+    /* EP link-lists for multi-context. */
+    struct psm_ep *mctxt_prev;
+    struct psm_ep *mctxt_next;
+    struct psm_ep *mctxt_master;
+
+    /* Active Message handler table */
+    void **am_htable;
+    int psmi_kassist_fd;          /* when using kassist */
+    int psmi_kassist_mode;
+
+    struct amsh_qdirectory *amsh_qdir;
+    uintptr_t amsh_shmbase;       /* base for mmap */
+    uintptr_t amsh_blockbase;     /* base for block 0 (after ctl dirpage) */
+    struct am_ctl_dirpage *amsh_dirpage;
+    psm_uuid_t amsh_keyno;        /* context key uuid */
+    char *amsh_keyname;           /* context keyname */
+    int amsh_shmfd;               /* context shared mmap fd */
+    int amsh_shmidx;              /* last used shmidx */
+    int amsh_max_idx;             /* max directory idx seen so far */
+
+    uint64_t gid_hi;
+    uint64_t gid_lo;
+
+    ptl_ctl_t ptl_amsh;
+    ptl_ctl_t ptl_ips;
+    ptl_ctl_t ptl_self;
+
+    /* All ptl data is allocated inline below */
+    uint8_t ptl_base_data[0] __attribute__((aligned(8)));
+};
+
+struct mqq {
+    psm_mq_req_t first;
+    psm_mq_req_t *lastp;
+};
+
+struct mqsq {
+    psm_mq_req_t first;
+    psm_mq_req_t *lastp;
+};
+
+typedef
+union psmi_egrid {
+    struct {
+        uint32_t egr_flowid : 8;
+        uint32_t egr_msgno : 24;
+    };
+    uint32_t egr_data;
+}
+psmi_egrid_t;
+
+typedef
+union psmi_seqnum {
+    struct {
+        uint32_t seq:11;
+        uint32_t gen:8;
+        uint32_t flow:5;
+    };
+    struct {
+        uint32_t pkt:16;
+        uint32_t msg:8;
+    };
+    struct {
+        uint32_t psn:24;
+    };
+    uint32_t val;
+} psmi_seqnum_t;
+
+struct psm_epaddr {
+    struct ptl *ptl;              /* Which ptl owns this epaddress */
+    ptl_ctl_t *ptlctl;            /* The control structure for the ptl */
+    psm_epid_t epid;
+    psm_ep_t ep;
+
+    void *usr_ep_ctxt;            /* User context associated with endpoint */
+
+    STAILQ_HEAD(, psm_mq_req) egrlong;  /**> egrlong request queue */
+    STAILQ_HEAD(, psm_mq_req) egrdata;  /**> egrlong data queue */
+    psmi_egrid_t xmit_egrlong;
+
+    /* PTLs have a few ways to initialize the ptl address */
+    union {
+        ptl_epaddr_t *ptladdr;
+        uint32_t _ptladdr_u32[2];
+        uint64_t _ptladdr_u64;
+        uint8_t _ptladdr_data[0];
+    };
+
+    /* it makes sense only in master */
+    uint64_t mctxt_gidhi[IPATH_MAX_UNIT];
+    psm_epid_t mctxt_epid[IPATH_MAX_UNIT];
+    int mctxt_epcount;
+    int mctxt_nsconn;             /* # slave connections */
+    uint16_t mctxt_send_seqnum;
+    uint16_t mctxt_recv_seqnum;
+    struct psm_epaddr *mctxt_current;
+    struct mqsq outoforder_q;     /**> Out-of-order queue */
+    int outoforder_c;             /* OOO queue count */
+
+    /* epaddr linklist for multi-context.
*/ + struct psm_epaddr *mctxt_master; + struct psm_epaddr *mctxt_prev; + struct psm_epaddr *mctxt_next; +}; + +#define PSM_MCTXT_APPEND(head, node) \ + node->mctxt_prev = head->mctxt_prev; \ + node->mctxt_next = head; \ + head->mctxt_prev->mctxt_next = node; \ + head->mctxt_prev = node; \ + node->mctxt_master = head +#define PSM_MCTXT_REMOVE(node) \ + node->mctxt_prev->mctxt_next = node->mctxt_next; \ + node->mctxt_next->mctxt_prev = node->mctxt_prev; \ + node->mctxt_next = node->mctxt_prev = node; \ + node->mctxt_master = NULL + +#ifndef PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD +# define PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD 250 +#endif + +/* + * Users of BLOCKUNTIL should check the value of err upon return + */ +#define PSMI_BLOCKUNTIL(ep,err,cond) do { \ + int spin_cnt = 0; \ + PSMI_PROFILE_BLOCK(); \ + while (!(cond)) { \ + err = psmi_poll_internal(ep, 1); \ + if (err == PSM_OK_NO_PROGRESS) { \ + PSMI_PROFILE_REBLOCK(1); \ + if (++spin_cnt == (ep)->yield_spin_cnt) { \ + spin_cnt = 0; \ + PSMI_PYIELD(); \ + } \ + } \ + else if (err == PSM_OK) { \ + PSMI_PROFILE_REBLOCK(0); \ + spin_cnt = 0; \ + } \ + else \ + break; \ + } \ + PSMI_PROFILE_UNBLOCK(); \ + } while(0) + +#endif /* _PSMI_EP_H */ diff --git a/psm_ep_connect.c b/psm_ep_connect.c new file mode 100644 index 0000000..98294e2 --- /dev/null +++ b/psm_ep_connect.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +psm_error_t +__psm_ep_connect(psm_ep_t ep, int num_of_epid, + psm_epid_t const *array_of_epid, + int const *array_of_epid_mask, /* can be NULL */ + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + int64_t timeout) +{ + psm_error_t err = PSM_OK; + ptl_ctl_t *ptlctl; + ptl_t *ptl; + int i, j, dup_idx; + int num_toconnect = 0; + int *epid_mask = NULL; + int *epid_mask_isdupof = NULL; + char *device; + uint64_t t_start = get_cycles(); + uint64_t t_left; + union psmi_envvar_val timeout_intval; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + PSMI_PLOCK(); + + /* + * Normally we would lock here, but instead each implemented ptl component + * does its own locking. This is mostly because the ptl components are + * ahead of the PSM interface in that they can disconnect their peers. + */ + if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || + num_of_epid < 1) { + err = psmi_handle_error(ep, PSM_PARAM_ERR, + "Invalid psm_ep_connect parameters"); + goto fail; + } + + /* We need two of these masks to detect duplicates */ + err = PSM_NO_MEMORY; + epid_mask = (int *) psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask == NULL) + goto fail; + epid_mask_isdupof = (int *) psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); + if (epid_mask_isdupof == NULL) + goto fail; + err = PSM_OK; + + /* Eventually handle timeouts across all connects. */ + for (j = 0; j < num_of_epid; j++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) + epid_mask[j] = 0; + else { + epid_mask[j] = 1; + array_of_errors[j] = PSM_EPID_UNKNOWN; + array_of_epaddr[j] = NULL; + num_toconnect++; + } + epid_mask_isdupof[j] = -1; + } + + psmi_getenv("PSM_CONNECT_TIMEOUT", + "End-point connection timeout over-ride. 0 for no time-out.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &timeout_intval); + + if (getenv("PSM_CONNECT_TIMEOUT")) { + timeout = timeout_intval.e_uint * SEC_ULL; + } + else if (timeout > 0) { + /* The timeout parameter provides the minimum timeout. A heuristic + * is used to scale up the timeout linearly with the number of + * endpoints, and we allow one second per 100 endpoints. 
*/ + timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); + } + + if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) + timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; + _IPATH_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", + num_toconnect, (double) timeout/ 1e9); + + /* Look for duplicates in input array */ + for (i = 0; i < num_of_epid; i++) { + for (j = i + 1; j < num_of_epid; j++) { + if (array_of_epid[i] == array_of_epid[j] && + epid_mask[i] && epid_mask[j]) { + epid_mask[j] = 0; /* don't connect more than once */ + epid_mask_isdupof[j] = i; + } + } + } + + for (i = 0; i < PTL_MAX_INIT; i++) { + if (ep->devid_enabled[i] == -1) + continue; + /* Set up the right connect ptrs */ + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + ptlctl = &ep->ptl_ips; + ptl = ep->ptl_ips.ptl; + device = "ips"; + break; + case PTL_DEVID_AMSH: + ptlctl = &ep->ptl_amsh; + ptl = ep->ptl_amsh.ptl; + device = "amsh"; + break; + case PTL_DEVID_SELF: + ptlctl = &ep->ptl_self; + ptl = ep->ptl_self.ptl; + device = "self"; + break; + default: + device = "unknown"; + ptlctl = &ep->ptl_ips; /*no-unused*/ + ptl = ep->ptl_ips.ptl; /*no-unused*/ + device = "ips"; /*no-unused*/ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled PTL id %d\n", ep->devid_enabled[i]); + break; + } + t_left = psmi_cycles_left(t_start, timeout); + + _IPATH_VDBG("Trying to connect with device %s\n", device); + if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, + epid_mask, array_of_errors, array_of_epaddr, + cycles_to_nanosecs(t_left)))) + { + _IPATH_PRDBG("Connect failure in device %s err=%d\n", + device, err); + goto connect_fail; + } + + /* Now process what's been connected */ + for (j = 0; j < num_of_epid; j++) { + dup_idx = epid_mask_isdupof[j]; + if (!epid_mask[j] && dup_idx == -1) + continue; + + if (dup_idx != -1) { /* dup */ + array_of_epaddr[j] = array_of_epaddr[dup_idx]; + array_of_errors[j] = array_of_errors[dup_idx]; + epid_mask_isdupof[j] = -1; + } + + if (array_of_errors[j] == PSM_OK) { + epid_mask[j] = 0; /* don't try on next ptl */ + ep->connections++; + } + } + } + + for (i = 0; i < num_of_epid; i++) { + ptl_ctl_t *c = NULL; + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + /* If we see unreachable here, that means some PTLs were not enabled */ + if (array_of_errors[i] == PSM_EPID_UNREACHABLE) { + err = PSM_EPID_UNREACHABLE; + break; + } + + psmi_assert_always(array_of_epaddr[i] != NULL); + c = array_of_epaddr[i]->ptlctl; + psmi_assert_always(c != NULL); + _IPATH_VDBG("%-20s DEVICE %s (%p)\n", + psmi_epaddr_get_name(array_of_epid[i]), + c == &ep->ptl_ips ? "ipath" : + (c == &ep->ptl_amsh ? 
"amsh" : "self" ), + (void *) array_of_epaddr[i]->ptl); + } + +connect_fail: + /* If the error is a timeout (at worse) and the client is InfiniPath MPI, + * just return timeout to let InfiniPath MPI handle the hostnames that + * timed out */ + if (err != PSM_OK) { + char errbuf[PSM_ERRSTRING_MAXLEN]; + size_t len; + int j = 0; + + if (err == PSM_EPID_UNREACHABLE) { + char *deverr = "of an incorrect setting"; + char *eperr = " "; + char *devname = NULL; + if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { + deverr = "there is no shared memory PSM device (shm)"; + eperr = " shared memory "; + } + else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { + deverr = "there is no InfiniPath PSM device (ipath)"; + eperr = " InfiniPath "; + } + + len = snprintf(errbuf, sizeof errbuf - 1, + "Some%sendpoints could not be connected because %s " + "in the currently enabled PSM_DEVICES (", + eperr, deverr); + for (i = 0; i < PTL_MAX_INIT && len < sizeof errbuf - 1; i++) { + switch (ep->devid_enabled[i]) { + case PTL_DEVID_IPS: + devname = "ipath"; + break; + case PTL_DEVID_AMSH: + devname = "shm"; + break; + case PTL_DEVID_SELF: + default: + devname = "self"; + break; + } + len += snprintf(errbuf+len, sizeof errbuf - len - 1, + "%s,", devname); + } + if (len < sizeof errbuf - 1 && devname != NULL) + /* parsed something, remove trailing comma */ + errbuf[len-1] = ')'; + } + else + len = snprintf(errbuf, sizeof errbuf - 1, + "%s", err == PSM_TIMEOUT ? + "Dectected connection timeout" : + psm_error_get_string(err)); + + /* first pass, look for all nodes with the error */ + for (i = 0; i < num_of_epid && len < sizeof errbuf - 1; i++) { + if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) + continue; + if (array_of_errors[i] == PSM_OK) + continue; + if (array_of_errors[i] == PSM_EPID_UNREACHABLE && + err != PSM_EPID_UNREACHABLE) + continue; + if (err == array_of_errors[i]) { + len += snprintf(errbuf+len, sizeof errbuf - len - 1, + "%c %s", j==0 ? ':' : ',', + psmi_epaddr_get_hostname(array_of_epid[i])); + j++; + } + } + errbuf[sizeof errbuf - 1] = '\0'; + err = psmi_handle_error(ep, err, errbuf, "%s"); + } + +fail: + PSMI_PUNLOCK(); + + if (epid_mask != NULL) + psmi_free(epid_mask); + if (epid_mask_isdupof != NULL) + psmi_free(epid_mask_isdupof); + + return err; +} +PSMI_API_DECL(psm_ep_connect) + diff --git a/psm_error.c b/psm_error.c new file mode 100644 index 0000000..6bcefcb --- /dev/null +++ b/psm_error.c @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
diff --git a/psm_error.c b/psm_error.c
new file mode 100644
index 0000000..6bcefcb
--- /dev/null
+++ b/psm_error.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psm_user.h"
+
+#define PSMI_NOLOG -1
+
+struct psm_error_token
+{
+    psm_ep_t ep;
+    psm_error_t error;
+    char err_string[PSM_ERRSTRING_MAXLEN];
+};
+
+static
+psm_error_t
+psmi_errhandler_noop(psm_ep_t ep, const psm_error_t err,
+                     const char *error_string, psm_error_token_t token)
+{
+    return err;
+}
+
+static
+psm_error_t
+psmi_errhandler_psm(psm_ep_t ep,
+                    const psm_error_t err,
+                    const char *error_string,
+                    psm_error_token_t token)
+{
+    /* we want the error to be seen through ssh, etc., so we flush and then
+     * sleep a bit.  Not perfect, but not doing so means it almost never
+     * gets seen. */
+    fprintf(stderr, "%s%s\n", ipath_get_mylabel(), token->err_string);
+    fflush(stdout);
+    fflush(stderr);
+
+    /* XXX Eventually, this will hook up to a connection manager, and we'll
+     * issue an upcall into the connection manager at shutdown time */
+    sleep(3);
+
+    /* We use this "special" ep internally to handle internal errors that are
+     * triggered from within code that is not expected to return to the user.
+     * Errors of this sort are not expected to be handled by users and always
+     * mean we have an internal PSM bug. */
+    if (err == PSM_INTERNAL_ERR)
+        abort();
+    else
+        exit(-1);
+}
+
+psm_ep_errhandler_t psmi_errhandler_global = psmi_errhandler_noop;
+
+psm_error_t
+__psm_error_defer(psm_error_token_t token)
+{
+    return psmi_errhandler_psm(token->ep, token->error, token->err_string, token);
+}
+PSMI_API_DECL(psm_error_defer)
+
+psm_error_t
+__psm_error_register_handler(psm_ep_t ep, const psm_ep_errhandler_t errhandler)
+{
+    psm_ep_errhandler_t *errh;
+    if (ep == NULL)
+        errh = &psmi_errhandler_global;
+    else
+        errh = &ep->errh;
+
+    if (errhandler == PSM_ERRHANDLER_PSM_HANDLER)
+        *errh = psmi_errhandler_psm;
+    else if (errhandler == PSM_ERRHANDLER_NO_HANDLER)
+        *errh = psmi_errhandler_noop;
+    else
+        *errh = errhandler;
+
+    return PSM_OK;
+}
+PSMI_API_DECL(psm_error_register_handler)
+
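A short sketch of installing a custom handler through the registration path above, added for illustration. A NULL ep installs the handler globally; the handler's return value becomes the error reported by the failing PSM call. The handler and helper names are hypothetical.

#include <psm.h>
#include <stdio.h>

static psm_error_t log_and_continue(psm_ep_t ep, psm_error_t err,
                                    const char *msg, psm_error_token_t token)
{
    (void)ep; (void)token;
    fprintf(stderr, "PSM error %d: %s\n", err, msg);
    return err;                 /* propagate instead of aborting */
}

static void install_handler(void)
{
    psm_error_register_handler(NULL, log_and_continue);
}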
+psm_error_t
+psmi_handle_error(psm_ep_t ep, psm_error_t error, const char *buf, ...)
+{
+    va_list argptr;
+    int syslog_level;
+    int console_print = 0;
+    psm_error_t newerr;
+    struct psm_error_token token;
+    char *c, fullmsg[PSM_ERRSTRING_MAXLEN];
+    token.error = error;
+    snprintf(fullmsg, PSM_ERRSTRING_MAXLEN-1, "%s", buf);
+    fullmsg[PSM_ERRSTRING_MAXLEN-1] = '\0';
+    va_start(argptr, buf);
+    vsnprintf(token.err_string, PSM_ERRSTRING_MAXLEN-1, fullmsg, argptr);
+    va_end(argptr);
+    token.err_string[PSM_ERRSTRING_MAXLEN-1] = '\0';
+
+    /* Unless the user has set PSM_NO_VERBOSE_ERRORS, always print errors to
+     * console */
+    c = getenv("PSM_NO_VERBOSE_ERRORS");
+    console_print = 0;
+    if (ep == PSMI_EP_LOGEVENT)
+        console_print = 1;
+    else if (!c || *c == '\0') { /* no desire to prevent verbose errors */
+        /* Remove the console print if we're internally handling the error */
+        if (ep == PSMI_EP_NORETURN)
+            console_print = 0;
+        else if (ep == NULL && psmi_errhandler_global != psmi_errhandler_psm)
+            console_print = 1;
+        else if (ep != NULL && ep->errh != psmi_errhandler_psm)
+            console_print = 1;
+    }
+
+    /* Before we let the user even handle the error, send to syslog */
+    syslog_level = psmi_error_syslog_level(error);
+    if (syslog_level != PSMI_NOLOG || ep == PSMI_EP_LOGEVENT)
+        psmi_syslog(ep, console_print,
+                    ep == PSMI_EP_LOGEVENT ? LOG_NOTICE : syslog_level,
+                    "%s (err=%d)",
+                    token.err_string, error);
+
+    if (ep == PSMI_EP_LOGEVENT) /* we're just logging */
+        newerr = PSM_OK;
+    else if (ep == PSMI_EP_NORETURN)
+        newerr = psmi_errhandler_psm(NULL, error, token.err_string, &token);
+    else if (ep == NULL)
+        newerr = psmi_errhandler_global(NULL, error, token.err_string, &token);
+    else
+        newerr = ep->errh(ep, error, token.err_string, &token);
+
+    return newerr;
+}
+
+/* Returns the "worst" error out of errA and errB */
+psm_error_t
+psmi_error_cmp(psm_error_t errA, psm_error_t errB)
+{
+#define _PSMI_ERR_IS(err) if (errA == (err) || errB == (err)) return (err)
+
+    /* Bad runtime or before initialization */
+    _PSMI_ERR_IS(PSM_NO_MEMORY);
+    _PSMI_ERR_IS(PSM_INTERNAL_ERR);
+    _PSMI_ERR_IS(PSM_INIT_NOT_INIT);
+    _PSMI_ERR_IS(PSM_INIT_BAD_API_VERSION);
+
+    /* Before we get an endpoint */
+    _PSMI_ERR_IS(PSM_EP_NO_DEVICE);
+    _PSMI_ERR_IS(PSM_EP_UNIT_NOT_FOUND);
+    _PSMI_ERR_IS(PSM_EP_DEVICE_FAILURE);
+    _PSMI_ERR_IS(PSM_EP_NO_PORTS_AVAIL);
+    _PSMI_ERR_IS(PSM_TOO_MANY_ENDPOINTS);
+
+    /* As we open/close the endpoint */
+    _PSMI_ERR_IS(PSM_EP_NO_NETWORK);
+    _PSMI_ERR_IS(PSM_SHMEM_SEGMENT_ERR);
+    _PSMI_ERR_IS(PSM_EP_CLOSE_TIMEOUT);
+    _PSMI_ERR_IS(PSM_EP_INVALID_UUID_KEY);
+    _PSMI_ERR_IS(PSM_EP_NO_RESOURCES);
+
+    /* In connect phase */
+    _PSMI_ERR_IS(PSM_EPID_NETWORK_ERROR);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_NODE);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_CONNECT);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_PKEY);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_VERSION);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_UUID_KEY);
+    _PSMI_ERR_IS(PSM_EPID_INVALID_MTU);
+
+    /* Timeout if nothing else */
+    _PSMI_ERR_IS(PSM_TIMEOUT);
+
+    /* Last resort */
+    return max(errA,errB);
+}
+
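The precedence scheme above tests both candidates against a fixed list, so the first listed error wins. A couple of concrete outcomes, written as a runnable check (it assumes the internal declarations from psm_user.h are in scope):

#include <assert.h>

static void error_cmp_example(void)
{
    /* PSM_NO_MEMORY is checked before the PSM_TIMEOUT fallback. */
    assert(psmi_error_cmp(PSM_TIMEOUT, PSM_NO_MEMORY) == PSM_NO_MEMORY);
    /* So is PSM_EP_NO_NETWORK, in the open/close group. */
    assert(psmi_error_cmp(PSM_TIMEOUT, PSM_EP_NO_NETWORK) == PSM_EP_NO_NETWORK);
    /* Neither value is in the priority list: the larger enum value wins. */
    assert(psmi_error_cmp(PSM_OK, PSM_OK_NO_PROGRESS) == PSM_OK_NO_PROGRESS);
}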
incompatible" }, /* PSM_INIT_BAD_API_VERSION = 6 */ + { PSMI_NOLOG, "PSM Could not set affinity" }, /* PSM_NO_AFFINITY = 7 */ + { LOG_ALERT , "PSM Unresolved internal error" }, /* PSM_INTERNAL_ERR = 8 */ + { LOG_CRIT , "PSM could not set up shared memory segment" }, /* PSM_SHMEM_SEGMENT_ERR = 9 */ + { PSMI_NOLOG, "PSM option is a read-only option" }, /* PSM_OPT_READONLY = 10 */ + { PSMI_NOLOG, "Operation timed out" }, /* PSM_TIMEOUT = 11 */ + { LOG_INFO , "Exceeded supported amount of endpoints" }, + /* PSM_TOO_MANY_ENDPOINTS = 12 */ + { PSMI_NOLOG, "PSM is in the finalized state" }, /* PSM_IS_FINALIZED = 13 */ + { PSMI_NOLOG, "unknown 14" }, + { PSMI_NOLOG, "unknown 15" }, + { PSMI_NOLOG, "unknown 16" }, + { PSMI_NOLOG, "unknown 17" }, + { PSMI_NOLOG, "unknown 18" }, + { PSMI_NOLOG, "unknown 19" }, + { PSMI_NOLOG, "Endpoint was closed" }, /* PSM_EP_WAS_CLOSED = 20 */ + { LOG_ALERT , "PSM Could not find an InfiniPath Unit" }, /* PSM_EP_NO_DEVICE = 21 */ + { PSMI_NOLOG, "User passed a bad unit number" }, /* PSM_EP_UNIT_NOT_FOUND = 22 */ + { LOG_ALERT , "Failure in initializing endpoint" }, /* PSM_EP_DEVICE_FAILURE = 23 */ + { PSMI_NOLOG, "Error closing the endpoing error" }, /* PSM_EP_CLOSE_TIMEOUT = 24 */ + { PSMI_NOLOG, "No free contexts could be obtained" }, /* PSM_EP_NO_PORTS_AVAIL = 25 */ + { LOG_ALERT , "Could not detect network connectivity" }, /* PSM_EP_NO_NETWORK = 26 */ + { LOG_INFO , "Invalid Unique job-wide UUID Key" }, /* PSM_EP_INVALID_UUID_KEY = 27 */ + { LOG_INFO , "Out of endpoint resources" }, /* PSM_EP_NO_RESOURCES = 28 */ + { PSMI_NOLOG, "unknown 29" }, + { PSMI_NOLOG, "unknown 30" }, + { PSMI_NOLOG, "unknown 31" }, + { PSMI_NOLOG, "unknown 32" }, + { PSMI_NOLOG, "unknown 33" }, + { PSMI_NOLOG, "unknown 34" }, + { PSMI_NOLOG, "unknown 35" }, + { PSMI_NOLOG, "unknown 36" }, + { PSMI_NOLOG, "unknown 37" }, + { PSMI_NOLOG, "unknown 38" }, + { PSMI_NOLOG, "unknown 39" }, + { PSMI_NOLOG, "Unknown/unresolved connection status (other errors occurred)" }, /* PSM_EPID_UNKNOWN = 40 */ + { PSMI_NOLOG, "Endpoint could not be reached" }, /* PSM_EPID_UNREACHABLE = 41 */ + { PSMI_NOLOG, "unknown 42" }, + { LOG_CRIT , "Invalid node (mismatch in bit width 32/64 or byte order)" }, /* PSM_EPID_INVALID_NODE = 43 */ + { LOG_CRIT , "Invalid MTU" }, /* PSM_EPID_INVALID_MTU = 44 */ + { PSMI_NOLOG, "UUID key mismatch" }, /* PSM_EPID_INVALID_UUID_KEY = 45 */ + { LOG_ERR , "Incompatible PSM version" }, /* PSM_EPID_INVALID_VERSION = 46 */ + { LOG_CRIT , "Connect received garbled connection information" }, /* PSM_EPID_INVALID_CONNECT = 47 */ + { PSMI_NOLOG, "Endpoint was already connected" }, /* PSM_EPID_ALREADY_CONNECTED = 48 */ + { LOG_CRIT , "Two or more endpoints have the same network id (LID)" }, /* PSM_EPID_NETWORK_ERROR = 49 */ + { LOG_CRIT, "Endpoint provided incompatible Partition Key" }, + { LOG_CRIT, "Unable to resolve network path. Is the SM running?" 
+    { PSMI_NOLOG, "unknown 52" },
+    { PSMI_NOLOG, "unknown 53" },
+    { PSMI_NOLOG, "unknown 54" },
+    { PSMI_NOLOG, "unknown 55" },
+    { PSMI_NOLOG, "unknown 56" },
+    { PSMI_NOLOG, "unknown 57" },
+    { PSMI_NOLOG, "unknown 58" },
+    { PSMI_NOLOG, "unknown 59" },
+    { PSMI_NOLOG, "MQ Non-blocking request is incomplete" }, /* PSM_MQ_NO_COMPLETIONS = 60 */
+    { PSMI_NOLOG, "MQ Message has been truncated at the receiver" }, /* PSM_MQ_TRUNCATION = 61 */
+    { PSMI_NOLOG, "unknown 62" },
+    { PSMI_NOLOG, "unknown 63" },
+    { PSMI_NOLOG, "unknown 64" },
+    { PSMI_NOLOG, "unknown 65" },
+    { PSMI_NOLOG, "unknown 66" },
+    { PSMI_NOLOG, "unknown 67" },
+    { PSMI_NOLOG, "unknown 68" },
+    { PSMI_NOLOG, "unknown 69" },
+    { PSMI_NOLOG, "Invalid AM reply" },
+    { PSMI_NOLOG, "unknown 71" },
+    { PSMI_NOLOG, "unknown 72" },
+    { PSMI_NOLOG, "unknown 73" },
+    { PSMI_NOLOG, "unknown 74" },
+    { PSMI_NOLOG, "unknown 75" },
+    { PSMI_NOLOG, "unknown 76" },
+    { PSMI_NOLOG, "unknown 77" },
+    { PSMI_NOLOG, "unknown 78" },
+    { PSMI_NOLOG, "unknown 79" },
+    { PSMI_NOLOG, "unknown 80" },
+};
+
+const char *
+__psm_error_get_string(psm_error_t error)
+{
+    if (error >= PSM_ERROR_LAST)
+        return "unknown";
+    else
+        return psmi_error_items[error].error_string;
+}
+PSMI_API_DECL(psm_error_get_string)
+
+int
+psmi_error_syslog_level(psm_error_t error)
+{
+    if (error >= PSM_ERROR_LAST)
+        return PSMI_NOLOG;
+    else
+        return psmi_error_items[error].syslog_level;
+}
+
diff --git a/psm_error.h b/psm_error.h
new file mode 100644
index 0000000..21f5745
--- /dev/null
+++ b/psm_error.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_error.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSMI_ERROR_H
+#define _PSMI_ERROR_H
+
+#define PSMI_EP_NONE (NULL)
+#define PSMI_EP_NORETURN ((psm_ep_t) -2)
+#define PSMI_EP_LOGEVENT ((psm_ep_t) -3)
+
+psm_ep_errhandler_t psmi_errhandler_global;
+
+psm_error_t psmi_handle_error(psm_ep_t ep, psm_error_t error,
+                              const char *buf, ...)
+ __attribute__((format(printf, 3, 4))); + +psm_error_t psmi_error_cmp(psm_error_t errA, psm_error_t errB); +int psmi_error_syslog_level(psm_error_t error); + +#endif /* _PSMI_ERROR_H */ diff --git a/psm_help.h b/psm_help.h new file mode 100644 index 0000000..8efd11d --- /dev/null +++ b/psm_help.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_HELP_H +#define _PSMI_HELP_H + +/* XXX pathcc and gcc only */ +#define PSMI_INLINE(FN) \ + static inline FN + +#define PSMI_ALWAYS_INLINE(FN) \ + static __inline__ FN __attribute__((always_inline)); \ + static __inline__ FN + +#define PSMI_NEVER_INLINE(FN) \ + static FN __attribute__((noinline)); \ + static FN + +#define _PPragma(x) _Pragma(x) + +#define STRINGIFY(s) _STRINGIFY(s) +#define _STRINGIFY(s) #s +#define PSMI_CURLOC __FILE__ ":" STRINGIFY(__LINE__) +#define psmi_assert_always_loc(x,curloc) do { \ + if_pf (!(x)) { \ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, \ + "Assertion failure at %s: %s", curloc, \ + STRINGIFY(x)); \ + } } while (0) + +#define psmi_assert_always(x) psmi_assert_always_loc(x,PSMI_CURLOC) + +#ifdef PSM_DEBUG +# define psmi_assert(x) psmi_assert_always(x) +# define PSMI_ASSERT_INITIALIZED() psmi_assert_always(psmi_isinitialized()) +#else +# define psmi_assert(x) +# define PSMI_ASSERT_INITIALIZED() +#endif + +#define _PSMI_API_NAME(FN) __ ## FN +#define _PSMI_API_STR(FN) _STRINGIFY(__ ## FN) +#define PSMI_API_DECL(FN) \ + typeof(_PSMI_API_NAME(FN)) FN __attribute__((weak, alias(_PSMI_API_STR(FN)))); + +#define PSMI_ERR_UNLESS_INITIALIZED(ep) do { \ + if (!psmi_isinitialized()) \ + return psmi_handle_error(ep, PSM_INIT_NOT_INIT, \ + "PSM has not been initialized"); \ + } while (0) + + +#define PSMI_CHECKMEM(err,mem) do { \ + if ((mem) == NULL) { \ + (err) = PSM_NO_MEMORY; \ + goto fail; \ + } \ + } while (0) + +#define PSMI_CACHEALIGN __attribute__((aligned(64))) + +/* Easy way to ignore the OK_NO_PROGRESS case */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_err_only(psm_error_t err)) +{ + if (err > PSM_OK_NO_PROGRESS) + return err; + else + return PSM_OK; +} + +#ifdef min +#undef min +#endif +#define min(a,b) ((a) < (b) ? (a) : (b)) + +#ifdef max +#undef max +#endif +#define max(a,b) ((a) > (b) ? (a) : (b)) + +#define SEC_ULL 1000000000ULL +#define MSEC_ULL 1000000ULL +#define USEC_ULL 1000ULL +#define NSEC_ULL 1ULL + +#define PSMI_TRUE 1 +#define PSMI_FALSE 0 + +#define PSMI_CYCLES_TO_SECSF(cycles) \ + ((double) cycles_to_nanosecs(cycles) / 1.0e9) + +#define PSMI_PAGESIZE psmi_getpagesize() +#define PSMI_POWEROFTWO(P) (((P)&((P)-1)) == 0) +#define PSMI_ALIGNDOWN(p,P) (((uintptr_t)(p))&~((uintptr_t)((P)-1))) +#define PSMI_ALIGNUP(p,P) (PSMI_ALIGNDOWN((uintptr_t)(p)+((uintptr_t)((P)-1)),(P))) + +#define PSMI_MAKE_DRIVER_VERSION(major,minor) ((major)<<16 | ((minor) & 0xffff)) + +#define PSMI_STRICT_SIZE_DECL(member,sz) static const size_t __psm_ss_ ## member = sz +#define PSMI_STRICT_SIZE_VERIFY(member,sz) do { \ + if (__psm_ss_ ## member != (sz)) { \ + char errmsg[64]; \ + snprintf(errmsg,32, "Internal error: %s " \ + "size doesn't match expected %d bytes", \ + STRINGIFY(member), (int) __psm_ss_ ## member); \ + exit(-1); \ + } \ + } while (0) + + +#endif /* _PSMI_HELP_H */ diff --git a/psm_lock.h b/psm_lock.h new file mode 100644 index 0000000..9ad3df6 --- /dev/null +++ b/psm_lock.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_lock.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_LOCK_H +#define _PSMI_LOCK_H + +#ifndef PSMI_USE_PTHREAD_SPINLOCKS + #if defined(__powerpc__) + #define PSMI_USE_PTHREAD_SPINLOCKS 1 + #else + #define PSMI_USE_PTHREAD_SPINLOCKS 0 + #endif +#endif + +#if PSMI_USE_PTHREAD_SPINLOCKS + typedef pthread_spinlock_t psmi_spinlock_t; + + #define psmi_spin_init(lock) pthread_spin_init(lock,0) + #define psmi_spin_lock(lock) pthread_spin_lock(lock) + #define psmi_spin_trylock(lock) pthread_spin_trylock(lock) + #define psmi_spin_unlock(lock) pthread_spin_unlock(lock) +#else + typedef ips_atomic_t psmi_spinlock_t; + #define PSMI_SPIN_LOCKED 1 + #define PSMI_SPIN_UNLOCKED 0 + + PSMI_ALWAYS_INLINE( + int + psmi_spin_init(psmi_spinlock_t *lock)) { + ips_atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_trylock(psmi_spinlock_t *lock)) { + if (ips_atomic_cmpxchg(lock,PSMI_SPIN_UNLOCKED,PSMI_SPIN_LOCKED) + == PSMI_SPIN_UNLOCKED) + return 0; + else + return EBUSY; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_lock(psmi_spinlock_t *lock)) { + while (psmi_spin_trylock(lock) == EBUSY) + {} + return 0; + } + + PSMI_ALWAYS_INLINE( + int + psmi_spin_unlock(psmi_spinlock_t *lock)) { + atomic_set(lock, PSMI_SPIN_UNLOCKED); + return 0; + } +#endif /* PSMI_USE_PTHREAD_SPINLOCKS */ + +#endif /* _PSMI_LOCK_H */ diff --git a/psm_memcpy.c b/psm_memcpy.c new file mode 100644 index 0000000..cee165f --- /dev/null +++ b/psm_memcpy.c @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <assert.h>
+
+/* Bug in 2.4 compiler that prevents this file from compiling.
+ * Hardcode memcpyo to psmi_mq_mtucpy (uses ipath_dwordcpy).
+ */
+#if (WORDSIZE != 64) || defined(__powerpc__) || \
+    (defined(__PATHCC__) && __PATHCC__ == 2 && __PATHCC_MINOR__ == 4)
+extern void psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars);
+
+void *psmi_memcpyo(void *dst, const void *src, size_t n)
+{
+    psmi_mq_mtucpy(dst,src,n);
+    return dst;
+}
+#else
+#error "psmi_memcpyo() does not use psmi_mq_mtucpy()"
+#include <emmintrin.h>
+
+#define OPTERON_L1_CACHE_BYTES 65536
+#define OPTERON_L2_CACHE_BYTES 1048576
+
+static inline size_t __memcpy_pathscale_opteron_sse2
+    (uint8_t *d, const uint8_t *s, size_t n) __attribute__ ((always_inline));
+
+static inline size_t __memcpy_pathscale_opteron_sse2
+    (uint8_t *d, const uint8_t *s, size_t n)
+{
+    assert(n >= 16);
+    /* align destination up to 16 bytes */
+    size_t i;
+    size_t align = (16 - (((uintptr_t) d) & 0xf)) & 0xf;
+    if (align != 0) {
+        for (i = 0; i < align; i++) {
+            d[i] = s[i];
+        }
+        d += align;
+        s += align;
+        n -= align;
+    }
+
+    __m128i *dp = (__m128i *) d;
+    __m128i const *sp = (__m128i const *) s;
+
+    if ((((uintptr_t) sp) & 0xf) == 0x0) {
+        /* source and destination are both 16 byte aligned */
+        if (n < (OPTERON_L2_CACHE_BYTES >> 2)) {
+            size_t count = n >> 7;
+            for (i = 0; i < count; i++) {
+                _mm_prefetch(((const char *) sp) + 512, _MM_HINT_NTA);
+                _mm_prefetch(((const char *) sp) + 576, _MM_HINT_NTA);
+                __m128i tmp0 = _mm_load_si128(sp);
+                __m128i tmp1 = _mm_load_si128(sp + 1);
+                __m128i tmp2 = _mm_load_si128(sp + 2);
+                __m128i tmp3 = _mm_load_si128(sp + 3);
+                __m128i tmp4 = _mm_load_si128(sp + 4);
+                __m128i tmp5 = _mm_load_si128(sp + 5);
+                __m128i tmp6 = _mm_load_si128(sp + 6);
+                __m128i tmp7 = _mm_load_si128(sp + 7);
+                _mm_store_si128(dp, tmp0);
+                _mm_store_si128(dp + 1, tmp1);
+                _mm_store_si128(dp + 2, tmp2);
+                _mm_store_si128(dp + 3, tmp3);
+                _mm_store_si128(dp + 4, tmp4);
+                _mm_store_si128(dp + 5, tmp5);
+                _mm_store_si128(dp + 6, tmp6);
+                _mm_store_si128(dp + 7, tmp7);
+                sp += 8;
+                dp += 8;
+            }
+            return align + (count << 7);
+        }
+        else {
+            size_t count = n >> 7;
+            for (i = 0; i < count; i++) {
+                _mm_prefetch(((const char *) sp) + 768, _MM_HINT_NTA);
+                _mm_prefetch(((const char *) sp) +
832, _MM_HINT_NTA); + __m128i tmp0 = _mm_load_si128(sp); + __m128i tmp1 = _mm_load_si128(sp + 1); + __m128i tmp2 = _mm_load_si128(sp + 2); + __m128i tmp3 = _mm_load_si128(sp + 3); + __m128i tmp4 = _mm_load_si128(sp + 4); + __m128i tmp5 = _mm_load_si128(sp + 5); + __m128i tmp6 = _mm_load_si128(sp + 6); + __m128i tmp7 = _mm_load_si128(sp + 7); + _mm_stream_si128(dp, tmp0); + _mm_stream_si128(dp + 1, tmp1); + _mm_stream_si128(dp + 2, tmp2); + _mm_stream_si128(dp + 3, tmp3); + _mm_stream_si128(dp + 4, tmp4); + _mm_stream_si128(dp + 5, tmp5); + _mm_stream_si128(dp + 6, tmp6); + _mm_stream_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + } + else { + /* only destination is 16 byte aligned - use unaligned loads */ + if (n < (OPTERON_L2_CACHE_BYTES >> 2)) { + size_t count = n >> 7; + for (i = 0; i < count; i++) { + _mm_prefetch(((const char *) sp) + 512, _MM_HINT_NTA); + _mm_prefetch(((const char *) sp) + 576, _MM_HINT_NTA); + __m128i tmp0 = _mm_loadu_si128(sp); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + __m128i tmp4 = _mm_loadu_si128(sp + 4); + __m128i tmp5 = _mm_loadu_si128(sp + 5); + __m128i tmp6 = _mm_loadu_si128(sp + 6); + __m128i tmp7 = _mm_loadu_si128(sp + 7); + _mm_store_si128(dp, tmp0); + _mm_store_si128(dp + 1, tmp1); + _mm_store_si128(dp + 2, tmp2); + _mm_store_si128(dp + 3, tmp3); + _mm_store_si128(dp + 4, tmp4); + _mm_store_si128(dp + 5, tmp5); + _mm_store_si128(dp + 6, tmp6); + _mm_store_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + else { + size_t count = n >> 7; + for (i = 0; i < count; i++) { + /* 2 x 64 bytes of prefetch matches 8 x 16 bytes of load/store */ + /* The prefetch distance was tuned empirically */ + _mm_prefetch(((const char *) sp) + 768, _MM_HINT_NTA); + _mm_prefetch(((const char *) sp) + 832, _MM_HINT_NTA); + __m128i tmp0 = _mm_loadu_si128(sp); + _mm_stream_si128(dp, tmp0); + __m128i tmp1 = _mm_loadu_si128(sp + 1); + __m128i tmp2 = _mm_loadu_si128(sp + 2); + __m128i tmp3 = _mm_loadu_si128(sp + 3); + __m128i tmp4 = _mm_loadu_si128(sp + 4); + __m128i tmp5 = _mm_loadu_si128(sp + 5); + __m128i tmp6 = _mm_loadu_si128(sp + 6); + __m128i tmp7 = _mm_loadu_si128(sp + 7); + _mm_stream_si128(dp + 1, tmp1); + _mm_stream_si128(dp + 2, tmp2); + _mm_stream_si128(dp + 3, tmp3); + _mm_stream_si128(dp + 4, tmp4); + _mm_stream_si128(dp + 5, tmp5); + _mm_stream_si128(dp + 6, tmp6); + _mm_stream_si128(dp + 7, tmp7); + sp += 8; + dp += 8; + } + return align + (count << 7); + } + } + return 0; /* unreachable */ +} + +void *psmi_memcpyo(void *dst, const void *src, size_t n) +{ + uint8_t *d = (uint8_t *) dst; + const uint8_t *s = (uint8_t *) src; + + /* Smaller copies are detected and handled first since they are + * the most latency sensitive. Larger copies can have residual + * parts left at the end that are smaller than the unrolled loop. + * I use an outer do-loop to allow these cases to loop around to + * the smaller copy code. 
*/ + + do { + if (n < 16) { + switch (n) { + case 0: { + return dst; + } + case 1: { + * (uint8_t *) d = * (const uint8_t *) s; + return dst; + } + case 2: { + * (uint16_t *) d = * (const uint16_t *) s; + return dst; + } + case 4: { + * (uint32_t *) d = * (const uint32_t *) s; + return dst; + } + case 8: { + * (uint64_t *) d = * (const uint64_t *) s; + return dst; + } + default: { + if (n & 0x8) { + * (uint64_t *) d = * (const uint64_t *) s; + d += 8; + s += 8; + } + if (n & 0x4) { + * (uint32_t *) d = * (const uint32_t *) s; + d += 4; + s += 4; + } + if (n & 0x2) { + * (uint16_t *) d = * (const uint16_t *) s; + d += 2; + s += 2; + } + if (n & 0x1) { + * (uint8_t *) d = * (const uint8_t *) s; + } + return dst; + } + } + } + else if (n < 64) { + uint64_t *dp = (uint64_t *) d; + const uint64_t *sp = (const uint64_t *) s; + size_t count = n >> 3; + size_t i; + /* ideally would like to tell compiler not to unroll this loop further */ + for (i = 0; i < count - 1; i += 2) { + uint64_t tmp0 = sp[i]; + uint64_t tmp1 = sp[i + 1]; + dp[i] = tmp0; + dp[i + 1] = tmp1; + } + size_t bytes = i << 3; + if (n == bytes) { + return dst; /* short-cut to return */ + } + d += bytes; + s += bytes; + n -= bytes; + } + else if (n < OPTERON_L1_CACHE_BYTES) { + /* align destination up to 8 bytes */ + size_t i; + size_t a = 8 - (((uintptr_t) d) & 0x7); + if (a != 8) { + for (i = 0; i < a; i++) { + d[i] = s[i]; + } + d += a; + s += a; + n -= a; + } + uint64_t *dp = (uint64_t *) d; + const uint64_t *sp = (const uint64_t *) s; + size_t count = n >> 6; + if (count > 0) { + i = count; + do { + uint64_t tmp0 = sp[0]; + uint64_t tmp1 = sp[1]; + uint64_t tmp2 = sp[2]; + uint64_t tmp3 = sp[3]; + dp[0] = tmp0; + dp[1] = tmp1; + dp[2] = tmp2; + dp[3] = tmp3; + uint64_t tmp4 = sp[4]; + uint64_t tmp5 = sp[5]; + uint64_t tmp6 = sp[6]; + uint64_t tmp7 = sp[7]; + dp[4] = tmp4; + dp[5] = tmp5; + dp[6] = tmp6; + dp[7] = tmp7; + __asm__("lea 64(%0),%0\n" : "+r"(sp)); /* was sp += 64 */ + __asm__("lea 64(%0),%0\n" : "+r"(dp)); /* was dp += 64 */ + i--; + } while (i > 0); + } + size_t bytes = count << 6; + if (n == bytes) { + return dst; /* short-cut to return */ + } + d += bytes; + s += bytes; + n -= bytes; + } +#if 0 /* performance of rep movsq appears to be unpredictable */ + else if (n < OPTERON_L1_CACHE_BYTES) { + size_t count = n >> 3; + __asm__ ("rep movsq\n" : + "+D" (d), "+S" (s), "+c" (count) : : "memory"); + size_t bytes = count << 3; + d += bytes; + s += bytes; + n -= bytes; + } +#endif + else { + size_t bytes = __memcpy_pathscale_opteron_sse2(d, s, n); + assert(bytes > 0); + d += bytes; + s += bytes; + n -= bytes; + } + } while (n > 0); + + return dst; +} +#endif diff --git a/psm_mpool.c b/psm_mpool.c new file mode 100644 index 0000000..6aadd9a --- /dev/null +++ b/psm_mpool.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "psm_user.h"
+
+#define PSMI_MPOOL_ALIGNMENT 64
+
+struct mpool_element {
+ union {
+ SLIST_ENTRY(mpool_element) me_next;
+ mpool_t me_mpool;
+ };
+
+ uint32_t me_gen_count;
+ uint32_t me_index;
+#ifdef PSM_DEBUG
+ uint32_t me_isused;
+#endif
+} __attribute__ ((aligned(8)));
+
+#ifdef PSM_DEBUG
+# define me_mark_used(me) ((me)->me_isused = 1)
+# define me_mark_unused(me) ((me)->me_isused = 0)
+#else
+# define me_mark_used(me)
+# define me_mark_unused(me)
+#endif
+
+struct mpool {
+ int mp_type;
+ int mp_flags;
+ int mp_vector_shift;
+
+ uint32_t mp_elm_vector_size;
+ uint32_t mp_elm_offset;
+ uint32_t mp_num_obj;
+ uint32_t mp_num_obj_inuse;
+ uint32_t mp_elm_size;
+ uint32_t mp_obj_size;
+ uint32_t mp_num_obj_per_chunk;
+ uint32_t mp_num_obj_max_total;
+ psmi_memtype_t mp_memtype;
+
+ SLIST_HEAD(, mpool_element) mp_head;
+ struct mpool_element ** mp_elm_vector;
+ struct mpool_element ** mp_elm_vector_free;
+ non_empty_callback_fn_t mp_non_empty_cb;
+ void * mp_non_empty_cb_context;
+
+};
+
+static int psmi_mpool_allocate_chunk(mpool_t);
+
+/**
+ * psmi_mpool_create()
+ *
+ * Create a memory pool that allocates objects of size
+ * <obj_size>. If more memory is needed to accommodate mpool_get()
+ * requests, the memory pool will allocate another chunk of
+ * <num_obj_per_chunk> objects, until it reaches the maximum number of objects
+ * it can allocate.
+ *
+ * <obj_size> size of each individual object
+ * <num_obj_per_chunk> number of objects to allocate per chunk (power of two)
+ * <num_obj_max_total> total number of objects that may be allocated
+ * at any given time. Must be a power of two greater than
+ * <num_obj_per_chunk>.
+ *
+ * <flags> flags to be applied on the memory pool (ie. memory
+ * alignment)
+ *
+ * <cb> callback to be called when the memory pool has some
+ * free objects available again (after running out of them).
+ * <context> context pointer for the callback
+ *
+ * Return the mpool on success, NULL on failure.
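+ *
+ * Added illustration (not in the original source): a minimal
+ * lifecycle sketch under assumed values -- 64-byte objects, chunks of
+ * 32, at most 1024 live objects, no alignment flags and no callback.
+ * The UNDEFINED memtype is the same catch-all that psmi_mq_malloc()
+ * uses later in this patch.
+ *
+ * @verbatim
+ * mpool_t pool = psmi_mpool_create(64, 32, 1024, 0, UNDEFINED,
+ *                                  NULL, NULL);
+ * if (pool != NULL) {
+ *     void *obj = psmi_mpool_get(pool);  // NULL once 1024 are in use
+ *     if (obj != NULL)
+ *         psmi_mpool_put(obj);           // return before destroying
+ *     psmi_mpool_destroy(pool);
+ * }
+ * @endverbatim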
+ */
+mpool_t
+psmi_mpool_create(size_t obj_size, uint32_t num_obj_per_chunk,
+ uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype,
+ non_empty_callback_fn_t cb, void *context)
+{
+ mpool_t mp;
+ int s;
+ size_t hdr_size;
+
+#ifdef PSM_VALGRIND
+ /* For Valgrind we wish to define a "redzone" before and after the
+ * allocation block, so we also allocate a blank mpool_element
+ * at the end of the user's block */
+#endif
+
+ if (!PSMI_POWEROFTWO(num_obj_per_chunk) ||
+ !PSMI_POWEROFTWO(num_obj_max_total) ||
+ num_obj_max_total < num_obj_per_chunk) {
+ return NULL;
+ }
+
+ mp = psmi_calloc(PSMI_EP_NONE, statstype, 1, sizeof(struct mpool));
+ if (mp == NULL) {
+ fprintf(stderr, "Failed to allocate memory for memory pool: %s\n",
+ strerror(errno));
+ return NULL;
+ }
+
+ for (s = 1; s < num_obj_per_chunk; s <<= 1)
+ mp->mp_vector_shift++;
+
+ mp->mp_flags = flags;
+ mp->mp_num_obj_per_chunk = num_obj_per_chunk;
+ mp->mp_num_obj_max_total = num_obj_max_total;
+ mp->mp_non_empty_cb = cb;
+ mp->mp_non_empty_cb_context = context;
+
+ mp->mp_memtype = statstype;
+
+ SLIST_INIT(&mp->mp_head);
+ mp->mp_elm_vector_size = num_obj_max_total / num_obj_per_chunk;
+ mp->mp_elm_vector = psmi_calloc(PSMI_EP_NONE, statstype, mp->mp_elm_vector_size,
+ sizeof(struct mpool_element *));
+ if (mp->mp_elm_vector == NULL) {
+ fprintf(stderr, "Failed to allocate memory for memory pool vector: "
+ "%s\n", strerror(errno));
+ psmi_free(mp);
+ return NULL;
+ }
+
+ mp->mp_elm_vector_free = mp->mp_elm_vector;
+
+ if (flags & PSMI_MPOOL_ALIGN) {
+ /* User wants its block to start on a PSMI_MPOOL_ALIGNMENT
+ * boundary. */
+ hdr_size = PSMI_ALIGNUP(sizeof(struct mpool_element),
+ PSMI_MPOOL_ALIGNMENT);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, PSMI_MPOOL_ALIGNMENT);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+
+ mp->mp_elm_offset = hdr_size - sizeof(struct mpool_element);
+ } else {
+ hdr_size = sizeof(struct mpool_element);
+ mp->mp_obj_size = PSMI_ALIGNUP(obj_size, 8);
+ mp->mp_elm_size = hdr_size + mp->mp_obj_size;
+ mp->mp_elm_offset = 0;
+ }
+
+ if (psmi_mpool_allocate_chunk(mp) != PSM_OK) {
+ psmi_mpool_destroy(mp);
+ return NULL;
+ }
+
+ VALGRIND_CREATE_MEMPOOL(mp, 0 /* no redzone */, PSM_VALGRIND_MEM_UNDEFINED);
+
+ return mp;
+}
+
+/**
+ * psmi_mpool_get()
+ *
+ * <mp> memory pool
+ *
+ * Requests an object from the memory pool.
+ *
+ * Returns NULL if the maximum number of objects has been allocated (refer to
+ * <num_obj_max_total> in psmi_mpool_create) or if running out of memory.
+ */
+void *
+psmi_mpool_get(mpool_t mp)
+{
+ struct mpool_element *me;
+ void *obj;
+
+ if (SLIST_EMPTY(&mp->mp_head)) {
+ if (psmi_mpool_allocate_chunk(mp) != PSM_OK)
+ return NULL;
+ }
+
+ me = SLIST_FIRST(&mp->mp_head);
+ SLIST_REMOVE_HEAD(&mp->mp_head, me_next);
+
+ psmi_assert(!me->me_isused);
+ me_mark_used(me);
+
+ /* store a backpointer to the memory pool */
+ me->me_mpool = mp;
+ mp->mp_num_obj_inuse++;
+ psmi_assert(mp->mp_num_obj_inuse <= mp->mp_num_obj);
+
+ obj = (void *) ((uintptr_t) me + sizeof(struct mpool_element));
+ VALGRIND_MEMPOOL_ALLOC(mp, obj, mp->mp_obj_size);
+ return obj;
+}
+
+/**
+ * psmi_mpool_put()
+ *
+ * <obj> object to return to the memory pool
+ *
+ * Returns an <obj> to the memory pool subsystem. This object will be re-used
+ * to fulfill new psmi_mpool_get() requests.
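+ *
+ * Added note: me_gen_count is incremented on every put, so a caller
+ * that cached an index/generation pair can later detect that its
+ * object was recycled. Illustrative sketch only; obj and mp are
+ * assumed to come from this pool:
+ *
+ * @verbatim
+ * uint32_t idx, gen;
+ * psmi_mpool_get_obj_index_gen_count(obj, &idx, &gen);
+ * // ... obj may be put and handed out again here ...
+ * void *cur = psmi_mpool_find_obj_by_index(mp, idx);
+ * if (cur != NULL && psmi_mpool_get_obj_gen_count(cur) == gen) {
+ *     // still the same incarnation of the object
+ * }
+ * @endverbatim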
+ */
+void
+psmi_mpool_put(void *obj)
+{
+ struct mpool_element *me;
+ int was_empty;
+ mpool_t mp;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+ me->me_gen_count++;
+
+ mp = me->me_mpool;
+
+ psmi_assert(mp != NULL);
+ psmi_assert(mp->mp_num_obj_inuse >= 0);
+ psmi_assert(me->me_isused);
+ me_mark_unused(me);
+
+ was_empty = mp->mp_num_obj_inuse == mp->mp_num_obj_max_total;
+ SLIST_INSERT_HEAD(&mp->mp_head, me, me_next);
+
+ mp->mp_num_obj_inuse--;
+
+ VALGRIND_MEMPOOL_FREE(mp, obj);
+
+ /* tell the user that memory is available */
+ if (mp->mp_non_empty_cb && was_empty)
+ mp->mp_non_empty_cb(mp->mp_non_empty_cb_context);
+}
+
+/**
+ * psmi_mpool_get_obj_index()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in the memory pool.
+ */
+
+int
+psmi_mpool_get_obj_index(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_index;
+}
+
+/**
+ * psmi_mpool_get_obj_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the generation count of the <obj>.
+ */
+uint32_t
+psmi_mpool_get_obj_gen_count(void *obj)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ return me->me_gen_count;
+}
+
+/**
+ * psmi_mpool_get_obj_index_gen_count()
+ *
+ * <obj> object in the memory pool
+ *
+ * Returns the index of the <obj> in <index>.
+ * Returns the generation count of the <obj> in <gen_count>.
+ */
+int
+psmi_mpool_get_obj_index_gen_count(void *obj, uint32_t *index,
+ uint32_t *gen_count)
+{
+ struct mpool_element *me = (struct mpool_element *)
+ ((uintptr_t) obj - sizeof(struct mpool_element));
+
+ *index = me->me_index;
+ *gen_count = me->me_gen_count;
+ return 0;
+}
+
+/**
+ * psmi_mpool_find_obj_by_index()
+ *
+ * <mp> memory pool
+ * <index> index of the object
+ *
+ * Returns the object located at <index> in the memory pool or NULL if the
+ * <index> is invalid.
+ */
+void *
+psmi_mpool_find_obj_by_index(mpool_t mp, int index)
+{
+ struct mpool_element *me;
+
+ if_pf (index < 0 || index >= mp->mp_num_obj)
+ return NULL;
+
+ me = (struct mpool_element *)
+ ((uintptr_t) mp->mp_elm_vector[index >> mp->mp_vector_shift] +
+ (index & (mp->mp_num_obj_per_chunk - 1)) * mp->mp_elm_size +
+ mp->mp_elm_offset);
+
+ /* If this mpool doesn't require generation counts, it's illegal to find a
+ * freed object */
+#ifdef PSM_DEBUG
+ if (mp->mp_flags & PSMI_MPOOL_NOGENERATION)
+ psmi_assert(!me->me_isused);
+#endif
+
+ return (void *)((uintptr_t) me + sizeof(struct mpool_element));
+}
+
+/**
+ * psmi_mpool_destroy()
+ *
+ * <mp> memory pool
+ *
+ * Destroy a previously allocated memory pool and reclaim its associated
+ * memory. The behavior is undefined if some objects have not been returned
+ * to the memory pool with psmi_mpool_put().
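+ *
+ * Added commentary: objects are never freed one at a time. Each entry
+ * of mp_elm_vector points at a whole chunk allocated by
+ * psmi_mpool_allocate_chunk(), which is why the implementation below
+ * reclaims the pool by walking the chunk vector rather than the free
+ * list.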
+ */
+void
+psmi_mpool_destroy(mpool_t mp)
+{
+ int i = 0;
+ size_t nbytes = mp->mp_num_obj * mp->mp_elm_size;
+
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i])
+ psmi_free(mp->mp_elm_vector[i]);
+ }
+ psmi_free(mp->mp_elm_vector);
+ nbytes += mp->mp_elm_vector_size * sizeof(struct mpool_element *);
+ VALGRIND_DESTROY_MEMPOOL(mp);
+ psmi_free(mp);
+ nbytes += sizeof(struct mpool);
+}
+
+/**
+ * psmi_mpool_get_obj_info()
+ *
+ * <mp> memory pool
+ *
+ * Returns the num-obj-per-chunk in <num_obj_per_chunk>
+ * Returns the num-obj-max-total in <num_obj_max_total>
+ */
+void
+psmi_mpool_get_obj_info(mpool_t mp, uint32_t *num_obj_per_chunk,
+ uint32_t *num_obj_max_total)
+{
+ *num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+ *num_obj_max_total = mp->mp_num_obj_max_total;
+ return;
+}
+
+static int
+psmi_mpool_allocate_chunk(mpool_t mp)
+{
+ struct mpool_element *elm;
+ void *chunk;
+ uint32_t i = 0, num_to_allocate;
+
+ num_to_allocate =
+ mp->mp_num_obj + mp->mp_num_obj_per_chunk > mp->mp_num_obj_max_total ?
+ 0 : mp->mp_num_obj_per_chunk;
+
+ psmi_assert(mp->mp_num_obj + mp->mp_num_obj_per_chunk <=
+ mp->mp_num_obj_max_total);
+
+ if (num_to_allocate == 0)
+ return PSM_NO_MEMORY;
+
+ chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype,
+ num_to_allocate * mp->mp_elm_size);
+ if (chunk == NULL) {
+ fprintf(stderr,
+ "Failed to allocate memory for memory pool chunk: %s\n",
+ strerror(errno));
+ return PSM_NO_MEMORY;
+ }
+
+ for (i = 0; i < num_to_allocate; i++) {
+ elm = (struct mpool_element *)((uintptr_t)chunk +
+ i * mp->mp_elm_size + mp->mp_elm_offset);
+ elm->me_gen_count = 0;
+ elm->me_index = mp->mp_num_obj + i;
+#ifdef PSM_DEBUG
+ elm->me_isused = 0;
+#endif
+ SLIST_INSERT_HEAD(&mp->mp_head, elm, me_next);
+#if 0
+ fprintf(stderr, "chunk%ld i=%d elm=%p user=%p next=%p\n",
+ (long)(mp->mp_elm_vector_free - mp->mp_elm_vector), (int) i, elm,
+ (void *)((uintptr_t) elm + sizeof(struct mpool_element)),
+ SLIST_NEXT(elm, me_next));
+#endif
+ }
+
+ psmi_assert((uintptr_t) mp->mp_elm_vector_free
+ < ((uintptr_t) mp->mp_elm_vector) + mp->mp_elm_vector_size
+ * sizeof(struct mpool_element *));
+
+ mp->mp_elm_vector_free[0] = chunk;
+ mp->mp_elm_vector_free++;
+ mp->mp_num_obj += num_to_allocate;
+
+ return PSM_OK;
+}
+
+#if 0
+void
+psmi_mpool_dump(mpool_t mp)
+{
+ int i, j;
+ struct mpool_element *me;
+
+ fprintf(stderr, "Memory pool %p has %d elements per chunk.\n",
+ mp, mp->mp_num_obj_per_chunk);
+ for (i = 0; i < mp->mp_elm_vector_size; i++) {
+ if (mp->mp_elm_vector[i] != NULL) {
+ fprintf(stderr, "===========================\n");
+ fprintf(stderr, "mpool chunk #%d\n", i);
+
+ for (j = 0, me = mp->mp_elm_vector[i];
+ j < mp->mp_num_obj_per_chunk;
+ j++, me = (struct mpool_element *)
+ ((uintptr_t) me + mp->mp_elm_size)) {
+ fprintf(stderr, "obj=%p index=%d gen_count=%d\n",
+ (void *) ((uintptr_t) me + sizeof(struct mpool_element)),
+ me->me_index, me->me_gen_count);
+ }
+ fprintf(stderr, "===========================\n");
+ }
+ }
+}
+#endif
diff --git a/psm_mpool.h b/psm_mpool.h
new file mode 100644
index 0000000..1567dd5
--- /dev/null
+++ b/psm_mpool.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_mpool.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef PSM_MPOOL_H +#define PSM_MPOOL_H + +/* mpool flags */ +#define PSMI_MPOOL_ALIGN_CACHE 0x1 +#define PSMI_MPOOL_ALIGN_PAGE 0x2 +#define PSMI_MPOOL_NOGENERATION 0x4 + +/* Backwards compatibility */ +#define PSMI_MPOOL_ALIGN PSMI_MPOOL_ALIGN_CACHE + +typedef void (*non_empty_callback_fn_t)(void *context); +typedef struct mpool *mpool_t; + +mpool_t psmi_mpool_create(size_t obj_size, uint32_t num_obj_per_chunk, + uint32_t num_obj_max_total, int flags, + psmi_memtype_t statstype, + non_empty_callback_fn_t cb, void *context); + +void psmi_mpool_destroy(mpool_t mp); +void psmi_mpool_get_obj_info(mpool_t mp, uint32_t *num_obj_per_chunk, + uint32_t *num_obj_max_total); + +void * psmi_mpool_get(mpool_t mp); +void psmi_mpool_put(void *obj); + +int psmi_mpool_get_obj_index(void *obj); +uint32_t psmi_mpool_get_obj_gen_count(void *obj); +int psmi_mpool_get_obj_index_gen_count(void *obj, + uint32_t *index, + uint32_t *gen_count); + +void * psmi_mpool_find_obj_by_index(mpool_t mp, int index); + +#endif diff --git a/psm_mq.c b/psm_mq.c new file mode 100644 index 0000000..ea2655a --- /dev/null +++ b/psm_mq.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * Functions to manipulate the expected queue in mq_ep. + */ + +/* + * ! @brief PSM exposed version to allow PTLs to match + */ + +static +psm_mq_req_t +mq_req_match_with_tagsel(psm_mq_t mq, struct mqsq *q, uint64_t tag, + uint64_t tagsel, int remove) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (!((tag ^ cur->tag) & tagsel)) { /* match! */ + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; +} + +#if 0 +/* Only for psm_mq_irecv. Currently not enabled. */ +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_req_match_with_tagsel_inline(struct mqsq *q, uint64_t tag, uint64_t tagsel)) +{ + psm_mq_req_t cur = q->first; + if (cur == NULL) + return NULL; + else if (!((cur->tag ^ tag) & tagsel)) { + if ((q->first = cur->next) == NULL) + q->lastp = &q->first; + cur->next = NULL; + return cur; + } + else + return mq_req_match_with_tagsel(q, tag, tagsel, 1); +} +#endif + +static +int +mq_req_remove_single(psm_mq_t mq, struct mqsq *q, psm_mq_req_t req) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur == req) { + if ((*curp = cur->next) == NULL) + q->lastp = curp; + cur->next = NULL; + return 1; + } + } + return 0; +} + +#if 0 + /*XXX only used with cancel, for now */ + +static +psm_mq_req_t +mq_req_match_req(struct mqsq *q, psm_mq_req_t req, int remove) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur->send_req == req) { + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; /* no match */ +} +#endif + +void +psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars) +{ +#ifdef __MIC__ + memcpy(vdest, vsrc, nchars); +#else + unsigned char *dest = (unsigned char *)vdest; + const unsigned char *src = (const unsigned char *)vsrc; + if(nchars>>2) + ipath_dwordcpy((uint32_t*) dest, (uint32_t*) src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +#endif +} + +#if 0 // defined(__x86_64__) No consumers of mtucpy safe +void +psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars) +{ + unsigned char *dest = (unsigned char *)vdest; + const unsigned char *src = (const unsigned char *)vsrc; + if(nchars>>2) + ipath_dwordcpy_safe((uint32_t*) dest, (uint32_t*) src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +} +#endif + +psm_error_t +__psm_mq_iprobe(psm_mq_t mq, uint64_t tag, uint64_t tagsel, psm_mq_status_t *status) +{ + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + 
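+ /* Added commentary: probe the unexpected queue twice -- once as-is
+ * and once after a single progress poll below -- so a message that
+ * is already in flight can still match without turning iprobe into
+ * a blocking call. */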
PSMI_PLOCK(); + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 0); + + if (req != NULL) { + PSMI_PUNLOCK(); + if (status != NULL) + mq_status_copy(req, status); + return PSM_OK; + } + + psmi_poll_internal(mq->ep, 1); + /* try again */ + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 0); + + if (req != NULL) { + PSMI_PUNLOCK(); + if (status != NULL) + mq_status_copy(req, status); + return PSM_OK; + } + PSMI_PUNLOCK(); + return PSM_MQ_NO_COMPLETIONS; +} +PSMI_API_DECL(psm_mq_iprobe) + +psm_error_t +__psm_mq_cancel(psm_mq_req_t *ireq) +{ + psm_mq_req_t req = *ireq; + psm_mq_t mq; + psm_error_t err = PSM_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == NULL) + return PSM_MQ_NO_COMPLETIONS; + + /* Cancelling a send is a blocking operation, and expensive. + * We only allow cancellation of rendezvous sends, consider the eager sends + * as always unsuccessfully cancelled. + */ + PSMI_PLOCK(); + + mq = req->mq; + if (MQE_TYPE_IS_RECV(req->type)) { + if (req->state == MQ_STATE_POSTED) { + int rc; + + rc = mq_req_remove_single(mq, &mq->expected_q, req); + psmi_assert_always(rc); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + err = PSM_OK; + } + else + err = PSM_MQ_NO_COMPLETIONS; + } + else { + err = psmi_handle_error(mq->ep, PSM_PARAM_ERR, + "Cannot cancel send requests (req=%p)", req); + } + + PSMI_PUNLOCK(); + + return err; +} +PSMI_API_DECL(psm_mq_cancel) + +/* This is the only PSM function that blocks. + * We handle it in a special manner since we don't know what the user's + * execution environment is (threads, oversubscribing processes, etc). + * + */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_mq_wait_inner(psm_mq_req_t *ireq, psm_mq_status_t *status, int do_lock)) +{ + psm_error_t err = PSM_OK; + + psm_mq_req_t req = *ireq; + if (req == PSM_MQ_REQINVALID) { + return PSM_OK; + } + + if (do_lock) + PSMI_PLOCK(); + + if (req->state != MQ_STATE_COMPLETE) { + psm_mq_t mq = req->mq; + + /* We'll be waiting on this req, mark it as so */ + req->type |= MQE_TYPE_WAITING; + + _IPATH_VDBG("req=%p, buf=%p, len=%d, waiting\n", + req, req->buf, req->buf_len); + + if (req->testwait_callback) { + err = req->testwait_callback(ireq, 0, status); + if (do_lock) + PSMI_PUNLOCK(); + return err; + } + + PSMI_BLOCKUNTIL(mq->ep, err, req->state == MQ_STATE_COMPLETE); + + if (err > PSM_OK_NO_PROGRESS) + goto fail_with_lock; + else + err = PSM_OK; + } + + mq_qq_remove(&req->mq->completed_q, req); + + if (status != NULL) + mq_status_copy(req, status); + psmi_mq_req_free(req); + *ireq = PSM_MQ_REQINVALID; + + _IPATH_VDBG("req=%p complete, buf=%p, len=%d, err=%d\n", + req, req->buf, req->buf_len, req->error_code); + +fail_with_lock: + if (do_lock) + PSMI_PUNLOCK(); + return err; +} + +psm_error_t __sendpath +__psm_mq_wait(psm_mq_req_t *ireq, psm_mq_status_t *status) +{ + PSMI_ASSERT_INITIALIZED(); + return psmi_mq_wait_inner(ireq, status, 1); +} +PSMI_API_DECL(psm_mq_wait) + +psm_error_t __sendpath +psmi_mq_wait_internal(psm_mq_req_t *ireq) +{ + return psmi_mq_wait_inner(ireq, NULL, 0); +} + +psm_error_t __sendpath +__psm_mq_test(psm_mq_req_t *ireq, psm_mq_status_t *status) +{ + psm_mq_req_t req = *ireq; + psm_error_t err = PSM_OK; + + PSMI_ASSERT_INITIALIZED(); + + if (req == PSM_MQ_REQINVALID) { + return PSM_OK; + } + + if (req->state != MQ_STATE_COMPLETE) { + if (req->testwait_callback) { + PSMI_PLOCK(); + err = req->testwait_callback(ireq, 1, status); + PSMI_PUNLOCK(); + return err; + } + else + return PSM_MQ_NO_COMPLETIONS; + } + + if (status != NULL) 
+ mq_status_copy(req, status); + + PSMI_PLOCK(); + mq_qq_remove(&req->mq->completed_q, req); + psmi_mq_req_free(req); + PSMI_PUNLOCK(); + + *ireq = PSM_MQ_REQINVALID; + _IPATH_VDBG("req=%p complete, tag=%llx buf=%p, len=%d, err=%d\n", + req, (unsigned long long) req->tag, req->buf, + req->buf_len, req->error_code); + + return err; +} +PSMI_API_DECL(psm_mq_test) + +psm_error_t __sendpath +__psm_mq_isend(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len, void *context, psm_mq_req_t *req) +{ + psm_error_t err; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + err = dest->ptlctl->mq_isend(mq, dest, flags, stag, buf, len, context, req); + PSMI_PUNLOCK(); + +#if 0 +#ifdef PSM_VALGRIND + /* If the send isn't completed yet, make sure that we mark the memory as + * unaccessible + */ + if (*req != PSM_MQ_REQINVALID && + (*req)->state != MQ_STATE_COMPLETE) + VALGRIND_MAKE_MEM_NOACCESS(buf, len); +#endif +#endif + psmi_assert(*req != NULL); + return err; +} +PSMI_API_DECL(psm_mq_isend) + +psm_error_t __sendpath +__psm_mq_send(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len) +{ + psm_error_t err; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + err = dest->ptlctl->mq_send(mq, dest, flags, stag, buf, len); + PSMI_PUNLOCK(); + return err; +} +PSMI_API_DECL(psm_mq_send) + +psm_error_t __recvpath +__psm_mq_irecv(psm_mq_t mq, uint64_t tag, uint64_t tagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm_mq_req_t *reqo) +{ + psm_error_t err = PSM_OK; + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + PSMI_PLOCK(); + + /* First check unexpected Queue and remove req if found */ + req = mq_req_match_with_tagsel(mq, &mq->unexpected_q, tag, tagsel, 1); + + if (req == NULL) + { + /* prepost before arrival, add to expected q */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + if_pf (req == NULL) { + err = PSM_NO_MEMORY; + goto ret; + } + + req->tag = tag; + req->tagsel = tagsel; + req->state = MQ_STATE_POSTED; + req->buf = buf; + req->buf_len = len; + req->recv_msglen = len; + req->recv_msgoff = 0; + req->context = context; + + /* Nobody should touch the buffer after it's posted */ + VALGRIND_MAKE_MEM_NOACCESS(buf, len); + + mq_sq_append(&mq->expected_q, req); + _IPATH_VDBG("buf=%p,len=%d,tag=%"PRIx64 + " tagsel=%"PRIx64" req=%p\n", + buf,len,tag, tagsel, req); + } + else { + uint32_t copysz; + req->context = context; + + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + _IPATH_VDBG("unexpected buf=%p,len=%d,tag=%"PRIx64 + " tagsel=%"PRIx64" req=%p\n", buf, len, tag, tagsel, req); + + switch (req->state) { + case MQ_STATE_COMPLETE: + if (req->buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ + copysz = mq_set_msglen(req, len, req->send_msglen); + psmi_mq_mtucpy(buf, (const void *) req->buf, copysz); + psmi_mq_sysbuf_free(mq, req->buf); + } + req->buf = buf; + req->buf_len = len; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_STATE_UNEXP: /* not done yet */ + copysz = mq_set_msglen(req, len, req->send_msglen); + /* Copy What's been received so far and make sure we don't receive + * any more than copysz. 
After that, swap system with user buffer + */ + req->recv_msgoff = min(req->recv_msgoff, copysz); + psmi_mq_mtucpy(buf, (const void *) req->buf, req->recv_msgoff); + /* What's "left" is no access */ + VALGRIND_MAKE_MEM_NOACCESS( + (void *)((uintptr_t) buf + req->recv_msgoff), len - req->recv_msgoff); + psmi_mq_sysbuf_free(mq, req->buf); + req->state = MQ_STATE_MATCHED; + req->buf = buf; + req->buf_len = len; + break; + + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + copysz = mq_set_msglen(req, len, req->send_msglen); + req->state = MQ_STATE_MATCHED; + req->buf = buf; + req->buf_len = len; + VALGRIND_MAKE_MEM_NOACCESS(buf, len); + req->recv_msgoff = 0; + req->rts_callback(req, 0); + break; + + default: + fprintf(stderr, "Unexpected state %d in req %p\n", req->state, req); + fprintf(stderr, "type=%d, mq=%p, tag=%p\n", + req->type, req->mq, (void *)(uintptr_t)req->tag); + abort(); + } + } + +ret: + PSMI_PUNLOCK(); + *reqo = req; + return err; +} +PSMI_API_DECL(psm_mq_irecv) + +psm_error_t __sendpath +__psm_mq_ipeek(psm_mq_t mq, psm_mq_req_t *oreq, psm_mq_status_t *status) +{ + psm_mq_req_t req; + + PSMI_ASSERT_INITIALIZED(); + + if ((req = mq->completed_q.first) == NULL) { + PSMI_PLOCK(); + psmi_poll_internal(mq->ep, 1); + if ((req = mq->completed_q.first) == NULL) { + PSMI_PUNLOCK(); + return PSM_MQ_NO_COMPLETIONS; + } + PSMI_PUNLOCK(); + } + /* something in the queue */ + *oreq = req; + if (status != NULL) + mq_status_copy(req, status); + + return PSM_OK; +} +PSMI_API_DECL(psm_mq_ipeek) + +static +psm_error_t +psmi_mqopt_ctl(psm_mq_t mq, uint32_t key, void *value, int get) +{ + psm_error_t err = PSM_OK; + uint32_t val32; + + switch (key) { + case PSM_MQ_RNDV_IPATH_SZ: + if (get) + *((uint32_t *)value) = mq->ipath_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->ipath_thresh_rv = val32; + } + _IPATH_VDBG("RNDV_IPATH_SZ = %d (%s)\n", + mq->ipath_thresh_rv, get ? "GET" : "SET"); + break; + + case PSM_MQ_RNDV_SHM_SZ: + if (get) + *((uint32_t *)value) = mq->shm_thresh_rv; + else { + val32 = *((uint32_t *) value); + mq->shm_thresh_rv = val32; + } + _IPATH_VDBG("RNDV_SHM_SZ = %d (%s)\n", + mq->shm_thresh_rv, get ? "GET" : "SET"); + break; + + case PSM_MQ_MAX_SYSBUF_MBYTES: + if (get) + *((uint32_t *)value) = (uint32_t)(mq->max_sysbuf_bytes / 1048576); + else { + val32 = *((uint32_t *) value); + /* XXX For now, don't support this */ + /* mq->max_sysbuf_bytes = 1048576ULL * val32; */ + mq->max_sysbuf_bytes = ~(0ULL); + } + break; + + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown option key=%u", key); + break; + } + return err; +} + +psm_error_t +__psm_mq_getopt(psm_mq_t mq, int key, void *value) +{ + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + return psmi_mqopt_ctl(mq, key, value, 1); +} +PSMI_API_DECL(psm_mq_getopt) + +psm_error_t +__psm_mq_setopt(psm_mq_t mq, int key, const void *value) +{ + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + return psmi_mqopt_ctl(mq, key, (void *) value, 0); +} +PSMI_API_DECL(psm_mq_setopt) + +/* + * This is the API for the user. 
We actually allocate the MQ much earlier, but + * the user can set options after obtaining an endpoint + */ +psm_error_t +__psm_mq_init(psm_ep_t ep, uint64_t tag_order_mask, + const struct psm_optkey *opts, + int numopts, psm_mq_t *mqo) +{ + psm_error_t err = PSM_OK; + psm_mq_t mq = ep->mq; + int i; + + PSMI_ERR_UNLESS_INITIALIZED(ep); + + psmi_assert(mq != NULL); + psmi_assert(mq->ep != NULL); + + /* Process options */ + for (i = 0; err == PSM_OK && i < numopts; i++) + err = psmi_mqopt_ctl(mq, opts[i].key, opts[i].value, 0); + if (err != PSM_OK) /* error already handled */ + goto fail; + + *mqo = mq; + +fail: + return err; +} +PSMI_API_DECL(psm_mq_init) + +psm_error_t +__psm_mq_finalize(psm_mq_t mq) +{ + psm_ep_t ep; + PSMI_ERR_UNLESS_INITIALIZED(mq->ep); + + ep = mq->ep; + do { + ep->mq = NULL; + ep = ep->mctxt_next; + } while (ep != mq->ep); + + return psmi_mq_free(mq); +} +PSMI_API_DECL(psm_mq_finalize) + +void +__psm_mq_get_stats(psm_mq_t mq, psm_mq_stats_t *stats) +{ + memcpy(stats, &mq->stats, sizeof(psm_mq_stats_t)); +} +PSMI_API_DECL(psm_mq_get_stats) + +psm_error_t +psmi_mq_malloc(psm_mq_t *mqo) +{ + psm_error_t err = PSM_OK; + + psm_mq_t mq = (psm_mq_t) psmi_calloc(NULL, UNDEFINED, 1, sizeof(struct psm_mq)); + if (mq == NULL) { + err = psmi_handle_error(NULL, PSM_NO_MEMORY, + "Couldn't allocate memory for mq endpoint"); + goto fail; + } + + mq->ep = NULL; + mq->memmode = psmi_parse_memmode(); + mq->expected_q.first = NULL; + mq->expected_q.lastp = &mq->expected_q.first; + mq->unexpected_q.first = NULL; + mq->unexpected_q.lastp = &mq->unexpected_q.first; + mq->completed_q.first = NULL; + mq->completed_q.lastp = &mq->completed_q.first; + + mq->cur_sysbuf_bytes = 0ULL; + mq->max_sysbuf_bytes = ~(0ULL); + + /* The values are overwritten in initialize_defaults, they're just set to + * sensible defaults until then */ + mq->ipath_thresh_rv = 64000; + mq->ipath_window_rv = 131072; + mq->shm_thresh_rv = 16000; + + memset(&mq->stats, 0, sizeof(psm_mq_stats_t)); + err = psmi_mq_req_init(mq); + if (err) + goto fail; + + /* Initialize the unexpected system buffer allocator */ + psmi_mq_sysbuf_init(mq); + char buf[128]; + psmi_mq_sysbuf_getinfo(mq, buf, sizeof buf); + _IPATH_VDBG("%s", buf); + *mqo = mq; + + return PSM_OK; +fail: + if (mq != NULL) + psmi_free(mq); + return err; +} + +psm_error_t +psmi_mq_initialize_defaults(psm_mq_t mq) +{ + union psmi_envvar_val env_rvwin, env_ipathrv, env_shmrv; + + psmi_getenv("PSM_MQ_RNDV_IPATH_THRESH", + "ipath eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->ipath_thresh_rv, &env_ipathrv); + mq->ipath_thresh_rv = env_ipathrv.e_uint; + + /* Re-evaluate this since it may have changed after initializing the shm + * device */ + mq->shm_thresh_rv = psmi_shm_mq_rv_thresh; + psmi_getenv("PSM_MQ_RNDV_SHM_THRESH", + "shm eager-to-rendezvous switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->shm_thresh_rv, &env_shmrv); + mq->shm_thresh_rv = env_shmrv.e_uint; + + psmi_getenv("PSM_MQ_RNDV_IPATH_WINDOW", + "ipath rendezvous window size", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) mq->ipath_window_rv, &env_rvwin); + mq->ipath_window_rv = env_rvwin.e_uint; + + return PSM_OK; +} + + +psm_error_t +psmi_mq_free(psm_mq_t mq) +{ + psmi_mq_req_fini(mq); + psmi_mq_sysbuf_fini(mq); + psmi_free(mq); + return PSM_OK; +} diff --git a/psm_mq.h b/psm_mq.h new file mode 100644 index 0000000..dd90028 --- /dev/null +++ b/psm_mq.h @@ -0,0 +1,600 @@ 
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef PSM_MQ_H
+#define PSM_MQ_H
+
+#include
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+
+
+/* Initialize the MQ component for MQ communication
+ *
+ * This function provides the Matched Queue handle necessary to perform all
+ * Matched Queue communication operations.
+ *
+ * [in] ep Endpoint over which to initialize Matched Queue
+ * [in] tag_order_mask Order mask hint to let MQ know what bits of the send
+ * tag are required to maintain MQ message order. In
+ * MPI parlance, this mask sets the bits that store
+ * the context (or communicator ID). The user can
+ * choose to pass PSM_MQ_ORDERMASK_NONE or
+ * PSM_MQ_ORDERMASK_ALL to tell MQ to respectively
+ * provide no ordering guarantees or to provide
+ * ordering over all messages by ignoring the
+ * contexts of the send tags.
+ * [in] opts Set of options for Matched Queue
+ * [in] numopts Number of options passed
+ * [out] mq User-supplied storage to return the Matched Queue handle
+ * associated to the newly created Matched Queue.
+ *
+ * @remark This function can be called many times to retrieve the MQ handle
+ * associated to an endpoint, but options are only considered the first
+ * time the function is called.
+ *
+ * [post] The user obtains a handle to an instantiated Matched Queue.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK A new Matched Queue has been instantiated across all the
+ * members of the group.
+ *
+ * @verbatim
+ * int try_open_endpoint_and_initialize_mq(
+ * psm_ep_t *ep, // endpoint handle
+ * psm_epid_t *epid, // unique endpoint ID
+ * psm_uuid_t job_uuid, // unique job uuid, for ep_open
+ * psm_mq_t *mq, // MQ handle initialized on endpoint 'ep'
+ * uint64_t communicator_bits) // Where we store our communicator or
+ * // context bits in the 64-bit tag.
+ * {
+ * // Simplified open, see psm_ep_open documentation for more info
+ * psm_ep_open(job_uuid,
+ * NULL, // no options
+ * ep, epid);
+ *
+ * // We initialize a matched queue by telling PSM the bits that are
+ * // order-significant in the tag. Point-to-point ordering will not be
+ * // maintained between senders where the communicator bits are not the
+ * // same.
+ * psm_mq_init(ep,
+ * communicator_bits,
+ * NULL, // no other MQ options
+ * 0, // 0 options passed
+ * mq); // newly initialized matched Queue
+ *
+ * return 1;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_init(psm_ep_t ep, uint64_t tag_order_mask,
+ const struct psm_optkey *opts, int numopts, psm_mq_t *mq);
+
+#define PSM_MQ_ORDERMASK_NONE 0ULL
+ /* Used to initialize MQ and disable all MQ message ordering
+ * guarantees (this mask may prevent the use of MQ to maintain matched
+ * message envelope delivery required in MPI). */
+
+#define PSM_MQ_ORDERMASK_ALL 0xffffffffffffffffULL
+ /* Used to initialize MQ with no message ordering hints, which forces
+ * MQ to maintain order over all messages */
+
+/* Finalize (close) an MQ handle
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK A given Matched Queue has been freed and any future
+ * use of the handle produces undefined results.
+ */
+psm_error_t
+psm_mq_finalize(psm_mq_t mq);
+
+/* MQ Non-blocking operation status
+ *
+ * Message completion status for asynchronous communication operations.
+ * For wait and test functions, MQ fills in the structure upon completion.
+ * Upon completion, receive requests fill in every field of the status
+ * structure while send requests only return a valid error_code and context
+ * pointer.
+ */
+typedef
+struct psm_mq_status {
+ uint64_t msg_tag; /* Sender's original message tag (receive reqs only) */
+ uint32_t msg_length; /* Sender's original message length (receive reqs only) */
+ uint32_t nbytes; /* Actual number of bytes transferred (receive reqs only) */
+ psm_error_t error_code; /* MQ error code for communication operation */
+ void *context; /* User-associated context for send or receive */
+}
+psm_mq_status_t;
+
+/* PSM Communication handle (opaque) */
+typedef struct psm_mq_req *psm_mq_req_t;
+
+
+
+/* Get an MQ option (Deprecated. Use psm_getopt with PSM_COMPONENT_MQ)
+ *
+ * Function to retrieve the value of an MQ option.
+ *
+ * [in] mq Matched Queue handle
+ * [in] option Index of option to retrieve. Possible values are:
+ * * PSM_MQ_RNDV_IPATH_SZ
+ * * PSM_MQ_RNDV_SHM_SZ
+ * * PSM_MQ_MAX_SYSBUF_MBYTES
+ *
+ * [in] value Pointer to storage that can be used to store the value of
+ * the option to be set. It is up to the user to ensure that the
+ * pointer points to a memory location large enough to accommodate
+ * the value associated to the type. Each option documents the size
+ * associated to its value.
+ *
+ * [returns] PSM_OK if option could be retrieved.
+ * [returns] PSM_PARAM_ERR if the option is not a valid option number
+ */
+psm_error_t
+psm_mq_getopt(psm_mq_t mq, int option, void *value);
+
+/* Set an MQ option (Deprecated. Use psm_setopt with PSM_COMPONENT_MQ)
+ *
+ * Function to set the value of an MQ option.
+ *
+ * [in] mq Matched Queue handle
+ * [in] option Index of option to retrieve.
Possible values are: + * * PSM_MQ_RNDV_IPATH_SZ + * * PSM_MQ_RNDV_SHM_SZ + * * PSM_MQ_MAX_SYSBUF_MBYTES + * + * [in] value Pointer to storage that contains the value to be updated + * for the supplied option number. It is up to the user to + * ensure that the pointer points to a memory location with a + * correct size. + * + * [returns] PSM_OK if option could be retrieved. + * [returns] PSM_PARAM_ERR if the option is not a valid option number + * [returns] PSM_OPT_READONLY if the option to be set is a read-only option + * (currently no MQ options are read-only). + */ +psm_error_t +psm_mq_setopt(psm_mq_t mq, int option, const void *value); + + + +#define PSM_MQ_FLAG_SENDSYNC 0x01 + /* MQ Send Force synchronous send */ + +#define PSM_MQ_REQINVALID ((psm_mq_req_t)(NULL)) + /* MQ request completion value */ + +/* Post a receive to a Matched Queue with tag selection criteria + * + * Function to receive a non-blocking MQ message by providing a preposted + * buffer. For every MQ message received on a particular MQ, the tag and @c + * tagsel parameters are used against the incoming message's send tag as + * described in tagmatch. + * + * [in] mq Matched Queue Handle + * [in] rtag Receive tag + * [in] rtagsel Receive tag selector + * [in] flags Receive flags (None currently supported) + * [in] buf Receive buffer + * [in] len Receive buffer length + * [in] context User context pointer, available in psm_mq_status_t + * upon completion + * [out] req PSM MQ Request handle created by the preposted receive, to + * be used for explicitly controlling message receive + * completion. + * + * [post] The supplied receive buffer is given to MQ to match against incoming + * messages unless it is cancelled via psm_mq_cancel @e before any + * match occurs. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (psm_error_register_handler). + * + * [retval] PSM_OK The receive buffer has successfully been posted to the MQ. + */ +psm_error_t +psm_mq_irecv(psm_mq_t mq, uint64_t rtag, uint64_t rtagsel, uint32_t flags, + void *buf, uint32_t len, void *context, psm_mq_req_t *req); + +/* Send a blocking MQ message + * + * Function to send a blocking MQ message, whereby the message is locally + * complete and the source data can be modified upon return. + * + * [in] mq Matched Queue Handle + * [in] dest Destination EP address + * [in] flags Message flags, currently: + * * PSM_MQ_FLAG_SENDSYNC tells PSM to send the message + * synchronously, meaning that the message will not be sent until + * the receiver acknowledges that it has matched the send with a + * receive buffer. + * [in] stag Message Send Tag + * [in] buf Source buffer pointer + * [in] len Length of message starting at buf. + * + * [post] The source buffer is reusable and the send is locally complete. + * + * @note This send function has been implemented to best suit MPI_Send. + * + * The following error code is returned. Other errors are handled by the PSM + * error handler (psm_error_register_handler). + * + * [retval] PSM_OK The message has been successfully sent. + */ +psm_error_t +psm_mq_send(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag, + const void *buf, uint32_t len); + +/* Send a non-blocking MQ message + * + * Function to initiate the send of a non-blocking MQ message, whereby the + * user guarantees that the source data will remain unmodified until the send + * is locally completed through a call such as psm_mq_wait or @ref + * psm_mq_test. 
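+ *
+ * (Added note, not in the original header) The source buffer is only
+ * reusable after the request is retired through a completion call,
+ * e.g. (mq, dest, tag, buf and len assumed declared):
+ *
+ * @verbatim
+ * psm_mq_req_t req;
+ * psm_mq_isend(mq, dest, 0, tag, buf, len, NULL, &req);
+ * psm_mq_wait(&req, NULL); // buf may be reused once this returns
+ * @endverbatim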
+ *
+ * [in] mq Matched Queue Handle
+ * [in] dest Destination EP address
+ * [in] flags Message flags, currently:
+ * * PSM_MQ_FLAG_SENDSYNC tells PSM to send the message
+ * synchronously, meaning that the message will not be sent until
+ * the receiver acknowledges that it has matched the send with a
+ * receive buffer.
+ * [in] stag Message Send Tag
+ * [in] buf Source buffer pointer
+ * [in] len Length of message starting at buf.
+ * [in] context Optional user-provided pointer available in @ref
+ * psm_mq_status_t when the send is locally completed.
+ * [out] req PSM MQ Request handle created by the non-blocking send, to
+ * be used for explicitly controlling message completion.
+ *
+ * [post] The source buffer is not reusable and the send is not locally complete
+ * until its request is completed by either psm_mq_test or @ref
+ * psm_mq_wait.
+ *
+ * @note This send function has been implemented to suit MPI_Isend.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The message has been successfully initiated.
+ *
+ * @verbatim
+ * psm_mq_req_t
+ * non_blocking_send(const psm_mq_t mq, psm_epaddr_t dest_ep,
+ * const void *buf, uint32_t len,
+ * int context_id, int send_tag, const my_request_t *req)
+ * {
+ * psm_mq_req_t req_mq;
+ * // Set up our send tag, assume that "my_rank" is global and represents
+ * // the rank of this process in the job
+ * uint64_t tag = ( ((context_id & 0xffff) << 48) |
+ * ((my_rank & 0xffff) << 32) |
+ * ((send_tag & 0xffffffff)) );
+ *
+ * psm_mq_isend(mq, dest_ep,
+ * 0, // no flags
+ * tag,
+ * buf,
+ * len,
+ * req, // this req is available in psm_mq_status_t when one
+ * // of the synchronization functions is called.
+ * &req_mq);
+ * return req_mq;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_isend(psm_mq_t mq, psm_epaddr_t dest, uint32_t flags, uint64_t stag,
+ const void *buf, uint32_t len, void *context, psm_mq_req_t *req);
+
+/* Try to probe if a message is received to match tag selection
+ * criteria
+ *
+ * Function to verify if a message matching the supplied tag and tag selectors
+ * has been received. The message is not fully matched until the user
+ * provides a buffer with the successfully matching tag selection criteria
+ * through psm_mq_irecv.
+ * Probing for messages may be useful if the size of the
+ * message to be received is unknown, in which case its size will be
+ * available in the msg_length member of the returned status.
+ *
+ * [in] mq Matched Queue Handle
+ * [in] rtag Message receive tag
+ * [in] rtagsel Message receive tag selector
+ * [out] status Upon return, status is filled with information
+ * regarding the matching send.
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The iprobe is successful and status is updated if non-NULL.
+ * [retval] PSM_MQ_NO_COMPLETIONS The iprobe is unsuccessful and status is unchanged.
+ */
+psm_error_t
+psm_mq_iprobe(psm_mq_t mq, uint64_t rtag, uint64_t rtagsel,
+ psm_mq_status_t *status);
+
+/* Query for non-blocking requests ready for completion.
+ *
+ * Function to query a particular MQ for non-blocking requests that are ready
+ * for completion. Requests "ready for completion" are not actually considered
+ * complete by MQ until they are returned to the MQ library through @ref
+ * psm_mq_wait or psm_mq_test.
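+ *
+ * (Added note) As implemented in psm_mq.c earlier in this patch,
+ * ipeek only reads the head of the completed queue; the request is
+ * dequeued by the later psm_mq_test/psm_mq_wait, which is why the
+ * same request stays at the head until it is retired.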
+ *
+ * If the user can deal with consuming request completions in the order in
+ * which they complete, this function can be used both for completions and for
+ * ensuring progress. The latter requirement is satisfied when the user
+ * peeks an empty completion queue as a side effect of always aggressively
+ * peeking and completing all of an MQ's requests that are ready for completion.
+ *
+ *
+ * [in] mq Matched Queue Handle
+ * [in,out] req MQ non-blocking request
+ * [in] status Optional MQ status, can be NULL.
+ *
+ * [post] The user has ensured progress if the function returns @ref
+ * PSM_MQ_NO_COMPLETIONS
+ *
+ * The following error codes are returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The peek is successful and req is updated with a request
+ * ready for completion. If status is non-NULL, it is also
+ * updated.
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The peek is not successful, meaning that there are
+ * no further requests ready for completion. The
+ * contents of req and status remain
+ * unchanged.
+ * @verbatim
+ * // Example that uses psm_mq_ipeek to make progress instead of psm_poll
+ * // We return the amount of non-blocking requests that we've completed
+ * int main_progress_loop(psm_mq_t mq)
+ * {
+ * int num_completed = 0;
+ * psm_mq_req_t req;
+ * psm_mq_status_t status;
+ * psm_error_t err;
+ * my_request_t *myreq;
+ *
+ * do {
+ * err = psm_mq_ipeek(mq, &req,
+ * NULL); // No need for status in ipeek here
+ * if (err == PSM_MQ_NO_COMPLETIONS)
+ * return num_completed;
+ * else if (err != PSM_OK)
+ * goto errh;
+ * num_completed++;
+ *
+ * // We obtained 'req' at the head of the completion queue. We can
+ * // now free the request with PSM and obtain our original request
+ * // from the status' context
+ * err = psm_mq_test(&req, // will be marked as invalid
+ * &status); // we need the status
+ * myreq = (my_request_t *) status.context;
+ *
+ * // handle the completion for myreq whether myreq is a posted receive
+ * // or a non-blocking send.
+ * }
+ * while (1);
+ * }
+ * @endverbatim */
+psm_error_t
+psm_mq_ipeek(psm_mq_t mq, psm_mq_req_t *req, psm_mq_status_t *status);
+
+/* Wait until a non-blocking request completes
+ *
+ * Function to wait on requests created from either preposted receive buffers
+ * or non-blocking sends. This is the only blocking function in the MQ
+ * interface and will poll until the request is complete as per the progress
+ * semantics explained in mq_progress.
+ *
+ * [in,out] request MQ non-blocking request
+ * [out] status Updated if non-NULL when request successfully completes
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * [pre] Since MQ will internally ensure progress while the user is
+ * suspended, the user need not ensure that progress is made prior to
+ * calling this function.
+ *
+ * [post] The request is assigned the value PSM_MQ_REQINVALID and all
+ * associated MQ request storage is released back to the MQ library.
+ *
+ * [remarks]
+ * * This function ensures progress on the endpoint as long as the request
+ * is incomplete.
+ * * status can be NULL, in which case no status is written upon
+ * completion.
+ * * If request is PSM_MQ_REQINVALID, the function returns
+ * immediately.
+ *
+ * The following error code is returned. Other errors are handled by the PSM
+ * error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The request is complete or the value of request was
+ * PSM_MQ_REQINVALID.
+ *
+ */
+psm_error_t
+psm_mq_wait(psm_mq_req_t *request, psm_mq_status_t *status);
+
+/* Test if a non-blocking request is complete
+ *
+ * Function to test requests created from either preposted receive buffers or
+ * non-blocking sends for completion. Unlike psm_mq_wait, this function
+ * tests request for completion and @e never ensures progress directly or
+ * indirectly. It is up to the user to employ some of the progress functions
+ * described in mq_progress to ensure progress if the user chooses to
+ * exclusively test requests for completion.
+ *
+ * Testing a request for completion @e never internally ensures progress in
+ * order to be useful to construct higher-level completion tests over arrays to
+ * test some, all or any request that has completed. For testing arrays of
+ * requests, it is preferable for performance reasons to only ensure progress
+ * once before testing a set of requests for completion.
+ *
+ * [in,out] request MQ non-blocking request
+ * [out] status Updated if non-NULL and the request successfully
+ * completes
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
+ *
+ * [pre] The user has ensured progress on the Matched Queue if @ref
+ * psm_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * [post] If the request is complete, the request is assigned the value @ref
+ * PSM_MQ_REQINVALID and all associated MQ request storage is released
+ * back to the MQ library. If the request is incomplete, the contents of
+ * request is unchanged.
+ *
+ * [post] The user will ensure progress on the Matched Queue if @ref
+ * psm_mq_test is exclusively used for guaranteeing request completions.
+ *
+ * The following two errors are always returned. Other errors are handled by
+ * the PSM error handler (psm_error_register_handler).
+ *
+ * [retval] PSM_OK The request is complete and request is set to @ref
+ * PSM_MQ_REQINVALID or the value of request was PSM_MQ_REQINVALID
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The request is not complete and request is
+ * unchanged.
+ *
+ * @verbatim
+ * // Function that returns the first completed request in an array
+ * // of requests.
+ * void *
+ * user_testany(psm_mq_t mq, psm_mq_req_t *allreqs, int nreqs)
+ * {
+ * int i;
+ * void *context = NULL;
+ *
+ * // Ensure progress only once
+ * psm_poll(mq);
+ *
+ * // Test for at least one completion and return its context
+ * psm_mq_status_t stat;
+ * for (i = 0; i < nreqs; i++) {
+ * if (psm_mq_test(&allreqs[i], &stat) == PSM_OK) {
+ * context = stat.context;
+ * break;
+ * }
+ * }
+ * return context;
+ * }
+ * @endverbatim
+ */
+psm_error_t
+psm_mq_test(psm_mq_req_t *request, psm_mq_status_t *status);
+
+/* Cancel a preposted request
+ *
+ * Function to cancel a preposted receive request returned by @ref
+ * psm_mq_irecv. It is currently illegal to cancel a send request initiated
+ * with psm_mq_isend.
+ *
+ * [pre] The user has obtained a valid MQ request by calling psm_mq_isend
+ * or psm_mq_irecv and passes a pointer to enough storage to write
+ * the output of a psm_mq_status_t or NULL if status is to be
+ * ignored.
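+ *
+ * (Added note) The usual pattern is to try the cancel and then retire
+ * the request either way; sketch only, req assumed to hold a posted
+ * receive:
+ *
+ * @verbatim
+ * if (psm_mq_cancel(&req) == PSM_OK)
+ *     psm_mq_test(&req, NULL); // cancelled; reclaim request storage
+ * else
+ *     psm_mq_wait(&req, NULL); // already matched; let it complete
+ * @endverbatim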
+ *
+ * [post] Whether the cancel is successful or not, the user returns the
+ * request to the library by way of psm_mq_test or @ref
+ * psm_mq_wait.
+ *
+ * Only the two following errors can be returned directly, without being
+ * handled by the error handler (psm_error_register_handler):
+ *
+ * [retval] PSM_OK The request could be successfully cancelled such that the
+ * preposted receive buffer could be removed from the preposted
+ * receive queue before a match occurred. The associated @c
+ * request remains unchanged and the user must still return
+ * the storage to the MQ library.
+ *
+ * [retval] PSM_MQ_NO_COMPLETIONS The request could not be successfully cancelled
+ * since the preposted receive buffer has already
+ * matched an incoming message. The request
+ * remains unchanged.
+ *
+ */
+psm_error_t
+psm_mq_cancel(psm_mq_req_t *req);
+
+struct psm_mq_stats {
+ uint64_t rx_user_bytes;/* Bytes received into a matched user buffer */
+ uint64_t rx_user_num; /* Messages received into a matched user buffer */
+ uint64_t rx_sys_bytes; /* Bytes received into an unmatched system buffer */
+ uint64_t rx_sys_num; /* Messages received into an unmatched system buffer */
+
+ uint64_t tx_num; /* Total Messages transmitted (shm and ipath) */
+ uint64_t tx_eager_num; /* Messages transmitted eagerly */
+ uint64_t tx_eager_bytes; /* Bytes transmitted eagerly */
+ uint64_t tx_rndv_num; /* Messages transmitted using expected TID mechanism */
+ uint64_t tx_rndv_bytes; /* Bytes transmitted using expected TID mechanism */
+ uint64_t tx_shm_num; /* Messages transmitted (shm only) */
+ uint64_t rx_shm_num; /* Messages received through shm */
+
+ uint64_t rx_sysbuf_num; /* Number of system buffers allocated */
+ uint64_t rx_sysbuf_bytes; /* Bytes allocated for system buffers */
+
+ uint64_t _reserved[16]; /* Internally reserved for future use */
+};
+
+#define PSM_MQ_NUM_STATS 13 /* How many stats are currently used in psm_mq_stats */
+
+typedef struct psm_mq_stats psm_mq_stats_t;
+
+/* Retrieve statistics from an instantiated MQ */
+void
+psm_mq_get_stats(psm_mq_t mq, psm_mq_stats_t *stats);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/psm_mq_internal.h b/psm_mq_internal.h
new file mode 100644
index 0000000..7c0f645
--- /dev/null
+++ b/psm_mq_internal.h
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MQ_INT_H +#define MQ_INT_H + +#include "psm_user.h" + +#define MM_FLAG_NONE 0 +#define MM_FLAG_TRANSIENT 0x1 +#define MM_NUM_OF_POOLS 7 + +typedef struct _mem_block_ctrl mem_block_ctrl; +typedef struct _mem_ctrl mem_ctrl; + +struct _mem_ctrl { + mem_block_ctrl *free_list; + uint32_t total_alloc; + uint32_t current_available; + uint32_t block_size; + uint32_t flags; + uint32_t replenishing_rate; +}; + +struct _mem_block_ctrl { + union { + mem_ctrl *mem_handler; + mem_block_ctrl *next; + }; + char _redzone[PSM_VALGRIND_REDZONE_SZ]; +}; + +typedef psm_error_t (*psm_mq_unexpected_callback_fn_t) + (psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t send_msglen, + const void *payload, uint32_t paylen); + +struct psm_mq { + psm_ep_t ep; /**> ep back pointer */ + mpool_t sreq_pool; + mpool_t rreq_pool; + + psm_mq_unexpected_callback_fn_t unexpected_callback; + struct mqsq expected_q; /**> Preposted (expected) queue */ + struct mqsq unexpected_q; /**> Unexpected queue */ + struct mqq completed_q; /**> Completed queue */ + + uint64_t cur_sysbuf_bytes; + uint64_t max_sysbuf_bytes; + uint32_t ipath_thresh_rv; + uint32_t shm_thresh_rv; + uint32_t ipath_window_rv; + int memmode; + + psm_mq_stats_t stats; /**> MQ stats, accumulated by each PTL */ + + mem_ctrl handler_index[MM_NUM_OF_POOLS]; + int mem_ctrl_is_init; + uint64_t mem_ctrl_total_bytes; +}; + +#define MQ_IPATH_THRESH_TINY 8 +#define MQ_IPATH_THRESH_EGR_SDMA 34000 +#define MQ_IPATH_THRESH_EGR_SDMA_SQ 8192 + +#define MQE_TYPE_IS_SEND(type) ((type) & MQE_TYPE_SEND) +#define MQE_TYPE_IS_RECV(type) ((type) & MQE_TYPE_RECV) + +#define MQE_TYPE_SEND 0x1000 +#define MQE_TYPE_RECV 0x2000 +#define MQE_TYPE_FLAGMASK 0x0fff +#define MQE_TYPE_WAITING 0x0001 +#define MQE_TYPE_WAITING_PEER 0x0004 +#define MQE_TYPE_EGRLONG 0x0008 + +#define MQ_STATE_COMPLETE 0 +#define MQ_STATE_POSTED 1 +#define MQ_STATE_MATCHED 2 +#define MQ_STATE_UNEXP 3 +#define MQ_STATE_UNEXP_RV 4 +#define MQ_STATE_FREE 5 + +#define MQ_MSG_TINY 1 +#define MQ_MSG_SHORT 2 +#define MQ_MSG_LONG 3 +#define MQ_MSG_RTS 4 +#define MQ_MSG_RTS_EGR 5 +#define MQ_MSG_RTS_WAIT 6 +#define MQ_MSG_DATA 9 +#define MQ_MSG_DATA_BLK 10 +#define MQ_MSG_DATA_REQ 11 +#define MQ_MSG_DATA_REQ_BLK 12 +#define MQ_MSG_CTS_EGR 13 + +#define MQ_MSG_USER_FIRST 64 + +/* + * Descriptor allocation limits. 
 * The 'LIMITS' predefines fill in a psmi_rlimit_mpool structure + */ +#define MQ_SENDREQ_LIMITS { \ + .env = "PSM_MQ_SENDREQS_MAX", \ + .descr = "Max num of isend requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +#define MQ_RECVREQ_LIMITS { \ + .env = "PSM_MQ_RECVREQS_MAX", \ + .descr = "Max num of irecv requests in flight", \ + .env_level = PSMI_ENVVAR_LEVEL_USER, \ + .minval = 1, \ + .maxval = ~0, \ + .mode[PSMI_MEMMODE_NORMAL] = { 1024, 1048576 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1024, 65536 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 8192, 16777216 } \ + } + +typedef psm_error_t (*mq_rts_callback_fn_t)(psm_mq_req_t req, int was_posted); +typedef psm_error_t (*mq_testwait_callback_fn_t)(psm_mq_req_t *req, int istest, + psm_mq_status_t *status); + +/* receive mq_req, the default */ +struct psm_mq_req { + struct { + psm_mq_req_t next; + psm_mq_req_t *pprev; /* used in completion queue */ + }; + uint32_t state; + uint32_t type; + psm_mq_t mq; + + /* Tag matching vars */ + uint64_t tag; + uint64_t tagsel; /* used for receives */ + + /* Some PTLs want to get notified when there's a test/wait event */ + mq_testwait_callback_fn_t testwait_callback; + + /* Buffer attached to request. May be a system buffer for unexpected + * messages or a user buffer for an expected message */ + uint8_t *buf; + uint32_t buf_len; + uint32_t error_code; + + /* Used only for eager LONGs */ + STAILQ_ENTRY(psm_mq_req) nextq; /* used for egr-long only */ + psmi_egrid_t egrid; + psm_epaddr_t epaddr; + uint16_t msg_seqnum; /* msg seq num for mctxt */ + uint8_t tid_grant[128]; /* don't change the size unless... */ + + uint32_t recv_msglen; /* Message length we are ready to receive */ + uint32_t send_msglen; /* Message length from sender */ + uint32_t recv_msgoff; /* Message offset into buf */ + union { + uint32_t send_msgoff; /* Bytes received so far; can be larger than buf_len */ + uint32_t recv_msgposted; + }; + + /* Used for requests to send messages */ + void *context; /* user context associated with sends or receives */ + + /* Used to keep track of unexpected rendezvous */ + mq_rts_callback_fn_t rts_callback; + psm_epaddr_t rts_peer; + uint32_t rts_reqidx_peer; + uintptr_t rts_sbuf; + + /* PTLs get to store their own per-request data. MQ manages the allocation + * by allocating psm_mq_req so that ptl_req_data has enough space for all + * possible PTLs. + */ + union { + void *ptl_req_ptr; /* when used by ptl as pointer */ + uint8_t ptl_req_data[0]; /* when used by ptl for "inline" data */ + }; +}; + +void psmi_mq_mtucpy(void *vdest, const void *vsrc, uint32_t nchars); + +#if defined(__x86_64__) +void psmi_mq_mtucpy_safe(void *vdest, const void *vsrc, uint32_t nchars); +#else +#define psmi_mq_mtucpy_safe psmi_mq_mtucpy +#endif + +/* + * Optimize for 0-8 byte case, but also handle others.
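 + * The switch below relies on deliberate fall-through: a length of 5..7 + * first copies one 4-byte word and subtracts 4 before dropping into the + * 1..3 cases, which the trailing byte-copy switch finishes; lengths above + * 8 defer to psmi_mq_mtucpy().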
 + */ +PSMI_ALWAYS_INLINE( +void mq_copy_tiny(uint32_t* dest, uint32_t* src, uint8_t len) +) +{ + switch (len) { + case 8: *dest++ = *src++; + case 4: *dest++ = *src++; + case 0: return; + case 7: + case 6: + case 5: *dest++ = *src++; len -= 4; + case 3: + case 2: + case 1: break; + default: /* greater than 8 */ + psmi_mq_mtucpy(dest,src,len); + return; + } + uint8_t* dest1 = (uint8_t*) dest; + uint8_t* src1 = (uint8_t*) src; + switch(len) { + case 3: *dest1++ = *src1++; + case 2: *dest1++ = *src1++; + case 1: *dest1++ = *src1++; + } +} + +/* + * Given a req with buffer ubuf of length ubuf_len, + * fill in the req's status and return the number of bytes the request + * can receive. + * + * The function records truncation errors in the status, much as an + * MPI_Status would. + */ +PSMI_ALWAYS_INLINE( +void mq_status_copy(psm_mq_req_t req, psm_mq_status_t *status)) +{ + status->msg_tag = req->tag; + status->msg_length = req->send_msglen; + status->nbytes = req->recv_msglen; + status->error_code = req->error_code; + status->context = req->context; +} + +PSMI_ALWAYS_INLINE( +uint32_t mq_set_msglen(psm_mq_req_t req, uint32_t recvlen, uint32_t sendlen)) +{ + req->send_msglen = sendlen; + if (recvlen < sendlen) { + req->recv_msglen = recvlen; + req->error_code = PSM_MQ_TRUNCATION; + return recvlen; + } + else { + req->recv_msglen = sendlen; + req->error_code = PSM_OK; + return sendlen; + } +} + +#ifndef PSM_DEBUG + +PSMI_ALWAYS_INLINE( +void +mq_qq_append(struct mqq *q, psm_mq_req_t req)) +{ + req->next = NULL; + req->pprev = q->lastp; + *(q->lastp) = req; + q->lastp = &req->next; +} +#else +#define mq_qq_append(q,req) do { \ + (req)->next = NULL;\ + (req)->pprev = (q)->lastp;\ + *((q)->lastp) = (req); \ + (q)->lastp = &(req)->next; \ + if (q == &(req)->mq->completed_q) \ + _IPATH_VDBG("Moving (req)=%p to completed queue on %s, %d\n", (req), __FILE__, __LINE__); \ +} while (0) +#endif + +PSMI_ALWAYS_INLINE( +void +mq_sq_append(struct mqsq *q, psm_mq_req_t req)) +{ + req->next = NULL; + *(q->lastp) = req; + q->lastp = &req->next; +} + +PSMI_ALWAYS_INLINE( +void +mq_qq_remove(struct mqq *q, psm_mq_req_t req)) +{ + if (req->next != NULL) + req->next->pprev = req->pprev; + else + q->lastp = req->pprev; + *(req->pprev) = req->next; +} + +psm_error_t psmi_mq_req_init(psm_mq_t mq); +psm_error_t psmi_mq_req_fini(psm_mq_t mq); +psm_mq_req_t psmi_mq_req_alloc(psm_mq_t mq, uint32_t type); +#define psmi_mq_req_free(req) psmi_mpool_put(req) + +/* + * MQ unexpected buffer management + */ +void psmi_mq_sysbuf_init(psm_mq_t mq); +void psmi_mq_sysbuf_fini(psm_mq_t mq); +void * psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t nbytes); +void psmi_mq_sysbuf_free(psm_mq_t mq, void *); +void psmi_mq_sysbuf_getinfo(psm_mq_t mq, char *buf, size_t len); + +/* + * Main receive progress engine, for shmops and ipath, in mq.c + */ +psm_error_t psmi_mq_malloc(psm_mq_t *mqo); +psm_error_t psmi_mq_initialize_defaults(psm_mq_t mq); +psm_error_t psmi_mq_free(psm_mq_t mq); + +/* Three functions that handle all MQ stuff */ +#define MQ_RET_MATCH_OK 0 +#define MQ_RET_UNEXP_OK 1 +#define MQ_RET_UNEXP_NO_RESOURCES 2 +#define MQ_RET_DATA_OK 3 +#define MQ_RET_DATA_OUT_OF_ORDER 4 + +int psmi_mq_handle_outoforder_queue(psm_epaddr_t epaddr); +int psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, psmi_egrid_t egrid, uint32_t msglen, + const void *payload, uint32_t paylen); +int psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid,
uint32_t msglen, + const void *payload, uint32_t paylen); +int psmi_mq_handle_data(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t egrid, uint32_t offset, + const void *payload, uint32_t paylen); + +/* If rtsreq is non-NULL, it contains enough information to pull the data from + * the initiator and signal completion at a later time */ +int psmi_mq_handle_rts_outoforder(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, uint16_t msg_seqnum, + mq_rts_callback_fn_t cb, psm_mq_req_t *req_o); +int psmi_mq_handle_rts(psm_mq_t mq, uint64_t tag, uintptr_t send_buf, + uint32_t send_msglen, psm_epaddr_t peer, + mq_rts_callback_fn_t cb, psm_mq_req_t *req_o); +void psmi_mq_handle_rts_complete(psm_mq_req_t req); + +void psmi_mq_stats_register(psm_mq_t mq, mpspawn_stats_add_fn add_fn); + +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_req_match(struct mqsq *q, uint64_t tag, int remove) +) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (!((tag ^ cur->tag) & cur->tagsel)) { /* match! */ + if (remove) { + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + } + return cur; + } + } + return NULL; /* no match */ +} + +PSMI_ALWAYS_INLINE( +psm_mq_req_t +mq_ooo_match(struct mqsq *q, uint16_t msg_seqnum) +) +{ + psm_mq_req_t *curp; + psm_mq_req_t cur; + + for (curp = &q->first; (cur = *curp) != NULL; curp = &cur->next) { + if (cur->msg_seqnum == msg_seqnum) { /* match! */ + if ((*curp = cur->next) == NULL) /* fix tail */ + q->lastp = curp; + cur->next = NULL; + return cur; + } + } + return NULL; /* no match */ +} + +/* Default handler */ +int __fastpath +psmi_mq_handle_envelope_unexpected( + psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen); + +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm_mq_t mq, + psm_mq_unexpected_callback_fn_t fn); + + +PSMI_ALWAYS_INLINE( +int +psmi_mq_handle_tiny_envelope(psm_mq_t mq, psm_epaddr_t epaddr, + uint64_t tag, const void *payload, uint32_t tinylen)) +{ + psm_mq_req_t req; + uint32_t msglen; + int rc; + psmi_assert(epaddr != NULL); + + req = mq_req_match(&(mq->expected_q), tag, 1); + if (req) { /* we have a match */ + req->tag = tag; + msglen = mq_set_msglen(req, req->buf_len, tinylen); + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + _IPATH_VDBG("tiny from=%s match=YES (req=%p) mode=1 mqtag=%llu " + "msglen=%d paylen=%d\n", psmi_epaddr_get_name(epaddr->epid), req, + (unsigned long long) tag, msglen, tinylen); + rc = MQ_RET_MATCH_OK; + } + else { + rc = psmi_mq_handle_envelope_unexpected(mq, MQ_MSG_TINY, epaddr, tag, + (union psmi_egrid) 0U, tinylen, payload, tinylen); + } + return rc; +} + +PSMI_ALWAYS_INLINE( +void +psmi_mq_stats_rts_account(psm_mq_req_t req)) +{ + psm_mq_t mq = req->mq; + if (MQE_TYPE_IS_SEND(req->type)) { + mq->stats.tx_num++; + mq->stats.tx_rndv_num++; + mq->stats.tx_rndv_bytes += req->send_msglen; + } + else { + mq->stats.rx_user_num++; + mq->stats.rx_user_bytes += req->recv_msglen; + } + return; +} + +#endif diff --git a/psm_mq_recv.c b/psm_mq_recv.c new file mode 100644 
index 0000000..13a348d --- /dev/null +++ b/psm_mq_recv.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +#define psmi_mq_handle_egrdata(mq, req, epaddr) \ + do { \ + psm_mq_req_t dreq, treq; \ + dreq = STAILQ_FIRST(&epaddr->mctxt_master->egrdata); \ + while (dreq) { \ + treq = dreq; \ + dreq = STAILQ_NEXT(dreq, nextq); \ + if (treq->egrid.egr_data == req->egrid.egr_data) { \ + psmi_mq_handle_data(req, epaddr, treq->egrid.egr_data, \ + treq->recv_msgoff, treq->buf, treq->recv_msglen); \ + psmi_mq_sysbuf_free(mq, treq->buf); \ + STAILQ_REMOVE(&epaddr->mctxt_master->egrdata, \ + treq, psm_mq_req, nextq); \ + psmi_mq_req_free(treq); \ + } \ + } \ + } while (0) + +static void __recvpath +psmi_mq_req_copy(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t offset, const void *buf, uint32_t nbytes) +{ + // recv_msglen may be changed by unexpected receive buf. 
+ uint32_t msglen_this, end; + uint8_t *msgptr = (uint8_t *)req->buf + offset; + + end = offset + nbytes; + if (end > req->recv_msglen) { + if (offset >= req->recv_msglen) msglen_this = 0; + else msglen_this = req->recv_msglen - offset; + } else { + msglen_this = nbytes; + } + + VALGRIND_MAKE_MEM_DEFINED(msgptr, msglen_this); + psmi_mq_mtucpy(msgptr, buf, msglen_this); + + if (req->recv_msgoff < end) { + req->recv_msgoff = end; + } + req->send_msgoff += nbytes; + return; +} + +int __recvpath +psmi_mq_handle_data(psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t egrid, uint32_t offset, + const void *buf, uint32_t nbytes) +{ + psm_mq_t mq; + int rc; + + if (req == NULL) goto no_req; + + mq = req->mq; + if (req->state == MQ_STATE_MATCHED) + rc = MQ_RET_MATCH_OK; + else { + psmi_assert(req->state == MQ_STATE_UNEXP); + rc = MQ_RET_UNEXP_OK; + } + + psmi_assert(req->egrid.egr_data == egrid); + psmi_mq_req_copy(req, epaddr, offset, buf, nbytes); + + if (req->send_msgoff == req->send_msglen) { + if (req->type & MQE_TYPE_EGRLONG) { + STAILQ_REMOVE(&epaddr->mctxt_master->egrlong, + req, psm_mq_req, nextq); + } + + if (req->state == MQ_STATE_MATCHED) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + } + else { /* MQ_STATE_UNEXP */ + req->state = MQ_STATE_COMPLETE; + } + _IPATH_VDBG("epaddr=%s completed %d byte send, state=%d\n", + psmi_epaddr_get_name(epaddr->epid), + (int)req->send_msglen, req->state); + } + + return rc; + +no_req: + mq = epaddr->ep->mq; + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->egrid.egr_data = egrid; + req->recv_msgoff = offset; + req->recv_msglen = nbytes; + req->buf = psmi_mq_sysbuf_alloc(mq, nbytes); + psmi_mq_mtucpy(req->buf, buf, nbytes); + + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrdata, req, nextq); + + return MQ_RET_UNEXP_OK; +} + +int __recvpath +psmi_mq_handle_rts(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, mq_rts_callback_fn_t cb, + psm_mq_req_t *req_o) +{ + psm_mq_req_t req; + int rc; + + PSMI_PLOCK_ASSERT(); + + req = mq_req_match(&(mq->expected_q), tag, 1); + + if (req) { /* we have a match, no need to callback */ + (void)mq_set_msglen(req, req->buf_len, send_msglen); + req->state = MQ_STATE_MATCHED; + req->tag = tag; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + *req_o = req; /* yes match */ + rc = MQ_RET_MATCH_OK; + } + else { /* No match, keep track of callback */ + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->send_msglen = req->recv_msglen = send_msglen; + req->state = MQ_STATE_UNEXP_RV; + req->tag = tag; + req->rts_callback = cb; + req->recv_msgoff = 0; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + mq_sq_append(&mq->unexpected_q, req); + *req_o = req; /* no match, will callback */ + rc = MQ_RET_UNEXP_OK; + } + + _IPATH_VDBG("from=%s match=%s (req=%p) mqtag=%" PRIx64" recvlen=%d " + "sendlen=%d errcode=%d\n", psmi_epaddr_get_name(peer->epid), + rc == MQ_RET_MATCH_OK ? 
"YES" : "NO", req, req->tag, + req->recv_msglen, req->send_msglen, req->error_code); + return rc; +} + +void +psmi_mq_handle_rts_complete(psm_mq_req_t req) +{ + psm_mq_t mq = req->mq; + + /* Stats on rendez-vous messages */ + psmi_mq_stats_rts_account(req); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); +#ifdef PSM_VALGRIND + if (MQE_TYPE_IS_RECV(req->type)) + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, req->recv_msglen); + else + VALGRIND_MAKE_MEM_DEFINED(req->buf, req->buf_len); +#endif + _IPATH_VDBG("RTS complete, req=%p, recv_msglen = %d\n", + req, req->recv_msglen); + return; +} + +/* Not exposed in public psm, but may extend parts of PSM 2.1 to support + * this feature before 2.3 */ +psm_mq_unexpected_callback_fn_t +psmi_mq_register_unexpected_callback(psm_mq_t mq, + psm_mq_unexpected_callback_fn_t fn) +{ + psm_mq_unexpected_callback_fn_t old_fn = mq->unexpected_callback; + mq->unexpected_callback = fn; + return old_fn; +} + +int __recvpath +psmi_mq_handle_envelope_unexpected( + psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + + /* + * Keep a callback here in case we want to fit some other high-level + * protocols over MQ (i.e. shmem). These protocols would bypass the + * normal mesage handling and go to higher-level message handlers. + */ + if (mode >= MQ_MSG_USER_FIRST && mq->unexpected_callback) { + mq->unexpected_callback(mq,mode,epaddr,tag,send_msglen,payload,paylen); + return MQ_RET_UNEXP_OK; + } + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->tag = tag; + req->recv_msgoff = 0; + req->recv_msglen = req->send_msglen = req->buf_len = msglen = send_msglen; + + _IPATH_VDBG( + "from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64 + " send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, send_msglen); +#if 0 + if (mq->cur_sysbuf_bytes+msglen > mq->max_sysbuf_bytes) { + _IPATH_VDBG("req=%p with len=%d exceeds limit of %llu sysbuf_bytes\n", + req, msglen, (unsigned long long) mq->max_sysbuf_bytes); + return MQ_RET_UNEXP_NO_RESOURCES; + } +#endif + switch (mode) { + case MQ_MSG_TINY: + if (msglen > 0) { + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + } + else + req->buf = NULL; + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_SHORT: + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->send_msgoff = 0; + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + req->state = MQ_STATE_UNEXP; + req->type |= MQE_TYPE_EGRLONG; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + mq_sq_append(&mq->unexpected_q, req); + mq->stats.rx_sys_bytes += msglen; + mq->stats.rx_sys_num++; + + return MQ_RET_UNEXP_OK; +} + +/* + * This handles the regular (i.e. 
 + +/* + * This handles regular (i.e. non-rendezvous) MPI envelopes. + */ +int __recvpath +psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + int rc; + + psmi_assert(epaddr != NULL); + + req = mq_req_match(&(mq->expected_q), tag, 1); + + if (req) { /* we have a match */ + psmi_assert(MQE_TYPE_IS_RECV(req->type)); + req->tag = tag; + msglen = mq_set_msglen(req, req->buf_len, send_msglen); + + _IPATH_VDBG("from=%s match=YES (req=%p) mode=%x mqtag=%" + PRIx64" msglen=%d paylen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, msglen, paylen); + + switch(mode) { + case MQ_MSG_TINY: + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_SHORT: /* message fits in 1 payload */ + PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->state = MQ_STATE_MATCHED; + req->type |= MQE_TYPE_EGRLONG; + req->send_msgoff = req->recv_msgoff = 0; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("exp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + + mq->stats.rx_user_bytes += msglen; + mq->stats.rx_user_num++; + + rc = MQ_RET_MATCH_OK; + if (mode == MQ_MSG_LONG) + return rc; + } + else + rc = psmi_mq_handle_envelope_unexpected(mq, mode, epaddr, tag, + egrid, send_msglen, payload, paylen); + + return rc; +} + +/* + * Note, epaddr is the master.
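 + * Each slave context stamps its messages with msg_seqnum; anything that + * arrives ahead of mctxt_recv_seqnum waits on outoforder_q and is drained + * here, in order, each time the expected sequence number advances.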
+ */ +int __recvpath +psmi_mq_handle_outoforder_queue(psm_epaddr_t epaddr) +{ + psm_mq_t mq = epaddr->ep->mq; + psm_mq_req_t ureq, ereq; + uint32_t msglen; + + next_ooo: + ureq = mq_ooo_match(&epaddr->outoforder_q, epaddr->mctxt_recv_seqnum); + if (ureq == NULL) return 0; + epaddr->mctxt_recv_seqnum++; + epaddr->outoforder_c--; + + ereq = mq_req_match(&(mq->expected_q), ureq->tag, 1); + if (ereq == NULL) { + mq_sq_append(&mq->unexpected_q, ureq); + if (epaddr->outoforder_c) goto next_ooo; + return 0; + } + + psmi_assert(MQE_TYPE_IS_RECV(ereq->type)); + ereq->tag = ureq->tag; + msglen = mq_set_msglen(ereq, ereq->buf_len, ureq->send_msglen); + + switch (ureq->state) { + case MQ_STATE_COMPLETE: + if (ureq->buf != NULL) { /* 0-byte don't alloc a sysbuf */ + psmi_mq_mtucpy(ereq->buf, + (const void *)ureq->buf, msglen); + psmi_mq_sysbuf_free(mq, ureq->buf); + } + ereq->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, ereq); + break; + case MQ_STATE_UNEXP: /* not done yet */ + ereq->type = ureq->type; + ereq->egrid = ureq->egrid; + ereq->epaddr = ureq->epaddr; + ereq->send_msgoff = ureq->send_msgoff; + ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); + psmi_mq_mtucpy(ereq->buf, + (const void *)ureq->buf, ereq->recv_msgoff); + psmi_mq_sysbuf_free(mq, ureq->buf); + ereq->state = MQ_STATE_MATCHED; + STAILQ_INSERT_AFTER(&ureq->epaddr->mctxt_master->egrlong, + ureq, ereq, nextq); + STAILQ_REMOVE(&ureq->epaddr->mctxt_master->egrlong, + ureq, psm_mq_req, nextq); + break; + case MQ_STATE_UNEXP_RV: /* rendez-vous ... */ + ereq->state = MQ_STATE_MATCHED; + ereq->rts_peer = ureq->rts_peer; + ereq->rts_sbuf = ureq->rts_sbuf; + ereq->send_msgoff = 0; + ereq->rts_callback = ureq->rts_callback; + ereq->rts_reqidx_peer = ureq->rts_reqidx_peer; + ereq->type = ureq->type; + ereq->rts_callback(ereq, 0); + break; + default: + fprintf(stderr, "Unexpected state %d in req %p\n", ureq->state, ureq); + fprintf(stderr, "type=%d, mq=%p, tag=%p\n", + ureq->type, ureq->mq, (void *)(uintptr_t)ureq->tag); + abort(); + } + + psmi_mq_req_free(ureq); + if (epaddr->outoforder_c) goto next_ooo; + return 0; +} + +int __recvpath +psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, + const void *payload, uint32_t paylen) +{ + psm_mq_req_t req; + uint32_t msglen; + + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + req->tag = tag; + req->recv_msgoff = 0; + req->recv_msglen = req->send_msglen = req->buf_len = msglen = send_msglen; + + _IPATH_VDBG( + "from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64 + " send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid), + req, mode, tag, send_msglen); + switch (mode) { + case MQ_MSG_TINY: + if (msglen > 0) { + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen); + } + else + req->buf = NULL; + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_SHORT: + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + psmi_mq_mtucpy(req->buf, payload, msglen); + req->state = MQ_STATE_COMPLETE; + break; + + case MQ_MSG_LONG: + req->egrid = egrid; + req->epaddr = epaddr; + req->send_msgoff = 0; + req->buf = psmi_mq_sysbuf_alloc(mq, msglen); + req->state = MQ_STATE_UNEXP; + req->type |= MQE_TYPE_EGRLONG; + STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq); + _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n", + egrid.egr_msgno, msglen, paylen); + if (paylen > 0) + 
psmi_mq_handle_data(req, epaddr, + egrid.egr_data, 0, payload, paylen); + psmi_mq_handle_egrdata(mq, req, epaddr); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Internal error, unknown packet 0x%x", mode); + } + + req->msg_seqnum = msg_seqnum; + mq_sq_append(&epaddr->mctxt_master->outoforder_q, req); + epaddr->mctxt_master->outoforder_c++; + mq->stats.rx_sys_bytes += msglen; + mq->stats.rx_sys_num++; + + return MQ_RET_UNEXP_OK; +} + +int __recvpath +psmi_mq_handle_rts_outoforder(psm_mq_t mq, uint64_t tag, + uintptr_t send_buf, uint32_t send_msglen, + psm_epaddr_t peer, uint16_t msg_seqnum, + mq_rts_callback_fn_t cb, + psm_mq_req_t *req_o) +{ + psm_mq_req_t req; + + PSMI_PLOCK_ASSERT(); + + req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert(req != NULL); + + /* We don't know recv_msglen yet but we set it here for + * mq_iprobe */ + req->send_msglen = req->recv_msglen = send_msglen; + req->state = MQ_STATE_UNEXP_RV; + req->tag = tag; + req->rts_callback = cb; + req->recv_msgoff = 0; + req->send_msgoff = 0; + req->rts_peer = peer; + req->rts_sbuf = send_buf; + req->msg_seqnum = msg_seqnum; + mq_sq_append(&peer->mctxt_master->outoforder_q, req); + peer->mctxt_master->outoforder_c++; + *req_o = req; /* no match, will callback */ + + _IPATH_VDBG("from=%s match=%s (req=%p) mqtag=%" PRIx64" recvlen=%d " + "sendlen=%d errcode=%d\n", psmi_epaddr_get_name(peer->epid), + "NO", req, req->tag, + req->recv_msglen, req->send_msglen, req->error_code); + return MQ_RET_UNEXP_OK; +} + diff --git a/psm_mq_utils.c b/psm_mq_utils.c new file mode 100644 index 0000000..a1a8667 --- /dev/null +++ b/psm_mq_utils.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +/* + * + * MQ request allocator + * + */ + +psm_mq_req_t __sendpath +psmi_mq_req_alloc(psm_mq_t mq, uint32_t type) +{ + psm_mq_req_t req; + + psmi_assert(type == MQE_TYPE_RECV || type == MQE_TYPE_SEND); + + if (type == MQE_TYPE_SEND) + req = psmi_mpool_get(mq->sreq_pool); + else + req = psmi_mpool_get(mq->rreq_pool); + + if_pt (req != NULL) { + /* A while ago there were issues about forgetting to zero-out parts of + * the structure, so this is left in as a debug-time option */ +#ifdef PSM_DEBUG + memset(req, 0, sizeof(struct psm_mq_req)); +#endif + req->type = type; + req->state = MQ_STATE_FREE; + req->next = NULL; + req->pprev = NULL; + req->error_code = PSM_OK; + req->mq = mq; + req->testwait_callback = NULL; + req->rts_peer = NULL; + req->ptl_req_ptr = NULL; + return req; + } + else { /* we're out of reqs */ + int issend = (type == MQE_TYPE_SEND); + uint32_t reqmax, reqchunk; + psmi_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, + &reqchunk, &reqmax); + + psmi_handle_error(PSMI_EP_NORETURN, PSM_PARAM_ERR, + "Exhausted %d MQ %s request descriptors, which usually indicates " + "a user program error or insufficient request descriptors (%s=%d)", + reqmax, issend ? "isend" : "irecv", + issend ? "PSM_MQ_SENDREQS_MAX" : "PSM_MQ_RECVREQS_MAX", reqmax); + return NULL; + } +} + +psm_error_t +psmi_mq_req_init(psm_mq_t mq) +{ + psm_mq_req_t warmup_req; + psm_error_t err = PSM_OK; + + _IPATH_VDBG("mq element sizes are %d bytes\n", + (int) sizeof(struct psm_mq_req)); + + /* + * Send MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_SENDREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + + if ((mq->sreq_pool = psmi_mpool_create(sizeof(struct psm_mq_req), + chunksz, maxsz, 0, DESCRIPTORS, + NULL, NULL)) == NULL) + { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* + * Receive MQ requests + */ + { + struct psmi_rlimit_mpool rlim = MQ_RECVREQ_LIMITS; + uint32_t maxsz, chunksz; + + if ((err = psmi_parse_mpool_env(mq, 0, &rlim, &maxsz, &chunksz))) + goto fail; + + if ((mq->rreq_pool = + psmi_mpool_create(sizeof(struct psm_mq_req), chunksz, maxsz, 0, + DESCRIPTORS, NULL, NULL)) == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* Warm up the allocators */ + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + + warmup_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + psmi_assert_always(warmup_req != NULL); + psmi_mq_req_free(warmup_req); + +fail: + return err; +} + +psm_error_t +psmi_mq_req_fini(psm_mq_t mq) +{ + psmi_mpool_destroy(mq->rreq_pool); + psmi_mpool_destroy(mq->sreq_pool); + return PSM_OK; +} + +/* + * + * System buffer (unexpected message) allocator + * + */ + +#if 0 +/* There's a version with a basic wrapper around malloc, as a backup */ +void * +psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t nbytes) +{ + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += nbytes; + return malloc(nbytes); +} + +void +psmi_mq_sysbuf_free(psm_mq_t mq, void *ptr) +{ + free(ptr); +} + +#else
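 + +/* + * The allocator below keeps MM_NUM_OF_POOLS size-binned free lists (256 + * bytes up to 8192, plus a final catch-all bin). Fixed-size bins are + * replenished in batches of replenishing_rate blocks; the catch-all bin is + * flagged MM_FLAG_TRANSIENT and is simply malloc'd and freed per request. + */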
 + +void psmi_mq_sysbuf_init(psm_mq_t mq) +{ + int i; + uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1}; + uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0}; + + if (mq->mem_ctrl_is_init) + return; + mq->mem_ctrl_is_init = 1; + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + mq->handler_index[i].block_size = block_sizes[i]; + mq->handler_index[i].current_available = 0; + mq->handler_index[i].free_list = NULL; + mq->handler_index[i].total_alloc = 0; + mq->handler_index[i].replenishing_rate = replenishing_rate[i]; + + if (block_sizes[i] == -1) { + psmi_assert_always(replenishing_rate[i] == 0); + mq->handler_index[i].flags = MM_FLAG_TRANSIENT; + } + else { + psmi_assert_always(replenishing_rate[i] > 0); + mq->handler_index[i].flags = MM_FLAG_NONE; + } + } + + VALGRIND_CREATE_MEMPOOL(mq, PSM_VALGRIND_REDZONE_SZ, + PSM_VALGRIND_MEM_UNDEFINED); + + /* Hit once on each block size so we have a pool that's allocated */ + for (i=0; i < MM_NUM_OF_POOLS; i++) { + void *ptr; + if (block_sizes[i] == -1) + continue; + ptr = psmi_mq_sysbuf_alloc(mq, block_sizes[i]); + psmi_mq_sysbuf_free(mq, ptr); + } +} + +void +psmi_mq_sysbuf_fini(psm_mq_t mq) // free all buffers that are currently unused +{ + mem_block_ctrl *block; + int i; + + if (mq->mem_ctrl_is_init == 0) + return; + + VALGRIND_DESTROY_MEMPOOL(mq); + + for (i=0; i < MM_NUM_OF_POOLS; i++) { + while ((block = mq->handler_index[i].free_list) != NULL) { + mq->handler_index[i].free_list = block->next; + psmi_free(block); + } + } + mq->mem_ctrl_is_init = 0; +} + +void +psmi_mq_sysbuf_getinfo(psm_mq_t mq, char *buf, size_t len) +{ + snprintf(buf, len-1, "Sysbuf consumption: %"PRIu64" bytes\n", + mq->mem_ctrl_total_bytes); + buf[len-1] = '\0'; + return; +}
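 + +/* + * Callers receive a pointer just past a mem_block_ctrl header. While a + * block is out, the header records its owning pool; while it sits on a + * free list, the same union field links to the next free block. This is + * how psmi_mq_sysbuf_free() recovers the right pool with plain pointer + * arithmetic. + */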
 + +void * +psmi_mq_sysbuf_alloc(psm_mq_t mq, uint32_t alloc_size) +{ + mem_ctrl *mm_handler = mq->handler_index; + mem_block_ctrl *new_block; + int replenishing; + + /* There is a timing race with ips initialization, fix later. + * XXX */ + if (!mq->mem_ctrl_is_init) + psmi_mq_sysbuf_init(mq); + + mq->stats.rx_sysbuf_num++; + mq->stats.rx_sysbuf_bytes += alloc_size; + + while (mm_handler->block_size < alloc_size) + mm_handler++; + + replenishing = mm_handler->replenishing_rate; + + if (mm_handler->current_available == 0) { // allocate more buffers + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + uint32_t newsz = alloc_size + sizeof(mem_block_ctrl) + + PSM_VALGRIND_REDZONE_SZ; + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + + if (new_block) { + new_block->mem_handler = mm_handler; + new_block++; + mm_handler->total_alloc++; + mq->mem_ctrl_total_bytes += newsz; + VALGRIND_MEMPOOL_ALLOC(mq, new_block, alloc_size); + } + return new_block; + } + + do { + uint32_t newsz = mm_handler->block_size + sizeof(mem_block_ctrl) + + PSM_VALGRIND_REDZONE_SZ; + + new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); + mq->mem_ctrl_total_bytes += newsz; + + if (new_block) { + mm_handler->current_available++; + mm_handler->total_alloc++; + + new_block->next = mm_handler->free_list; + mm_handler->free_list = new_block; + } + + } while (--replenishing && new_block); + } + + if (mm_handler->current_available) { + mm_handler->current_available--; + + new_block = mm_handler->free_list; + mm_handler->free_list = new_block->next; + + new_block->mem_handler = mm_handler; + new_block++; + + VALGRIND_MEMPOOL_ALLOC(mq, new_block, mm_handler->block_size); + return new_block; + } + + return NULL; +} + +void psmi_mq_sysbuf_free(psm_mq_t mq, void * mem_to_free) +{ + mem_block_ctrl * block_to_free; + mem_ctrl *mm_handler; + + psmi_assert_always(mq->mem_ctrl_is_init); + + block_to_free = (mem_block_ctrl *)mem_to_free - 1; + mm_handler = block_to_free->mem_handler; + + VALGRIND_MEMPOOL_FREE(mq, mem_to_free); + + if (mm_handler->flags & MM_FLAG_TRANSIENT) { + psmi_free(block_to_free); + } else { + block_to_free->next = mm_handler->free_list; + mm_handler->free_list = block_to_free; + + mm_handler->current_available++; + } + + return; +} +#endif + +/* + * Hooks to plug into QLogic MPI stats + */ + +static +void psmi_mq_stats_callback(struct mpspawn_stats_req_args *args) +{ + uint64_t *entry = args->stats; + psm_mq_t mq = (psm_mq_t) args->context; + psm_mq_stats_t mqstats; + + psm_mq_get_stats(mq, &mqstats); + + if (args->num < 8) + return; + + entry[0] = mqstats.tx_eager_num; + entry[1] = mqstats.tx_eager_bytes; + entry[2] = mqstats.tx_rndv_num; + entry[3] = mqstats.tx_rndv_bytes; + + entry[4] = mqstats.rx_user_num; + entry[5] = mqstats.rx_user_bytes; + entry[6] = mqstats.rx_sys_num; + entry[7] = mqstats.rx_sys_bytes; +} + +void +psmi_mq_stats_register(psm_mq_t mq, mpspawn_stats_add_fn add_fn) +{ + char *desc[8]; + uint16_t flags[8]; + int i; + struct mpspawn_stats_add_args mp_add; + /* + * Hardcode flags until we correctly move mpspawn to its own repo. + * flags[i] = MPSPAWN_REDUCTION_MAX | MPSPAWN_REDUCTION_MIN; + */ + for (i = 0; i < 8; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL; + + desc[0] = "Eager count sent"; + desc[1] = "Eager bytes sent"; + desc[2] = "Rendezvous count sent"; + desc[3] = "Rendezvous bytes sent"; + desc[4] = "Expected count received"; + desc[5] = "Expected bytes received"; + desc[6] = "Unexpected count received"; + desc[7] = "Unexpected bytes received"; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = 8; + mp_add.header = "MPI Statistics Summary (max,min @ rank)"; + mp_add.req_fn = psmi_mq_stats_callback; + mp_add.desc = desc; + mp_add.flags = flags; + mp_add.context = mq; + + add_fn(&mp_add); +} diff --git a/psm_noship.h b/psm_noship.h new file mode 100644 index 0000000..201af81 --- /dev/null +++ b/psm_noship.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#ifndef _PSM_NOSHIP_H_ +#define _PSM_NOSHIP_H_ + +#include "psm.h" + +typedef struct psm_epinfo { + psm_ep_t ep; + psm_epid_t epid; + psm_uuid_t uuid; + char uuid_str[64]; +} psm_epinfo_t; + +typedef struct psm_epconn { + psm_epaddr_t addr; + psm_ep_t ep; + psm_mq_t mq; +} psm_epconn_t; + +psm_error_t +psm_ep_query (int *num_of_epinfo, psm_epinfo_t *array_of_epinfo); + +psm_error_t +psm_ep_epid_lookup (psm_epid_t epid, psm_epconn_t *epconn); +#endif diff --git a/psm_stats.c b/psm_stats.c new file mode 100644 index 0000000..8b338ae --- /dev/null +++ b/psm_stats.c @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct psmi_stats_type { + STAILQ_ENTRY(psmi_stats_type) next; + struct psmi_stats_entry *entries; + + int num_entries; + void *heading; + uint32_t statstype; + void *context; +}; + +static STAILQ_HEAD(, psmi_stats_type) psmi_stats = + STAILQ_HEAD_INITIALIZER(psmi_stats); + +psm_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries_i, + int num_entries, + void *context) +{ + struct psmi_stats_entry *entries; + struct psmi_stats_type *type; + int i; + psm_error_t err = PSM_OK; + + entries = psmi_calloc(PSMI_EP_NONE, STATS, num_entries, sizeof(struct psmi_stats_entry)); + type = psmi_calloc(PSMI_EP_NONE, STATS, 1, sizeof(struct psmi_stats_type)); + PSMI_CHECKMEM(err, entries); + PSMI_CHECKMEM(err, type); + + type->entries = entries; + type->num_entries = num_entries; + type->statstype = statstype; + type->context = context; + type->heading = (char *) heading; + + for (i = 0; i < num_entries; i++) { + type->entries[i].desc = entries_i[i].desc; + type->entries[i].flags = entries_i[i].flags; + type->entries[i].getfn = entries_i[i].getfn; + type->entries[i].u.val = entries_i[i].u.val; + } + + STAILQ_INSERT_TAIL(&psmi_stats, type, next); + return err; + +fail: + if (entries) psmi_free(entries); + if (type) psmi_free(type); + return err; +} + +psm_error_t +psmi_stats_deregister_all(void) +{ + struct psmi_stats_type *type; + + /* Currently our mpi still reads stats after finalize so this isn't safe + * yet */ + while ((type = STAILQ_FIRST(&psmi_stats)) != NULL) { + STAILQ_REMOVE_HEAD(&psmi_stats, next); + psmi_free(type->entries); + psmi_free(type); + } + + return PSM_OK; +} + +static +uint32_t +typestring_to_type(const char *typestr) +{ + if (strncasecmp(typestr, "all", 4) == 0) + return PSMI_STATSTYPE_ALL; + else if (strncasecmp(typestr, "p2p", 4) == 0) + return PSMI_STATSTYPE_P2P; + else if (strncasecmp(typestr, "ipath", 6) == 0) + return PSMI_STATSTYPE_IPATH; + else if (strncasecmp(typestr, "ips", 4) == 0) + return PSMI_STATSTYPE_IPSPROTO; + else if ((strncasecmp(typestr, "intr", 5) == 0) || + (strncasecmp(typestr, "thread", 7) == 0) || + (strncasecmp(typestr, "rcvthread", 10) == 0)) + return PSMI_STATSTYPE_RCVTHREAD; + else if ((strncasecmp(typestr, "mq", 3) == 0) || + (strncasecmp(typestr, "mpi", 4) == 0)) + return PSMI_STATSTYPE_MQ; + else if ((strncasecmp(typestr, "tid", 4) == 0) || + (strncasecmp(typestr, "tids", 5) == 0)) + return PSMI_STATSTYPE_TIDS; + else if ((strncasecmp(typestr, "counter", 8) == 0) || + (strncasecmp(typestr, "counters", 9) == 0)) + return PSMI_STATSTYPE_DEVCOUNTERS; + else if (strncasecmp(typestr, "devstats", 9) == 0) + return PSMI_STATSTYPE_DEVSTATS; + else if ((strncasecmp(typestr, "memory", 7) == 0) || + (strncasecmp(typestr, "alloc", 6) == 0) || + (strncasecmp(typestr, "malloc", 7) == 0)) + return PSMI_STATSTYPE_MEMORY; + else + return 0; +} + +static +uint32_t +stats_parse_enabled_mask(const char *stats_string) +{ + char *b = (char *) stats_string; + char *e = b; + char buf[128]; + + uint32_t stats_enabled_mask = 0; + + while (*e) { + b = e; + while (*e && *e != ',' && *e != '+' && *e != '.' && + *e != '|' && *e != ':') + e++; + if (e > b) { /* something new to parse */ + int len = ((e - b) > (sizeof buf - 1)) ? 
 + (sizeof buf - 1) : (e - b); + strncpy(buf, b, len); + buf[len] = '\0'; + stats_enabled_mask |= typestring_to_type(buf); + } + if (*e) + e++; /* skip delimiter */ + } + return stats_enabled_mask; +} + +static +void +psmi_stats_mpspawn_callback(struct mpspawn_stats_req_args *args) +{ + const struct psmi_stats_entry *entry; + struct psmi_stats_type *type = + (struct psmi_stats_type *) args->context; + int i, num = args->num; + uint64_t *stats = args->stats; + uint64_t *c = NULL; + uint64_t *s = NULL; + + psmi_assert(num == type->num_entries); + + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS || + type->statstype == PSMI_STATSTYPE_DEVSTATS) + { + int unit_id = ((psm_ep_t) type->context)->unit_id; + int portno = ((psm_ep_t) type->context)->portnum; + uintptr_t off; + uint8_t *p = NULL; + int nc, npc, ns; + int nstats = infinipath_get_stats_names_count(); + int nctrs = infinipath_get_ctrs_unit_names_count(unit_id); + int npctrs = infinipath_get_ctrs_port_names_count(unit_id); + + if (nctrs != -1 && npctrs != -1) + c = psmi_calloc(PSMI_EP_NONE, STATS, nctrs+npctrs, + sizeof(uint64_t)); + if (nstats != -1) + s = psmi_calloc(PSMI_EP_NONE, STATS, nstats, sizeof(uint64_t)); + + /* + * If ipathfs is not loaded, we set NAN everywhere. We don't want + * stats to break just because 1 node didn't have ipath-stats + */ + if (type->statstype == PSMI_STATSTYPE_DEVCOUNTERS && c != NULL) { + nc = infinipath_get_ctrs_unit(unit_id, c, nctrs); + if (nc != -1 && nc == nctrs) + p = (uint8_t *)c; + if (nc == -1) + nc = 0; + npc = infinipath_get_ctrs_port(unit_id, portno, c+nc, npctrs); + if (!p && npc > 0 && npc == npctrs) + p = (uint8_t *)c; + } + else if (s != NULL) { + ns = infinipath_get_stats(s, nstats); + if (ns != -1) + p = (uint8_t *)s; + } + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (p) { + off = (uintptr_t) entry->u.off; + stats[i] = *((uint64_t *)(p + off)); + } + else + stats[i] = MPSPAWN_NAN_U64; + } + } + else if (type->statstype == PSMI_STATSTYPE_MEMORY) { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + stats[i] = *(uint64_t *) ((uintptr_t) &psmi_stats_memory + + (uintptr_t) entry->u.off); + } + } + else { + for (i = 0; i < num; i++) { + entry = &type->entries[i]; + if (entry->getfn != NULL) + stats[i] = entry->getfn(type->context); + else + stats[i] = *entry->u.val; + } + } + + if (c != NULL) + psmi_free(c); + if (s != NULL) + psmi_free(s); +} + +static +void +stats_register_mpspawn_single(mpspawn_stats_add_fn add_fn, + char *heading, + int num_entries, + struct psmi_stats_entry *entries, + mpspawn_stats_req_fn req_fn, + void *context) +{ + int i; + struct mpspawn_stats_add_args mp_add; + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_entries; + mp_add.header = heading; + mp_add.req_fn = req_fn; + mp_add.context = context; + + mp_add.desc = (char **) alloca(sizeof(char *) * num_entries); + psmi_assert_always(mp_add.desc != NULL); + + mp_add.flags = (uint16_t *) alloca(sizeof(uint16_t) * num_entries); + psmi_assert_always(mp_add.flags != NULL); + + for (i = 0; i < num_entries; i++) { + mp_add.desc[i] = (char *) entries[i].desc; + mp_add.flags[i] = entries[i].flags; + } + + /* Ignore return code, doesn't matter to *us* if register failed */ + add_fn(&mp_add); + + return; +} + +static void stats_register_ipath_counters(psm_ep_t ep); +static void stats_register_ipath_stats(psm_ep_t ep); +static void stats_register_mem_stats(psm_ep_t ep); +static psm_error_t psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args);
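 + +/* + * For illustration (the string below is an invented example): a stats-type + * string such as "mq,counters" parses to + * PSMI_STATSTYPE_MQ | PSMI_STATSTYPE_DEVCOUNTERS, while "all" enables every + * category; ',', '+', '.', '|' and ':' are all accepted as delimiters. + */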
 + +/* + * Downcall from QLogic MPI into PSM, so we can register stats + */ +void *psmi_stats_register(struct mpspawn_stats_init_args *args) +{ + struct psmi_stats_type *type; + uint32_t statsmask; + + /* + * Args has a version string in it, but we can ignore it since mpspawn + * will decide if it supports *our* version + */ + + /* + * Eventually, parse the stats_types to add various "flavours" of stats + */ + if (args->stats_types == NULL) + return NULL; + + statsmask = stats_parse_enabled_mask(args->stats_types); + + /* MQ (MPI-level) statistics */ + if (statsmask & PSMI_STATSTYPE_MQ) + psmi_mq_stats_register(args->mq, args->add_fn); + + /* PSM and ipath level statistics */ + if (statsmask & PSMI_STATSTYPE_DEVCOUNTERS) + stats_register_ipath_counters(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_DEVSTATS) + stats_register_ipath_stats(args->mq->ep); + + if (statsmask & PSMI_STATSTYPE_MEMORY) + stats_register_mem_stats(args->mq->ep); + + /* + * At this point all PSM and ipath-level components have registered stats + * with the PSM stats interface. We register with the mpspawn stats + * interface with an upcall in add_fn + */ + STAILQ_FOREACH(type, &psmi_stats, next) + { + if (type->statstype & statsmask) + stats_register_mpspawn_single(args->add_fn, + type->heading, + type->num_entries, + type->entries, + psmi_stats_mpspawn_callback, + type); + } + + /* + * Special handling for per-endpoint statistics + * Only MPI knows what the endpoint-addresses are in the running program, + * PSM has no sense of MPI worlds. In stats register, MPI tells PSM how + * many endpoints it anticipates having, and PSM simply reserves that + * number of stats entries times the number of per-endpoint stats. + */ + if (statsmask & PSMI_STATSTYPE_P2P) + psmi_stats_epaddr_register(args); + + return NULL; +} + +struct stats_epaddr { + psm_ep_t ep; + mpspawn_map_epaddr_fn epaddr_map_fn; + int num_ep; + int num_ep_stats; +}; + +static +void +psmi_stats_epaddr_callback(struct mpspawn_stats_req_args *args) +{ + int i, num, off; + uint64_t *statsp; + struct stats_epaddr *stats_ctx = (struct stats_epaddr *) args->context; + psm_ep_t ep = stats_ctx->ep; + psm_epaddr_t epaddr; + + num = stats_ctx->num_ep * stats_ctx->num_ep_stats; + + /* First always NAN the entire stats request */ + for (i = 0; i < num; i++) { + if (args->flags[i] & MPSPAWN_STATS_TYPE_DOUBLE) + args->stats[i] = MPSPAWN_NAN; + else + args->stats[i] = MPSPAWN_NAN_U64; + } + + for (i = 0; i < stats_ctx->num_ep; i++) { + statsp = args->stats + i*stats_ctx->num_ep_stats; + off = 0; + epaddr = stats_ctx->epaddr_map_fn(i); + if (epaddr == NULL) + continue; + + /* Self */ + if (&ep->ptl_self == epaddr->ptlctl) { + if (ep->ptl_self.epaddr_stats_get != NULL) + off += ep->ptl_self.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_self.epaddr_stats_num != NULL) + off += ep->ptl_self.epaddr_stats_num(); + } + + /* Shm */ + if (&ep->ptl_amsh == epaddr->ptlctl) { + if (ep->ptl_amsh.epaddr_stats_get != NULL) + off += ep->ptl_amsh.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_amsh.epaddr_stats_num != NULL) + off += ep->ptl_amsh.epaddr_stats_num(); + } + + /* ips */ + if (&ep->ptl_ips == epaddr->ptlctl) { + if (ep->ptl_ips.epaddr_stats_get != NULL) + off += ep->ptl_ips.epaddr_stats_get(epaddr, statsp + off); + } + else { + if (ep->ptl_ips.epaddr_stats_num != NULL) + off += ep->ptl_ips.epaddr_stats_num(); + } + } + return; +}
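 + +/* + * Per-endpoint stats form a dense num_ep x num_ep_stats matrix: the value + * for endpoint i, statistic j lands in stats[i * num_ep_stats + j]. With, + * say, 4 endpoints and 3 per-endpoint stats (illustrative numbers only), + * endpoint 2's second statistic is stats[2 * 3 + 1]. + */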
 + +static +psm_error_t +psmi_stats_epaddr_register(struct mpspawn_stats_init_args *args) +{ + int i = 0, j; + int num_ep = args->num_epaddr; + int num_ep_stats = 0; + int nz; + char **desc, **desc_i; + uint16_t *flags, *flags_i; + char *p; + char buf[128]; + psm_ep_t ep; + struct mpspawn_stats_add_args mp_add; + struct stats_epaddr *stats_ctx; + psm_error_t err = PSM_OK; + + if (args->mq == NULL) + return PSM_OK; + ep = args->mq->ep; + + /* Figure out how many stats there are in an endpoint from all devices */ + if (ep->ptl_self.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_self.epaddr_stats_num(); + if (ep->ptl_amsh.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_amsh.epaddr_stats_num(); + if (ep->ptl_ips.epaddr_stats_num != NULL) + num_ep_stats += ep->ptl_ips.epaddr_stats_num(); + + /* Allocate desc and flags and let each device initialize their + * descriptions and flags */ + desc = psmi_malloc(ep, STATS, sizeof(char *) * num_ep_stats * (num_ep+1)); + if (desc == NULL) + return PSM_NO_MEMORY; + flags = psmi_malloc(ep, STATS, sizeof(uint16_t) * num_ep_stats * (num_ep+1)); + if (flags == NULL) { + psmi_free(desc); + return PSM_NO_MEMORY; + } + + /* Get the descriptions/flags from each device */ + i = 0; + i += ep->ptl_self.epaddr_stats_num != NULL ? + ep->ptl_self.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_amsh.epaddr_stats_num != NULL ? + ep->ptl_amsh.epaddr_stats_init(desc + i, flags + i) : 0; + i += ep->ptl_ips.epaddr_stats_num != NULL ? + ep->ptl_ips.epaddr_stats_init(desc + i, flags + i) : 0; + psmi_assert_always(i == num_ep_stats); + + /* + * Clone the descriptions for each endpoint but append "rank %d" to it + * beforehand. + */ + nz = (num_ep < 10 ? 1 : (num_ep < 100 ? 2 : /* cheap log */ + (num_ep < 1000 ? 3 : (num_ep < 10000 ? 4 : + (num_ep < 100000 ? 5 : 6))))); + + desc_i = desc + num_ep_stats; + flags_i = flags + num_ep_stats; + memset(desc_i, 0, sizeof(char*)*num_ep*num_ep_stats); + + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) { + snprintf(buf, sizeof buf - 1, "<%*d> %s", nz, i, desc[j]); + buf[sizeof buf - 1] = '\0'; + p = psmi_strdup(ep, buf); + if (p == NULL) { + err = PSM_NO_MEMORY; + goto clean; + } + desc_i [i * num_ep_stats + j] = p; + flags_i[i * num_ep_stats + j] = flags[j]; + } + } + + mp_add.version = MPSPAWN_STATS_VERSION; + mp_add.num = num_ep_stats * num_ep; + mp_add.header = "Endpoint-to-Endpoint Stats (by <rank>)"; + mp_add.req_fn = psmi_stats_epaddr_callback; + mp_add.desc = desc_i; + mp_add.flags = flags_i; + stats_ctx = psmi_malloc(ep, STATS, sizeof(struct stats_epaddr)); + if (stats_ctx == NULL) { + err = PSM_NO_MEMORY; + goto clean; + } + stats_ctx->ep = ep; + stats_ctx->epaddr_map_fn = args->epaddr_map_fn; + stats_ctx->num_ep = num_ep; + stats_ctx->num_ep_stats = num_ep_stats; + mp_add.context = stats_ctx; + + args->add_fn(&mp_add); + +clean: + /* Now we can free all the descriptions */ + for (i = 0; i < num_ep; i++) { + for (j = 0; j < num_ep_stats; j++) + if (desc_i[i * num_ep_stats + j]) psmi_free(desc_i[i * num_ep_stats + j]); + } + + psmi_free(desc); + psmi_free(flags); + + return err; +}
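 + +/* + * The registrations below export values by offset rather than by pointer: + * entry i records u.off = i * sizeof(uint64_t), and + * psmi_stats_mpspawn_callback() reads the value at that offset within the + * counter or stats array it fetches from infinipath. + */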
infinipath_get_next_name(&cnames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + for (i = nc; i < nc+npc; i++) { + entries[i].desc = infinipath_get_next_name(&pcnames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + psmi_stats_register_type("InfiniPath device counters", + PSMI_STATSTYPE_DEVCOUNTERS, + entries, + nc+npc, + ep); + +bail: + if (cnames != NULL) + psmi_free(cnames); + if (pcnames != NULL) + psmi_free(pcnames); + if (entries != NULL) + psmi_free(entries); + return; +} + +static +void +stats_register_ipath_stats(psm_ep_t ep) +{ + int i, ns; + char *snames = NULL; + struct psmi_stats_entry *entries = NULL; + + ns = infinipath_get_stats_names(&snames); + if (ns == -1 || snames == NULL) + goto bail; + entries = psmi_calloc(ep, STATS, ns, sizeof(struct psmi_stats_entry)); + if (entries == NULL) + goto bail; + + for (i = 0; i < ns; i++) { + entries[i].desc = infinipath_get_next_name(&snames); + entries[i].flags = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + entries[i].getfn = NULL; + entries[i].u.off = i*sizeof(uint64_t); + } + psmi_stats_register_type("InfiniPath device statistics", + PSMI_STATSTYPE_DEVSTATS, + entries, + ns, + ep); + +bail: + if (snames != NULL) + psmi_free(snames); + if (entries != NULL) + psmi_free(entries); + return; +} + +#undef _SDECL +#define _SDECL(_desc, _param) { \ + .desc = _desc, \ + .flags = MPSPAWN_STATS_REDUCTION_ALL \ + | MPSPAWN_STATS_SKIP_IF_ZERO, \ + .getfn = NULL, \ + .u.off = offsetof(struct psmi_stats_malloc, _param) \ + } + +static +void +stats_register_mem_stats(psm_ep_t ep) +{ + struct psmi_stats_entry entries[] = { + _SDECL("Total (current)", m_all_total), + _SDECL("Total (max)", m_all_max), + _SDECL("All Peers (current)", m_perpeer_total), + _SDECL("All Peers (max)", m_perpeer_max), + _SDECL("Network Buffers (current)", m_netbufs_total), + _SDECL("Network Buffers (max)", m_netbufs_max), + _SDECL("PSM descriptors (current)", m_descriptors_total), + _SDECL("PSM descriptors (max)", m_descriptors_max), + _SDECL("Unexp. buffers (current)", m_unexpbufs_total), + _SDECL("Unexp. buffers (max)", m_unexpbufs_max), + _SDECL("Other (current)", m_undefined_total), + _SDECL("Other (max)", m_undefined_max), + }; + + psmi_stats_register_type("PSM memory allocation statistics", + PSMI_STATSTYPE_MEMORY, + entries, + PSMI_STATS_HOWMANY(entries), + ep); +} diff --git a/psm_stats.h b/psm_stats.h new file mode 100644 index 0000000..9baed27 --- /dev/null +++ b/psm_stats.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_stats.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSM_STATS_H +#define _PSM_STATS_H + +#include "mpspawn_stats.h" + +#define PSMI_STATSTYPE_MQ 0x00001 +#define PSMI_STATSTYPE_RCVTHREAD 0x00100 /* num_wakeups, ratio, etc. */ +#define PSMI_STATSTYPE_IPSPROTO 0x00200 /* acks,naks,err_chks */ +#define PSMI_STATSTYPE_TIDS 0x00400 +#define PSMI_STATSTYPE_MEMORY 0x01000 +#define PSMI_STATSTYPE_IPATH (PSMI_STATSTYPE_RCVTHREAD| \ + PSMI_STATSTYPE_IPSPROTO | \ + PSMI_STATSTYPE_MEMORY | \ + PSMI_STATSTYPE_TIDS) +#define PSMI_STATSTYPE_P2P 0x00800 /* ep-to-ep details */ +#define PSMI_STATSTYPE_DEVCOUNTERS 0x10000 +#define PSMI_STATSTYPE_DEVSTATS 0x20000 +#define PSMI_STATSTYPE_ALL 0xfffff +#define _PSMI_STATSTYPE_DEVMASK 0xf0000 + +/* Used to determine how many stats in static array decl. */ +#define PSMI_STATS_HOWMANY(entries) \ + (sizeof(entries)/sizeof(entries[0])) + +#define PSMI_STATS_NO_HEADING NULL + +#define PSMI_STATS_DECL(_desc,_flags,_getfn,_val) \ + { .desc = _desc, \ + .flags = _flags, \ + .getfn = _getfn, \ + .u.val = _val, \ + } + +#define PSMI_STATS_DECLU64(_desc,_val) \ + PSMI_STATS_DECL(_desc, \ + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, \ + NULL, \ + _val) + +struct psmi_stats_entry { + const char *desc; + uint16_t flags; + uint64_t (*getfn)(void *context); /* optional fn ptr to get value */ + union { + uint64_t *val; /* where value is stored if getfn is NULL */ + uint64_t off; /* or offset if that makes more sense */ + } u; +}; + +/* + * Copy the array of entries and keep track of the context + */ +psm_error_t +psmi_stats_register_type(const char *heading, + uint32_t statstype, + const struct psmi_stats_entry *entries, + int num_entries, + void *context); + +psm_error_t +psmi_stats_deregister_all(void); + +#endif /* _PSM_STATS_H */ diff --git a/psm_timer.c b/psm_timer.c new file mode 100644 index 0000000..387abd4 --- /dev/null +++ b/psm_timer.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" + +#define __timerpath __recvpath + +#if PSMI_TIMER_STATS +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) ((ctrl)->num_insertions++) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) ((ctrl)->num_traversals++) +#else +# define PSMI_TIMER_STATS_ADD_INSERTION(ctrl) +# define PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl) +#endif + +psm_error_t +psmi_timer_init(struct psmi_timer_ctrl *ctrl) +{ + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + +#if PSMI_TIMER_STATS + ctrl->num_insertions = 0; + ctrl->num_traversals = 0; +#endif + + TAILQ_INIT(&ctrl->timerq); + return PSM_OK; +} + +void +psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context) +{ + TAILQ_NEXT(t_init, timer) = NULL; + t_init->t_timeout = 0ULL; + t_init->flags = 0; + t_init->expire_callback = expire_fn; + t_init->context = context; + return; +} + +psm_error_t +psmi_timer_fini(struct psmi_timer_ctrl *ctrl) +{ +#if PSMI_TIMER_STATS + if (ctrl->num_insertions > 0) { + _IPATH_INFO("avg elem traversals/insertion = %3.2f %%\n", + 100.0 * (double) ctrl->num_traversals / ctrl->num_insertions); + } +#endif + return PSM_OK; +} + +void __timerpath +psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire) +{ + struct psmi_timer *t_cursor; + + psmi_assert(!(t_insert->flags & PSMI_TIMER_FLAG_PENDING)); + + t_insert->t_timeout = t_cyc_expire; + t_insert->flags |= PSMI_TIMER_FLAG_PENDING; + + /* + * We keep the list from oldest (head) to newest (tail), with the + * assumption that insert and remove occur much more often than search + * (when the timer expires). Newly added timers are more likely to expire + * later rather than sooner, which is why the head is older. 
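+ *
+ * Concretely, timeouts are kept in descending order from head to tail,
+ * e.g. (illustrative values, in cycles):
+ *
+ *   head [ t=900 ] -> [ t=500 ] -> [ t=120 ] tail
+ *
+ * so the soonest timeout is always TAILQ_LAST(), which is what
+ * t_cyc_next_expire caches and where psmi_timer_process_expired()
+ * starts when it walks the queue backwards.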
+ */ + PSMI_TIMER_STATS_ADD_INSERTION(ctrl); + + if (TAILQ_EMPTY(&ctrl->timerq)) { /* Common case */ + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + ctrl->t_cyc_next_expire = t_cyc_expire; + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + return; + } + else if (t_cyc_expire > PSMI_TIMER_PRIO_LAST) { + TAILQ_FOREACH(t_cursor, &ctrl->timerq, timer) { + if (t_cursor->t_timeout <= t_cyc_expire) { + TAILQ_INSERT_BEFORE(t_cursor, t_insert, timer); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + /* Got to the end of the list -- We're the next to expire */ + ctrl->t_cyc_next_expire = t_cyc_expire; + TAILQ_INSERT_TAIL(&ctrl->timerq, t_insert, timer); + return; + } + else { + TAILQ_FOREACH_REVERSE(t_cursor, &ctrl->timerq, timerq, timer) { + if (t_cursor->t_timeout >= t_cyc_expire) { + TAILQ_INSERT_AFTER(&ctrl->timerq, t_cursor, t_insert, timer); + ctrl->t_cyc_next_expire = min(t_cyc_expire, + ctrl->t_cyc_next_expire); + return; + } + PSMI_TIMER_STATS_ADD_TRAVERSAL(ctrl); + } + TAILQ_INSERT_HEAD(&ctrl->timerq, t_insert, timer); + /* No need to check if we inserted last, given first branch case */ + // if (TAILQ_LAST(&ctrl->timerq, timerq) == t_insert) + // ctrl->t_cyc_next_expire = t_cyc_expire; + return; + } + + return; +} + +psm_error_t __timerpath +psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, uint64_t t_cyc_expire) +{ + psm_error_t err = PSM_OK_NO_PROGRESS; + struct psmi_timer *t_cursor = TAILQ_LAST(&ctrl->timerq, timerq); + + while (t_cursor) { + if (t_cursor->t_timeout > t_cyc_expire) + break; + + err = PSM_OK; + psmi_assert(t_cursor->flags & PSMI_TIMER_FLAG_PENDING); + t_cursor->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_cursor, timer); + t_cursor->expire_callback(t_cursor, t_cyc_expire); + t_cursor = TAILQ_PREV(t_cursor, timerq, timer); + } + + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + + return err; +} + +void __timerpath +psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove) +{ + + psmi_assert(t_remove->flags & PSMI_TIMER_FLAG_PENDING); + + t_remove->flags &= ~PSMI_TIMER_FLAG_PENDING; + TAILQ_REMOVE(&ctrl->timerq, t_remove, timer); + + /* + * If we're removing the last entry, we need to reset the + * expiration cycle time. + */ + if (TAILQ_EMPTY(&ctrl->timerq)) + ctrl->t_cyc_next_expire = PSMI_TIMER_INFINITE; + else + ctrl->t_cyc_next_expire = + TAILQ_LAST(&ctrl->timerq, timerq)->t_timeout; + return; +} + + diff --git a/psm_timer.h b/psm_timer.h new file mode 100644 index 0000000..0a35c04 --- /dev/null +++ b/psm_timer.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_IN_USER_H +#error psm_timer.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_TIMER_H +#define _PSMI_TIMER_H + +#include "psm_user.h" + +/* Keep timer stats */ +#define PSMI_TIMER_STATS 0 + +typedef struct psmi_timer psmi_timer; +typedef psm_error_t (*psmi_timer_expire_callback_t)(struct psmi_timer *, uint64_t); + +struct psmi_timer { + TAILQ_ENTRY(psmi_timer) timer; /* opaque */ + uint64_t t_timeout; /* opaque */ + uint8_t flags; /* opaque */ + + psmi_timer_expire_callback_t expire_callback; /* user -- callback fn */ + void *context; /* user -- callback param */ +}; + +struct psmi_timer_ctrl { + uint64_t t_cyc_next_expire; + TAILQ_HEAD(timerq, psmi_timer) timerq; + +#if PSMI_TIMER_STATS + uint64_t num_insertions; + uint64_t num_traversals; +#endif +}; + +/* + * Some events need to be unconditionally enqueued at the beginning of the + * timerq -- they are not timers meant to expire but merely operations that + * need to be delayed. For delayed operations, there are 5 levels of + * priority. + */ +#define PSMI_TIMER_PRIO_0 0ULL +#define PSMI_TIMER_PRIO_1 1ULL +#define PSMI_TIMER_PRIO_2 2ULL +#define PSMI_TIMER_PRIO_3 3ULL +#define PSMI_TIMER_PRIO_4 4ULL +#define PSMI_TIMER_PRIO_LAST PSMI_TIMER_PRIO_4 + +#define PSMI_TIMER_INFINITE 0xFFFFFFFFFFFFFFFFULL +#define PSMI_TIMER_FLAG_PENDING 0x01 + +/* + * Timer control initialization and finalization + */ +psm_error_t psmi_timer_init(struct psmi_timer_ctrl *ctrl); +psm_error_t psmi_timer_fini(struct psmi_timer_ctrl *ctrl); + +/* + * Timer entry initialization (a timer must be initialized before it can be + * added to the timer request queue). + */ + +void psmi_timer_entry_init(struct psmi_timer *t_init, + psmi_timer_expire_callback_t expire_fn, + void *context); + +/* + * Timer requests, conditional (macro) or unconditional + */ +#define psmi_timer_request(ctrl, t_insert, t_cyc) \ + if (!((t_insert)->flags & PSMI_TIMER_FLAG_PENDING)) \ + psmi_timer_request_always((ctrl), (t_insert), (t_cyc)) + +void psmi_timer_request_always(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_insert, + uint64_t t_cyc_expire); + +/* + * Timer cancelations, conditional (macro) only (cancel_inner is internal) + */ +#define psmi_timer_cancel(ctrl, t_remove) \ + if ((t_remove)->flags & PSMI_TIMER_FLAG_PENDING) \ + psmi_timer_cancel_inner(ctrl, t_remove) +void psmi_timer_cancel_inner(struct psmi_timer_ctrl *ctrl, + struct psmi_timer *t_remove); + +/* + * Timer processing, conditional or unconditional. + */ +#define psmi_timer_process_if_expired(ctrl, t_cyc_expire) \ + (((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) ? 
\ + psmi_timer_process_expired(ctrl, t_cyc_expire) : \ + PSM_OK_NO_PROGRESS) + +#define psmi_timer_is_expired(ctrl, t_cyc_expire) \ + ((ctrl)->t_cyc_next_expire <= (t_cyc_expire)) + +psm_error_t psmi_timer_process_expired(struct psmi_timer_ctrl *ctrl, + uint64_t t_cyc_expire); + +#endif /* _PSMI_TIMER_H */ diff --git a/psm_user.h b/psm_user.h new file mode 100644 index 0000000..c9aadcc --- /dev/null +++ b/psm_user.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PSMI_USER_H +#define _PSMI_USER_H + +#include <inttypes.h> +#include <pthread.h> + +#include "psm.h" +#include "psm_mq.h" + +#include "ptl.h" + +#include "ipath_user.h" +#include "ipath_queue.h" +#include "valgrind/valgrind.h" +#include "valgrind/memcheck.h" + +#define _PSMI_IN_USER_H +#include "psm_help.h" +#include "psm_error.h" +#include "psm_context.h" +#include "psm_utils.h" +#include "psm_timer.h" +#include "psm_mpool.h" +#include "psm_ep.h" +#include "psm_lock.h" +#include "psm_stats.h" +#undef _PSMI_IN_USER_H + +#define PSMI_VERNO_MAKE(major,minor) ((((major)&0xff)<<8)|((minor)&0xff)) +#define PSMI_VERNO PSMI_VERNO_MAKE(PSM_VERNO_MAJOR, PSM_VERNO_MINOR) +#define PSMI_VERNO_GET_MAJOR(verno) ( ((verno)>>8) & 0xff ) +#define PSMI_VERNO_GET_MINOR(verno) ( ((verno)>>0) & 0xff ) + +int psmi_verno_client(); +int psmi_verno_isinteroperable(uint16_t verno); +int psmi_isinitialized(); + +psm_error_t psmi_poll_internal(psm_ep_t ep, int poll_amsh); +psm_error_t psmi_mq_wait_internal(psm_mq_req_t *ireq); + +/* + * Default setting for Receive thread + * + * 0 disables rcvthread by default + * 0x1 enables ips receive thread by default + */ +#define PSMI_RCVTHREAD_FLAGS 0x1 + +/* + * Define one of these below. + * + * Spinlock gives the best performance and makes sense with the progress thread + * only because the progress thread does a "trylock" and then goes back to + * sleep in a poll. + * + * Mutexlock should be used for experimentation while the more useful + * mutexlock-debug should be enabled during development to catch potential + * errors.
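+ *
+ * In all variants the lock brackets a progress pass; a minimal sketch
+ * of the calling pattern (not a verbatim excerpt from the tree):
+ *
+ *   PSMI_PLOCK();
+ *   err = psmi_poll_internal(ep, 1);
+ *   PSMI_PUNLOCK();
+ *
+ * while the receive thread only ever uses PSMI_PLOCK_TRY() and goes
+ * back to sleep in its poll when the lock is already held.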
+ */ +#ifdef PSM_DEBUG + #define PSMI_PLOCK_IS_MUTEXLOCK_DEBUG +#else + #define PSMI_PLOCK_IS_SPINLOCK + //#define PSMI_PLOCK_IS_MUTEXLOCK + //#define PSMI_PLOCK_IS_MUTEXLOCK_DEBUG + //#define PSMI_PLOCK_IS_NOLOCK +#endif + +#ifdef PSMI_PLOCK_IS_SPINLOCK + psmi_spinlock_t psmi_progress_lock; + #define PSMI_PLOCK_INIT() psmi_spin_init(&psmi_progress_lock) + #define PSMI_PLOCK_TRY() psmi_spin_trylock(&psmi_progress_lock) + #define PSMI_PLOCK() psmi_spin_lock(&psmi_progress_lock) + #define PSMI_PUNLOCK() psmi_spin_unlock(&psmi_progress_lock) + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() + #define PSMI_PLOCK_DISABLED 0 +#elif defined(PSMI_PLOCK_IS_MUTEXLOCK_DEBUG) + pthread_mutex_t psmi_progress_lock; + pthread_t psmi_progress_lock_owner; + #define PSMI_PLOCK_NO_OWNER ((pthread_t)(-1)) + + PSMI_ALWAYS_INLINE( + int _psmi_mutex_trylock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner != pthread_self(), curloc); + int ret = pthread_mutex_trylock(&psmi_progress_lock); + if (ret == 0) + psmi_progress_lock_owner = pthread_self(); + return ret; + } + + PSMI_ALWAYS_INLINE( + int _psmi_mutex_lock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner != pthread_self(), curloc); + int ret = pthread_mutex_lock(&psmi_progress_lock); + psmi_assert_always_loc(ret != EDEADLK, curloc); + psmi_progress_lock_owner = pthread_self(); + return ret; + } + + PSMI_ALWAYS_INLINE( + void _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, const char *curloc)) + { + psmi_assert_always_loc(psmi_progress_lock_owner == pthread_self(), curloc); + psmi_progress_lock_owner = PSMI_PLOCK_NO_OWNER; + psmi_assert_always_loc( + pthread_mutex_unlock(&psmi_progress_lock) != EPERM, curloc); + return; + } + #define PSMI_PLOCK_INIT() /* static initialization */ + #define PSMI_PLOCK_TRY() \ + _psmi_mutex_trylock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PLOCK() \ + _psmi_mutex_lock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PUNLOCK() \ + _psmi_mutex_unlock_inner(&psmi_progress_lock, PSMI_CURLOC) + #define PSMI_PLOCK_ASSERT() \ + psmi_assert_always(psmi_progress_lock_owner == pthread_self()); + #define PSMI_PUNLOCK_ASSERT() \ + psmi_assert_always(psmi_progress_lock_owner != pthread_self()); + + #define PSMI_PLOCK_DISABLED 0 +#elif defined (PSMI_PLOCK_IS_MUTEXLOCK) + pthread_mutex_t psmi_progress_lock; + #define PSMI_PLOCK_INIT() /* static initialization */ + #define PSMI_PLOCK_TRY() pthread_mutex_trylock(&psmi_progress_lock) + #define PSMI_PLOCK() pthread_mutex_lock(&psmi_progress_lock) + #define PSMI_PUNLOCK() pthread_mutex_unlock(&psmi_progress_lock) + #define PSMI_PLOCK_DISABLED 0 + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() +#elif defined(PSMI_PLOCK_IS_NOLOCK) + #define PSMI_PLOCK_TRY() 0 /* 0 *only* so progress thread never succeeds */ + #define PSMI_PLOCK() + #define PSMI_PUNLOCK() + #define PSMI_PLOCK_DISABLED 1 + #define PSMI_PLOCK_ASSERT() + #define PSMI_PUNLOCK_ASSERT() +#else + #error No PLOCK lock type declared +#endif + +#define PSMI_PYIELD() \ + do { PSMI_PUNLOCK(); sched_yield(); PSMI_PLOCK(); } while (0) + +#ifdef PSM_PROFILE + void psmi_profile_block() __attribute__ ((weak)); + void psmi_profile_unblock() __attribute__ ((weak)); + void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); + + #define PSMI_PROFILE_BLOCK() psmi_profile_block() + #define PSMI_PROFILE_UNBLOCK() psmi_profile_unblock() + #define PSMI_PROFILE_REBLOCK(noprog)
psmi_profile_reblock(noprog) +#else + #define PSMI_PROFILE_BLOCK() + #define PSMI_PROFILE_UNBLOCK() + #define PSMI_PROFILE_REBLOCK(noprog) +#endif + +#ifdef PSM_VALGRIND + #define PSM_VALGRIND_REDZONE_SZ 8 + #define PSM_VALGRIND_DEFINE_MQ_RECV(buf,posted_len,recv_len) do { \ + VALGRIND_MAKE_MEM_DEFINED((void *)(buf), (posted_len)); \ + if ((recv_len) < (posted_len)) \ + VALGRIND_MAKE_MEM_UNDEFINED( \ + (void *) ((uintptr_t) (buf) + (recv_len)), \ + (posted_len) - (recv_len)); \ + } while (0) + +#else + #define PSM_VALGRIND_REDZONE_SZ 0 + #define PSM_VALGRIND_DEFINE_MQ_RECV(buf,posted_len,recv_len) +#endif + +/* Parameters for use in valgrind's "is_zeroed" */ +#define PSM_VALGRIND_MEM_DEFINED 1 +#define PSM_VALGRIND_MEM_UNDEFINED 0 + +#endif /* _PSMI_USER_H */ diff --git a/psm_utils.c b/psm_utils.c new file mode 100644 index 0000000..c8651fe --- /dev/null +++ b/psm_utils.c @@ -0,0 +1,1278 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <netdb.h> /* gethostbyname */ +#include "psm_user.h" +#include "psm_mq_internal.h" + +int psmi_ep_device_is_enabled(const psm_ep_t ep, int devid); + +struct psmi_epid_table psmi_epid_table; + +/* Iterator to access the epid table.
+ * 'ep' can be NULL if remote endpoints from all endpoint handles are requested + */ +void +psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm_ep_t ep) +{ + itor->i = 0; + itor->ep = ep; + pthread_mutex_lock(&psmi_epid_table.tablock); +} + +void * +psmi_epid_itor_next(struct psmi_eptab_iterator *itor) +{ + int i; + struct psmi_epid_tabentry *e; + + if (itor->i >= psmi_epid_table.tabsize) + return NULL; + for (i = itor->i; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + if (!e->entry || e->entry == EPADDR_DELETED) + continue; + if (itor->ep && e->ep != itor->ep) + continue; + itor->i = i+1; + return e->entry; + } + itor->i = psmi_epid_table.tabsize; /* put at end of table */ + return NULL; +} + +void +psmi_epid_itor_fini(struct psmi_eptab_iterator *itor) +{ + pthread_mutex_unlock(&psmi_epid_table.tablock); + itor->i = 0; +} + +#define mix64(a,b,c) \ +{ \ + a -= b; a -= c; a ^= (c>>43); \ + b -= c; b -= a; b ^= (a<<9); \ + c -= a; c -= b; c ^= (b>>8); \ + a -= b; a -= c; a ^= (c>>38); \ + b -= c; b -= a; b ^= (a<<23); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>35); \ + b -= c; b -= a; b ^= (a<<49); \ + c -= a; c -= b; c ^= (b>>11); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<18); \ + c -= a; c -= b; c ^= (b>>22); \ +} + +psm_error_t +psmi_epid_init() +{ + pthread_mutexattr_t attr; + psmi_epid_table.table = NULL; + psmi_epid_table.tabsize = 0; + psmi_epid_table.tabsize_used = 0; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(&psmi_epid_table.tablock, &attr); + pthread_mutexattr_destroy(&attr); + return PSM_OK; +} + +psm_error_t +psmi_epid_fini() +{ + if (psmi_epid_table.table != NULL) { + psmi_free(psmi_epid_table.table); + psmi_epid_table.table = NULL; + } + psmi_epid_table.tabsize = 0; + psmi_epid_table.tabsize_used = 0; + return PSM_OK; +} + +PSMI_ALWAYS_INLINE( +uint64_t +hash_this(const psm_ep_t ep, const psm_epid_t epid)) +{ + uint64_t ep_i = (uint64_t)(uintptr_t)ep; + uint64_t epid_i = (uint64_t) epid; + uint64_t hash = 0x9e3779b97f4a7c13LL; + mix64(ep_i,epid_i,hash); + return hash; +} + +PSMI_ALWAYS_INLINE( +void * +psmi_epid_lookup_inner(psm_ep_t ep, psm_epid_t epid, int remove)) +{ + uint64_t key = hash_this(ep, epid); + struct psmi_epid_tabentry *e; + void *entry = NULL; + int idx; + + pthread_mutex_lock(&psmi_epid_table.tablock); + if (!psmi_epid_table.table) + goto ret; + idx = (int)(key % psmi_epid_table.tabsize); + while (psmi_epid_table.table[idx].entry != NULL) { + /* An epid can be added twice if there's more than one opened endpoint, + * but really we match on epid *and* on endpoint */ + e = &psmi_epid_table.table[idx]; + if (e->entry != EPADDR_DELETED && e->key == key) + { + entry = e->entry; + if (remove) + psmi_epid_table.table[idx].entry = EPADDR_DELETED; + goto ret; + } + if (++idx == psmi_epid_table.tabsize) + idx = 0; + } +ret: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return entry; +} + +void * +psmi_epid_lookup(psm_ep_t ep, psm_epid_t epid) +{ + void *entry = psmi_epid_lookup_inner(ep, epid, 0); + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("lookup of (%p,%" PRIx64 ") returns %p\n", ep, epid, entry); + return entry; +} + +void * +psmi_epid_remove(psm_ep_t ep, psm_epid_t epid) +{ + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("remove of (%p,%" PRIx64 ")\n", ep, epid); + return psmi_epid_lookup_inner(ep, epid, 1); +} + +psm_error_t +psmi_epid_add(psm_ep_t ep, psm_epid_t epid, void *entry) +{ + uint64_t key; + int idx, i,
newsz; + struct psmi_epid_tabentry *e; + psm_error_t err = PSM_OK; + + if (PSMI_EP_HOSTNAME != ep) + _IPATH_VDBG("add of (%p,%" PRIx64 ") with entry %p\n", ep, epid, entry); + pthread_mutex_lock(&psmi_epid_table.tablock); + /* Leave this here, mostly for sanity and for the fact that the epid + * table is currently not used in the critical path */ + if (++psmi_epid_table.tabsize_used > + (int)(psmi_epid_table.tabsize * PSMI_EPID_TABLOAD_FACTOR)) + { + struct psmi_epid_tabentry *newtab; + newsz = psmi_epid_table.tabsize + PSMI_EPID_TABSIZE_CHUNK; + newtab = (struct psmi_epid_tabentry *) + psmi_calloc(ep, PER_PEER_ENDPOINT, + newsz, sizeof(struct psmi_epid_tabentry)); + if (newtab == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + if (psmi_epid_table.table) { /* rehash the table */ + for (i = 0; i < psmi_epid_table.tabsize; i++) { + e = &psmi_epid_table.table[i]; + if (e->entry == NULL) + continue; + /* When rehashing, mark deleted as free again */ + if (e->entry == EPADDR_DELETED) { + psmi_epid_table.tabsize_used--; + continue; + } + idx = (int)(e->key % newsz); + while (newtab[idx].entry != NULL) + if (++idx == newsz) + idx = 0; + newtab[idx].entry = e->entry; + newtab[idx].key = e->key; + newtab[idx].ep = e->ep; + newtab[idx].epid = e->epid; + } + psmi_free(psmi_epid_table.table); + } + psmi_epid_table.table = newtab; + psmi_epid_table.tabsize = newsz; + } + key = hash_this(ep, epid); + idx = (int)(key % psmi_epid_table.tabsize); + e = &psmi_epid_table.table[idx]; + while (e->entry && e->entry != EPADDR_DELETED) { + if (++idx == psmi_epid_table.tabsize) + idx = 0; + e = &psmi_epid_table.table[idx]; + } + e->entry = entry; + e->key = key; + e->epid = epid; + e->ep = ep; + +fail: + pthread_mutex_unlock(&psmi_epid_table.tablock); + return err; +} + +char * +psmi_gethostname(void) +{ + /* XXX this will need a lock in a multi-threaded environment */ + static char hostname[80] = {'\0'}; + char *c; + + if (hostname[0] == '\0') { + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = '\0'; /* no guarantee of nul termination */ + if ((c = strchr(hostname, '.'))) + *c = '\0'; + } + + return hostname; +} + +/* + * Hostname stuff. We really only register the network portion of the epid + * since all epids from the same nid are assumed to have the same hostname. 
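+ *
+ * For example (hostname purely illustrative), one registration per nid
+ *
+ *   psmi_epid_set_hostname(psm_epid_nid(epid), "node042", 0);
+ *
+ * is enough for psmi_epaddr_get_hostname() and psmi_epaddr_get_name()
+ * to resolve every epid originating from that node.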
+ */ +psm_error_t +psmi_epid_set_hostname(uint64_t nid, const char *hostname, int overwrite) +{ + size_t hlen; + char *h; + psm_error_t err = PSM_OK; + + if (hostname == NULL) + return PSM_OK; + /* First see if a hostname already exists */ + if ((h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid)) != NULL) { + if (!overwrite) + return PSM_OK; + + h = psmi_epid_remove(PSMI_EP_HOSTNAME, nid); + if (h != NULL) /* free the previous hostname if one exists */ + psmi_free(h); + } + + hlen = min(PSMI_EP_HOSTNAME_LEN, strlen(hostname)+1); + h = (char *) psmi_malloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, hlen); + if (h == NULL) + return PSM_NO_MEMORY; + snprintf(h, hlen, "%s", hostname); + h[hlen-1] = '\0'; + err = psmi_epid_add(PSMI_EP_HOSTNAME, nid, h); + return err; +} + +/* XXX These two functions are not thread safe; we'll use a rotating buffer + * trick whenever we need to make them thread safe */ +const char * +psmi_epaddr_get_hostname(psm_epid_t epid) +{ + static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; + static int bufno = 0; + uint64_t nid = psm_epid_nid(epid); + char *h, *hostname; + + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 4; + + /* First, if we have registered a host for this epid, just return that, or + * else try to return something with lid and context */ + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, nid); + if (h != NULL) + return h; + else { + uint64_t lid, context, subcontext; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + snprintf(hostname, PSMI_EP_HOSTNAME_LEN-1, "LID=0x%04x:%d.%d", + (unsigned int) lid, (int) context, (int) subcontext); + hostname[PSMI_EP_HOSTNAME_LEN-1] = '\0'; + return hostname; + } +} + +/* This one gives the hostname with a lid */ +const char * +psmi_epaddr_get_name(psm_epid_t epid) +{ + static char hostnamebufs[4][PSMI_EP_HOSTNAME_LEN]; + static int bufno = 0; + char *h, *hostname; + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hostname = hostnamebufs[bufno]; + bufno = (bufno + 1) % 4; + + h = psmi_epid_lookup(PSMI_EP_HOSTNAME, psm_epid_nid(epid)); + if (h == NULL) + return psmi_epaddr_get_hostname(epid); + else { + snprintf(hostname, PSMI_EP_HOSTNAME_LEN-1, + "%s (LID=0x%04x:%d.%d)", h, + (unsigned int) lid, (int) context, (int) subcontext); + hostname[PSMI_EP_HOSTNAME_LEN-1] = '\0'; + } + return hostname; +} + +/* Wrapper, in case we port to OS xyz that doesn't have sysconf */ +uintptr_t +psmi_getpagesize(void) +{ + static uintptr_t pagesz = (uintptr_t) -1; + long sz; + if (pagesz != (uintptr_t) -1) + return pagesz; + sz = sysconf(_SC_PAGESIZE); + if (sz == -1) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Can't query system page size"); + } + + pagesz = (uintptr_t) sz; + return pagesz; +} + +/* If PSM_VERBOSE_ENV is set in the environment, we determine + * what its verbose level is and print the environment at "INFO" + * level if the environment's level matches the desired printlevel.
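+ *
+ * Summarizing the parsing below:
+ *
+ *   PSM_VERBOSE_ENV=1   print user-level (level 1) variables
+ *   PSM_VERBOSE_ENV=2   also print hidden (level 2) variables
+ *
+ * and any value that fails to parse leaves the verbosity at 0 (off).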
+ */ +static int psmi_getenv_verblevel = -1; +static int +psmi_getenv_is_verblevel(int printlevel) +{ + if (psmi_getenv_verblevel == -1) { + char *env = getenv("PSM_VERBOSE_ENV"); + if (env && *env) { + char *ep; + int val = (int) strtol(env, &ep, 0); + if (ep == env) + psmi_getenv_verblevel = 0; + else if (val == 2) + psmi_getenv_verblevel = 2; + else + psmi_getenv_verblevel = 1; + } + else + psmi_getenv_verblevel = 0; + } + return (printlevel <= psmi_getenv_verblevel); +} + +#define GETENV_PRINTF(_level,_fmt,...) \ + do { \ + int nlevel = _level; \ + if (psmi_getenv_is_verblevel(nlevel)) \ + nlevel = 0; \ + _IPATH_ENVDBG(nlevel,_fmt,##__VA_ARGS__); \ + } while (0) + +int +psmi_getenv(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + int used_default = 0; + union psmi_envvar_val tval; + char *env = getenv(name); + int ishex = (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS || + type == PSMI_ENVVAR_TYPE_UINT_FLAGS); + + /* If we're not using the default, always reset the print + * level to '1' so the changed value gets seen at low + * verbosity */ +#define _GETENV_PRINT(used_default,fmt,val,defval) do { \ + if (used_default) \ + GETENV_PRINTF(level, "%s%-25s %-40s =>%s" #fmt \ + "\n", level>1?"*":" ", name, descr, ishex?" 0x":" ", val); \ + else \ + GETENV_PRINTF(1, "%s%-25s %-40s =>%s" #fmt \ + " (default was%s" #fmt ")\n",level>1?"*":" ", \ + name, descr, ishex?" 0x":" ", val, \ + ishex?" 0x":" ", defval); \ + } while (0) + + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else if (env[0] == 'Y' || env[0] == 'y') + tval.e_int = 1; + else if (env[0] == 'N' || env[0] == 'n') + tval.e_int = 0; + else { + char *ep; + tval.e_ulong = strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + else if (tval.e_ulong != 0) + tval.e_ulong = 1; + } + _GETENV_PRINT(used_default,%s,tval.e_int?"YES":"NO", + defval.e_int?"YES":"NO"); + break; + + case PSMI_ENVVAR_TYPE_STR: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else + tval.e_str = env; + _GETENV_PRINT(used_default,%s,tval.e_str,defval.e_str); + break; + + case PSMI_ENVVAR_TYPE_INT: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_int = (int) strtol(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + _GETENV_PRINT(used_default,%d,tval.e_int,defval.e_int); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_int = (unsigned int) strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) + _GETENV_PRINT(used_default,%x,tval.e_uint,defval.e_uint); + else + _GETENV_PRINT(used_default,%u,tval.e_uint,defval.e_uint); + break; + + case PSMI_ENVVAR_TYPE_LONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_long = strtol(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + _GETENV_PRINT(used_default,%ld,tval.e_long,defval.e_long); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_ulonglong = (unsigned long long) strtoull(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } +
_GETENV_PRINT(used_default,%llu, + tval.e_ulonglong, defval.e_ulonglong); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + if (!env || *env == '\0') { + tval = defval; + used_default = 1; + } + else { + char *ep; + tval.e_ulong = (unsigned long) strtoul(env, &ep, 0); + if (ep == env) { + used_default = 1; + tval = defval; + } + } + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) + _GETENV_PRINT(used_default,%lx,tval.e_ulong,defval.e_ulong); + else + _GETENV_PRINT(used_default,%lu,tval.e_ulong,defval.e_ulong); + break; + } +#undef _GETENV_PRINT + *newval = tval; + + return used_default; +} + +/* + * Parsing int parameters set in string tuples. + * Output array int *vals should be able to store 'ntup' elements. + * Values are only overwritten if they are parsed. + * Tuples are always separated by colons ':' + */ +int psmi_parse_str_tuples(const char *string, int ntup, int *vals) +{ + char *b = (char *) string; + char *e = b; + int tup_i = 0; + int n_parsed = 0; + char *buf = psmi_strdup(NULL, string); + psmi_assert_always(buf != NULL); + + while (*e && tup_i < ntup) { + b = e; + while (*e && *e != ':') + e++; + if (e > b) { /* something to parse */ + char *ep; + int len = e - b; + long int l; + strncpy(buf, b, len); + buf[len] = '\0'; + l = strtol(buf, &ep, 0); + if (ep != buf) { /* successful conversion */ + vals[tup_i] = (int) l; + n_parsed++; + } + } + if (*e == ':') + e++; /* skip delimiter */ + tup_i++; + } + psmi_free(buf); + return n_parsed; +} + +/* + * Memory footprint/usage mode. + * + * This can be used for debug or for separating large installations from + * small/medium ones. The default is to assume a medium installation. Large + * is not that much larger in memory footprint, but we make a conscious effort + * at consuming only the amount of memory we need. + */ +int +psmi_parse_memmode(void) +{ + union psmi_envvar_val env_mmode; + int used_default = + psmi_getenv("PSM_MEMORY", "Memory usage mode (normal or large)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "normal", &env_mmode); + if (used_default || !strcasecmp(env_mmode.e_str, "normal")) + return PSMI_MEMMODE_NORMAL; + else if (!strcasecmp(env_mmode.e_str, "min")) + return PSMI_MEMMODE_MINIMAL; + else if (!strcasecmp(env_mmode.e_str, "large") || + !strcasecmp(env_mmode.e_str, "big")) + return PSMI_MEMMODE_LARGE; + else { + _IPATH_PRDBG("PSM_MEMORY env value %s unrecognized, " + "using 'normal' memory mode instead\n", + env_mmode.e_str); + return PSMI_MEMMODE_NORMAL; + } +} + +static +const char * +psmi_memmode_string(int mode) +{ + psmi_assert(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); + switch (mode) { + case PSMI_MEMMODE_NORMAL: + return "normal"; + case PSMI_MEMMODE_MINIMAL: + return "minimal"; + case PSMI_MEMMODE_LARGE: + return "large"; + default: + return "unknown"; + } +} + +psm_error_t +psmi_parse_mpool_env(const psm_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo) +{ + uint32_t val; + const char *env = rlim->env; + int mode = mq->memmode; + psm_error_t err = PSM_OK; + union psmi_envvar_val env_val; + + psmi_assert_always(mode >= PSMI_MEMMODE_NORMAL && mode < PSMI_MEMMODE_NUM); + + psmi_getenv(rlim->env, rlim->descr, rlim->env_level, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) rlim->mode[mode].obj_max, + &env_val); + + val = env_val.e_uint; + if (val < rlim->minval || val > rlim->maxval) + { + err = psmi_handle_error(NULL, PSM_PARAM_ERR, + "Env.
var %s=%u is invalid (valid settings in mode PSM_MEMORY=%s" + " are inclusively between %u and %u)", env, val, + psmi_memmode_string(mode), rlim->minval, rlim->maxval); + goto fail; + } + + _IPATH_VDBG("%s max=%u,chunk=%u (mode=%s(%u),min=%u,max=%u)\n", + env, val, rlim->mode[mode].obj_chunk, psmi_memmode_string(mode), + mode, rlim->minval, rlim->maxval); + + *valo = val; + *chunkszo = rlim->mode[mode].obj_chunk; + +fail: + return err; +} + +uint64_t +psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns) +{ + if (timeout_ns < 0) + return 0ULL; + else if (timeout_ns == 0ULL || timeout_ns == ~0ULL) + return ~0ULL; + else { + uint64_t t_end = nanosecs_to_cycles(timeout_ns); + uint64_t t_now = get_cycles() - start_cycles; + + if (t_now >= t_end) + return 0ULL; + else + return (t_end - t_now); + } +} + +uint32_t +psmi_get_ipv4addr() +{ + struct hostent *he; + uint32_t addr = 0; + + he = gethostbyname(psmi_gethostname()); + if (he != NULL && he->h_addrtype == AF_INET && he->h_addr != NULL) { + memcpy(&addr, he->h_addr, sizeof(uint32_t)); + return addr; + } + else + return 0; +} + +#define PSMI_EP_IS_PTR(ptr) ((ptr) != NULL && (ptr) < PSMI_EP_LOGEVENT) + +void +psmi_syslog(psm_ep_t ep, int to_console, int level, const char *format, ...) +{ + va_list ap; + + /* If we've never syslogged anything from this ep at the PSM level, make + * sure we log context information */ + if (PSMI_EP_IS_PTR(ep) && !ep->did_syslog) { + char uuid_str[64]; + ep->did_syslog = 1; + + memset(&uuid_str, 0, sizeof uuid_str); + psmi_uuid_unparse(ep->key, uuid_str); + ipath_syslog("PSM", 0, LOG_WARNING, + "uuid_key=%s,unit=%d,context=%d,subcontext=%d", + uuid_str, + ep->context.base_info.spi_unit, + ep->context.base_info.spi_context, + ep->context.base_info.spi_subcontext); + } + + va_start(ap, format); + ipath_vsyslog("PSM", to_console, level, format, ap); + va_end(ap); +} + +/* Table of CRCs of all 8-bit messages. */ +static uint32_t crc_table[256]; + +/* Flag: has the table been computed? Initially false. */ +static int crc_table_computed = 0; + +/* Make the table for a fast CRC. */ +static void make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n; + for (k = 0; k < 8; k++) { + if (c & 1) + c = 0xedb88320 ^ (c >> 1); + else + c = c >> 1; + } + crc_table[n] = c; + } + crc_table_computed = 1; +} + +/* Update a running CRC with the bytes buf[0..len-1]--the CRC + * should be initialized to all 1's, and the transmitted value + * is the 1's complement of the final running CRC (see the + * crc() routine below). + */ + +static uint32_t update_crc(uint32_t crc, unsigned char *buf, int len) +{ + uint32_t c = crc; + int n; + + if_pf (!crc_table_computed) + make_crc_table(); + for (n = 0; n < len; n++) { + c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); + } + return c; +} + +/* Return the CRC of the bytes buf[0..len-1]. */ +uint32_t psmi_crc(unsigned char *buf, int len) +{ + return update_crc(0xffffffff, buf, len) ^ 0xffffffff; +} + +/* Return the HCA type being used for a context */ +uint32_t psmi_get_hca_type(psmi_context_t *context) +{ + uint32_t hca_type; + + /* Determine HCA type.
Use heuristics based on runtime flags + * + * Header suppression available: QLE73XX + * NODMA_RTAIL: QLE72XX + * neither: QLE71XX + */ + + if (context->runtime_flags & IPATH_RUNTIME_HDRSUPP) + hca_type = PSMI_HCA_TYPE_QLE73XX; + else if (context->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) + hca_type = PSMI_HCA_TYPE_QLE72XX; + else + hca_type = PSMI_HCA_TYPE_QLE71XX; + + return hca_type; +} + +#define PSMI_FAULTINJ_SPEC_NAMELEN 32 +struct psmi_faultinj_spec { + STAILQ_ENTRY(psmi_faultinj_spec) next; + char spec_name[PSMI_FAULTINJ_SPEC_NAMELEN]; + + unsigned long long num_faults; + unsigned long long num_calls; + + unsigned int seedp; + int num; + int denom; + +}; + +int psmi_faultinj_enabled = 0; +int psmi_faultinj_verbose = 0; +char *psmi_faultinj_outfile = NULL; + +static struct psmi_faultinj_spec psmi_faultinj_dummy; +static STAILQ_HEAD(, psmi_faultinj_spec) psmi_faultinj_head = + STAILQ_HEAD_INITIALIZER(psmi_faultinj_head); + +void +psmi_faultinj_init() +{ + union psmi_envvar_val env_fi; + + psmi_getenv("PSM_FI", "PSM Fault Injection (yes/no)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_YESNO, + PSMI_ENVVAR_VAL_NO, &env_fi); + + psmi_faultinj_enabled = !!env_fi.e_uint; + + if (psmi_faultinj_enabled) { + char *def = NULL; + if (!psmi_getenv("PSM_FI_TRACEFILE", "PSM Fault Injection output file", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) def, &env_fi)) + { + psmi_faultinj_outfile = psmi_strdup(NULL, env_fi.e_str); + } + } + + return; +} + +void +psmi_faultinj_fini() +{ + struct psmi_faultinj_spec *fi; + FILE *fp; + int do_fclose = 0; + + if (!psmi_faultinj_enabled || psmi_faultinj_outfile == NULL) + return; + + if (strncmp(psmi_faultinj_outfile, "stdout", 7) == 0) + fp = stdout; + else if (strncmp(psmi_faultinj_outfile, "stderr", 7) == 0) + fp = stderr; + else { + char *c = psmi_faultinj_outfile; + char buf[192]; + int append = 0; + if (*c == '+') { + append = 1; + ++c; + } + do_fclose = 1; + snprintf(buf, sizeof buf - 1, "%s.%s", c, __ipath_mylabel); + buf[sizeof buf - 1] = '\0'; + fp = fopen(buf, append ? "a" : "w"); + } + + if (fp != NULL) { + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + fprintf(fp, "%s:%s PSM_FI_%-12s %2.3f%% => " + "%2.3f%% %10lld faults/%10lld events\n", __progname, + __ipath_mylabel, fi->spec_name, + (double) fi->num * 100.0 / fi->denom, + (double) fi->num_faults * 100.0 / fi->num_calls, + fi->num_faults, fi->num_calls); + } + fflush(fp); + if (do_fclose) + fclose(fp); + } + + psmi_free(psmi_faultinj_outfile); + return; +} + +/* + * Intended to be used only once, not in the critical path + */ +struct psmi_faultinj_spec * +psmi_faultinj_getspec(char *spec_name, int num, int denom) +{ + struct psmi_faultinj_spec *fi; + + if (!psmi_faultinj_enabled) + return &psmi_faultinj_dummy; + + STAILQ_FOREACH(fi, &psmi_faultinj_head, next) { + if (strcmp(fi->spec_name, spec_name) == 0) + return fi; + } + + /* We got here, so no spec -- allocate one */ + fi = psmi_malloc(PSMI_EP_NONE, UNDEFINED, sizeof(struct psmi_faultinj_spec)); + strncpy(fi->spec_name, spec_name, PSMI_FAULTINJ_SPEC_NAMELEN-1); + fi->spec_name[PSMI_FAULTINJ_SPEC_NAMELEN-1] = '\0'; + fi->num = num; + fi->denom = denom; + fi->num_faults = 0; + fi->num_calls = 0; + + /* + * See if we get a hint from the environment. + * Format is num:denom:initial_seed. + * + * By default, we choose the initial seed to be the 'pid'. If users need + * repeatability, they should set initial_seed to be the 'pid' when the + * error was observed or force the initial_seed to be a constant number in + * each running process.
Using 'pid' is useful because core dumps store + * pids and our backtrace format does as well so if a crash is observed for + * a specific seed, programs can reuse the 'pid' to regenerate the same + * error condition. + */ + { + int fvals[3] = { num, denom, (int) getpid() }; + union psmi_envvar_val env_fi; + char fvals_str[128]; + char fname[128]; + char fdesc[256]; + + snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom); + fvals_str[sizeof fvals_str - 1] = '\0'; + snprintf(fname, sizeof fname - 1, "PSM_FI_%s", spec_name); + fname[sizeof fname - 1] = '\0'; + snprintf(fdesc, sizeof fdesc - 1, "Fault Injection %s <%s>", + fname, fvals_str); + + if (!psmi_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) fvals_str, + &env_fi)) + { + /* not using default values */ + int n_parsed = psmi_parse_str_tuples(env_fi.e_str, 3, fvals); + if (n_parsed >= 1) + fi->num = fvals[0]; + if (n_parsed >= 2) + fi->denom = fvals[1]; + if (n_parsed >= 3) + fi->seedp = fvals[2]; + } + } + + STAILQ_INSERT_TAIL(&psmi_faultinj_head, fi, next); + return fi; +} + +int +psmi_faultinj_is_fault(struct psmi_faultinj_spec *fi) +{ + int r; + if (!psmi_faultinj_enabled) /* never fault if disabled */ + return 0; + if (fi->num == 0) + return 0; + + fi->num_calls++; + r = rand_r(&fi->seedp); + if (r % fi->denom <= fi->num) { + fi->num_faults++; + return 1; + } + else + return 0; +} + +/* For memory allocation, we kind of break the PSM error handling rules. + * If the caller gets NULL, it has to assume that the error has been handled + * and should always return PSM_NO_MEMORY */ + +/* + * Log memory increments or decrements of type memstats_t. + */ +struct psmi_memtype_hdr { + struct { + uint64_t size : 48; + uint64_t magic : 8; + uint64_t type : 8; + }; +}; + +struct psmi_stats_malloc psmi_stats_memory; + +void +psmi_log_memstats(psmi_memtype_t type, int64_t nbytes) +{ +#define _add_max_total(type,nbytes) \ + psmi_stats_memory.m_ ## type ## _total += (nbytes); \ + psmi_stats_memory.m_ ## type ## _max = max( \ + psmi_stats_memory.m_ ## type ## _total, \ + psmi_stats_memory.m_ ## type ## _max); + + switch (type) { + case PER_PEER_ENDPOINT: + _add_max_total(perpeer, nbytes); + break; + case NETWORK_BUFFERS: + _add_max_total(netbufs, nbytes); + break; + case DESCRIPTORS: + _add_max_total(descriptors, nbytes); + break; + case UNEXPECTED_BUFFERS: + _add_max_total(unexpbufs, nbytes); + break; + case STATS: + _add_max_total(stats, nbytes); + break; + case UNDEFINED: + _add_max_total(undefined, nbytes); + break; + default: + psmi_assert_always(type == TOTAL); + break; + } + _add_max_total(all, nbytes); + psmi_stats_memory.m_all_max++; +#undef _add_max_total + + return; +} + +#define psmi_stats_mask PSMI_STATSTYPE_MEMORY + +#ifdef malloc +#undef malloc +#endif +void * +psmi_malloc_internal(psm_ep_t ep, psmi_memtype_t type, + size_t sz, const char *curloc) +{ + size_t newsz = sz; + void *newa; + + psmi_assert(sizeof(struct psmi_memtype_hdr) == 8); + + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) + newsz += sizeof(struct psmi_memtype_hdr); + + newa = malloc(newsz); + if (newa == NULL) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "Out of memory for malloc at %s", curloc); + return NULL; + } + + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = (struct psmi_memtype_hdr *) newa; + hdr->size = newsz; + hdr->type = type; + hdr->magic = 0x8c; + psmi_log_memstats(type, newsz); + newa = (void *) (hdr + 1); + //_IPATH_INFO("alloc is %p\n", newa); 
+ } + return newa; +} + +#ifdef calloc +#undef calloc +#endif +void * +psmi_calloc_internal(psm_ep_t ep, psmi_memtype_t type, size_t nelem, + size_t elemsz, const char *curloc) +{ + void *newa = psmi_malloc_internal(ep, type, nelem*elemsz, curloc); + if (newa == NULL) /* error handled above */ + return NULL; + memset(newa, 0, nelem*elemsz); + return newa; +} + +#ifdef strdup +#undef strdup +#endif +void * +psmi_strdup_internal(psm_ep_t ep, const char *string, const char *curloc) +{ + size_t len = strlen(string)+1; + void *newa = psmi_malloc_internal(ep, UNDEFINED, len, curloc); + if (newa == NULL) + return NULL; + memcpy(newa, string, len); /* copy with \0 */ + return newa; +} + +#ifdef free +#undef free +#endif + +void +psmi_free_internal(void *ptr) +{ + if_pf (psmi_stats_mask & PSMI_STATSTYPE_MEMORY) { + struct psmi_memtype_hdr *hdr = + (struct psmi_memtype_hdr *) ptr - 1; + //_IPATH_INFO("hdr is %p, ptr is %p\n", hdr, ptr); + psmi_memtype_t type = hdr->type; + int64_t size = hdr->size; + int magic = (int) hdr->magic; + psmi_log_memstats(type, -size); + psmi_assert_always(magic == 0x8c); + ptr = (void *) hdr; + } + free(ptr); +} + +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_coreopt_ctl(const void *core_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm_error_t err = PSM_OK; + char err_string[256]; + + switch(optname) { + case PSM_CORE_OPT_DEBUG: + /* Sanity check length */ + if (*optlen < sizeof(unsigned)) { + snprintf(err_string, 256, "Option value length error"); + *optlen = sizeof(unsigned); + goto fail; + } + + if (get) { + *((unsigned *) optval) = infinipath_debug; + } + else + infinipath_debug = *(unsigned*) optval; + break; + case PSM_CORE_OPT_EP_CTXT: + { + /* core object is epaddr */ + psm_epaddr_t epaddr = (psm_epaddr_t) core_obj; + + /* Sanity check epaddr */ + if (!epaddr) { + snprintf(err_string, 256, "Invalid endpoint address"); + goto fail; + } + + /* Sanity check length */ + if (*optlen < sizeof(unsigned long)) { + snprintf(err_string, 256, "Option value length error"); + *optlen = sizeof(void*); + goto fail; + } + + if (get) { + *((unsigned long*) optval) = (unsigned long) epaddr->usr_ep_ctxt; + } + else + epaddr->usr_ep_ctxt = optval; + } + break; + default: + /* Unknown/unrecognized option */ + snprintf(err_string, 256, "Unknown PSM_CORE option %u.", optname); + goto fail; + } + + + return err; + + fail: + /* Unrecognized/unknown option */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, err_string, "%s"); +} + +psm_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, (void*) optval, &optlen, 0); +} + +psm_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_coreopt_ctl(core_obj, optname, optval, optlen, 1); +} + +/* PSM AM component option handling */ +PSMI_ALWAYS_INLINE( +psm_error_t +psmi_amopt_ctl(const void *am_obj, int optname, + void *optval, uint64_t *optlen, int get)) +{ + psm_error_t err = PSM_OK; + + switch(optname) { + case PSM_AM_OPT_FRAG_SZ: + { + /* AM object is a psm_epaddr (or NULL for global minimum sz) */ + psm_epaddr_t epaddr = (psm_epaddr_t) am_obj; + + if (!get) /* Cannot set this option */ + return psmi_handle_error(NULL, PSM_OPT_READONLY, + "Unable to set PSM_AM_OPT_FRAG_SZ. 
This is " + "a read only option."); + /* Sanity check length */ + if (*optlen < sizeof(uint32_t)) { + *optlen = sizeof(uint32_t); + return err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR, + "Option value length error"); + } + + /* TODO: Currently all AMs occur over IPS which utilizes the PIO flows. + * These are limited to the PIO size of the chip. Once we have AM + * capability over shared memory then we can have different fragment + * sizes over both transport and the global fragment size will need to + * take the minimum of all possible transports used. For now if the + * endpoint is opened get the PIO size from it else hard code it to 2K + * which is "correct" for all supported chips. + */ + *((unsigned *) optval) = + (epaddr && + psmi_ep_device_is_enabled(epaddr->ep, PTL_DEVID_IPS)) ? + (epaddr->ep->context.base_info.spi_piosize - + IPATH_MESSAGE_HDR_SIZE) : 2048; + } + + break; + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown PSM_AM option %u.", optname); + } + + return err; +} + +psm_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen) +{ + return psmi_amopt_ctl(am_obj, optname, (void*) optval, &optlen, 0); +} + +psm_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen) +{ + return psmi_amopt_ctl(am_obj, optname, optval, optlen, 1); +} diff --git a/psm_utils.h b/psm_utils.h new file mode 100644 index 0000000..e6420e0 --- /dev/null +++ b/psm_utils.h @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _PSMI_IN_USER_H +#error psm_utils.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_UTILS_H +#define _PSMI_UTILS_H + +#include <netinet/in.h> /* ipv4addr */ +#include <stdlib.h> /* malloc/free */ + +/* + * Endpoint 'id' hash table, with iterator interface + */ +struct psmi_epid_table { + struct psmi_epid_tabentry *table; + int tabsize; + int tabsize_used; + pthread_mutex_t tablock; +}; +/* + * Endpoint address hash table + */ +struct psmi_epid_tabentry { + void *entry; + uint64_t key; + psm_ep_t ep; + psm_epid_t epid; +}; + +extern struct psmi_epid_table psmi_epid_table; +#define EPADDR_DELETED ((void *)-1) /* tag used to mark deleted entries */ +#define PSMI_EPID_TABSIZE_CHUNK 128 +#define PSMI_EPID_TABLOAD_FACTOR ((float)0.7) + +psm_error_t psmi_epid_init(); +psm_error_t psmi_epid_fini(); +void *psmi_epid_lookup(psm_ep_t ep, psm_epid_t epid); +void *psmi_epid_remove(psm_ep_t ep, psm_epid_t epid); +psm_error_t psmi_epid_add(psm_ep_t ep, psm_epid_t epid, void *entry); +#define PSMI_EP_HOSTNAME ((psm_ep_t) -1) /* Special endpoint handle we use + * to register hostnames */ +#define PSMI_EP_CROSSTALK ((psm_ep_t) -2) /* Second special endpoint handle + * to log which nodes we've seen + * crosstalk from */ +struct psmi_eptab_iterator { + int i; /* last index looked up */ + psm_ep_t ep; +}; +void psmi_epid_itor_init(struct psmi_eptab_iterator *itor, psm_ep_t ep); +void *psmi_epid_itor_next(struct psmi_eptab_iterator *itor); +void psmi_epid_itor_fini(struct psmi_eptab_iterator *itor); + +uint64_t psmi_epid_hca_type(psm_epid_t epid); +uint64_t psmi_epid_sl(psm_epid_t epid); +/* + * Hostname manipulation + */ +#define PSMI_EP_HOSTNAME_LEN 64 /* hostname only */ +#define PSMI_EP_NAME_LEN 96 /* hostname:LID:context:subcontext */ +char * psmi_gethostname(void); +const char * psmi_epaddr_get_hostname(psm_epid_t epid); +const char * psmi_epaddr_get_name(psm_epid_t epid); +psm_error_t psmi_epid_set_hostname(uint64_t nid, const char *hostname, + int overwrite); + +/* + * Memory allocation, use macros only. + * + * In all calls, ep can be a specific endpoint (valid psm_ep_t) or PSMI_EP_NONE + * if no endpoint is available. + * + * psmi_malloc(ep, memtype, size) + * psmi_calloc(ep, memtype, numelems, elemsz) + * psmi_strdup(ep, ptr) + * psmi_free(ptr) + * + */ +typedef enum psmi_memtype { + TOTAL = 0, /* Logged automatically by malloc/calloc */ + UNDEFINED, /* For tracking "other types" of allocations */ + PER_PEER_ENDPOINT, /* For tracking "per peer" allocations */ + NETWORK_BUFFERS, /* For tracking network buffers */ + DESCRIPTORS, /* For tracking send/recv descriptors */ + UNEXPECTED_BUFFERS, /* For tracking unexpected recv buffers */ + STATS, /* For tracking stats-related allocs */ +} +psmi_memtype_t; + +/* + * We track allocation stats. 
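+ *
+ * When the PSMI_STATSTYPE_MEMORY bit is set in psmi_stats_mask,
+ * psmi_malloc_internal() lays a struct psmi_memtype_hdr (type, size,
+ * magic 0x8c) down in front of each allocation and returns the address
+ * just past it; psmi_free_internal() steps back over that header,
+ * checks the magic and logs -size through psmi_log_memstats() before
+ * calling the real free(). Totals accumulate in the per-type counters
+ * below. A minimal sketch of using the macros defined further down:
+ *
+ *   char *buf = psmi_malloc(PSMI_EP_NONE, NETWORK_BUFFERS, 4096);
+ *   if (buf != NULL)
+ *       psmi_free(buf);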
+ */ +struct psmi_stats_malloc { + int64_t m_all_total; + int64_t m_all_max; + int64_t m_perpeer_total; + int64_t m_perpeer_max; + int64_t m_netbufs_total; + int64_t m_netbufs_max; + int64_t m_descriptors_total; + int64_t m_descriptors_max; + int64_t m_unexpbufs_total; + int64_t m_unexpbufs_max; + int64_t m_undefined_total; + int64_t m_undefined_max; + int64_t m_stats_total; + int64_t m_stats_max; +}; + +extern struct psmi_stats_malloc psmi_stats_memory; + +void *psmi_malloc_internal(psm_ep_t ep, psmi_memtype_t mt, size_t sz, + const char *curloc); +void *psmi_calloc_internal(psm_ep_t ep, psmi_memtype_t mt, size_t num, size_t sz, + const char *curloc); +void *psmi_strdup_internal(psm_ep_t ep, const char *string, const char *curloc); +void psmi_free_internal(void *ptr); + +#define psmi_strdup(ep,string) psmi_strdup_internal(ep,string, PSMI_CURLOC) +#define psmi_calloc(ep,mt,nelem,elemsz) \ + psmi_calloc_internal(ep,mt,nelem,elemsz,PSMI_CURLOC) +#define psmi_malloc(ep,mt,sz) psmi_malloc_internal(ep,mt,sz,PSMI_CURLOC) +#define psmi_free(sz) psmi_free_internal(sz) + +#ifndef PSM_IS_TEST +#define malloc(sz) _use_psmi_malloc_instead_of_plain_malloc +#define calloc(sz,nelm) _use_psmi_calloc_instead_of_plain_calloc +#ifdef strdup +#undef strdup +#endif +#define strdup(ptr) _use_psmi_strdup_instead_of_plain_strdup +#define free(ptr) _use_psmi_free_instead_of_plain_free +#endif /* PSM_IS_TEST */ + +void psmi_log_memstats(psmi_memtype_t type, int64_t nbytes); + +/* + * Parsing int parameters set in string tuples. + */ +int psmi_parse_str_tuples(const char *str, int ntup, int *vals); + +/* + * Resource Limiting based on PSM memory mode. + */ +#define PSMI_MEMMODE_NORMAL 0 +#define PSMI_MEMMODE_MINIMAL 1 +#define PSMI_MEMMODE_LARGE 2 +#define PSMI_MEMMODE_NUM 3 + +struct psmi_rlimit_mpool { + const char *env; + const char *descr; + int env_level; + uint32_t minval; + uint32_t maxval; + struct { + uint32_t obj_chunk; + uint32_t obj_max; + } + mode[PSMI_MEMMODE_NUM]; +}; +psm_error_t psmi_parse_mpool_env(const psm_mq_t mq, int level, + const struct psmi_rlimit_mpool *rlim, + uint32_t *valo, uint32_t *chunkszo); +int psmi_parse_memmode(void); + +/* + * Parsing environment variables + */ + +union psmi_envvar_val { + void *e_void; + char *e_str; + int e_int; + unsigned int e_uint; + long e_long; + unsigned long e_ulong; + unsigned long long e_ulonglong; +}; + +#define PSMI_ENVVAR_LEVEL_USER 1 +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 + +#define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) +#define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) + +int psmi_getenv(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval); + +/* + * Misc functionality + */ +uintptr_t psmi_getpagesize(void); +uint64_t psmi_cycles_left(uint64_t start_cycles, int64_t timeout_ns); +uint32_t psmi_get_ipv4addr(); +void psmi_syslog(psm_ep_t ep, int to_console, int level, + const char *format, ...); +void psmi_uuid_unparse(const psm_uuid_t uuid, char *out); +int psmi_uuid_compare(const psm_uuid_t uuA, const psm_uuid_t uuB); +void *psmi_memcpyo(void *dst, const void *src, size_t n); +uint32_t psmi_crc(unsigned char *buf, int len); +uint32_t 
psmi_get_hca_type(psmi_context_t *context); + +/* + * Diagnostics, all in psm_diags.c + */ +int psmi_diags(void); + +/* + * Fault injection + */ +struct psmi_faultinj_spec; +int psmi_faultinj_enabled; /* use macro to test */ +#if 1 /* possible to disable at compile time */ +#define PSMI_FAULTINJ_ENABLED() (!!psmi_faultinj_enabled) +#else +#define PSMI_FAULTINJ_ENABLED() 0 +#endif + +void psmi_faultinj_init(); +void psmi_faultinj_fini(); +struct psmi_faultinj_spec *psmi_faultinj_getspec(char *spec_name, + int num, int denom); +#define PSMI_FAULTINJ_STATIC_DECL(var, spec_name, num, denom) \ + static struct psmi_faultinj_spec *var = NULL; \ + if (PSMI_FAULTINJ_ENABLED() && (var) == NULL) \ + (var) = psmi_faultinj_getspec((spec_name), (num), (denom)); +int psmi_faultinj_is_fault(struct psmi_faultinj_spec *spec); + +/* + * PSM core component set/get options + */ +psm_error_t psmi_core_setopt(const void *core_obj, int optname, + const void *optval, uint64_t optlen); + +psm_error_t psmi_core_getopt(const void *core_obj, int optname, + void *optval, uint64_t *optlen); + +/* + * PSM AM component set/get options + */ +psm_error_t psmi_am_setopt(const void *am_obj, int optname, + const void *optval, uint64_t optlen); + +psm_error_t psmi_am_getopt(const void *am_obj, int optname, + void *optval, uint64_t *optlen); + +#endif /* _PSMI_UTILS_H */ diff --git a/psmd/Makefile b/psmd/Makefile new file mode 100644 index 0000000..c70665c --- /dev/null +++ b/psmd/Makefile @@ -0,0 +1,82 @@ +# +# +# This file is provided under a dual BSD/GPLv2 license. When using or +# redistributing this file, you may do so under either license. +# +# GPL LICENSE SUMMARY +# +# Copyright(c) 2012, 2017. Intel Corporation. +# Copyright(c) 2005, 2006. QLogic Corporation. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# Contact Information: +# Intel Corporation, www.intel.com +# +# BSD LICENSE +# +# Copyright(c) 2012, 2017. Intel Corporation. +# Copyright(c) 2005, 2006. QLogic Corporation. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# + +include $(top_srcdir)/buildflags.mak +CFLAGS += -Wall -Werror -D_IPATH_DEBUGGING=0 +LDFLAGS += $(SCIF_LINK_FLAGS) +INCLUDES += -I$(top_srcdir)/include -I$(top_srcdir)/include/linux-x86_64 $(SCIF_INCLUDE_FLAGS) +TARGETS = psmd + +all: ${TARGETS} + +${TARGETS}-objs := psmd.o ipath_service.o ipath_sysfs.o + +${TARGETS}: ${$(TARGETS)-objs} + $(CC) -o $@ $(CFLAGS) $^ $(LDFLAGS) + +psmd.o: psmd.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_service.o: $(top_srcdir)/ipath/ipath_service.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +ipath_sysfs.o: $(top_srcdir)/ipath/ipath_sysfs.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +install: + install -D psmd ${DESTDIR}${INSTALL_SBIN_TARG}/psmd +clean: + rm -f *.o $(TARGETS) + diff --git a/psmd/psmd.c b/psmd/psmd.c new file mode 100644 index 0000000..1b14e4c --- /dev/null +++ b/psmd/psmd.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// This file contains ipath service routine interface used by the low +// level infinipath protocol code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_service.h" + +#include +#define PSMD_HOST_PORT SCIF_OFED_PORT_7 /* reserved, match psm library */ +#define BACKLOG 10 +scif_epd_t psm_epd = -1; + +static void +psmd_syslog(const char *format, ...) 
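+/* psmd runs as a root daemon with stdin/stdout/stderr redirected to
+ * /dev/null (see main() below), so syslog is its only diagnostics
+ * channel. */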
+{ + va_list ap; + va_start(ap, format); + vsyslog(LOG_ERR|LOG_USER, format, ap); + va_end(ap); +} + +static int +psmd_scif_send(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_send(psm_epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +psmd_scif_recv(void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(psm_epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static void child_handler(int signo) +{ + while (waitpid(-1, NULL, WNOHANG) > 0); +} + +static int +psmd_service(void) +{ + int ret; + struct ipath_cmd cmd; + + while (1) { + ret = psmd_scif_recv(&cmd, sizeof(cmd)); + if (ret) { + //psmd_syslog("get request error\n"); + scif_close(psm_epd); + psm_epd = -1; + return 0; + } + + switch(cmd.type) { + case IPATH_CMD_CONTEXT_OPEN: + { + int fd; + + fd = ipath_context_open(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, cmd.cmd.mic_info.data3); + + cmd.cmd.mic_info.data1 = fd; + if (fd < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + close(fd); + goto err; + } + break; + } + + case IPATH_CMD_CONTEXT_CLOSE: + { + ipath_context_close(cmd.cmd.mic_info.data1); + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_ASSIGN_CONTEXT: + { + int fd; + struct ipath_base_info binfo; + + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + memset(&binfo, 0, sizeof(binfo)); + cmd.cmd.user_info.spu_base_info = (__u64)&binfo; + cmd.cmd.user_info.spu_base_info_size = sizeof(binfo); + ret = ipath_cmd_assign_context(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 >= 0) { + ret = psmd_scif_send(&binfo, sizeof(binfo)); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_USER_INIT: + { + int fd; + struct ipath_base_info binfo; + + ret = psmd_scif_recv(&binfo, sizeof(binfo)); + if (ret) goto err; + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + cmd.cmd.user_info.spu_base_info = (__u64)&binfo; + cmd.cmd.user_info.spu_base_info_size = sizeof(binfo); + ret = ipath_cmd_user_init(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 >= 0) { + ret = psmd_scif_send(&binfo, sizeof(binfo)); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_SET_PART_KEY: + case IPATH_CMD_PIOAVAILUPD: + case IPATH_CMD_ACK_EVENT: + case IPATH_CMD_POLL_TYPE: + + case IPATH_CMD_RECV_CTRL: + case IPATH_CMD_ARMLAUNCH_CTRL: + case IPATH_CMD_DISARM_BUFS: + { + int fd; + + ret = psmd_scif_recv(&fd, sizeof(fd)); + if (ret) goto err; + + ret = ipath_cmd_write(fd, &cmd, sizeof(cmd)); + + cmd.cmd.mic_info.data1 = ret; + if (ret) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_NUM_UNITS: + { + ret = ipath_get_num_units(); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_NUM_CTXTS: + { + ret = ipath_get_num_contexts(cmd.cmd.mic_info.unit); + + 
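/* The reply convention is the same for every query handled below:
+  * the request struct is echoed back with the result in mic_info.data1
+  * and, on failure, errno in mic_info.data2. */
+ 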
cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_LID: + { + ret = ipath_get_port_lid(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_GID: + { + ret = ipath_get_port_gid(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + (uint64_t*)&cmd.cmd.mic_info.data3, + (uint64_t*)&cmd.cmd.mic_info.data4); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_LMC: + { + ret = ipath_get_port_lmc(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_RATE: + { + ret = ipath_get_port_rate(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_PORT_S2V: + { + ret = ipath_get_port_sl2vl(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_STATS_NAMES: + { + char *name = NULL; + + ret = infinipath_get_stats_names(&name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_STATS: + { + uint64_t *s; + + s = malloc(cmd.cmd.mic_info.data1*sizeof(*s)); + if (!s) { + cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_stats(s, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (s) free(s); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(s, cmd.cmd.mic_info.data1*sizeof(*s)); + } + if (s) free(s); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_UNAMES: + { + char *name = NULL; + + ret = infinipath_get_ctrs_unit_names(cmd.cmd.mic_info.unit, &name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_UNIT: + { + uint64_t *c; + + c = malloc(cmd.cmd.mic_info.data1*sizeof(*c)); + if (!c) { + 
cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_ctrs_unit(cmd.cmd.mic_info.unit, + c, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (c) free(c); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(c, cmd.cmd.mic_info.data1*sizeof(*c)); + } + if (c) free(c); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_PNAMES: + { + char *name = NULL; + + ret = infinipath_get_ctrs_port_names(cmd.cmd.mic_info.unit, &name); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } else cmd.cmd.mic_info.data2 = strlen(name); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (name) free(name); + goto err; + } + + /* send the name block only when it exists and the count is greater than zero */ + if (name && cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(name, strlen(name)+1); + } + if (name) free(name); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CTRS_PORT: + { + uint64_t *c; + + c = malloc(cmd.cmd.mic_info.data1*sizeof(*c)); + if (!c) { + cmd.cmd.mic_info.data1 = -1; + cmd.cmd.mic_info.data2 = ENOMEM; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; /* ENOMEM reply sent; never run the query with a NULL buffer */ + } + + ret = infinipath_get_ctrs_port(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, + c, cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret <= 0) { + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + } + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (c) free(c); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(c, cmd.cmd.mic_info.data1*sizeof(*c)); + } + if (c) free(c); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_CC_SETTINGS: + { + char ccabuf[256]; + + ret = ipath_get_cc_settings_bin(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, ccabuf); + + cmd.cmd.mic_info.data1 = ret; + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + + if (cmd.cmd.mic_info.data1 == 1) { + ret = psmd_scif_send(ccabuf, 84); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_GET_CC_TABLE: + { + uint16_t *cct = NULL; + + ret = ipath_get_cc_table_bin(cmd.cmd.mic_info.unit, + cmd.cmd.mic_info.port, &cct); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (cct) free(cct); + goto err; + } + + if (cmd.cmd.mic_info.data1 > 0) { + ret = psmd_scif_send(cct, + (cmd.cmd.mic_info.data1+1)*sizeof(uint16_t)); + } + if (cct) free(cct); + if (ret) goto err; + break; + } + + case IPATH_CMD_WAIT_FOR_PACKET: + { + ret = ipath_cmd_wait_for_packet(cmd.cmd.mic_info.data1); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + case IPATH_CMD_GET_UNIT_FLASH: + { + char *data = NULL; + + ret = infinipath_get_unit_flash(cmd.cmd.mic_info.unit, &data); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + else cmd.cmd.mic_info.data2 = strlen(data); + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) { + if (data) free(data); + goto err; + } + + if (data) { + ret = psmd_scif_send(data, strlen(data)+1); + free(data); + if (ret) goto err; + } + break; + } + + case IPATH_CMD_PUT_UNIT_FLASH: + { + char *data; + int len; + + len = 
cmd.cmd.mic_info.data1; + data = malloc(len + 1); + if (!data) goto err; + + ret = psmd_scif_recv(data, len); + if (ret) { + free(data); + goto err; + } + + ret = infinipath_put_unit_flash(cmd.cmd.mic_info.unit, data, len); + free(data); + + cmd.cmd.mic_info.data1 = ret; + if (ret < 0) cmd.cmd.mic_info.data2 = errno; + + ret = psmd_scif_send(&cmd, sizeof(cmd)); + if (ret) goto err; + break; + } + + default: + goto err; + } /* switch */ + } /* while (1) */ + +err: + psmd_syslog("error, request type = %d", cmd.type); + scif_close(psm_epd); + psm_epd = -1; + return 1; +} + +int +main(int argc, char *argv[]) +{ + uid_t uid; + gid_t gid; + pid_t pid; + scif_epd_t epd; + struct scif_portID portID; + struct sigaction act; + int count; + + /* Only root can run this code */ + if (getuid()) { + fprintf(stderr, "Only root can run psmd\n"); + psmd_syslog("Only root can run psmd"); + exit(1); + } + + /* start to daemonize psmd */ + pid = fork(); + if (pid < 0) { + psmd_syslog("fork() failed with err %d", errno); + exit(1); + } + if (pid > 0) { + exit(0); + } + + /* At this point we are executing as the child process */ + + /* Change the file mode mask */ + umask(0); + + /* Create a new SID for the child process */ + if (setsid() < 0) { + psmd_syslog("setsid() failed with err %d", errno); + exit(1); + } + + /* Change the current working directory.*/ + if ((chdir("/tmp")) < 0) { + psmd_syslog("chdir() failed with err %d", errno); + exit(1); + } + + /* Redirect standard files to /dev/null */ + if (freopen( "/dev/null", "r", stdin) == NULL || + freopen( "/dev/null", "w", stdout) == NULL || + freopen( "/dev/null", "w", stderr) == NULL) { + psmd_syslog("freopen() failed with err %d", errno); + exit(1); + } + + /* Install sigchild handler */ + memset(&act, 0, sizeof act); + act.sa_handler = child_handler; + sigaction(SIGCHLD, &act, NULL); + + /* open end pt */ + if ((epd = scif_open()) < 0) { + psmd_syslog("scif_open() failed with err %d", errno); + exit(1); + } + + /* bind end pt to specified port */ + if (scif_bind(epd, PSMD_HOST_PORT) < 0) { + scif_close(epd); + psmd_syslog("scif_bind() failed with err %d", errno); + exit(1); + } + + /* marks an end pt as listening end pt and queues up a maximum of BACKLOG + * no: of incoming connection requests + */ + if (scif_listen(epd, BACKLOG) != 0) { + scif_close(epd); + psmd_syslog("scif_listen() failed with err %d", errno); + exit(1); + } + + count = 0; + while (1) { + /* accepts a conn request by creating a new end pt that connects to peer */ + if (scif_accept(epd, &portID, &psm_epd, SCIF_ACCEPT_SYNC) < 0) { + if (errno == EINTR) continue; + psmd_syslog("scif_accept() failed with err %d", errno); + count++; + if (count < 20) continue; + scif_close(epd); + exit(1); + } + count = 0; /* not error in row */ + + /* if connection is from host, reject it. 
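+ * SCIF node 0 is the host itself; psmd only serves peers running on
+ * the MIC cards. An accepted peer first sends its uid/gid, and the
+ * forked service child drops to those credentials (setgid, setgroups,
+ * setuid) before executing any ipath commands on the peer's behalf.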
*/ + if (portID.node == 0) { + psmd_syslog("reject connection from host"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + + if (scif_recv(psm_epd, &uid, sizeof(uid), SCIF_RECV_BLOCK) != sizeof(uid)) { + psmd_syslog("cannot get peer uid"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + if (scif_recv(psm_epd, &gid, sizeof(gid), SCIF_RECV_BLOCK) != sizeof(gid)) { + psmd_syslog("cannot get peer gid"); + scif_close(psm_epd); + psm_epd = -1; + continue; + } + + pid = fork(); + if (pid == 0) { + /* need to change gid first */ + if (setgid(gid)) { + psmd_syslog("cannot set peer gid"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + if (setgroups(0, 0)) { + psmd_syslog("cannot setgroups(0,0)"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + if (setuid(uid)) { + psmd_syslog("cannot set peer uid"); + scif_close(psm_epd); + psm_epd = -1; + exit(1); + } + + exit(psmd_service()); + } else { + scif_close(psm_epd); + psm_epd = -1; + } + } + + exit(0); +} diff --git a/ptl.h b/ptl.h new file mode 100644 index 0000000..3aefcab --- /dev/null +++ b/ptl.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Interface implemented by Packet Transport layers such as + * ips and active messages. + * + * This interface can be volatile, it is never seen by PSM clients, and it will + * probably change as the AM ptl is developed. + */ + +#ifndef PSM_PTL_H +#define PSM_PTL_H +#include <inttypes.h> +#include <psm.h> +#include <psm_mq.h> +#include <psm_am.h> + +/* We currently have 3 PTLs, 0 is reserved. 
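+ * Devid 1 is the InfiniPath hardware transport (ptl_ips), devid 2 the
+ * shared-memory active-message transport (ptl_am), and devid 3 the
+ * intra-process loopback transport (ptl_self).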
*/ +#define PTL_DEVID_IPS 1 +#define PTL_DEVID_AMSH 2 +#define PTL_DEVID_SELF 3 + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +struct ptl; +typedef struct ptl ptl_t; + +struct ptl_epaddr; +typedef struct ptl_epaddr ptl_epaddr_t; + +struct ptl_ctl; +typedef struct ptl_ctl ptl_ctl_t; + +struct ptl_mq_req; +typedef struct ptl_mq_req ptl_mq_req_t; + +/* To be filled in statically by all PTLs */ +struct ptl_ctl_init +{ + size_t + (*sizeof_ptl)(void); + + psm_error_t + (*init)(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl); + + psm_error_t + (*fini)(ptl_t *ptl, int force, uint64_t timeout_ns); + + psm_error_t + (*setopt)(const void *component_obj, int optname, + const void *optval, uint64_t optlen); + + psm_error_t + (*getopt)(const void *component_obj, int optname, + void *optval, uint64_t *optlen); +}; + +typedef +struct ptl_arg { + union { + struct { + uint16_t u16w3; + uint16_t u16w2; + uint16_t u16w1; + uint16_t u16w0; + }; + struct { + uint32_t u32w1; + uint32_t u32w0; + }; + uint64_t u64w0; + uint64_t u64; + void *uptr; + }; +} +ptl_arg_t; + +#include "ptl_self/ptl_fwd.h" +#include "ptl_ips/ptl_fwd.h" +#include "ptl_am/ptl_fwd.h" + +/* To be filled in as part of ptl_init */ +struct ptl_ctl +{ + ptl_t *ptl; /* pointer to ptl */ + + /* EP-specific stuff */ + psm_error_t (*ep_poll)(ptl_t *ptl, int replyonly); + + /* PTL-level connect + * + * This PTL-level connect is slightly different from the top-level PSM connect. + * + * pre 1: Caller has masked off epids in epid array that are already + * connected at the PSM level. + * + * post 0: PTL has allocated all epaddrs and whatever internal ptladdr that + * ptl needs. + * post 1: PTL marks error[i] as UNREACHABLE if PTL can't get to epid[i] + * post 2: PTL marks error[i] as UNKNOWN for all epid[i] that couldn't be + * connected before a timeout occurred. + * post 3: PTL returns OK iff all epids are either OK or UNREACHABLE + * post 4: PTL defines the content of epaddr[i] only if epaddr[i] is OK. + */ + psm_error_t (*ep_connect)(ptl_t *ptl, + int num_ep, + const psm_epid_t input_array_of_epid[], + const int array_of_epid_mask[], + psm_error_t output_array_of_errors[], + psm_epaddr_t output_array_of_epaddr[], + uint64_t timeout_ns); + + psm_error_t (*ep_disconnect)(ptl_t *ptl, int force, + int num_ep, + const psm_epaddr_t input_array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t output_array_of_errors[], + uint64_t timeout_ns); + + /* MQ stuff */ + psm_error_t (*mq_send)(psm_mq_t mq, psm_epaddr_t dest, + uint32_t flags, uint64_t stag, const void *buf, uint32_t len); + psm_error_t (*mq_isend)(psm_mq_t mq, psm_epaddr_t dest, + uint32_t flags, uint64_t stag, const void *buf, uint32_t len, + void *ctxt, psm_mq_req_t *req); + + int (*epaddr_stats_num)(void); + int (*epaddr_stats_init)(char *desc[], uint16_t *flags); + int (*epaddr_stats_get)(psm_epaddr_t epaddr, uint64_t *stats); + + /* AM stuff, only for Active messages PTL. Eventually we will expose + * this to PSM clients. 
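+ * Four entry points cover the short/long request and reply cases; the
+ * reply variants take the psm_am_token_t handed to the incoming
+ * handler rather than a peer epaddr.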
*/ + psm_error_t (*am_short_request)(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + psm_error_t (*am_short_reply)(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + psm_error_t (*am_long_request)(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int flags); + psm_error_t (*am_long_reply)(psm_am_token_t token, psm_handler_t handler, + psm_amarg_t *args, int nargs, void *src, + size_t len, void *dest, int flags); +}; +#endif diff --git a/ptl_am/Makefile b/ptl_am/Makefile new file mode 100644 index 0000000..f15e0cc --- /dev/null +++ b/ptl_am/Makefile @@ -0,0 +1,45 @@ +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +include $(top_srcdir)/buildflags.mak +INCLUDES += -I$(top_srcdir) + +${TARGLIB}-objs := am_reqrep.o am_reqrep_shmem.o ptl.o kcopyrwu.o knemrwu.o scifrwu.o + +all: ${${TARGLIB}-objs} + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) $(if $(PSM_HAVE_SCIF:0=),$(SCIF_INCLUDE_FLAGS)) -c $< -o $@ + +clean: + rm -f *.o + diff --git a/ptl_am/am_reqrep.c b/ptl_am/am_reqrep.c new file mode 100644 index 0000000..192c040 --- /dev/null +++ b/ptl_am/am_reqrep.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" + +psm_error_t +psmi_amsh_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_amarg_t req_args[NSHORT_ARGS] = {}; + + /* All sends are synchronous. Ignore PSM_AM_FLAG_ASYNC. + * TODO: Treat PSM_AM_FLAG_NOREPLY as "advisory". This was mainly + * used to optimize the IPS path though we could put a stricter interpretation + * on it to disallow any replies. + */ + + /* For now less than NSHORT_ARGS-1. We use the first arg to carry the handler + * index. + */ + psmi_assert(nargs < (NSHORT_ARGS - 1)); + req_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void*) &req_args[1], (const void*) args, + (nargs * sizeof(psm_amarg_t))); + psmi_amsh_short_request(epaddr->ptl, epaddr, am_handler_hidx, + req_args, nargs + 1, + src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM_OK; +} + +psm_error_t +psmi_amsh_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + psm_amarg_t rep_args[NSHORT_ARGS] = {}; + + /* For now less than NSHORT_ARGS-1. We use the first arg to carry the handler + * index. + */ + psmi_assert(nargs < (NSHORT_ARGS - 1)); + rep_args[0].u32w0 = (uint32_t) handler; + psmi_mq_mtucpy((void*) &rep_args[1], (const void*) args, + (nargs * sizeof(psm_amarg_t))); + + psmi_amsh_short_reply((amsh_am_token_t*) tok, am_handler_hidx, rep_args, nargs+1, src, len, 0); + + if (completion_fn) + completion_fn(completion_ctxt); + + return PSM_OK; +} + diff --git a/ptl_am/am_reqrep_shmem.c b/ptl_am/am_reqrep_shmem.c new file mode 100644 index 0000000..50d86f4 --- /dev/null +++ b/ptl_am/am_reqrep_shmem.c @@ -0,0 +1,3513 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include /* shm_open and signal handling */ +#include +#include +#include + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "kcopyrw.h" +#include "knemrw.h" +#include "scifrw.h" + +struct psm_am_max_sizes { + uint32_t nargs; + uint32_t request_short; + uint32_t reply_short; + uint32_t request_long; + uint32_t reply_long; +}; + +int psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; + +#ifdef PSM_HAVE_SCIF +#define PSM_SCIF_CONNECT_RETRIES_DEFAULT 40 +int psmi_scif_connect_retries = PSM_SCIF_CONNECT_RETRIES_DEFAULT; +#endif + +/* If we push bulk packets, we place them in the target's bulk packet region, + * if we don't push bulk packets, we place them in *our* bulk packet region and + * have the target pull the data from our region when it needs it. */ +#define AMSH_BULK_PUSH 1 + +/* When do we start using the "huge" buffers -- at 1MB */ +#define AMSH_HUGE_BYTES 1024*1024 + +#define AMMED_SZ 2048 +#define AMLONG_SZ 8192 +#define AMHUGE_SZ (524288+sizeof(am_pkt_bulk_t)) /* 512k + E */ + +/* short med long huge */ +static const amsh_qinfo_t amsh_qcounts = + { 1024, 256, 16, 1, 1024, 256, 16, 8 }; + +/* short med long huge */ +static const amsh_qinfo_t amsh_qelemsz = + { sizeof(am_pkt_short_t), AMMED_SZ+64, AMLONG_SZ, AMHUGE_SZ, + sizeof(am_pkt_short_t), AMMED_SZ+64, AMLONG_SZ, AMHUGE_SZ }; + +/* we use this internally to break up packets into MTUs */ +static const amsh_qinfo_t amsh_qpkt_max = + { NSHORT_ARGS*8, AMMED_SZ, AMLONG_SZ-sizeof(am_pkt_bulk_t), + AMHUGE_SZ-sizeof(am_pkt_bulk_t), + NSHORT_ARGS*8, AMMED_SZ, AMLONG_SZ-sizeof(am_pkt_bulk_t), + AMHUGE_SZ-sizeof(am_pkt_bulk_t), + }; + +/* We expose max sizes for the AM ptl. 
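+ * The initializer below fills struct psm_am_max_sizes in declaration
+ * order (nargs, then the short/long request and reply payload limits);
+ * (uint32_t) -1 marks a size on which no fixed cap is imposed.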
*/ +static const struct psm_am_max_sizes psmi_am_max_sizes = + { 6, AMMED_SZ, (uint32_t) -1, + AMMED_SZ, (uint32_t) -1 }; + +/* + * Macro expansion trickery to handle 8 different fifo types: + * + * _fifo is one of 'reqFifoShort', 'reqFifoMed', 'reqFifoLong', 'reqFifoHuge', + * 'repFifoShort', 'repFifoMed', 'repFifoLong', 'repFifoHuge' + * + * _fifotyp is one of 'short' or 'bulk' + */ +#define QGETPTR(ptl, _shmidx_, _fifo, _fifotyp, _idx) \ + (am_pkt_ ## _fifotyp ## _t *) \ + (((uintptr_t)ptl->ep->amsh_qdir[(_shmidx_)].q ## _fifo) + \ + (_idx) *amsh_qelemsz.q ## _fifo) + +#define QGETPTR_SCIF(ptl, _shmidx_, _node_, _fifo, _fifotyp, _idx) \ + (am_pkt_ ## _fifotyp ## _t *) \ + (((uintptr_t)ptl->ep->amsh_qdir[(_shmidx_)].qptrs[_node_].q ## _fifo) +\ + (_idx) *amsh_qelemsz.q ## _fifo) + +#ifdef PSM_HAVE_SCIF +static void *am_ctl_accept_thread(void *arg); +static psm_error_t amsh_scif_detach(psm_ep_t ep); +#endif +static psm_error_t amsh_poll(ptl_t *ptl, int replyonly); +static psm_error_t amsh_poll_internal_inner(ptl_t *ptl, int replyonly, int is_internal); +static void process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq); +static void amsh_conn_handler(void *toki, psm_amarg_t *args, int narg, + void *buf, size_t len); +static void am_update_directory(ptl_t *ptl, int shmidx); + +/* Kassist helper functions */ +static const char * psmi_kassist_getmode(int mode); +static int psmi_get_kassist_mode(); + +/* SCIF DMA helper functions */ +#ifdef PSM_HAVE_SCIF +static const char * psmi_scif_dma_getmode(int mode); +static int psmi_get_scif_dma_mode(); +static int psmi_get_scif_dma_threshold(); +#endif + +/* Kcopy functionality */ +int psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr); +static int psmi_kcopy_find_minor(int *minor); +static int psmi_kcopy_open_minor(int minor); + +static inline void +am_ctl_qhdr_init(volatile am_ctl_qhdr_t *q, int elem_cnt, int elem_sz) +{ + q->head = 0; + q->elem_cnt = elem_cnt; + q->elem_sz = elem_sz; +} + +static void +am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) +{ + int i; + am_pkt_bulk_t *bulkpkt; + uintptr_t bulkptr = (uintptr_t) base_ptr; + + for (i = 0; i < nelems; i++, bulkptr += elemsz) { + bulkpkt = (am_pkt_bulk_t *) bulkptr; + bulkpkt->idx = i; + } +} + +#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ + PSMI_PAGESIZE) +static inline uintptr_t +am_ctl_sizeof_block() +{ + return + PSMI_ALIGNUP( + PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + /* reqctrl block */ + _PA(reqFifoShort) + _PA(reqFifoMed) + _PA(reqFifoLong) + + _PA(reqFifoHuge) + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + /* repctrl block */ + _PA(repFifoShort) + _PA(repFifoMed) + _PA(repFifoLong) + + _PA(repFifoHuge), + PSMI_PAGESIZE); /* align to page size */ +} +#undef _PA + +/** + * Given a number of PEs, determine the amount of memory required. 
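+ *
+ * The segment is one page-aligned struct am_ctl_dirpage followed by
+ * num_pe * num_nodes per-process blocks, each of size
+ * am_ctl_sizeof_block() (a block header plus the eight page-aligned
+ * FIFOs, req/rep x short/med/long/huge). In other words:
+ *
+ *   segsz = align(sizeof(struct am_ctl_dirpage), PSMI_PAGESIZE)
+ *         + am_ctl_sizeof_block() * num_pe * num_nodes;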
+ */ +static +size_t +psmi_amsh_segsize(int num_pe, int num_nodes) +{ + size_t segsz; + segsz = PSMI_ALIGNUP(sizeof(struct am_ctl_dirpage), PSMI_PAGESIZE); + segsz += am_ctl_sizeof_block() * num_pe * num_nodes; + return segsz; +} + +static +void +amsh_atexit() +{ + static pthread_mutex_t mutex_once = PTHREAD_MUTEX_INITIALIZER; + static int atexit_once = 0; + psm_ep_t ep; + extern psm_ep_t psmi_opened_endpoint; + + pthread_mutex_lock(&mutex_once); + if (atexit_once) { + pthread_mutex_unlock(&mutex_once); + return; + } + else + atexit_once = 1; + pthread_mutex_unlock(&mutex_once); + + ep = psmi_opened_endpoint; + while (ep) { + if (ep->amsh_keyname != NULL) { + _IPATH_VDBG("unlinking shm file %s\n", ep->amsh_keyname); + shm_unlink(ep->amsh_keyname); + } + + if (ep->psmi_kassist_fd != -1) { + close(ep->psmi_kassist_fd); + ep->psmi_kassist_fd = -1; + } + ep = ep->user_ep_next; + } + + return; +} + +static +void +amsh_mmap_fault(int sig) +{ + static char shm_errmsg[256]; + + snprintf(shm_errmsg, sizeof shm_errmsg, + "%s: Unable to allocate shared memory for intra-node messaging.\n" + "%s: Delete stale shared memory files in /dev/shm.\n", + psmi_gethostname(), psmi_gethostname()); + amsh_atexit(); + if (write(2, shm_errmsg, strlen(shm_errmsg)+1) == -1) + exit(2); + else + exit(1); /* XXX revisit this... there's probably a better way to exit */ +} + +/* + * Scif init to modify the epid of current process. + */ +#ifdef PSM_HAVE_SCIF +static +psm_error_t +amsh_scif_init(psm_ep_t ep) +{ + scif_epd_t epd; + int port, nnodes; + uint16_t self; + psm_error_t err; + union psmi_envvar_val env_retries; + + if(!psmi_getenv("PSM_SCIF_CONNECT_RETRIES", + "PSM SCIF connection retry count", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) psmi_scif_connect_retries, + &env_retries)) { + psmi_scif_connect_retries = env_retries.e_uint; + } + + /* open end pt */ + if ((epd = scif_open()) < 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_open() failed with err %d", errno); + return err; + } + + /* bind end pt to specified port */ + if ((port = scif_bind(epd, 0)) < 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_bind() failed with err %d", errno); + return err; + } + + /* marks an end pt as listening end pt and queues up a maximum of 32 + * incoming connection requests */ + if (scif_listen(epd, 40) != 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_listen() failed with err %d", errno); + return err; + } + + if ((nnodes = scif_get_nodeIDs(NULL, 0, &self)) < 0) { + scif_close(epd); + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_get_nodeIDs() failed with err %d", errno); + return err; + } + + _IPATH_VDBG("listening on SCIF %d:%d\n", self, port); + + /* Save total scif node #, modify epid to include port and self node ID.*/ + ep->scif_epd = epd; + ep->scif_mynodeid = (int)self; + ep->scif_nnodes = nnodes; + + /* Modify epid with acquired info as below */ + ep->epid |= (((uint64_t)self)&0xFF)<<48; + ep->epid |= (((uint64_t)port)&0xFFFF)<<32; + + return PSM_OK; +} +#endif + +/** + * Attach endpoint shared-memory. + * + * We only try to obtain an shmidx at this point. 
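+ * The slot is reserved by writing a placeholder value of 1 into
+ * shmidx_map_epid[] under the dirpage lock; the real epid is patched in
+ * later by amsh_init_segment() once it is known.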
+ */ +psm_error_t +psmi_shm_attach(psm_ep_t ep, int *shmidx_o) +{ + int ismaster = 1; + int i; + int use_kcopy, use_kassist; + int shmidx; + int kcopy_minor = -1; + char shmbuf[256]; + void *mapptr; + size_t segsz; + psm_error_t err = PSM_OK; + + if (ep->amsh_shmidx != -1) { + *shmidx_o = ep->amsh_shmidx; + return PSM_OK; + } + + *shmidx_o = -1; + if (ep->amsh_keyname != NULL) { + if (psmi_uuid_compare(ep->amsh_keyno, ep->key) != 0) { + psmi_uuid_unparse(ep->amsh_keyno, shmbuf); + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Shared memory segment already initialized with key=%s", + shmbuf); + goto fail; + } + } + else { + char *p; + memcpy(&ep->amsh_keyno, ep->key, sizeof(psm_uuid_t)); + strncpy(shmbuf, "/psm_shm.", sizeof shmbuf); + p = shmbuf + strlen(shmbuf); + psmi_uuid_unparse(ep->amsh_keyno, p); + ep->amsh_keyname = psmi_strdup(NULL, shmbuf); + if (ep->amsh_keyname == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + +#ifdef PSM_HAVE_SCIF + ep->amsh_qdir = psmi_calloc(NULL, PER_PEER_ENDPOINT, + PTL_AMSH_MAX_LOCAL_PROCS*ep->scif_nnodes, + sizeof(struct amsh_qdirectory)); +#else + ep->amsh_qdir = psmi_calloc(NULL, PER_PEER_ENDPOINT, + PTL_AMSH_MAX_LOCAL_PROCS, + sizeof(struct amsh_qdirectory)); +#endif + + if (ep->amsh_qdir == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + + /* Get which kassist mode to use. */ + ep->psmi_kassist_mode = psmi_get_kassist_mode(); + use_kassist = (ep->psmi_kassist_mode != PSMI_KASSIST_OFF); + use_kcopy = (ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY); + +#ifdef PSM_HAVE_SCIF + ep->scif_dma_mode = psmi_get_scif_dma_mode(); + ep->scif_dma_threshold = psmi_get_scif_dma_threshold(); +#endif + + /* Reserve enough space in the shared memory region for up to + PTL_AMSH_MAX_LOCAL_PROCS. Although that much space is reserved in + virtual memory, physical pages are not allocated until the + corresponding memory location is touched. Memory in this region is + only touched as processes initialize their shared queue area in + amsh_init_segment(), and physical memory is only allocated by the OS + accordingly. So, it looks like this is consumes a lot of memory, + but really it consumes as much as necessary for each active process. */ +#ifdef PSM_HAVE_SCIF + segsz = psmi_amsh_segsize(PTL_AMSH_MAX_LOCAL_PROCS, + PTL_AMSH_MAX_LOCAL_NODES); +#else + /* In the non-SCIF case we should be able to get away with just allocating + * enough shm for the number of mpi ranks, if the number of ranks is + * unavailable, then we will fallback to the number of online cpu cores. + * This will help cut back on virtual memory usage. + */ + int nranks, rankid, nprocs; + psmi_sharedcontext_params(&nranks, &rankid); + nprocs = (nranks <= 0) ? sysconf(_SC_NPROCESSORS_ONLN) : nranks; + segsz = psmi_amsh_segsize(nprocs, PTL_AMSH_MAX_LOCAL_NODES); +#endif + + ep->amsh_shmfd = shm_open(ep->amsh_keyname, + O_RDWR | O_CREAT | O_EXCL | O_TRUNC, S_IRWXU); + if (ep->amsh_shmfd < 0) { + ismaster = 0; + if (errno != EEXIST) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error creating shared memory object in shm_open%s%s", + errno != EACCES ? 
": " : + "(/dev/shm may have stale shm files that need to be removed): ", + strerror(errno)); + goto fail; + } + + /* Try to open again, knowing we won't be the shared memory master */ + ep->amsh_shmfd = shm_open(ep->amsh_keyname, O_RDWR, S_IRWXU); + if (ep->amsh_shmfd < 0) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error attaching to shared memory object in shm_open: %s", + strerror(errno)); + goto fail; + } + } + + /* Now register the atexit handler for cleanup, whether master or slave */ + atexit(amsh_atexit); + + _IPATH_PRDBG("Registered as %s to key %s\n", ismaster ? "master" : "slave", + ep->amsh_keyname); + + if (ismaster) { + if (ftruncate(ep->amsh_shmfd, segsz) != 0) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error setting size of shared memory object to %u bytes in " + "ftruncate: %s\n", (uint32_t) segsz, strerror(errno)); + goto fail; + } + } + else { + /* Before we do the mmap, make sure that the master has had time to + * apply the ftruncate, or else we will get a successful mmap on a + * 0-sized object */ + struct stat fdstat; + off_t cursize = 0; + while (cursize == 0) { + if (fstat(ep->amsh_shmfd, &fdstat)) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error querying size of shared memory object: %s", + strerror(errno)); + goto fail; + } + cursize = fdstat.st_size; + if (cursize == 0) + usleep(1); /* be gentle in tight fstat loop */ + } + } + + /* We map the entire shared memory area, consisting of a control structure + * followed by per-process shared queue structures. The "master" creates + * the control structure and initializes it but every process must lock + * appropriate data structures before it reads or writes it. + */ + mapptr = mmap(NULL, segsz, PROT_READ|PROT_WRITE, MAP_SHARED, + ep->amsh_shmfd, 0); + if (mapptr == MAP_FAILED) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error mmapping shared memory: %s", strerror(errno)); + goto fail; + } + + ep->amsh_shmbase = (uintptr_t) mapptr; + ep->amsh_dirpage = (struct am_ctl_dirpage *) ep->amsh_shmbase; + ep->amsh_blockbase = ep->amsh_shmbase + psmi_amsh_segsize(0, 0); + + /* We core dump right after here if we don't check the mmap */ + void (*old_handler_segv)(int) = signal (SIGSEGV, amsh_mmap_fault); + void (*old_handler_bus)(int) = signal (SIGBUS, amsh_mmap_fault); + + _IPATH_PRDBG("Mapped shm control object at %p\n", mapptr); + if (ismaster) { + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&(ep->amsh_dirpage->lock), &attr); + pthread_mutexattr_destroy(&attr); + + ep->amsh_dirpage->num_attached = 0; + ep->amsh_dirpage->max_idx = -1; + + for (i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + ep->amsh_dirpage->shmidx_map_epid[i] = 0; + ep->amsh_dirpage->kassist_pids[i] = 0; + } + + for(i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS*PTL_AMSH_MAX_LOCAL_NODES; i++) { + struct amsh_qtail* qtail = &ep->amsh_dirpage->qtails[i]; + + qtail->reqFifoShort.tail = 0; + qtail->reqFifoMed.tail = 0; + qtail->reqFifoLong.tail = 0; + qtail->reqFifoHuge.tail = 0; + + qtail->repFifoShort.tail = 0; + qtail->repFifoMed.tail = 0; + qtail->repFifoLong.tail = 0; + qtail->repFifoHuge.tail = 0; + + pthread_spin_init(&qtail->reqFifoShort.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoMed.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoLong.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->reqFifoHuge.lock, PTHREAD_PROCESS_SHARED); + + 
pthread_spin_init(&qtail->repFifoShort.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoMed.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoLong.lock, PTHREAD_PROCESS_SHARED); + pthread_spin_init(&qtail->repFifoHuge.lock, PTHREAD_PROCESS_SHARED); + } + + if (use_kassist) { + if (use_kcopy) { + ep->psmi_kassist_fd = psmi_kcopy_find_minor(&kcopy_minor); + if (ep->psmi_kassist_fd >= 0) + ep->amsh_dirpage->kcopy_minor = kcopy_minor; + else + ep->amsh_dirpage->kcopy_minor = -1; + } + else { /* Setup knem */ + psmi_assert_always(ep->psmi_kassist_mode & PSMI_KASSIST_KNEM); + ep->psmi_kassist_fd = knem_open_device(); + } + + } + else + ep->psmi_kassist_fd = -1; + + ips_mb(); + + ep->amsh_dirpage->is_init = 1; + _IPATH_PRDBG("Mapped and initialized shm object control page at %p," + "size=%zu, kcopy minor is %d (mode=%s)\n", mapptr, + segsz, kcopy_minor, + psmi_kassist_getmode(ep->psmi_kassist_mode)); + } + else { + volatile int *is_init = &ep->amsh_dirpage->is_init; + while (*is_init == 0) + usleep(1); + _IPATH_PRDBG("Slave synchronized object control page at " + "%p, size=%d, kcopy minor is %d (mode=%s)\n", + mapptr, (int) segsz, kcopy_minor, + psmi_kassist_getmode(ep->psmi_kassist_mode)); + } + + /* + * First safe point where we can try to attach to the segment. + * + * Here we reserve the shmidx slot by marking the epid to '1'. We only + * update our epid in the init phase once we actually know what our epid + * is. + */ + pthread_mutex_lock((pthread_mutex_t *) &(ep->amsh_dirpage->lock)); + shmidx = -1; + for (i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + if (ep->amsh_dirpage->shmidx_map_epid[i] == 0) { + ep->amsh_dirpage->shmidx_map_epid[i] = 1; + ep->amsh_dirpage->psm_verno[i] = PSMI_VERNO; + ep->amsh_dirpage->kassist_pids[i] = (int) getpid(); + + if (use_kassist) { + if (!use_kcopy) { + if (!ismaster) + ep->psmi_kassist_fd = knem_open_device(); + + /* If we are able to use KNEM assume everyone else on the + * node can also use it. Advertise that KNEM is active via + * the feature flag. + */ + if (ep->psmi_kassist_fd >= 0) { + ep->amsh_dirpage->amsh_features[i] |= AMSH_HAVE_KNEM; + psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_KNEM; + } + else { + ep->psmi_kassist_mode = PSMI_KASSIST_OFF; + use_kassist = 0; + psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; + } + } + else if(use_kcopy) { + psmi_assert_always(use_kcopy); + kcopy_minor = ep->amsh_dirpage->kcopy_minor; + if (!ismaster && kcopy_minor >= 0) + ep->psmi_kassist_fd = psmi_kcopy_open_minor(kcopy_minor); + + /* If we are able to use KCOPY assume everyone else on the + * node can also use it. Advertise that KCOPY is active via + * the feature flag. 
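+ * If the kcopy device cannot be opened, the else-branch below turns
+ * kassist off for this process and falls back to the
+ * PSMI_MQ_RV_THRESH_NO_KASSIST rendezvous threshold.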
+             */
+            if (ep->psmi_kassist_fd >= 0) {
+                ep->amsh_dirpage->amsh_features[i] |= AMSH_HAVE_KCOPY;
+                psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_KCOPY;
+            }
+            else {
+                ep->psmi_kassist_mode = PSMI_KASSIST_OFF;
+                use_kassist = 0; use_kcopy = 0;
+                psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+            }
+          }
+        }
+        else
+          psmi_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+        _IPATH_PRDBG("KASSIST MODE: %s\n", psmi_kassist_getmode(ep->psmi_kassist_mode));
+#ifdef PSM_HAVE_SCIF
+        _IPATH_PRDBG("SCIF DMA MODE: %s\n", psmi_scif_dma_getmode(ep->scif_dma_mode));
+        _IPATH_PRDBG("SCIF DMA THRESHOLD: %d\n", ep->scif_dma_threshold);
+#endif
+
+        ep->amsh_shmidx = shmidx = *shmidx_o = i;
+        _IPATH_PRDBG("Grabbed shmidx %d\n", shmidx);
+        ep->amsh_dirpage->num_attached++;
+        break;
+      }
+    }
+    pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+    /* install the old sighandler back */
+    signal(SIGSEGV, old_handler_segv);
+    signal(SIGBUS, old_handler_bus);
+
+    if (shmidx == -1)
+        err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR,
+                "Exceeded maximum of %d supported local endpoints: %s",
+                PTL_AMSH_MAX_LOCAL_PROCS, strerror(errno));
+
+fail:
+    return err;
+}
+
+/**
+ * Initialize endpoint shared-memory AM.
+ *
+ * This function ensures that the given endpoint initializes enough shared
+ * memory storage to communicate with up to PTL_AMSH_MAX_LOCAL_PROCS local
+ * peers.  In reality, the implementation need not grow any shared structures
+ * whether a single endpoint needs to communicate with 2 or 20 local peers (a
+ * local peer is a peer having a context on any locally-attached LID).
+ *
+ * [pre] Endpoint address epaddr has already been allocated.
+ */
+
+#define AMSH_QSIZE(type)                                               \
+        PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type,  \
+                     PSMI_PAGESIZE)
+
+static
+psm_error_t
+amsh_init_segment(ptl_t *ptl)
+{
+    struct amsh_qptrs* qptrs;
+    int shmidx;
+    int i;
+    psm_error_t err = PSM_OK;
+    int scif_nnodes;
+
+    /* Preconditions */
+    psmi_assert_always(ptl != NULL);
+    psmi_assert_always(ptl->ep != NULL);
+    psmi_assert_always(ptl->epaddr != NULL);
+    psmi_assert_always(ptl->ep->epid != 0);
+    psmi_assert_always(ptl->ep->amsh_shmidx != -1);
+
+    shmidx = ptl->ep->amsh_shmidx;
+
+    ptl->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort);
+    ptl->amsh_qsizes.qreqFifoMed = AMSH_QSIZE(reqFifoMed);
+    ptl->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong);
+    ptl->amsh_qsizes.qreqFifoHuge = AMSH_QSIZE(reqFifoHuge);
+    ptl->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort);
+    ptl->amsh_qsizes.qrepFifoMed = AMSH_QSIZE(repFifoMed);
+    ptl->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong);
+    ptl->amsh_qsizes.qrepFifoHuge = AMSH_QSIZE(repFifoHuge);
+
+    /* We core dump right after here if we don't check the mmap */
+    void (*old_handler_segv)(int) = signal (SIGSEGV, amsh_mmap_fault);
+    void (*old_handler_bus)(int) = signal (SIGBUS, amsh_mmap_fault);
+
+    pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+
+    /*
+     * Now that we know our epid, update it in the shmidx array
+     */
+    ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] = ptl->ep->epid;
+
+    if (shmidx > ptl->ep->amsh_dirpage->max_idx) {
+        ptl->ep->amsh_dirpage->max_idx = shmidx;
+    }
+
+    ptl->shmidx = shmidx;
+    ptl->ep->amsh_qdir[shmidx].amsh_epaddr = ptl->ep->epaddr;
+    for(i = 0; i < PTL_AMSH_MAX_LOCAL_NODES; i++) {
+        ptl->reqH[i].base = ptl->reqH[i].head = ptl->reqH[i].end = NULL;
+        ptl->repH[i].base = ptl->repH[i].head = ptl->repH[i].end = NULL;
+    }
+
+    /* Update all of the local directory entries once here.
*/ + for(i = 0; i < PTL_AMSH_MAX_LOCAL_PROCS; i++) { + ptl->ep->amsh_qdir[i].amsh_base = + (void *)(ptl->ep->amsh_blockbase + + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES * i); + + ptl->ep->amsh_qdir[i].amsh_shmidx = ptl->shmidx; + + /* Encode our SCIF nodeid here. The full epid for local peers isn't + known yet, but we do know their nodeid, which is the same as ours. + Marking the nodeid here enables process_packet() to work correctly + when packets arrive before this epid value has been set with the + proper epid, without extra branches in the communication path. */ +#ifdef PSM_HAVE_SCIF + ptl->ep->amsh_qdir[i].amsh_epid = + ((psm_epid_t)ptl->ep->scif_mynodeid & 0xff) << 48; +#endif + + /* Clear the SCIF socket to -1. This indicates that the socket is not + going to be used, ever -- which is true since this is a local peer. + This prevents later code from trying to connect to self. */ + //ptl->ep->amsh_qdir[i].amsh_epd[0] = -1; + + am_update_directory(ptl, i); + } + +#ifdef PSM_HAVE_SCIF + scif_nnodes = ptl->ep->scif_nnodes; +#else + /* No SCIF: assume one node. */ + scif_nnodes = 1; +#endif + + /* touch all of my pages */ + memset(ptl->ep->amsh_qdir[shmidx].amsh_base, + 0, am_ctl_sizeof_block() * scif_nnodes); + + for(i = 0; i < scif_nnodes; i++) { + qptrs = &ptl->ep->amsh_qdir[shmidx].qptrs[i]; + + am_ctl_qhdr_init(&qptrs->qreqH->shortq, + amsh_qcounts.qreqFifoShort, amsh_qelemsz.qreqFifoShort); + am_ctl_qhdr_init(&qptrs->qreqH->medbulkq, + amsh_qcounts.qreqFifoMed, amsh_qelemsz.qreqFifoMed); + am_ctl_qhdr_init(&qptrs->qreqH->longbulkq, + amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); + am_ctl_qhdr_init(&qptrs->qreqH->hugebulkq, + amsh_qcounts.qreqFifoHuge, amsh_qelemsz.qreqFifoHuge); + + am_ctl_qhdr_init(&qptrs->qrepH->shortq, + amsh_qcounts.qrepFifoShort, amsh_qelemsz.qrepFifoShort); + am_ctl_qhdr_init(&qptrs->qrepH->medbulkq, + amsh_qcounts.qrepFifoMed, amsh_qelemsz.qrepFifoMed); + am_ctl_qhdr_init(&qptrs->qrepH->longbulkq, + amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); + am_ctl_qhdr_init(&qptrs->qrepH->hugebulkq, + amsh_qcounts.qrepFifoHuge, amsh_qelemsz.qrepFifoHuge); + + /* Set bulkidx in every bulk packet */ + am_ctl_bulkpkt_init(qptrs->qreqFifoMed, + amsh_qelemsz.qreqFifoMed, + amsh_qcounts.qreqFifoMed); + am_ctl_bulkpkt_init(qptrs->qreqFifoLong, + amsh_qelemsz.qreqFifoLong, + amsh_qcounts.qreqFifoLong); + am_ctl_bulkpkt_init(qptrs->qreqFifoHuge, + amsh_qelemsz.qreqFifoHuge, + amsh_qcounts.qreqFifoHuge); + + am_ctl_bulkpkt_init(qptrs->qrepFifoMed, + amsh_qelemsz.qrepFifoMed, + amsh_qcounts.qrepFifoMed); + am_ctl_bulkpkt_init(qptrs->qrepFifoLong, + amsh_qelemsz.qrepFifoLong, + amsh_qcounts.qrepFifoLong); + am_ctl_bulkpkt_init(qptrs->qrepFifoHuge, + amsh_qelemsz.qrepFifoHuge, + amsh_qcounts.qrepFifoHuge); + } + + /* install the old sighandler back */ + signal(SIGSEGV, old_handler_segv); + signal(SIGBUS, old_handler_bus); + + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + return err; +} + +psm_error_t +psmi_shm_detach(psm_ep_t ep) +{ + psm_error_t err = PSM_OK; + + if (ep->amsh_shmidx == -1 || ep->amsh_keyname == NULL) + return err; + +#ifdef PSM_HAVE_SCIF + if (amsh_scif_detach(ep)) { + err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR, + "Error with amsh_scif_detach() of shared segment: %s", + strerror(errno)); + goto fail; + } +#endif + + _IPATH_VDBG("unlinking shm file %s\n", ep->amsh_keyname+1); + shm_unlink(ep->amsh_keyname); + psmi_free(ep->amsh_keyname); + ep->amsh_keyname = NULL; + + if (ep->psmi_kassist_fd != -1) { + 
close(ep->psmi_kassist_fd);
+        ep->psmi_kassist_fd = -1;
+    }
+
+    /* go mark my shmidx as free */
+    pthread_mutex_lock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+    ep->amsh_dirpage->num_attached--;
+    ep->amsh_dirpage->shmidx_map_epid[ep->amsh_shmidx] = 0;
+    ep->amsh_shmidx = -1;
+
+    if (ep->amsh_dirpage->num_attached == 0) { /* truncate to nothing */
+        pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+
+        /* Instead of dynamically shrinking the shared memory region, we
+           always leave it allocated for up to PTL_AMSH_MAX_LOCAL_PROCS or
+           the number of processors online.
+           Thus mremap() is never necessary, nor is ftruncate() here.
+           However, when the attached process count does go to 0, we should
+           fully munmap() the entire region.
+         */
+#ifdef PSM_HAVE_SCIF
+        if (munmap((void *) ep->amsh_shmbase,
+                   psmi_amsh_segsize(PTL_AMSH_MAX_LOCAL_PROCS,
+                                     PTL_AMSH_MAX_LOCAL_NODES))) {
+#else
+        int nranks, rankid, nprocs;
+        psmi_sharedcontext_params(&nranks, &rankid);
+        nprocs = (nranks <= 0) ? sysconf(_SC_NPROCESSORS_ONLN) : nranks;
+        if (munmap((void *) ep->amsh_shmbase,
+                   psmi_amsh_segsize(nprocs, PTL_AMSH_MAX_LOCAL_NODES))) {
+#endif
+            err = psmi_handle_error(NULL, PSM_SHMEM_SEGMENT_ERR,
+                    "Error with munmap of shared segment: %s", strerror(errno));
+            goto fail;
+        }
+    }
+    else {
+        int i, new_max_idx = ep->amsh_dirpage->max_idx;
+        for (i = ep->amsh_dirpage->max_idx; i >= 0; i--) {
+            if (ep->amsh_dirpage->shmidx_map_epid[i] == 0)
+                new_max_idx = i;
+            else
+                break;
+        }
+
+        ep->amsh_dirpage->max_idx = new_max_idx;
+
+        pthread_mutex_unlock((pthread_mutex_t *) &(ep->amsh_dirpage->lock));
+    }
+
+    ep->amsh_max_idx = -1;
+    ep->amsh_shmfd = -1;
+
+    ep->amsh_shmbase = ep->amsh_blockbase = 0;
+    ep->amsh_dirpage = NULL;
+    memset(ep->amsh_keyno, 0, sizeof(ep->amsh_keyno));
+
+    return PSM_OK;
+
+fail:
+    return err;
+}
+
+/**
+ * Update pointers to our req/rep receive queues.
+ *
+ * Only called from am_update_directory()
+ */
+static
+void
+am_hdrcache_update_short(ptl_t *ptl, int shmidx,
+                         am_ctl_qshort_cache_t *reqH,
+                         am_ctl_qshort_cache_t *repH)
+{
+    int node;
+
+    for(node = 0; node < PTL_AMSH_MAX_LOCAL_NODES; node++) {
+        reqH[node].base = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, 0);
+        reqH[node].head = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, 0);
+        reqH[node].end = QGETPTR_SCIF(ptl, shmidx, node,
+                                       reqFifoShort, short, amsh_qcounts.qreqFifoShort);
+
+        repH[node].base = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, 0);
+        repH[node].head = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, 0);
+        repH[node].end = QGETPTR_SCIF(ptl, shmidx, node,
+                                       repFifoShort, short, amsh_qcounts.qrepFifoShort);
+    }
+}
+
+/**
+ * Update locally cached shared-pointer directory.
+ *
+ * @param shmidx Endpoint index for which to update local directory.
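+ *
+ * A layout sketch for one (shmidx, node) block, derived from the pointer
+ * arithmetic below (the sizes are the ptl->amsh_qsizes values, and the
+ * whole block sits at amsh_base + AMSH_BLOCK_HEADER_SIZE):
+ *
+ *   qreqH          block header (am_ctl_blockhdr_t)
+ *   qreqFifoShort  qreqH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)
+ *   qreqFifoMed    qreqFifoShort + qsizes.qreqFifoShort
+ *   qreqFifoLong   qreqFifoMed + qsizes.qreqFifoMed
+ *   qreqFifoHuge   qreqFifoLong + qsizes.qreqFifoLong
+ *   qrepH          qreqFifoHuge + qsizes.qreqFifoHuge
+ *   qrepFifo*      same pattern as the request fifos, after qrepH
+ *
+ * One such block exists per (shmidx, node) pair; node i's block starts at
+ * base_this + i * am_ctl_sizeof_block().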
+ */ + +static +void +am_update_directory(ptl_t *ptl, int shmidx) +{ + psm_ep_t ep = ptl->ep; + uintptr_t base_this; + uintptr_t base_node; + struct amsh_qptrs* qptrs; + int i; + + psmi_assert_always(shmidx != -1); + base_this = + (uintptr_t)ep->amsh_qdir[shmidx].amsh_base + AMSH_BLOCK_HEADER_SIZE; + + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { + if(ep->amsh_dirpage->amsh_features[shmidx] & AMSH_HAVE_KASSIST) { + ep->amsh_qdir[shmidx].kassist_pid = + ep->amsh_dirpage->kassist_pids[shmidx]; + } + } else { + ep->amsh_qdir[shmidx].kassist_pid = 0; + } + + for(i = 0; i < PTL_AMSH_MAX_LOCAL_NODES; i++) { + qptrs = &ep->amsh_qdir[shmidx].qptrs[i]; + + base_node = base_this + (i * am_ctl_sizeof_block()); + + /* Request queues */ + qptrs->qreqH = (am_ctl_blockhdr_t *) base_node; + + qptrs->qreqFifoShort = (am_pkt_short_t *) + ((uintptr_t) qptrs->qreqH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + qptrs->qreqFifoMed = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoShort + + ptl->amsh_qsizes.qreqFifoShort); + qptrs->qreqFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoMed + + ptl->amsh_qsizes.qreqFifoMed); + qptrs->qreqFifoHuge = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qreqFifoLong + + ptl->amsh_qsizes.qreqFifoLong); + + /* Reply queues */ + qptrs->qrepH = (am_ctl_blockhdr_t *) + ((uintptr_t) qptrs->qreqFifoHuge + + ptl->amsh_qsizes.qreqFifoHuge); + + qptrs->qrepFifoShort = (am_pkt_short_t *) + ((uintptr_t) qptrs->qrepH + + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); + qptrs->qrepFifoMed = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoShort + + ptl->amsh_qsizes.qrepFifoShort); + qptrs->qrepFifoLong = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoMed + + ptl->amsh_qsizes.qrepFifoMed); + qptrs->qrepFifoHuge = (am_pkt_bulk_t *) + ((uintptr_t) qptrs->qrepFifoLong + + ptl->amsh_qsizes.qrepFifoLong); + + _IPATH_VDBG("shmidx=%d node=%d Request Hdr=%p,Pkt=%p,Med=%p,Long=%p,Huge=%p\n", + shmidx, i, + qptrs->qreqH, + qptrs->qreqFifoShort, + qptrs->qreqFifoMed, + qptrs->qreqFifoLong, + qptrs->qreqFifoHuge); + _IPATH_VDBG("shmidx=%d node=%d Reply Hdr=%p,Pkt=%p,Med=%p,Long=%p,Huge=%p\n", + shmidx, i, + qptrs->qrepH, + qptrs->qrepFifoShort, + qptrs->qrepFifoMed, + qptrs->qrepFifoLong, + qptrs->qrepFifoHuge); + } + + /* Update local shorthand pointers */ +#ifdef PSM_HAVE_SCIF + qptrs = &ep->amsh_qdir[shmidx].qptrs[ptl->ep->scif_mynodeid]; +#else + qptrs = &ep->amsh_qdir[shmidx].qptrs[0]; +#endif + + ep->amsh_qdir[shmidx].qreqH = qptrs->qreqH; + ep->amsh_qdir[shmidx].qreqFifoShort = qptrs->qreqFifoShort; + ep->amsh_qdir[shmidx].qreqFifoMed = qptrs->qreqFifoMed; + ep->amsh_qdir[shmidx].qreqFifoLong = qptrs->qreqFifoLong; + ep->amsh_qdir[shmidx].qreqFifoHuge = qptrs->qreqFifoHuge; + + ep->amsh_qdir[shmidx].qrepH = qptrs->qrepH; + ep->amsh_qdir[shmidx].qrepFifoShort = qptrs->qrepFifoShort; + ep->amsh_qdir[shmidx].qrepFifoMed = qptrs->qrepFifoMed; + ep->amsh_qdir[shmidx].qrepFifoLong = qptrs->qrepFifoLong; + ep->amsh_qdir[shmidx].qrepFifoHuge = qptrs->qrepFifoHuge; + + /* If we're updating our shmidx, we update our cached pointers */ + if (ptl->shmidx == shmidx) + am_hdrcache_update_short(ptl, shmidx, + (am_ctl_qshort_cache_t *) ptl->reqH, + (am_ctl_qshort_cache_t *) ptl->repH); + + /* Sanity check */ + uintptr_t base_next = + (uintptr_t) ep->amsh_qdir[shmidx].qptrs[PTL_AMSH_MAX_LOCAL_NODES - 1].qrepFifoHuge + ptl->amsh_qsizes.qrepFifoHuge; + + psmi_assert_always(base_next - base_this <= + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); +} + +/* ep_epid_share_memory wrapper 
*/ +static +int +amsh_epid_reachable(ptl_t *ptl, psm_epid_t epid) +{ + int result; + psm_error_t err; + err = psm_ep_epid_share_memory(ptl->ep, epid, &result); + psmi_assert_always(err == PSM_OK); + return result; +} + +static +psm_error_t +amsh_epaddr_add(ptl_t *ptl, psm_epid_t epid, int shmidx, psm_epaddr_t *epaddr_o) +{ + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + + psmi_assert(psmi_epid_lookup(ptl->ep, epid) == NULL); + + if (epid == ptl->epid) { + epaddr = ptl->epaddr; + } else { + epaddr = (psm_epaddr_t) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, + 1, sizeof(struct psm_epaddr)); + if (epaddr == NULL) { + return PSM_NO_MEMORY; + } + psmi_assert_always(ptl->ep->amsh_qdir[shmidx].amsh_epaddr == NULL); + } + + epaddr->ptl = ptl; + epaddr->ptlctl = ptl->ctl; + STAILQ_INIT(&epaddr->egrlong); + epaddr->mctxt_prev = epaddr; + epaddr->mctxt_next = epaddr; + epaddr->mctxt_master = epaddr; + epaddr->epid = epid; + epaddr->ep = ptl->ep; + epaddr->_shmidx = shmidx; + AMSH_CSTATE_TO_SET(epaddr, NONE); + AMSH_CSTATE_FROM_SET(epaddr, NONE); + if ((err = psmi_epid_set_hostname(psm_epid_nid(epid), + psmi_gethostname(), 0))) + goto fail; + + ptl->ep->amsh_qdir[shmidx].amsh_epaddr = epaddr; + + /* Finally, add to table */ + if ((err = psmi_epid_add(ptl->ep, epid, epaddr))) + goto fail; + + _IPATH_VDBG("epaddr=%s added to ptl=%p\n", + psmi_epaddr_get_name(epid), ptl); + + *epaddr_o = epaddr; + return PSM_OK; +fail: + if (epaddr != ptl->epaddr) psmi_free(epaddr); + return err; +} + +struct ptl_connection_req +{ + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm_epid_t *epids; /* input epid list */ + psm_epaddr_t *epaddr; + psm_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm_amarg_t args[4]; +}; + +/* + * function to make scif connection between nodes and exchange shared memory + */ +#ifdef PSM_HAVE_SCIF +static int +amsh_scif_send(scif_epd_t epd, void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_send(epd, buf, (uint32_t)len, SCIF_SEND_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static int +amsh_scif_recv(scif_epd_t epd, void *buf, size_t len) +{ + int ret; + while (len) { + ret = scif_recv(epd, buf, (uint32_t)len, SCIF_RECV_BLOCK); + if (ret < 0) { + if (errno == EINTR) continue; + return ret; + } + buf += ret; + len -= ret; + } + return 0; +} + +static +psm_error_t +amsh_scif_connect(uint16_t nodeid, uint16_t port, scif_epd_t *epd_o) +{ + int tries; + struct scif_portID portID; + scif_epd_t epd; + psm_error_t err; + + epd = scif_open(); + if (epd < 0) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_open failed with error %d\n", errno); + return err; + } + + portID.port = port; + portID.node = nodeid; + + _IPATH_VDBG("scif connecting to %d:%d\n", nodeid, port); + + for(tries = 0; tries < psmi_scif_connect_retries; tries++) { + if (scif_connect(epd, &portID) >= 0) { + break; + } else if(errno != ECONNREFUSED) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_connect failed with error %d (%s)\n", + errno, strerror(errno)); + scif_close(epd); + return err; + } + + /* Wait a bit before trying again. 
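+       As a rough worked bound (assuming psmi_scif_connect_retries were 40;
+       the real limit is configured elsewhere): the first 20 retries sleep
+       100ms each and every later retry sleeps 250ms, so the loop would
+       block for at most 20*100ms + 20*250ms = 7 seconds before failing.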
*/ + if(tries < 20) { + usleep(100000); + } else { + usleep(250000); + } + } + + if(tries == psmi_scif_connect_retries) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_connect retry limit exceeded\n"); + return err; + } + + *epd_o = epd; + return PSM_OK; +} + +/* Establish a connection to a single epid. */ +static psm_error_t amsh_scif_setup(ptl_t* ptl, psm_epid_t epid) +{ + psm_ep_t ep = ptl->ep; + psm_error_t err = PSM_OK; + scif_epd_t epd = -1; + void* addr; + int peeridx; + + /* Send this struct to identify ourselves to the peer (offset unused) */ + /* Receive this struct to get memory mapping information. */ + struct { off_t offset; int verno; psm_epid_t epid; } buf; + + int port = (int)((epid>>32)&0xffff); + int nodeid = (int)((epid>>48)&0xff); + int shmidx = (int)((epid>>56)&0xff); + + /* Skip peers on the same node */ + if (nodeid == ep->scif_mynodeid) { + return PSM_OK; + } + + /* Figure out the peer's index. */ + /* 0 1 mynodeid 3 4 */ + /* nodeid 0 1 3 4 */ + if(nodeid > ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * nodeid) + shmidx; + } else /*nodeid < ep->scif_mynodeid) */ { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * (nodeid + 1)) + shmidx; + } + + _IPATH_VDBG("%lx scif_connect to %d:%d %d %lx\n", + ep->epid, nodeid, port, peeridx, epid); + + if(ep->amsh_qdir[peeridx].amsh_epd[0] != 0) { + /* Already established this side of the connection; all done. */ + return err; + } + + buf.offset = 0; + buf.verno = PSMI_VERNO; + buf.epid = ep->epid; + + err = amsh_scif_connect(nodeid, port, &epd); + if(err) { + return err; + } + + /* Send our identification information. */ + if (amsh_scif_send(epd, &buf, sizeof(buf))) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_send failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + /* Receive memory registration information. */ + if(amsh_scif_recv(epd, &buf, sizeof(buf))) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_recv failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + addr = scif_mmap(NULL, am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES, + SCIF_PROT_READ|SCIF_PROT_WRITE, 0, epd, buf.offset); + if(addr == SCIF_MMAP_FAILED) { + err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_mmap failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + return err; + } + + _IPATH_PRDBG("%lx scif_mmap offset %p -> %p to addr %p -> %p length %ld\n", + ep->epid, (void*)buf.offset, + (void*)(buf.offset + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES), + addr, + (void*)((uintptr_t)addr + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES), + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); + + ep->amsh_qdir[peeridx].amsh_offset = buf.offset; + ep->amsh_qdir[peeridx].amsh_base = addr; + ep->amsh_qdir[peeridx].amsh_epid = buf.epid; + ep->amsh_qdir[peeridx].amsh_verno = buf.verno; + + /* Calculate my index from the peer's perspective. */ + /* 0 1 mynodeid 3 4 */ + /* nodeid 0 1 3 4 */ + if(ep->scif_mynodeid < nodeid) { + ep->amsh_qdir[peeridx].amsh_shmidx = + (PTL_AMSH_MAX_LOCAL_PROCS * (ep->scif_mynodeid + 1)) + + ep->amsh_shmidx; + } else { + ep->amsh_qdir[peeridx].amsh_shmidx = + (PTL_AMSH_MAX_LOCAL_PROCS * ep->scif_mynodeid) + + ep->amsh_shmidx; + } + + /* There are eventually two connections. epd[0] always has the remote + memory mapped region associated with it, and is used to make requests + to that peer. epd[1] exposes our local shared memory, and is used + to respond to remote requests. 
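+     A sketch of the resulting topology for two peers A and B on different
+     SCIF nodes (illustrative only; both sides run this same setup path):
+
+         A --scif_connect--> B's listener   => A's epd[0], B's epd[1]
+         B --scif_connect--> A's listener   => B's epd[0], A's epd[1]
+
+     so each side ends up with one outbound endpoint it issues requests on
+     and one accepted endpoint it serves remote requests on.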
*/ + ep->amsh_qdir[peeridx].amsh_epd[0] = epd; + + am_update_directory(ptl, peeridx); + + _IPATH_VDBG("shmidx %d connected! set peeridx %d amsh_shmidx %d epd %d\n", + ep->amsh_shmidx, peeridx, + ep->amsh_qdir[peeridx].amsh_shmidx, + ep->amsh_qdir[peeridx].amsh_epd[0]); + return err; +} + +static +psm_error_t +amsh_scif_detach(psm_ep_t ep) +{ + int i; + int size = am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES; + + /* do the rest scif cleanup work */ + for (i = 0; i < ep->scif_nnodes*PTL_AMSH_MAX_LOCAL_PROCS; i++) { + if (ep->amsh_qdir[i].amsh_epd[0] == 0) continue; + + if(i >= PTL_AMSH_MAX_LOCAL_PROCS) { + if(scif_munmap(ep->amsh_qdir[i].amsh_base, size)) { + _IPATH_INFO("SCIF: unmapping addr %p length %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_base, size, + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + ep->amsh_qdir[i].amsh_base = NULL; + } + + if(scif_close(ep->amsh_qdir[i].amsh_epd[0])) { + _IPATH_INFO("SCIF: closing epd[0] %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_epd[0], + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + if(scif_close(ep->amsh_qdir[i].amsh_epd[1])) { + _IPATH_INFO("SCIF: closing epd[1] %d failed: (%d) %s\n", + ep->amsh_qdir[i].amsh_epd[1], + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + ep->amsh_qdir[i].amsh_epd[0] = 0; + ep->amsh_qdir[i].amsh_epd[1] = 0; + } + + /* The accept thread will detect that the listen socket has been closed + and will shut down gracefully. */ + if(scif_close(ep->scif_epd)) { + _IPATH_INFO("SCIF: closing listen epd %d failed: (%d) %s\n", + ep->scif_epd, + errno, strerror(errno)); + return PSM_INTERNAL_ERR; + } + + pthread_join(ep->scif_thread, NULL); + + return PSM_OK; +} + +#endif //PSM_HAVE_SCIF + +#define PTL_OP_CONNECT 0 +#define PTL_OP_DISCONNECT 1 +#define PTL_OP_ABORT 2 + +static +psm_error_t +amsh_ep_connreq_init(ptl_t *ptl, + int op, /* connect, disconnect or abort */ + int numep, + const psm_epid_t *array_of_epid, /* non-NULL on connect */ + const int array_of_epid_mask[], + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + struct ptl_connection_req **req_o) +{ + int i, cstate; + psm_epaddr_t epaddr; + psm_epid_t epid; + struct ptl_connection_req *req = NULL; + + req = (struct ptl_connection_req *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, + sizeof(struct ptl_connection_req)); + if (req == NULL) + return PSM_NO_MEMORY; + + req->isdone = 0; + req->op = op; + req->numep = numep; + req->numep_left = 0; + req->phase = ptl->connect_phase; + req->epid_mask = (int *) + psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, numep, sizeof(int)); + + if (req->epid_mask == NULL) { + psmi_free(req); + return PSM_NO_MEMORY; + } + + req->epaddr = array_of_epaddr; + req->epids = array_of_epid; + req->errors = array_of_errors; + + /* First check if there's really something to connect/disconnect + * for this PTL */ + for (i = 0; i < numep; i++) { + req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ + if (!array_of_epid_mask[i]) + continue; + if (op == PTL_OP_CONNECT) { + epid = array_of_epid[i]; + if (!amsh_epid_reachable(ptl, epid)) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + _IPATH_VDBG("looking at epid %llx\n", (unsigned long long) epid); + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr != NULL) { + if (epaddr->ptl != ptl) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + array_of_epaddr[i] = NULL; + continue; + } + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_ESTABLISHED) { + array_of_epaddr[i] = 
epaddr; + array_of_errors[i] = PSM_OK; + } + else { + psmi_assert(cstate == AMSH_CSTATE_TO_NONE); + array_of_errors[i] = PSM_TIMEOUT; + array_of_epaddr[i] = epaddr; + req->epid_mask[i] = AMSH_CMASK_PREREQ; + } + } + else { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + array_of_epaddr[i] = NULL; + +#ifdef PSM_HAVE_SCIF + psm_error_t err = amsh_scif_setup(ptl, req->epids[i]); + if(err != PSM_OK) { + psmi_free(req->epid_mask); + psmi_free(req); + return err; + } +#endif + } + } + else { /* disc or abort */ + epaddr = array_of_epaddr[i]; + psmi_assert(epaddr != NULL); + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_ESTABLISHED) { + req->epid_mask[i] = AMSH_CMASK_PREREQ; + _IPATH_VDBG("Just set index %d to AMSH_CMASK_PREREQ\n", i); + } + /* XXX undef ? */ + } + if (req->epid_mask[i] != AMSH_CMASK_NONE) + req->numep_left++; + } + + if (req->numep_left == 0) { /* nothing to do */ + psmi_free(req->epid_mask); + psmi_free(req); + _IPATH_VDBG("Nothing to connect, bump up phase\n"); + ptl->connect_phase++; + *req_o = NULL; + return PSM_OK; + } + else { + *req_o = req; + return PSM_OK_NO_PROGRESS; + } +} + +static +psm_error_t +amsh_ep_connreq_poll(ptl_t *ptl, struct ptl_connection_req *req) +{ + int i, j, cstate, shmidx; + psm_error_t err = PSM_OK; + psm_epid_t epid; + psm_epaddr_t epaddr; + + if (req == NULL || req->isdone) + return PSM_OK; + + psmi_assert_always(ptl->ep->amsh_dirpage != NULL); + psmi_assert_always(ptl->connect_phase == req->phase); + + if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE || + req->epid_mask[i] == AMSH_CMASK_DONE) + continue; + + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + int shmidx = epaddr->_shmidx; +#ifdef PSM_HAVE_SCIF + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { /* not remote nodes */ +#endif + /* Make sure the target of the disconnect is still there */ + pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + if (ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] != epaddr->epid) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + AMSH_CSTATE_TO_SET(epaddr, NONE); + } + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); +#ifdef PSM_HAVE_SCIF + } +#endif + } + + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) { + req->args[0].u32w0 = PSMI_AM_DISC_REQ; + req->args[0].u32w1 = ptl->connect_phase; + req->args[1].u64w0 = (uint64_t) ptl->epid; + req->args[2].u32w0 = PSMI_VERNO; + req->args[2].u32w1 = PSM_OK; + req->args[3].u64w0 = (uint64_t)(uintptr_t)&req->errors[i]; + psmi_amsh_short_request(ptl, epaddr, + amsh_conn_handler_hidx, + req->args, 4, NULL, 0, 0); + req->epid_mask[i] = AMSH_CMASK_POSTREQ; + } + else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_DISC_REPLIED) { + req->numep_left--; + req->epid_mask[i] = AMSH_CMASK_DONE; + AMSH_CSTATE_TO_SET(epaddr, NONE); + } + } + } + } + else { + /* First see if we've made progress on any postreqs */ + int n_prereq = 0; + for (i = 0; i < req->numep; i++) { + int cstate; + if (req->epid_mask[i] != AMSH_CMASK_POSTREQ) { + if (req->epid_mask[i] == AMSH_CMASK_PREREQ) + n_prereq++; + continue; + } + epaddr = req->epaddr[i]; + psmi_assert(epaddr != NULL); + cstate = AMSH_CSTATE_TO_GET(epaddr); + if (cstate == AMSH_CSTATE_TO_REPLIED) { + req->numep_left--; + AMSH_CSTATE_TO_SET(epaddr, ESTABLISHED); + req->epid_mask[i] = AMSH_CMASK_DONE; + 
continue;
+            }
+        }
+        if (n_prereq > 0) {
+            char buf[32];
+            uint16_t their_verno;
+
+            psmi_assert(req->numep_left > 0);
+            /* Go through the list of peers we need to connect to and find
+             * out whether each shared ep is mapped into shm */
+            pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+            for (i = 0; i < req->numep; i++) {
+                if (req->epid_mask[i] != AMSH_CMASK_PREREQ)
+                    continue;
+                epid = req->epids[i];
+                epaddr = req->epaddr[i];
+
+#ifdef PSM_HAVE_SCIF
+                /* Get the peer node-ID and scif port # from epid */
+                int nodeid = (int)((epid>>48)&0xff);
+                if (nodeid != ptl->ep->scif_mynodeid) {
+                    int peeridx = (int)((epid>>56)&0xff);
+
+                    //Don't use a loop, compute the shmidx directly.
+                    if(nodeid < ptl->ep->scif_mynodeid) {
+                        shmidx = (nodeid + 1) * PTL_AMSH_MAX_LOCAL_PROCS + peeridx;
+                    } else {
+                        shmidx = nodeid * PTL_AMSH_MAX_LOCAL_PROCS + peeridx;
+                    }
+
+                    psmi_assert(shmidx >= PTL_AMSH_MAX_LOCAL_PROCS);
+                    their_verno = ptl->ep->amsh_qdir[shmidx].amsh_verno;
+                } else
+#endif
+                {
+                    /* Go through mapped epids and find the epid we're looking for */
+                    for (shmidx = -1, j = 0; j <= ptl->ep->amsh_dirpage->max_idx; j++) {
+                        /* epid is connected and ready to go */
+                        if (ptl->ep->amsh_dirpage->shmidx_map_epid[j] == epid) {
+                            shmidx = j;
+                            break;
+                        }
+                    }
+
+                    if (shmidx == -1) /* couldn't find epid, go to next */
+                        continue;
+                    their_verno = ptl->ep->amsh_dirpage->psm_verno[shmidx];
+                }
+
+                /* Before we even send the request out, check to see if
+                 * versions are interoperable */
+                if (!psmi_verno_isinteroperable(their_verno)) {
+                    snprintf(buf, sizeof buf, "%d.%d",
+                             PSMI_VERNO_GET_MAJOR(their_verno),
+                             PSMI_VERNO_GET_MINOR(their_verno));
+
+                    _IPATH_INFO(
+                        "Local endpoint id %" PRIx64 " has version %s "
+                        "which is not supported by library version %d.%d",
+                        epid, buf, PSM_VERNO_MAJOR, PSM_VERNO_MINOR);
+                    req->errors[i] = PSM_EPID_INVALID_VERSION;
+                    req->numep_left--;
+                    req->epid_mask[i] = AMSH_CMASK_DONE;
+                    continue;
+                }
+
+                if (epaddr != NULL) {
+                    psmi_assert(epaddr->_shmidx == shmidx);
+                }
+                else if ((epaddr = psmi_epid_lookup(ptl->ep, epid)) == NULL) {
+                    if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr))) {
+                        pthread_mutex_unlock(
+                            (pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+                        return err;
+                    }
+                }
+
+                req->epaddr[i] = epaddr;
+                req->args[0].u32w0 = PSMI_AM_CONN_REQ;
+                req->args[0].u32w1 = ptl->connect_phase;
+                req->args[1].u64w0 = (uint64_t) ptl->epid;
+                req->args[2].u32w0 = PSMI_VERNO;
+                req->args[2].u32w1 = PSM_OK;
+                req->args[3].u64w0 = (uint64_t)(uintptr_t)&req->errors[i];
+                req->epid_mask[i] = AMSH_CMASK_POSTREQ;
+                psmi_amsh_short_request(ptl, epaddr, amsh_conn_handler_hidx,
+                                        req->args, 4, NULL, 0, 0);
+                _IPATH_PRDBG("epaddr=%p, epid=%" PRIx64 " at shmidx=%d\n",
+                             epaddr, epid, shmidx);
+            }
+            pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock));
+        }
+    }
+
+    if (req->numep_left == 0) { /* we're all done */
+        req->isdone = 1;
+        return PSM_OK;
+    }
+    else {
+        sched_yield();
+        return PSM_OK_NO_PROGRESS;
+    }
+}
+
+static
+psm_error_t
+amsh_ep_connreq_fini(ptl_t *ptl, struct ptl_connection_req *req)
+{
+    psm_error_t err = PSM_OK;
+    int i;
+
+    /* Wherever we are in the connect process, we've been instructed to
+     * finish the connection process */
+    if (req == NULL)
+        return PSM_OK;
+
+    /* This prevents future connect replies from referencing data structures
+     * that disappeared */
+    ptl->connect_phase++;
+
+    /* First process any leftovers in postreq or prereq */
+    for (i = 0; i < req->numep; i++) {
+        if (req->epid_mask[i] == AMSH_CMASK_NONE)
+            continue;
+        else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) {
+            int cstate;
+            req->epid_mask[i] = AMSH_CMASK_DONE;
+            cstate = AMSH_CSTATE_TO_GET(req->epaddr[i]);
+            if (cstate == AMSH_CSTATE_TO_REPLIED) {
+                req->numep_left--;
+                AMSH_CSTATE_TO_SET(req->epaddr[i], ESTABLISHED);
+            }
+            else { /* never actually got reply */
+                req->errors[i] = PSM_TIMEOUT;
+            }
+        }
+        /* If we couldn't go from prereq to postreq, that means we couldn't
+         * find the shmidx for an epid in time.  This can only be a case of
+         * time out */
+        else if (req->epid_mask[i] == AMSH_CMASK_PREREQ) {
+            req->errors[i] = PSM_TIMEOUT;
+            req->numep_left--;
+            req->epaddr[i] = NULL;
+            req->epid_mask[i] = AMSH_CMASK_DONE;
+        }
+    }
+
+    /* Whatever is left can only be in DONE or NONE state */
+    for (i = 0; i < req->numep; i++) {
+        if (req->epid_mask[i] == AMSH_CMASK_NONE)
+            continue;
+        psmi_assert(req->epid_mask[i] == AMSH_CMASK_DONE);
+
+        err = psmi_error_cmp(err, req->errors[i]);
+        /* Report errors in connection. */
+        /* XXX de-alloc epaddr */
+    }
+
+    psmi_free(req->epid_mask);
+    psmi_free(req);
+
+    return err;
+}
+
+/* Wrapper for 2.0's use of connect/disconnect.  The plan is to move the
+ * init/poll/fini interface up to the PTL level for 2.2 */
+#define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20
+static
+psm_error_t
+amsh_ep_connreq_wrap(ptl_t *ptl, int op,
+                     int numep,
+                     const psm_epid_t *array_of_epid,
+                     const int array_of_epid_mask[],
+                     psm_error_t *array_of_errors,
+                     psm_epaddr_t *array_of_epaddr,
+                     uint64_t timeout_ns)
+{
+    psm_error_t err;
+    uint64_t t_start;
+    struct ptl_connection_req *req = NULL;
+    int num_polls_noprogress = 0;
+    static int shm_polite_attach = -1;
+
+    if (shm_polite_attach == -1) {
+        char *p = getenv("PSM_SHM_POLITE_ATTACH");
+        if (p && *p && atoi(p) != 0) {
+            fprintf(stderr, "%s: Using Polite SHM segment attach\n",
+                    psmi_gethostname());
+            shm_polite_attach = 1;
+        }
+        else
+            shm_polite_attach = 0;
+    }
+
+    /* Initialize */
+    err = amsh_ep_connreq_init(ptl, op, numep,
+              array_of_epid, array_of_epid_mask, array_of_errors,
+              array_of_epaddr, &req);
+    if (err != PSM_OK_NO_PROGRESS) /* Either we're all done with connect or
+                                    * there was an error */
+        return err;
+
+    /* Poll until either
+     * 1. We time out
+     * 2. We are done with connecting
+     */
+    t_start = get_cycles();
+    do {
+        psmi_poll_internal(ptl->ep, 1);
+        err = amsh_ep_connreq_poll(ptl, req);
+        if (err == PSM_OK)
+            break; /* Finished before timeout */
+        else if (err != PSM_OK_NO_PROGRESS) {
+            psmi_free(req->epid_mask);
+            psmi_free(req);
+            goto fail;
+        } else if (shm_polite_attach &&
+                   ++num_polls_noprogress == CONNREQ_ZERO_POLLS_BEFORE_YIELD) {
+            num_polls_noprogress = 0;
+            PSMI_PYIELD();
+        }
+    }
+    while (psmi_cycles_left(t_start, timeout_ns));
+
+    err = amsh_ep_connreq_fini(ptl, req);
+
+    /* Ensure that both sides of all connections are established before
+       returning.  This prevents MPI-level deadlocks where one rank returns
+       from here before responding to another rank's handshake and enters a
+       barrier (which does not poll PSM).  That other rank stays in PSM, never
+       receiving the handshake, and never entering the barrier: deadlock. */
+    /* This is fixed by Intel MPI 5.0.
*/ +#if 0 + if(op == PTL_OP_CONNECT) { + while(ptl->connect_to > ptl->connect_from) { + psmi_poll_internal(ptl->ep, 1); + } + } else { //ABORT or DISCONNECT + while(ptl->connect_to < ptl->connect_from) { + psmi_poll_internal(ptl->ep, 1); + } + } +#endif + +fail: + return err; +} + +static +psm_error_t +amsh_ep_connect(ptl_t *ptl, + int numep, + const psm_epid_t *array_of_epid, + const int array_of_epid_mask[], + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid, + array_of_epid_mask, array_of_errors, + array_of_epaddr, timeout_ns); +} + +static +psm_error_t +amsh_ep_disconnect(ptl_t *ptl, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_ns) +{ + return amsh_ep_connreq_wrap(ptl, force ? PTL_OP_ABORT : PTL_OP_DISCONNECT, + numep, NULL, array_of_epaddr_mask, array_of_errors, + (psm_epaddr_t *) array_of_epaddr, timeout_ns); +} + +/* am_ctl_getslot_remote_inner works just like am_ctl_getslot_pkt_inner, but + instead of using the tail/lock in the shq, use a separate per-domain + tail/lock. The queue is actually located on a remote node, but tailinfo + is located on the local node (and shared by peers on the same node) */ +static +am_pkt_short_t* +am_ctl_getslot_pkt_inner(struct amsh_qtail_info* tailinfo, + volatile am_ctl_qhdr_t *shq, + am_pkt_short_t *pkt0) +{ + am_pkt_short_t* pkt; + uint32_t idx; + + /* Acquire a slot/packet in the remote queue. */ + pthread_spin_lock(&tailinfo->lock); + idx = tailinfo->tail; + + /* Careful here -- pkt is pointing to memory on a remote node, so any + accesses will be expensive over PCIE. */ + pkt = (void*)((uintptr_t)pkt0 + idx * shq->elem_sz); + if(pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + + tailinfo->tail += 1; + if(tailinfo->tail == shq->elem_cnt) { + tailinfo->tail = 0; + } + } else { + pkt = NULL; + } + pthread_spin_unlock(&tailinfo->lock); + + return pkt; +} + +/* AWF - leaving this code for now. With the addition of SCIF/symmetric + support, all communication uses the 'remote' path. */ +#if 0 +#undef CSWAP +/* AWF - cswap appears to be broken.. fix? */ +PSMI_ALWAYS_INLINE( +int32_t +cswap(volatile uint32_t *p, uint32_t old_value, uint32_t new_value)) +{ + asm volatile ("lock cmpxchg %2, %0" : + "+m" (*p), "+a" (old_value) : + "r" (new_value) : + "memory"); + return old_value; +} + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt_inner(volatile am_ctl_qhdr_t *shq, am_pkt_short_t *pkt0) +) +{ + am_pkt_short_t *pkt; + uint32_t idx; +#ifndef CSWAP + pthread_spin_lock(&shq->lock); + idx = shq->tail; + pkt = (am_pkt_short_t *)((uintptr_t) pkt0 + idx * shq->elem_sz); + if (pkt->flag == QFREE) { + ips_sync_reads(); + pkt->flag = QUSED; + shq->tail += 1; + if (shq->tail == shq->elem_cnt) + shq->tail = 0; + } else { + pkt = NULL; + } + pthread_spin_unlock(&shq->lock); +#else + uint32_t idx_next; + do { + idx = shq->tail; + idx_next = (idx+1 == shq->elem_cnt) ? 0 : idx+1; + } while (cswap(&shq->tail, idx, idx_next) != idx); + + pkt = (am_pkt_short_t *)((uintptr_t) pkt0 + idx * shq->elem_sz); + //AWF - why is another cswap needed here? we already have the packet.. + //We'll wait until the packet goes from QUSED -> QFREE + // And as soon as it does, toggle it back to QUSED. 
+ while (cswap(&pkt->flag, QFREE, QUSED) != QFREE) + ; +#endif + return pkt; +} +#endif + +/* This is safe because 'flag' is at the same offset on both pkt and bulkpkt */ +#define am_ctl_getslot_bulkpkt_inner(shq,pkt0) ((am_pkt_bulk_t *) \ + am_ctl_getslot_pkt_inner(shq,(am_pkt_short_t *)(pkt0))) + +PSMI_ALWAYS_INLINE( +am_pkt_short_t * +am_ctl_getslot_pkt(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_short_t *pkt0; + + /* It's not obvious, but the packet acquisition code below is accessing + memory mapped remotely from a peer on another SCIF node. Thus we + have to make sure a SCIF connection to that peer is already + established. */ +#ifdef PSM_HAVE_SCIF + if(shmidx >= PTL_AMSH_MAX_LOCAL_PROCS && + ptl->ep->amsh_qdir[shmidx].amsh_epd[0] == 0) { + if(amsh_scif_setup(ptl, ptl->ep->amsh_qdir[shmidx].amsh_epid) + != PSM_OK) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "am_ctl_getslot_remote(): amsh_scif_setup failed"); + } + } +#endif + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoShort; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->shortq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoShort; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoShort; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->shortq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoShort; + } + + return am_ctl_getslot_pkt_inner(tailinfo, shq, pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_med(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoMed; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->medbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoMed; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoMed; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->medbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoMed; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_long(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoLong; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->longbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoLong; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoLong; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->longbulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoLong; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +PSMI_ALWAYS_INLINE( +am_pkt_bulk_t * +am_ctl_getslot_huge(ptl_t *ptl, int shmidx, int is_reply) +) +{ + struct amsh_qtail_info* tailinfo; + volatile am_ctl_qhdr_t *shq; + am_pkt_bulk_t *pkt0; + + if(!is_reply) { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].reqFifoHuge; + shq = &(ptl->ep->amsh_qdir[shmidx].qreqH->hugebulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qreqFifoHuge; + } else { + tailinfo = &ptl->ep->amsh_dirpage->qtails[shmidx].repFifoHuge; + shq = &(ptl->ep->amsh_qdir[shmidx].qrepH->hugebulkq); + pkt0 = ptl->ep->amsh_qdir[shmidx].qrepFifoHuge; + } + + return (am_pkt_bulk_t*)am_ctl_getslot_pkt_inner(tailinfo, + shq, (am_pkt_short_t*)pkt0); +} + +psmi_handlertab_t psmi_allhandlers[] = { + { 0 }, + { amsh_conn_handler }, + { 
psmi_am_mq_handler },
+    { psmi_am_mq_handler_data },
+    { psmi_am_mq_handler_rtsmatch },
+    { psmi_am_mq_handler_rtsdone },
+    { psmi_am_handler }
+};
+
+PSMI_ALWAYS_INLINE(
+void
+advance_head(volatile am_ctl_qshort_cache_t *hdr))
+{
+    QMARKFREE(hdr->head);
+    hdr->head++;
+    if (hdr->head == hdr->end)
+        hdr->head = hdr->base;
+}
+
+#define AMSH_ZERO_POLLS_BEFORE_YIELD 64
+#define AMSH_POLLS_BEFORE_PSM_POLL 16
+
+/* XXX this can be made faster.  Instead of checking the flag of the head, keep
+ * a cached copy of the integer value of the tail and compare it against the
+ * previous one we saw.
+ * AWF this trick won't work across nodes, since the receiver doesn't have
+ * access to the tail value.
+ */
+
+PSMI_ALWAYS_INLINE(
+psm_error_t
+amsh_poll_internal_inner(ptl_t *ptl, int replyonly, int is_internal))
+{
+    psm_error_t err = PSM_OK_NO_PROGRESS;
+
+    /* poll replies */
+#ifdef PSM_HAVE_SCIF
+    int node;
+    int nnodes = ptl->ep->scif_nnodes;
+
+    for(node = 0; node < nnodes; node++) {
+        if (!QISEMPTY(ptl->repH[node].head->flag)) {
+            do {
+                ips_sync_reads();
+                process_packet(ptl, (am_pkt_short_t *) ptl->repH[node].head, 0);
+                advance_head(&ptl->repH[node]);
+                err = PSM_OK;
+            } while (!QISEMPTY(ptl->repH[node].head->flag));
+        }
+    }
+#else
+    if (!QISEMPTY(ptl->repH[0].head->flag)) {
+        do {
+            ips_sync_reads();
+            process_packet(ptl, (am_pkt_short_t *) ptl->repH[0].head, 0);
+            advance_head(&ptl->repH[0]);
+            err = PSM_OK;
+        } while (!QISEMPTY(ptl->repH[0].head->flag));
+    }
+#endif
+
+    if (!replyonly) {
+        /* Request queue not enabled for 2.0; will be re-enabled to support
+         * long replies */
+        if (!is_internal && ptl->psmi_am_reqq_fifo.first != NULL) {
+            psmi_am_reqq_drain(ptl);
+            err = PSM_OK;
+        }
+
+#ifdef PSM_HAVE_SCIF
+        for(node = 0; node < nnodes; node++) {
+            if (!QISEMPTY(ptl->reqH[node].head->flag)) {
+                do {
+                    ips_sync_reads();
+                    process_packet(ptl,
+                                   (am_pkt_short_t *) ptl->reqH[node].head, 1);
+                    advance_head(&ptl->reqH[node]);
+                    err = PSM_OK;
+                } while (!QISEMPTY(ptl->reqH[node].head->flag));
+            }
+        }
+#else
+        if (!QISEMPTY(ptl->reqH[0].head->flag)) {
+            do {
+                ips_sync_reads();
+                process_packet(ptl,
+                               (am_pkt_short_t *) ptl->reqH[0].head, 1);
+                advance_head(&ptl->reqH[0]);
+                err = PSM_OK;
+            } while (!QISEMPTY(ptl->reqH[0].head->flag));
+        }
+#endif
+    }
+
+    if (is_internal) {
+        if (err == PSM_OK) /* some progress, no yields */
+            ptl->zero_polls = 0;
+        else if (++ptl->zero_polls == AMSH_ZERO_POLLS_BEFORE_YIELD) {
+            /* no progress for AMSH_ZERO_POLLS_BEFORE_YIELD */
+            sched_yield();
+            ptl->zero_polls = 0;
+        }
+
+        if (++ptl->amsh_only_polls == AMSH_POLLS_BEFORE_PSM_POLL) {
+            psmi_poll_internal(ptl->ep, 0);
+            ptl->amsh_only_polls = 0;
+        }
+    }
+    return err; /* if we actually did something */
+}
+
+/* non-inlined version */
+static
+psm_error_t
+amsh_poll_internal(ptl_t *ptl, int replyonly)
+{
+    return amsh_poll_internal_inner(ptl, replyonly, 1);
+}
+
+#ifdef PSM_PROFILE
+  #define AMSH_POLL_UNTIL(ptl,isreply,cond) do {      \
+        PSMI_PROFILE_BLOCK();                         \
+        while (!(cond)) {                             \
+            PSMI_PROFILE_REBLOCK(                     \
+                amsh_poll_internal(ptl,isreply) ==    \
+                    PSM_OK_NO_PROGRESS);              \
+        }                                             \
+        PSMI_PROFILE_UNBLOCK();                       \
+  } while (0)
+#else
+  #define AMSH_POLL_UNTIL(ptl,isreply,cond) do {      \
+        while (!(cond)) {                             \
+            amsh_poll_internal(ptl,isreply);          \
+        }                                             \
+  } while (0)
+#endif
+
+static
+psm_error_t
+amsh_poll(ptl_t *ptl, int replyonly)
+{
+    return amsh_poll_internal_inner(ptl, replyonly, 0);
+}
+
+PSMI_ALWAYS_INLINE(
+void
+am_send_pkt_short(ptl_t *ptl, uint32_t destidx, uint32_t bulkidx,
+                  uint16_t fmt, uint16_t nargs, uint16_t
handleridx, + psm_amarg_t *args, const void *src, uint32_t len, int isreply)) +{ + int i; + volatile am_pkt_short_t *pkt; + + AMSH_POLL_UNTIL(ptl, isreply, + (pkt = am_ctl_getslot_pkt(ptl, destidx, isreply)) != NULL); + +#ifdef __MIC__ + /* On MIC, a local copy of the packet struct should be filled in, then + copied using one vector operation. MIC does not have write combining, + and the acquired packet is in remote (via PCIE) memory, so filling in + each struct member will cause a separate PCIE transaction. Using a + single vector write reduces latency. */ + am_pkt_short_t lcl_pkt; /* Local version of packet data */ + + lcl_pkt.bulkidx = bulkidx; + lcl_pkt.shmidx = ptl->ep->amsh_qdir[destidx].amsh_shmidx; + lcl_pkt.type = fmt; + lcl_pkt.nargs = nargs; + lcl_pkt.handleridx = handleridx; + + for (i = 0; i < nargs; i++) + lcl_pkt.args[i] = args[i]; + + if (fmt == AMFMT_SHORT_INLINE) + mq_copy_tiny((uint32_t *) &lcl_pkt.args[nargs], (uint32_t *) src, len); + + /* Skip the memory fences in QMARKREADY; not necessary here. */ + //QMARKREADY(lcl_pkt); + lcl_pkt.flag = QREADY; + + /* Now copy the local packet data to the remote packet. */ + memcpy((void*)pkt, &lcl_pkt, sizeof(am_pkt_short_t)); + +#else + /* got a free pkt... fill it in */ + pkt->bulkidx = bulkidx; + pkt->shmidx = ptl->ep->amsh_qdir[destidx].amsh_shmidx; + pkt->type = fmt; + pkt->nargs = nargs; + pkt->handleridx = handleridx; + + for (i = 0; i < nargs; i++) + pkt->args[i] = args[i]; + + if (fmt == AMFMT_SHORT_INLINE) + mq_copy_tiny((uint32_t *) &pkt->args[nargs], (uint32_t *) src, len); + + QMARKREADY(pkt); +#endif +} + +/* It's probably unlikely that the alloca below is problematic, but + * in case we think it is, define the next to 1 + */ +#define ALLOCA_AS_SCRATCH 0 + +#if ALLOCA_AS_SCRATCH +static char amsh_medscratch[AMMED_SZ]; +#endif + +#ifdef __MIC__ +#define amsh_shm_copy_short memcpy +#define amsh_shm_copy_long memcpy +#define amsh_shm_copy_huge psmi_memcpyo +#else +#define amsh_shm_copy_short psmi_mq_mtucpy +#define amsh_shm_copy_long psmi_mq_mtucpy +#define amsh_shm_copy_huge psmi_memcpyo +#endif + +PSMI_ALWAYS_INLINE( +int +psmi_amsh_generic_inner(uint32_t amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags)) +{ + uint16_t type; + uint32_t bulkidx; + uint16_t hidx = (uint16_t) handler; + int destidx = epaddr->_shmidx; + int is_reply = AM_IS_REPLY(amtype); + volatile am_pkt_bulk_t *bulkpkt; + + _IPATH_VDBG("%s epaddr=%s, shmidx=%d, type=%d LOOPBACK=%s\n", + is_reply ? "reply" : "request", + psmi_epaddr_get_name(epaddr->epid), epaddr->_shmidx, amtype, + ptl->epaddr == epaddr ? 
"YES" : "NO"); + if (ptl->epaddr == epaddr) { /* loopback */ + amsh_am_token_t tok; + void *bufa; + + tok.tok.epaddr_from = epaddr; + tok.ptl = ptl; + tok.mq = ptl->ep->mq; + tok.shmidx = ptl->shmidx; + if (len > 0) { + if (AM_IS_LONG(amtype)) + bufa = dst; + else { + psmi_assert_always(len <= AMMED_SZ); +#if ALLOCA_AS_SCRATCH + bufa = (void *) amsh_medscratch; +#else + bufa = alloca(len); +#endif + } + psmi_assert(bufa != NULL); + amsh_shm_copy_short((void *) bufa, src, len); + } + else + bufa = NULL; + psmi_handler_fn_t fn = + (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + fn(&tok, args, nargs, bufa, len); + + return 1; + } + + switch (amtype) { + case AMREQUEST_SHORT: + case AMREPLY_SHORT: + if (len + (nargs<<3) <= (NSHORT_ARGS<<3)) { + /* Payload fits in args packet */ + type = AMFMT_SHORT_INLINE; + bulkidx = len; + } + else { + psmi_assert(len < amsh_qelemsz.qreqFifoMed); + psmi_assert(src != NULL); + type = AMFMT_SHORT; +#if 1 + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_med(ptl, destidx, is_reply)) != NULL); +#else + /* This version exposes a compiler bug */ + while (1) { + bulkpkt = am_ctl_getslot_med(ptl, destidx, is_reply); + if (bulkpkt == NULL) + break; + amsh_poll_internal(ptl, is_reply); + } +#endif + bulkidx = bulkpkt->idx; + bulkpkt->len = len; + _IPATH_VDBG("bulkpkt %p flag is %d from idx %d\n", + bulkpkt, bulkpkt->flag, destidx); + amsh_shm_copy_short((void*) bulkpkt->payload, src, (uint32_t) len); + QMARKREADY(bulkpkt); + } + am_send_pkt_short(ptl, destidx, bulkidx, type, nargs, hidx, + args, src, len, is_reply); + break; + + case AMREQUEST_LONG: + case AMREPLY_LONG: + { + uint32_t bytes_left = len; + uint8_t *src_this = (uint8_t *) src; + uint8_t *dst_this = (uint8_t *) dst; + uint32_t bytes_this; + uint32_t mtu_this; + type = (bytes_left >= AMSH_HUGE_BYTES ? AMFMT_HUGE : AMFMT_LONG); + /* XXX put in my shm block */ + int destidx_l = AMSH_BULK_PUSH ? destidx : ptl->shmidx; + + if (type == AMFMT_HUGE) + mtu_this = is_reply ? amsh_qpkt_max.qrepFifoHuge : + amsh_qpkt_max.qreqFifoHuge; + else + mtu_this = is_reply ? amsh_qpkt_max.qrepFifoLong : + amsh_qpkt_max.qreqFifoLong; + + _IPATH_VDBG("[long][%s] src=%p,dest=%p,len=%d,hidx=%d\n", + is_reply ? 
"rep" : "req", src, dst, (uint32_t)len, hidx); + + while (bytes_left) { + if (type == AMFMT_HUGE) { + bytes_this = min(bytes_left, mtu_this); + + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_huge(ptl, destidx_l, is_reply)) != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_HUGE_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_huge((void *) bulkpkt->payload, + src_this, bytes_this); + } + else { + bytes_this = min(bytes_left, mtu_this); + AMSH_POLL_UNTIL(ptl, is_reply, + (bulkpkt = am_ctl_getslot_long(ptl, destidx_l, is_reply)) != NULL); + bytes_left -= bytes_this; + if (bytes_left == 0) + type = AMFMT_LONG_END; + bulkidx = bulkpkt->idx; + amsh_shm_copy_long((void *) bulkpkt->payload, src_this, + bytes_this); + + } + + bulkpkt->dest = (uintptr_t) dst; + bulkpkt->dest_off = + (uint32_t)((uintptr_t)dst_this - (uintptr_t)dst); + bulkpkt->len = bytes_this; + QMARKREADY(bulkpkt); + + am_send_pkt_short(ptl, destidx, bulkidx, type, nargs, + hidx, args, NULL, 0, is_reply); + src_this += bytes_this; + dst_this += bytes_this; + } + break; + } + default: + break; + } + return 1; +} + +/* A generic version that's not inlined */ +int +psmi_amsh_generic(uint32_t amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dst, int flags) +{ + return psmi_amsh_generic_inner(amtype,ptl,epaddr,handler,args,nargs,src,len, + dst,flags); +} + +int +psmi_amsh_short_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_SHORT, ptl, epaddr, handler, args, nargs, + src, len, NULL, flags); +} + +int +psmi_amsh_long_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + return psmi_amsh_generic_inner(AMREQUEST_LONG, ptl, epaddr, handler, args, nargs, + src, len, dest, flags); +} + +void +psmi_amsh_short_reply(amsh_am_token_t *tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_SHORT, tok->ptl, tok->tok.epaddr_from, + handler, args, nargs, src, len, NULL, flags); + return; +} + +void +psmi_amsh_long_reply(amsh_am_token_t *tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + const void *src, size_t len, void *dest, int flags) +{ + psmi_amsh_generic_inner(AMREPLY_LONG, tok->ptl, tok->tok.epaddr_from, + handler, args, nargs, src, len, dest, flags); + return; +} + +void +psmi_am_reqq_init(ptl_t *ptl) +{ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; +} + +psm_error_t +psmi_am_reqq_drain(ptl_t *ptl) +{ + am_reqq_t *reqn = ptl->psmi_am_reqq_fifo.first; + am_reqq_t *req; + psm_error_t err = PSM_OK_NO_PROGRESS; + + /* We're going to process the entire list, and running the generic handler + * below can cause other requests to be enqueued in the queue that we're + * processing. 
*/ + ptl->psmi_am_reqq_fifo.first = NULL; + ptl->psmi_am_reqq_fifo.lastp = &ptl->psmi_am_reqq_fifo.first; + + while ((req = reqn) != NULL) { + err = PSM_OK; + reqn = req->next; + _IPATH_VDBG("push of reqq=%p epaddr=%s localreq=%p remotereq=%p\n", req, + psmi_epaddr_get_hostname(req->epaddr->epid), + (void *) (uintptr_t) req->args[1].u64w0, + (void *) (uintptr_t) req->args[0].u64w0); + psmi_amsh_generic(req->amtype, req->ptl, req->epaddr, + req->handler, req->args, req->nargs, req->src, + req->len, req->dest, req->amflags); + if (req->flags & AM_FLAG_SRC_TEMP) + psmi_free(req->src); + psmi_free(req); + } + return err; +} + +void +psmi_am_reqq_add(int amtype, ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, void *dest, int amflags) +{ + int i; + int flags = 0; + am_reqq_t *nreq = + (am_reqq_t *) psmi_malloc(ptl->ep, UNDEFINED, sizeof(am_reqq_t)); + psmi_assert_always(nreq != NULL); + _IPATH_VDBG("alloc of reqq=%p, to epaddr=%s, ptr=%p, len=%d, " + "localreq=%p, remotereq=%p\n", nreq, + psmi_epaddr_get_hostname(epaddr->epid), dest, + (int)len, (void *) (uintptr_t) args[1].u64w0, + (void *) (uintptr_t) args[0].u64w0); + + psmi_assert(nargs <= 8); + nreq->next = NULL; + nreq->amtype = amtype; + nreq->ptl = ptl; + nreq->epaddr = epaddr; + nreq->handler = handler; + for (i = 0; i < nargs; i++) + nreq->args[i] = args[i]; + nreq->nargs = nargs; + if (AM_IS_LONG(amtype) && src != NULL && + len > 0 && !(amflags & AM_FLAG_SRC_ASYNC)) + { + abort(); + flags |= AM_FLAG_SRC_TEMP; + nreq->src = psmi_malloc(ptl->ep, UNDEFINED, len); + psmi_assert_always(nreq->src != NULL); /* XXX mem */ + amsh_shm_copy_short(nreq->src, src, len); + } + else + nreq->src = src; + nreq->len = len; + nreq->dest = dest; + nreq->amflags = amflags; + nreq->flags = flags; + + nreq->next = NULL; + *(ptl->psmi_am_reqq_fifo.lastp) = nreq; + ptl->psmi_am_reqq_fifo.lastp = &nreq->next; +} + +static +void +process_packet(ptl_t *ptl, am_pkt_short_t *pkt, int isreq) +{ + amsh_am_token_t tok; + psmi_handler_fn_t fn; + int shmidx = pkt->shmidx; + + tok.tok.epaddr_from = ptl->ep->amsh_qdir[shmidx].amsh_epaddr; + tok.ptl = ptl; + tok.mq = ptl->ep->mq; + tok.shmidx = shmidx; + + uint16_t hidx = (uint16_t) pkt->handleridx; + int myshmidx = ptl->shmidx; + int shmidx_l = AMSH_BULK_PUSH ? myshmidx : shmidx; + uint32_t bulkidx = pkt->bulkidx; + uintptr_t bulkptr; + am_pkt_bulk_t *bulkpkt; + + /* It is possible for packets to arrive (the initial ones for connection + establishment) before amsh_epid is set correctly. However this can only + happen for peers in the same node -- those connecting inter-node via + SCIF will always have their epid set first. Since our local nodeid is + encoded in the amsh_epid of all local proces at initialization time, + it can always be safely extracted here, even before the amsh_epid is + set to its proper value for a given peer. */ +#ifdef PSM_HAVE_SCIF + int nodeid = (int)((ptl->ep->amsh_qdir[shmidx].amsh_epid >> 48) & 0xff); +#else + const int nodeid = 0; +#endif + + fn = (psmi_handler_fn_t) psmi_allhandlers[hidx].fn; + psmi_assert(fn != NULL); + psmi_assert((uintptr_t) pkt > ptl->ep->amsh_blockbase); + + if (pkt->type == AMFMT_SHORT_INLINE) { + _IPATH_VDBG("%s inline flag=%d nargs=%d from_idx=%d pkt=%p hidx=%d\n", + isreq ? "request" : "reply", + pkt->flag, pkt->nargs, shmidx, pkt, hidx); + + fn(&tok, pkt->args, pkt->nargs, pkt->length > 0 ? 
+ (void *) &pkt->args[pkt->nargs] : NULL, pkt->length); + } + else { + int isend = 0; + switch (pkt->type) { + case AMFMT_SHORT: + if (isreq) { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[myshmidx].qptrs[nodeid].qreqFifoMed; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoMed; + } else { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[myshmidx].qptrs[nodeid].qrepFifoMed; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoMed; + } + break; + + case AMFMT_LONG_END: + isend = 1; + case AMFMT_LONG: + if (isreq) { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qreqFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + } + else { + bulkptr = (uintptr_t) + ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qrepFifoLong; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; + } + break; + + case AMFMT_HUGE_END: + isend = 1; + case AMFMT_HUGE: + if (isreq) { + bulkptr = (uintptr_t) ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qreqFifoHuge; + bulkptr += bulkidx * amsh_qelemsz.qreqFifoHuge; + } + else { + bulkptr = (uintptr_t) ptl->ep->amsh_qdir[shmidx_l].qptrs[nodeid].qrepFifoHuge; + bulkptr += bulkidx * amsh_qelemsz.qrepFifoHuge; + } + break; + default: + bulkptr = 0; + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled packet type 0x%x", pkt->type); + return; + } + + bulkpkt = (am_pkt_bulk_t *) bulkptr; + _IPATH_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " + "from_idx=%d pkt=%p/%p hidx=%d\n", + ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, + bulkpkt->flag, pkt->nargs, shmidx, pkt, bulkpkt, hidx); + psmi_assert(bulkpkt->flag == QREADY); + if (pkt->type == AMFMT_SHORT) { + fn(&tok, pkt->args, pkt->nargs, + (void *) bulkpkt->payload, bulkpkt->len); + QMARKFREE(bulkpkt); + } + else { + if (pkt->type == AMFMT_HUGE || pkt->type == AMFMT_HUGE_END) + amsh_shm_copy_huge((void *) (bulkpkt->dest + bulkpkt->dest_off), + bulkpkt->payload, bulkpkt->len); + else + amsh_shm_copy_long((void *) (bulkpkt->dest + bulkpkt->dest_off), + bulkpkt->payload, bulkpkt->len); + + /* If this is the last packet, copy args before running the + * handler */ + if (isend) { + psm_amarg_t args[8]; + int nargs = pkt->nargs; + int i; + void *dest = (void *) bulkpkt->dest; + size_t len = (size_t) (bulkpkt->dest_off + bulkpkt->len); + for (i = 0; i < nargs; i++) + args[i] = pkt->args[i]; + QMARKFREE(bulkpkt); + fn(&tok, args, nargs, dest, len); + } + else + QMARKFREE(bulkpkt); + } + } + return; +} + +static +psm_error_t +amsh_mq_rndv(ptl_t *ptl, psm_mq_t mq, psm_mq_req_t req, + psm_epaddr_t epaddr, uint64_t tag, const void *buf, uint32_t len) +{ + psm_amarg_t args[5] = {}; + psm_error_t err = PSM_OK; + + args[0].u32w0 = MQ_MSG_RTS; + args[0].u32w1 = len; + args[1].u64w0 = tag; + args[2].u64w0 = (uint64_t)(uintptr_t) req; + args[3].u64w0 = (uint64_t)(uintptr_t) buf; + + /* OK so we want to use SCIF DMA here if enabled. + First check: same node? Use existing local path. + */ + +#ifdef PSM_HAVE_SCIF + int shmidx = epaddr->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Intra-node: consider using kassist methods */ + if (ptl->ep->psmi_kassist_mode == PSMI_KASSIST_KNEM_GET) + /* If KNEM Get is active register region for peer to get from */ + args[4].u64w0 = knem_register_region((void*) buf, len, PSMI_FALSE); + else + args[4].u64w0 = 0; +#ifdef PSM_HAVE_SCIF + } else { + /* Inter-node: use SCIF DMA */ + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= len) { + /* Register the memory region with SCIF and pass the offset over. 
*/ + off_t offset; + + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[0]; + + err = scif_register_region(epd, (void*)buf, len, &offset); + if(err != PSM_OK) { + return err; + } + + args[4].u64w0 = offset; + } else { + args[4].u64w0 = 0; + } + } +#endif + + psmi_assert(req != NULL); + req->type = MQE_TYPE_SEND; + req->buf = (void *) buf; + req->buf_len = len; + req->send_msglen = len; + req->send_msgoff = 0; + + psmi_amsh_short_request(ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, 0); + + return err; +} + +/* + * All shared am mq sends, req can be NULL + */ +PSMI_ALWAYS_INLINE( +psm_error_t +amsh_mq_send_inner(psm_mq_t mq, psm_mq_req_t req, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, uint32_t len)) +{ + psm_amarg_t args[3] = {}; + psm_error_t err = PSM_OK; + int is_blocking = (req == NULL); + + if (!flags && len <= psmi_am_max_sizes.request_short) { + if (len <= 32) + args[0].u32w0 = MQ_MSG_TINY; + else + args[0].u32w0 = MQ_MSG_SHORT; + args[1].u64 = tag; + + psmi_amsh_short_request(epaddr->ptl, epaddr, mq_handler_hidx, args, 2, + ubuf, len, 0); + } + else if (flags & PSM_MQ_FLAG_SENDSYNC) + goto do_rendezvous; + else if (len <= mq->shm_thresh_rv) { + uint32_t bytes_left = len; + uint32_t bytes_this = min(bytes_left, psmi_am_max_sizes.request_short); + uint8_t *buf = (uint8_t *)ubuf; + args[0].u32w0 = MQ_MSG_LONG; + args[0].u32w1 = len; + args[1].u64 = tag; + psmi_amsh_short_request(epaddr->ptl, epaddr, mq_handler_hidx, args, 2, + buf, bytes_this, 0); + bytes_left -= bytes_this; + buf += bytes_this; + args[2].u32w0 = 0; + while (bytes_left) { + args[2].u32w0 += bytes_this; + bytes_this = min(bytes_left, psmi_am_max_sizes.request_short); + /* Here we kind of bend the rules, and assume that shared-memory + * active messages are delivered in order */ + psmi_amsh_short_request(epaddr->ptl, epaddr, + mq_handler_data_hidx, args, + 3, buf, bytes_this, 0); + buf += bytes_this; + bytes_left -= bytes_this; + } + } + else { +do_rendezvous: + if (is_blocking) { + req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf (req == NULL) + return PSM_NO_MEMORY; + req->send_msglen = len; + req->tag = tag; + } + err = amsh_mq_rndv(epaddr->ptl,mq,req,epaddr,tag,ubuf,len); + + if (err == PSM_OK && is_blocking) { /* wait... 
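+             for the rendezvous to complete; psmi_mq_wait_internal() keeps
+             polling the progress engine while this blocking send waits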
*/
+            err = psmi_mq_wait_internal(&req);
+        }
+        return err; /* skip eager accounting below */
+    }
+
+    /* All eager async sends are always "all done" */
+    if (req != NULL) {
+        req->state = MQ_STATE_COMPLETE;
+        mq_qq_append(&mq->completed_q, req);
+    }
+
+    mq->stats.tx_num++;
+    mq->stats.tx_shm_num++;
+    mq->stats.tx_eager_num++;
+    mq->stats.tx_eager_bytes += len;
+
+    return err;
+}
+
+static
+psm_error_t
+amsh_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+              uint64_t tag, const void *ubuf, uint32_t len, void *context,
+              psm_mq_req_t *req_o)
+{
+    psm_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+    if_pf (req == NULL)
+        return PSM_NO_MEMORY;
+
+    req->send_msglen = len;
+    req->tag = tag;
+    req->context = context;
+
+    _IPATH_VDBG("[ishrt][%s->%s][n=0][b=%p][l=%d][t=%"PRIx64"]\n",
+                psmi_epaddr_get_name(epaddr->ep->epid),
+                psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag);
+
+    amsh_mq_send_inner(mq, req, epaddr, flags, tag, ubuf, len);
+
+    *req_o = req;
+    return PSM_OK;
+}
+
+static
+psm_error_t
+amsh_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+             uint64_t tag, const void *ubuf, uint32_t len)
+{
+    amsh_mq_send_inner(mq, NULL, epaddr, flags, tag, ubuf, len);
+
+    _IPATH_VDBG("[shrt][%s->%s][n=0][b=%p][l=%d][t=%"PRIx64"]\n",
+                psmi_epaddr_get_name(epaddr->ep->epid),
+                psmi_epaddr_get_name(epaddr->epid), ubuf, len, tag);
+
+    return PSM_OK;
+}
+
+/* Kcopy-related handling */
+int
+psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr)
+{
+    int shmidx = epaddr->_shmidx;
+    return epaddr->ep->amsh_qdir[shmidx].kassist_pid;
+}
+
+static
+int
+psmi_kcopy_find_minor(int *minor)
+{
+    int i;
+    char path[128];
+
+    /* process-wide kcopy filedescriptor */
+    static int fd = -1;
+    static int kcopy_minor = -1;
+
+    if (fd >= 0) {
+        *minor = kcopy_minor;
+        return fd;
+    }
+
+    for (i = 0; i < 256; i++) {
+        snprintf(path, sizeof(path), "/dev/kcopy/%02d", i);
+        fd = open(path, O_WRONLY | O_EXCL);
+        if (fd >= 0) {
+            *minor = kcopy_minor = i;
+            break;
+        }
+    }
+
+    return fd;
+}
+
+static
+int
+psmi_kcopy_open_minor(int minor)
+{
+    char path[128];
+
+    /* process-wide kcopy filedescriptor */
+    static int fd = -1;
+    if (fd >= 0)
+        return fd;
+
+    if (minor >= 0 && minor < 256) {
+        snprintf(path, sizeof(path), "/dev/kcopy/%02d", minor);
+        fd = open(path, O_WRONLY);
+    }
+    return fd;
+}
+
+static
+const char *
+psmi_kassist_getmode(int mode)
+{
+    switch (mode) {
+        case PSMI_KASSIST_OFF:
+            return "kassist off";
+        case PSMI_KASSIST_KCOPY_PUT:
+            return "kcopy put";
+        case PSMI_KASSIST_KCOPY_GET:
+            return "kcopy get";
+        case PSMI_KASSIST_KNEM_GET:
+            return "knem get";
+        case PSMI_KASSIST_KNEM_PUT:
+            return "knem put";
+        default:
+            return "unknown";
+    }
+}
+
+static
+int
+psmi_get_kassist_mode()
+{
+    int mode = PSMI_KASSIST_MODE_DEFAULT;
+    union psmi_envvar_val env_kassist;
+
+    /* Preserve backward compatibility */
+    if (!psmi_getenv("PSM_SHM_KCOPY",
+                     "PSM Shared Memory use kcopy (put,get,none)",
+                     PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+                     (union psmi_envvar_val) "put",
+                     &env_kassist))
+    {
+        char *s = env_kassist.e_str;
+        if (strcasecmp(s, "put") == 0)
+            mode = PSMI_KASSIST_KCOPY_PUT;
+        else if (strcasecmp(s, "get") == 0)
+            mode = PSMI_KASSIST_KCOPY_GET;
+        else
+            mode = PSMI_KASSIST_OFF;
+    }
+    else if(!psmi_getenv("PSM_KASSIST_MODE",
+                         "PSM Shared memory kernel assist mode "
+                         "(knem-put, knem-get, kcopy-put, kcopy-get, none)",
+                         PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+                         (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING,
+                         &env_kassist))
+    {
+        char *s = env_kassist.e_str;
+        if (strcasecmp(s,
"kcopy-put") == 0) + mode = PSMI_KASSIST_KCOPY_PUT; + else if (strcasecmp(s, "kcopy-get") == 0) + mode = PSMI_KASSIST_KCOPY_GET; + else if (strcasecmp(s, "knem-put") == 0) + mode = PSMI_KASSIST_KNEM_PUT; + else if (strcasecmp(s, "knem-get") == 0) + mode = PSMI_KASSIST_KNEM_GET; + else + mode = PSMI_KASSIST_OFF; + +#if !defined(PSM_USE_KNEM) + if (mode & PSMI_KASSIST_KNEM) { + _IPATH_ERROR("KNEM kassist mode requested which has not been compiled " + "into this version of PSM. Switching kassist mode off.\n"); + mode = PSMI_KASSIST_OFF; + } +#endif + } + else { + +#if defined(PSM_USE_KNEM) + int res; + + /* KNEM is the preferred access mechanism if available. Else default to + * using KCOPY. + */ + res = access(KNEM_DEVICE_FILENAME, R_OK | W_OK); + if (res == 0) + mode = PSMI_KASSIST_KNEM_PUT; + else + mode = PSMI_KASSIST_KCOPY_PUT; +#else + mode = PSMI_KASSIST_KCOPY_PUT; +#endif + } + + return mode; +} + +#ifdef PSM_HAVE_SCIF +static int +psmi_get_scif_dma_mode() +{ + int mode = PSMI_SCIF_DMA_MODE_DEFAULT; + union psmi_envvar_val env_scif_dma; + + if(!psmi_getenv("PSM_SCIF_DMA_MODE", + "PSM Shared memory SCIF DMA transport mode " + "(scif-put, scif-get, none)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) PSMI_SCIF_DMA_MODE_DEFAULT_STRING, + &env_scif_dma)) + { + char *s = env_scif_dma.e_str; + if (strcasecmp(s, "scif-put") == 0) + mode = PSMI_SCIF_DMA_PUT; + else if (strcasecmp(s, "scif-get") == 0) + mode = PSMI_SCIF_DMA_GET; + else + mode = PSMI_SCIF_DMA_OFF; + } + + return mode; +} + +static int +psmi_get_scif_dma_threshold() +{ + int threshold = PSMI_MQ_RV_THRESH_SCIF_DMA; + union psmi_envvar_val env_scif_dma; + + if(!psmi_getenv("PSM_SCIF_DMA_THRESH", + "PSM SCIF DMA (rendezvous) switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) threshold, + &env_scif_dma)) { + threshold = env_scif_dma.e_uint; + } + + return threshold; +} + +static +const char * +psmi_scif_dma_getmode(int mode) +{ + switch (mode) { + case PSMI_SCIF_DMA_OFF: + return "SCIF DMA off"; + case PSMI_SCIF_DMA_PUT: + return "SCIF put"; + case PSMI_SCIF_DMA_GET: + return "SCIF get"; + default: + return "unknown"; + } +} +#endif // PSM_HAVE_SCIF + +/* Connection handling for shared memory AM. + * + * arg0 => conn_op, result (PSM error type) + * arg1 => epid (always) + * arg2 => version. + * arg3 => pointer to error for replies. + */ +static +void +amsh_conn_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + int op = args[0].u32w0; + int phase = args[0].u32w1; + psm_epid_t epid = args[1].u64w0; + psm_error_t err = (psm_error_t) args[2].u32w1; + psm_error_t *perr = (psm_error_t *) (uintptr_t) args[3].u64w0; + + psm_epaddr_t epaddr; + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + int shmidx = tok->shmidx; + int is_valid; + ptl_t *ptl = tok->ptl; + + /* We do this because it's an assumption below */ + psmi_assert_always(buf == NULL && len == 0); + + _IPATH_VDBG("Conn op=%d, phase=%d, epid=%llx, err=%d\n", + op, phase, (unsigned long long) epid, err); + switch (op) { + case PSMI_AM_CONN_REQ: + _IPATH_VDBG("Connect from %d:%d\n", + (int) psm_epid_nid(epid), + (int) psm_epid_context(epid)); + + epaddr = psmi_epid_lookup(ptl->ep, epid); + if (epaddr == NULL) { + /* This can be nasty. 
If the segment moves as a result of + * adding a new peer, we have to fix the input pointer 'args' + * since it comes from a shared memory location */ + if ((err = amsh_epaddr_add(ptl, epid, shmidx, &epaddr))) + /* Unfortunately, no way out of here yet */ + psmi_handle_error(PSMI_EP_NORETURN, err, "Fatal error " + "in connecting to shm segment"); + psmi_assert(psmi_epid_lookup(ptl->ep, epid) != NULL); + } + + /* Do some version comparison, error checking if required. */ + /* Rewrite args */ + ptl->connect_from++; + args[0].u32w0 = PSMI_AM_CONN_REP; + args[1].u64w0 = (psm_epid_t) ptl->epid; + args[2].u32w1 = PSM_OK; + AMSH_CSTATE_FROM_SET(epaddr, ESTABLISHED); + tok->tok.epaddr_from = epaddr; /* adjust token */ + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + + break; + + case PSMI_AM_CONN_REP: + if (ptl->connect_phase != phase) { + _IPATH_VDBG("Out of phase connect reply\n"); + return; + } + epaddr = ptl->ep->amsh_qdir[shmidx].amsh_epaddr; + *perr = err; + AMSH_CSTATE_TO_SET(epaddr, REPLIED); + ptl->connect_to++; + break; + + case PSMI_AM_DISC_REQ: + epaddr = tok->tok.epaddr_from; + args[0].u32w0 = PSMI_AM_DISC_REP; + args[2].u32w1 = PSM_OK; + AMSH_CSTATE_FROM_SET(epaddr, DISC_REQ); + ptl->connect_from--; + /* Before sending the reply, make sure the process + * is still connected */ + + is_valid = 1; +#ifdef PSM_HAVE_SCIF + if (shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + pthread_mutex_lock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); + if (ptl->ep->amsh_dirpage->shmidx_map_epid[shmidx] != epaddr->epid) + is_valid = 0; + pthread_mutex_unlock((pthread_mutex_t *) &(ptl->ep->amsh_dirpage->lock)); +#ifdef PSM_HAVE_SCIF + } +#endif + + if (is_valid) { + psmi_amsh_short_reply(tok, amsh_conn_handler_hidx, + args, narg, NULL, 0, 0); + } + break; + + case PSMI_AM_DISC_REP: + if (ptl->connect_phase != phase) { + _IPATH_VDBG("Out of phase disconnect reply\n"); + return; + } + *perr = err; + epaddr = tok->tok.epaddr_from; + AMSH_CSTATE_TO_SET(epaddr, DISC_REPLIED); + ptl->connect_to--; + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown/unhandled connect handler op=%d", op); + break; + } + return; +} + +static +size_t +amsh_sizeof(void) +{ + return sizeof(ptl_t); +} + +/** + * @param ep PSM Endpoint, guaranteed to have initialized epaddr and epid. 
+ * @param ptl Pointer to caller-allocated space for PTL (fill in) + * @param ctl Pointer to caller-allocated space for PTL-control + * structure (fill in) + */ +static +psm_error_t +amsh_init(psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + int shmidx; + psm_error_t err = PSM_OK; + + _IPATH_VDBG("PSM Symmetric Mode!\n"); + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + /* Setup scif listen port and query node information */ + /* This is important to get the node count for initializing queues */ +#ifdef PSM_HAVE_SCIF + if ((err = amsh_scif_init(ep))) + goto fail; +#endif + + /* If we haven't attached to the segment yet, do it now */ + if ((err = psmi_shm_attach(ep, &shmidx))) + goto fail; + + /* Modify epid with acquired info as below */ + ep->epid |= ((((uint64_t)shmidx)&0xFF)<<56); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->zero_polls = 0; + + pthread_mutex_init(&ptl->connect_lock, NULL); + ptl->connect_phase = 0; + ptl->connect_from = 0; + ptl->connect_to = 0; + + memset(&ptl->amsh_empty_shortpkt, 0, sizeof ptl->amsh_empty_shortpkt); + memset(&ptl->psmi_am_reqq_fifo, 0, sizeof ptl->psmi_am_reqq_fifo); + + if ((err = amsh_init_segment(ptl))) + goto fail; + + psmi_am_reqq_init(ptl); + memset(ctl, 0, sizeof(*ctl)); + + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = amsh_poll; + ctl->ep_connect = amsh_ep_connect; + ctl->ep_disconnect = amsh_ep_disconnect; + + ctl->mq_send = amsh_mq_send; + ctl->mq_isend = amsh_mq_isend; + + ctl->am_short_request = psmi_amsh_am_short_request; + ctl->am_short_reply = psmi_amsh_am_short_reply; + + /* No stats in shm (for now...) */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; + +#ifdef PSM_HAVE_SCIF + /* Start a thread to service incoming SCIF connections. 
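+       The accept thread registers our shared queue block with each
+       connecting peer and exits when scif_accept() is interrupted at
+       endpoint shutdown (see am_ctl_accept_thread).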
*/
+    if (pthread_create(&ptl->ep->scif_thread, NULL,
+                       am_ctl_accept_thread, (void*)ptl)) {
+        err = psmi_handle_error(NULL, PSM_EP_NO_RESOURCES,
+                "amsh_init_segment(): pthread_create() failed: %d %s",
+                errno, strerror(errno));
+        goto fail;
+    }
+#endif
+
+fail:
+    return err;
+}
+
+static
+psm_error_t
+amsh_fini(ptl_t *ptl, int force, uint64_t timeout_ns)
+{
+    struct psmi_eptab_iterator itor;
+    psm_epaddr_t epaddr;
+    psm_error_t err = PSM_OK;
+    psm_error_t err_seg;
+    uint64_t t_start = get_cycles();
+    int i = 0;
+
+    /* Close whatever has been left open -- this will be factored out for 2.1 */
+    if (ptl->connect_to > 0) {
+        int num_disc = 0;
+        int *mask;
+        psm_error_t *errs;
+        psm_epaddr_t *epaddr_array;
+
+        psmi_epid_itor_init(&itor, ptl->ep);
+        while ((epaddr = psmi_epid_itor_next(&itor))) {
+            if (epaddr->ptl != ptl)
+                continue;
+            if (AMSH_CSTATE_TO_GET(epaddr) == AMSH_CSTATE_TO_ESTABLISHED)
+                num_disc++;
+        }
+        psmi_epid_itor_fini(&itor);
+
+        mask = (int *) psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(int));
+        errs = (psm_error_t *)
+            psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm_error_t));
+        epaddr_array = (psm_epaddr_t *)
+            psmi_calloc(ptl->ep, UNDEFINED, num_disc, sizeof(psm_epaddr_t));
+
+        if (errs == NULL || epaddr_array == NULL || mask == NULL) {
+            if (epaddr_array) psmi_free(epaddr_array);
+            if (errs) psmi_free(errs);
+            if (mask) psmi_free(mask);
+            err = PSM_NO_MEMORY;
+            goto fail;
+        }
+        psmi_epid_itor_init(&itor, ptl->ep);
+        while ((epaddr = psmi_epid_itor_next(&itor))) {
+            if (epaddr->ptl == ptl) {
+                if (AMSH_CSTATE_TO_GET(epaddr) == AMSH_CSTATE_TO_ESTABLISHED) {
+                    mask[i] = 1;
+                    epaddr_array[i] = epaddr;
+                    i++;
+                }
+            }
+        }
+        psmi_epid_itor_fini(&itor);
+        psmi_assert(i == num_disc && num_disc > 0);
+        err = amsh_ep_disconnect(ptl, force, num_disc, epaddr_array,
+                                 mask, errs, timeout_ns);
+        psmi_free(mask);
+        psmi_free(errs);
+        psmi_free(epaddr_array);
+    }
+
+    /* XXX: at this point a disconnect request may still be outstanding from
+     * peers that are polling and waiting for a response; it is unclear
+     * whether a request arriving between the start of the disconnect and
+     * here can be lost. The loop below polls until both connection counts
+     * drain or the timeout expires. */
+
+    if (ptl->connect_from > 0 || ptl->connect_to > 0) {
+        while (ptl->connect_from > 0 || ptl->connect_to > 0) {
+            if (!psmi_cycles_left(t_start, timeout_ns)) {
+                err = PSM_TIMEOUT;
+                _IPATH_VDBG("CCC timed out with from=%d,to=%d\n",
+                            ptl->connect_from,
+                            ptl->connect_to);
+                break;
+            }
+            psmi_poll_internal(ptl->ep, 1);
+        }
+    }
+    else {
+        _IPATH_VDBG("CCC complete disconnect from=%d,to=%d\n",
+                    ptl->connect_from,
+                    ptl->connect_to);
+    }
+
+    if ((err_seg = psmi_shm_detach(ptl->ep))) {
+        err = err_seg;
+        goto fail;
+    }
+
+    /* This prevents poll calls between now and the point where the endpoint
+     * is deallocated from referencing memory that has disappeared */
+#ifdef PSM_HAVE_SCIF
+    for(i = 0; i < ptl->ep->scif_nnodes; i++) {
+        ptl->repH[i].head = &ptl->amsh_empty_shortpkt;
+        ptl->reqH[i].head = &ptl->amsh_empty_shortpkt;
+    }
+#else
+    ptl->repH[0].head = &ptl->amsh_empty_shortpkt;
+    ptl->reqH[0].head = &ptl->amsh_empty_shortpkt;
+#endif
+
+    return PSM_OK;
+fail:
+    return err;
+
+}
+
+static
+psm_error_t
+amsh_setopt(const void *component_obj, int optname,
+            const void *optval, uint64_t optlen)
+{
+    /* No options for AM PTL at the moment */
+    return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown AM ptl option %d.", optname);
+}
+
+static
+psm_error_t
+amsh_getopt(const void *component_obj, int optname,
+            void *optval, uint64_t *optlen)
+{
+    /* No options for AM PTL at the moment */
+    return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown AM ptl option %d.", optname);
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_amsh = {
+    amsh_sizeof, amsh_init, amsh_fini, amsh_setopt, amsh_getopt
+};
+
+#ifdef PSM_HAVE_SCIF
+/* Wait for incoming connections on the SCIF listen socket.
+   When a connection arrives, store the SCIF socket in the correct place and
+   respond so that the remote process can map our shared queue area.
+ */
+static void* am_ctl_accept_thread(void* arg)
+{
+    ptl_t* ptl = (ptl_t*)arg;
+    psm_ep_t ep = ptl->ep;
+    struct scif_portID peer;
+    scif_epd_t epd;
+    void* addr;
+    int peeridx;
+    int shmidx;
+    int nodeid;
+
+    /* Receive this struct to ID the peer (offset unused). */
+    /* Send this struct to share memory mapping information. */
+    struct { off_t offset; int verno; psm_epid_t epid; } inbuf, outbuf;
+
+    while(1) {
+        /* Block on accepting a new connection on the SCIF listen socket. */
+        if(scif_accept(ep->scif_epd, &peer, &epd, SCIF_ACCEPT_SYNC)) {
+            if(errno == EINTR) {
+                /* Time to quit! */
+                _IPATH_VDBG("SCIF accept thread quitting\n");
+                pthread_exit(NULL);
+                return NULL;
+            }
+
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "scif_accept failed: %d %s\n", errno, strerror(errno));
+            continue;
+        }
+
+        /* Register the shared memory area this peer should access. */
+        /* SCIF_MAP_FIXED is used to ensure that offset == addr, so that the
+           returned offset does not need to be tracked as well.
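+           Peers can then address our queues directly with the virtual
+           address we publish, with no separate offset translation step.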
*/ + addr = ep->amsh_qdir[ep->amsh_shmidx].amsh_base; + outbuf.offset = scif_register(epd, addr, + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES, + (off_t)addr, SCIF_PROT_READ|SCIF_PROT_WRITE, SCIF_MAP_FIXED); + + _IPATH_PRDBG("registered addr %p at offset %p length %ld\n", + addr, (void*)outbuf.offset, + am_ctl_sizeof_block() * PTL_AMSH_MAX_LOCAL_NODES); + if(outbuf.offset == SCIF_REGISTER_FAILED) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_register failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + continue; + } + + outbuf.verno = PSMI_VERNO; + outbuf.epid = ep->epid; + + if (amsh_scif_send(epd, &outbuf, sizeof(outbuf))) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_send epd %d failed: %d %s\n", + epd, errno, strerror(errno)); + scif_close(epd); + continue; + } + + /* Receive peer identification information */ + if(amsh_scif_recv(epd, &inbuf, sizeof(inbuf))) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "scif_recv failed: %d %s\n", errno, strerror(errno)); + scif_close(epd); + continue; + } + + /* Extract information from the peer's epid. */ + nodeid = (int)((inbuf.epid>>48)&0xff); + shmidx = (int)((inbuf.epid>>56)&0xff); + + /* Port isn't supposed to match -- we have the peer's listen port, + which won't be the same as the connect socket's port. */ + if(peer.node != nodeid) { + psmi_handle_error(NULL, PSM_EP_NO_RESOURCES, + "SCIF node:port %d:%d does not match encoded epid nodeid %d", + peer.node, peer.port, nodeid); + scif_close(epd); + continue; + } + + /* Now that the peer's identity is known, store the new connection. */ + /* 0 1 mynodeid 3 4 */ + /* mynodeid 0 1 3 4 */ + if(nodeid > ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * nodeid) + shmidx; + } else if(nodeid < ep->scif_mynodeid) { + peeridx = (PTL_AMSH_MAX_LOCAL_PROCS * (nodeid + 1)) + shmidx; + } else { + peeridx = shmidx; + } + + ptl->ep->amsh_qdir[peeridx].amsh_epid = inbuf.epid; + ptl->ep->amsh_qdir[peeridx].amsh_verno = inbuf.verno; + + /* There are eventually two connections. epd[0] always has the remote + memory mapped region associated with it, and is used to make requests + to that peer. epd[1] exposes our local shared memory, and is used + to respond to remote requests. */ + ptl->ep->amsh_qdir[peeridx].amsh_epd[1] = epd; + + _IPATH_VDBG( + "shmidx %d accepted %d:%d peeridx %d epd %d shmidx %d\n", + ep->amsh_shmidx, peer.node, peer.port, peeridx, + ep->amsh_qdir[peeridx].amsh_epd[1], + ep->amsh_qdir[peeridx].amsh_shmidx); + } + + return NULL; +} +#endif //PSM_HAVE_SCIF + diff --git a/ptl_am/kcopyrw.h b/ptl_am/kcopyrw.h new file mode 100644 index 0000000..c50127c --- /dev/null +++ b/ptl_am/kcopyrw.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+
+/*
+ * read from remote process pid
+ */
+int64_t kcopy_get(int fd, pid_t pid, const void *src, void *dst, int64_t n);
+
+/*
+ * write to remote process pid
+ */
+int64_t kcopy_put(int fd, const void *src, pid_t pid, void *dst, int64_t n);
+
+/*
+ * return the ABI version or -1 on error
+ */
+int kcopy_abi(int fd);
diff --git a/ptl_am/kcopyrwu.c b/ptl_am/kcopyrwu.c
new file mode 100644
index 0000000..839846f
--- /dev/null
+++ b/ptl_am/kcopyrwu.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+
+#include "kcopyrw.h"
+
+#define KCOPY_GET_SYSCALL 1
+#define KCOPY_PUT_SYSCALL 2
+#define KCOPY_ABI_SYSCALL 3
+
+struct kcopy_syscall {
+    uint32_t tag;
+    pid_t pid;
+    uint64_t n;
+    uint64_t src;
+    uint64_t dst;
+};
+
+int64_t kcopy_get(int fd, pid_t pid, const void *src, void *dst, int64_t n) {
+    struct kcopy_syscall e = {
+        .tag = KCOPY_GET_SYSCALL,
+        .pid = pid,
+        .n = n,
+        .src = (uint64_t) (uintptr_t) src,
+        .dst = (uint64_t) (uintptr_t) dst
+    };
+    int64_t ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = n;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
+
+int64_t kcopy_put(int fd, const void *src, pid_t pid, void *dst, int64_t n) {
+    struct kcopy_syscall e = {
+        .tag = KCOPY_PUT_SYSCALL,
+        .pid = pid,
+        .n = n,
+        .src = (uint64_t) (uintptr_t) src,
+        .dst = (uint64_t) (uintptr_t) dst
+    };
+    int64_t ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = n;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
+
+int kcopy_abi(int fd) {
+    int32_t abi;
+    struct kcopy_syscall e = {
+        .tag = KCOPY_ABI_SYSCALL,
+        .dst = (uint64_t) (uintptr_t) &abi
+    };
+    int ret;
+
+    ret = write(fd, &e, sizeof(e));
+    if (ret == sizeof(e))
+        ret = abi;
+    else if (ret > 0 && ret != sizeof(e))
+        ret = 0;
+
+    return ret;
+}
diff --git a/ptl_am/knemrw.h b/ptl_am/knemrw.h
new file mode 100644
index 0000000..4e22e0f
--- /dev/null
+++ b/ptl_am/knemrw.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(PSM_USE_KNEM)
+#include "knem_io.h"
+#endif
+
+/*
+ * Open handle to knem device.
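+ * Returns the process-wide file descriptor, or -1 when the device cannot
+ * be opened (or when KNEM support is not compiled in).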
+ */ +int knem_open_device(); + +/* + * read from remote process given a cookie + */ +int64_t knem_get(int fd, int64_t cookie, const void *src, int64_t n); + +/* + * write to remote process pid given a cookie + */ +int64_t knem_put(int fd, const void *src, int64_t n, int64_t cookie); + +/* + * register a memory region for put/get + */ +int64_t knem_register_region(void *buffer, size_t len, int write); diff --git a/ptl_am/knemrwu.c b/ptl_am/knemrwu.c new file mode 100644 index 0000000..358f555 --- /dev/null +++ b/ptl_am/knemrwu.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2010. QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <stdint.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "knemrw.h"
+
+int knem_open_device()
+{
+    /* Process wide knem handle */
+    static int fd = -1;
+
+#if defined(PSM_USE_KNEM)
+    if (fd >= 0)
+        return fd;
+
+    fd = open(KNEM_DEVICE_FILENAME, O_RDWR);
+#endif
+    return fd;
+}
+
+int64_t knem_get(int fd, int64_t cookie, const void *src, int64_t n)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_inline_copy c;
+    struct knem_cmd_param_iovec iov;
+    int err;
+
+    iov.base = (uint64_t) (uintptr_t) src;
+    iov.len = n;
+
+    c.local_iovec_array = (uintptr_t) &iov;
+    c.local_iovec_nr = 1;
+    c.remote_cookie = cookie;
+    c.remote_offset = 0;
+    c.write = 0; /* Do a Read/Get from remote memory region */
+    c.flags = 0;
+    err = ioctl(fd, KNEM_CMD_INLINE_COPY, &c);
+
+    if (c.current_status != KNEM_STATUS_SUCCESS) {
+        _IPATH_INFO("KNEM: Get request of size 0x%"PRIx64" failed with error %d.\n",
+                    n, c.current_status);
+        err = c.current_status;
+    }
+
+    return err;
+#else
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (get), support for which has "
+        "not been compiled in.");
+
+    return PSM_INTERNAL_ERR;
+#endif
+}
+
+int64_t knem_put(int fd, const void *src, int64_t n, int64_t cookie)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_inline_copy c;
+    struct knem_cmd_param_iovec iov;
+    int err;
+
+    iov.base = (uint64_t) (uintptr_t) src;
+    iov.len = n;
+
+    c.local_iovec_array = (uintptr_t) &iov;
+    c.local_iovec_nr = 1;
+    c.remote_cookie = cookie;
+    c.remote_offset = 0;
+    c.write = 1; /* Do a Write/Put to remote memory region */
+    c.flags = 0;
+    err = ioctl(fd, KNEM_CMD_INLINE_COPY, &c);
+
+    if (c.current_status != KNEM_STATUS_SUCCESS) {
+        _IPATH_INFO("KNEM: Put request of size 0x%"PRIx64" failed with error %d.\n",
+                    n, c.current_status);
+        err = c.current_status;
+    }
+
+    return err;
+#else
+
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (put), support for which has "
+        "not been compiled in.");
+
+    return PSM_INTERNAL_ERR;
+#endif
+
+}
+
+int64_t knem_register_region(void *buffer, size_t len, int write)
+{
+
+#if defined(PSM_USE_KNEM)
+    struct knem_cmd_create_region create;
+    struct knem_cmd_param_iovec iov;
+
+    iov.base = (uint64_t) (uintptr_t) buffer;
+    iov.len = len;
+    create.iovec_array = (uintptr_t) &iov;
+    create.iovec_nr = 1;
+    create.flags = KNEM_FLAG_SINGLEUSE; /* Automatically destroy after put */
+    create.protection = write ? PROT_WRITE : PROT_READ;
+
+    /* TODO: handle failure in memory registration; the ioctl return value
+     * is currently unchecked. */
+    ioctl(psmi_kassist_fd, KNEM_CMD_CREATE_REGION, &create);
+    return create.cookie; /* Cookie for registered memory region */
+#else
+
+    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+        "Attempt to use KNEM kassist (reg), support for which has "
+        "not been compiled in.");
+    return 0;
+#endif
+
+}
diff --git a/ptl_am/psm_am_internal.h b/ptl_am/psm_am_internal.h
new file mode 100644
index 0000000..34c1342
--- /dev/null
+++ b/ptl_am/psm_am_internal.h
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef PSMI_AM_H +#define PSMI_AM_H + +#include "../psm_am_internal.h" + +#define NSHORT_ARGS 6 +typedef +struct amsh_am_token +{ + struct psmi_am_token tok; + + + ptl_t *ptl; /**> What PTL was it received on */ + psm_mq_t mq; /**> What matched queue is this for ? */ + int shmidx; /**> what shmidx sent this */ + int loopback; /**> Whether to reply as loopback */ +} +amsh_am_token_t; + +typedef void (*psmi_handler_fn_t)(void *token, psm_amarg_t *args, int nargs, void *src, size_t len); + +typedef struct psmi_handlertab { + psmi_handler_fn_t fn; +} psmi_handlertab_t; + +/* + * Can change the rendezvous threshold based on usage of kcopy (or not) + */ +#define PSMI_MQ_RV_THRESH_KCOPY 16000 + +/* + * Can change the rendezvous threshold based on usage of knem (or not) + */ +#define PSMI_MQ_RV_THRESH_KNEM 16000 + +/* If no kernel assisted copy is available this is the rendezvous threshold */ +#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 + +/* Threshold for using SCIF DMA to do data transfers */ +#define PSMI_MQ_RV_THRESH_SCIF_DMA (150000) + +#define PSMI_AM_CONN_REQ 1 +#define PSMI_AM_CONN_REP 2 +#define PSMI_AM_DISC_REQ 3 +#define PSMI_AM_DISC_REP 4 + +#define PSMI_KASSIST_OFF 0x0 +#define PSMI_KASSIST_KCOPY_GET 0x1 +#define PSMI_KASSIST_KCOPY_PUT 0x2 +#define PSMI_KASSIST_KNEM_GET 0x4 +#define PSMI_KASSIST_KNEM_PUT 0x8 + +#define PSMI_KASSIST_KCOPY 0x3 +#define PSMI_KASSIST_KNEM 0xC +#define PSMI_KASSIST_GET 0x15 +#define PSMI_KASSIST_PUT 0x2A +#define PSMI_KASSIST_MASK 0x3F + +#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_KNEM_PUT +#define PSMI_KASSIST_MODE_DEFAULT_STRING "knem-put" + +int psmi_epaddr_kcopy_pid(psm_epaddr_t epaddr); + +#define PSMI_SCIF_DMA_OFF 0x0 +#define PSMI_SCIF_DMA_GET 0x1 +#define PSMI_SCIF_DMA_PUT 0x2 + +#define PSMI_SCIF_DMA_MODE_DEFAULT PSMI_SCIF_DMA_GET +#define PSMI_SCIF_DMA_MODE_DEFAULT_STRING "scif-get" + +/* + * Eventually, we will allow users to register handlers as "don't reply", which + * may save on some of the buffering requirements + */ +#define PSMI_HANDLER_NEEDS_REPLY(handler) 1 +#define PSMI_VALIDATE_REPLY(handler) assert(PSMI_HANDLER_NEEDS_REPLY(handler)) + +int psmi_amsh_poll(ptl_t *ptl, int replyonly); + +/* Shared memory AM, forward decls */ +int +psmi_amsh_short_request(ptl_t *ptl, psm_epaddr_t epaddr, + psm_handler_t 
handler, psm_amarg_t *args, int nargs,
+                        const void *src, size_t len, int flags);
+
+void
+psmi_amsh_short_reply(amsh_am_token_t *tok,
+                      psm_handler_t handler, psm_amarg_t *args, int nargs,
+                      const void *src, size_t len, int flags);
+
+int
+psmi_amsh_long_request(ptl_t *ptl, psm_epaddr_t epaddr,
+                       psm_handler_t handler, psm_amarg_t *args, int nargs,
+                       const void *src, size_t len, void *dest, int flags);
+
+void
+psmi_amsh_long_reply(amsh_am_token_t *tok,
+                     psm_handler_t handler, psm_amarg_t *args, int nargs,
+                     const void *src, size_t len, void *dest, int flags);
+
+void psmi_am_mq_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_data(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_complete(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_rtsmatch(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_mq_handler_rtsdone(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+void psmi_am_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len);
+
+/* AM over shared memory (forward decls) */
+psm_error_t
+psmi_amsh_am_short_request(psm_epaddr_t epaddr,
+                           psm_handler_t handler, psm_amarg_t *args, int nargs,
+                           void *src, size_t len, int flags,
+                           psm_am_completion_fn_t completion_fn,
+                           void *completion_ctxt);
+psm_error_t
+psmi_amsh_am_short_reply(psm_am_token_t tok,
+                         psm_handler_t handler, psm_amarg_t *args, int nargs,
+                         void *src, size_t len, int flags,
+                         psm_am_completion_fn_t completion_fn,
+                         void *completion_ctxt);
+
+#define amsh_conn_handler_hidx   1
+#define mq_handler_hidx          2
+#define mq_handler_data_hidx     3
+#define mq_handler_rtsmatch_hidx 4
+#define mq_handler_rtsdone_hidx  5
+#define am_handler_hidx          6
+
+#define AMREQUEST_SHORT 0
+#define AMREQUEST_LONG  1
+#define AMREPLY_SHORT   2
+#define AMREPLY_LONG    3
+#define AM_IS_REPLY(x)   ((x)&0x2)
+#define AM_IS_REQUEST(x) (!AM_IS_REPLY(x))
+#define AM_IS_LONG(x)    ((x)&0x1)
+#define AM_IS_SHORT(x)   (!AM_IS_LONG(x))
+
+#define AM_FLAG_SRC_ASYNC 0x1
+#define AM_FLAG_SRC_TEMP  0x2
+
+/*
+ * Request Fifo.
+ */
+typedef
+struct am_reqq {
+    struct am_reqq *next;
+    int amtype;
+
+    ptl_t *ptl;
+    psm_epaddr_t epaddr;
+    psm_handler_t handler;
+    psm_amarg_t args[8];
+    int nargs;
+    void *src;
+    uint32_t len;
+    void *dest;
+    int amflags;
+    int flags;
+}
+am_reqq_t;
+
+struct am_reqq_fifo_t {
+    am_reqq_t *first;
+    am_reqq_t **lastp;
+};
+
+psm_error_t psmi_am_reqq_drain(ptl_t *ptl);
+void psmi_am_reqq_add(int amtype, ptl_t *ptl, psm_epaddr_t epaddr,
+                      psm_handler_t handler, psm_amarg_t *args, int nargs,
+                      void *src, size_t len, void *dest, int flags);
+
+/*
+ * Shared memory Active Messages, implementation derived from
+ * Lumetta, Mainwaring, Culler. Multi-Protocol Active Messages on a Cluster of
+ * SMP's. Supercomputing 1997.
+ *
+ * We support multiple endpoints in shared memory, but we only support one
+ * shared memory context with up to AMSH_MAX_LOCAL_PROCS local endpoints. Some
+ * structures are endpoint specific (as denoted with amsh_ep_) and others are
+ * specific to the single shared memory context (amsh_ global variables).
+ *
+ * Each endpoint maintains a shared request block and a shared reply block.
+ * Each block is composed of queues for small, medium and large messages.
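+ *
+ * A sketch of the slot lifecycle implied by the flag macros below (the
+ * real send and poll paths live in am_reqrep_shmem.c; the queue and
+ * argument names here are illustrative only):
+ *
+ *   am_pkt_short_t *pkt = &shortq[tail];   -- sender claims the tail slot
+ *   QMARKUSED(pkt);                        -- flag = QUSED, then fence
+ *   pkt->args[0] = arg0;                   -- fill header and payload
+ *   QMARKREADY(pkt);                       -- flag = QREADY, then fence
+ *
+ * The receiver polls its cached short queue for QREADY, dispatches the
+ * handler through process_packet(), and QMARKFREE()s the slot for reuse.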
+ */
+
+#define QFREE      0
+#define QUSED      1
+#define QREADY     2
+#define QREADYMED  3
+#define QREADYLONG 4
+
+#define QISEMPTY(flag) (flag < QUSED)
+#ifdef __powerpc__
+#  define _QMARK_FLAG_FENCE() asm volatile("lwsync" : : : "memory")
+#elif defined(__x86_64__) || defined(__i386__)
+#  define _QMARK_FLAG_FENCE() asm volatile("" : : : "memory") /* compiler fence */
+#else
+#  error No _QMARK_FLAG_FENCE() defined for this platform
+#endif
+
+#define _QMARK_FLAG(pkt_ptr, _flag) do { \
+            (pkt_ptr)->flag = (_flag);   \
+            _QMARK_FLAG_FENCE();         \
+        } while (0)
+
+#define QMARKFREE(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QFREE)
+#define QMARKREADY(pkt_ptr) _QMARK_FLAG(pkt_ptr, QREADY)
+#define QMARKUSED(pkt_ptr)  _QMARK_FLAG(pkt_ptr, QUSED)
+
+#define AMFMT_SYSTEM       1
+#define AMFMT_SHORT_INLINE 2
+#define AMFMT_SHORT        3
+#define AMFMT_LONG         4
+#define AMFMT_LONG_END     5
+#define AMFMT_HUGE         6
+#define AMFMT_HUGE_END     7
+
+#define _shmidx _ptladdr_u32[0]
+#define _cstate _ptladdr_u32[1]
+
+#define AMSH_CMASK_NONE    0
+#define AMSH_CMASK_PREREQ  1
+#define AMSH_CMASK_POSTREQ 2
+#define AMSH_CMASK_DONE    3
+
+#define AMSH_CSTATE_TO_MASK         0x0f
+#define AMSH_CSTATE_TO_NONE         0x01
+#define AMSH_CSTATE_TO_REPLIED      0x02
+#define AMSH_CSTATE_TO_ESTABLISHED  0x03
+#define AMSH_CSTATE_TO_DISC_REPLIED 0x04
+#define AMSH_CSTATE_TO_GET(epaddr)  ((epaddr)->_cstate & AMSH_CSTATE_TO_MASK)
+#define AMSH_CSTATE_TO_SET(epaddr,state) \
+    (epaddr)->_cstate = (((epaddr)->_cstate & ~AMSH_CSTATE_TO_MASK) | \
+        ((AMSH_CSTATE_TO_ ## state) & AMSH_CSTATE_TO_MASK))
+
+#define AMSH_CSTATE_FROM_MASK        0xf0
+#define AMSH_CSTATE_FROM_NONE        0x10
+#define AMSH_CSTATE_FROM_DISC_REQ    0x40
+#define AMSH_CSTATE_FROM_ESTABLISHED 0x50
+#define AMSH_CSTATE_FROM_GET(epaddr) ((epaddr)->_cstate & AMSH_CSTATE_FROM_MASK)
+#define AMSH_CSTATE_FROM_SET(epaddr,state) \
+    (epaddr)->_cstate = (((epaddr)->_cstate & ~AMSH_CSTATE_FROM_MASK) | \
+        ((AMSH_CSTATE_FROM_ ## state) & AMSH_CSTATE_FROM_MASK))
+
+/**********************************
+ * Shared memory packet formats
+ **********************************/
+typedef
+struct am_pkt_short {
+    uint32_t flag;            /**> Packet state */
+    union {
+        uint32_t bulkidx;     /**> index in bulk packet queue */
+        uint32_t length;      /**> length when no bulkidx used */
+    };
+    uint16_t shmidx;          /**> index in shared segment */
+    uint16_t type;
+    uint16_t nargs;
+    uint16_t handleridx;
+
+    psm_amarg_t args[NSHORT_ARGS]; /* AM arguments */
+
+    /* We eventually will expose up to 8 arguments, but this isn't implemented
+     * For now. >6 args will probably require a medium instead of a short */
+}
+am_pkt_short_t PSMI_CACHEALIGN;
+PSMI_STRICT_SIZE_DECL(am_pkt_short_t,64);
+
+typedef struct am_pkt_bulk {
+    uint32_t flag;
+    uint32_t idx;
+    uintptr_t dest;           /* Destination pointer in "longs" */
+    uint32_t dest_off;        /* Destination pointer offset */
+    uint32_t len;             /* Destination length within offset */
+    psm_amarg_t args[2];      /* Additional "spillover" for >6 args */
+    uint8_t payload[0];
+}
+am_pkt_bulk_t;
+/* No strict size decl, used for mediums and longs */
+
+/****************************************************
+ * Shared memory header and block control structures
+ ***************************************************/
+
+/* Each pkt queue has the same header format, although the queue
+ * consumers don't use the 'head' index in the same manner. */
+typedef struct am_ctl_qhdr {
+    uint32_t head;            /* Touched only by 1 consumer */
+    uint8_t _pad0[64-4];
+
+    /* tail is now located on the dirpage.
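+       Each local sender advances a shared tail through the matching
+       amsh_qtail_info_t (tail index plus spinlock), so multiple processes
+       can append to the same peer queue safely.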
*/
+    uint32_t elem_cnt;
+    uint32_t elem_sz;
+    uint8_t _pad1[64-2*sizeof(uint32_t)];
+}
+am_ctl_qhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_qhdr_t,128);
+
+/* Each block reserves some space at the beginning to store auxiliary data */
+#define AMSH_BLOCK_HEADER_SIZE 4096
+
+/* Each process has a reply qhdr and a request qhdr */
+typedef struct am_ctl_blockhdr {
+    volatile am_ctl_qhdr_t shortq;
+    volatile am_ctl_qhdr_t medbulkq;
+    volatile am_ctl_qhdr_t longbulkq;
+    volatile am_ctl_qhdr_t hugebulkq;
+}
+am_ctl_blockhdr_t;
+PSMI_STRICT_SIZE_DECL(am_ctl_blockhdr_t,128*4);
+
+/* We cache the "shorts" because that's what we poll on in the critical path.
+ * We take care to always update these pointers whenever the segment is
+ * remapped.
+ */
+typedef struct am_ctl_qshort_cache {
+    volatile am_pkt_short_t *base;
+    volatile am_pkt_short_t *head;
+    volatile am_pkt_short_t *end;
+}
+am_ctl_qshort_cache_t;
+
+struct amsh_qptrs {
+    am_ctl_blockhdr_t *qreqH;
+    am_pkt_short_t *qreqFifoShort;
+    am_pkt_bulk_t *qreqFifoMed;
+    am_pkt_bulk_t *qreqFifoLong;
+    am_pkt_bulk_t *qreqFifoHuge;
+
+    am_ctl_blockhdr_t *qrepH;
+    am_pkt_short_t *qrepFifoShort;
+    am_pkt_bulk_t *qrepFifoMed;
+    am_pkt_bulk_t *qrepFifoLong;
+    am_pkt_bulk_t *qrepFifoHuge;
+};
+
+/******************************************
+ * Shared segment local directory (global)
+ ******************************************
+ *
+ * Each process keeps a directory for where request and reply structures are
+ * located at its peers.
+ */
+struct amsh_qdirectory {
+    /* These pointers are convenience aliases for the local node queues
+       also found in the qptrs array. */
+    am_ctl_blockhdr_t *qreqH;
+    am_pkt_short_t *qreqFifoShort;
+    am_pkt_bulk_t *qreqFifoMed;
+    am_pkt_bulk_t *qreqFifoLong;
+    am_pkt_bulk_t *qreqFifoHuge;
+
+    am_ctl_blockhdr_t *qrepH;
+    am_pkt_short_t *qrepFifoShort;
+    am_pkt_bulk_t *qrepFifoMed;
+    am_pkt_bulk_t *qrepFifoLong;
+    am_pkt_bulk_t *qrepFifoHuge;
+
+    struct amsh_qptrs qptrs[PTL_AMSH_MAX_LOCAL_NODES];
+
+    int kassist_pid;
+
+/*
+ * Peer's view of my index: for the initial node it is the same as
+ * ep->amsh_shmidx; for other remote nodes it is calculated by a circular
+ * offset of PTL_AMSH_MAX_LOCAL_PROCS, node-ID, and ep->amsh_shmidx.
+ */
+    int amsh_shmidx;
+    psm_epid_t amsh_epid;
+    uint16_t amsh_verno;
+#ifdef PSM_HAVE_SCIF
+    scif_epd_t amsh_epd[2];
+#endif
+    off_t amsh_offset;
+    void *amsh_base;
+    psm_epaddr_t amsh_epaddr;
+} __attribute__ ((aligned(8)));
+
+typedef struct amsh_qtail_info
+{
+    volatile uint32_t tail;
+    volatile pthread_spinlock_t lock;
+    uint8_t _pad0[64-1*4-sizeof(pthread_spinlock_t)];
+} amsh_qtail_info_t;
+PSMI_STRICT_SIZE_DECL(amsh_qtail_info_t,64);
+
+struct amsh_qtail
+{
+    amsh_qtail_info_t reqFifoShort;
+    amsh_qtail_info_t reqFifoMed;
+    amsh_qtail_info_t reqFifoLong;
+    amsh_qtail_info_t reqFifoHuge;
+
+    amsh_qtail_info_t repFifoShort;
+    amsh_qtail_info_t repFifoMed;
+    amsh_qtail_info_t repFifoLong;
+    amsh_qtail_info_t repFifoHuge;
+} __attribute__ ((aligned(64)));
+
+/* The first shared memory page is a control page to support each endpoint
+ * independently adding themselves to the shared memory segment.
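+ * Attaching processes serialize on 'lock' below: each one records its epid
+ * in shmidx_map_epid[] and bumps num_attached (a sketch of the attach
+ * protocol implemented by psmi_shm_attach()).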
*/ +struct am_ctl_dirpage { + pthread_mutex_t lock; + char _pad0[64-sizeof(pthread_mutex_t)]; + volatile int is_init; + char _pad1[64-sizeof(int)]; + + uint16_t psm_verno[PTL_AMSH_MAX_LOCAL_PROCS]; + uint32_t amsh_features[PTL_AMSH_MAX_LOCAL_PROCS]; + int num_attached; /* 0..MAX_LOCAL_PROCS-1 */ + int max_idx; + + psm_epid_t shmidx_map_epid[PTL_AMSH_MAX_LOCAL_PROCS]; + int kcopy_minor; + int kassist_pids[PTL_AMSH_MAX_LOCAL_PROCS]; + + /* A set of tail queue data for each remote domain. Each domain has + a reserved set of queues for each other domain. The queues are located + in shared memory on the target domain, while the tail pointer is + located on the source domain. */ + /* The tail pointers are located in the dirpage because each peer in this + domain will be sharing them (atomically). The dirpage is mapped by + all processes already, so just use it. */ + struct amsh_qtail qtails[PTL_AMSH_MAX_LOCAL_PROCS*PTL_AMSH_MAX_LOCAL_NODES]; +}; + +#define AMSH_HAVE_KCOPY 0x01 +#define AMSH_HAVE_KNEM 0x02 +#define AMSH_HAVE_SCIF 0x04 +#define AMSH_HAVE_KASSIST 0x7 + +/****************************************** + * Shared fifo element counts and sizes + ****************************************** + * These values are context-wide, they can only be set early on and can't be * + * modified at runtime. All endpoints are expected to use the same values. + */ +typedef +struct amsh_qinfo { + int qreqFifoShort; + int qreqFifoMed; + int qreqFifoLong; + int qreqFifoHuge; + + int qrepFifoShort; + int qrepFifoMed; + int qrepFifoLong; + int qrepFifoHuge; +} +amsh_qinfo_t; + +/****************************************** + * Per-endpoint structures (ep-local) + ****************************************** + * Each endpoint keeps its own information as to where it resides in the + * directory, and maintains its own cached copies of where the short header + * resides in shared memory. + * + * NOTE: All changes must be reflected in PSMI_AMSH_EP_SIZE + */ +struct ptl { + psm_ep_t ep; + psm_epid_t epid; + psm_epaddr_t epaddr; + ptl_ctl_t *ctl; + int shmidx; + am_ctl_qshort_cache_t reqH[PTL_AMSH_MAX_LOCAL_NODES]; + am_ctl_qshort_cache_t repH[PTL_AMSH_MAX_LOCAL_NODES]; + int zero_polls; + int amsh_only_polls; + + pthread_mutex_t connect_lock; + int connect_phase; + int connect_to; + int connect_from; + +/* List of context-specific shared variables */ + amsh_qinfo_t amsh_qsizes; + am_pkt_short_t amsh_empty_shortpkt; + struct am_reqq_fifo_t psmi_am_reqq_fifo; + +}; + +#endif diff --git a/ptl_am/ptl.c b/ptl_am/ptl.c new file mode 100644 index 0000000..8638652 --- /dev/null +++ b/ptl_am/ptl.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" +#include "psm_am_internal.h" +#include "kcopyrw.h" +#include "knemrw.h" +#include "scifrw.h" + +static +psm_error_t +ptl_handle_rtsmatch_request(psm_mq_req_t req, int was_posted, amsh_am_token_t *tok) +{ + psm_amarg_t args[5] = {}; + psm_epaddr_t epaddr = req->rts_peer; + ptl_t *ptl = epaddr->ptl; + int pid = 0; + int used_get = 0; + + psmi_assert((tok != NULL && was_posted) || (tok == NULL && !was_posted)); + + _IPATH_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", + req, req->buf, req->recv_msglen, tok); + + args[0].u64w0 = (uint64_t)(uintptr_t) req->ptl_req_ptr; + args[1].u64w0 = (uint64_t)(uintptr_t) req; + args[2].u64w0 = (uint64_t)(uintptr_t) req->buf; + args[3].u32w0 = req->recv_msglen; + args[3].u32w1 = tok != NULL ? 1 : 0; + args[4].u64w0 = 0; + + /* First check: is the peer local? */ +#ifdef PSM_HAVE_SCIF + int shmidx = epaddr->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Use kassist if enabled */ + if ((ptl->ep->psmi_kassist_mode & PSMI_KASSIST_GET) && + req->recv_msglen > 0 && + (pid = psmi_epaddr_kcopy_pid(epaddr))) + { + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY) { + /* kcopy can be done in handler context or not. */ + size_t nbytes = kcopy_get(ptl->ep->psmi_kassist_fd, pid, + (void *) req->rts_sbuf, req->buf, req->recv_msglen); + psmi_assert_always(nbytes == req->recv_msglen); + } else { + psmi_assert_always(ptl->ep->psmi_kassist_mode & + PSMI_KASSIST_KNEM); + + /* knem copy can be done in handler context or not */ + knem_get(ptl->ep->psmi_kassist_fd, (int64_t) req->rts_sbuf, + (void*) req->buf, req->recv_msglen); + } + + used_get = 1; + } + + /* If KNEM PUT is active register region for peer to PUT data to */ + if (ptl->ep->psmi_kassist_mode == PSMI_KASSIST_KNEM_PUT) + args[4].u64w0 = knem_register_region(req->buf, req->recv_msglen, + PSMI_TRUE); + +#ifdef PSM_HAVE_SCIF + } else if(ptl->ep->scif_dma_threshold <= req->recv_msglen) { + /* Remote node and threshold is met, consider using SCIF DMA */ + + if(epaddr->ep->scif_dma_mode == PSMI_SCIF_DMA_GET) { + /* Read via SCIF DMA */ + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[1]; + + if(scif_vreadfrom(epd, req->buf, req->recv_msglen, + req->rts_sbuf, SCIF_RMA_USECACHE|SCIF_RMA_SYNC)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ptl_handle_rtsmatch_request(): scif_vreadfrom failed: (%d) %s", + errno, strerror(errno)); + } + + /* Give the remote offset back to the sender. 
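+               The sender unregisters that window in
+               psmi_am_mq_handler_rtsmatch() once it is notified that the
+               read has completed.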
*/ + args[4].u64w0 = req->rts_sbuf; + used_get = 1; + } + else if(epaddr->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT) { + /* Peer issues DMA commands on amsh_epd[0] */ + scif_epd_t epd = epaddr->ep->amsh_qdir[shmidx].amsh_epd[1]; + + off_t reg; + if(scif_register_region(epd, + req->buf, req->recv_msglen, ®) != PSM_OK) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ptl_handle_rtsmatch_request(): SCIF memory registration failed"); + } + + /* Stuff the SCIF registration offset into the buffer pointer. + This is needed later in psmi_am_mq_handler_rtsdone to unregister + the buffer. The registration is also passed across for the + sender side to issue a DMA write.*/ + req->buf = (void*)reg; + args[4].u64w0 = reg; + } + } +#endif + + if (tok != NULL) { + psmi_am_reqq_add(AMREQUEST_SHORT, tok->ptl, tok->tok.epaddr_from, + mq_handler_rtsmatch_hidx, args, 5, NULL, 0, NULL, 0); + } + else + psmi_amsh_short_request(ptl, epaddr, mq_handler_rtsmatch_hidx, + args, 5, NULL, 0, 0); + + /* 0-byte completion or we used kcopy */ + if (used_get == 1 || req->recv_msglen == 0) + psmi_mq_handle_rts_complete(req); + return PSM_OK; +} + +static +psm_error_t +ptl_handle_rtsmatch(psm_mq_req_t req, int was_posted) +{ + /* was_posted == 0 allows us to assume that we're not running this callback + * within am handler context (i.e. we can poll) */ + psmi_assert(was_posted == 0); + return ptl_handle_rtsmatch_request(req, 0, NULL); +} + +void +psmi_am_mq_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t *ptl = tok->ptl; + psm_mq_req_t req; + int rc; + int mode = args[0].u32w0; + uint64_t tag = args[1].u64; + uint32_t msglen = mode <= MQ_MSG_SHORT ? len : args[0].u32w1; + + _IPATH_VDBG("mq=%p mode=%d, len=%d, msglen=%d\n", + tok->mq, mode, (int) len, msglen); + + switch(mode) { + case MQ_MSG_TINY: + rc = psmi_mq_handle_tiny_envelope(tok->mq, tok->tok.epaddr_from, tag, + buf, (uint32_t) len); + return; + break; + case MQ_MSG_SHORT: + case MQ_MSG_LONG: + rc = psmi_mq_handle_envelope(tok->mq, mode, tok->tok.epaddr_from, + tag, (union psmi_egrid) 0U, + msglen, buf, (uint32_t) len); + return; + break; + default: { + void *sreq = (void *)(uintptr_t) args[2].u64w0; + uintptr_t sbuf = (uintptr_t) args[3].u64w0; + psmi_assert(narg == 5); + psmi_assert_always(mode == MQ_MSG_RTS); + rc = psmi_mq_handle_rts(tok->mq, tag, sbuf, msglen, + tok->tok.epaddr_from, + ptl_handle_rtsmatch, &req); + req->ptl_req_ptr = sreq; + + /* Overload rts_sbuf to contain the cookie for remote region */ + if(ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KNEM) + req->rts_sbuf = (uintptr_t) args[4].u64w0; +#ifdef PSM_HAVE_SCIF + else if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= msglen && + tok->tok.epaddr_from->_shmidx >= PTL_AMSH_MAX_LOCAL_PROCS) { + req->rts_sbuf = (uintptr_t) args[4].u64w0; + } +#endif + + if (rc == MQ_RET_MATCH_OK) /* handler context: issue a reply */ + ptl_handle_rtsmatch_request(req, 1, tok); + /* else will be called later */ + } + } + return; +} + +void +psmi_am_mq_handler_data(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + psm_mq_req_t req = STAILQ_FIRST(&tok->tok.epaddr_from->egrlong); + psmi_mq_handle_data(req, tok->tok.epaddr_from, 0, args[2].u32w0, buf, len); + + return; +} + +void +psmi_am_mq_handler_rtsmatch(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t 
*ptl = tok->ptl; + psm_mq_req_t sreq = (psm_mq_req_t) (uintptr_t) args[0].u64w0; + void *dest = (void *)(uintptr_t) args[2].u64w0; + uint32_t msglen = args[3].u32w0; + int pid = 0; + psm_amarg_t rarg[1] = {}; + + _IPATH_VDBG("[rndv][send] req=%p dest_req=%p src=%p dest=%p len=%d\n", + sreq, (void*)(uintptr_t)args[1].u64w0, sreq->buf, dest, msglen); + + if (msglen > 0) { + rarg[0].u64w0 = args[1].u64w0; /* rreq */ + +#ifdef PSM_HAVE_SCIF + int shmidx = tok->tok.epaddr_from->_shmidx; + if(shmidx < PTL_AMSH_MAX_LOCAL_PROCS) { +#endif + /* Try Intra-node kassist */ + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_MASK) + pid = psmi_epaddr_kcopy_pid(tok->tok.epaddr_from); + else + pid = 0; + + if (!pid) + psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + sreq->buf, msglen, dest, 0); + else if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_PUT) + { + if (ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KCOPY) { + size_t nbytes = kcopy_put(ptl->ep->psmi_kassist_fd, sreq->buf, + pid, dest, msglen); + psmi_assert_always(nbytes == msglen); + } else { + int64_t cookie = args[4].u64w0; + + psmi_assert_always( + ptl->ep->psmi_kassist_mode & PSMI_KASSIST_KNEM); + + /* Do a PUT using KNEM */ + knem_put(ptl->ep->psmi_kassist_fd, + sreq->buf, msglen, cookie); + } + + /* Send response that PUT is complete */ + psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + NULL, 0, 0); + } +#ifdef PSM_HAVE_SCIF + } else { + /* Try SCIF DMA */ + scif_epd_t epd = + tok->tok.epaddr_from->ep->amsh_qdir[shmidx].amsh_epd[0]; + + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT && + ptl->ep->scif_dma_threshold <= msglen) { + off_t target_offset = args[4].u64w0; + + /* The DMA operation is NOT completed here. It is + initiated here, then the receiving side is notified. + The target issues a DMA fence to wait for the DMA + complete, then responds that it has completed handling + the transfer on that side. */ + /* The 'v' form takes care of local registration. */ + if(scif_vwriteto(epd, sreq->buf, msglen, target_offset, + SCIF_RMA_USECACHE)) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "psmi_am_mq_handler_rtsmatch(): scif_vwriteto failed: (%d) %s", errno, strerror(errno)); + } + + /* Send response that PUT is complete */ + psmi_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + NULL, 0, 0); + } else if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_GET && + ptl->ep->scif_dma_threshold <= msglen) { + /* GET mode: receiver has performed DMA read, so unregister. */ + scif_unregister_region(epd, args[4].u64w0, msglen); + } else { + /* No form of DMA is enabled -- use the memory copying path */ + psmi_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, + sreq->buf, msglen, dest, 0); + } + } +#endif + } //msglen > 0 + + psmi_mq_handle_rts_complete(sreq); +} + +void +psmi_am_mq_handler_rtsdone(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len) +{ + psm_mq_req_t rreq = (psm_mq_req_t) (uintptr_t) args[0].u64w0; + psmi_assert(narg == 1); + + _IPATH_VDBG("[rndv][recv] req=%p dest=%p len=%d\n", rreq, rreq->buf, rreq->recv_msglen); + +#ifdef PSM_HAVE_SCIF + amsh_am_token_t *tok = (amsh_am_token_t *) toki; + ptl_t *ptl = tok->ptl; + + psm_epaddr_t rmt_epaddr = rreq->rts_peer; + + if(ptl->ep->scif_dma_mode == PSMI_SCIF_DMA_PUT && + ptl->ep->scif_dma_threshold <= rreq->recv_msglen && + rmt_epaddr->_shmidx >= PTL_AMSH_MAX_LOCAL_PROCS) { + /* SCIF DMA commands are initiated on amsh_epd[0]; the receive (for put) + side registration is on amsh_epd[1]. 
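+       The scif_fence_mark()/scif_fence_wait() pair below ensures the
+       peer-initiated DMA has completed before the request is marked
+       complete.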
*/
+        scif_epd_t epd =
+            rmt_epaddr->ep->amsh_qdir[rmt_epaddr->_shmidx].amsh_epd[1];
+
+        int mark;
+        if(scif_fence_mark(epd, SCIF_FENCE_INIT_PEER, &mark)) {
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "psmi_am_mq_handler_rtsdone(): scif_fence_mark failed: (%d) %s",
+                errno, strerror(errno));
+        }
+
+        /* When registered, the rreq->buf address is replaced with the SCIF
+           registration offset so that it can be used here. */
+        scif_unregister_region(epd, (off_t)rreq->buf, rreq->recv_msglen);
+
+        if(scif_fence_wait(epd, mark)) {
+            psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
+                "psmi_am_mq_handler_rtsdone(): scif_fence_wait failed: (%d) %s",
+                errno, strerror(errno));
+        }
+    }
+#endif
+
+    psmi_mq_handle_rts_complete(rreq);
+}
+
+void
+psmi_am_handler(void *toki, psm_amarg_t *args, int narg, void *buf, size_t len)
+{
+    amsh_am_token_t *tok = (amsh_am_token_t *) toki;
+    psm_am_handler_fn_t hfn;
+
+    hfn = psm_am_get_handler_function(tok->mq->ep,
+                                      (psm_handler_t) args[0].u32w0);
+
+    /* Invoke handler function. For AM we do not support break functionality */
+    hfn(toki, tok->tok.epaddr_from, args+1, narg-1, buf, len);
+
+    return;
+}
diff --git a/ptl_am/ptl_fwd.h b/ptl_am/ptl_fwd.h
new file mode 100644
index 0000000..3be8f5b
--- /dev/null
+++ b/ptl_am/ptl_fwd.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _PTL_FWD_AMSH_H
+#define _PTL_FWD_AMSH_H
+
+#define PTL_AMSH_MAX_LOCAL_PROCS 256
+
+/* SCIF manual says it is optimized for up to 8 nodes; use that as the
+   local node limit for now. */
+#ifdef PSM_HAVE_SCIF
+#define PTL_AMSH_MAX_LOCAL_NODES 8
+#else
+/* Compiling without SCIF: assume one node */
+#define PTL_AMSH_MAX_LOCAL_NODES 1
+#endif
+
+/* Symbol in am ptl */
+struct ptl_ctl_init psmi_ptl_amsh;
+
+/* Special non-ptl function exposed to pre-attach to shm segment */
+psm_error_t psmi_shm_attach(psm_ep_t ep, int *shmidx_o);
+psm_error_t psmi_shm_detach(psm_ep_t ep);
+
+extern int psmi_shm_mq_rv_thresh;
+
+#endif
diff --git a/ptl_am/scifrw.h b/ptl_am/scifrw.h
new file mode 100644
index 0000000..eb78126
--- /dev/null
+++ b/ptl_am/scifrw.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>	/* off_t */
+#include <stddef.h>	/* size_t */
+
+#if defined(PSM_HAVE_SCIF)
+#include <scif.h>
+
+/*
+ * register a memory region for put/get
+ */
+int scif_register_region(scif_epd_t epd, void* addr, size_t len, off_t* offset);
+
+/*
+ * unregister a memory region that was previously registered
+ */
+int scif_unregister_region(scif_epd_t epd, off_t reg, size_t len);
+
+#endif
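The pair of declarations above is the entire registration API that the shared-memory PTL wraps around SCIF DMA. A minimal usage sketch follows (not part of the original import; it assumes a PSM_HAVE_SCIF build, an already-connected scif_epd_t endpoint, and relies on PSM_OK being 0):

    #include <scif.h>
    #include "scifrw.h"

    /* hypothetical helper: expose one buffer to the peer, then retract it */
    static int example_expose_buffer(scif_epd_t epd, void *buf, size_t len)
    {
        off_t reg;
        if (scif_register_region(epd, buf, len, &reg) != 0)  /* 0 == PSM_OK */
            return -1;
        /* the peer may now target offset 'reg' with scif_readfrom(),
           scif_writeto() or scif_vwriteto() DMA operations */
        return scif_unregister_region(epd, reg, len);        /* 0 on success */
    }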
diff --git a/ptl_am/scifrwu.c b/ptl_am/scifrwu.c
new file mode 100644
index 0000000..d3ccd63
--- /dev/null
+++ b/ptl_am/scifrwu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2010. QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>	/* off_t */
+#include <stdint.h>	/* uintptr_t */
+#include <errno.h>
+#include <string.h>	/* strerror */
+
+#include "psm_user.h"
+#include "psm_mq_internal.h"
+#include "psm_am_internal.h"
+#include "scifrw.h"
+
+#if defined(PSM_HAVE_SCIF)
+int scif_register_region(scif_epd_t epd, void* addr, size_t len, off_t* offset)
+{
+    /* SCIF requires registrations on page granularity. The address must be
+       rounded down to a page boundary, and the length must be rounded up. */
+    off_t addr_offset = (off_t)addr & 0xFFF;
+    uintptr_t reg_addr = (uintptr_t)addr & ~0xFFF;
+    size_t reg_len = len + addr_offset;
+
+    if(reg_len & 0xFFF) {
+        reg_len += 0x1000 - (reg_len & 0xFFF);
+    }
+
+    off_t reg = scif_register(epd, (void*)reg_addr, reg_len, 0,
+            SCIF_PROT_READ|SCIF_PROT_WRITE, 0);
+
+    if(reg == SCIF_REGISTER_FAILED) {
+        _IPATH_INFO("SCIF: Registering memory %p (%p) length %ld (%ld) epd %d failed: (%d) %s\n",
+                addr, (void*)reg_addr, len, reg_len, epd,
+                errno, strerror(errno));
+
+        *offset = SCIF_REGISTER_FAILED;
+        return PSM_INTERNAL_ERR;
+    }
+
+    /* Although the registration is rounded out to whole pages, return the
+       exact SCIF-space registration offset for the specified address. */
+    *offset = reg + addr_offset;
+    return PSM_OK;
+}
+
+int scif_unregister_region(scif_epd_t epd, off_t reg, size_t len)
+{
+    /* SCIF requires registrations on page granularity. The address must be
+       rounded down to a page boundary, and the length must be rounded up. */
+    off_t reg_addr = reg & ~0xFFF;
+    size_t reg_len = len + ((size_t)reg & 0xFFF);
+
+    if(reg_len & 0xFFF) {
+        reg_len += 0x1000 - (reg_len & 0xFFF);
+    }
+
+    if(scif_unregister(epd, reg_addr, reg_len)) {
+        _IPATH_INFO("SCIF: Unregistering offset %lx (%lx) length %ld (%ld) epd %d failed: (%d) %s\n",
+                reg, reg_addr, len, reg_len, epd,
+                errno, strerror(errno));
+        return PSM_INTERNAL_ERR;
+    }
+
+    return PSM_OK;
+}
+
+#endif /* defined(PSM_HAVE_SCIF) */
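The 0xFFF/0x1000 masks above hard-code 4 KiB pages. A standalone worked instance of the same rounding, added here for illustration (the address and length are arbitrary):

    #include <assert.h>
    #include <stdint.h>
    #include <stddef.h>

    int main(void)
    {
        uintptr_t addr = 0x10005678;                      /* unaligned start */
        size_t len = 100;
        uintptr_t reg_addr = addr & ~(uintptr_t)0xFFF;    /* 0x10005000 */
        size_t reg_len = len + (addr & 0xFFF);            /* 100 + 0x678 = 0x6DC */
        if (reg_len & 0xFFF)
            reg_len += 0x1000 - (reg_len & 0xFFF);        /* rounds up to 0x1000 */
        assert(reg_addr == 0x10005000 && reg_len == 0x1000);
        return 0;
    }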
diff --git a/ptl_ips/Makefile b/ptl_ips/Makefile
new file mode 100644
index 0000000..dc06808
--- /dev/null
+++ b/ptl_ips/Makefile
@@ -0,0 +1,55 @@
+# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved.
+# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+#
+# This software is available to you under a choice of one of two
+# licenses. You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+# Redistribution and use in source and binary forms, with or
+# without modification, are permitted provided that the following
+# conditions are met:
+#
+# - Redistributions of source code must retain the above
+# copyright notice, this list of conditions and the following
+# disclaimer.
+#
+# - Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+include $(top_srcdir)/buildflags.mak
+INCLUDES += -I$(top_srcdir) -I$(top_srcdir)/ptl_ips
+
+${TARGLIB}-objs := ptl.o ptl_rcvthread.o ips_proto.o ipserror.o ips_recvq.o \
+		   ips_recvhdrq.o ips_spio.o ips_proto_recv.o ips_proto_connect.o \
+		   ips_proto_dump.o ips_proto_mq.o ips_subcontext.o \
+		   ips_writehdrq.o ips_proto_expected.o ips_tid.o
+
+# enable mov,0 -> xor optimization for ips
+ifeq (${CCARCH},pathcc)
+  ifeq (,${PSM_DEBUG})
+    CFLAGS += -CG:use_xortozero=1
+  endif
+endif
+
+all: ${${TARGLIB}-objs}
+
+%.o: %.c
+	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
+
+clean:
+	rm -f *.o
+
diff --git a/ptl_ips/ips_crc32.c b/ptl_ips/ips_crc32.c
new file mode 100644
index 0000000..6a7b85a
--- /dev/null
+++ b/ptl_ips/ips_crc32.c
@@ -0,0 +1,91 @@
+/* The code in this file was derived from crc32.c in zlib 1.2.3, and
+   modified from its original form to suit our requirements. The zlib
+   license and crc32.c copyright and credits are preserved below. */
+
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.2.3, July 18th, 2005
+
+  Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty. In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
+  (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+*/
+
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+/* Table of CRCs of all 8-bit messages. */
+static uint32_t crc_table[256];
+
+/* Flag: has the table been computed? Initially false. */
+static int crc_table_computed = 0;
+
+/* Make the table for a fast CRC. */
+static void make_crc_table(void)
+{
+    uint32_t c;
+    int n, k;
+
+    for (n = 0; n < 256; n++) {
+        c = (uint32_t) n;
+        for (k = 0; k < 8; k++) {
+            if (c & 1)
+                c = 0xedb88320 ^ (c >> 1);
+            else
+                c = c >> 1;
+        }
+        crc_table[n] = c;
+    }
+    crc_table_computed = 1;
+}
+
+/* Update a running CRC with the bytes buf[0..len-1]--the CRC
+ * should be initialized to all 1's, and the transmitted value
+ * is the 1's complement of the final running CRC (see
+ * ips_crc_calculate() below).
+ */
+
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc)
+{
+    uint32_t c = crc;
+    uint32_t n;
+
+    if (!crc_table_computed) {
+        make_crc_table();
+    }
+    for (n = 0; n < len; n++) {
+        c = crc_table[(c ^ data[n]) & 0xff] ^ (c >> 8);
+    }
+    return c;
+}
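ips_crc_calculate() is a plain table-driven, reflected CRC-32 (polynomial 0xEDB88320) that leaves seeding and finalization to the caller. A conventional whole-buffer wrapper looks like the sketch below (illustrative only; the prototype is assumed to be visible via ips_proto.h, and PSM's packet code may use a different seed/finalize convention):

    #include <stdint.h>

    uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);

    /* conventional CRC-32 framing around the running update above */
    static uint32_t example_crc32(uint8_t *data, uint32_t len)
    {
        uint32_t crc = 0xFFFFFFFF;               /* "initialized to all 1's" */
        crc = ips_crc_calculate(len, data, crc);
        return crc ^ 0xFFFFFFFF;                 /* transmit 1's complement */
    }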
diff --git a/ptl_ips/ips_epstate.c b/ptl_ips/ips_epstate.c
new file mode 100644
index 0000000..43c81ba
--- /dev/null
+++ b/ptl_ips/ips_epstate.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_epstate.h"
+
+/* The indexes are used to map a particular endpoint to a structure at the
+ * receiver. Although we take extra care to validate the identity of endpoints
+ * when packets are received, the communication index is at an offset selected
+ * by the endpoint that allocates the index. This narrows the window in which
+ * two jobs communicating with the same set of indexes can see crosstalk.
+ */
+/* Allocate new epaddrs in chunks of 128 */
+#define PTL_EPADDR_ALLOC_CHUNK 128
+
+psm_error_t
+ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context)
+{
+    memset(eps, 0, sizeof(*eps));
+    eps->context = context;
+    eps->eps_base_idx = ((ips_epstate_idx)get_cycles()) &
+        (IPS_EPSTATE_COMMIDX_MAX-1);
+    return PSM_OK;
+}
+
+psm_error_t
+ips_epstate_fini(struct ips_epstate *eps)
+{
+    if (eps->eps_tab)
+        psmi_free(eps->eps_tab);
+    memset(eps, 0, sizeof(*eps));
+    return PSM_OK;
+}
+
+/*
+ * Add ipsaddr with epid to the epstate table, return new index to caller in
+ * 'commidx'.
+ */
+psm_error_t
+ips_epstate_add(struct ips_epstate *eps, struct ptl_epaddr *ipsaddr,
+                ips_epstate_idx *commidx_o)
+{
+    int i, j;
+    ips_epstate_idx commidx;
+    uint16_t lmc_mask = ~((1 << ipsaddr->proto->epinfo.ep_lmc) - 1);
+
+    if (++eps->eps_tabsizeused > eps->eps_tabsize) { /* realloc */
+        struct ips_epstate_entry *newtab;
+        eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
+        newtab = (struct ips_epstate_entry *)
+            psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT, eps->eps_tabsize,
+                        sizeof(struct ips_epstate_entry));
+        if (newtab == NULL)
+            return PSM_NO_MEMORY;
+        else if (eps->eps_tab) { /* NOT first alloc */
+            for (i = 0; i < eps->eps_tabsize-PTL_EPADDR_ALLOC_CHUNK; i++)
+                newtab[i] = eps->eps_tab[i]; /* copy over the old entries */
+            psmi_free(eps->eps_tab);
+        }
+        eps->eps_tab = newtab;
+    }
+    /* Find the next free hole. We can afford to do this since connect is not
+     * in the critical path */
+    for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
+        if (j == eps->eps_tabsize)
+            j = 0;
+        if (eps->eps_tab[j].epid == 0) {
+            eps->eps_tab_nextidx = j + 1;
+            if (eps->eps_tab_nextidx == eps->eps_tabsize)
+                eps->eps_tab_nextidx = 0;
+            break;
+        }
+    }
+    psmi_assert_always(i != eps->eps_tabsize);
+    commidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    _IPATH_VDBG("node %s gets commidx=%d (table idx %d)\n",
+                psmi_epaddr_get_name(ipsaddr->epaddr->epid), commidx, j);
+    eps->eps_tab[j].epid =
+        PSMI_EPID_PACK(ipsaddr->epr.epr_base_lid & lmc_mask,
+                       ipsaddr->epr.epr_context,
+                       ipsaddr->epr.epr_subcontext);
+    eps->eps_tab[j].ipsaddr = ipsaddr;
+    if (j >= IPS_EPSTATE_COMMIDX_MAX) {
+        return psmi_handle_error(eps->context->ep, PSM_TOO_MANY_ENDPOINTS,
+            "Can't connect to more than %d non-local endpoints",
+            IPS_EPSTATE_COMMIDX_MAX);
+    }
+    *commidx_o = commidx;
+    return PSM_OK;
+}
+
+psm_error_t
+ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx commidx)
+{
+    ips_epstate_idx idx;
+    /* actual table index */
+    idx = (commidx + eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    psmi_assert_always(idx < eps->eps_tabsize);
+    _IPATH_VDBG("commidx=%d, table_idx=%d\n", commidx, idx);
+    eps->eps_tab[idx].epid = 0;
+    eps->eps_tab[idx].ipsaddr = NULL;
+    /* We may eventually want to release memory, but probably not */
+    eps->eps_tabsizeused--;
+    return PSM_OK;
+}
+
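The table above hands each peer a communication index that is the local table slot offset by a boot-time pseudo-random base (eps_base_idx), modulo IPS_EPSTATE_COMMIDX_MAX; ips_epstate_lookup() in the header below undoes the offset. A tiny round-trip check of that arithmetic (added for illustration; the base and slot values are invented):

    #include <assert.h>
    #include <stdint.h>

    #define COMMIDX_MAX (1 << 20)

    int main(void)
    {
        uint32_t base = 0xABCDE & (COMMIDX_MAX - 1); /* stand-in for eps_base_idx */
        uint32_t slot = 42;                          /* table index chosen locally */
        uint32_t wire = (slot - base) & (COMMIDX_MAX - 1);   /* sent to the peer */
        assert(((wire + base) & (COMMIDX_MAX - 1)) == slot); /* lookup recovers it */
        return 0;
    }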
diff --git a/ptl_ips/ips_epstate.h b/ptl_ips/ips_epstate.h
new file mode 100644
index 0000000..b6aca57
--- /dev/null
+++ b/ptl_ips/ips_epstate.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_EPSTATE_H
+#define _IPS_EPSTATE_H
+
+#include "psm_user.h"
+
+typedef uint32_t ips_epstate_idx;
+#define IPS_EPSTATE_COMMIDX_MAX (1<<20)
+#define IPS_EPSTATE_COMMIDX_MASK 0xF0000
+#define IPS_EPSTATE_COMMIDX_SHIFT 14
+#define IPS_EPSTATE_COMMIDX_PACK(ipscommidx) \
+    ((ipscommidx & IPS_EPSTATE_COMMIDX_MASK) \
+     >> IPS_EPSTATE_COMMIDX_SHIFT)
+
+struct ptl_epaddr;
+
+struct ips_epstate_entry {
+    uint64_t epid;
+    struct ptl_epaddr *ipsaddr;
+};
+
+struct ips_epstate {
+    const psmi_context_t *context;
+    ips_epstate_idx eps_base_idx;
+    int eps_tabsize;
+    int eps_tabsizeused;
+    int eps_tab_nextidx;
+
+    struct ips_epstate_entry *eps_tab;
+};
+
+psm_error_t ips_epstate_init(struct ips_epstate *eps, const psmi_context_t *context);
+psm_error_t ips_epstate_fini(struct ips_epstate *eps);
+
+psm_error_t ips_epstate_add(struct ips_epstate *eps,
+                            struct ptl_epaddr *ipsaddr,
+                            ips_epstate_idx *commidx);
+psm_error_t ips_epstate_del(struct ips_epstate *eps, ips_epstate_idx commidx);
+
+PSMI_INLINE(
+struct ips_epstate_entry *
+ips_epstate_lookup(const struct ips_epstate *eps, ips_epstate_idx idx))
+{
+    idx = (idx + eps->eps_base_idx) & (IPS_EPSTATE_COMMIDX_MAX-1);
+    if (idx < eps->eps_tabsize)
+        return &eps->eps_tab[idx];
+    else
+        return NULL;
+}
+
+#endif /* _IPS_EPSTATE_H */
diff --git a/ptl_ips/ips_expected_proto.h b/ptl_ips/ips_expected_proto.h
new file mode 100644
index 0000000..f45c687
--- /dev/null
+++ b/ptl_ips/ips_expected_proto.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +/* + * Control and state structure for one instance of the expected protocol. The + * protocol depends on some upcalls from internal portions of the receive + * protocol (such as opcodes dedicated for expected protocol handling) + */ + +/* Generate an expected header every 16 packets */ +#define PSM_DEFAULT_EXPECTED_HEADER 16 + +struct ips_protoexp { + const struct ptl *ptl; + struct ips_proto *proto; + struct psmi_timer_ctrl *timerq; + struct ips_tid tidc; + struct ips_tfctrl tfctrl; + + unsigned int tidflow_seed; + ptl_epaddr_flow_t tid_ep_flow; + uint32_t tid_flags; + psm_transfer_type_t tid_xfer_type; + struct ips_scbctrl tid_scbc_rv; + mpool_t tid_desc_send_pool; + mpool_t tid_desc_recv_pool; + mpool_t tid_getreq_pool; + mpool_t tid_sreq_pool; /* backptr into proto->ep->mq */ + mpool_t tid_rreq_pool; /* backptr into proto->ep->mq */ + uint32_t tid_send_fragsize; + uint32_t tid_page_offset_mask; + uint64_t tid_page_mask; + uint64_t tid_to_cyc_min; + uint64_t tid_to_cyc_max; + uint32_t tid_to_intr; + uint32_t tid_min_expsend_cnt; + uint32_t hdr_pkt_interval; + struct ips_tidinfo *tid_info; + + STAILQ_HEAD(ips_tid_send_pend, /* pending exp. sends */ + ips_tid_send_desc) pend_sendq; + struct psmi_timer timer_send; + + STAILQ_HEAD(ips_tid_get_pend, + ips_tid_get_request) pend_getreqsq; /* pending tid reqs */ + struct psmi_timer timer_getreqs; + + /* stats */ + uint64_t tid_grant_resends; + uint64_t tid_release_resends; + uint64_t tid_intr_reqs; +}; + +/* + * TID member list format used in communication. The receiver associates + * physical pages to tids and communicates a list of tid,offset,length for + * each registered page. + * + * This format is currently the only one we support, although it is not as + * compact as we would like and other formats are planned in the near future + */ +#define IPS_TID_SESSTYPE_MEMBER_LIST 1 + +typedef struct { + uint16_t tid; + uint16_t offset; + uint16_t length; +} +ips_tid_session_member; + +typedef struct { + uint16_t tsess_type; + uint16_t tsess_tidcount; + uint16_t tsess_tidlist_length; + uint16_t tsess_unaligned_start; + uint16_t tsess_unaligned_end; + + ptl_arg_t tsess_descid; + uint32_t tsess_seqno; + uint32_t tsess_srcoff; + uint32_t tsess_length; + + ips_tid_session_member tsess_list[0]; /* must be last in struct */ +} +ips_tid_session_list; + +/* + * Send-side expected send descriptors. + * + * Descriptors are allocated when tid grant requests are received (the 'target' + * side of an RDMA get request). Descriptors are added to a pending queue of + * expected sends and processed one at a time (scb's are requested and messages + * sent until all fragments of the descriptor's length are put on the wire). 
+ * + */ +#define TIDSENDC_SDMA_VEC_DEFAULT 260 + +struct ips_tid_send_desc { + struct ips_protoexp *protoexp; + STAILQ_ENTRY(ips_tid_send_desc) next; + + /* Filled in at allocation time */ + ptl_arg_t descid; + uint32_t length; + ips_epaddr_t *ipsaddr; + psm_mq_req_t mqreq; + struct ips_flow tidflow; + + uint32_t ctrl_msg_queued; /* bitmap of queued control messages for flow */ + uint32_t completion_counter; + + /* Iterated during send progress */ + void *buffer; + void *bounce_buf; + int tid_idx; + int is_complete; + uint32_t remaining_bytes; + uint32_t remaining_bytes_in_page; + uint32_t frame_send; + uint32_t offset; + uint32_t iovec_cntr_last; + uint32_t release_cnt; + uint32_t unaligned_sent; + uint32_t pad; + + psmi_timer timer_tidrelease; + + union { + ips_tid_session_list tid_list; + uint8_t filler[2096]; + }; +}; + +#define TIDRECVC_STATE_FREE 0 +#define TIDRECVC_STATE_GRANT 1 +#define TIDRECVC_STATE_GRANT_ACK 2 +#define TIDRECVC_STATE_DONE 3 + +struct ips_expected_recv_stats { + uint32_t nSeqErr; + uint32_t nGenErr; + uint32_t nReXmit; + uint32_t nErrChkReceived; +}; + +struct ips_tid_recv_desc { + const psmi_context_t *context; + struct ips_protoexp *protoexp; + ips_epaddr_t *ipsaddr; + STAILQ_ENTRY(ips_tid_recv_desc) next; + + /* desc id held in tid_list below */ + void *buffer; + uint32_t num_recv_hdrs; + uint32_t recv_msglen; + uint32_t grant_cnt; + uint32_t state; + uint32_t cksum; + uint16_t recv_framecnt; + uint16_t flags; + + /* TF protocol state (recv) */ + uint32_t tidflow_idx; + uint32_t tidflow_active_gen; + + psmi_seqnum_t tidflow_genseq; + uint16_t tidflow_nswap_gen; + uint16_t pad; + + uint32_t ctrl_msg_queued; /* bitmap of queued control messages for */ + struct ips_expected_recv_stats stats; + + struct ips_tid_get_request *getreq; + psmi_timer timer_tidreq; + + ips_tidmap_t ts_map; + union { + ips_tid_session_list tid_list; + uint8_t filler[2096]; + }; +}; + +/* + * Get requests, issued by MQ when there's a match on a large message. Unlike + * an RDMA get, the initiator identifies the location of the data at the target + * using a 'send token' instead of a virtual address. This, of course, assumes + * that the target has already registered the token and communicated it to the + * initiator beforehand (it actually sends the token as part of the initial + * MQ message that contains the MQ tag). + * + * The operation is semantically a two-sided RDMA get. + */ +struct ips_tid_get_request { + STAILQ_ENTRY(ips_tid_get_request) tidgr_next; + struct ips_protoexp *tidgr_protoexp; + psm_epaddr_t tidgr_epaddr; + + void *tidgr_lbuf; + uint32_t tidgr_length; + uint32_t tidgr_rndv_winsz; + uint32_t tidgr_sendtoken; + ips_tid_completion_callback_t tidgr_callback; + void *tidgr_ucontext; + + uint32_t tidgr_offset; /* offset in bytes */ + uint32_t tidgr_bytesdone; + uint32_t tidgr_desc_seqno; + uint32_t tidgr_flags; +}; + +/* + * For debug and/or other reasons, we can log the state of each tid and + * optionally associate it to a particular receive descriptor + */ + +#define TIDSTATE_FREE 0 +#define TIDSTATE_USED 1 + +struct ips_tidinfo { + uint16_t tid; + uint16_t state; + struct ips_tid_recv_desc *tidrecvc; +}; + +/* + * Descriptor limits, structure contents of struct psmi_rlimit_mpool for + * normal, min and large configurations. 
+ */ +#define TID_SENDSESSIONS_LIMITS { \ + .env = "PSM_TID_SENDSESSIONS_MAX", \ + .descr = "Tid max send session descriptors", \ + .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ + .minval = 1, \ + .maxval = 1<<30, \ + .mode[PSMI_MEMMODE_NORMAL] = { 256, 4096 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 512, 8192 } \ + } + +#define TID_RECVSESSIONS_LIMITS { \ + .env = "PSM_TID_RECVSESSIONS_MAX", \ + .descr = "Tid max receive session descriptors", \ + .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ + .minval = 1, \ + .maxval = 512, \ + .mode[PSMI_MEMMODE_NORMAL] = { 32, 512 }, \ + .mode[PSMI_MEMMODE_MINIMAL] = { 1, 1 }, \ + .mode[PSMI_MEMMODE_LARGE] = { 32, 512 } \ + } diff --git a/ptl_ips/ips_opp_path_rec.c b/ptl_ips/ips_opp_path_rec.c new file mode 100644 index 0000000..affe5da --- /dev/null +++ b/ptl_ips/ips_opp_path_rec.c @@ -0,0 +1,444 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include + +#define DF_OPP_LIBRARY "libofedplus.so" +#define DATA_VFABRIC_OFFSET 8 + +/* SLID and DLID are in network byte order */ +static psm_error_t +ips_opp_get_path_rec(ips_path_type_t type, struct ips_proto *proto, + uint16_t slid, uint16_t dlid, uint16_t desthca_type, + ips_path_rec_t **path_rec) +{ + psm_error_t err = PSM_OK; + ibta_path_rec_t query; + ips_opp_path_rec_t *opp_path_rec; + int opp_err; + ENTRY elid, *epath = NULL; + char eplid[128]; + uint64_t timeout_ack_ms; + + /* Query path record query cache first */ + bzero(&query, sizeof(query)); + bzero(eplid, sizeof(eplid)); + + /* Bulk service ID is control service id + 1 */ + switch(type) { + case IPS_PATH_NORMAL_PRIORITY: + case IPS_PATH_LOW_PRIORITY: + query.service_id = + __cpu_to_be64(proto->ep->service_id + DATA_VFABRIC_OFFSET); + break; + case IPS_PATH_HIGH_PRIORITY: + default: + query.service_id = __cpu_to_be64(proto->ep->service_id); + } + + query.slid = slid; + query.dlid = dlid; + + snprintf(eplid, sizeof(eplid), "%s_%x_%x", (type == IPS_PATH_HIGH_PRIORITY) ? "HIGH" : "LOW", query.slid,query.dlid); + elid.key = eplid; + hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash); + + if (!epath) { /* Unable to find path record in cache */ + elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1); + opp_path_rec = (ips_opp_path_rec_t*) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_opp_path_rec_t)); + if (!elid.key || !opp_path_rec) { + if (elid.key) psmi_free(elid.key); + if (opp_path_rec) psmi_free(opp_path_rec); + err = PSM_NO_MEMORY; + goto fail; + } + + /* Get path record between local LID and remote */ + opp_err = proto->opp_fn.op_path_get_path_by_rec(proto->opp_ctxt, &query, + &opp_path_rec->opp_response); + if (opp_err) { + psmi_free(opp_path_rec); + psmi_free(elid.key); + err = PSM_EPID_PATH_RESOLUTION; + goto fail; + } + + /* Create path record */ + opp_path_rec->ips.epr_slid = opp_path_rec->opp_response.slid; + opp_path_rec->ips.epr_dlid = opp_path_rec->opp_response.dlid; + opp_path_rec->ips.epr_mtu = + min(ibta_mtu_enum_to_int(opp_path_rec->opp_response.mtu & 0x3f), + proto->epinfo.ep_mtu); + opp_path_rec->ips.epr_pkey = ntohs(opp_path_rec->opp_response.pkey); + opp_path_rec->ips.epr_sl = ntohs(opp_path_rec->opp_response.qos_class_sl); + opp_path_rec->ips.epr_static_rate = opp_path_rec->opp_response.rate & 0x3f; + opp_path_rec->ips.epr_static_ipd = + proto->ips_ipd_delay[opp_path_rec->ips.epr_static_rate]; + + /* Setup CCA parameters for path */ + if (opp_path_rec->ips.epr_sl > 15) { + psmi_free(opp_path_rec); + psmi_free(elid.key); + err = PSM_INTERNAL_ERR; + goto fail; + } + if (!(proto->ccti_ctrlmap&(1<ips.epr_sl))) { + _IPATH_CCADBG("No CCA for sl %d, disable CCA\n", + opp_path_rec->ips.epr_sl); + proto->flags &= ~IPS_PROTO_FLAG_CCA; + } + opp_path_rec->ips.proto = proto; + opp_path_rec->ips.epr_ccti_min = proto->cace[opp_path_rec->ips.epr_sl].ccti_min; + opp_path_rec->ips.epr_ccti = opp_path_rec->ips.epr_ccti_min; + psmi_timer_entry_init(&opp_path_rec->ips.epr_timer_cca, + ips_cca_timer_callback, &opp_path_rec->ips); + + /* Determine active IPD for path. 
+static psm_error_t
+ips_opp_path_rec(struct ips_proto *proto,
+                 uint16_t slid, uint16_t dlid, uint16_t desthca_type,
+                 unsigned long timeout,
+                 ips_epaddr_t *ipsaddr)
+{
+    psm_error_t err = PSM_OK;
+    uint16_t pidx, cpath, num_path = (1 << proto->epinfo.ep_lmc);
+    ips_path_type_t path_type = IPS_PATH_NORMAL_PRIORITY;
+    ips_path_rec_t *path;
+    uint16_t path_slid, path_dlid;
+    psmi_context_t *context = &proto->ep->context;
+
+    /*
+     * High Priority Path
+     * ------------------
+     *
+     * Uses the "base" Service ID. For now there exists only 1 high priority
+     * path between nodes, even for non zero LMC fabrics.
+     * TODO: Investigate if there are any benefits for using multiple high
+     * priority paths. Initial empirical data shows that this leads to worse
+     * performance as the bulk data can induce HOL blocking.
+     * Currently the normal and low priority paths are the same, but at some
+     * point we can create separate vFabrics to further distinguish/isolate
+     * those traffic flows.
+     *
+     * Normal/Low Priority Paths
+     * -------------------------
+     *
+     * Currently these paths are the same, i.e. they are queried for the same
+     * Service ID/vFabric, which is the base (high priority) Service ID plus
+     * DATA_VFABRIC_OFFSET.
+     *
+     * Use case Scenarios
+     * ------------------
+     *
+     * Since with vFabrics we have the capability to define different QoS
+     * parameters per vFabric, it is envisioned that IPS_PATH_HIGH_PRIORITY is
+     * set up in a separate vFabric for high priority traffic.
+     * The NORMAL paths are set up in a separate vFabric optimized for high
+     * bandwidth. This allows us to potentially have control traffic (RTS,
+     * CTS etc.) not be bottlenecked by bulk transfer data. All control
+     * messages (ACKs, NAKs, TID_GRANT etc.) also use the high priority
+     * control vFabric.
+     *
+     * NOTE: In order to distinguish between the different vFabrics the user
+     * specifies the service ID to use via mpirun (or an environment
+     * variable). This is the service ID for the high priority control
+     * traffic. The bulk data vFabric is identified by that service ID plus
+     * DATA_VFABRIC_OFFSET. So for each MPI application one should specify
+     * two service IDs, for the high priority and the bulk data. Both these
+     * service IDs can be placed in the same vFabric, which can be configured
+     * for high priority or bandwidth traffic, giving us the default behavior
+     * up to the InfiniPath 2.5 release.
+     *
+     * NOTE: All of the above would have really helped if the S20 silicon
+     * could correctly support IBTA QoS features. Due to the S20 design we
+     * can only have a high priority VLarb table (a low priority VLarb table
+     * results in round robin arbitration, ignoring the weights!). But if
+     * this is fixed in a subsequent chip respin then this may potentially
+     * help our scalability on large fabrics.
+     *
+     * Mesh/Torus and DOR routed networks
+     * ----------------------------------
+     *
+     * In a mesh/torus fabric we always have a non zero LMC (at least 1, and
+     * it can be more). We would like to take advantage of dispersive routing
+     * on these fabrics as well to obtain better "worst case/congested"
+     * bandwidth. For these networks the base LIDs are currently used for
+     * UPDN routing, which is suboptimal. Higher order LIDs (+1 .. +N) use
+     * DOR routing (Dimension Ordered Routing) to avoid deadlocks and provide
+     * higher performance. If a fabric is disrupted then only the base UPDN
+     * routing is available. PSM should continue to operate in this
+     * environment, albeit with degraded performance. In a disrupted fabric
+     * the OPP path record queries may fail for some DOR routed LIDs (i.e. no
+     * path exists); PSM should hence ignore path record failures, as they
+     * indicate a disrupted fabric, and only use the valid paths that are
+     * returned from the replica. This will degenerate to using only the UPDN
+     * paths on disrupted fabrics and the DOR routes on fully configured
+     * fabrics. Note: For a clean fabric the base LIDs that are configured
+     * for the UPDN route will not exist in the replica, as DOR routes are
+     * preferred. Hence we will dispersively route across the DOR routes,
+     * falling back to the UPDN route only for disrupted fabrics.
+     *
+     * AS LONG AS ONE PATH EXISTS (for each of the priorities) COMMUNICATION
+     * CAN TAKE PLACE.
+     */
+
+    /* If base lids are only used then reset num_path to 1 */
+    if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE)
+        num_path = 1;
+
+    ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY] =
+        ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY] =
+        ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY] = 0;
+
+    /* For now there is always only one high priority path between nodes. */
+    for (pidx = 0, cpath = 0; pidx < num_path && cpath == 0; pidx++) {
+        path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+        path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+        err = ips_opp_get_path_rec(IPS_PATH_HIGH_PRIORITY, proto,
+                                   path_slid, path_dlid,
+                                   desthca_type, &path);
+
+        if (err == PSM_OK) { /* Valid high priority path found */
+            /* Resolved high priority path successfully */
+            ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]++;
+            ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][cpath] = path;
+
+            /* Increment current path index */
+            cpath++;
+        }
+    }
+
+    /* Make sure we have at least 1 high priority path */
+    if (ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY] == 0) {
+        err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION,
+            "OFED Plus path lookup failed. Unable to resolve high priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"PRIx64" defined?",
+            ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id);
+        goto fail;
+    }
+
+    /* Next setup the bulk paths. If the subnet administrator has misconfigured
+     * or rather not configured two separate service IDs we place the bulk
+     * paths in the same vFabric as the control paths.
+     */
+    for (pidx = 0, cpath = 0; pidx < num_path; pidx++) {
+        path_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx);
+        path_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx);
+
+    retry_path_res:
+        err = ips_opp_get_path_rec(path_type, proto,
+                                   path_slid, path_dlid, desthca_type,
+                                   &path);
+        if (err != PSM_OK) {
+            if (path_type == IPS_PATH_NORMAL_PRIORITY) {
+                /* Subnet may only be configured for one service ID/vFabric. Default
+                 * to using the control vFabric/service ID for bulk data as well.
+                 */
+                path_type = IPS_PATH_HIGH_PRIORITY;
+                goto retry_path_res;
+            }
+
+            /* Unable to resolve path for <path_slid, path_dlid>. This is
+             * possible for disrupted fabrics using DOR routing so continue
+             * to acquire paths */
+            err = PSM_OK;
+            continue;
+        }
+
+        /* Valid path. For now both normal and low priority paths are the same */
+        ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][cpath] = path;
+        ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][cpath] = path;
+        ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY]++;
+        ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY]++;
+        cpath++;
+    }
+
+    /* Make sure we have at least a single bulk data transfer path */
+    if ((ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY] == 0) ||
+        (ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY] == 0)) {
+        err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION,
+            "OFED Plus path lookup failed. Unable to resolve normal/low priority network path for LID 0x%x <---> 0x%x. Is the SM running or service ID %"PRIx64" defined?",
+            ntohs(slid), ntohs(dlid), (uint64_t) proto->ep->service_id);
+        goto fail;
+    }
+
+    if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+        ipsaddr->epr.epr_hpp_index = 0;
+        ipsaddr->epr.epr_next_path[IPS_PATH_NORMAL_PRIORITY] =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY];
+        ipsaddr->epr.epr_next_path[IPS_PATH_LOW_PRIORITY] =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY];
+    }
+    else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+        ipsaddr->epr.epr_hpp_index =
+            ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY];
+    else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+        ipsaddr->epr.epr_hpp_index =
+            context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY];
+    else /* Base LID */
+        ipsaddr->epr.epr_hpp_index = 0;
+
+ fail:
+    if (err != PSM_OK)
+        _IPATH_PRDBG("Unable to get path record for LID 0x%x <---> DLID 0x%x.\n", slid, dlid);
+    return err;
+}
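A compact model of the four selection policies seeded above (illustrative only: the per-message advance of `epr_next_path` under the adaptive policy happens elsewhere in the send path, so the rotation step shown is an assumption about intent, not a copy of that code):

    #include <stdint.h>

    enum policy { ADAPTIVE, STATIC_SRC, STATIC_DST, STATIC_BASE };

    /* pick a path index for the next bulk message */
    static uint32_t pick_path(enum policy p, uint32_t *next,
                              uint32_t my_context, uint32_t peer_context,
                              uint32_t num_paths)
    {
        switch (p) {
        case ADAPTIVE:    return (*next)++ % num_paths;    /* rotate over paths */
        case STATIC_SRC:  return my_context % num_paths;   /* fixed, sender-derived */
        case STATIC_DST:  return peer_context % num_paths; /* fixed, receiver-derived */
        case STATIC_BASE: default: return 0;               /* always the base LID pair */
        }
    }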
+static psm_error_t ips_opp_fini(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+
+    if (proto->opp_lib)
+        dlclose(proto->opp_lib);
+
+    return err;
+}
+
+psm_error_t ips_opp_init(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+    struct ipath_base_info *base_info = &proto->ep->context.base_info;
+    char hcaName[32];
+
+    proto->opp_lib = dlopen(DF_OPP_LIBRARY, RTLD_NOW);
+    if (!proto->opp_lib) {
+        char *err = dlerror();
+        _IPATH_ERROR("Unable to open OFED Plus Plus library %s. Error: %s\n",
+                     DF_OPP_LIBRARY, err ? err : "no dlerror()");
+        goto fail;
+    }
+
+    /* Resolve symbols that we require within opp library */
+    proto->opp_fn.op_path_find_hca = dlsym(proto->opp_lib, "op_path_find_hca");
+    proto->opp_fn.op_path_open = dlsym(proto->opp_lib, "op_path_open");
+    proto->opp_fn.op_path_close = dlsym(proto->opp_lib, "op_path_close");
+    proto->opp_fn.op_path_get_path_by_rec = dlsym(proto->opp_lib, "op_path_get_path_by_rec");
+
+    /* If we can't resolve any symbol then fail to load the opp module */
+    if (!proto->opp_fn.op_path_find_hca || !proto->opp_fn.op_path_open ||
+        !proto->opp_fn.op_path_close || !proto->opp_fn.op_path_get_path_by_rec) {
+        _IPATH_PRDBG("Unable to resolve symbols in OPP library. Unloading.\n");
+        goto fail;
+    }
+
+    /* If PSM_IDENTIFY is set, display the OPP library location being used. */
+    if (getenv("PSM_IDENTIFY")) {
+        Dl_info info_opp;
+        _IPATH_INFO("PSM path record queries using OFED Plus Plus (%s) from %s\n",
+                    DF_OPP_LIBRARY,
+                    dladdr(proto->opp_fn.op_path_open, &info_opp) ? info_opp.dli_fname :
+                    "Unknown/unsupported version of OPP library found!");
+    }
+
+    /* Obtain handle to hca (requires verbs on node) */
+    snprintf(hcaName, sizeof(hcaName), "qib%d", base_info->spi_unit);
+    proto->hndl = proto->opp_fn.op_path_find_hca(hcaName, &proto->device);
+    if (!proto->hndl) {
+        _IPATH_ERROR("OPP: Unable to find HCA %s. Disabling OPP interface for path record queries.\n", hcaName);
+        goto fail;
+    }
+
+    /* Get OPP context */
+    proto->opp_ctxt = proto->opp_fn.op_path_open(proto->device, base_info->spi_port);
+    if (!proto->opp_ctxt) {
+        _IPATH_ERROR("OPP: Unable to obtain OPP context. 
Disabling OPP interface for path record queries.\n"); + goto fail; + } + + /* OPP initialized successfully */ + proto->ibta.get_path_rec = ips_opp_path_rec; + proto->ibta.fini = ips_opp_fini; + proto->flags |= IPS_PROTO_FLAG_QUERY_PATH_REC; + + return err; + + fail: + _IPATH_ERROR("Make sure SM is running...\n"); + _IPATH_ERROR("Make sure service dist_sa is running...\n"); + _IPATH_ERROR("to start dist_sa: service dist_sa start\n"); + _IPATH_ERROR("or enable it at boot time: iba_config -E dist_sa\n\n"); + + err = psmi_handle_error(NULL, PSM_EPID_PATH_RESOLUTION, + "Unable to initialize OFED Plus library successfully.\n"); + + if (proto->opp_lib) + dlclose(proto->opp_lib); + + return err; +} + diff --git a/ptl_ips/ips_path_rec.c b/ptl_ips/ips_path_rec.c new file mode 100644 index 0000000..be3b41c --- /dev/null +++ b/ptl_ips/ips_path_rec.c @@ -0,0 +1,660 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <stdio.h>	/* snprintf */
+#include <string.h>	/* strcpy, strerror */
+#include <search.h>	/* hsearch_r, ENTRY */
+#include "psm_user.h"
+#include "ipserror.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+
+static void
+ips_gen_ipd_table(struct ips_proto *proto)
+{
+    /* Based on our current link rate setup the IPD table */
+    switch(proto->epinfo.ep_link_rate) {
+    case IBTA_RATE_10_GBPS:
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 3;
+        break;
+    case IBTA_RATE_20_GBPS:
+        proto->ips_ipd_delay[IBTA_RATE_20_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 3;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 7;
+        break;
+    case IBTA_RATE_40_GBPS:
+    default:
+        proto->ips_ipd_delay[IBTA_RATE_40_GBPS] = 0;
+        proto->ips_ipd_delay[IBTA_RATE_30_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_20_GBPS] = 1;
+        proto->ips_ipd_delay[IBTA_RATE_10_GBPS] = 3;
+        proto->ips_ipd_delay[IBTA_RATE_5_GBPS] = 7;
+        proto->ips_ipd_delay[IBTA_RATE_2_5_GBPS] = 15;
+        break;
+    }
+}
+
+static psm_error_t
+ips_gen_cct_table(struct ips_proto *proto)
+{
+    psm_error_t err = PSM_OK;
+    uint32_t cca_divisor, ipdidx, ipdval = 1;
+    uint16_t *cct_table;
+
+    /* The CCT table is static currently. If it's already created then return */
+    if (proto->cct)
+        goto fail;
+
+    /* Allocate the CCT table */
+    cct_table = psmi_calloc(proto->ep, UNDEFINED,
+                            proto->ccti_size, sizeof(uint16_t));
+    if (!cct_table) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+
+    /* The first table entry is always 0 i.e. no IPD delay */
+    cct_table[0] = 0;
+
+    /* Generate the remaining CCT table entries */
+    for (ipdidx = 1; ipdidx < proto->ccti_size; ipdidx += 4, ipdval++)
+        for (cca_divisor = 0; cca_divisor < 4; cca_divisor++) {
+            if ((ipdidx+cca_divisor) == proto->ccti_size) break;
+            cct_table[ipdidx+cca_divisor] =
+                (((cca_divisor ^ 0x3) << CCA_DIVISOR_SHIFT) | (ipdval & 0x3FFF));
+            _IPATH_VDBG("CCT[%d] = %x. Divisor: %x, IPD: %x\n",
+                        ipdidx+cca_divisor, cct_table[ipdidx+cca_divisor],
+                        (cct_table[ipdidx+cca_divisor] >> CCA_DIVISOR_SHIFT),
+                        cct_table[ipdidx+cca_divisor] & CCA_IPD_MASK);
+        }
+
+    /* On link up/down the CCT is re-generated. If a CCT table was previously
+     * created, free it.
+     */
+    if (proto->cct) {
+        psmi_free(proto->cct);
+        proto->cct = NULL;
+    }
+
+    /* Update to the new CCT table */
+    proto->cct = cct_table;
+
+ fail:
+    return err;
+}
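Each 16-bit CCT entry generated above packs a shift divisor into the top two bits (stored inverted) and an inter-packet delay value into the low 14 bits. A small encode/decode check (added for illustration; the constants are assumed to match CCA_DIVISOR_SHIFT == 14 and CCA_IPD_MASK == 0x3FFF, which the packing expression implies):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const unsigned SHIFT = 14;          /* assumed CCA_DIVISOR_SHIFT */
        const uint16_t IPD_MASK = 0x3FFF;   /* assumed CCA_IPD_MASK */

        /* encode as in ips_gen_cct_table(): divisor 1, ipd value 5 */
        uint16_t entry = (uint16_t)(((1u ^ 0x3) << SHIFT) | (5u & IPD_MASK));

        /* decode the way the path-record code does */
        assert((entry >> SHIFT) == 2);      /* divisor stored inverted: 1^3 == 2 */
        assert((entry & IPD_MASK) == 5);    /* inter-packet delay value */
        return 0;
    }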
+static ibta_rate
+ips_default_hca_rate(uint16_t hca_type)
+{
+    ibta_rate rate = IBTA_RATE_40_GBPS;
+
+    switch(hca_type){
+    case PSMI_HCA_TYPE_QLE73XX:
+        rate = IBTA_RATE_40_GBPS;
+        break;
+    case PSMI_HCA_TYPE_QLE72XX:
+        rate = IBTA_RATE_20_GBPS;
+        break;
+    case PSMI_HCA_TYPE_QLE71XX:
+        rate = IBTA_RATE_10_GBPS;
+        break;
+    }
+
+    return rate;
+}
+
+static ibta_rate
+ips_rate_to_enum(int link_rate)
+{
+    ibta_rate rate;
+
+    switch(link_rate) {
+    case 40:
+        rate = IBTA_RATE_40_GBPS;
+        break;
+    case 20:
+        rate = IBTA_RATE_20_GBPS;
+        break;
+    case 10:
+        rate = IBTA_RATE_10_GBPS;
+        break;
+    case 5:
+        rate = IBTA_RATE_5_GBPS;
+        break;
+    case 2:
+        rate = IBTA_RATE_2_5_GBPS;
+        break;
+    default:
+        rate = IBTA_RATE_PORT_CURRENT;
+    }
+
+    return rate;
+}
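One reading of the IPD table in ips_gen_ipd_table() above, consistent with the IBTA static-rate convention but not spelled out in the source: a delay of N throttles transmission to roughly 1/(N+1) of the local link rate, so a faster sender does not overrun a slower peer. A quick arithmetic check of the 40 Gbps row under that assumption:

    #include <assert.h>

    int main(void)
    {
        int local_gbps = 40;
        int delay_for_10g_peer = 3;            /* 40 Gbps table row, 10 Gbps peer */
        assert(local_gbps / (delay_for_10g_peer + 1) == 10);
        int delay_for_5g_peer = 7;             /* same row, 5 Gbps peer */
        assert(local_gbps / (delay_for_5g_peer + 1) == 5);
        return 0;
    }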
+static psm_error_t
+ips_none_get_path_rec(struct ips_proto *proto,
+                      uint16_t slid, uint16_t dlid, uint16_t desthca_type,
+                      unsigned long timeout, ips_path_rec_t **prec)
+{
+    psm_error_t err = PSM_OK;
+    ENTRY elid, *epath = NULL;
+    char eplid[128];
+    ips_path_rec_t *path_rec;
+
+    /* Query the path record cache */
+    snprintf(eplid, sizeof(eplid), "%x_%x", slid, dlid);
+    elid.key = eplid;
+    hsearch_r(elid, FIND, &epath, &proto->ips_path_rec_hash);
+
+    if (!epath) {
+        elid.key = psmi_calloc(proto->ep, UNDEFINED, 1, strlen(eplid) + 1);
+        path_rec = (ips_path_rec_t*)
+            psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(ips_path_rec_t));
+        if (!elid.key || !path_rec) {
+            if (elid.key) psmi_free(elid.key);
+            if (path_rec) psmi_free(path_rec);
+            return PSM_NO_MEMORY;
+        }
+
+        /* Create path record */
+        path_rec->epr_slid = slid;
+        path_rec->epr_dlid = dlid;
+        path_rec->epr_mtu = proto->epinfo.ep_mtu;
+        path_rec->epr_pkey = proto->epinfo.ep_pkey;
+        path_rec->epr_sl = proto->epinfo.ep_sl;
+
+        /* Determine the IPD based on our local link rate and the default link
+         * rate for the remote hca type.
+         */
+        path_rec->epr_static_rate =
+            ips_default_hca_rate(desthca_type);
+        path_rec->epr_static_ipd =
+            proto->ips_ipd_delay[path_rec->epr_static_rate];
+
+        /* Setup CCA parameters for path */
+        if (path_rec->epr_sl > 15) {
+            psmi_free(elid.key);
+            psmi_free(path_rec);
+            return PSM_INTERNAL_ERR;
+        }
+        if (!(proto->ccti_ctrlmap&(1<<path_rec->epr_sl))) {
+            _IPATH_CCADBG("No CCA for sl %d, disable CCA\n", path_rec->epr_sl);
+            proto->flags &= ~IPS_PROTO_FLAG_CCA;
+        }
+        path_rec->proto = proto;
+        path_rec->epr_ccti_min = proto->cace[path_rec->epr_sl].ccti_min;
+        path_rec->epr_ccti = path_rec->epr_ccti_min;
+        psmi_timer_entry_init(&path_rec->epr_timer_cca,
+                              ips_cca_timer_callback, path_rec);
+
+        /* Determine active IPD for path. It is the max of the static rate
+         * and the CCT table entry. */
+        if ((path_rec->epr_static_ipd) &&
+            ((path_rec->epr_static_ipd + 1) >
+             (proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK))) {
+            path_rec->epr_active_ipd = path_rec->epr_static_ipd + 1;
+            path_rec->epr_cca_divisor = 0;
+        }
+        else {
+            /* Pick it from the CCT table */
+            path_rec->epr_active_ipd = proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK;
+            path_rec->epr_cca_divisor =
+                proto->cct[path_rec->epr_ccti] >> CCA_DIVISOR_SHIFT;
+        }
+
+        /* Setup default errorcheck timeout. */
+        path_rec->epr_timeout_ack =
+            proto->epinfo.ep_timeout_ack;
+        path_rec->epr_timeout_ack_max =
+            proto->epinfo.ep_timeout_ack_max;
+        path_rec->epr_timeout_ack_factor =
+            proto->epinfo.ep_timeout_ack_factor;
+
+        /* Add path record into cache */
+        strcpy(elid.key, eplid);
+        elid.data = (void*) path_rec;
+        hsearch_r(elid, ENTER, &epath, &proto->ips_path_rec_hash);
+    }
+    else
+        path_rec = (ips_path_rec_t*) epath->data;
+
+    /* Return IPS path record */
+    *prec = path_rec;
+
+    return err;
+}
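Both path-record backends apply the same "active IPD" rule seen above: the slower of the peer's static rate and the congestion table wins. A worked instance of that comparison (values invented for illustration): a static IPD of 7 beats a CCT entry whose IPD field is 3, so the static rate dominates and the CCA divisor is forced to 0.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint16_t static_ipd = 7;      /* from the IPD table, illustrative */
        uint16_t cct_ipd = 3;         /* low 14 bits of a CCT entry, illustrative */
        uint16_t active_ipd, divisor;

        if (static_ipd && (uint16_t)(static_ipd + 1) > cct_ipd) {
            active_ipd = static_ipd + 1;  /* static rate dominates */
            divisor = 0;                  /* static rate has no CCA divisor */
        } else {
            active_ipd = cct_ipd;         /* congestion table dominates */
            divisor = 2;                  /* would come from the entry's top bits */
        }
        assert(active_ipd == 8 && divisor == 0);
        return 0;
    }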
To get better load + * balance + */ + for (pidx = 0; pidx < num_path; pidx++) { + ips_path_rec_t *path; + + base_slid = __cpu_to_be16(__be16_to_cpu(slid) + pidx); + base_dlid = __cpu_to_be16(__be16_to_cpu(dlid) + pidx); + + err = ips_none_get_path_rec(proto, base_slid, base_dlid, desthca_type, + timeout, &path); + if (err != PSM_OK) + goto fail; + + if (num_path > 1) { + if (pidx == 0) { + /* First path is always the high priority path */ + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0] = path; + } + else { + ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][pidx-1] = path; + ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][pidx-1] = path; + } + } + else { + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0] = path; + ipsaddr->epr.epr_path[IPS_PATH_NORMAL_PRIORITY][0] = path; + ipsaddr->epr.epr_path[IPS_PATH_LOW_PRIORITY][0] = path; + } + } + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) { + ipsaddr->epr.epr_hpp_index = 0; + ipsaddr->epr.epr_next_path[IPS_PATH_NORMAL_PRIORITY] = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_NORMAL_PRIORITY]; + ipsaddr->epr.epr_next_path[IPS_PATH_LOW_PRIORITY] = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_LOW_PRIORITY]; + } + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + ipsaddr->epr.epr_hpp_index = + ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]; + else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + ipsaddr->epr.epr_hpp_index = + context->base_info.spi_context % ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY]; + else /* Base LID */ + ipsaddr->epr.epr_hpp_index = 0; + + fail: + if (err != PSM_OK) + _IPATH_PRDBG("Unable to get path record for LID %x <---> DLID %x.\n", slid, dlid); + return err; +} + +static psm_error_t ips_none_path_rec_init(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val psm_set_hca_pkey; + + /* Obtain the SL and PKEY to use from the environment (IPATH_SL & PSM_KEY) */ + proto->epinfo.ep_sl = psmi_epid_sl(proto->ep->epid); + proto->epinfo.ep_pkey = (uint16_t) proto->ep->network_pkey; + + /* + * Parse the err_chk settings from the environment. + * :: + */ + { + union psmi_envvar_val env_to; + char *errchk_to = PSM_TID_TIMEOUT_DEFAULT; + int tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; + + if (!psmi_getenv("PSM_ERRCHK_TIMEOUT", + "Errchk timeouts in mS ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) errchk_to, &env_to)) + { + /* Not using default values, parse what we can */ + errchk_to = env_to.e_str; + psmi_parse_str_tuples(errchk_to, 3, tvals); + /* Adjust for max smaller than min, things would break */ + if (tvals[1] < tvals[0]) + tvals[1] = tvals[0]; + } + proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); + proto->epinfo.ep_timeout_ack_max = ms_2_cycles(tvals[1]); + proto->epinfo.ep_timeout_ack_factor = tvals[2]; + } + + /* With no path records queries set pkey manually if PSM_SET_HCA_PKEY is + * set. 
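+ * (e.g. running with PSM_SET_HCA_PKEY=1 in the environment forces the
+ * ipath_set_pkey() write below; by default the pkey is left untouched.)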
*/ + psmi_getenv("PSM_SET_HCA_PKEY", + "Force write of PKey to HCA (default is disabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 0, &psm_set_hca_pkey); + + if (psm_set_hca_pkey.e_uint) { + if (ipath_set_pkey(proto->ep->context.ctrl, + (uint16_t) proto->ep->network_pkey) != 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Couldn't set device pkey %d: %s", + (int) proto->ep->network_pkey, + strerror(errno)); + goto fail; + } + } + + proto->ibta.get_path_rec = ips_none_path_rec; + proto->ibta.fini = NULL; + + fail: + return err; +} + +/* (Re)load the SL2VL table */ +psm_error_t ips_ibta_init_sl2vl_table(struct ips_proto *proto) +{ + int ret, sli; + + /* Get SL2VL table for unit, port */ + for (sli = 0; sli < 16; sli++) { + if ((ret = ipath_get_port_sl2vl(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + (uint8_t) sli)) < 0) { + /* Unable to get SL2VL. Set it to default */ + ret = PSMI_VL_DEFAULT; + } + + proto->sl2vl[sli] = ret; + } + + return PSM_OK; +} + +/* On link up/down we need to update some state */ +psm_error_t ips_ibta_link_updown_event(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + int ret; + + /* Get base lid, lmc and rate as these may have changed if the link bounced */ + proto->epinfo.ep_base_lid = + __cpu_to_be16((uint16_t) psm_epid_nid(proto->ep->context.epid)); + if ((ret = ipath_get_port_lmc(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port)) < 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Could not obtain LMC for unit %u:%d. Error: %s", + proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + strerror(errno)); + goto fail; + } + proto->epinfo.ep_lmc = min(ret, IPS_MAX_PATH_LMC); + + if ((ret = ipath_get_port_rate(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port)) < 0) { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Could not obtain link rate for unit %u:%d. Error: %s", + proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, + strerror(errno)); + goto fail; + } + proto->epinfo.ep_link_rate = ips_rate_to_enum(ret); + + /* Load the SL2VL table */ + ips_ibta_init_sl2vl_table(proto); + + /* Regenerate new IPD table for the updated link rate. */ + ips_gen_ipd_table(proto); + + /* Generate the CCT table. */ + err = ips_gen_cct_table(proto); + + fail: + return err; +} + +psm_error_t ips_ibta_init(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + union psmi_envvar_val psm_path_policy; + union psmi_envvar_val disable_cca; + + /* Get the path selection policy */ + psmi_getenv("PSM_PATH_SELECTION", + "Policy to use if multiple paths are available between endpoints. Options are adaptive, static_src, static_dest, static_base.
Default is adaptive.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) "adaptive", + &psm_path_policy); + + if (!strcasecmp((const char*) psm_path_policy.e_str, "adaptive")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_ADAPTIVE; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_src")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_SRC; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_dest")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_DST; + else if (!strcasecmp((const char*) psm_path_policy.e_str, "static_base")) + proto->flags |= IPS_PROTO_FLAG_PPOLICY_STATIC_BASE; + + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) + _IPATH_PRDBG("Using adaptive path selection.\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC) + _IPATH_PRDBG("Static path selection: Src Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST) + _IPATH_PRDBG("Static path selection: Dest Context\n"); + if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_BASE) + _IPATH_PRDBG("Static path selection: Base LID \n"); + + psmi_getenv("PSM_DISABLE_CCA", + "Disable use of Congestion Control Architecture (CCA) [enabled] ", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &disable_cca); + if (disable_cca.e_uint) + _IPATH_CCADBG("CCA is disabled for congestion control.\n"); + else + proto->flags |= IPS_PROTO_FLAG_CCA; + + { + /* Get CCA related parameters from the environment */ + union psmi_envvar_val ccti_incr; + union psmi_envvar_val ccti_timer; + union psmi_envvar_val ccti_size; + int i; + char ccabuf[256]; + uint8_t *p; + +/* + * If the user set any of these environment variables, use self-configured CCA. + */ + if (getenv("PSM_CCTI_INCREMENT") || getenv("PSM_CCTI_TIMER") || getenv("PSM_CCTI_TABLE_SIZE")) { + goto selfcca; + } + +/* + * Check the qib driver CCA setting, and try to use it if available. + * Fall back to the self CCA setting on errors. + */ + i = ipath_get_cc_settings_bin(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, ccabuf); + if (i <= 0) { + goto selfcca; + } + p = (uint8_t *)ccabuf; + memcpy(&proto->ccti_portctrl, p, 2); p += 2; + memcpy(&proto->ccti_ctrlmap, p, 2); p += 2; + for (i=0; i<16; i++) { + proto->cace[i].ccti_increase = *p; p++; + memcpy(&proto->cace[i].ccti_timer_cycles, p, 2); p += 2; + proto->cace[i].ccti_timer_cycles = + us_2_cycles(proto->cace[i].ccti_timer_cycles); + proto->cace[i].ccti_threshold = *p; p++; + proto->cace[i].ccti_min = *p; p++; + } + + i = ipath_get_cc_table_bin(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port, &proto->cct); + if (i < 0) { + err = PSM_NO_MEMORY; + goto fail; + } else if (i == 0) { + goto selfcca; + } + proto->ccti_limit = i; + proto->ccti_size = proto->ccti_limit + 1; + goto finishcca; + +/* + * Since there are no qib driver CCA settings, use the self-built CCA.
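+ * The self-built table is driven by the three PSM_CCTI_* variables read
+ * below, e.g. PSM_CCTI_TABLE_SIZE=128 PSM_CCTI_TIMER=1
+ * PSM_CCTI_INCREMENT=1 (an illustrative setting that matches the
+ * defaults).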
+ */ + selfcca: + psmi_getenv("PSM_CCTI_INCREMENT", + "IBTA_CCA: Index increment for CCT table on receipt of a BECN packet (less than table size, default 1)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, + &ccti_incr); + + psmi_getenv("PSM_CCTI_TIMER", + "IBTA_CCA: CCT table congestion timer (>0, default 1 us)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, + &ccti_timer); + + psmi_getenv("PSM_CCTI_TABLE_SIZE", + "IBTA_CCA: Number of entries in CCT table (multiple of 64, default 128)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) DF_CCT_TABLE_SIZE, //128 + &ccti_size); + + /* Check the invalid values. */ + if (ccti_size.e_uint < 64 || ccti_size.e_uint%64) { + _IPATH_INFO("Invalid PSM_CCTI_TABLE_SIZE=%d, at least 64 and multiple of 64, setting to default 128\n", + ccti_size.e_uint); + ccti_size.e_uint = 128; + } + proto->ccti_size = ccti_size.e_uint; + /* For now the CCT limit is same as table size. + * This does not have to be the case. */ + proto->ccti_limit = proto->ccti_size - 1; + + if (ccti_timer.e_uint <= 0) { + _IPATH_INFO("Invalid PSM_CCTI_TIMER=%d, should be bigger than 0, setting to default 1\n", + ccti_timer.e_uint); + ccti_timer.e_uint = 1; + } + if (ccti_incr.e_uint <= 0 || ccti_incr.e_uint >= ccti_size.e_uint) { + _IPATH_INFO("Invalid PSM_CCTI_INCREMENT=%d, should be less than table size, setting to default 1\n", + ccti_incr.e_uint); + ccti_incr.e_uint = 1; + } + + /* Setup CCA parameters for port */ + proto->ccti_portctrl = 1; /* SL/Port based congestion control */ + proto->ccti_ctrlmap = 0xFFFF; + for (i=0; i<16; i++) { + proto->cace[i].ccti_increase = ccti_incr.e_uint; + proto->cace[i].ccti_timer_cycles = us_2_cycles(ccti_timer.e_uint); + proto->cace[i].ccti_threshold = 8; + proto->cace[i].ccti_min = 0; + } + } + + finishcca: + /* Seed the random number generator with our pid */ + srand(getpid()); + + /* Initialize path record hash table */ + hcreate_r(DF_PATH_REC_HASH_SIZE, &proto->ips_path_rec_hash); + + /* On startup treat it as a link up/down event to setup state . */ + if ((err = ips_ibta_link_updown_event(proto)) != PSM_OK) + goto fail; + + /* Setup the appropriate query interface for the endpoint */ + switch(proto->ep->path_res_type) { + case PSM_PATH_RES_OPP: + err = ips_opp_init(proto); + if (err != PSM_OK) + _IPATH_ERROR("Unable to use OFED Plus Plus for path record queries.\n"); + break; + case PSM_PATH_RES_UMAD: + _IPATH_ERROR("Path record queries using UMAD is not supported in PSM version %d.%dx\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR); + err = PSM_EPID_PATH_RESOLUTION; + break; + case PSM_PATH_RES_NONE: + default: + err = ips_none_path_rec_init(proto); + } + + fail: + return err; +} + +psm_error_t ips_ibta_fini(struct ips_proto *proto) +{ + psm_error_t err = PSM_OK; + + if (proto->ibta.fini) + err = proto->ibta.fini(proto); + + /* Destroy the path record hash */ + hdestroy_r(&proto->ips_path_rec_hash); + + return err; +} diff --git a/ptl_ips/ips_path_rec.h b/ptl_ips/ips_path_rec.h new file mode 100644 index 0000000..5d43cac --- /dev/null +++ b/ptl_ips/ips_path_rec.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * 2009,2010. QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PATH_REC_H_ +#define _IPS_PATH_REC_H_ + +#include + +/* Default size of path record hash table */ +#define DF_PATH_REC_HASH_SIZE 2047 + +/* Default size of CCT table. Must be multiple of 64 */ +#define DF_CCT_TABLE_SIZE 128 + +/* CCT max IPD delay. QLE73XX is limited to 32us */ +#define DF_CCT_MAX_IPD_DELAY_US 21 + +/* CCA divisor shift */ +#define CCA_DIVISOR_SHIFT 14 + +/* CCA ipd mask */ +#define CCA_IPD_MASK 0x3FFF + +/* A lot of these are IBTA-specific defines that are available in other header + * files. To minimize dependencies with the PSM build process they are listed + * here. Most of this is used to implement IBTA compliance features with PSM, + * like path record query etc. + */ + +enum ibta_mtu { + IBTA_MTU_256 = 1, + IBTA_MTU_512 = 2, + IBTA_MTU_1024 = 3, + IBTA_MTU_2048 = 4, + IBTA_MTU_4096 = 5 +}; + +typedef enum { + IBTA_RATE_PORT_CURRENT = 0, + IBTA_RATE_2_5_GBPS = 2, + IBTA_RATE_5_GBPS = 5, + IBTA_RATE_10_GBPS = 3, + IBTA_RATE_20_GBPS = 6, + IBTA_RATE_30_GBPS = 4, + IBTA_RATE_40_GBPS = 7, + IBTA_RATE_60_GBPS = 8, + IBTA_RATE_80_GBPS = 9, + IBTA_RATE_120_GBPS = 10 +} ibta_rate; + +static inline int ibta_mtu_enum_to_int(enum ibta_mtu mtu) +{ + switch (mtu) { + case IBTA_MTU_256: return 256; + case IBTA_MTU_512: return 512; + case IBTA_MTU_1024: return 1024; + case IBTA_MTU_2048: return 2048; + case IBTA_MTU_4096: return 4096; + default: return -1; + } +} + +/* This is the same as ib_path_rec from ib_types.h. Listed here to be + * self-contained and minimize dependencies during build. + */ +typedef struct _ibta_path_rec { + uint64_t service_id; /* net order */ + uint8_t dgid[16]; + uint8_t sgid[16]; + uint16_t dlid; /* net order */ + uint16_t slid; /* net order */ + uint32_t hop_flow_raw; /* net order */ + uint8_t tclass; + uint8_t num_path; + uint16_t pkey; /* net order */ + uint16_t qos_class_sl; /* net order */ + uint8_t mtu; /* IBTA encoded */ + uint8_t rate; /* IBTA encoded */ + uint8_t pkt_life; /* IBTA encoded */ + uint8_t preference; + uint8_t resv2[6]; +} ibta_path_rec_t; + +/* + * PSM IPS path record components for endpoint.
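+ * Each cached record couples link addressing (epr_slid/epr_dlid), QoS
+ * parameters (epr_sl, epr_pkey, epr_mtu) and per-path CCA pacing state
+ * (epr_ccti plus the static/active IPD pair) for a single LID pair.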
+ */ +struct ips_proto; +typedef struct ips_path_rec { + uint16_t epr_slid; /* For Torus/non zero LMC fabrics this can be diff */ + uint16_t epr_dlid; + uint16_t epr_mtu; + uint16_t epr_pkey; + uint8_t epr_sl; + uint8_t epr_static_rate; + uint16_t epr_static_ipd; /* Static rate IPD from path record */ + + /* IBTA CCA parameters per path */ + struct ips_proto *proto; + uint16_t epr_ccti; + uint16_t epr_ccti_min; + psmi_timer epr_timer_cca; /* Congestion timer for epr_ccti increment. */ + uint16_t epr_active_ipd; /* The current active IPD. max(static,cct) */ + uint8_t epr_cca_divisor; /* CCA divisor [14:15] in CCT entry */ + uint8_t epr_pad; + + /* TODO: The endpoint timeout should also adjust based on epr_ird */ + uint32_t epr_timeout_ack_factor; + uint64_t epr_timeout_ack; + uint64_t epr_timeout_ack_max; +} ips_path_rec_t; + +typedef struct _ips_opp_path_rec { + ibta_path_rec_t opp_response; + ips_path_rec_t ips; +} ips_opp_path_rec_t; + +psm_error_t ips_opp_init(struct ips_proto *proto); + +#endif diff --git a/ptl_ips/ips_proto.c b/ptl_ips/ips_proto.c new file mode 100644 index 0000000..e9715e4 --- /dev/null +++ b/ptl_ips/ips_proto.c @@ -0,0 +1,2061 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * IPS - Interconnect Protocol Stack. + */ + +#include +#include /* writev */ +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_help.h" + +/* + * host ipv4 and pid used in ERR_CHK messages to detect stray processes + */ +static uint32_t host_ipv4addr = 0; /* be */ +static uint32_t host_pid = 0; /* be */ + +/* + * Control message types have their own flag to determine whether a message of + * that type is queued or not. These flags are kept in a state bitfield. 
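+ * A type's bit is set in the destination's msg_queue_mask when the
+ * message is queued and cleared once it goes out on the wire, so each
+ * type is enqueued at most once per peer (see
+ * ips_proto_send_ctrl_message and the ctrlq timer callback).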
+ */ +#define CTRL_MSG_ACK_QUEUED 0x0001 +#define CTRL_MSG_NAK_QUEUED 0x0002 +#define CTRL_MSG_ERR_CHK_QUEUED 0x0004 +#define CTRL_MSG_ERR_CHK_PLS_QUEUED 0x0008 +#define CTRL_MSG_CONNECT_REQUEST_QUEUED 0x0010 +#define CTRL_MSG_CONNECT_REPLY_QUEUED 0x0020 +#define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0040 +#define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0080 +#define CTRL_MSG_TIDS_RELEASE_QUEUED 0x0100 +#define CTRL_MSG_TIDS_RELEASE_CONFIRM_QUEUED 0x0200 +#define CTRL_MSG_CLOSE_QUEUED 0x0400 +#define CTRL_MSG_CLOSE_ACK_QUEUED 0x0800 +#define CTRL_MSG_ABORT_QUEUED 0x1000 +#define CTRL_MSG_TIDS_GRANT_QUEUED 0x2000 +#define CTRL_MSG_TIDS_GRANT_ACK_QUEUED 0x4000 +#define CTRL_MSG_ERR_CHK_GEN_QUEUED 0x8000 +#define CTRL_MSG_FLOW_CCA_BECN 0x10000 + +#define CTRL_MSG_QUEUE_ALWAYS 0x80000000 + +#define _desc_idx u32w0 +#define _desc_genc u32w1 + +static void ctrlq_init(struct ips_ctrlq *ctrlq, int flowid, + struct ips_proto *proto); +static psm_error_t proto_sdma_init(struct ips_proto *proto, + const psmi_context_t *context); + +psm_error_t +ips_proto_init(const psmi_context_t *context, const ptl_t *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, + const struct ips_epstate *epstate, + const struct ips_spio *spioc, + struct ips_proto *proto) +{ + const struct ipath_base_info *base_info = &context->base_info; + uint32_t protoexp_flags, cksum_sz = 0; + union psmi_envvar_val env_tid, env_cksum, env_mtu; + psm_error_t err = PSM_OK; + + /* + * Checksum packets within PSM. Default is off. + * This is heavy weight and done in software so not recommended for + * production runs. + */ + + psmi_getenv("PSM_CHECKSUM", + "Enable checksum of messages (0 disables checksum)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 0, + &env_cksum); + + memset(proto, 0, sizeof(struct ips_proto)); + proto->ptl = (ptl_t *) ptl; + proto->ep = context->ep; /* cached */ + proto->mq = context->ep->mq; /* cached */ + proto->fd = context->fd; /* cached */ + proto->pend_sends.proto = proto; + psmi_timer_entry_init(&proto->pend_sends.timer, + ips_proto_timer_pendq_callback, &proto->pend_sends); + STAILQ_INIT(&proto->pend_sends.pendq); + proto->epstate = (struct ips_epstate *) epstate; + proto->timerq = (struct psmi_timer_ctrl *) timerq; + proto->spioc = (struct ips_spio *) spioc; + + proto->epinfo.ep_baseqp = base_info->spi_qpair; + proto->epinfo.ep_context = base_info->spi_context; /* "real" context */ + + proto->epinfo.ep_subcontext = base_info->spi_subcontext; + proto->epinfo.ep_hca_type = psmi_epid_hca_type(context->epid); + + proto->epinfo.ep_unit = base_info->spi_unit; + proto->epinfo.ep_hdrq_msg_size = (IPS_HEADER_QUEUE_HWORDS + + IPS_HEADER_QUEUE_IWORDS + + IPS_HEADER_QUEUE_UWORDS_MIN) << 2; + + /* If checksums enabled we insert checksum at end of packet */ + cksum_sz = env_cksum.e_uint ? 
PSM_CRC_SIZE_IN_BYTES : 0; + + proto->epinfo.ep_mtu = base_info->spi_mtu - + proto->epinfo.ep_hdrq_msg_size - + CRC_SIZE_IN_BYTES - PCB_SIZE_IN_BYTES; + proto->epinfo.ep_mtu = ips_next_low_pow2(proto->epinfo.ep_mtu); + /* Decrement checksum accounting AFTER lowering power of two */ + proto->epinfo.ep_mtu -= cksum_sz; + + /* See if user specifies a lower MTU to use */ + if (!psmi_getenv("PSM_MTU", "MTU specified by user: 1-5,256-4096[4/2048]", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val) -1, + &env_mtu)) { + if (env_mtu.e_int != 256 && env_mtu.e_int != 512 + && env_mtu.e_int != 1024 && env_mtu.e_int != 2048 + && env_mtu.e_int != 4096) { + if (env_mtu.e_int < 1 || env_mtu.e_int > 5) env_mtu.e_int = 4; + env_mtu.e_int = ibta_mtu_enum_to_int((enum ibta_mtu)env_mtu.e_int); + } + if (proto->epinfo.ep_mtu > env_mtu.e_int) + proto->epinfo.ep_mtu = env_mtu.e_int; + } + + proto->epinfo.ep_piosize = base_info->spi_piosize - + proto->epinfo.ep_hdrq_msg_size - + CRC_SIZE_IN_BYTES - PCB_SIZE_IN_BYTES - cksum_sz; + + /* Keep PIO as multiple of cache line size */ + if (proto->epinfo.ep_piosize > PSM_CACHE_LINE_BYTES) + proto->epinfo.ep_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); + + + proto->timeout_send = us_2_cycles(IPS_PROTO_SPIO_RETRY_US_DEFAULT); + + proto->iovec_cntr_next_inflight = 0; + proto->iovec_thresh_eager= proto->iovec_thresh_eager_blocking = ~0U; + proto->scb_max_inflight = 2*num_of_send_desc; + proto->scb_bufsize = PSMI_ALIGNUP(max(base_info->spi_piosize, + base_info->spi_mtu), + PSMI_PAGESIZE), + proto->t_init = get_cycles(); + proto->t_fini = 0; + proto->flags = env_cksum.e_uint ? + IPS_PROTO_FLAG_CKSUM : 0; + + proto->num_connected_to = 0; + proto->num_connected_from = 0; + proto->num_disconnect_requests = 0; + proto->stray_warn_interval = (uint64_t) -1; + proto->done_warning = 0; + proto->done_once = 0; + proto->num_bogus_warnings = 0; + proto->psmi_logevent_tid_send_reqs.interval_secs = 15; + proto->psmi_logevent_tid_send_reqs.next_warning = 0; + proto->psmi_logevent_tid_send_reqs.count = 0; + + /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ + if ((err = ips_ibta_init(proto))) + goto fail; + + { + /* Disable coalesced ACKs? */ + union psmi_envvar_val env_coalesce_acks; + + psmi_getenv("PSM_COALESCE_ACKS", + "Coalesce ACKs on the wire (default is enabled i.e. 
1)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 1, /* Enabled by default */ + &env_coalesce_acks); + + if (env_coalesce_acks.e_uint) + proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; + } + + { + /* Number of credits per flow */ + union psmi_envvar_val env_flow_credits; + int df_flow_credits = min(PSM_FLOW_CREDITS, num_of_send_desc); + + psmi_getenv("PSM_FLOW_CREDITS", + "Number of unacked packets (credits) per flow (default is 64)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) df_flow_credits, + &env_flow_credits); + proto->flow_credits = env_flow_credits.e_uint; + } + + if ((context->runtime_flags & IPATH_RUNTIME_SDMA)) + if ((err = proto_sdma_init(proto, context))) + goto fail; + + /* + * Clone sendreq mpool configuration for pend sends config + */ + { + uint32_t chunks, maxsz; + + psmi_assert_always(proto->ep->mq->sreq_pool != NULL); + psmi_mpool_get_obj_info(proto->ep->mq->sreq_pool, &chunks, &maxsz); + + proto->pend_sends_pool = + psmi_mpool_create(sizeof(struct ips_pend_sreq), chunks, maxsz, + 0, DESCRIPTORS, NULL, NULL); + if (proto->pend_sends_pool == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + + /* + * Register ips protocol statistics + * + * We put a (*) in the output to denote stats that may cause a drop in + * performance. + * + * We put a (**) in the output of those stats that "should never happen" + */ + { + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECLU64("pio busy count", + &proto->stats.pio_busy_cnt), + /* Throttling by kernel */ + PSMI_STATS_DECLU64("writev busy cnt", + &proto->stats.writev_busy_cnt), + /* When local dma completion is in the way... */ + PSMI_STATS_DECLU64("writev compl. eagain", + &proto->stats.writev_compl_eagain), + /* When remote completion happens before local completion */ + PSMI_STATS_DECLU64("writev compl. 
delay (*)", + &proto->stats.writev_compl_delay), + PSMI_STATS_DECLU64("scb unavail eager count", + &proto->stats.scb_egr_unavail_cnt), + PSMI_STATS_DECLU64("scb unavail exp count", + &proto->stats.scb_exp_unavail_cnt), + PSMI_STATS_DECLU64("rcvhdr overflows", /* Normal egr/hdr ovflw */ + &proto->stats.hdr_overflow), + PSMI_STATS_DECLU64("rcveager overflows", + &proto->stats.egr_overflow), + PSMI_STATS_DECLU64("lid zero errs (**)", /* shouldn't happen */ + &proto->stats.lid_zero_errs), + PSMI_STATS_DECLU64("unknown packets (**)", /* shouldn't happen */ + &proto->stats.unknown_packets), + PSMI_STATS_DECLU64("stray packets (*)", + &proto->stats.stray_packets), + PSMI_STATS_DECLU64("send dma misaligns (*)", + &proto->stats.send_dma_misaligns), + PSMI_STATS_DECLU64("amreply no bufs (*)", + &proto->proto_am.amreply_nobufs), + PSMI_STATS_DECLU64("pio stalls (*)", /* shouldn't happen too often */ + &proto->spioc->spio_num_stall_total), + PSMI_STATS_DECLU64("Invariant CRC error (*)", + &proto->error_stats.num_icrc_err), + PSMI_STATS_DECLU64("Variant CRC error (*)", + &proto->error_stats.num_vcrc_err), + PSMI_STATS_DECLU64("ECC error ", + &proto->error_stats.num_ecc_err), + PSMI_STATS_DECLU64("IB Len error", + &proto->error_stats.num_len_err), + PSMI_STATS_DECLU64("IB MTU error ", + &proto->error_stats.num_mtu_err), + PSMI_STATS_DECLU64("KDETH error ", + &proto->error_stats.num_khdr_err), + PSMI_STATS_DECLU64("TID error ", + &proto->error_stats.num_tid_err), + PSMI_STATS_DECLU64("MK error ", + &proto->error_stats.num_mk_err), + PSMI_STATS_DECLU64("IB error ", + &proto->error_stats.num_ib_err), + + }; + + err = psmi_stats_register_type("InfiniPath low-level protocol stats", + PSMI_STATSTYPE_IPSPROTO, + entries, + PSMI_STATS_HOWMANY(entries), + NULL); + if (err != PSM_OK) + goto fail; + } + + /* + * Control Queue and messaging + */ + { + int idx; + + for (idx = 0; idx < EP_FLOW_LAST; idx++) + ctrlq_init(&proto->ctrlq[idx], idx, proto); + } + + /* + * Receive-side handling + */ + if ((err = ips_proto_recv_init(proto))) + goto fail; + + /* + * Eager buffers. We don't care to receive a callback when eager buffers + * are newly released since we actively poll for new bufs. + */ + if ((err = ips_scbctrl_init(context, num_of_send_desc, + num_of_send_bufs, imm_size, proto->scb_bufsize, + NULL, NULL, &proto->scbc_egr))) + goto fail; + + /* + * Expected protocol handling. + * If we enable tid-based expected rendezvous, the expected protocol code + * handles its own rv scb buffers. If not, we have to enable eager-based + * rendezvous and we allocate scb buffers for it. + */ + psmi_getenv("PSM_TID", + "Tid proto flags (0 disables protocol)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) IPS_PROTOEXP_FLAGS_DEFAULT, + &env_tid); + protoexp_flags = env_tid.e_uint; + + if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { + proto->scbc_rv = NULL; + if ((err = ips_protoexp_init(context, proto, protoexp_flags, + num_of_send_bufs, num_of_send_desc, + &proto->protoexp))) + goto fail; + } + else { + proto->protoexp = NULL; + proto->scbc_rv = (struct ips_scbctrl *) + psmi_calloc(proto->ep, DESCRIPTORS, + 1, sizeof(struct ips_scbctrl)); + if (proto->scbc_rv == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + /* + * Rendezvous buffers. We want to get a callback for rendezvous bufs + * since we asynchronously try to make progress on these sends and only + * schedule them on the timerq if there are pending sends and available + * bufs. 
+ */ + if ((err = ips_scbctrl_init(context, num_of_send_desc, 0 /* no bufs */, + 0, 0 /* bufsize==0 */, ips_proto_rv_scbavail_callback, + proto, proto->scbc_rv))) + goto fail; + } + + /* + * Parse the tid error settings from the environment. + * : + */ + { + int tvals[2]; + char *tid_err; + union psmi_envvar_val env_tiderr; + + tid_err = "-1:0"; /* no tiderr warnings, never exits */ + tvals[0] = -1; + tvals[1] = 0; + + if (!psmi_getenv("PSM_TID_ERROR", + "Tid error control ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) tid_err, + &env_tiderr)) { + /* not using default values */ + tid_err = env_tiderr.e_str; + psmi_parse_str_tuples(tid_err, 2, tvals); + } + if (tvals[0] >= 0) + proto->tiderr_warn_interval = sec_2_cycles(tvals[0]); + else + proto->tiderr_warn_interval = UINT64_MAX; + proto->tiderr_max = tvals[1]; + _IPATH_PRDBG("Tid error control: warning every %d secs%s, " + "fatal error after %d tid errors%s\n", + tvals[0], (tvals[0] < 0) ? " (no warnings)" : "", + tvals[1], (tvals[1] == 0) ? " (never fatal)" : ""); + } + + /* + * Active Message interface. AM requests compete with MQ for eager + * buffers, since request establish the amount of buffering in the network + * (maximum number of requests in flight). AM replies use the same amount + * of request buffers -- we can never run out of AM reply buffers because a + * request handler can only be run if we have at least one reply buffer (or + * else the AM request is dropped). + */ + if ((err = ips_proto_am_init(proto, num_of_send_bufs, num_of_send_desc, + imm_size, &proto->proto_am))) + goto fail; + + if (!host_pid) { + char ipbuf[INET_ADDRSTRLEN], *p; + host_pid = (uint32_t) getpid(); + host_ipv4addr = psmi_get_ipv4addr(); /* already be */ + if (host_ipv4addr == 0) { + _IPATH_DBG("Unable to obtain local IP address, " + "not fatal but some features may be disabled\n"); + } + else if (host_ipv4addr == __cpu_to_be32(0x7f000001)) { + _IPATH_INFO("Localhost IP address is set to the " + "loopback address 127.0.0.1, " + "not fatal but some features may be disabled\n"); + } + else { + p = (char *) inet_ntop(AF_INET, (const void *) &host_ipv4addr, + ipbuf, sizeof ipbuf); + _IPATH_PRDBG("Ethernet Host IP=%s and PID=%d\n", p, host_pid); + } + + /* Store in big endian for use in ERR_CHK */ + host_pid = __cpu_to_be32(host_pid); + } + +fail: + return err; +} + +psm_error_t +ips_proto_fini(struct ips_proto *proto, int force, uint64_t timeout_in) +{ + struct psmi_eptab_iterator itor; + uint64_t t_start; + uint64_t t_grace_start, t_grace_time, t_grace_finish, t_grace_interval; + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + int i; + union psmi_envvar_val grace_intval; + + psmi_getenv("PSM_CLOSE_GRACE_PERIOD", + "Additional grace period in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &grace_intval); + + if (getenv("PSM_CLOSE_GRACE_PERIOD")) { + t_grace_time = grace_intval.e_uint * SEC_ULL; + } + else if (timeout_in > 0) { + /* default to half of the close time-out */ + t_grace_time = timeout_in / 2; + } + else { + /* propagate the infinite time-out case */ + t_grace_time = 0; + } + + if (t_grace_time > 0 && t_grace_time < PSMI_MIN_EP_CLOSE_TIMEOUT) + t_grace_time = PSMI_MIN_EP_CLOSE_TIMEOUT; + + /* At close we will busy wait for the grace interval to see if any + * receive progress is made. If progress is made we will wait for + * another grace interval, until either no progress is made or the + * entire grace period has passed. 
If the grace interval is too low + * we may miss traffic and exit too early. If the grace interval is + * too large the additional time spent while closing the program + * will become visible to the user. */ + psmi_getenv("PSM_CLOSE_GRACE_INTERVAL", + "Grace interval in seconds for closing end-point.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 0, + &grace_intval); + + if (getenv("PSM_CLOSE_GRACE_INTERVAL")) { + t_grace_interval = grace_intval.e_uint * SEC_ULL; + } + else { + /* A heuristic is used to scale up the timeout linearly with + * the number of endpoints, and we allow one second per 1000 + * endpoints. */ + t_grace_interval = (proto->ep->connections * SEC_ULL) / 1000; + } + + if (t_grace_interval < PSMI_MIN_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MIN_EP_CLOSE_GRACE_INTERVAL; + if (t_grace_interval > PSMI_MAX_EP_CLOSE_GRACE_INTERVAL) + t_grace_interval = PSMI_MAX_EP_CLOSE_GRACE_INTERVAL; + + PSMI_PLOCK_ASSERT(); + + t_start = proto->t_fini = get_cycles(); + + /* Close whatever has been left open */ + if (proto->num_connected_to > 0) { + int num_disc = 0; + int *mask; + psm_error_t *errs; + psm_epaddr_t *epaddr_array; + + psmi_epid_itor_init(&itor, proto->ep); + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptl == proto->ptl) + num_disc++; + } + psmi_epid_itor_fini(&itor); + mask = (int *) psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(int)); + errs = (psm_error_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm_error_t)); + epaddr_array = (psm_epaddr_t *) + psmi_calloc(proto->ep, UNDEFINED, num_disc, sizeof(psm_epaddr_t)); + + if (errs == NULL || epaddr_array == NULL || mask == NULL) { + if (epaddr_array) psmi_free(epaddr_array); + if (errs) psmi_free(errs); + if (mask) psmi_free(mask); + err = PSM_NO_MEMORY; + goto fail; + } + psmi_epid_itor_init(&itor, proto->ep); + i = 0; + while ((epaddr = psmi_epid_itor_next(&itor))) { + if (epaddr->ptl == proto->ptl) { + mask[i] = 1; + epaddr_array[i] = epaddr; + i++; + PSM_MCTXT_REMOVE(epaddr); + } + } + psmi_epid_itor_fini(&itor); + err = ips_proto_disconnect(proto, force, num_disc, epaddr_array, + mask, errs, timeout_in); + psmi_free(mask); + psmi_free(errs); + psmi_free(epaddr_array); + } + + t_grace_start = get_cycles(); + + while (psmi_cycles_left(t_grace_start, t_grace_time)) { + uint64_t t_grace_interval_start = get_cycles(); + int num_disconnect_requests = proto->num_disconnect_requests; + PSMI_BLOCKUNTIL(proto->ep, err, + (proto->num_connected_from == 0 || + !psmi_cycles_left(t_start, timeout_in)) && + (!psmi_cycles_left(t_grace_interval_start, t_grace_interval) || + !psmi_cycles_left(t_grace_start, t_grace_time))); + if (num_disconnect_requests == proto->num_disconnect_requests) { + /* nothing happened in this grace interval so break out early */ + break; + } + } + + t_grace_finish = get_cycles(); + + _IPATH_PRDBG("Closing endpoint disconnect left to=%d,from=%d after %d millisec of grace (out of %d)\n", + proto->num_connected_to, proto->num_connected_from, + (int) (cycles_to_nanosecs(t_grace_finish - t_grace_start) / MSEC_ULL), + (int) (t_grace_time / MSEC_ULL)); + + if ((err = ips_ibta_fini(proto))) + goto fail; + + if ((err = ips_proto_am_fini(&proto->proto_am))) + goto fail; + + if ((err = ips_scbctrl_fini(&proto->scbc_egr))) + goto fail; + + ips_proto_recv_fini(proto); + + if (proto->protoexp) { + if ((err = ips_protoexp_fini(proto->protoexp))) + goto fail; + } + else { + ips_scbctrl_fini(proto->scbc_rv); + psmi_free(proto->scbc_rv); + } + + 
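+  /* Finally release the pending-sends pool that was cloned from the MQ
+   * sreq pool configuration at init time. */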
psmi_mpool_destroy(proto->pend_sends_pool); + +fail: + proto->t_fini = proto->t_init = 0; + return err; +} + +static +psm_error_t +proto_sdma_init(struct ips_proto *proto, const psmi_context_t *context) +{ + union psmi_envvar_val env_sdma, env_ipathegr; + char *c; + uint32_t defval = IPS_PROTO_FLAGS_DEFAULT & IPS_PROTO_FLAGS_ALL_SDMA; + psm_error_t err = PSM_OK; + int egrmode; + + /* + * Only initialize if RUNTIME_SDMA is enabled. + */ + psmi_assert_always(context->runtime_flags & IPATH_RUNTIME_SDMA); + + if ((c = getenv("PSM_SDMA")) && *c && !strncmp("always", c, 7)) + defval = IPS_PROTO_FLAGS_ALL_SDMA; + + psmi_getenv("PSM_SDMA", + "ipath send dma flags (0 disables send dma)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) defval, + &env_sdma); + + if(env_sdma.e_uint != 1) + proto->flags |= env_sdma.e_uint & IPS_PROTO_FLAGS_ALL_SDMA; + + /* If anything uses send dma, figure out our max packet threshold to call + * send dma with */ + proto->scb_max_sdma = IPS_SDMA_MAX_SCB; + if (proto->flags & IPS_PROTO_FLAGS_ALL_SDMA) { + psmi_getenv("PSM_SDMA_THRESH", + "ipath send dma max packet per call", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) proto->scb_max_sdma, + &env_sdma); + proto->scb_max_sdma = env_sdma.e_uint; + if (proto->scb_max_sdma < 1) { + _IPATH_ERROR("Overriding PSM_SDMA_THRESH=%u to be '%u'\n", + proto->scb_max_sdma, 1); + proto->scb_max_sdma = 1; + } + } + + egrmode = proto->flags & + (IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA|IPS_PROTO_FLAG_MQ_EAGER_SDMA); + + /* Some modes don't make sense or at least, MQ doesn't expect them to + * be a functional mode. For example, it's not possible to use DMA + * message envelopes with PIO eager data. + */ + if (egrmode == IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA) { + err = psmi_handle_error(proto->ep, PSM_PARAM_ERR, + "Unsupported Send DMA mode 0x%x: dma envelopes and pio eager", + proto->flags); + goto fail; + } + /* Only bother with switchover for pio-envelope,dma-eagerdata */ + else if (egrmode == IPS_PROTO_FLAG_MQ_EAGER_SDMA) { + /* Reduce threshold to use SDMA for QLE73XX as we are PIO limited for + * medium message sizes on it. + */ + uint32_t hca_type = psmi_get_hca_type((psmi_context_t*) context); + + defval = (hca_type == PSMI_HCA_TYPE_QLE73XX) ? + MQ_IPATH_THRESH_EGR_SDMA_SQ : MQ_IPATH_THRESH_EGR_SDMA; + psmi_getenv("PSM_MQ_EAGER_SDMA_SZ", + "ipath pio-to-sdma eager switchover", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) defval, &env_ipathegr); + + /* Has to be at least 1 MTU */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = + max(proto->epinfo.ep_piosize, env_ipathegr.e_uint); + + /* For QLE73XX bump up the eager SDMA threshold for blocking sends if + * the user has not explicitly set one. 
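+ * (i.e. blocking sends move up to MQ_IPATH_THRESH_EGR_SDMA while
+ * non-blocking sends keep the lower QLE73XX switchover point.)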
+ */ + if ((hca_type == PSMI_HCA_TYPE_QLE73XX) && + (proto->iovec_thresh_eager == defval)) + proto->iovec_thresh_eager_blocking = MQ_IPATH_THRESH_EGR_SDMA; + } + else if (egrmode == + (IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA|IPS_PROTO_FLAG_MQ_EAGER_SDMA)) + { + /* Has to be 0 so we never try to split pio and dma */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = 0; + } + else if (egrmode == 0) { /* all pio */ + proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; + } + +fail: + return err; +} + +static +void +ctrlq_init(struct ips_ctrlq *ctrlq, int flowid, struct ips_proto *proto) +{ + // clear the ctrl send queue + memset(ctrlq, 0, sizeof(*ctrlq)); + + proto->message_type_to_index[OPCODE_ACK] = CTRL_MSG_ACK_QUEUED; + proto->message_type_to_index[OPCODE_NAK] = CTRL_MSG_NAK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK] = CTRL_MSG_ERR_CHK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK_PLS] = CTRL_MSG_ERR_CHK_PLS_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REQUEST] = + CTRL_MSG_CONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_CONNECT_REPLY] = + CTRL_MSG_CONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REQUEST] = + CTRL_MSG_DISCONNECT_REQUEST_QUEUED; + proto->message_type_to_index[OPCODE_DISCONNECT_REPLY] = + CTRL_MSG_DISCONNECT_REPLY_QUEUED; + proto->message_type_to_index[OPCODE_CLOSE] = CTRL_MSG_CLOSE_QUEUED; + proto->message_type_to_index[OPCODE_CLOSE_ACK] = CTRL_MSG_CLOSE_ACK_QUEUED; + proto->message_type_to_index[OPCODE_ABORT] = CTRL_MSG_ABORT_QUEUED; + proto->message_type_to_index[OPCODE_TIDS_GRANT] = CTRL_MSG_TIDS_GRANT_QUEUED; + proto->message_type_to_index[OPCODE_TIDS_GRANT_ACK] = CTRL_MSG_TIDS_GRANT_ACK_QUEUED; + proto->message_type_to_index[OPCODE_ERR_CHK_GEN] = CTRL_MSG_ERR_CHK_GEN_QUEUED; + proto->message_type_to_index[OPCODE_FLOW_CCA_BECN] = CTRL_MSG_FLOW_CCA_BECN; + + ctrlq->ctrlq_head = ctrlq->ctrlq_tail = 0; + ctrlq->ctrlq_overflow = 0; + ctrlq->ctrlq_proto = proto; + ctrlq->ctrlq_flowid = flowid; + /* We never enqueue connect messages. They require 512 bytes and we don't + * want to stack allocate 512 bytes just when sending back acks. + */ + proto->ctrl_msg_queue_never_enqueue = CTRL_MSG_CONNECT_REQUEST_QUEUED | + CTRL_MSG_CONNECT_REPLY_QUEUED | + CTRL_MSG_DISCONNECT_REQUEST_QUEUED | + CTRL_MSG_DISCONNECT_REPLY_QUEUED | + CTRL_MSG_ERR_CHK_GEN_QUEUED | + CTRL_MSG_TIDS_GRANT_QUEUED; + + psmi_timer_entry_init(&ctrlq->ctrlq_timer, + ips_proto_timer_ctrlq_callback, ctrlq); + + return; +} + +static int inline +_build_ctrl_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ipsaddr, uint8_t message_type, + struct ips_flow *flow, + void *payload, uint8_t *discard_msg) +{ + uint32_t tot_paywords = sizeof(struct ips_message_header) >> 2; + struct ips_epinfo *epinfo = &proto->epinfo; + struct ips_epinfo_remote *epr = &ipsaddr->epr; + uint16_t pkt_flags = IPS_EPSTATE_COMMIDX_PACK(epr->epr_commidx_to); + struct ips_message_header *p_hdr = &msg->pbc_hdr.hdr; + ips_path_rec_t *ctrl_path = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][ipsaddr->epr.epr_hpp_index]; + int paylen = 0; + + if ((proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) && + (++ipsaddr->epr.epr_hpp_index >= + ipsaddr->epr.epr_num_paths[IPS_PATH_HIGH_PRIORITY])) + ipsaddr->epr.epr_hpp_index = 0; + + /* Control messages go over the control path. 
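+ * The control path is the high-priority path at epr_hpp_index; under
+ * adaptive path selection that index is advanced round-robin just above
+ * when more than one high-priority path exists.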
*/ + p_hdr->lrh[0] = __cpu_to_be16(IPATH_LRH_BTH | + (ctrl_path->epr_sl << 4) | + (proto->sl2vl[ctrl_path->epr_sl] << LRH_VL_SHIFT)); + p_hdr->lrh[1] = ctrl_path->epr_dlid; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + p_hdr->lrh[3] = ctrl_path->epr_slid; + + p_hdr->bth[0] = __cpu_to_be32((IPATH_OPCODE_USER1 << 24) + + ctrl_path->epr_pkey); + + /* If flow is congested then generate a BECN for path. */ + if_pf (flow->flags & IPS_FLOW_FLAG_GEN_BECN) { + _IPATH_CCADBG("Generating BECN for flow %x ----> %x. Num congested packets: 0x%"PRIx64". Msg type: %d\n", __be16_to_cpu(flow->path->epr_slid), __be16_to_cpu(flow->path->epr_dlid), ipsaddr->stats.congestion_pkts, message_type); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp | 1 << BTH_BECN_SHIFT); + flow->flags &= ~IPS_FLOW_FLAG_GEN_BECN; + } + else + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp); + p_hdr->bth[2] = 0; + + p_hdr->commidx = epr->epr_commidx_to; + p_hdr->sub_opcode = message_type; + p_hdr->ack_seq_num = 0; + IPS_HEADER_SRCCONTEXT_SET(p_hdr, epinfo->ep_context); + p_hdr->src_subcontext = epinfo->ep_subcontext; + p_hdr->dst_subcontext = epr->epr_subcontext; + p_hdr->flags = 0; + p_hdr->mqhdr = 0; + p_hdr->flowid = flow->flowid; + + switch (message_type) { + case OPCODE_ACK: + if_pt (flow->protocol != PSM_PROTOCOL_TIDFLOW) + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + else { + ptl_arg_t *args = (ptl_arg_t*) payload; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc *tidrecvc; + + /* TIDFLOW ACK. + * args[0] = send descriptor id + * args[1] = receive descriptor id + */ + ips_ptladdr_lock(ipsaddr); + + tid_recv_sessid = args[1]._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(proto->protoexp->tid_desc_recv_pool, + tid_recv_sessid); + if (tidrecvc == NULL) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != args[1]._desc_genc) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + + p_hdr->data[0].u64 = args[0].u64; + p_hdr->ack_seq_num = tidrecvc->tidflow_genseq.psn; + ips_ptladdr_unlock(ipsaddr); + } + break; + + case OPCODE_NAK: + if_pf (flow->protocol != PSM_PROTOCOL_TIDFLOW) { + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + } + else { + ptl_arg_t *args = (ptl_arg_t*) payload; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc *tidrecvc; + psmi_seqnum_t ack_seq_num; + + /* TIDFLOW NAK. + * args[0] = send descriptor id + * args[1] = receive descriptor id + * args[2].u16w0 = Old generation to NAK + */ + ips_ptladdr_lock(ipsaddr); + + tid_recv_sessid = args[1]._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(proto->protoexp->tid_desc_recv_pool, + tid_recv_sessid); + if (tidrecvc == NULL) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != args[1]._desc_genc) { + *discard_msg = 1; + ips_ptladdr_unlock(ipsaddr); + break; + } + + p_hdr->data[0].u64 = args[0].u64; /* Send descriptor id */ + p_hdr->data[1].u32w0 = tidrecvc->tidflow_genseq.val; /*New flowgenseq*/ + + /* Ack seqnum contains the old generation we are acking for */ + ack_seq_num = tidrecvc->tidflow_genseq; + ack_seq_num.gen = args[2].u16w0; + p_hdr->ack_seq_num = ack_seq_num.psn; + + ips_ptladdr_unlock(ipsaddr); + } + break; + + case OPCODE_ERR_CHK: + { + psmi_seqnum_t err_chk_seq; + ips_ptladdr_lock(ipsaddr); + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? 
+ flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; + err_chk_seq.pkt -= 1; + p_hdr->bth[2] = __cpu_to_be32(err_chk_seq.psn); + ips_ptladdr_unlock(ipsaddr); + p_hdr->data[0].u32w0 = host_ipv4addr; + p_hdr->data[0].u32w1 = host_pid; + + if (ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; + } + break; + + case OPCODE_ERR_CHK_GEN: + { + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; + + /* TIDFLOW ERR_CHK_GEN + * args[0] = receive descriptor id + * args[1] = send descriptor id + */ + if (!STAILQ_EMPTY(unackedq)) { + ips_scb_t *scb = STAILQ_FIRST(unackedq); + psmi_seqnum_t err_chk_seq; + + ips_ptladdr_lock(ipsaddr); + + psmi_assert_always(scb->tidsendc); + + err_chk_seq = (SLIST_EMPTY(&flow->scb_pend)) ? + flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; + err_chk_seq.seq -= 1; + + /* NOTE: If error check gen is cached and we get a NAK + * the scbs are flushed again. This can increase the DMA counter + * as scb's are retransmitted which we don't check for here. + * One way is never cache the ERR_CHK_GEN messages so it's only + * called from the ack timeout callback. Other way is that we + * send the ERR_CHK_GEN message over SDMA so they are serialized with + * respect to each other. Note: In this case we don't need to + * wait for the DMA completion counters in the ack timeout. + */ + p_hdr->bth[2] = __cpu_to_be32(err_chk_seq.psn); + + /* Receive descriptor index */ + p_hdr->data[0].u64 = scb->tidsendc->tid_list.tsess_descid.u64; + /* Send descriptor index */ + p_hdr->data[1].u64 = scb->tidsendc->descid.u64; + + ips_ptladdr_unlock(ipsaddr); + + if (ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; + } + else + *discard_msg = 1; + } + break; + + case OPCODE_FLOW_CCA_BECN: + _IPATH_CCADBG("Generating Explicit BECN for flow %x ----> %x. Num congested packets: 0x%"PRIx64"\n", __be16_to_cpu(flow->path->epr_slid), __be16_to_cpu(flow->path->epr_dlid), ipsaddr->stats.congestion_pkts); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp | 1 << BTH_BECN_SHIFT); + p_hdr->data[0].u32w0 = flow->cca_ooo_pkts; + break; + + case OPCODE_ERR_CHK_BAD: + p_hdr->data[0].u32w0 = host_ipv4addr; + p_hdr->data[0].u32w1 = host_pid; + break; + + case OPCODE_STARTUP: + case OPCODE_STARTUP_ACK: + case OPCODE_STARTUP_EXT: + case OPCODE_STARTUP_ACK_EXT: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected use of old connect protocol"); + break; + + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + p_hdr->hdr_dlen = (epinfo->ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + p_hdr->bth[0] = __cpu_to_be32((IPATH_OPCODE_USER1 << 24) + + ctrl_path->epr_pkey); + paylen = + ips_proto_build_connect_message(proto, msg, ipsaddr, + message_type, payload); + /* Rewrite packet length since this subopcode has an eager payload */ + tot_paywords += paylen >> 2; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + +#if 0 /* MARKDEBBAGE - disabled this as it slows down connect at scale */ + /* On request message, always set the kpf flag. 
If reply, only set it + * if we know that the recvthread is running */ + if (message_type == OPCODE_CONNECT_REQUEST || + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + pkt_flags |= INFINIPATH_KPF_INTR; +#endif + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = + ips_proto_build_connect_message(proto, msg, ipsaddr, + message_type, payload); + tot_paywords += paylen >> 2; + p_hdr->hdr_dlen = (epinfo->ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + break; + + case OPCODE_TIDS_RELEASE: + case OPCODE_TIDS_RELEASE_CONFIRM: + case OPCODE_TIDS_GRANT_ACK: + case OPCODE_TIDS_GRANT: + paylen = ips_protoexp_build_ctrl_message(proto->protoexp, ipsaddr, + p_hdr->data, &pkt_flags, message_type, payload); + if (paylen < 0) { + *discard_msg = 1; + break; + } + tot_paywords += paylen >> 2; + p_hdr->lrh[2] = __cpu_to_be16(tot_paywords + SIZE_OF_CRC); + break; + + default: + break; + } + + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (IPATH_EAGER_TID_ID << INFINIPATH_I_TID_SHIFT)); + p_hdr->iph.pkt_flags = __cpu_to_le16(pkt_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + /* Require 4-byte alignment always */ + psmi_assert(!(paylen & 0x3)); + return paylen; +} + +psm_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); + +psm_error_t __recvpath +ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload) +{ + struct ips_proto_ctrl_message msg; + psm_error_t err = PSM_EP_NO_RESOURCES; + ptl_arg_t *args = (ptl_arg_t *) payload; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ipsaddr->proto; + struct ips_ctrlq *ctrlq = &proto->ctrlq[IPS_FLOWID2INDEX(flow->flowid)&0x3]; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + uint32_t cksum = 0; + int paylen; + uint8_t discard_msg = 0; + + /* Drain queue if non-empty */ + if (cqe[ctrlq->ctrlq_tail].ipsaddr) + ips_proto_timer_ctrlq_callback(&ctrlq->ctrlq_timer, 0ULL); + + if (!cqe[ctrlq->ctrlq_tail].ipsaddr) { + paylen = _build_ctrl_message(proto, &msg, ipsaddr, message_type, + flow, payload, &discard_msg); + + if_pt (!discard_msg) { + /* If enabled checksum control message */ + ips_do_cksum(proto, &msg.pbc_hdr.hdr, payload, paylen, &cksum); + + /* Error check messages are serialized with respect to the underlying + * transfer mechanism. + */ + if ((message_type == OPCODE_ERR_CHK) || + (message_type == OPCODE_ERR_CHK_GEN) || + (message_type == OPCODE_ERR_CHK_BAD)) { + switch(flow->transfer) { + case PSM_TRANSFER_PIO: + case PSM_TRANSFER_LAST: + err = ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + payload, paylen, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + break; + case PSM_TRANSFER_DMA: + err = ips_dma_transfer_frame(proto, flow, &msg.pbc_hdr, payload, + paylen, cksum); + break; + } + } + else + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + err = ips_dma_transfer_frame(proto, flow, &msg.pbc_hdr, payload, + paylen, cksum); + else + err = ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + payload, paylen, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + + if (err == PSM_OK) + ips_epaddr_stats_send(ipsaddr, message_type); + } + else + err = PSM_OK; /* Ctrl message is discarded. 
May want to add stats */ + + _IPATH_VDBG("transfer_frame of opcode=0x%x,remote_lid=%d," + "src=%p,len=%d returns %d\n", (int) msg.pbc_hdr.hdr.sub_opcode, + __be16_to_cpu(msg.pbc_hdr.hdr.lrh[1]), payload, paylen, err); + } + if (err != PSM_EP_NO_RESOURCES) + return err; + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + + if (!(proto->ctrl_msg_queue_never_enqueue & proto->message_type_to_index[message_type])) { + + if ((*msg_queue_mask) & proto->message_type_to_index[message_type]) { + /* This type of control message is already queued, skip it */ + err = PSM_OK; + } else if (cqe[ctrlq->ctrlq_head].ipsaddr == NULL) { + // entry is free + *msg_queue_mask |= message_type2index(proto, message_type); + + cqe[ctrlq->ctrlq_head].ipsaddr = ipsaddr; + cqe[ctrlq->ctrlq_head].message_type = message_type; + cqe[ctrlq->ctrlq_head].msg_queue_mask = msg_queue_mask; + cqe[ctrlq->ctrlq_head].flow = flow; + + if (args) { + cqe[ctrlq->ctrlq_head].args[0].u64w0 = args[0].u64w0; + cqe[ctrlq->ctrlq_head].args[1].u64w0 = args[1].u64w0; + cqe[ctrlq->ctrlq_head].args[2].u64w0 = args[2].u64w0; + } + + ctrlq->ctrlq_head = (ctrlq->ctrlq_head + 1) % CTRL_MSG_QEUEUE_SIZE; + //_IPATH_INFO("requesting ctrlq timer for msgtype=%d!\n", message_type); + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + + err = PSM_OK; + } else { + proto->ctrl_msg_queue_overflow++; + } + } + + return err; +} + +psm_error_t __recvpath +ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_expire) +{ + struct ips_ctrlq *ctrlq = (struct ips_ctrlq *) timer->context; + struct ips_proto *proto = ctrlq->ctrlq_proto; + struct ips_proto_ctrl_message msg; + struct ips_ctrlq_elem *cqe = ctrlq->ctrlq_cqe; + struct ips_flow *flow; + uint8_t msg_type; + psm_error_t err; + struct ptl_epaddr *ipsaddr; + uint32_t cksum = 0; + int paylen; + uint8_t discard_msg = 0; + + // service ctrl send queue first + while (cqe[ctrlq->ctrlq_tail].ipsaddr) { + msg_type = cqe[ctrlq->ctrlq_tail].message_type; + ipsaddr = cqe[ctrlq->ctrlq_tail].ipsaddr; + flow = cqe[ctrlq->ctrlq_tail].flow; + + paylen = _build_ctrl_message(proto, &msg, + ipsaddr, msg_type, flow, + cqe[ctrlq->ctrlq_tail].args, + &discard_msg); + + psmi_assert_always(paylen == 0); + + if_pt (!discard_msg) { + /* If enabled checksum control message */ + ips_do_cksum(proto, &msg.pbc_hdr.hdr, NULL, 0, &cksum); + + /* Error check messages are serialized with respect to the underlying + * transfer mechanism. 
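+ * They go out over PIO or DMA according to flow->transfer, so a probe
+ * stays ordered behind any data already queued on that mechanism.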
+ */ + if ((msg_type == OPCODE_ERR_CHK) || + (msg_type == OPCODE_ERR_CHK_GEN) || + (msg_type == OPCODE_ERR_CHK_BAD)) { + switch(flow->transfer) { + case PSM_TRANSFER_DMA: + err = ips_dma_transfer_frame(proto,flow,&msg.pbc_hdr,0,0, cksum); + break; + case PSM_TRANSFER_PIO: + default: + err = + ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + NULL, 0, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + break; + } + } + else + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + err = ips_dma_transfer_frame(proto,flow,&msg.pbc_hdr,NULL,0,cksum); + else + err = + ips_spio_transfer_frame(proto->spioc, flow, &msg.pbc_hdr.hdr, + 0, 0, PSMI_TRUE, + (proto->flags & IPS_PROTO_FLAG_CKSUM), + cksum); + } + else + err = PSM_OK; /* Discard ctrl message */ + + if (err == PSM_OK) { + ips_epaddr_stats_send(ipsaddr, msg_type); + *cqe[ctrlq->ctrlq_tail].msg_queue_mask &= + ~message_type2index(proto, cqe[ctrlq->ctrlq_tail].message_type); + cqe[ctrlq->ctrlq_tail].ipsaddr = NULL; + ctrlq->ctrlq_tail = (ctrlq->ctrlq_tail + 1) % CTRL_MSG_QEUEUE_SIZE; + } else { + psmi_assert(err == PSM_EP_NO_RESOURCES); + + if (proto->flags & IPS_PROTO_FLAG_CTRL_SDMA) + proto->stats.writev_busy_cnt++; + else + proto->stats.pio_busy_cnt++; + /* re-request a timer expiration */ + psmi_timer_request(proto->timerq, &ctrlq->ctrlq_timer, + PSMI_TIMER_PRIO_0); + return PSM_OK; + } + } + + return PSM_OK; +} + +void __sendpath +ips_proto_flow_enqueue(struct ips_flow *flow, ips_scb_t *scb) +{ + ips_epaddr_t *ipsaddr = flow->ipsaddr; + + /* Don't support send to self */ + psmi_assert(flow->path->epr_dlid != flow->path->epr_slid); + + ips_scb_prepare_flow_inner(scb, flow->epinfo, &ipsaddr->epr, flow); + ips_do_cksum(ipsaddr->proto, &scb->ips_lrh, + scb->payload, scb->payload_size, &scb->cksum); + + STAILQ_INSERT_TAIL(&flow->scb_unacked, scb, nextq); + flow->scb_num_pending++; + flow->scb_num_unacked++; + + /* Every ipsaddr has a pending head that points into the unacked queue. + * If sends are already pending, process those first */ + if (SLIST_EMPTY(&flow->scb_pend)) + SLIST_FIRST(&flow->scb_pend) = scb; +} + +/* + * This function attempts to flush the current list of pending + * packets through PIO. + * + * Recoverable errors: + * PSM_OK: Packet triggered through PIO. + * PSM_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM_EP_NO_NETWORK: No network, no lid, ... + * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. 
+ */ +psm_error_t __sendpath +ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = flow->ipsaddr->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + int num_sent = 0; + uint64_t t_cyc; + ips_scb_t *scb; + psm_error_t err = PSM_OK; + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf ((!flow->credits) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) + return PSM_OK; + + while (!SLIST_EMPTY(scb_pend) && flow->credits) { + scb = SLIST_FIRST(scb_pend); + + if ((err = ips_spio_transfer_frame(proto->spioc, flow, &scb->ips_lrh, + scb->payload, scb->payload_size, + PSMI_FALSE, + (proto->flags & IPS_PROTO_FLAG_CKSUM) && (scb->tid == IPATH_EAGER_TID_ID), + scb->cksum)) == PSM_OK) + { + t_cyc = get_cycles(); + scb->flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = flow->path->epr_timeout_ack; + scb->abs_timeout = flow->path->epr_timeout_ack + t_cyc; + psmi_timer_request(proto->timerq, &flow->timer_ack, + scb->abs_timeout); + num_sent++; + flow->scb_num_pending--; + flow->credits--; + SLIST_REMOVE_HEAD(scb_pend, next); + + } + else + break; + } + + /* If out of flow credits re-schedule send timer */ + if (!SLIST_EMPTY(scb_pend)) { + proto->stats.pio_busy_cnt++; + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + proto->timeout_send); + } + + if (nflushed != NULL) + *nflushed = num_sent; + + return err; +} + +/* + * Flush all packets currently marked as pending + */ +static psm_error_t scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int num, + int *num_sent); + +#ifdef PSM_DEBUG +#define PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto) \ + do { \ + uint32_t cntr_inflight; \ + ipath_sdma_inflight(proto->ptl->context->ctrl, &cntr_inflight); \ + VALGRIND_MAKE_MEM_DEFINED(&cntr_inflight, sizeof(uint32_t)); \ + psmi_assert_always(cntr_inflight == \ + proto->iovec_cntr_next_inflight); \ + } while (0) +#else +#define PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto) +#endif + +/* + * Flush all packets queued up on a flow via send DMA. + * + * Recoverable errors: + * PSM_OK: Able to flush entire pending queue for DMA. + * PSM_OK_NO_PROGRESS: Flushed at least 1 but not all pending packets for DMA. + * PSM_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * + * Unrecoverable errors: + * PSM_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... + */ +psm_error_t __sendpath +ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed) +{ + struct ips_proto *proto = flow->ipsaddr->proto; + struct ips_scb_pendlist *scb_pend = &flow->scb_pend; + uint32_t cntr_init; + ips_scb_t *scb; + psm_error_t err = PSM_OK; + int howmany = 0; + int nsent = 0; + + /* Out of credits - ACKs/NAKs reclaim recredit or congested flow */ + if_pf ((!flow->credits) || (flow->flags & IPS_FLOW_FLAG_CONGESTED)) { + if (nflushed) + *nflushed = 0; + return PSM_EP_NO_RESOURCES; + } + + if (SLIST_EMPTY(scb_pend)) + goto success; + + /* + * Count how many are to be sent and fire dma. 
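+ * In debug builds the pending list is walked and its length checked
+ * against scb_num_pending; release builds simply clamp the count to the
+ * credits available. Either way the count is then bounded by
+ * scb_max_sdma per call.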
+ */ +#ifdef PSM_DEBUG + SLIST_FOREACH(scb, scb_pend, next) + howmany++; + psmi_assert_always(howmany == flow->scb_num_pending); +#else + howmany = min(flow->scb_num_pending, flow->credits); +#endif + + howmany = min(howmany, proto->scb_max_sdma); + + if (howmany == 0) + goto success; + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); /* Pre-check */ + + cntr_init = proto->iovec_cntr_next_inflight; + err = scb_dma_send(proto, flow, scb_pend, howmany, &nsent); + if (err != PSM_OK && err != PSM_EP_NO_RESOURCES && + err != PSM_OK_NO_PROGRESS) + goto fail; + + /* scb_dma_send shouldn't modify iovec_cntr_next_inflight */ + psmi_assert_always(cntr_init == proto->iovec_cntr_next_inflight); + + if (nsent > 0) { + uint64_t t_cyc = get_cycles(); + uint32_t new_inflight = proto->iovec_cntr_next_inflight + nsent; + int i = 0; + + /* We have to ensure that the inflight counter doesn't drift away too + * far from the completion counter or else our wraparound arithmetic + * in ips_proto_dma_wait_until will fail. + */ + if ((int) new_inflight - (int) proto->iovec_cntr_last_completed < 0) + ips_proto_dma_wait_until(proto, + proto->iovec_cntr_last_completed + nsent); + + flow->scb_num_pending -= nsent; + flow->credits = max((int) flow->credits - nsent, 0); + + SLIST_FOREACH(scb, scb_pend, next) { + if (++i > nsent) + break; + scb->flags &= ~IPS_SEND_FLAG_PENDING; + scb->ack_timeout = scb->nfrag*flow->path->epr_timeout_ack; + scb->abs_timeout = scb->nfrag*flow->path->epr_timeout_ack + t_cyc; + scb->dma_ctr = proto->iovec_cntr_next_inflight++; + if (scb->tidsendc) + ips_protoexp_scb_inflight(scb); + } + SLIST_FIRST(scb_pend) = scb; + } + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); /* Post Check */ + + if (SLIST_FIRST(scb_pend) != NULL) { + psmi_assert(flow->scb_num_pending > 0); + + switch(flow->protocol) { + case PSM_PROTOCOL_TIDFLOW: + /* For Tidflow we can cancel the ack timer if we have flow credits + * available and schedule the send timer. If we are out of flow + * credits then the ack timer is scheduled as we are waiting for + * an ACK to reclaim credits. This is required since multiple + * tidflows may be active concurrently. + */ + if (flow->credits) { + /* Cancel ack timer and reschedule send timer. Increment + * writev_busy_cnt as this really is DMA buffer exhaustion. 
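+ * The send timer backs off to twice timeout_send, while the
+ * credit-reaping ack timer in the else branch fires after a quarter of
+ * the ack timeout so credits are reclaimed promptly.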
+ */ + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + (proto->timeout_send << 1)); + proto->stats.writev_busy_cnt++; + } + else { + /* Re-instate ACK timer to reap flow credits */ + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + (flow->path->epr_timeout_ack>>2)); + } + + break; + case PSM_PROTOCOL_GO_BACK_N: + default: + if (flow->credits) { + /* Schedule send timer and increment writev_busy_cnt */ + psmi_timer_request(proto->timerq, &flow->timer_send, + get_cycles() + (proto->timeout_send << 1)); + proto->stats.writev_busy_cnt++; + } + else { + /* Schedule ACK timer to reap flow credits */ + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + (flow->path->epr_timeout_ack>>2)); + } + break; + } + } + else { + /* Schedule ack timer */ + psmi_timer_cancel(proto->timerq, &flow->timer_send); + psmi_timer_request(proto->timerq, &flow->timer_ack, + get_cycles() + flow->path->epr_timeout_ack); + } + + /* We overwrite error with its new meaning for flushing packets */ + if (nsent > 0) + if (nsent < howmany) + err = PSM_OK_NO_PROGRESS; /* partial flush */ + else + err = PSM_OK; /* complete flush */ + else + err = PSM_EP_NO_RESOURCES; /* no flush at all */ + +success: +fail: + if (nflushed) + *nflushed = nsent; + + return err; +} + +/* + * Fault injection in dma sends. Since DMA through writev() is all-or-nothing, + * we don't inject faults on a packet-per-packet basis since the code gets + * quite complex. Instead, each call to flush_dma or transfer_frame is treated + * as an "event" and faults are generated according to the IPS_FAULTINJ_DMASEND + * setting. + * + * The effect is as if the event was successful but dropped on the wire + * somewhere. + */ +PSMI_ALWAYS_INLINE( +int +dma_do_fault()) +{ + + if_pf (PSMI_FAULTINJ_ENABLED()) { + PSMI_FAULTINJ_STATIC_DECL(fi, "dmalost", 1, IPS_FAULTINJ_DMALOST); + return psmi_faultinj_is_fault(fi); + } + else + return 0; +} + +/* ips_dma_transfer_frame is used only for control messages, and is + * not enabled by default, and not tested by QA; expected send + * dma goes through scb_dma_send() */ +psm_error_t __sendpath +ips_dma_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, + struct ips_pbc_header *pbc_hdr_i, + void *payload, uint32_t paylen, uint32_t cksum) +{ + struct iovec iovec; + ssize_t ret; + psm_error_t err; + uint32_t have_cksum = + ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (((__le32_to_cpu(pbc_hdr_i->hdr.iph.ver_context_tid_offset) >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) == IPATH_EAGER_TID_ID) && (pbc_hdr_i->hdr.mqhdr != MQ_MSG_DATA_BLK) && (pbc_hdr_i->hdr.mqhdr != MQ_MSG_DATA_REQ_BLK)); + + psmi_assert((paylen & 0x3) == 0); /* require 4-byte multiple */ + psmi_assert(((uintptr_t) payload & 0x3) == 0); /* require 4-byte alignment */ + psmi_assert(paylen < proto->epinfo.ep_mtu); + + /* See comments above for fault injection */ + if_pf (dma_do_fault()) + return PSM_OK; + + ips_proto_pbc_update(proto, flow, PSMI_TRUE, &pbc_hdr_i->pbc, + sizeof(struct ips_message_header), + payload, paylen + + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); + + /* If we have a payload, we need to copy it inline to a single element to + * ensure that the driver copies it out completely as part of the writev + * call since the payload can be stack-allocated memory. + */ + if (paylen > 0) { + uint32_t len = sizeof(struct ips_pbc_header) + + paylen + (have_cksum ? 
PSM_CRC_SIZE_IN_BYTES : 0); + struct ips_pbc_header *pbc_hdr = alloca(len); + + if_pf (pbc_hdr == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", len); + goto fail; + } + + psmi_mq_mtucpy(pbc_hdr, pbc_hdr_i, sizeof(struct ips_pbc_header)); + psmi_mq_mtucpy(pbc_hdr+1, payload, paylen); + + if (have_cksum) { + uint32_t *ckptr = (uint32_t*) ((uint8_t*) pbc_hdr + + (len - PSM_CRC_SIZE_IN_BYTES)); + *ckptr = cksum; + ckptr++; + *ckptr = cksum; + } + + iovec.iov_base = pbc_hdr; + iovec.iov_len = len; + ret = ipath_cmd_writev(proto->fd, &iovec, 1); + } + else { + uint32_t len = sizeof(struct ips_pbc_header) + + (have_cksum ? PSM_CRC_SIZE_IN_BYTES : 0); + struct ips_pbc_header *pbc_hdr = have_cksum ? alloca(len) : pbc_hdr_i; + + if_pf (pbc_hdr == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", len); + goto fail; + } + + if (have_cksum) { + uint32_t *ckptr = (uint32_t*) (pbc_hdr + 1); + psmi_mq_mtucpy(pbc_hdr, pbc_hdr_i, sizeof(struct ips_pbc_header)); + *ckptr = cksum; + ckptr++; + *ckptr = cksum; + } + + iovec.iov_base = pbc_hdr; + iovec.iov_len = len; + ret = ipath_cmd_writev(proto->fd, &iovec, 1); + } + + if (ret > 0) { + /* Even though we won't care about a completion in this frame send, we + * still increment the iovec packet counter */ + proto->iovec_cntr_next_inflight += ret; + err = PSM_OK; + psmi_assert_always(ret == 1); + } + else { + /* + * ret == 0: Driver did not queue packet. Try later. + * ENOMEM: No kernel memory to queue request, try later? * + * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (ret == 0 || errno == ENOMEM || errno == ECOMM || errno == EINTR) + err = PSM_EP_NO_RESOURCES; + else + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unhandled error in writev(): %s (fd=%d,iovec=%p,len=%d)", + strerror(errno), proto->fd, &iovec, 1); + } + +fail: + return err; +} + +/* + * Caller still expects num_sent to always be correctly set in case of an + * error. + * + * Recoverable errors: + * PSM_OK: At least one packet was successfully queued up for DMA. + * PSM_EP_NO_RESOURCES: No scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma + * queue). + * PSM_OK_NO_PROGRESS: Cable pulled. + * + * Unrecoverable errors: + * PSM_EP_DEVICE_FAILURE: Error calling ipath_sdma_inflight() or unexpected + * error in calling writev(), or chip failure, rxe/txe + * parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... 
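+ *
+ * Each scb expands into at most three iovec entries (pbc plus header,
+ * payload, and tid session info for multi-fragment expected sends),
+ * which is why the iovec array below is sized at 3*num.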
+ */ +static +psm_error_t __sendpath +scb_dma_send(struct ips_proto *proto, struct ips_flow *flow, + struct ips_scb_pendlist *slist, int num, int *num_sent) +{ + ssize_t ret; + struct ips_scb *scb = SLIST_FIRST(slist); + unsigned int vec_idx = 0, scb_idx = 0, scb_sent = 0; + unsigned int max_elem; + struct iovec *iovec; + psm_error_t err = PSM_OK; + uint32_t cksum; + + psmi_assert(num > 0); + psmi_assert(scb != NULL); + + /* See comments above for fault injection */ + if_pf (dma_do_fault()) + goto fail; + + max_elem = 3*num; + iovec = alloca(sizeof(struct iovec) * max_elem); + + if_pf (iovec == NULL) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_NO_MEMORY, + "alloca for %d bytes failed in writev", + (int)(sizeof(struct iovec) * max_elem)); + goto fail; + } + +writev_again: + vec_idx = 0; + + SLIST_FOREACH(scb, slist, next) { + /* Can't exceed posix max writev count */ + if (vec_idx + (int) !!(scb->payload_size > 0) >= UIO_MAXIOV) + break; + + psmi_assert(vec_idx < max_elem); + psmi_assert_always((scb->payload_size & 0x3) == 0); + + /* Checksum all eager packets */ + cksum = ((proto->flags & IPS_PROTO_FLAG_CKSUM) && + (scb->tid == IPATH_EAGER_TID_ID) && + (scb->ips_lrh.mqhdr != MQ_MSG_DATA_BLK) && + (scb->ips_lrh.mqhdr != MQ_MSG_DATA_REQ_BLK)); + + ips_proto_pbc_update(proto, flow, PSMI_FALSE, &scb->pbc, + sizeof(struct ips_message_header), + scb->payload, + scb->payload_size + + (cksum ? PSM_CRC_SIZE_IN_BYTES : 0)); + + iovec[vec_idx].iov_base = &scb->pbc; + iovec[vec_idx].iov_len = sizeof(struct ips_message_header) + + sizeof(union ipath_pbc); + vec_idx++; + + if (scb->payload_size > 0) { + /* + * Payloads must be 4-byte aligned. If not, we need a bounce + * buffer for them. This should be rare, but may be a performance + * penalty, so we log it as a stat in case we need to narrow in + * on a performance problem. + * + * If checksum is enabled use a bounce buffer. + */ + if ((((uintptr_t) scb->payload) & 0x3) || cksum) { + void *buf = scb->payload; + uint32_t len = scb->payload_size; + + if (scb->nfrag > 1) { + err = psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "buffer alignment for sdma error"); + goto fail; + } + + /* Only allocate buffer if current buffer is a user buffer */ + if (!((scb->payload >= scb->scbc->sbuf_buf_base) && + (scb->payload <= scb->scbc->sbuf_buf_last))){ + + if (!ips_scbctrl_bufalloc(scb)) { + err = PSM_EP_NO_RESOURCES; + if (--vec_idx == 0) /* Remove header, nothing to send */ + goto fail; + else /* send what we have so far, but no more */ + break; + } + + /* Only need to copy if bounce buffer is used. */ + psmi_mq_mtucpy(scb->payload, buf, len); + scb->payload_size = len; + } + + /* If checksum then update checksum */ + if (cksum) { + uint32_t *ckptr = (uint32_t*) ((uint8_t*) scb->payload + len); + *ckptr = scb->cksum; + ckptr++; + *ckptr = scb->cksum; + } + + if (((uintptr_t) buf) & 0x3) + proto->stats.send_dma_misaligns++; + } + + iovec[vec_idx].iov_base = scb->payload; + iovec[vec_idx].iov_len = scb->payload_size + + (cksum ? PSM_CRC_SIZE_IN_BYTES : 0); + vec_idx++; + + _IPATH_VDBG("seqno=%d hdr=%p,%d payload=%p,%d\n", + scb->seq_num.psn, + iovec[vec_idx-2].iov_base, (int) iovec[vec_idx-2].iov_len, + iovec[vec_idx-1].iov_base, (int) iovec[vec_idx-1].iov_len); + + /* + * if there are multiple frag payload, set the right frag size. 
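+ * For a multi-fragment (nfrag > 1) send, frag_size is passed to the
+ * driver in the pbc fill1 word so the payload can be carved into
+ * packets of that size; tidinfo, when present, rides as one more iovec
+ * entry.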
+ */ + if (scb->nfrag > 1) { + scb->pbc.fill1 = __cpu_to_le16(scb->frag_size); + + /* give tidinfo to qib driver */ + if (scb->tidsendc) { + iovec[vec_idx].iov_base = scb->tsess; + iovec[vec_idx].iov_len = scb->tsess_length; + vec_idx++; + } + } + } + else { + /* If checksum enabled need to send checksum at end of header + * as we have no payload. + */ + if (cksum) { + char *pbc_hdr = alloca(iovec[vec_idx-1].iov_len + + PSM_CRC_SIZE_IN_BYTES); + uint32_t *ckptr = (uint32_t*) + ((uint8_t*) pbc_hdr + iovec[vec_idx-1].iov_len); + + psmi_mq_mtucpy(pbc_hdr, iovec[vec_idx-1].iov_base,iovec[vec_idx-1].iov_len); + *ckptr = scb->cksum; + ckptr++; + *ckptr = scb->cksum; + + iovec[vec_idx-1].iov_base = pbc_hdr; + iovec[vec_idx-1].iov_len += PSM_CRC_SIZE_IN_BYTES; + + } + + _IPATH_VDBG("hdr=%p,%d\n", + iovec[vec_idx-1].iov_base, (int) iovec[vec_idx-1].iov_len); + } + + /* Can bound the number to send by 'num' */ + if (++scb_idx == num) + break; + } + psmi_assert(vec_idx > 0); + ret = ipath_cmd_writev(proto->fd, iovec, vec_idx); + + /* + * Successfully wrote entire vector + */ + if (ret == scb_idx) { + scb_sent += ret; + /* scbs are left if we didn't want to send less and didn't have + * to break out of scbctrl_bufalloc */ + if (scb != NULL && scb_idx < num && err == PSM_OK) + goto writev_again; + } + else { + if (ret < 0) { + uint32_t cntr_fini; + + /* ENOMEM: No kernel memory to queue request, try later? + * ECOMM: Link may have gone down + * EINTR: Got interrupt while in writev + */ + if (errno == ENOMEM || errno == ECOMM || errno == EINTR) { + err = psmi_context_check_status( + (const psmi_context_t *) &proto->ep->context); + if (err == PSM_OK) + err = PSM_EP_NO_RESOURCES; + } + else { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unexpected error in writev(): %s (errno=%d) " + "(fd=%d,iovec=%p,len=%d)", strerror(errno), errno, + proto->fd, iovec, vec_idx); + goto fail; + } + /* Find out the latest packet that we were able to put in flight */ + if (ipath_sdma_inflight(proto->ptl->context->ctrl, &cntr_fini) < 0) + { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "Unable to retrieve inflight sdma counter: %s", + strerror(errno)); + goto fail; + } + + /* Re-write ret to actual inflight count */ + scb_sent += cntr_fini - proto->iovec_cntr_next_inflight; + } + else { + /* No need for inflight system call, we can infer it's value from + * writev's return value */ + scb_sent += ret; + } + } + +fail: + *num_sent = scb_sent; + psmi_assert(*num_sent <= num && *num_sent >= 0); + return err; +} + +/* + * Because we only lazily reap send dma completions, it's possible that we + * receive a packet's remote acknowledgement before seeing that packet's local + * completion. As part of processing ack packets and releasing scbs, we issue + * a wait for the local completion if the scb is marked as having been sent via + * send dma. + */ +psm_error_t __sendpath +ips_proto_dma_wait_until(struct ips_proto *proto, uint32_t dma_cntr) +{ + psm_error_t err = PSM_OK; + int spin_cnt = 0; + int did_yield = 0; + + PSM_DEBUG_CHECK_INFLIGHT_CNTR(proto); + + if ((int) proto->iovec_cntr_last_completed - (int) dma_cntr >= 0) + return PSM_OK; + + PSMI_PROFILE_BLOCK(); + + while ((int) proto->iovec_cntr_last_completed - (int) dma_cntr < 0) + { + if (spin_cnt++ == proto->ep->yield_spin_cnt) { + /* Have to yield holding the PSM lock, mostly because we don't + * support another thread changing internal state at this point in + * the code. 
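+ * sched_yield() runs at most once per wait, and writev_compl_delay
+ * below counts how many waits had to yield. The signed 32-bit
+ * difference in the loop condition keeps the comparison correct across
+ * counter wraparound.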
+ */ + did_yield = 1; + sched_yield(); + } + + /* Not there yet in completion count. Update our view of + * last_completed. */ + if (ipath_sdma_complete(proto->ptl->context->ctrl, + &proto->iovec_cntr_last_completed) == -1) + { + err = psmi_handle_error(proto->ep, PSM_EP_DEVICE_FAILURE, + "unable to retrieve completion sdma counter: %s", + strerror(errno)); + break; + } + } + + if (did_yield) + proto->stats.writev_compl_delay++; + + PSMI_PROFILE_UNBLOCK(); + + return err; +} + +#define ERRCHK_NOT_SERIALIZED 1 + +psm_error_t +ips_proto_timer_ack_callback(struct psmi_timer *current_timer, uint64_t current) +{ + struct ips_flow *flow = (struct ips_flow *) current_timer->context; + ips_epaddr_t *ipsaddr = flow->ipsaddr; + struct ips_proto *proto = ipsaddr->proto; + uint64_t t_cyc_next = get_cycles(); + ips_scb_t *scb; + + if (STAILQ_EMPTY(&flow->scb_unacked)) + return PSM_OK; + + scb = STAILQ_FIRST(&flow->scb_unacked); + + if (current >= scb->abs_timeout) { + int done_local; + +#if ERRCHK_NOT_SERIALIZED + /* We have to ensure that the send is at least locally complete before + * sending an error check or else earlier data can get to the + * destination *after* we pio this err_chk. + */ + if (flow->transfer == PSM_TRANSFER_DMA) { + uint32_t dma_cntr; + uint32_t scb_cntr = + STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->dma_ctr; + done_local = + (ipath_sdma_complete(proto->ptl->context->ctrl, &dma_cntr) > 0 && + ((int) dma_cntr - (int) scb_cntr >= 0)); + if (!done_local) + proto->stats.writev_compl_eagain++; + } + else + done_local = 1; /* Always done for PIO flows */ +#else + done_local = 1; /* Otherwise always done */ +#endif + + scb->ack_timeout = + min(scb->ack_timeout * flow->path->epr_timeout_ack_factor, + flow->path->epr_timeout_ack_max); + scb->abs_timeout = t_cyc_next + scb->ack_timeout; + + if (done_local) { + _IPATH_VDBG("sending err_chk flow=%d with first=%d,last=%d\n", + flow->flowid, STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn, + STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->seq_num.psn); + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) + ips_proto_send_ctrl_message(flow, + OPCODE_ERR_CHK_GEN, + &scb->tidsendc->ctrl_msg_queued, + NULL); + else + ips_proto_send_ctrl_message(flow, + OPCODE_ERR_CHK, + &flow->ipsaddr->ctrl_msg_queued, + NULL); + } + + t_cyc_next = get_cycles() + scb->ack_timeout; + } + else + t_cyc_next += (scb->abs_timeout - current); + + psmi_timer_request(proto->timerq, current_timer, t_cyc_next); + + return PSM_OK; +} + +psm_error_t +ips_proto_timer_send_callback(struct psmi_timer *current_timer, uint64_t current) +{ + struct ips_flow *flow = (struct ips_flow *) current_timer->context; + + /* If flow is marked as congested adjust injection rate - see process nak + * when a congestion NAK is received. 
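+ * Clearing the flag and raising the path's CCTI by the service level's
+ * ccti_increase (bounded by ccti_limit) lowers the injection rate; see
+ * ips_cca_adjust_rate() below.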
+ */ + if_pf (flow->flags & IPS_FLOW_FLAG_CONGESTED) { + struct ips_proto *proto = flow->ipsaddr->proto; + + /* Clear congestion flag and decrease injection rate */ + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + } + + flow->fn.xfer.flush(flow, NULL); + return PSM_OK; +} + +psm_error_t +ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment) +{ + struct ips_proto *proto = path_rec->proto; + uint16_t prev_ipd, prev_divisor; + + /* Increment/decrement ccti for path */ + psmi_assert_always(path_rec->epr_ccti >= path_rec->epr_ccti_min); + path_rec->epr_ccti += cct_increment; + + /* Determine new active IPD. */ + prev_ipd = path_rec->epr_active_ipd; + prev_divisor = path_rec->epr_cca_divisor; + if ((path_rec->epr_static_ipd) && + ((path_rec->epr_static_ipd + 1) > + (proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK))) { + path_rec->epr_active_ipd = path_rec->epr_static_ipd + 1; + path_rec->epr_cca_divisor = 0; + } + else { + path_rec->epr_active_ipd = proto->cct[path_rec->epr_ccti] & CCA_IPD_MASK; + path_rec->epr_cca_divisor = + proto->cct[path_rec->epr_ccti] >> CCA_DIVISOR_SHIFT; + } + + _IPATH_CCADBG("CCA: %s injection rate to <%x.%x> from <%x.%x>\n", (cct_increment > 0) ? "Decreasing" : "Increasing", path_rec->epr_cca_divisor, path_rec->epr_active_ipd, prev_divisor, prev_ipd); + + /* Reschedule CCA timer if this path is still marked as congested */ + if (path_rec->epr_ccti > path_rec->epr_ccti_min) { + psmi_timer_request(proto->timerq, + &path_rec->epr_timer_cca, + get_cycles() + + proto->cace[path_rec->epr_sl].ccti_timer_cycles); + } + + return PSM_OK; +} + +psm_error_t +ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current) +{ + ips_path_rec_t *path_rec = (ips_path_rec_t *) current_timer->context; + + /* Increase injection rate for flow. Decrement CCTI */ + if (path_rec->epr_ccti > path_rec->epr_ccti_min) + return ips_cca_adjust_rate(path_rec, -1); + else + return PSM_OK; +} diff --git a/ptl_ips/ips_proto.h b/ptl_ips/ips_proto.h new file mode 100644 index 0000000..12f55f1 --- /dev/null +++ b/ptl_ips/ips_proto.h @@ -0,0 +1,701 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PROTO_H +#define _IPS_PROTO_H + +#include "psm_user.h" + +#include "ips_recvhdrq.h" +#include "ips_tid.h" +#include "ips_scb.h" +#include "ips_epstate.h" +#include "ips_spio.h" +#include "ips_stats.h" +#include "ips_proto_am.h" +#include "ips_tidflow.h" +#include "ips_path_rec.h" + +typedef enum ips_path_type { + IPS_PATH_LOW_PRIORITY, + IPS_PATH_NORMAL_PRIORITY, + IPS_PATH_HIGH_PRIORITY, + IPS_PATH_MAX_PRIORITY +} ips_path_type_t; + +/* + * Local Endpoint info. + * + * Contains information necessary for composing packets for the local endpoint + */ +struct ips_epinfo { + uint32_t ep_baseqp; + uint16_t ep_base_lid; + uint8_t ep_lmc; + uint8_t ep_pad; + ibta_rate ep_link_rate; + uint16_t ep_context; + uint16_t ep_subcontext; + uint16_t ep_hca_type; + uint16_t ep_sl; /* IPATH_SL only when path record not used */ + uint16_t ep_unit; + uint16_t ep_mtu; + uint16_t ep_piosize; + uint16_t ep_hdrq_msg_size; + uint16_t ep_pkey; /* PSM_PKEY only when path record not used */ + uint64_t ep_timeout_ack; /* PSM_ERRCHK_TIMEOUT if no path record */ + uint64_t ep_timeout_ack_max; + uint32_t ep_timeout_ack_factor; +}; + +/* + * Remote Endpoint info. + * + * Contains information necessary for composing packets for a remote endpoint + */ +#define IPS_MAX_PATH_LMC 3 +struct ips_epinfo_remote { + uint32_t epr_qp; /* qp+context encoding */ + uint32_t epr_commidx_to; + uint32_t epr_commidx_from; + uint16_t epr_piosize; + uint16_t epr_context; /* Real context value */ + uint16_t epr_subcontext; + uint8_t epr_hca_type; + uint8_t epr_hpp_index; + + /* For LMC/Torus keep list of base and max dlid. Used for pkt verification */ + uint16_t epr_base_lid; + uint16_t epr_pkt_context; /* Context encoding in packet header */ + uint16_t epr_max_lid; + uint8_t epr_num_paths[IPS_PATH_MAX_PRIORITY]; + uint8_t epr_next_path[IPS_PATH_MAX_PRIORITY]; + ips_path_rec_t *epr_path[IPS_PATH_MAX_PRIORITY][1 << IPS_MAX_PATH_LMC]; +}; + +/* + * Control messages. + * + * ips low-level control messages to ensure reliability of eager packets. + * + */ +struct ips_proto; +psm_error_t +ips_proto_init(const psmi_context_t *context, + const struct ptl *ptl, + int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, + const struct psmi_timer_ctrl *timerq, /* PTL's timerq */ + const struct ips_epstate *epstate, /* PTL's epstate */ + const struct ips_spio *spioc, /* PTL's spio control */ + struct ips_proto *proto); /* output protocol */ + +psm_error_t ips_proto_fini(struct ips_proto *proto, int force, + uint64_t timeout); + +/* + * For writev support, we need to pass the pbc along with the message header + */ +struct ips_pbc_header { + union ipath_pbc pbc; + struct ips_message_header hdr; +} PSMI_CACHEALIGN; + +/* + * Control message structures + */ +#define CTRL_MSG_QEUEUE_SIZE 32 /* power of two */ + +struct ips_proto_ctrl_message { + struct ips_pbc_header pbc_hdr; + uint8_t _hdr_uwords[IPS_HEADER_QUEUE_UWORDS_MAX<<2]; +} PSMI_CACHEALIGN; + +/* Control messages saved in the control queue. Even though we only + * always send 2 ptl_args on the wire, some message types will save + * more than 16 bytes in arguments. 
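+ * A queue slot is free when its ipsaddr is NULL; the queue holds
+ * CTRL_MSG_QEUEUE_SIZE entries, each keeping up to three ptl_arg_t
+ * arguments for the deferred message.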
+ */ +struct ips_flow; +struct ips_tid_recv_desc; + +struct ips_ctrlq_elem { + struct ptl_epaddr *ipsaddr; + uint8_t message_type; + uint8_t flowid; + uint16_t pad; + uint32_t *msg_queue_mask; + struct ips_flow *flow; + ptl_arg_t args[3]; +}; + +struct ips_ctrlq { + /* Queued control messages, queued when pio is busy */ + struct ips_proto *ctrlq_proto; + + int ctrlq_flowid; + uint32_t ctrlq_head; + uint32_t ctrlq_tail; + uint32_t ctrlq_overflow; + uint32_t ctrlq_never_enqueue; + + struct ips_ctrlq_elem ctrlq_cqe[CTRL_MSG_QEUEUE_SIZE] PSMI_CACHEALIGN; + struct psmi_timer ctrlq_timer; /* when in timerq */ +}; + +/* + * Connect/disconnect, as implemented by ips + */ +psm_error_t ips_proto_connect(struct ips_proto *proto, int numep, + const psm_epid_t *array_of_epid, + const int *array_of_epid_mask, + psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, + uint64_t timeout_in); + +psm_error_t ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_in); + +int ips_proto_isconnected(struct ptl_epaddr *ipsaddr); + +/* + * Pending operation structures + */ +struct ips_pend_sreq { + STAILQ_ENTRY(ips_pend_sreq) next; + psm_mq_req_t req; + uint32_t type; +}; + +#define IPS_PENDSEND_EAGER_DATA 1 +#define IPS_PENDSEND_EAGER_REQ 2 +#define IPS_PENDSEND_EXP_TIDS 3 +#define IPS_PENDSEND_EXP_SENDS 4 + +STAILQ_HEAD(ips_pendsendq, ips_pend_sreq); + +struct ips_pend_sends { + struct ips_proto *proto; /* back ptr */ + struct psmi_timer timer; + struct ips_pendsendq pendq; +}; + +/* + * One instance of the protocol + */ + +struct ips_protoexp; + +struct ips_proto_stats { + uint64_t pio_busy_cnt; + uint64_t writev_busy_cnt; + uint64_t writev_compl_eagain; + uint64_t writev_compl_delay; + uint64_t scb_egr_unavail_cnt; + uint64_t scb_exp_unavail_cnt; + uint64_t hdr_overflow; + uint64_t egr_overflow; + uint64_t lid_zero_errs; + uint64_t unknown_packets; + uint64_t stray_packets; + uint64_t send_dma_misaligns; +}; + +struct ips_proto_error_stats { + uint64_t num_icrc_err; + uint64_t num_vcrc_err; + uint64_t num_ecc_err; + uint64_t num_len_err; + uint64_t num_mtu_err; + uint64_t num_khdr_err; + uint64_t num_tid_err; + uint64_t num_mk_err; + uint64_t num_ib_err; +}; + +// OPP support structure. +struct opp_api { + void* (*op_path_find_hca)(const char*name, void **device); + void* (*op_path_open)(void *device, int port_num); + void (*op_path_close)(void *context); + int (*op_path_get_path_by_rec)(void *context, ibta_path_rec_t *query, ibta_path_rec_t *response); + /* TODO: Need symbol to ibv_close_device. 
*/
+};
+
+struct ips_ibta_compliance_fn {
+ psm_error_t (*get_path_rec)(struct ips_proto *proto, uint16_t slid,
+ uint16_t dlid, uint16_t desthca_type,
+ unsigned long timeout,
+ ips_epaddr_t *ipsaddr);
+ psm_error_t (*fini)(struct ips_proto *proto);
+};
+
+typedef enum ptl_epaddr_flow {
+ EP_FLOW_GO_BACK_N_PIO,
+ EP_FLOW_GO_BACK_N_DMA,
+ EP_FLOW_GO_BACK_N_AM_REQ,
+ EP_FLOW_GO_BACK_N_AM_RSP,
+ EP_FLOW_LAST /* Keep this the last endpoint flow */
+} ptl_epaddr_flow_t;
+
+struct ips_proto {
+ struct ptl *ptl; /* cached */
+ psm_ep_t ep; /* cached, for errors */
+ psm_mq_t mq; /* cached, for mq handling */
+ int fd; /* cached, for writev ops */
+
+ /* Pending sends */
+ struct ips_pend_sends pend_sends;
+ struct ips_epstate *epstate;
+ struct psmi_timer_ctrl *timerq;
+
+ struct ips_protoexp *protoexp;
+ struct ips_scbctrl *scbc_rv;
+ struct ips_spio *spioc;
+ struct ips_scbctrl scbc_egr;
+ struct ips_epinfo epinfo;
+ uint64_t timeout_send;
+ uint32_t flags;
+ uint32_t iovec_cntr_next_inflight;
+ uint32_t iovec_cntr_last_completed;
+ uint32_t iovec_thresh_eager;
+ uint32_t iovec_thresh_eager_blocking;
+ uint32_t scb_max_sdma;
+ uint32_t scb_bufsize;
+ uint16_t scb_max_inflight;
+ uint16_t flow_credits;
+ mpool_t pend_sends_pool;
+ struct ips_ibta_compliance_fn ibta;
+ struct ips_proto_stats stats;
+ struct ips_proto_error_stats error_stats;
+
+ struct ips_proto_am proto_am;
+
+ struct ips_ctrlq ctrlq[EP_FLOW_LAST];
+
+ /* Handling tid errors */
+ uint32_t tiderr_cnt;
+ uint32_t tiderr_max;
+ uint64_t tiderr_tnext;
+ uint64_t tiderr_warn_interval;
+ uint32_t tiderr_context_tid_off;
+ psm_epid_t tiderr_epid;
+
+ uint64_t t_init;
+ uint64_t t_fini;
+ uint32_t runid_key;
+
+ int num_connected_to;
+ int num_connected_from;
+ int num_disconnect_requests;
+
+ /* Misc state variables. */
+ /* Smallest interval in cycles between warnings about stray messages.
+ * This is a per-endpoint quantity, overridable with
+ * PSM_STRAY_WARN_INTERVAL. We use the same interval to send the "die"
+ * message. */
+ uint64_t stray_warn_interval;
+ int done_warning;
+ int done_once;
+ int num_bogus_warnings;
+ struct {
+ uint32_t interval_secs;
+ uint64_t next_warning;
+ uint64_t count;
+ } psmi_logevent_tid_send_reqs;
+
+ /* SL2VL table for protocol */
+ int sl2vl[16];
+
+ /* CCA per port */
+ uint16_t *cct; /* cct table */
+ uint16_t ccti_size; /* ccti table size */
+ uint16_t ccti_limit; /* should be <= size-1 */
+
+ uint16_t ccti_portctrl; /* QP or SL CC */
+ uint16_t ccti_ctrlmap; /* map for valid sl */
+ struct cace { /* CACongestionEntry */
+ uint8_t ccti_increase; /* steps to increase */
+ //uint16_t ccti_timer; /* CCTI Timer in units of 1.024 usec */
+ uint64_t ccti_timer_cycles; /* converted via us_2_cycles() */
+ uint8_t ccti_threshold; /* threshold at which to log */
+ uint8_t ccti_min; /* min value for ccti */
+ } cace[16]; /* 16 service levels */
+
+ /* Path record support */
+ uint8_t ips_ipd_delay[IBTA_RATE_120_GBPS + 1];
+ struct hsearch_data ips_path_rec_hash;
+ void *opp_lib;
+ void *hndl;
+ void *device;
+ void *opp_ctxt;
+ struct opp_api opp_fn;
+
+/*
+ * Control message queue for pending messages.
+ *
+ * Control messages are queued as pending when no PIO is available for sending
+ * the message. They are composed on the fly and do not need buffering.
+ *
+ * Variables here are write once (at init) and read afterwards (except the msg
+ * queue overflow counters).
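+ *
+ * message_type_to_index maps a control opcode to the bit it occupies in
+ * a per-flow msg_queue_mask; message_type2index() strips the
+ * CTRL_MSG_QUEUE_ALWAYS flag so only the index bit remains.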
+ */ + uint32_t ctrl_msg_queue_overflow; + uint32_t ctrl_msg_queue_never_enqueue; + uint32_t message_type_to_index[256]; +#define message_type2index(proto, msg_type) (proto->message_type_to_index[(msg_type)] & ~CTRL_MSG_QUEUE_ALWAYS) + +}; + +/* + * Updates to these stats must be reflected in ips_ptl_epaddr_stats_init + */ +struct ptl_epaddr_stats { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t nak_send; + uint64_t nak_recv; + uint64_t connect_req; + uint64_t disconnect_req; + uint64_t tids_grant_send; + uint64_t tids_grant_recv; + uint64_t send_rexmit; + uint64_t congestion_pkts; /* IB CCA FECN packets */ +}; + +/* + * Endpoint address, encapsulates per-endpoint protocol metadata + * + * Directly implements the ptl epaddr. + */ + +/* + * Flow index (6 bits) encodes the following: + * + * Protocol: 3 bits + * Flow Index: 3 bits + * + * Currently only two protocols supported: Go Back N (the "original" flow) + * and the TIDFLOW. We may look at adding other protocols like + * Selective ACK and maybe even STCP. + * + * The Flow index is protocol specific. For a Go Back N protocol this usually + * refers to the index of the flow between two endpoints. For TIDFLOWS + * this is not currently used. + */ + +#define IPS_MAX_PROTOCOL 8 +#define IPS_MAX_FLOWINDEX 8 + +#define IPS_FLOWID_PACK(protocol,flowindex) \ + ( ((((uint16_t)protocol)&0x7) << 3) | \ + (((uint16_t)flowindex)&0x7) ) + +#define IPS_FLOWID_GET_PROTO(flow) (((flow)>>3)&0x7) +#define IPS_FLOWID_GET_INDEX(flow) ((flow) % 4) + +#define IPS_FLOWID2INDEX(flow) \ + ((flow)&0x7) + +typedef void (*ips_flow_enqueue_fn_t)(struct ips_flow *flow, ips_scb_t *scb); +typedef psm_error_t (*ips_flow_flush_fn_t)(struct ips_flow *, int *nflushed); +typedef void (*ips_flow_nak_postprocess_fn_t)(struct ips_flow *, struct ips_message_header *p_hdr); + +typedef enum psm_transfer_type { + PSM_TRANSFER_PIO, + PSM_TRANSFER_DMA, + PSM_TRANSFER_LAST /* Keep this the last transfer type */ +} psm_transfer_type_t; + +typedef enum psm_protocol_type { + PSM_PROTOCOL_GO_BACK_N, + PSM_PROTOCOL_TIDFLOW, + PSM_PROTOCOL_LAST /* Keep this the last protocol type */ +} psm_protocol_type_t; + +struct ips_transfer_fn { + /* Functions dealing with enqueuing and flushing scbs to the network */ + ips_flow_enqueue_fn_t enqueue; + ips_flow_flush_fn_t flush; +}; + +struct ips_protocol_fn { + /* FLOW_ADD: Other functions for is_valid etc. 
*/
+ ips_flow_nak_postprocess_fn_t nak_post_process;
+};
+
+struct ips_flow_fn {
+ struct ips_transfer_fn xfer;
+ struct ips_protocol_fn protocol;
+};
+
+#define PIO_TRANSFER_FUNCTIONS { \
+ .enqueue = ips_proto_flow_enqueue, \
+ .flush = ips_proto_flow_flush_pio \
+}
+
+#define DMA_TRANSFER_FUNCTIONS { \
+ .enqueue = ips_proto_flow_enqueue, \
+ .flush = ips_proto_flow_flush_dma \
+}
+
+#define GO_BACK_N_PROTOCOL_FUNCTIONS { \
+ .nak_post_process = NULL \
+}
+
+#define TIDFLOW_PROTOCOL_FUNCTIONS { \
+ .nak_post_process = ips_tidflow_nak_post_process \
+}
+
+struct ips_flow {
+ SLIST_ENTRY(ips_flow) next; /* List of flows with pending acks */
+ struct ips_flow_fn fn;
+
+ struct ptl_epaddr *ipsaddr; /* back pointer, remote endpoint */
+ struct ips_epinfo *epinfo; /* back pointer, local epinfo */
+ ips_path_rec_t *path; /* Path to use for flow */
+ psm_transfer_type_t transfer;
+ psm_protocol_type_t protocol;
+
+ uint32_t flowid;
+ uint32_t frag_size;
+ uint16_t flags;
+ uint16_t sl;
+ uint16_t cca_ooo_pkts;
+ uint16_t credits; /* Current credits available to send on flow */
+ uint16_t cwin; /* Size of congestion window */
+ uint16_t ack_interval;
+ uint16_t msg_ooo_toggle; /* toggle for OOO message */
+ uint16_t msg_ooo_seqnum; /* seqnum for OOO message */
+
+ psmi_seqnum_t xmit_seq_num;
+ psmi_seqnum_t xmit_ack_num;
+ psmi_seqnum_t recv_seq_num;
+ psmi_seqnum_t last_seq_num;
+
+ uint32_t scb_num_pending;
+ uint32_t scb_num_unacked;
+
+ psmi_timer timer_send; /* timer for frames that got a busy PIO */
+ psmi_timer timer_ack; /* timer for unacked frames */
+
+ STAILQ_HEAD(ips_scb_unackedq, ips_scb) scb_unacked;
+ SLIST_HEAD(ips_scb_pendlist, ips_scb) scb_pend;
+};
+
+struct ptl_epaddr {
+ struct ptl *ptl; /* cached */
+ psm_epaddr_t epaddr; /* back pointer to psm top-level epaddr */
+ struct ips_proto *proto; /* back pointer to protocol */
+ psm_mq_t mq; /* cached */
+
+ uint16_t flags; /* per-endpoint flags */
+ struct ips_epinfo_remote epr; /* remote endpoint params */
+ struct ips_flow flows[EP_FLOW_LAST] PSMI_CACHEALIGN;
+ struct ips_flow tidgr_flow; /* tidflow */
+
+ uint32_t ctrl_msg_queued; /* bitmap of queued control messages to be sent */
+ uint32_t delay_in_ms; /* used in close */
+ uint64_t s_timeout; /* used as a time in close */
+ int credit;
+
+ pthread_mutex_t sesslock;
+ struct ptl_epaddr_stats stats;
+
+ uint32_t runid_key;
+ uint16_t psm_verno;
+ uint16_t connect_verno; /* The lowest connect version we can support */
+ uint16_t cstate_to;
+ uint16_t cstate_from;
+ psm_error_t cerror_to;
+ psm_error_t cerror_from;
+}
+__attribute__((aligned(64)));
+
+
+/*
+ * Send support on scbs.
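+ *
+ * ips_proto_flow_enqueue() appends an scb to the flow's unacked queue
+ * and seeds the pending list; the flush functions then drain the
+ * pending list, subject to flow credits, through PIO or send DMA.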
+ *
+ */
+void ips_flow_init(struct ips_flow *flow, ips_path_rec_t *path,
+ ips_epaddr_t *ipsaddr,
+ psm_transfer_type_t transfer_type,
+ psm_protocol_type_t protocol, ips_path_type_t path_type,
+ uint32_t flow_index);
+
+void ips_scb_prepare_flow(ips_scb_t *scb, struct ips_epinfo *epinfo,
+ struct ips_epinfo_remote *epr, struct ips_flow *flow);
+
+void ips_proto_flow_enqueue(struct ips_flow *flow, ips_scb_t *scb);
+
+psm_error_t ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed);
+psm_error_t ips_proto_flow_flush_dma(struct ips_flow *flow, int *nflushed);
+
+/* Wrapper for enqueue + flush */
+psm_error_t ips_proto_scb_pio_send(struct ips_flow *flow, ips_scb_t *scb);
+
+void ips_proto_scb_dma_enqueue(struct ips_proto *proto, ips_scb_t *scb);
+psm_error_t ips_proto_scb_dma_flush(struct ips_proto *proto, ips_epaddr_t *ipsaddr,
+ int *nflushed);
+psm_error_t ips_proto_dma_wait_until(struct ips_proto *proto, uint32_t dma_ctr);
+psm_error_t ips_proto_dma_wait(struct ips_proto *proto, uint32_t dma_ctr,
+ uint32_t *dma_ctr_out);
+
+psm_error_t ips_dma_transfer_frame(struct ips_proto *proto,
+ struct ips_flow *flow,
+ struct ips_pbc_header *pbc_hdr,
+ void *payload, uint32_t paylen,
+ uint32_t cksum);
+
+/* Special-case for expected sends */
+void ips_protoexp_scb_inflight(ips_scb_t *scb);
+
+/*
+ * Protocol receive processing
+ */
+/* NAK post processing for tidflows */
+void ips_tidflow_nak_post_process(struct ips_flow *flow,
+ struct ips_message_header *p_hdr);
+/* Actual receive processing is an inline in ips_proto_help.h */
+int ips_proto_process_packet_inner(struct ips_recvhdrq_event *rcv_ev);
+/* Error handling for unknown packets; a packet is unknown when its epid
+ * doesn't match in the epstate table */
+int ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev);
+/* Exposed for fastpath only */
+void ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev);
+/* Handling error cases */
+int ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev);
+
+/*
+ * Protocol exception handling and frame dumps
+ */
+void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len);
+void ips_proto_dump_err_stats(struct ips_proto *proto);
+void ips_proto_show_rhf_errors(const uint32_t *rhdr);
+void ips_proto_show_header(struct ips_message_header *p_hdr, char *msg);
+void ips_proto_dump_frame(void *frame, int length, char *message);
+void ips_proto_dump_data(void *data, int data_length);
+void ips_proto_dump_eager(uint32_t *curr_rcv_hdr);
+
+/*
+ * Checksum of ips packets
+ */
+uint32_t ips_crc_calculate(uint32_t len, uint8_t *data, uint32_t crc);
+
+/*
+ * Expected send support
+ */
+/*
+ * The expsend token is currently always a pointer to an MQ request. It is
+ * echoed on the wire throughout various phases of the expected send protocol
+ * to identify a particular send.
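+ * The completion callback registered through
+ * ips_protoexp_tid_get_from_token() below is invoked with the
+ * caller-supplied context pointer once the transfer finishes.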
+ */ +typedef void (*ips_tid_completion_callback_t)(void *); + +psm_error_t ips_protoexp_init(const psmi_context_t *context, + const struct ips_proto *proto, + uint32_t protoexp_flags, + int num_of_send_bufs, + int num_of_send_desc, + struct ips_protoexp **protoexp_o); + +psm_error_t ips_protoexp_fini(struct ips_protoexp *protoexp); +void ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev); + +void ips_protoexp_recv_unaligned_data(struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev); + +void ips_protoexp_tid_grant(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_tid_grant_ack(const struct ips_recvhdrq_event *rcv_ev); +int ips_protoexp_tid_release(const struct ips_recvhdrq_event *rcv_ev); +void ips_protoexp_tid_release_ack(const struct ips_recvhdrq_event *rcv_ev); + +int ips_protoexp_build_ctrl_message(struct ips_protoexp *protoexp, + struct ptl_epaddr *ipsaddr, + ptl_arg_t *args, + uint16_t *pkt_flags, + uint8_t opcode, void *payload); +psm_error_t ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc); + +/* + * Peer is waiting (blocked) for this request + */ +#define IPS_PROTOEXP_TIDGET_WAIT 0x1 +#define IPS_PROTOEXP_TIDGET_PEERWAIT 0x2 +psm_error_t ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, + void *buf, uint32_t length, + psm_epaddr_t epaddr, + uint32_t remote_tok, uint32_t flags, + ips_tid_completion_callback_t callback, + void *context); + +/* + * Matched-Queue processing and sends + */ +psm_error_t ips_proto_mq_push_eager_req(struct ips_proto *proto, + psm_mq_req_t req); +psm_error_t ips_proto_mq_push_eager_data(struct ips_proto *proto, + psm_mq_req_t req); + +int ips_proto_mq_handle_cts(struct ips_proto *proto, ptl_arg_t *args); + +int ips_proto_mq_handle_rts_envelope(psm_mq_t mq, int mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen); +int ips_proto_mq_handle_rts_envelope_outoforder(psm_mq_t mq, int mode, + psm_epaddr_t epaddr, uint16_t msg_seqnum, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen); + +psm_error_t ips_proto_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, + uint32_t len); + +psm_error_t ips_proto_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, + uint32_t flags, uint64_t tag, const void *ubuf, + uint32_t len, void *context, psm_mq_req_t *req_o); + +int ips_proto_am(struct ips_recvhdrq_event *rcv_ev); + +/* IBTA feature related functions (path record, sl2vl etc.) */ +psm_error_t ips_ibta_init_sl2vl_table(struct ips_proto *proto); +psm_error_t ips_ibta_link_updown_event(struct ips_proto *proto); +psm_error_t ips_ibta_init(struct ips_proto *proto); +psm_error_t ips_ibta_fini(struct ips_proto *proto); + +#endif /* _IPS_PROTO_H */ diff --git a/ptl_ips/ips_proto_am.c b/ptl_ips/ips_proto_am.c new file mode 100644 index 0000000..9f2bf18 --- /dev/null +++ b/ptl_ips/ips_proto_am.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "psm_am.h" +#include "psm_am_internal.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define IPS_AMFLAG_ISTINY 1 + +struct ips_am_token { + struct psmi_am_token tok; + + /* ptl-specific token stuff */ + struct ips_proto_am *proto_am; +}; + +psm_error_t +ips_proto_am_init(struct ips_proto *proto, + int num_of_send_bufs, int num_of_send_desc, + uint32_t imm_size, struct ips_proto_am *proto_am) +{ + psm_error_t err = PSM_OK; + int send_buf_size = proto->scb_bufsize; + + proto_am->proto = proto; + proto_am->scbc_request = &proto->scbc_egr; + + if ((err = ips_scbctrl_init(&proto->ep->context, num_of_send_desc, + num_of_send_bufs, imm_size, send_buf_size, + NULL, NULL, &proto_am->scbc_reply))) + goto fail; +fail: + return err; +} + +psm_error_t +ips_proto_am_fini(struct ips_proto_am *proto_am) +{ + return PSM_OK; +} + +static +psm_error_t +am_short_reqrep(struct ips_proto_am *proto_am, ips_scb_t *scb, + struct ptl_epaddr *ipsaddr, + psm_amarg_t *args, int nargs, uint8_t sub_opcode, + void *src, size_t len, int flags, int pad_bytes) + +{ + int i, hdr_qwords = PSM_AM_HDR_QWORDS; + ptl_epaddr_flow_t flowid = ((sub_opcode == OPCODE_AM_REQUEST) || + (sub_opcode == OPCODE_AM_REQUEST_NOREPLY)) ? + EP_FLOW_GO_BACK_N_AM_REQ : EP_FLOW_GO_BACK_N_AM_RSP; + struct ips_flow *flow = &ipsaddr->flows[flowid]; + + _IPATH_VDBG("%s src=%p len=%d, nargs=%d\n", + ((sub_opcode == OPCODE_AM_REQUEST) || + (sub_opcode == OPCODE_AM_REQUEST_NOREPLY)) ? 
"req" : "rep", + src, (int) len, nargs); + + if (nargs == 1) { /* fastpath */ + scb->ips_lrh.data[0].u64w0 = args[0].u64w0; + hdr_qwords--; + } + else if (nargs > 1) { + /* Easily unrollable but leave as is in case we can increase qwords + * on the chip in the near future */ + for (i = 0; i < PSM_AM_HDR_QWORDS; i++, hdr_qwords--) + scb->ips_lrh.data[i].u64w0 = args[i].u64w0; + + if (nargs > PSM_AM_HDR_QWORDS) { + /* Slow case -- we don't have iovec and not enough space in the + * message header, so we have to copy the user's arguments even if + * the payload is marked ASYNC */ + uintptr_t bufp = (uintptr_t) scb->payload; + psmi_mq_mtucpy((void *) bufp, &args[PSM_AM_HDR_QWORDS], + sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS)); + bufp += sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS); + scb->payload_size = sizeof(psm_amarg_t) * (nargs-PSM_AM_HDR_QWORDS); + if (src != NULL && len > 0) { + psmi_mq_mtucpy((void *) bufp, src, len); + scb->payload_size += len; + } + scb->payload_size += pad_bytes; + scb->ips_lrh.hdr_dlen = pad_bytes; + goto send_scb; + } + } + + /* + * If small enough, try to stuff the message in a header only + */ + if (len <= (hdr_qwords<<3)) { /* can handle len == 0 */ + psmi_mq_mtucpy(&scb->ips_lrh.data[PSM_AM_HDR_QWORDS-hdr_qwords], src, len); + scb->payload_size = 0; + scb->ips_lrh.hdr_dlen = len; + scb->ips_lrh.amhdr_flags |= IPS_AMFLAG_ISTINY; + } + else { /* Whatever's left requires a separate payload */ + if (scb->payload == NULL) { /* Just attach the buffer */ + scb->payload = src; + } + else { /* May need to re-xmit user data, keep it around */ + psmi_mq_mtucpy(scb->payload, src, len); + } + scb->payload_size = len + pad_bytes; + scb->ips_lrh.hdr_dlen = pad_bytes; + } + +send_scb: + scb->ips_lrh.sub_opcode = sub_opcode; + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + return PSM_OK; +} + +static inline int +calculate_pad_bytes (struct ips_proto_am *proto_am, int nargs, size_t len) +{ + if ((nargs <= PSM_AM_HDR_QWORDS) && + (len <= ((PSM_AM_HDR_QWORDS - nargs) << 3))) + return 0; + else { + size_t arg_overflow = (nargs > PSM_AM_HDR_QWORDS) ? + (sizeof(psm_amarg_t) * (nargs - PSM_AM_HDR_QWORDS)) : 0; + size_t cache_aligned_len = (len + arg_overflow + PSM_CACHE_LINE_BYTES-1) & + ~(PSM_CACHE_LINE_BYTES - 1); + if (cache_aligned_len <= proto_am->proto->scb_bufsize) + return cache_aligned_len - (len + arg_overflow); + else + return 0; + } +} + +static inline +void +ips_am_scb_init(ips_scb_t *scb, uint8_t handler, int nargs, + int pad_bytes, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + scb->completion_am = completion_fn; + scb->cb_param = completion_ctxt; + scb->ips_lrh.amhdr_hidx = handler; + scb->ips_lrh.hdr_dlen = pad_bytes; + scb->ips_lrh.amhdr_nargs = nargs; + scb->ips_lrh.amhdr_flags = 0; + if (completion_fn) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + return; +} + +psm_error_t +ips_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + struct ips_proto_am *proto_am = &epaddr->ptl->proto.proto_am; + psm_error_t err; + ips_scb_t *scb; + int pad_bytes = calculate_pad_bytes(proto_am, nargs, len); + int payload_sz = (nargs << 3) + pad_bytes; + + if_pt (!(flags & PSM_AM_FLAG_ASYNC)) + payload_sz += len; + + if (payload_sz > (PSM_AM_HDR_QWORDS << 3)) { + /* Payload can't fit in header - allocate buffer to carry data */ + int arg_sz = (nargs > PSM_AM_HDR_QWORDS) ? 
+ ((nargs - PSM_AM_HDR_QWORDS) << 3) : 0; + + /* len + pad_bytes + overflow_args */ + PSMI_BLOCKUNTIL(epaddr->ep,err, + ((scb = ips_scbctrl_alloc(proto_am->scbc_request, 1, + len + pad_bytes + arg_sz, + IPS_SCB_FLAG_ADD_BUFFER)) != NULL)); + } + else { + PSMI_BLOCKUNTIL(epaddr->ep,err, + ((scb = ips_scbctrl_alloc_tiny(proto_am->scbc_request)) != NULL)); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + + return am_short_reqrep(proto_am, scb, epaddr->ptladdr, args, nargs, + (flags & PSM_AM_FLAG_NOREPLY) ? + OPCODE_AM_REQUEST_NOREPLY : OPCODE_AM_REQUEST, + src, len, flags, pad_bytes); +} + +psm_error_t +ips_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt) +{ + ips_scb_t *scb; + struct ips_am_token *token = (struct ips_am_token *) tok; + struct ips_proto_am *proto_am = token->proto_am; + struct ptl_epaddr *ipsaddr = token->tok.epaddr_from->ptladdr; + int scb_flags = 0; + int pad_bytes = calculate_pad_bytes(proto_am, nargs, len); + + if (!token->tok.can_reply) { + /* Trying to reply for an AM request that did not expect a reply */ + _IPATH_ERROR("Invalid AM reply for request!"); + return PSM_AM_INVALID_REPLY; + } + + psmi_assert_always(ips_scbctrl_avail(&proto_am->scbc_reply)); + + if ((nargs<<3) + len <= (PSM_AM_HDR_QWORDS<<3)) { + psmi_assert_always(pad_bytes == 0); + scb = ips_scbctrl_alloc_tiny(&proto_am->scbc_reply); + } + else { + int payload_sz = (nargs << 3) + pad_bytes; + + payload_sz += (flags & PSM_AM_FLAG_ASYNC) ? 0 : len; + scb_flags |= (payload_sz > (PSM_AM_HDR_QWORDS << 3)) ? + IPS_SCB_FLAG_ADD_BUFFER : 0; + + scb = ips_scbctrl_alloc(&proto_am->scbc_reply, 1, payload_sz, scb_flags); + } + + psmi_assert_always(scb != NULL); + ips_am_scb_init(scb, handler, nargs, pad_bytes, + completion_fn, completion_ctxt); + am_short_reqrep(proto_am, scb, ipsaddr, args, nargs, OPCODE_AM_REPLY, + src, len, flags, pad_bytes); + return PSM_OK; +} + +/* Prepares and runs a handler from a receive event. */ +static int +ips_am_run_handler(struct ips_am_token *tok, + const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; + psm_am_handler_fn_t hfn; + + int nargs = p_hdr->amhdr_nargs; + tok->tok.flags = p_hdr->amhdr_flags; + tok->tok.epaddr_from = rcv_ev->ipsaddr->epaddr; + tok->tok.can_reply = (p_hdr->sub_opcode == OPCODE_AM_REQUEST); + tok->proto_am = proto_am; + + hfn = psm_am_get_handler_function(rcv_ev->proto->ep, + p_hdr->amhdr_hidx); + _IPATH_VDBG("amhdr_len=%d, amhdr_flags=%x, amhdr_nargs=%d, p_hdr=%p\n", + p_hdr->hdr_dlen, p_hdr->amhdr_flags, p_hdr->amhdr_nargs, p_hdr); + + /* Fast path: everything fits only in a header */ + if (tok->tok.flags & IPS_AMFLAG_ISTINY) { + return hfn(tok, tok->tok.epaddr_from, + (psm_amarg_t *) &p_hdr->data[0].u64, nargs, + &p_hdr->data[nargs].u64, p_hdr->hdr_dlen); + } + else { + /* Arguments and payload may split across header/eager_payload + * boundaries. 
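+ * The first PSM_AM_HDR_QWORDS arguments travel in the header; overflow
+ * arguments occupy the start of the eager payload, so paylen is
+ * trimmed by 8 bytes per overflow argument, and by hdr_dlen pad bytes,
+ * before the handler runs.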
*/ + psm_amarg_t args[8] = {}; + int i; + uint64_t *payload = (uint64_t *) ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + for (i = 0; i < nargs; i++) { + if (i < PSM_AM_HDR_QWORDS) + args[i].u64 = p_hdr->data[i].u64; + else { + args[i].u64 = *payload++; + paylen -= 8; + } + } + + paylen -= p_hdr->hdr_dlen; + return hfn(tok, tok->tok.epaddr_from, args, nargs, payload, paylen); + } +} + +int +ips_proto_am(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_am_token token; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ptl_epaddr *ipsaddr = rcv_ev->ipsaddr; + struct ips_proto_am *proto_am = &rcv_ev->proto->proto_am; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + +/* + * Based on AM request/reply traffic pattern, if we don't have + * a reply scb slot then we can't process the request packet, + * we just silently drop it. Otherwise, it will be a deadlock. + * note: ips_proto_is_expected_or_nak() can not be called in this case. + */ + if (p_hdr->sub_opcode == OPCODE_AM_REQUEST && + !ips_scbctrl_avail(&proto_am->scbc_reply)) { + proto_am->amreply_nobufs++; + return ret; + } + + if (ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) { + /* run handler */ + if (ips_am_run_handler(&token, rcv_ev)) + ret = IPS_RECVHDRQ_BREAK; + + /* Look if the handler replied, if it didn't, ack the request */ + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + ips_proto_process_ack(rcv_ev); + return ret; +} diff --git a/ptl_ips/ips_proto_am.h b/ptl_ips/ips_proto_am.h new file mode 100644 index 0000000..9e9ad06 --- /dev/null +++ b/ptl_ips/ips_proto_am.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _IPS_PROTO_AM_H +#define _IPS_PROTO_AM_H + +#include "psm_user.h" +#include "ips_scb.h" + +#define PSM_AM_HDR_QWORDS 2 /* Needs to be at least 2 */ + +struct ips_proto_am { + struct ips_proto *proto; /* back pointer */ + struct ips_scbctrl *scbc_request; + struct ips_scbctrl scbc_reply; + + uint64_t amreply_nobufs; +}; + +psm_error_t +ips_am_short_reply(psm_am_token_t tok, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm_error_t +ips_am_short_request(psm_epaddr_t epaddr, + psm_handler_t handler, psm_amarg_t *args, int nargs, + void *src, size_t len, int flags, + psm_am_completion_fn_t completion_fn, + void *completion_ctxt); + +psm_error_t ips_proto_am_init(struct ips_proto *proto, int num_of_send_bufs, + int num_of_send_desc, uint32_t imm_size, + struct ips_proto_am *proto_am); + +psm_error_t ips_proto_am_fini(struct ips_proto_am *proto_am); + +#endif /* _IPS_PROTO_AM_H */ diff --git a/ptl_ips/ips_proto_connect.c b/ptl_ips/ips_proto_connect.c new file mode 100644 index 0000000..3e73de8 --- /dev/null +++ b/ptl_ips/ips_proto_connect.c @@ -0,0 +1,1639 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define COMMIDX_MAX 65535 /* Last valid communication idx is 65535 */ + +/* Connections are not pairwise but we keep a single 'epaddr' for messages from + * and messages to a remote 'epaddr'. 
State transitions for connecting TO and + * FROM 'epaddrs' are the following: + * Connect TO: + * NONE -> WAITING -> ESTABLISHED -> WAITING_DISC -> DISCONNECTED -> NONE + * + * Connect FROM (we receive a connect request) + * NONE -> ESTABLISHED -> NONE + */ +#define CSTATE_ESTABLISHED 1 +#define CSTATE_NONE 2 +#define CSTATE_TO_DISCONNECTED 3 +#define CSTATE_TO_WAITING 4 +#define CSTATE_TO_WAITING_DISC 5 + +#define IPS_CONNECT_VERNO 0x0201 /* major,major,minor,minor */ +#define BIG_ENDIAN_TEST_WORD 0xA5A5 + +/* We can use up to 16 bits of features; we only use 6 of them for now. */ +#define EP_FEATURES_ENDIAN_BIG 0x0001 +#define EP_FEATURES_ENDIAN_LITTLE 0x0002 +#define EP_FEATURES_BITWIDTH_32 0x0004 +#define EP_FEATURES_BITWIDTH_64 0x0008 +#define EP_FEATURES_RCVTHREAD 0x8000 +#define EP_FEATURES_MULTIFLOW 0x4000 + +#define EP_FEATURES_NODETYPE 0x0f + +struct connect_msghdr { + uint8_t opcode; + uint8_t _unused1; + + uint16_t connect_verno; /* be */ + uint16_t psm_verno; /* be */ + uint16_t phase; /* be connect/disconnect phase (unused now) */ + + uint16_t hca_type; /* HCA type of remote endpoint */ + uint16_t sl; /* Default SL request for remote endpoint */ + uint32_t _unused[1]; + + psm_uuid_t uuid; +}; +#define IPS_CONNECT_MSGHDR_SIZE 32 /* 16 + 16-byte-uuid */ + +struct ips_connect_reqrep { + struct connect_msghdr hdr; + uint32_t flags; /* unused */ + uint16_t connect_result; /* be */ + + /* Per-job info */ + uint32_t commidx; /* ignore if 0xffffffff */ + uint32_t runid_key; /* one-time stamp connect key */ + uint16_t job_pkey; /* (future use) */ + uint64_t _unused1[4]; + + /* Per-node characteristics */ + uint32_t features; /* be - endpoint desc (endian + bitwidth) */ + uint16_t hdrq_msg_size; /* where is the header/eager cutoff */ + uint16_t mtu; /* receive payload */ + char hostname[128]; /* always NULL-terminated */ + uint64_t _unused2[4]; + + uint8_t version_1_offset[0]; +}; + +/* Used for sanity checking in processing message arrivals */ +#define IPS_CONNECT_REQREP_MINIMUM_SIZE \ + (offsetof(struct ips_connect_reqrep, version_1_offset)) +#define IPS_MAX_CONNECT_PAYLEN 512 + +struct ips_disconnect_reqrep { + struct connect_msghdr hdr; + uint32_t flags; /* unused */ + + uint16_t mode; + uint16_t _unused1[3]; + uint64_t _unused2[4]; + uint8_t version_1_offset[0]; +}; +/* Used for sanity checking in processing message arrivals */ +#define IPS_DISCONNECT_REQREP_MINIMUM_SIZE \ + (offsetof(struct ips_disconnect_reqrep, version_1_offset)) + +const struct ips_transfer_fn psmi_xfer_fn[PSM_TRANSFER_LAST] = + { + PIO_TRANSFER_FUNCTIONS, + DMA_TRANSFER_FUNCTIONS + }; + +const struct ips_protocol_fn psmi_protocol_fn[PSM_PROTOCOL_LAST] = + { + GO_BACK_N_PROTOCOL_FUNCTIONS, + TIDFLOW_PROTOCOL_FUNCTIONS + }; + +/* Startup protocol in PSM/IPS + * + * Start timer. + * + * For all nodes to connect to: + * Grab connect lock + * Look up epid in table + * MATCH. + * assert cstate_to != CONNECT_WAITING (no re-entrancy) + * If cstate_to == CONNECT_DONE + * return the already connected address. + * else + * assert cstate_to == CONNECT_NONE + * assert cstate_from == CONNECT_DONE + * cstate_to := CONNECT_WAITING + * assert commidx_to != UNKNOWN && commidx_from != UNKNOWN + * req->commidx := epaddr->commidx_from + * add to list of pending connect.
+ * NO MATCH + * allocate epaddr and put in table + * cstate_to := CONNECT_WAITING + * cstate_from := CONNECT_NONE + * commidx_to := UNKNOWN + * req->commidx := epaddr->commidx_from := NEW commidx integer + * add to list of pending connect + * Release connect lock + * + * expected_connect_count = ep->total_connect_count + num_to_connect + * while (expected_connect_count != ep->total_connect_count) + * check for timeout + * progress(); + * + * For all connection requests received (within progress loop) + * If uuid doesn't match, NAK the connect and skip request + * Grab connect lock + * Look up epid in table + * MATCH + * if cstate_from == CONNECT_DONE + * req->commidx := epaddr->commidx_from + * compose reply and send again (this is a dupe request). + * else + * assert cstate_from == CONNECT_NONE + * assert cstate_to == (CONNECT_WAITING | CONNECT_DONE) + * cstate_from := CONNECT_DONE + * epaddr->commidx_to := req->commidx + * req->commidx := epaddr->commidx_from + * NO MATCH + * allocate epaddr and put in table + * cstate_from := CONNECT_DONE + * epaddr->commidx_to = req->commidx; + * rep->commidx := epaddr->commidx_from := NEW commidx integer + * compose connect reply and send + * Release connect lock + * + * For all connection replies received: + * If connect_result != 0, process error and skip. + * assert cstate_to == CONNECT_WAITING + * if cstate_from == CONNECT_DONE + * assert rep->commidx == epaddr->commidx_to + * else + * epaddr->commidx_to := rep->commidx + * cstate_to := CONNECT_DONE + * ep->total_connect_count ++ + * + * * Fill in a connection request: + * 1. Set connect protocol version and PSM versions + * 2. Set the uuid attached to current endpoint and add the job_pkey + * the node wishes to communicate post-connect. + * 3. Set our mtu, bitwidth and endianness to detect inconsistencies + * + */ + +/* Due to an oversight in the initial protocol, only 16 of the 32 bits can + * actually be used because the little-to-big endian conversion was done with + * 16 bits from the first version in 2.0. */ +static +uint32_t +psmi_ips_node_features(psm_ep_t ep) +{ + uint32_t features = 0; + if (BIG_ENDIAN_TEST_WORD == __cpu_to_be16(BIG_ENDIAN_TEST_WORD)) + features |= EP_FEATURES_ENDIAN_BIG; + else + features |= EP_FEATURES_ENDIAN_LITTLE; + if (sizeof(uintptr_t) == 8) + features |= EP_FEATURES_BITWIDTH_64; + else + features |= EP_FEATURES_BITWIDTH_32; + if (ep->context.runtime_flags & PSMI_RUNTIME_RCVTHREAD) + features |= EP_FEATURES_RCVTHREAD; + features |= EP_FEATURES_MULTIFLOW; + + return features; +} + +static +int +node_matches_bitendian(psm_ep_t ep, uint32_t features) +{ + if ((features & EP_FEATURES_NODETYPE) == + (psmi_ips_node_features(ep) & EP_FEATURES_NODETYPE)) + return 1; + else + return 0; +} + +/* + * Given a connection request, set mtu, communication index and hdr length + * parameters. + * + * The most subtle parameter is the mtu. When set as 'req->mtu', the mtu + * is our connecting peer's declared mtu (which may not be the same as our + * mtu). The approach is to take the smaller of both mtus when communicating + * with that peer. Also, when using pio, the size can be further restricted by + * the pio send buffer sizes (i.e. 4K IB MTU but only 2K PIO buffers). + */
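+
+/* Editorial sketch (not part of the original import): the size negotiation
+ * described above, reduced to its arithmetic.  The function name and
+ * parameters are illustrative only; the real logic lives in
+ * ips_ipsaddr_set_req_params() directly below.  Compiled out via #if 0. */
+#if 0
+static uint32_t
+example_negotiated_piosize(uint32_t peer_declared_mtu, uint32_t local_mtu,
+			   uint32_t local_piosize, uint32_t cacheline)
+{
+	uint32_t peer_mtu = min(peer_declared_mtu, local_mtu);
+	uint32_t piosize = min(peer_mtu, local_piosize);
+	/* e.g. 4K IB MTU but 2K PIO buffers: min(4096, 2048) == 2048 */
+	if (piosize > cacheline)
+		piosize &= ~(cacheline - 1);	/* round down to a cache line */
+	return piosize;
+}
+#endif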
+static +psm_error_t +ips_ipsaddr_set_req_params(struct ips_proto *proto, + ips_epaddr_t *ipsaddr, + const struct ips_connect_reqrep *req, + uint32_t paylen) +{ + psmi_assert_always(req->mtu > 0); + + uint32_t peer_mtu = min(req->mtu, proto->epinfo.ep_mtu); + + ipsaddr->epr.epr_piosize = min(peer_mtu, proto->epinfo.ep_piosize); + ipsaddr->epr.epr_hca_type= req->hdr.hca_type; + + if (ipsaddr->epr.epr_piosize > PSM_CACHE_LINE_BYTES) + ipsaddr->epr.epr_piosize &= ~(PSM_CACHE_LINE_BYTES - 1); + + /* + * DMA is bounded by the peer's mtu but also by our local PIO send size + */ + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].frag_size = ipsaddr->epr.epr_piosize; + ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].frag_size = peer_mtu; + ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ].frag_size=ipsaddr->epr.epr_piosize; + ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP].frag_size=ipsaddr->epr.epr_piosize; + + ipsaddr->epr.epr_commidx_to = req->commidx; + + /* + * For static routes, i.e. "none" path resolution, update all paths to + * have the same profile (mtu, sl etc.). + * + * For path record queries the epr_mtu and epr_sl are setup correctly + * from the path itself. + */ + if (proto->ep->path_res_type == PSM_PATH_RES_NONE) { + int ptype, pidx; + for (ptype = IPS_PATH_LOW_PRIORITY; ptype < IPS_PATH_MAX_PRIORITY;ptype++) + for (pidx = 0; pidx < ipsaddr->epr.epr_num_paths[ptype]; pidx++) { + ipsaddr->epr.epr_path[ptype][pidx]->epr_mtu = peer_mtu; + ipsaddr->epr.epr_path[ptype][pidx]->epr_sl = req->hdr.sl; + } + } + + if (paylen > sizeof(struct ips_connect_reqrep)) { + int count; + char *p = (char *)(req + 1); + paylen -= sizeof(struct ips_connect_reqrep); + if (paylen%(sizeof(uint64_t)+sizeof(psm_epid_t))) { + return PSM_INTERNAL_ERR; + } + count = paylen / (sizeof(uint64_t)+sizeof(psm_epid_t)); + if (count > IPATH_MAX_UNIT) return PSM_INTERNAL_ERR; + + memcpy(ipsaddr->epaddr->mctxt_gidhi, p, count*sizeof(uint64_t)); + p += count*sizeof(uint64_t); + memcpy(ipsaddr->epaddr->mctxt_epid, p, count*sizeof(psm_epid_t)); + ipsaddr->epaddr->mctxt_epcount = count; + } + + return psmi_epid_set_hostname(psm_epid_nid(ipsaddr->epaddr->epid), + (char*) req->hostname, 0); +} + +static psm_error_t __recvpath +ips_proto_send_ctrl_message_request(struct ips_proto *proto, + struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload, + uint64_t timeout) +{ + psm_error_t err = PSM_OK; + + while (get_cycles() < timeout) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, payload); + if (err == PSM_OK) { + break; + } + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) { + break; + } + } + return err; +} + +static psm_error_t __recvpath +ips_proto_send_ctrl_message_reply(struct ips_flow *flow, uint8_t message_type, + uint32_t *msg_queue_mask, void *payload) +{ + /* This will try up to 100 times until the message is sent. The code + * is persistent because dropping replies will lead to a lack of + * overall progress on the connection/disconnection. We do not want + * to poll from here, and we cannot afford a lengthy timeout, since + * this is called from the receive path.
+ */ + psm_error_t err = PSM_OK; + int i; + for (i = 0; i < 100; i++) { + err = ips_proto_send_ctrl_message(flow, message_type, + msg_queue_mask, payload); + if (err == PSM_OK) { + break; + } + } + return err; +} + +int +ips_proto_build_connect_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ipsaddr, uint8_t opcode, + void *payload) +{ + struct connect_msghdr *hdr = (struct connect_msghdr *) payload; + struct ips_connect_reqrep *req = + (struct ips_connect_reqrep *) payload; + uint32_t paylen = sizeof(struct connect_msghdr); + + /* Write standard header that goes out on all connect msgs */ + hdr->connect_verno = __cpu_to_be16(IPS_CONNECT_VERNO); + hdr->psm_verno = __cpu_to_be16(PSMI_VERNO); + hdr->opcode = opcode; + hdr->phase = 0; + hdr->hca_type = proto->epinfo.ep_hca_type; + hdr->sl = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl; + + /* Sometimes we simply echo disconnect requests since we can get dupe + * disconnect requests. Unless that's the case, we always send the full + * uuid */ + psmi_assert_always(proto != NULL); + memcpy(&hdr->uuid, &proto->ep->key, sizeof(psm_uuid_t)); + + switch (opcode) { + case OPCODE_CONNECT_REPLY: + case OPCODE_CONNECT_REQUEST: +#if 0 + psmi_assert_always(ipsaddr->cerror_from != PSM_OK || + !COMMIDX_IS_UNKNOWN(proto, ipsaddr->commidx_from)); +#endif + if (opcode == OPCODE_CONNECT_REQUEST) { + req->connect_result = __cpu_to_be16(PSM_OK); + req->runid_key = proto->runid_key; + } + else { + req->connect_result = __cpu_to_be16(ipsaddr->cerror_from); + req->runid_key = ipsaddr->runid_key; + } + req->flags = 0; + req->commidx = (uint32_t) ipsaddr->epr.epr_commidx_from; + req->job_pkey = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_pkey; + + req->features = + __cpu_to_be16(psmi_ips_node_features(proto->ep)); + req->hdrq_msg_size = proto->epinfo.ep_hdrq_msg_size; + req->mtu = ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_mtu; + strncpy(req->hostname, psmi_gethostname(), + sizeof(req->hostname) - 1); + req->hostname[sizeof(req->hostname) - 1] = '\0'; + paylen = sizeof(struct ips_connect_reqrep); + + /* Attach all multi-context subnetids and epids. */
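+	/* Editorial illustration (not part of the original import): for a
+	 * master endpoint with N secondary contexts, the request payload
+	 * built below is laid out as
+	 *
+	 *   [struct ips_connect_reqrep][uint64_t gid_hi x N][psm_epid_t epid x N]
+	 *
+	 * and is unpacked by the peer in ips_ipsaddr_set_req_params(). */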
+ if (proto->ep->mctxt_master == proto->ep) { + psm_epid_t *epid; + psm_ep_t ep = proto->ep->mctxt_next; + uint64_t *subnetid = (uint64_t *)(req + 1); + /* first all subnetids */ + while (ep != proto->ep) { + *subnetid = ep->gid_hi; + subnetid++; + ep = ep->mctxt_next; + paylen += sizeof(uint64_t); + } + ep = proto->ep->mctxt_next; + epid = (psm_epid_t *)subnetid; + /* second all epids */ + while (ep != proto->ep) { + *epid = ep->epid; + epid++; + ep = ep->mctxt_next; + paylen += sizeof(psm_epid_t); + } + } + psmi_assert_always(paylen <= IPS_MAX_CONNECT_PAYLEN); + break; + + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + paylen = sizeof(struct ips_disconnect_reqrep); + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected/unhandled connection opcode 0x%x\n", + opcode); + break; + } + return paylen; +} + +void +ips_flow_init(struct ips_flow *flow, ips_path_rec_t *path, ips_epaddr_t *ipsaddr, psm_transfer_type_t transfer_type, psm_protocol_type_t protocol, ips_path_type_t path_type, uint32_t flow_index) +{ + struct ips_proto *proto = ipsaddr->proto; + + psmi_assert_always(protocol < IPS_MAX_PROTOCOL); + psmi_assert_always(flow_index < IPS_MAX_FLOWINDEX); + + SLIST_NEXT(flow, next) = NULL; + flow->fn.xfer = psmi_xfer_fn[transfer_type]; + flow->fn.protocol = psmi_protocol_fn[protocol]; + + /* If path is not specified pick one accordingly */ + if (!path) + path = ips_select_path(proto, path_type, ipsaddr); + + flow->path = path; + flow->ipsaddr = ipsaddr; + flow->epinfo = &proto->epinfo; + flow->transfer= transfer_type; + flow->protocol= protocol; + flow->flowid = IPS_FLOWID_PACK(protocol, flow_index); + flow->xmit_seq_num.val = 0; + flow->xmit_ack_num.val = 0; + flow->xmit_ack_num.pkt--; /* last acked */ + flow->recv_seq_num.val = 0; + flow->flags = 0; + flow->sl = flow->path->epr_sl; + flow->cca_ooo_pkts = 0; + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); + flow->scb_num_pending = 0; + flow->scb_num_unacked = 0; + + psmi_timer_entry_init(&(flow->timer_ack), + ips_proto_timer_ack_callback, flow); + + psmi_timer_entry_init(&(flow->timer_send), + ips_proto_timer_send_callback, flow); + + STAILQ_INIT(&flow->scb_unacked); + SLIST_INIT(&flow->scb_pend); + return; +} + +static +size_t +epaddr_size() +{ + return (size_t) (sizeof(struct psm_epaddr) + sizeof(struct ptl_epaddr)); +} + +static +psm_error_t +ips_init_ep_qp_and_pkt_context(uint16_t hca_type, uint32_t qp, + uint32_t context, ips_epaddr_t *ipsaddr) +{ + psm_error_t err = PSM_OK; + switch(hca_type) { + case PSMI_HCA_TYPE_QLE73XX: + /* Bit 5 of the context is inserted into bit 0 of QP */ + ipsaddr->epr.epr_qp = (qp & ~0x1) | (context >> 4); + ipsaddr->epr.epr_pkt_context = context & 0xf; + break; + case PSMI_HCA_TYPE_QLE72XX: + if (context == 16) { + /* For context 16, the bottom bit of qp is toggled */ + ipsaddr->epr.epr_qp = qp ^ 1; + ipsaddr->epr.epr_pkt_context = 15; + } + else { + ipsaddr->epr.epr_qp = qp; + ipsaddr->epr.epr_pkt_context = context; + } + break; + case PSMI_HCA_TYPE_QLE71XX: + ipsaddr->epr.epr_qp = qp; + ipsaddr->epr.epr_pkt_context = context; + break; + default: + err = PSM_PARAM_ERR; + break; + } + return err; +} + +static +psm_epaddr_t +ips_alloc_epaddr(struct ips_proto *proto, psm_epid_t epid, + const char *hostname, unsigned long timeout) +{ + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + uint64_t lid, context, subcontext; + uint16_t hca_type, path_dlid; + uint16_t
lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + int i; + ips_path_type_t prio; + + /* The PSM/PTL-level and ips-level epaddr structures are colocated in + * memory for performance reasons -- this is why ips allocates memory for + * both the PSM/PTL-level and ips-level epaddr structure. + * + * The PSM/PTL structure data is filled in upon successful ep connect in + * ips_ptl_connect(). + */ + epaddr = (psm_epaddr_t) psmi_calloc(proto->ep, PER_PEER_ENDPOINT, + 1, epaddr_size()); + if (epaddr == NULL) + return NULL; + + epaddr->ptl = proto->ptl; + epaddr->ptlctl = proto->ptl->ctl; + epaddr->ep = proto->ep; + STAILQ_INIT(&epaddr->egrlong); + STAILQ_INIT(&epaddr->egrdata); + epaddr->xmit_egrlong.egr_data = 0; + epaddr->outoforder_q.first = NULL; + epaddr->outoforder_q.lastp = &epaddr->outoforder_q.first; + epaddr->mctxt_master = epaddr; + epaddr->mctxt_current = epaddr; + epaddr->mctxt_prev = epaddr->mctxt_next = epaddr; + + /* IPS-level epaddr */ + ipsaddr = (ips_epaddr_t *)(epaddr+1); + epaddr->ptladdr = ipsaddr; + + ipsaddr->ptl = proto->ptl; + ipsaddr->mq = proto->mq; + ipsaddr->epaddr = epaddr; + ipsaddr->proto = proto; + + /* Setup base fields for remote epid before doing path record lookup: + */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + /* Actual context of peer */ + ipsaddr->epr.epr_context = context; + + /* Setup remote endpoint */ + err = ips_init_ep_qp_and_pkt_context(hca_type, proto->epinfo.ep_baseqp, + context, ipsaddr); + if (err != PSM_OK) { + _IPATH_ERROR("Connect: Warning! unknown HCA type %d. Assuming remote HCA is same as local.\n", hca_type); + ips_init_ep_qp_and_pkt_context(hca_type, proto->epinfo.ep_baseqp, + PSMI_EPID_GET_CONTEXT(proto->ep->epid), ipsaddr); + } + + /* Subcontext */ + ipsaddr->epr.epr_subcontext = subcontext; + + /* Get path record for tuple */ + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), hca_type, timeout, + ipsaddr); + if (err != PSM_OK) { + psmi_free(epaddr); + return NULL; + } + + /* Determine base lid across all paths */ + ipsaddr->epr.epr_base_lid = + __be16_to_cpu(ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_dlid); + + for (prio = IPS_PATH_LOW_PRIORITY; prio < IPS_PATH_MAX_PRIORITY; prio++) + for (i = 0; i < ipsaddr->epr.epr_num_paths[prio]; i++) { + path_dlid = __be16_to_cpu(ipsaddr->epr.epr_path[prio][i]->epr_dlid); + if (path_dlid < ipsaddr->epr.epr_base_lid) + ipsaddr->epr.epr_base_lid = path_dlid; + } + + + /* Finally construct the resolved epaddr->epid for this peer (For torus, + * SL and even the lid may be different!) + */ + path_dlid = ipsaddr->epr.epr_base_lid & lmc_mask; + + epaddr->epid = + PSMI_EPID_PACK_EXT(path_dlid, + context, subcontext, + hca_type, + ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl); + + /* Add this epid as a known hostname to our epid hostname db */ + if (psmi_epid_set_hostname(psm_epid_nid(epid), hostname, 0)) + return NULL; + + ipsaddr->flags = 0; + + /* All flows are over BULK path. Only control messages use the high + * priority CONTROL path. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], NULL, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_PIO); + + /* DMA flow uses the same path as PIO flow due to multi MTU sized + * eager messages.
If we use separate paths we are more likely to have + * payload arrive out of order with respect to envelope leading to + * unnecessary NAKs. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA], + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path, + ipsaddr, PSM_TRANSFER_DMA, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_DMA); + + /* AM Request messages also use the same path as the PIO flow as they + * also require order with respect to the MPI request messages. + */ + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ], + ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].path, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_AM_REQ); + + ips_flow_init(&ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP], NULL, + ipsaddr, PSM_TRANSFER_PIO, PSM_PROTOCOL_GO_BACK_N, + IPS_PATH_NORMAL_PRIORITY, EP_FLOW_GO_BACK_N_AM_RSP); + + /* tidflow for tid get request */ + ips_flow_init(&ipsaddr->tidgr_flow, NULL, ipsaddr, + PSM_TRANSFER_DMA, PSM_PROTOCOL_TIDFLOW, + IPS_PATH_LOW_PRIORITY, 0); + + ipsaddr->cstate_to = CSTATE_NONE; + ipsaddr->cstate_from = CSTATE_NONE; + + /* For now, set these to our PSM versions and connect versions. They will + * be overwritten to the peer's versions in handling connection reqs + */ + ipsaddr->psm_verno = PSMI_VERNO; + ipsaddr->connect_verno = IPS_CONNECT_VERNO; + + /* Add epaddr to PSM's epid table */ + psmi_epid_add(proto->ep, epaddr->epid, epaddr); + psmi_assert_always(psmi_epid_lookup(proto->ep, epaddr->epid) == epaddr); + + return epaddr; +} + +static +void +ips_free_epaddr(ips_epaddr_t *ipsaddr) +{ + psm_epaddr_t epaddr = ipsaddr->epaddr; + _IPATH_VDBG("epaddr=%p,ipsaddr=%p,commidx_from=%d\n", epaddr, ipsaddr, + ipsaddr->epr.epr_commidx_from); + psmi_epid_remove(ipsaddr->proto->ep, epaddr->epid); + ips_epstate_del(ipsaddr->proto->epstate, ipsaddr->epr.epr_commidx_from); + psmi_free(epaddr); + return; +} + +static psm_error_t ips_get_addr_from_epid(struct ips_proto *proto, + psm_epid_t epid, + unsigned long timeout, + psm_epaddr_t *epaddr) +{ + psm_error_t err; + uint64_t lid, context, subcontext; + uint16_t hca_type, path_dlid; + psm_epid_t path_epid; + psm_epaddr_t ep_address = NULL; + uint16_t lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + ips_epaddr_t ipsaddr; + + /* First unpack to get slid/dlid. */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + hca_type = PSMI_EPID_GET_HCATYPE(epid); + + /* Get path record for tuple */ + err = proto->ibta.get_path_rec(proto, proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), hca_type, + timeout, &ipsaddr); + if (err != PSM_OK) + goto fail; + + /* Generate path epid to do lookup on - uses the SL from the path record. + */
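+	/* Editorial illustration (not part of the original import): with
+	 * ep_lmc == 2, lmc_mask == (uint16_t)~0x3 == 0xfffc, so LMC-derived
+	 * path dlids 0x1c04..0x1c07 all mask down to base lid 0x1c04 and
+	 * therefore look up the same cached epid. */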
+ path_dlid = (__be16_to_cpu(ipsaddr.epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_dlid)) & lmc_mask; + + path_epid = + PSMI_EPID_PACK_EXT(path_dlid, + context, subcontext, hca_type, + ipsaddr.epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_sl); + ep_address = psmi_epid_lookup(proto->ep, path_epid); + + fail: + *epaddr = ep_address; + return err; +} + +static +psm_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm_epid_t epid, + psm_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen, int uuid_valid); + +psm_error_t +ips_proto_process_connect(struct ips_proto *proto, psm_epid_t epid, + uint8_t opcode, struct ips_message_header *p_hdr, + void *payload, uint32_t paylen) +{ + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr; + struct connect_msghdr *hdr; + uint16_t connect_result; + psm_ep_t ep = proto->ep; + int uuid_valid; + int uwords = (proto->epinfo.ep_hdrq_msg_size>>2) - + IPS_HEADER_QUEUE_IWORDS - IPS_HEADER_QUEUE_HWORDS; + int hdrq_extra; + uint32_t lid, context, subcontext; + uint16_t lmc_mask = ~((1 << proto->epinfo.ep_lmc) - 1); + + PSMI_PLOCK_ASSERT(); + + struct ips_connect_reqrep *req; + psm_error_t err = PSM_OK; + + /* If the sender doesn't have the same header/eager cutoff, we need to make + * sure we copy the connect data into a contiguous buffer */ + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + + hdrq_extra = uwords - p_hdr->hdr_dlen; + if (hdrq_extra != 0) { + uint32_t *bufp = (uint32_t *) buf; + uint32_t *payp = (uint32_t *) payload; + _IPATH_VDBG("hdrq_extra is %d, uwords=%d, inwords=%d\n", + hdrq_extra, uwords, p_hdr->hdr_dlen); + if (hdrq_extra > 0) { /* some of it went into our hdrq */ + psmi_mq_mtucpy(bufp, &p_hdr->data[0].u32w0 + p_hdr->hdr_dlen, + hdrq_extra<<2); + psmi_mq_mtucpy(bufp+hdrq_extra, payload, paylen); + paylen += (hdrq_extra<<2); + } + else { /* we got some useless padding in eager */ + hdrq_extra = -hdrq_extra; + paylen -= (hdrq_extra<<2); + psmi_mq_mtucpy(bufp, payp + hdrq_extra, paylen); + } + payload = buf; + } + + hdr = (struct connect_msghdr *) payload; + if (paylen < sizeof(struct connect_msghdr)) { /* drop */ + _IPATH_PRDBG("dropping unknown connect message of length %d\n", paylen); + return PSM_OK; + } + + /* Obtain HCA type and SL from request and regenerate epid */ + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + epid = PSMI_EPID_PACK_EXT(lid & lmc_mask, context, subcontext, hdr->hca_type, hdr->sl); + + /* Don't need to call ips_get_addr_from_epid as the epid cache is keyed + * off the IPS_PATH_HIGH_PRIORITY dlid and the SL which we already have from + * the connect request (as all control messages use the CONTROL path). + */ + epaddr = psmi_epid_lookup(proto->ep, epid); + ipsaddr = epaddr ? epaddr->ptladdr : NULL; + + uuid_valid = (psmi_uuid_compare(ep->key, hdr->uuid) == 0); + + if ((opcode == OPCODE_CONNECT_REQUEST || opcode == OPCODE_CONNECT_REPLY) && + paylen < IPS_CONNECT_REQREP_MINIMUM_SIZE) + { + uint64_t lid, context, subcontext; + char *type = opcode == OPCODE_CONNECT_REQUEST ? "request" : "reply"; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + psmi_syslog(proto->ep, 1, LOG_INFO, + "Unrecognized connect %s (size is %d instead of %d) " + "from epid %ld:%ld:%ld\n", type, paylen, + (int) IPS_CONNECT_REQREP_MINIMUM_SIZE, + (long) lid, (long) context, (long) subcontext); + goto fail; /* Not fatal, just drop the packet */ + } + + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + err = ptl_handle_connect_req(proto, epid, epaddr, + (struct ips_connect_reqrep *) payload, paylen, uuid_valid); + break; + + case OPCODE_CONNECT_REPLY: + req = (struct ips_connect_reqrep *) payload; + if (!ipsaddr || req->runid_key != proto->runid_key) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_PRDBG("Unknown connectrep (ipsaddr=%p, %d,%d) " + "from epid %ld:%ld:%ld bad_uuid=%s\n", + ipsaddr, req->runid_key, proto->runid_key, + (long) lid, (long) context, (long) subcontext, + uuid_valid ? "NO" : "YES"); + break; + } + if (ipsaddr->cstate_to != CSTATE_TO_WAITING) { + /* possible dupe */ + _IPATH_VDBG("connect dupe, expected %d got %d\n", + CSTATE_TO_WAITING, ipsaddr->cstate_to); + break; + } + connect_result = __be16_to_cpu(req->connect_result); + + /* Reply to our request for connection (i.e. outgoing connection) */ + if (ipsaddr->cstate_from != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) goto fail; + } + ipsaddr->cstate_to = CSTATE_ESTABLISHED; + ipsaddr->cerror_to = connect_result; + + break; + + case OPCODE_DISCONNECT_REQUEST: + { + ips_epaddr_t ipsaddr_f; /* fake a ptl addr */ + int ipsaddr_do_free = 0; + psmi_assert_always(paylen >= IPS_DISCONNECT_REQREP_MINIMUM_SIZE); + _IPATH_VDBG("Got a disconnect from %s\n", psmi_epaddr_get_name(epid)); + proto->num_disconnect_requests++; + /* It's possible to get a disconnection request on an ipsaddr that + * we've since removed if the request is a dupe. Instead of + * silently dropping the packet, we "echo" the request in the + * reply. */ + if (ipsaddr == NULL) { + uint16_t src_context = IPS_HEADER_SRCCONTEXT_GET(p_hdr); + uint32_t qp; + + ipsaddr = &ipsaddr_f; + memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); + ipsaddr_f.epr.epr_context = src_context; + ipsaddr_f.epr.epr_subcontext = p_hdr->src_subcontext; + + /* QLE72XX is special for context 16 */ + if ((hdr->hca_type == PSMI_HCA_TYPE_QLE72XX) && + (src_context == 16)) + ipsaddr_f.epr.epr_pkt_context = 15; + + /* Get path record for peer */ + err = proto->ibta.get_path_rec(proto, + proto->epinfo.ep_base_lid, + __cpu_to_be16(lid), + hdr->hca_type, + 3000, &ipsaddr_f); + if (err != PSM_OK) + goto fail; + + qp = proto->epinfo.ep_baseqp; + err = ips_init_ep_qp_and_pkt_context(hdr->hca_type, qp, + src_context, &ipsaddr_f); + if (err != PSM_OK) { + _IPATH_ERROR("Disconnect: Warning!
unknown HCA type %d.\n", hdr->hca_type); + goto fail; + } + + ipsaddr_f.proto = proto; + ipsaddr_f.ptl = (ptl_t *) -1; + /* If the send fails because of pio_busy, don't let ips queue + * the request on an invalid ipsaddr, just drop the reply */ + ipsaddr_f.ctrl_msg_queued = ~0; + ips_flow_init(&ipsaddr_f.flows[EP_FLOW_GO_BACK_N_PIO], NULL, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, + EP_FLOW_GO_BACK_N_PIO); + _IPATH_VDBG("Disconnect on unknown epaddr, just echo request\n"); + } + else if (ipsaddr->cstate_from != CSTATE_NONE) { + ipsaddr->cstate_from = CSTATE_NONE; + proto->num_connected_from--; + if (ipsaddr->cstate_to == CSTATE_NONE) { + ipsaddr_do_free = 1; + } + if (!uuid_valid) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_VDBG("Unknown disconnect request from epid %d:%d.%d " + "bad_uuid=%s\n", (int) lid, + (int) context, (int) subcontext, uuid_valid ? "NO" : "YES"); + } + } + + memset(buf, 0, sizeof buf); + ips_proto_send_ctrl_message_reply(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REPLY, + &ipsaddr->ctrl_msg_queued, buf); + /* We can safely free the ipsaddr if required since disconnect + * messages are never enqueued so no reference to ipsaddr is kept */ + if (ipsaddr_do_free) + ips_free_epaddr(ipsaddr); + } + break; + + case OPCODE_DISCONNECT_REPLY: + if (!ipsaddr || !uuid_valid) { + uint64_t lid, context, subcontext; + lid = PSMI_EPID_GET_LID(epid); + context = PSMI_EPID_GET_CONTEXT(epid); + subcontext = PSMI_EPID_GET_SUBCONTEXT(epid); + _IPATH_VDBG("Unknown disconnect reply from epid %d:%d.%d bad_uuid=%s\n", + (int) lid, (int) context, (int) subcontext, + uuid_valid ? "NO" : "YES"); + break; + } + else if (ipsaddr->cstate_to == CSTATE_TO_WAITING_DISC) { + ipsaddr->cstate_to = CSTATE_TO_DISCONNECTED; + /* Freed in disconnect() if cstate_from == NONE */ + } /* else dupe reply */ + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unexpected/unhandled connect opcode 0x%x\n", + opcode); + } +fail: + return err; +} + +static +psm_error_t +ptl_handle_connect_req(struct ips_proto *proto, psm_epid_t epid, + psm_epaddr_t epaddr, struct ips_connect_reqrep *req, + uint32_t paylen, int uuid_valid) +{ + ips_epaddr_t *ipsaddr; + psm_error_t err = PSM_OK; + uint16_t connect_result = PSM_OK; + uint16_t psm_verno; + uint16_t c_verno; + uint16_t features; + int newconnect = 0; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + + if (epid == proto->ep->epid) { + /* For 2.0, we won't expose handling for this error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, + "Network connectivity problem: Locally detected duplicate " + "LIDs 0x%04x on hosts %s and %s. (Exiting)", + (uint32_t) psm_epid_nid(epid), + psmi_epaddr_get_hostname(epid), + psmi_gethostname()); + /* XXX no return */ + abort(); + } + else if (epaddr == NULL) { /* new ep connect before we call into connect */ + newconnect = 1; + if ((epaddr = ips_alloc_epaddr(proto, epid, req->hostname, + 5000)) == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + ipsaddr = epaddr->ptladdr; + if (ipsaddr->cstate_from == CSTATE_ESTABLISHED) { + /* Duplicate lid detection. 
*/ + if (ipsaddr->runid_key == req->runid_key && uuid_valid) + goto do_reply; /* duplicate request, not duplicate lid */ + else if (uuid_valid) { + /* True blue duplicate lid, both connect messages are part of the + * same context since they use the same uuid */ + /* For 2.0, we won't expose handling for this error */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, + "Network connectivity problem: Detected duplicate " + "LIDs 0x%x on hosts %s (key=%d) and %s (key=%d). (Exiting)", + (uint32_t) psm_epid_nid(ipsaddr->epaddr->epid), + psmi_epaddr_get_hostname(epid), + ipsaddr->runid_key, + req->hostname, + req->runid_key); + } + else { /* Some out of context message. Just drop it */ + if (!proto->done_warning) { + psmi_syslog(proto->ep, 1, LOG_INFO, + "Non-fatal connection problem: Received an out-of-context " + "connection message from host %s LID=0x%x context=%d. (Ignoring)", + req->hostname, (int) psm_epid_nid(epid), psm_epid_context(epid)); + proto->done_warning = 1; + } + goto no_reply; + } + } + psmi_assert_always(ipsaddr->cstate_from == CSTATE_NONE); + + /* Save requestor's connection and psm version numbers */ + c_verno = __be16_to_cpu(req->hdr.connect_verno); + psm_verno = __be16_to_cpu(req->hdr.psm_verno); + features = __be16_to_cpu(req->features); + + /* On PSM pre-2.0, just print message and exit if the connect version + * number is not at least 0x0201 */ + if (c_verno < 0x0201) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_INVALID_VERSION, + "Connect protocol (%x,%x) is obsolete and incompatible", + (c_verno >> 8) & 0xff, c_verno & 0xff); + connect_result = PSM_EPID_INVALID_CONNECT; + } + /* Whenever there's a protocol change, adjust handling here */ + else if ((IPS_CONNECT_VERNO & 0xff00) != (ipsaddr->connect_verno & 0xff00)) { + connect_result = PSM_EPID_INVALID_VERSION; + } + else if (!node_matches_bitendian(proto->ep, features)) + connect_result = PSM_EPID_INVALID_NODE; + else if (!psmi_verno_isinteroperable(__be16_to_cpu(req->hdr.psm_verno))) { + connect_result = PSM_EPID_INVALID_VERSION; + } + else if (!(proto->flags & IPS_PROTO_FLAG_QUERY_PATH_REC) && + proto->epinfo.ep_pkey != IPATH_DEFAULT_P_KEY && + proto->epinfo.ep_pkey != req->job_pkey) { + connect_result = PSM_EPID_INVALID_PKEY; + } + else if (!uuid_valid) { + char ep_key[37], req_key[37]; + connect_result = PSM_EPID_INVALID_UUID_KEY; + psmi_uuid_unparse(proto->ep->key, ep_key); + psmi_uuid_unparse(req->hdr.uuid, req_key); + _IPATH_PRDBG("UUID key mismatch request key=%s endpoint key=%s\n", + req_key, ep_key); + } + else if (!psmi_verno_isinteroperable(ipsaddr->psm_verno)) { + connect_result = PSM_INIT_BAD_API_VERSION; + } + else { + connect_result = PSM_OK; + if (ipsaddr->cstate_to == CSTATE_NONE) { + ips_epstate_idx idx; + psmi_assert_always(newconnect == 1); + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->epr.epr_commidx_from = idx; + } + } + ipsaddr->connect_verno = c_verno; + ipsaddr->psm_verno = psm_verno; + + /* Incoming connection request */ + if (ipsaddr->cstate_to != CSTATE_ESTABLISHED) { + err = ips_ipsaddr_set_req_params(proto, ipsaddr, req, paylen); + if (err) goto fail; + } + ipsaddr->cstate_from = CSTATE_ESTABLISHED; + ipsaddr->cerror_from = connect_result; + + ipsaddr->runid_key = req->runid_key; + ipsaddr->flags |= features & EP_FEATURES_RCVTHREAD ? + SESS_FLAG_HAS_RCVTHREAD : 0; + ipsaddr->flags |= proto->ep->context.runtime_flags & PSMI_RUNTIME_RCVTHREAD ? + SESS_FLAG_LOCK_SESS : 0; + ipsaddr->flags |= features & EP_FEATURES_MULTIFLOW ? 
+ SESS_FLAG_HAS_FLOWID : 0; + + pthread_mutex_init(&ipsaddr->sesslock, NULL); + + proto->num_connected_from++; + +do_reply: + ips_proto_send_ctrl_message_reply(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_CONNECT_REPLY, + &ipsaddr->ctrl_msg_queued, buf); +no_reply: +fail: + return err; +} + +psm_error_t +ips_proto_connect(struct ips_proto *proto, int numep, + const psm_epid_t *array_of_epid, + const int *array_of_epid_mask, psm_error_t *array_of_errors, + psm_epaddr_t *array_of_epaddr, uint64_t timeout_in) +{ + int i, n, n_first; + psm_error_t err = PSM_OK; + psm_epaddr_t epaddr; + ips_epaddr_t *ipsaddr = NULL; + int numep_toconnect = 0, numep_left; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + union psmi_envvar_val credits_intval; + int connect_credits; + + psmi_getenv("PSM_CONNECT_CREDITS", + "End-point connect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 100, + &credits_intval); + + connect_credits = credits_intval.e_uint; + + PSMI_PLOCK_ASSERT(); + + /* All timeout values are in cycles */ + uint64_t t_start = get_cycles(); + /* Print a timeout at the warning interval */ + union psmi_envvar_val warn_intval; + uint64_t to_warning_interval; + uint64_t to_warning_next; + + /* Setup warning interval */ + psmi_getenv("PSM_CONNECT_WARN_INTERVAL", + "Period in seconds to warn if connections are not completed. " + "Default is 300 seconds, 0 to disable", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 300, + &warn_intval); + + to_warning_interval = nanosecs_to_cycles(warn_intval.e_uint * SEC_ULL); + to_warning_next = t_start + to_warning_interval; + + /* Some sanity checks */ + psmi_assert_always(sizeof(struct connect_msghdr) == IPS_CONNECT_MSGHDR_SIZE); + psmi_assert_always(array_of_epid_mask != NULL); + psmi_assert_always(sizeof(struct ips_connect_reqrep) >= + IPS_CONNECT_REQREP_MINIMUM_SIZE); + psmi_assert_always(sizeof(struct ips_disconnect_reqrep) >= + IPS_DISCONNECT_REQREP_MINIMUM_SIZE); + + /* First pass: make sure array of errors is at least fully defined */ + for (i = 0; i < numep; i++) { + uint64_t lid, context, subcontext; + + lid = PSMI_EPID_GET_LID(array_of_epid[i]); + context = PSMI_EPID_GET_CONTEXT(array_of_epid[i]); + subcontext = PSMI_EPID_GET_SUBCONTEXT(array_of_epid[i]); + _IPATH_VDBG("epid-connect=%s connect to %ld:%ld:%ld\n", + array_of_epid_mask[i] ? "YES" : " NO", + (long) lid, (long) context, (long) subcontext); + if (array_of_epid_mask[i]) { + array_of_errors[i] = PSM_EPID_UNKNOWN; + array_of_epaddr[i] = NULL; + } + } + + /* Second pass: see what to connect and what is connectable.
*/ + for (i = 0, numep_toconnect = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + /* Can't send to epid on same lid */ + if (psm_epid_nid(proto->ep->epid) == psm_epid_nid(array_of_epid[i])) { + array_of_errors[i] = PSM_EPID_UNREACHABLE; + continue; + } + + err = ips_get_addr_from_epid(proto, array_of_epid[i], 30000, &epaddr); + if (err) + goto fail; + if (epaddr == NULL) { + ips_epstate_idx idx; + /* We're sending a connect request message before some other node + * has sent its connect message */ + epaddr = ips_alloc_epaddr(proto, array_of_epid[i], + NULL, (timeout_in / 1000000UL)); + if (epaddr == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + ipsaddr = epaddr->ptladdr; + err = ips_epstate_add(proto->epstate, ipsaddr, &idx); + if (err) + goto fail; + ipsaddr->epr.epr_commidx_from = idx; + ipsaddr->cstate_from = CSTATE_NONE; + } else if (epaddr->ptladdr->cstate_to != CSTATE_NONE) { /* already connected */ + psmi_assert_always(epaddr->ptladdr->cstate_to == CSTATE_ESTABLISHED); + array_of_errors[i] = PSM_EPID_ALREADY_CONNECTED; + array_of_epaddr[i] = epaddr; + continue; + } else { + /* We've already received a connect request message from a remote + * peer, it's time to send our own. */ + ipsaddr = epaddr->ptladdr; + /* Sanity check against re-entrancy, and make sure we are not connected + * twice (caller's precondition) */ + psmi_assert_always(ipsaddr->cstate_to == CSTATE_NONE); + psmi_assert_always(ipsaddr->cstate_from != CSTATE_NONE); +#if 0 + psmi_assert_always(ipsaddr->cerror_from != PSM_OK || + !COMMIDX_IS_UNKNOWN(ptl, ipsaddr->commidx_from)); + psmi_assert_always(!COMMIDX_IS_UNKNOWN(ptl, ipsaddr->commidx_to)); +#endif + } + + ipsaddr->cstate_to = CSTATE_TO_WAITING; + ipsaddr->cerror_to = PSM_OK; + array_of_epaddr[i] = epaddr; + ipsaddr->s_timeout = get_cycles(); + ipsaddr->delay_in_ms = 1; + ipsaddr->credit = 0; + numep_toconnect++; + } + + /* Third pass: do the actual connect. + * PSM_EPID_UNKNOWN: Not connected yet. + * PSM_EPID_UNREACHABLE: Not to be connected. + * PSM_OK: Successfully connected. + * Start sending connect messages at a random index between 0 and numep-1 + */ + numep_left = numep_toconnect; + n_first = ((uint32_t) get_cycles()) % numep; + while (numep_left > 0) { + for (n = 0; n < numep; n++) { + int keep_polling = 1; + i = (n_first + n) % numep; + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + case PSM_EPID_UNREACHABLE: + case PSM_EPID_ALREADY_CONNECTED: + case PSM_OK: + continue; + default: + break; + } + psmi_assert_always(array_of_epaddr[i] != NULL); + ipsaddr = array_of_epaddr[i]->ptladdr; + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_to */ + array_of_errors[i] = PSM_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + continue; + } + while (keep_polling) { + if (!psmi_cycles_left(t_start, timeout_in)) { + err = PSM_TIMEOUT; + goto err_timeout; + } + if (to_warning_interval && get_cycles() >= to_warning_next) { + uint64_t waiting_time = + cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL; + const char *first_name = NULL; + int num_waiting = 0; + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i] || + array_of_errors[i] != PSM_EPID_UNKNOWN) + continue; + if (!first_name) + first_name = psmi_epaddr_get_name(array_of_epid[i]); + num_waiting++; + } + if (first_name) { + _IPATH_INFO("Couldn't connect to %s (and %d others). " + "Time elapsed %02i:%02i:%02i. Still trying...\n", + first_name, num_waiting, + (int) (waiting_time / 3600), + (int) ((waiting_time / 60) - + ((waiting_time / 3600) * 60)), + (int) (waiting_time - ((waiting_time / 60) * 60))); + } + to_warning_next = get_cycles() + to_warning_interval; + } + + if (get_cycles() > ipsaddr->s_timeout) { + if (!ipsaddr->credit && connect_credits) { + ipsaddr->credit = 1; + connect_credits--; + } + if (ipsaddr->credit) { + _IPATH_VDBG("Connect req to %u:%u:%u\n", + __be16_to_cpu(ipsaddr->epr.epr_base_lid), + ipsaddr->epr.epr_context, + ipsaddr->epr.epr_subcontext); + if (ips_proto_send_ctrl_message(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_CONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf) == PSM_OK) { + keep_polling = 0; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr->delay_in_ms * MSEC_ULL); + } + /* If not, the send got "busy"; keep trying */ + } + else { + keep_polling = 0; + } + } + + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED) { + /* This is not the real error code, we only set OK here + * so we know to stop polling for the reply. The actual + * error is in ipsaddr->cerror_to */ + array_of_errors[i] = PSM_OK; + numep_left--; + connect_credits++; + ipsaddr->credit = 0; + break; + } + } + } + } + +err_timeout: + /* Find the worst error to report */ + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + switch (array_of_errors[i]) { + /* These are benign */ + case PSM_EPID_UNREACHABLE: + case PSM_EPID_ALREADY_CONNECTED: + break; + case PSM_EPID_UNKNOWN: + array_of_errors[i] = PSM_TIMEOUT; + err = psmi_error_cmp(err, PSM_TIMEOUT); + break; + case PSM_OK: + /* Restore the real connect error */ + ipsaddr = array_of_epaddr[i]->ptladdr; + array_of_errors[i] = ipsaddr->cerror_to; + psmi_assert_always( + array_of_epaddr[i]->ptladdr->cstate_to == CSTATE_ESTABLISHED); + if (ipsaddr->cerror_to != PSM_OK) { + err = psmi_error_cmp(err, ipsaddr->cerror_to); + ips_free_epaddr(array_of_epaddr[i]->ptladdr); + array_of_epaddr[i] = NULL; + } + else { + proto->num_connected_to++; + psmi_assert_always(ipsaddr->epr.epr_path[IPS_PATH_HIGH_PRIORITY][0]->epr_mtu > 0); + } + break; + default: + break; + } + } + +fail: + return err; +} + +/* Repercussions on MQ. + * + * If num_connected==0, everything that exists in the posted queue should + * complete and the error must be marked epid_was_closed. + * + */
+ +psm_error_t +ips_proto_disconnect(struct ips_proto *proto, int force, int numep, + const psm_epaddr_t array_of_epaddr[], + const int array_of_epaddr_mask[], + psm_error_t array_of_errors[], + uint64_t timeout_in) +{ + ips_epaddr_t *ipsaddr; + int numep_left, numep_todisc, i, n; + int n_first; + int cstate; + int has_pending; + uint64_t timeout; + psm_error_t err = PSM_OK; + char buf[IPS_MAX_CONNECT_PAYLEN] PSMI_CACHEALIGN; + uint64_t reqs_sent = 0; + union psmi_envvar_val credits_intval; + int disconnect_credits; + uint64_t t_warning, t_start; + union psmi_envvar_val warn_intval; + unsigned warning_secs; + + psmi_assert_always(numep > 0); + + psmi_getenv("PSM_DISCONNECT_CREDITS", + "End-point disconnect request credits.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 100, + &credits_intval); + + disconnect_credits = credits_intval.e_uint; + + /* Setup warning interval */ + psmi_getenv("PSM_DISCONNECT_WARN_INTERVAL", + "Period in seconds to warn if disconnections are not completed. " + "Default is 300 seconds, 0 to disable.", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 300, + &warn_intval); + + warning_secs = warn_intval.e_uint; + + PSMI_PLOCK_ASSERT(); + + /* First pass: see what to disconnect and what is disconnectable */ + for (i = 0, numep_todisc = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + psmi_assert_always(array_of_epaddr[i]->ptl == proto->ptl); + cstate = array_of_epaddr[i]->ptladdr->cstate_to; + array_of_epaddr[i]->ptladdr->credit = 0; + if (cstate == CSTATE_NONE) { + array_of_errors[i] = PSM_OK; + continue; + } + else { + psmi_assert_always(cstate == CSTATE_ESTABLISHED); + } + _IPATH_VDBG("disconnecting %p\n", array_of_epaddr[i]); + array_of_errors[i] = PSM_EPID_UNKNOWN; + numep_todisc++; + } + if (numep_todisc == 0) + goto success; + + /* Wait for everyone to ack previous packets before sending disconnects */ + if (timeout_in == 0) + timeout = ~0ULL; + else + timeout = get_cycles() + nanosecs_to_cycles(timeout_in); + + t_start = get_cycles(); + t_warning = t_start + nanosecs_to_cycles(warning_secs * SEC_ULL); + + n_first = ((uint32_t) get_cycles()) % numep; + if (!force) { + numep_left = numep_todisc; + do { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i] || array_of_errors[i] == PSM_OK) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + switch (ipsaddr->cstate_to) { + case CSTATE_TO_DISCONNECTED: + array_of_errors[i] = PSM_OK; + numep_left--; + disconnect_credits++; + ipsaddr->credit = 0; + continue; + case CSTATE_TO_WAITING_DISC: + if (ipsaddr->s_timeout > get_cycles()) + continue; + ipsaddr->delay_in_ms = + min(100, ipsaddr->delay_in_ms << 1); + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(ipsaddr->delay_in_ms*MSEC_ULL); + ips_proto_send_ctrl_message_request(proto, &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf, timeout); + reqs_sent++; + break; + case CSTATE_ESTABLISHED: + /* Still pending acks, hold off for now */ + ips_ptladdr_lock(ipsaddr); + has_pending = + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_PIO].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_DMA].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_AM_REQ].scb_unacked) || + !STAILQ_EMPTY(&ipsaddr-> + flows[EP_FLOW_GO_BACK_N_AM_RSP].scb_unacked); + ips_ptladdr_unlock(ipsaddr); + if (has_pending) + continue; + if (!ipsaddr->credit && disconnect_credits) {
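+					/* Editorial note (not part of the original
+					 * import): taking a credit here bounds the
+					 * number of peers with an outstanding
+					 * DISCONNECT_REQUEST to the value of
+					 * PSM_DISCONNECT_CREDITS (default 100). */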
ipsaddr->credit = 1; + disconnect_credits--; + } + if (!ipsaddr->credit) + continue; + ipsaddr->delay_in_ms = 1; + ipsaddr->cstate_to = CSTATE_TO_WAITING_DISC; + ipsaddr->s_timeout = get_cycles() + + nanosecs_to_cycles(MSEC_ULL); + ips_proto_send_ctrl_message_request(proto, &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf, timeout); + reqs_sent++; + break; + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unhandled/unknown close state %d", + ipsaddr->cstate_to); + break; + } + } + if (numep_left == 0) + break; + + if ((err = psmi_err_only(psmi_poll_internal(proto->ep, 1)))) + goto fail; + + if (warning_secs && get_cycles() > t_warning) { + _IPATH_INFO("graceful close in progress for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (int) (timeout_in / MSEC_ULL), + (unsigned long long) reqs_sent); + t_warning = get_cycles() + nanosecs_to_cycles(warning_secs * SEC_ULL); + } + } + while (timeout > get_cycles()); + + if (numep_left > 0) { + err = PSM_TIMEOUT; + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i]) + continue; + if (array_of_errors[i] == PSM_EPID_UNKNOWN) { + array_of_errors[i] = PSM_TIMEOUT; + _IPATH_VDBG("disc timeout on index %d, epaddr %s\n", + i, psmi_epaddr_get_name(array_of_epaddr[i]->epid)); + } + } + _IPATH_PRDBG("graceful close incomplete for %d/%d peers " + "(elapsed=%d millisecs,timeout=%d millisecs,reqs=%lld)\n", numep_left, numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (int) (timeout_in / MSEC_ULL), + (unsigned long long) reqs_sent); + } + else + _IPATH_PRDBG("graceful close complete from %d peers in %d millisecs, reqs_sent=%lld\n", + numep_todisc, + (int) (cycles_to_nanosecs(get_cycles() - t_start) / MSEC_ULL), + (unsigned long long) reqs_sent); + } else { + for (n = 0; n < numep; n++) { + i = (n_first + n) % numep; + if (!array_of_epaddr_mask[i]) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + psmi_assert_always(ipsaddr->cstate_to == CSTATE_ESTABLISHED); + ips_proto_send_ctrl_message(&ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO], + OPCODE_DISCONNECT_REQUEST, + &ipsaddr->ctrl_msg_queued, + buf); + /* Force state to DISCONNECTED */ + ipsaddr->cstate_to = CSTATE_TO_DISCONNECTED; + array_of_errors[i] = PSM_OK; + } + _IPATH_VDBG("non-graceful close complete from %d peers\n", numep); + } + + for (i = 0; i < numep; i++) { + if (!array_of_epaddr_mask[i] || array_of_errors[i] != PSM_OK) + continue; + ipsaddr = array_of_epaddr[i]->ptladdr; + if (ipsaddr->cstate_to == CSTATE_NONE) + continue; + psmi_assert_always(ipsaddr->cstate_to == CSTATE_TO_DISCONNECTED); + proto->num_connected_to--; + /* Remote disconnect req arrived already, remove this epid. If it + * hasn't arrived yet, that's okay, we'll pick it up later and just + * mark our connect-to status as being "none". */ + if (ipsaddr->cstate_from == CSTATE_NONE) { + ips_free_epaddr(ipsaddr); + } + else + ipsaddr->cstate_to = CSTATE_NONE; + } + +fail: +success: + return err; +} + +int +ips_proto_isconnected(ips_epaddr_t *ipsaddr) +{ + if (ipsaddr->cstate_to == CSTATE_ESTABLISHED || + ipsaddr->cstate_from == CSTATE_ESTABLISHED) + return 1; + else + return 0; +} + diff --git a/ptl_ips/ips_proto_dump.c b/ptl_ips/ips_proto_dump.c new file mode 100644 index 0000000..3cbfa28 --- /dev/null +++ b/ptl_ips/ips_proto_dump.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2013. Intel Corporation. 
All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_proto_header.h" +#include "ips_proto_help.h" +#include "ips_epstate.h" + +void ips_proto_dump_frame(void *frame, int length, char *message) +{ + uint8_t *raw_frame = frame; + int counter; + char default_message[] = ""; + + if(!message) + message = default_message; + + printf("\nHex dump of %i bytes at %p from %s\n", length, frame, message); + + for(counter = 0; counter < length; counter++) { + if((counter % 16) == 0) + printf("\n"); + + if((counter % 4) == 0) + printf(" "); + + printf("%02X ", raw_frame[counter]); + } + printf("\n"); +} + +void ips_proto_dump_data(void *data, int data_length) +{ + int counter; + uint8_t *payload = (uint8_t *)data; + + printf("\nHex dump of data, length = %i\n", + data_length); + + for(counter = 0; counter < data_length; counter++) { + if((counter % 16) == 0) + printf("\n %04d: ", counter); + + if((counter % 4) == 0) + printf(" "); + + printf("%02X ", payload[counter]); + } + printf("\n"); +}
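+
+/* Editorial sketch (not part of the original import): typical use of the
+ * dump helpers in this file when chasing a mangled packet; the 56-byte
+ * header length here is illustrative only.
+ *
+ *   ips_proto_dump_frame(p_hdr, 56, "rx header");
+ *   ips_proto_dump_data(payload, paylen);
+ *   ips_proto_show_header(p_hdr, "rx header");
+ */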
context %i\n", + (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_CONTEXT_SHIFT) & INFINIPATH_I_CONTEXT_MASK); + printf("IPH: subcontext %i\n", p_hdr->dst_subcontext); + tid = (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK; + printf("IPH: tid %x\n", tid); + printf("IPH: offset %x\n", + (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) + >> INFINIPATH_I_OFFSET_SHIFT) & INFINIPATH_I_OFFSET_MASK); + + printf("sub-opcode %x\n", p_hdr->sub_opcode); + + ack_seq_num.psn = p_hdr->ack_seq_num; + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + if (protocol == PSM_PROTOCOL_GO_BACK_N) + printf("ack_seq_num %x\n", ack_seq_num.psn); + else + printf("TidFlow Flow: %x, Gen: %x, Seq: %x\n", ack_seq_num.flow, ack_seq_num.gen, ack_seq_num.seq); + + printf("context %d (src_context %d src_context_ext %d) src_subcontext %d\n", + IPS_HEADER_SRCCONTEXT_GET(p_hdr), p_hdr->src_context, p_hdr->src_context_ext, + p_hdr->src_subcontext); + printf("src_rank/commidx %i\n", p_hdr->commidx | + INFINIPATH_KPF_RESERVED_BITS(p_hdr->iph.pkt_flags)); + if (tid != IPATH_EAGER_TID_ID) + printf("expected_tid_session_id %i\n", p_hdr->data[0].u32w0); + printf("flags %x\n", p_hdr->flags); + printf("mqhdr %x\n", p_hdr->mqhdr); +} + +// linux doesn't have strlcat; this is a stripped down implementation +// not super-efficient, but we use it rarely, and only for short strings +// not fully standards conforming! +static size_t strlcat(char *d, const char *s, size_t l) +{ + int dlen = strlen(d), slen, max; + if(l<=dlen) // bug + return l; + slen = strlen(s); + max = l-(dlen+1); + if(slen>max) + slen = max; + memcpy(d+dlen, s, slen); + d[dlen+slen] = '\0'; + return dlen+slen+1; // standard says to return full length, not actual +} + +// decode RHF errors; only used one place now, may want more later +void ips_proto_get_rhf_errstring(uint32_t err, char *msg, size_t len) +{ + *msg = '\0'; // if no errors, and so don't need to check what's first + + if(err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if(err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if(err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if(err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if(err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if(err & INFINIPATH_RHF_H_IHDRERR) + strlcat(msg, "ipathhdrerr ", len); + if(err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if(err & INFINIPATH_RHF_H_MKERR) + strlcat(msg, "mkerr ", len); + if(err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if(err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if(err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +void ips_proto_dump_err_stats(struct ips_proto *proto) +{ + char err_stat_msg[2048]; + char tmp_buf[128]; + int len = sizeof(err_stat_msg); + + if (!(infinipath_debug & __IPATH_PKTDBG)) + return; + + *err_stat_msg = '\0'; + + if (proto->error_stats.num_icrc_err || + proto->error_stats.num_vcrc_err || + proto->error_stats.num_ecc_err || + proto->error_stats.num_len_err || + proto->error_stats.num_mtu_err || + proto->error_stats.num_khdr_err || + proto->error_stats.num_tid_err || + proto->error_stats.num_mk_err || + proto->error_stats.num_ib_err) { + + snprintf(tmp_buf, sizeof(tmp_buf), "ERROR STATS: "); + + if (proto->error_stats.num_icrc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ICRC: %"PRIu64" ", proto->error_stats.num_icrc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if 
(proto->error_stats.num_vcrc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "VCRC: %"PRIu64" ", proto->error_stats.num_vcrc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_ecc_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "ECC: %"PRIu64" ", proto->error_stats.num_ecc_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_len_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "LEN: %"PRIu64" ", proto->error_stats.num_len_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_mtu_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "MTU: %"PRIu64" ", proto->error_stats.num_mtu_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_khdr_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "KHDR: %"PRIu64" ", proto->error_stats.num_khdr_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_tid_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "TID: %"PRIu64" ", proto->error_stats.num_tid_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_mk_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "MKERR: %"PRIu64" ", proto->error_stats.num_mk_err); + strlcat(err_stat_msg, tmp_buf, len); + } + + if (proto->error_stats.num_ib_err) { + snprintf(tmp_buf, sizeof(tmp_buf), "IBERR: %"PRIu64" ", proto->error_stats.num_ib_err); + strlcat(err_stat_msg, tmp_buf, len); + } + strlcat(err_stat_msg, "\n", len); + } + else + strlcat(err_stat_msg, "No previous errors.\n", len); + + _IPATH_ERROR("%s", err_stat_msg); +} + diff --git a/ptl_ips/ips_proto_expected.c b/ptl_ips/ips_proto_expected.c new file mode 100644 index 0000000..05d7a6e --- /dev/null +++ b/ptl_ips/ips_proto_expected.c @@ -0,0 +1,2489 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +/* + * Expected tid operations are carried out over "sessions". 
One session is a
+ * collection of N tids where N is determined by the expected message window
+ * size (-W option or PSM_MQ_RNDV_IPATH_WINDOW). Since naks can cause
+ * retransmissions, each session has a session index (_desc_idx) and a
+ * generation count (_desc_genc) to be able to identify if retransmitted
+ * packets reference the correct session.
+ *
+ * index and generation count are each 4 bytes encoded in one ptl_arg. They
+ * could be compressed further but we have the header space, so we don't
+ * bother.
+ */
+#define _desc_idx u32w0
+#define _desc_genc u32w1
+
+/*
+ * Easy switch to (say) _IPATH_INFO if debugging in the expected protocol is
+ * needed
+ */
+#define _IPATH_EXP _IPATH_VDBG
+
+/*
+ * Timer callbacks. When we need work to be done out of the receive process
+ * loop, we schedule work on timers to be done at a later time.
+ */
+static psm_error_t
+ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_release_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_grant_timer_callback(struct psmi_timer *timer, uint64_t current);
+
+static psm_error_t
+ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, psm_mq_req_t req,
+                           uint32_t msglen, int flags, ptl_epaddr_t *ipsaddr,
+                           psmi_seqnum_t flowgenseq,
+                           ips_tid_session_list *tid_list,
+                           uint32_t tid_list_size);
+
+static void
+ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context);
+
+static void
+ips_tid_flowavail_callback(struct ips_tfctrl *tfctrl, void *context);
+
+static void
+ips_tid_mpool_tidrecv_callback(void *context);
+
+/* Defined at the ptl-level (breaks abstractions but needed for shared vs
+ * non-shared contexts) */
+extern int ips_ptl_recvq_isempty(const struct ptl *ptl);
+
+static psm_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc);
+
+psm_error_t
+ips_protoexp_init(const psmi_context_t *context,
+                  const struct ips_proto *proto,
+                  uint32_t protoexp_flags,
+                  int num_of_send_bufs,
+                  int num_of_send_desc,
+                  struct ips_protoexp **protoexp_o)
+{
+    struct ips_protoexp *protoexp = NULL;
+    uint32_t tidmtu_max;
+    psm_error_t err = PSM_OK;
+
+    protoexp = (struct ips_protoexp *)
+        psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_protoexp));
+    if (protoexp == NULL) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+    *protoexp_o = protoexp;
+
+    protoexp->ptl = (const struct ptl *) proto->ptl;
+    protoexp->proto = (struct ips_proto *) proto;
+    protoexp->timerq = proto->timerq;
+    protoexp->tid_flags = protoexp_flags;
+    protoexp->tidflow_seed = (unsigned int) getpid();
+
+    /* Must be initialized already */
+    /* Comment out because of Klockwork scanning critical error.
       CQ 11/16/2012
+       psmi_assert_always(proto->ep != NULL && proto->ep->mq != NULL &&
+                          proto->ep->mq->rreq_pool != NULL &&
+                          proto->ep->mq->sreq_pool != NULL);
+    */
+    psmi_assert_always(proto->timerq != NULL);
+    /* Make sure pbc is at the right place before the message header */
+    psmi_assert_always(sizeof(union ipath_pbc) == (size_t)
+        (offsetof(struct ips_scb, ips_lrh) - offsetof(struct ips_scb, pbc)));
+
+    /* These request pools are managed by the MQ component */
+    protoexp->tid_sreq_pool = proto->ep->mq->sreq_pool;
+    protoexp->tid_rreq_pool = proto->ep->mq->rreq_pool;
+
+    if (proto->flags & IPS_PROTO_FLAG_MQ_EXPECTED_SDMA) {
+        protoexp->tid_ep_flow = EP_FLOW_GO_BACK_N_DMA;
+        protoexp->tid_xfer_type = PSM_TRANSFER_DMA;
+    }
+    else {
+        protoexp->tid_ep_flow = EP_FLOW_GO_BACK_N_PIO;
+        protoexp->tid_xfer_type = PSM_TRANSFER_PIO;
+    }
+
+    /* Initialize tid flow control. */
+    {
+        const struct ipath_user_info *user_info = &context->user_info;
+        const struct ipath_base_info *base_info = &context->base_info;
+        uint32_t num_flow, start_flow, end_flow;
+        uint32_t has_hw_hdrsupp = (context->runtime_flags & IPATH_RUNTIME_HDRSUPP);
+
+        if (!user_info->spu_subcontext_cnt || !has_hw_hdrsupp) {
+            /* If context sharing is not enabled, we can use the full tidflow
+             * table for all HCAs.
+             */
+            start_flow = 0;
+            num_flow = INFINIPATH_TF_NFLOWS;
+        }
+        else {
+            /* Context sharing on QLE73XX requires the hardware tidflow table
+             * to be shared as well.
+             */
+            num_flow = (uint32_t) (INFINIPATH_TF_NFLOWS / user_info->spu_subcontext_cnt);
+            start_flow = base_info->spi_subcontext * num_flow;
+        }
+
+        end_flow = start_flow + num_flow;
+
+        if ((err = ips_tf_init(context, &protoexp->tfctrl,
+                               start_flow, end_flow,
+                               ips_tid_flowavail_callback, protoexp)))
+            goto fail;
+    }
+
+    /* Fix the fragsize to be a power of two (usually 2048) */
+    protoexp->tid_send_fragsize = context->base_info.spi_tid_maxsize;
+    if (proto->flags & IPS_PROTO_FLAG_MQ_EXPECTED_SDMA)
+        tidmtu_max = proto->epinfo.ep_mtu;
+    else
+        tidmtu_max = proto->epinfo.ep_piosize;
+
+    while (protoexp->tid_send_fragsize > tidmtu_max)
+        protoexp->tid_send_fragsize /= 2;
+
+    if ((err = ips_tid_init(&protoexp->tidc, context)))
+        goto fail;
+
+    {
+        uint32_t bounce_size, num_bounce_bufs;
+
+        if ((protoexp->tid_xfer_type == PSM_TRANSFER_DMA) ||
+            (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM)) {
+            num_bounce_bufs = max(8, num_of_send_bufs >> 2);
+            bounce_size = protoexp->tid_send_fragsize;
+        }
+        else {
+            /* no bufs, we only need the buffers to handle misalignment on the
+             * sender when using send dma. */
+            num_bounce_bufs = 0;
+            bounce_size = 0;
+        }
+        if ((err = ips_scbctrl_init(context, num_of_send_desc, num_bounce_bufs,
+                                    0, bounce_size, ips_tid_scbavail_callback,
+                                    protoexp, &protoexp->tid_scbc_rv)))
+            goto fail;
+    }
+
+    {
+        /* Determine the interval at which to generate headers (relevant only
+         * when header suppression is enabled); otherwise headers are always
+         * generated.
+         *
+         * The PSM_EXPECTED_HEADERS environment variable can specify the
+         * packet interval to generate headers at. Otherwise a header packet
+         * is generated every
+         * min(PSM_DEFAULT_EXPECTED_HEADER, window_size/tid_send_fragsize).
+         * Note: A header is always generated for the last packet in the flow.
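+         *
+         * Worked example (illustrative numbers, not configured defaults):
+         * with a 131072-byte rndv window and tid_send_fragsize = 2048, the
+         * window spans 64 packets; if the resulting PSM_EXPECTED_HEADERS
+         * value is 64 and flow_credits is 64, the clamp below gives
+         * hdr_pkt_interval = max(min(64, 64 >> 2), 1) = 16, i.e. one
+         * non-suppressed header (with ACK request) every 16th packet.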
+         */
+
+        union psmi_envvar_val env_exp_hdr;
+        uint32_t defval =
+            min(PSM_DEFAULT_EXPECTED_HEADER,
+                proto->mq->ipath_window_rv/protoexp->tid_send_fragsize);
+
+        psmi_getenv("PSM_EXPECTED_HEADERS",
+                    "Interval to generate expected protocol headers",
+                    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
+                    (union psmi_envvar_val) defval, &env_exp_hdr);
+
+        protoexp->hdr_pkt_interval = env_exp_hdr.e_uint;
+        /* Account for flow credits - should try to have at least 4 headers
+         * generated per window.
+         */
+        protoexp->hdr_pkt_interval =
+            max(min(protoexp->hdr_pkt_interval, proto->flow_credits >> 2), 1);
+
+        if (protoexp->hdr_pkt_interval != env_exp_hdr.e_uint) {
+            _IPATH_VDBG("Overriding PSM_EXPECTED_HEADERS=%u to be '%u'\n",
+                        env_exp_hdr.e_uint, protoexp->hdr_pkt_interval);
+        }
+
+    }
+
+    /* Send descriptors.
+     *
+     * There can be up to 2^32 of these send descriptors. We conservatively
+     * allocate 256 but large node configurations can allocate up to sdesc_num
+     * of these (they are about 2k each).
+     * We impose a theoretical limit of 2^30.
+     */
+    {
+        struct psmi_rlimit_mpool rlim = TID_SENDSESSIONS_LIMITS;
+        uint32_t maxsz, chunksz;
+
+        if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+                                        &rlim, &maxsz, &chunksz)))
+            goto fail;
+
+        protoexp->tid_desc_send_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_send_desc), chunksz, maxsz,
+                              0, DESCRIPTORS, NULL, NULL);
+
+        if (protoexp->tid_desc_send_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate tid descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /* Receive descriptors.
+     *
+     * There can only be 256 of these because the field to identify the receive
+     * descriptor is only 8 bits. This currently isn't a problem because we
+     * only have 512 tids and each descriptor consumes ~32 tids per tid window.
+     * This means only roughly 16 descriptors are ever used.
+     */
+
+    {
+        struct psmi_rlimit_mpool rlim = TID_RECVSESSIONS_LIMITS;
+        uint32_t maxsz, chunksz;
+
+        if ((err = psmi_parse_mpool_env(protoexp->proto->mq, 1,
+                                        &rlim, &maxsz, &chunksz)))
+            goto fail;
+
+        protoexp->tid_desc_recv_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_recv_desc), chunksz, maxsz,
+                              0, DESCRIPTORS, ips_tid_mpool_tidrecv_callback,
+                              protoexp);
+
+        if (protoexp->tid_desc_recv_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate tid descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /* This pool can never be smaller than the max number of rreqs that can be
+     * allocated. */
+    {
+        uint32_t rreq_per_chunk, rreq_max;
+
+        psmi_assert_always(protoexp->proto->mq->rreq_pool != NULL);
+
+        psmi_mpool_get_obj_info(protoexp->proto->mq->rreq_pool,
+                                &rreq_per_chunk,
+                                &rreq_max);
+
+        protoexp->tid_getreq_pool =
+            psmi_mpool_create(sizeof(struct ips_tid_get_request),
+                              rreq_per_chunk, rreq_max, 0, DESCRIPTORS, NULL, NULL);
+
+        if (protoexp->tid_getreq_pool == NULL) {
+            err = psmi_handle_error(proto->ep, PSM_NO_MEMORY,
+                                    "Couldn't allocate getreq descriptor memory pool");
+            goto fail;
+        }
+    }
+
+    /*
+     * Parse the tid timeout settings from the environment.
+ * :: + * + */ + { + int tvals[3]; + char *tid_to; + union psmi_envvar_val env_to; + + if (context->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + tvals[0] = 200; + tvals[1] = 1000; + tvals[2] = 2; + tid_to = "200:1000:2"; + } + else { + /* This has always been the behavior ips < 2.1 */ + tid_to = "100:100:3"; + tvals[0] = 100; + tvals[1] = 100; + tvals[2] = 3; + } + + if (!psmi_getenv("PSM_TID_TIMEOUT", + "Tid timeout control ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) tid_to, + &env_to)) { + /* not using default values */ + tid_to = env_to.e_str; + psmi_parse_str_tuples(tid_to, 3, tvals); + } + protoexp->tid_to_cyc_min = us_2_cycles((uint64_t) tvals[0]); + protoexp->tid_to_cyc_max = us_2_cycles((uint64_t) tvals[1]); + protoexp->tid_to_intr = tvals[2]; + _IPATH_PRDBG("Tid control message settings: timeout min=%dus/max=%dus, " + "interrupt when trying attempt #%d\n", + tvals[0], tvals[1], tvals[2]); + } + + /* + * Make sure that the rendezvous window size settings are not larger than + * the largest packet we can put on the wire. + */ + { + uint32_t winsize = protoexp->proto->mq->ipath_window_rv; + + if (winsize < ips_tid_page_size(&protoexp->tidc)) { + _IPATH_INFO("Overriding request for rndv window size %d " + "to minimum supported value %d bytes\n", + winsize, ips_tid_page_size(&protoexp->tidc)); + protoexp->proto->mq->ipath_window_rv = + ips_tid_page_size(&protoexp->tidc); + } + else { /* Figure out maximum supportable value assuming we can + * send a maxmium payload of 2048 bytes */ + int maxtids = 0; + + while (PSMI_ALIGNUP((sizeof(ips_tid_session_list) + + ((maxtids+1) * sizeof(ips_tid_session_member))), 4) + < IPS_PROTOEXP_MIN_MTU) + { + maxtids++; + } + + /* Assume worse-case alignment when deriving the amount of tids, + * need one tid for bad page-alignment and another for spillover + * into last page */ + winsize = (maxtids-2) * ips_tid_page_size(&protoexp->tidc); + + if (protoexp->proto->mq->ipath_window_rv > winsize) { + _IPATH_INFO("Overriding request for rndv window size %d " + "to maximum supported value %d bytes\n", + protoexp->proto->mq->ipath_window_rv, + winsize); + protoexp->proto->mq->ipath_window_rv = winsize; + } + } + } + + /* + * Allow setting of PSM_TID_MIN_EXPSEND, the minimum amount of expected + * send packets we send before checking the receive queue. 
+ */ + { + union psmi_envvar_val env_mincnt; + + psmi_getenv("PSM_TID_MIN_EXPSEND", + "Min expsend pkt cnt before recv", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) 3, &env_mincnt); + protoexp->tid_min_expsend_cnt = env_mincnt.e_uint; + } + + /* Timers to handle requeueing of work out of the receive path */ + psmi_timer_entry_init(&protoexp->timer_send, + ips_tid_pendsend_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_sendq); + psmi_timer_entry_init(&protoexp->timer_getreqs, + ips_tid_pendtids_timer_callback, protoexp); + STAILQ_INIT(&protoexp->pend_getreqsq); + + protoexp->tid_page_offset_mask = + (uint32_t) context->base_info.spi_tid_maxsize - 1; + protoexp->tid_page_mask = + ~((uint64_t) context->base_info.spi_tid_maxsize - 1); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) { + protoexp->tid_info = (struct ips_tidinfo *) + psmi_calloc(context->ep, UNDEFINED, IPS_TID_MAX_TIDS, + sizeof (struct ips_tidinfo)); + if (protoexp->tid_info == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + } + else + protoexp->tid_info = NULL; + + psmi_assert(err == PSM_OK); + return err; + +fail: + if (protoexp != NULL && protoexp->tid_getreq_pool != NULL) + psmi_mpool_destroy(protoexp->tid_getreq_pool); + if (protoexp != NULL && protoexp->tid_desc_recv_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_recv_pool); + if (protoexp != NULL && protoexp->tid_desc_send_pool != NULL) + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + if (protoexp != NULL) + ips_scbctrl_fini(&protoexp->tid_scbc_rv); + if (protoexp != NULL) + psmi_free(protoexp); + return err; +} + +psm_error_t +ips_protoexp_fini(struct ips_protoexp *protoexp) +{ + psm_error_t err = PSM_OK; + + psmi_mpool_destroy(protoexp->tid_getreq_pool); + psmi_mpool_destroy(protoexp->tid_desc_recv_pool); + psmi_mpool_destroy(protoexp->tid_desc_send_pool); + + if ((err = ips_scbctrl_fini(&protoexp->tid_scbc_rv))) + goto fail; + + if ((err = ips_tid_fini(&protoexp->tidc))) + goto fail; + + if ((err = ips_tf_fini(&protoexp->tfctrl))) + goto fail; + + _IPATH_PRDBG("Tid control resends: tid_grant=%lld,tid_release=%lld," + "request_intr=%lld\n", + (long long) protoexp->tid_grant_resends, + (long long) protoexp->tid_release_resends, + (long long) protoexp->tid_intr_reqs); + + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) + psmi_free(protoexp->tid_info); + + psmi_free(protoexp); + +fail: + return err; +} + +/* New scbs now available. If we have pending sends because we were out of + * scbs, put the pendq on the timerq so it can be processed. */ +static +void +ips_tid_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) context; + + if (!STAILQ_EMPTY(&protoexp->pend_sendq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + return; +} + +/* New Tid Flows are available. If there are pending get requests put the + * get timer on the timerq so it can be processed. */ +static +void +ips_tid_flowavail_callback(struct ips_tfctrl *tfctrl, void *context) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) context; + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); + return; +} + +/* + * The tid get request is always issued from within the receive progress loop, + * which is why we always enqueue the request instead of issuing it directly. 
+ * Eventually, if we expose tid_get to users, we will want to differentiate + * when the request comes from the receive progress loop from cases where the + * tid_get is issued directly from user code. + * + */ +psm_error_t +ips_protoexp_tid_get_from_token( + struct ips_protoexp *protoexp, + void *buf, + uint32_t length, + psm_epaddr_t epaddr, + uint32_t remote_tok, + uint32_t flags, + ips_tid_completion_callback_t callback, + void *context) +{ + struct ips_tid_get_request *getreq; + int count, fragsize; + + getreq = (struct ips_tid_get_request *) + psmi_mpool_get(protoexp->tid_getreq_pool); + + /* We can't *really* run out of these here because we always allocate as + * much as available receive reqs */ + if_pf (getreq == NULL) + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Ran out of 'getreq' descriptors"); + + getreq->tidgr_protoexp = protoexp; + getreq->tidgr_epaddr = epaddr; + getreq->tidgr_lbuf = buf; + getreq->tidgr_length = length; + getreq->tidgr_sendtoken = remote_tok; + getreq->tidgr_ucontext = context; + getreq->tidgr_callback = callback; + getreq->tidgr_offset = 0; + getreq->tidgr_bytesdone = 0; + getreq->tidgr_desc_seqno= 0; + getreq->tidgr_flags = flags; + + /* nsconn is the # of slave channels. */ + /* fragsize is the bytes each channel should transfer. */ + count = epaddr->mctxt_master->mctxt_nsconn; + fragsize = (length+count)/(count+1); + if (fragsize < 4096) fragsize = 4096; + getreq->tidgr_rndv_winsz= min(fragsize, epaddr->ep->mq->ipath_window_rv); + + STAILQ_INSERT_TAIL(&protoexp->pend_getreqsq, getreq, tidgr_next); + if (ips_tid_num_available(&protoexp->tidc) >= + ips_tid_num_required(&protoexp->tidc, (void *) NULL, + getreq->tidgr_rndv_winsz)) + ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); + else + psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + return PSM_OK; +} + +/* List of perf events */ +#define _ips_logeventid_tid_send_reqs 0 /* out of tid send descriptors */ + +#define ips_logevent_id(event) _ips_logeventid_ ## event +#define ips_logevent(proto, event,ptr) ips_logevent_inner(proto, ips_logevent_id(event), ptr) + +static +void +ips_logevent_inner(struct ips_proto *proto, int eventid, void *context) +{ + uint64_t t_now = get_cycles(); + + switch (eventid) { + case ips_logevent_id(tid_send_reqs): { + ips_epaddr_t *ipsaddr = (ips_epaddr_t *) context; + proto->psmi_logevent_tid_send_reqs.count++; + + if (t_now >= proto->psmi_logevent_tid_send_reqs.next_warning) { + psmi_handle_error(PSMI_EP_LOGEVENT, PSM_OK, + "Non-fatal temporary exhaustion of send tid dma descriptors " + "(elapsed=%.3fs, source LID=0x%x/context=%d, count=%lld)", + (double) cycles_to_nanosecs(t_now - ipsaddr->proto->t_init) / 1.0e9, + (int) psm_epid_nid(ipsaddr->epaddr->epid), + (int) psm_epid_context(ipsaddr->epaddr->epid), + (long long) proto->psmi_logevent_tid_send_reqs.count); + proto->psmi_logevent_tid_send_reqs.next_warning = t_now + + sec_2_cycles(proto->psmi_logevent_tid_send_reqs.interval_secs); + } + } + break; + + default: + break; + } + + return; +} + +/* + * Expected Protocol. + * + * We're granted tids (as part of a tid get request) and expected to fulfill + * the request by associating the request's sendtoken to a tid send descriptor. + * + * It's possible to be out of tid send descriptors when somehow all allocated + * descriptors can't complete all of their sends. For example, the targets of + * the sends may be busy in computation loops and not processing incoming + * packets. 
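+ *
+ * Pieced together from the handlers and opcodes in this file, the overall
+ * exchange looks roughly like this (a sketch, not a normative trace):
+ *
+ *   receiver --OPCODE_TIDS_GRANT (tid list)-------> ips_protoexp_tid_grant()
+ *   receiver <-OPCODE_TIDS_GRANT_ACK--------------- sender
+ *   receiver <-OPCODE_SEQ_MQ_EXPTID data packets--- sender
+ *   receiver <-OPCODE_SEQ_MQ_EXPTID_UNALIGNED------ sender (if misaligned)
+ *   receiver <-OPCODE_TIDS_RELEASE----------------- sender
+ *   receiver --OPCODE_TIDS_RELEASE_CONFIRM--------> ips_protoexp_tid_release_ack()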
+ */ + +void __fastpath +ips_protoexp_tid_grant(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_tid_session_list *tid_list; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + uint32_t paylen, msglen; + uint32_t reqidx; + psmi_seqnum_t flowgenseq; + psm_error_t err = PSM_OK; + psm_mq_req_t req; + ptl_arg_t args[3]; + uint8_t index, seqno; + + paylen = ips_recvhdrq_event_paylen(rcv_ev); + tid_list = (ips_tid_session_list *) ips_recvhdrq_event_payload(rcv_ev); + reqidx = p_hdr->data[0].u32w0; + msglen = p_hdr->data[0].u32w1; + flowgenseq.val = p_hdr->data[1].u32w0; + + /* Increment grant received stats for endpoint */ + ipsaddr->stats.tids_grant_recv++; + index = tid_list->tsess_seqno % sizeof(req->tid_grant); + seqno = tid_list->tsess_seqno / sizeof(req->tid_grant); + + req = psmi_mpool_find_obj_by_index(protoexp->tid_sreq_pool, reqidx); + + if (req) { + _IPATH_VDBG("req=%p (%d) wait=%s req_seqno=%d pkt_len=%d, seqno=%d, msglen=%d\n", + req, reqidx, req->type & MQE_TYPE_WAITING ? "yes" : "no", + req->recv_msgoff, paylen, tid_list->tsess_seqno, msglen); + } + + /* We use recv_msgoff to track the latest receive sequence number */ + + if (req == NULL) { + /* Not found, bogus req, ack it anyway */ + } + else if (seqno < req->tid_grant[index]) { + /* dupe, ack it */ + } + else if (seqno > req->tid_grant[index]) { + /* lost tidreq, wait for rexmit */ + /* XXX count this to see if it's worth handling instead of dropping */ + goto no_ack; + } + else { + req->tid_grant[index]++; + /* Safe to keep updating every time */ + req->send_msglen = msglen; + if ((err = ips_tid_send_handle_tidreq(protoexp, req, msglen, 0, ipsaddr, flowgenseq, tid_list, paylen)) != PSM_OK) + { + ips_logevent(rcv_ev->proto, tid_send_reqs, ipsaddr); + /* Out of send reqs, wait for rexmit */ + goto no_ack; + } + req->recv_msgoff = tid_list->tsess_seqno + 1; + rcv_ev->proto->psmi_logevent_tid_send_reqs.next_warning = 0; + } + + /* At this point we can ack the request */ + args[0] = tid_list->tsess_descid; + + ips_proto_send_ctrl_message(&ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_GRANT_ACK, + &ipsaddr->ctrl_msg_queued, args); + +no_ack: + return; +} + +void __fastpath +ips_protoexp_tid_grant_ack(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ips_tid_recv_desc *tidrecvc; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) /* dupe or gone, drop it */ + return; + + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + _IPATH_VDBG("desc_req:id=%d,gen=%d desc_tidc:id=%d,gen=%d\n", + desc_id._desc_idx, desc_id._desc_genc, + desc_tidrecvc._desc_idx, desc_tidrecvc._desc_genc); + + if (desc_tidrecvc.u64 == desc_id.u64 && + tidrecvc->state == TIDRECVC_STATE_GRANT) + { + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + tidrecvc->state = TIDRECVC_STATE_GRANT_ACK; + } + return; +} + +void +__fastpath +ips_protoexp_recv_unaligned_data(struct ips_recvhdrq_event *rcv_ev) +{ + + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + struct ptl_epaddr *ipsaddr = rcv_ev->ipsaddr; + uint32_t tid_recv_sessid; + struct ips_tid_recv_desc 
*tidrecvc; + ptl_arg_t desc_id = rcv_ev->p_hdr->data[0]; + int i; + uint8_t *byte_index = (uint8_t *) &p_hdr->data[1].u32w0; + uint8_t *buffer; + + if (!ips_proto_is_expected_or_nak(rcv_ev)) goto process_ack; + + psmi_assert(p_hdr->flags & (IPS_SEND_FLAG_UNALIGNED_DATA | IPS_SEND_FLAG_ACK_REQ)); + + tid_recv_sessid = desc_id._desc_idx; + tidrecvc = + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + tid_recv_sessid); + + if_pf (tidrecvc == NULL) { + _IPATH_ERROR("No tidrecv session with index %d\n", + tid_recv_sessid); + goto process_ack; + } + + if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) { + _IPATH_ERROR("Expected packet to tid session %d, now %d instead " + "of %d; skipping\n", tid_recv_sessid, + psmi_mpool_get_obj_gen_count(tidrecvc), + desc_id._desc_genc); + goto process_ack; /* skip */ + } + + psmi_assert(p_hdr->hdr_dlen == + (tidrecvc->tid_list.tsess_unaligned_start + tidrecvc->tid_list.tsess_unaligned_end)); + + /* Cancel tid grant timer (if still active) */ + if (tidrecvc->num_recv_hdrs++ == 0) + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + + buffer = tidrecvc->buffer; + for (i = 0; i < tidrecvc->tid_list.tsess_unaligned_start; i++) + *buffer++ = *byte_index++; + + buffer = + (uint8_t *) tidrecvc->buffer + tidrecvc->recv_msglen - + tidrecvc->tid_list.tsess_unaligned_end; + byte_index = (uint8_t *)&p_hdr->data[1].u32w1; + + for (i = 0; i < tidrecvc->tid_list.tsess_unaligned_end; i++) + *buffer++ = *byte_index++; + + /* If packet has checksum for window cache it */ + if (p_hdr->flags & IPS_SEND_FLAG_HAS_CKSUM) { + uint32_t *cksum = (uint32_t*) ips_recvhdrq_event_payload(rcv_ev); + + psmi_assert_always(protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM); + psmi_assert_always(ips_recvhdrq_event_payload(rcv_ev)); + psmi_assert_always(ips_recvhdrq_event_paylen(rcv_ev)); + tidrecvc->cksum = *cksum; + } + +process_ack: + ips_proto_process_ack(rcv_ev); + /* May require ACK for this packet. 
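+ * The flow to ACK on is taken from the packet header itself, via
+ * ips_proto_flowid(p_hdr) in the call below.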
*/
+    if (p_hdr->flags & IPS_SEND_FLAG_ACK_REQ)
+        ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq,
+                           &ipsaddr->flows[ips_proto_flowid(p_hdr)]);
+
+    return;
+}
+
+void
+__fastpath
+ips_protoexp_data(struct ips_recvhdrq_event *rcv_ev)
+{
+    struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+    struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+    uint32_t tid_recv_sessid;
+    struct ips_tid_recv_desc *tidrecvc;
+    ptl_arg_t desc_id = rcv_ev->p_hdr->data[0];
+    ptl_arg_t send_descid = rcv_ev->p_hdr->data[1];
+    uint32_t paylen;
+    psmi_seqnum_t sequence_num, expected_sequence_num;
+    uint32_t has_hw_hdrsupp = (protoexp->ptl->context->runtime_flags & IPATH_RUNTIME_HDRSUPP);
+    ptl_arg_t args[3];
+
+    paylen = ips_recvhdrq_event_paylen(rcv_ev);
+    tid_recv_sessid = desc_id._desc_idx;
+    tidrecvc =
+        psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+                                     tid_recv_sessid);
+
+    if_pf (tidrecvc == NULL) {
+        _IPATH_ERROR("No tidrecv session with index %d\n",
+                     tid_recv_sessid);
+        return;
+    }
+
+    if_pf (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER && paylen != 0) {
+        _IPATH_ERROR("Expected packet, but eager index is set; skipping\n");
+        return;
+    }
+
+    if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) {
+        _IPATH_ERROR("Expected packet to tid session %d, now %d instead "
+                     "of %d; skipping\n", tid_recv_sessid,
+                     psmi_mpool_get_obj_gen_count(tidrecvc),
+                     desc_id._desc_genc);
+        return; /* skip */
+    }
+
+    sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+    expected_sequence_num = tidrecvc->tidflow_genseq;
+
+    /* On QLE73XX this is only called if data was fully received or the ACK
+     * interval was reached; otherwise the gen/seq error handlers are called
+     * from ips_proto_recv.
+     */
+    if (has_hw_hdrsupp) {
+
+        /* Drop packet if generation number does not match */
+        if (expected_sequence_num.gen != sequence_num.gen)
+            return;
+
+        /* Increment the expected sequence number taking into account the
+         * number of headers that were suppressed.
+         */
+        expected_sequence_num.seq += (protoexp->hdr_pkt_interval - 1);
+
+        /* Special case for the last packet, as it may be less than the
+         * interval. */
+        if (p_hdr->flags & IPS_SEND_FLAG_EXPECTED_DONE)
+            expected_sequence_num = sequence_num;
+
+        /* TIDFLOW will restart in the if block below */
+        if_pf (sequence_num.psn != expected_sequence_num.psn) {
+            _IPATH_EPDBG("Expected: Packet PSN %d received but expecting %d. Restarting flow.\n", sequence_num.psn, expected_sequence_num.psn);
+        }
+
+    }
+
+    /* IBTA CCA handling for expected flow.
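+     * A rough summary of the block below: an expected packet that arrives
+     * with the FECN event set marks the receiver's tidgr_flow with
+     * IPS_FLOW_FLAG_GEN_BECN, so a subsequent control packet (e.g. the ACK
+     * at the end of this function) carries a BECN back to the sender; the
+     * congestion_pkts counter is bumped for stats and the FECN event is
+     * cleared.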
*/ + if (rcv_ev->is_congested & IPS_RECV_EVENT_FECN) { + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + /* Update stats for congestion encountered */ + rcv_ev->ipsaddr->stats.congestion_pkts++; + /* Clear FECN event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + } + + if_pf (sequence_num.psn != expected_sequence_num.psn) { + psmi_assert(sequence_num.flow == tidrecvc->tidflow_idx); + psmi_assert(sequence_num.flow == tidrecvc->tidflow_genseq.flow); + + /* Generation mismatch */ + if (sequence_num.gen != tidrecvc->tidflow_genseq.gen) + return ips_protoexp_handle_tf_generr(rcv_ev); + + /* Sequence mismatch error */ + return ips_protoexp_handle_tf_seqerr(rcv_ev); + } + else { + + /* Update the shadow tidflow_genseq */ + tidrecvc->tidflow_genseq.seq = sequence_num.seq + 1; + + /* On QLE71XX/QLE72XX update tidflow table in software */ + if (!has_hw_hdrsupp) + ipath_tidflow_set_entry(tidrecvc->context->ctrl, + tidrecvc->tidflow_idx, + tidrecvc->tidflow_genseq.gen, + tidrecvc->tidflow_genseq.seq); + + /* Reset the swapped generation count as we received a valid packet */ + tidrecvc->tidflow_nswap_gen = 0; + } + + /* Do some sanity checking */ + psmi_assert_always(((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3) == 0); + psmi_assert_always(tidrecvc->state != TIDRECVC_STATE_DONE); + + /* If first packet received cancel tid grant timer */ + if (tidrecvc->num_recv_hdrs++ == 0) + psmi_timer_cancel(protoexp->timerq, &tidrecvc->timer_tidreq); + + /* If last packet we can close the tidflow. + * We can deallocate tidflow even if the unaligned data has not been + * received. The TID_RELEASE message will deallocate the receive + * descriptor. + * + * Note: If we were out of tidflows this will invoke the callback to + * schedule pending transfers. + */ + + if (p_hdr->flags & IPS_SEND_FLAG_EXPECTED_DONE) { + + psm_error_t ret = PSM_OK; + + /* Acquire lock before updating state (ERR_CHK_GEN also tests for + * state before responding. + */ + + ips_ptladdr_lock(rcv_ev->ipsaddr); + + /* Mark receive as done */ + tidrecvc->state = TIDRECVC_STATE_DONE; + + ret = ips_tf_deallocate(&protoexp->tfctrl, + tidrecvc->tidflow_idx); + psmi_assert_always (ret == PSM_OK); + + /* Release lock */ + ips_ptladdr_unlock(rcv_ev->ipsaddr); + } + + /* Respond with an ACK if sender requested one or incoming flow faced + * congestion. The ACK in this case will have the BECN bit set. + */ + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (tidrecvc->ipsaddr->tidgr_flow.flags & IPS_FLOW_FLAG_GEN_BECN)) { + + /* Ack sender with descriptor index */ + args[0] = send_descid; + args[1] = tidrecvc->tid_list.tsess_descid; + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + OPCODE_ACK, + &tidrecvc->ctrl_msg_queued, args); + } + + return; +} + +#ifndef PSM_DEBUG +# define ips_dump_tids(tid_list,msg,...) +#else +static +void +ips_dump_tids(ips_tid_session_list *tid_list, const char *msg, ...) +{ + char buf[256]; + size_t off = 0; + int i, num_tids = tid_list->tsess_tidcount; + + va_list argptr; + va_start(argptr, msg); + off += vsnprintf(buf, sizeof buf - off, msg, argptr); + va_end(argptr); + + for (i = 0; i < num_tids && off < (sizeof buf - 1); i++) + off += snprintf(buf + off, sizeof buf - off, "%d%s", + (int) tid_list->tsess_list[i].tid, + i < num_tids-1 ? 
"," : ""); + + _IPATH_VDBG("%s\n", buf); + return; +} +#endif + +static +void +ips_expsend_tiderr(struct ips_tid_send_desc *tidsendc) +{ + char buf[256]; + size_t off = 0; + int i; + + off += snprintf(buf + off, sizeof buf - off, + "Remaining bytes: %d Member id %d is not in tid_session_id=%d :", tidsendc->remaining_bytes, tidsendc->tid_idx, + tidsendc->tid_list.tsess_descid._desc_idx); + + for (i = 0; i < tidsendc->tid_list.tsess_tidcount+1; i++) + off += snprintf(buf + off, sizeof buf - off, "%d,", + tidsendc->tid_list.tsess_list[i].tid); + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Trying to use tid idx %d and there are %d members: %s\n", + tidsendc->tid_idx, tidsendc->tid_list.tsess_tidcount, buf); + return; +} + +void +ips_protoexp_scb_inflight(ips_scb_t *scb) +{ + if (scb->tidsendc) + scb->tidsendc->iovec_cntr_last = scb->dma_ctr; + return; +} + +static +void __fastpath +ips_tid_send_tid_release_msg(struct ips_tid_send_desc *tidsendc) +{ + psm_error_t err; + struct ips_protoexp *protoexp = tidsendc->protoexp; + psm_mq_req_t req = tidsendc->mqreq; + ptl_arg_t desc_id[3] = {}; + uint64_t t_cyc; + + desc_id[0] = tidsendc->tid_list.tsess_descid; + desc_id[1] = tidsendc->descid; + desc_id[2].u32w0 = tidsendc->release_cnt; + + err = ips_proto_send_ctrl_message(&tidsendc->ipsaddr-> + flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE, + &tidsendc->ctrl_msg_queued, + desc_id); + + if (err != PSM_EP_NO_RESOURCES) { + tidsendc->release_cnt++; + t_cyc = get_cycles() + protoexp->tid_to_cyc_min; + } + else + t_cyc = get_cycles() + protoexp->proto->timeout_send; + + psmi_timer_request_always(protoexp->timerq, &tidsendc->timer_tidrelease, + t_cyc); + + req->send_msgoff += tidsendc->length; + + _IPATH_VDBG("[rndv][send] tid chunk of size %d done %d/%d for req=%p%s\n", + tidsendc->length, req->send_msgoff, req->send_msglen, req, + req->send_msgoff == req->send_msglen ? 
" (complete)" : ""); + + if (req->send_msgoff == req->send_msglen) + psmi_mq_handle_rts_complete(req); +} + +static +int __fastpath +ips_tid_send_completion_unaligned_callback(void * param, uint32_t nbytes) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *) param; + + /* Decrement completion counter and complete if unaligned data sent */ + tidsendc->completion_counter--; + + psmi_assert(tidsendc->completion_counter >= 0); + + if (tidsendc->completion_counter == 0) + ips_tid_send_tid_release_msg(tidsendc); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +int __fastpath +ips_tid_send_completion_callback(void * param, uint32_t nbytes) +{ + struct ips_tid_send_desc *tidsendc = (struct ips_tid_send_desc *) param; + struct ips_protoexp *protoexp = tidsendc->protoexp; + + if (protoexp->tid_xfer_type == PSM_TRANSFER_DMA) + ips_proto_dma_wait_until(protoexp->proto, tidsendc->iovec_cntr_last); + + if (tidsendc->bounce_buf) psmi_free(tidsendc->bounce_buf); + + /* Decrement completion counter and complete if unaligned data sent */ + tidsendc->completion_counter--; + + psmi_assert(tidsendc->completion_counter >= 0); + + if (tidsendc->completion_counter == 0) + ips_tid_send_tid_release_msg(tidsendc); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +psm_error_t __fastpath +ips_tid_release_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_send_desc *tidsendc = + (struct ips_tid_send_desc *) timer->context; + struct ips_protoexp *protoexp = tidsendc->protoexp; + uint64_t t_cyc; + psm_error_t err; + ptl_arg_t desc_id[3] = {}; + + /* 0 contain's the receiver's desc_id, 1 contains the sender's desc_id */ + desc_id[0] = tidsendc->tid_list.tsess_descid; + desc_id[1] = tidsendc->descid; + desc_id[2].u32w0 = tidsendc->release_cnt; + + err = ips_proto_send_ctrl_message(&tidsendc->ipsaddr-> + flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE, + &tidsendc->ctrl_msg_queued, + desc_id); + + if (err == PSM_EP_NO_RESOURCES) { + t_cyc = get_cycles() + protoexp->proto->timeout_send; + } + else { + tidsendc->release_cnt++; + protoexp->tid_release_resends++; + t_cyc = get_cycles() + + min(tidsendc->release_cnt * protoexp->tid_to_cyc_min, + protoexp->tid_to_cyc_max); + } + + psmi_timer_request_always(protoexp->timerq, + &tidsendc->timer_tidrelease, + t_cyc); + + return PSM_OK; +} + +static +psm_error_t __fastpath +ips_tid_grant_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_tid_recv_desc *tidrecvc = + (struct ips_tid_recv_desc *) timer->context; + struct ips_protoexp *protoexp = tidrecvc->protoexp; + ips_epaddr_t *ipsaddr = tidrecvc->ipsaddr; + psm_error_t err; + uint64_t t_cyc; + + err = ips_proto_send_ctrl_message(&ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_GRANT, + &tidrecvc->ctrl_msg_queued, + &tidrecvc->tid_list); + + if (err == PSM_EP_NO_RESOURCES) { + t_cyc = get_cycles() + protoexp->proto->timeout_send; + } + else { + tidrecvc->grant_cnt++; + protoexp->tid_grant_resends++; + t_cyc = get_cycles() + + min(tidrecvc->grant_cnt * protoexp->tid_to_cyc_min, + protoexp->tid_to_cyc_max); + } + + psmi_timer_request_always(protoexp->timerq, timer, t_cyc); + + return PSM_OK; +} + +static +__fastpath +psm_error_t +ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, + psm_mq_req_t req, uint32_t msglen, + int flags, ptl_epaddr_t *ipsaddr, + psmi_seqnum_t flowgenseq, + ips_tid_session_list *tid_list, + uint32_t tid_list_size) +{ + struct ips_tid_send_desc *tidsendc; + req->send_msglen = msglen; + + psmi_assert(tid_list_size >= 
                sizeof(ips_tid_session_list));
+    psmi_assert(tid_list_size <= 2096);
+
+    tidsendc = (struct ips_tid_send_desc *)
+        psmi_mpool_get(protoexp->tid_desc_send_pool);
+    if (tidsendc == NULL)
+        return PSM_EP_NO_RESOURCES;
+
+    tidsendc->protoexp = protoexp;
+
+    /* Uniquely identify this send descriptor in space and time */
+    tidsendc->descid._desc_idx = psmi_mpool_get_obj_index(tidsendc);
+    tidsendc->descid._desc_genc = psmi_mpool_get_obj_gen_count(tidsendc);
+
+    psmi_mq_mtucpy(&tidsendc->tid_list, tid_list, tid_list_size);
+    tid_list = &tidsendc->tid_list;
+
+    tidsendc->length = tid_list->tsess_length;
+    tidsendc->ipsaddr = ipsaddr;
+    tidsendc->mqreq = req;
+    tidsendc->bounce_buf = NULL;
+    tidsendc->buffer =
+        (void *)((uintptr_t)req->buf + tid_list->tsess_srcoff);
+    tidsendc->tid_idx = 0;
+    tidsendc->is_complete = 0;
+    tidsendc->release_cnt = 0;
+
+    /* Initialize tidflow for window. Use path requested by remote endpoint */
+    ips_flow_init(&tidsendc->tidflow, NULL, ipsaddr, protoexp->tid_xfer_type,
+                  PSM_PROTOCOL_TIDFLOW, IPS_PATH_LOW_PRIORITY, 0);
+
+    tidsendc->tidflow.xmit_seq_num = flowgenseq;
+    tidsendc->tidflow.xmit_ack_num = flowgenseq;
+    tidsendc->tidflow.xmit_ack_num.seq--; /* last acked */
+    tidsendc->ctrl_msg_queued = 0;
+    tidsendc->completion_counter = 1;
+
+    /* If there is unaligned data we will need to send a separate packet
+     * containing the unaligned data.
+     */
+    if ((tidsendc->tid_list.tsess_unaligned_start) ||
+        (tidsendc->tid_list.tsess_unaligned_end) ||
+        (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM))
+        tidsendc->completion_counter += 1;
+
+    if (tid_list->tsess_tidcount == 0) {
+        _IPATH_VDBG("no tids used, alloc eager tid\n");
+        tid_list->tsess_list[0].tid = IPATH_EAGER_TID_ID;
+        tid_list->tsess_list[0].length = 0;
+        tid_list->tsess_list[0].offset = 0;
+    }
+
+    tidsendc->frame_send = 0;
+    tidsendc->remaining_bytes = tid_list->tsess_length;
+    tidsendc->remaining_bytes_in_page =
+        tid_list->tsess_list[0].length;
+    tidsendc->offset = tid_list->tsess_list[0].offset;
+    tidsendc->unaligned_sent = 0;
+
+    psmi_timer_entry_init(&tidsendc->timer_tidrelease,
+                          ips_tid_release_timer_callback, tidsendc);
+
+    _IPATH_EXP("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d,s=%d,e=%d\n",
+               tidsendc->descid._desc_idx, tid_list->tsess_descid._desc_idx,
+               tid_list->tsess_srcoff, tid_list->tsess_length,
+               tid_list->tsess_unaligned_start,
+               tid_list->tsess_unaligned_end
+               );
+
+    /* We have no tids, we're expected to stuff everything in user
+     * header words, so mark it as an eager packet */
+    if (tid_list->tsess_tidcount > 0) {
+        ips_dump_tids(&tidsendc->tid_list,
+                      "Received %d tids: ", tidsendc->tid_list.tsess_tidcount);
+    }
+
+    /* Add as a pending op and ring up the timer */
+    STAILQ_INSERT_TAIL(&protoexp->pend_sendq, tidsendc, next);
+    psmi_timer_request(protoexp->timerq, &protoexp->timer_send, PSMI_TIMER_PRIO_1);
+
+    /* Consider breaking out of progress engine here */
+    return PSM_OK;
+}
+
+void __fastpath
+ips_protoexp_tid_release_ack(const struct ips_recvhdrq_event *rcv_ev)
+{
+    struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+    struct ips_tid_send_desc *tidsendc;
+    ptl_arg_t desc_id = rcv_ev->p_hdr->data[1];
+
+    tidsendc = (struct ips_tid_send_desc *)
+        psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool,
+                                     desc_id._desc_idx);
+    _IPATH_VDBG("desc_id=%d (%p)\n", desc_id._desc_idx, tidsendc);
+    if (tidsendc == NULL) {
+        _IPATH_ERROR("OPCODE_TIDS_RELEASE_CONFIRM ERROR: Index %d is out of range\n",
+                     desc_id._desc_idx);
+    }
+    else {
+        ptl_arg_t desc_tidsendc;
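+
+        /* A sketch of the liveness check that follows: repack this
+         * descriptor's current {index, generation} into a ptl_arg and
+         * compare it, as one 64-bit value, against the id echoed back by
+         * the peer; a stale (since reused) descriptor differs in the
+         * generation half and is ignored. */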
+        psmi_mpool_get_obj_index_gen_count(tidsendc,
+                                           &desc_tidsendc._desc_idx,
+                                           &desc_tidsendc._desc_genc);
+
+        _IPATH_VDBG("desc_req:id=%d,gen=%d desc_sendc:id=%d,gen=%d\n",
+                    desc_id._desc_idx, desc_id._desc_genc,
+                    desc_tidsendc._desc_idx, desc_tidsendc._desc_genc);
+
+        /* See if the reference is still live and valid */
+        if (desc_tidsendc.u64 == desc_id.u64) {
+            psmi_timer_cancel(protoexp->timerq, &tidsendc->timer_tidrelease);
+            psmi_timer_cancel(rcv_ev->proto->timerq,
+                              &tidsendc->tidflow.timer_send);
+            psmi_timer_cancel(rcv_ev->proto->timerq,
+                              &tidsendc->tidflow.timer_ack);
+            psmi_mpool_put(tidsendc);
+        }
+    }
+    return;
+}
+
+static
+psm_error_t __fastpath
+ips_scb_send_unaligned_data(ips_scb_t *scb)
+{
+    struct ips_tid_send_desc *tidsendc = scb->tidsendc;
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    uint8_t *bufptr = tidsendc->buffer;
+    int frame_extra, i;
+    uint8_t *packptr;
+    uint8_t *unptr_beg = bufptr;
+    uint8_t *unptr_end = bufptr + tidsendc->length -
+        tidsendc->tid_list.tsess_unaligned_end;
+    struct ips_flow *flow = &tidsendc->ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO];
+
+    psmi_assert(tidsendc->tid_idx == 0);
+
+    /* arg[0] is recv descriptor id */
+    scb->ips_lrh.data[0] = tidsendc->tid_list.tsess_descid;
+
+    if (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM) {
+        uint32_t cksum = 0xffffffff;
+
+        if (!ips_scbctrl_bufalloc(scb)) {
+            ips_scbctrl_free(scb);
+            return PSM_EP_NO_RESOURCES;
+        }
+
+        cksum = ips_crc_calculate(tidsendc->length,
+                                  (uint8_t*) tidsendc->buffer, cksum);
+        *(uint32_t*) ips_scb_buffer(scb) = cksum;
+        ips_scb_length(scb) = sizeof(cksum);
+        scb->flags |= IPS_SEND_FLAG_HAS_CKSUM;
+    }
+
+    // Make sure not to over-read the unaligned buffer
+    packptr = (uint8_t *)&scb->ips_lrh.data[1].u32w0;
+    for (i = 0; i < tidsendc->tid_list.tsess_unaligned_start; i++)
+        packptr[i] = unptr_beg[i];
+
+    packptr = (uint8_t *)&scb->ips_lrh.data[1].u32w1;
+    for (i = 0; i < tidsendc->tid_list.tsess_unaligned_end; i++)
+        packptr[i] = unptr_end[i];
+
+    ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_EXPTID_UNALIGNED;
+    ips_scb_hdr_dlen(scb) = tidsendc->tid_list.tsess_unaligned_start +
+        tidsendc->tid_list.tsess_unaligned_end;
+
+    ips_scb_cb(scb) = ips_tid_send_completion_unaligned_callback;
+    ips_scb_cb_param(scb) = tidsendc;
+    scb->flags |= IPS_SEND_FLAG_UNALIGNED_DATA | IPS_SEND_FLAG_ACK_REQ;
+
+    bufptr += tidsendc->tid_list.tsess_unaligned_start;
+    frame_extra = tidsendc->tid_list.tsess_unaligned_start +
+        tidsendc->tid_list.tsess_unaligned_end;
+
+    tidsendc->remaining_bytes -= frame_extra;
+
+    tidsendc->buffer = bufptr;
+
+    /* Enqueue scb on the flow and flush */
+    flow->fn.xfer.enqueue(flow, scb);
+    flow->fn.xfer.flush(flow, NULL);
+
+    return PSM_OK;
+}
+
+static
+ips_scb_t * __fastpath
+ips_scb_prepare_tid_sendctrl(struct ips_flow *flow,
+                             struct ips_tid_send_desc *tidsendc)
+{
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    uint8_t *bufptr = tidsendc->buffer;
+    uint16_t frame_len, frag_size, nfrag;
+    int payload_size, idx;
+    ips_scb_t *scb;
+
+    if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL)
+        return NULL;
+
+    /*
+     * Expected sends require 4-byte alignment, so we stuff whatever
+     * misalignment into the header's available user bytes.
+     *
+     * In the current interface, misalignment can only occur at the
+     * start or end of the packet, so we handle it as a special packet
+     * before the first packet can be sent off.
+     *
+     * If checksum is enabled we send the checksum for the send window
+     * within/as an unaligned packet as well.
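+     *
+     * Worked example (assumed address/length): sending 4101 bytes from an
+     * address with (addr & 3) == 1 gives tsess_unaligned_start = 3 and
+     * tsess_unaligned_end = (4101 - 3) & 3 = 2; those 5 bytes travel in
+     * data[1].u32w0/u32w1 of the UNALIGNED packet's header, leaving a
+     * 4096-byte, 4-byte-aligned payload for the tid path.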
+ */ + + if (tidsendc->length && + (tidsendc->tid_list.tsess_unaligned_start || + tidsendc->tid_list.tsess_unaligned_end || + (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM)) && + !(tidsendc->unaligned_sent)) { + + /* Send unaligned data separately over ipsaddr->flow. Completion over + * both flows is synchronized to generate TIDS_RELEASE. The receive will + * only finish when tid release is received. */ + scb->tidsendc = tidsendc; + if (ips_scb_send_unaligned_data(scb) != PSM_OK) + return NULL; + + /* Sent unaligned data */ + tidsendc->unaligned_sent = 1; + + + /* Buffer may have been updated (unaligned start) */ + bufptr = tidsendc->buffer; + + /* Try to obtain another scb after sending unaligned data */ + if ((scb = ips_scbctrl_alloc(&protoexp->tid_scbc_rv, 1, 0, 0)) == NULL) + return NULL; + } + + if ((uintptr_t)bufptr & 0x3) { + bufptr = psmi_malloc(protoexp->proto->ep, + UNDEFINED, tidsendc->remaining_bytes); + if (!bufptr) { + ips_scbctrl_free(scb); + return NULL; + } + + memcpy(bufptr, tidsendc->buffer, tidsendc->remaining_bytes); + tidsendc->buffer = tidsendc->bounce_buf = bufptr; + } + + idx = tidsendc->tid_idx; + scb->tidsendc = tidsendc; + SLIST_NEXT(scb,next) = NULL; + + scb->ips_lrh.sub_opcode = OPCODE_SEQ_MQ_EXPTID; + scb->ips_lrh.data[0] = tidsendc->tid_list.tsess_descid; + scb->ips_lrh.data[1] = tidsendc->descid; + scb->tid = tidsendc->tid_list.tsess_list[idx].tid; + scb->tsess = (void *)&tidsendc->tid_list.tsess_list[idx]; + scb->offset = tidsendc->offset; + scb->payload = (void *) bufptr; + + /* + * Loop over the tid session list, count the frag number and payload size. + * The payload size is limited by the pbc.length field which is 16 bits in + * DWORD, including both message header and payload. This translates to + * less than 256K payload. So 128K is used. + */ + nfrag = 0; + payload_size = 0; + frag_size = min(protoexp->tid_send_fragsize, flow->path->epr_mtu); + frame_len = min(tidsendc->remaining_bytes_in_page, frag_size); + while (1) { + nfrag++; + payload_size += frame_len; + + /* adjust counter and pointers */ + tidsendc->remaining_bytes -= frame_len; + tidsendc->remaining_bytes_in_page -= frame_len; + tidsendc->offset += frame_len; + + if (!tidsendc->remaining_bytes_in_page) { + /* Done with this page, move on to the next tid */ + tidsendc->tid_idx++; + tidsendc->remaining_bytes_in_page = + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].length; + tidsendc->offset = + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].offset; + + /* The payload size is limited by the pbc.length field which + * is 16 bits in DWORD, including both message header and + * payload. This translates to less than 256K payload. So 128K + * is used. 
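+             *
+             * The arithmetic behind the cap: pbc.length is a 16-bit count
+             * of DWORDs, so at most 65535 * 4 = 262140 bytes including the
+             * message header; capping payload at 131072 (128K) leaves
+             * ample headroom.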
+             */
+            /* break when current page is done */
+            if (payload_size > 131072) break;
+        }
+
+#if 0
+        if (1) {
+#else
+        if (flow->transfer == PSM_TRANSFER_PIO) {
+#endif
+            break; /* turn on to use single frag-size packet */
+        }
+
+        if (!tidsendc->remaining_bytes) break;
+        frame_len = min(tidsendc->remaining_bytes_in_page, frag_size);
+    }
+    scb->nfrag = nfrag;
+    scb->frag_size = frag_size;
+    scb->payload_size = payload_size;
+    scb->tsess_length = sizeof(ips_tid_session_member) *
+        (tidsendc->tid_idx - idx);
+
+    /* Keep track of the latest buffer location so we restart at the
+     * right location if we don't complete the transfer */
+    tidsendc->buffer = bufptr + payload_size;
+
+    /* If last packet, we want a completion notification */
+    if (!tidsendc->remaining_bytes) {
+        scb->flags = (IPS_SEND_FLAG_ACK_REQ | IPS_SEND_FLAG_EXPECTED_DONE);
+        scb->callback = ips_tid_send_completion_callback;
+        scb->cb_param = tidsendc;
+
+        tidsendc->is_complete = 1;
+    } else {
+        scb->flags = IPS_SEND_FLAG_HDR_SUPPRESS;
+        scb->callback = NULL;
+        scb->cb_param = NULL;
+    }
+
+#if 0
+    if (1) {
+#else
+    if (flow->transfer == PSM_TRANSFER_PIO) {
+#endif
+        /* turn on to use single frag-size packet */
+        /* Do not suppress the header every hdr_pkt_interval packets, nor for
+         * the last packet */
+        if ((++tidsendc->frame_send % protoexp->hdr_pkt_interval) == 0) {
+            scb->flags &= ~IPS_SEND_FLAG_HDR_SUPPRESS;
+            scb->flags |= IPS_SEND_FLAG_ACK_REQ; /* Request an ACK */
+        }
+    }
+
+    return scb;
+}
+
+/*
+ * Returns:
+ *
+ * PSM_OK: scb was allocated for at least one frame, the packet may be queued
+ *         or actually sent.
+ *
+ * PSM_OK_NO_PROGRESS: Reached a limit on the maximum number of sends we allow
+ *                     to be enqueued before polling the receive queue.
+ *
+ * PSM_EP_NO_RESOURCES: No scbs available; a callback will be issued when more
+ *                      scbs become available.
+ *
+ * PSM_TIMEOUT: PIO-busy or DMA-busy, stop trying to send for now.
+ *
+ */
+
+psm_error_t __fastpath
+ips_tid_send_exp(struct ips_tid_send_desc *tidsendc)
+{
+    ips_scb_t *scb = NULL;
+    psm_error_t err = PSM_OK, err_f;
+    struct ips_protoexp *protoexp = tidsendc->protoexp;
+    struct ips_proto *proto = protoexp->proto;
+    struct ips_flow *flow = &tidsendc->tidflow;
+
+    /*
+     * We aggressively try to grab as many scbs as possible, enqueue them to a
+     * flow and flush them when either we're out of scbs or we've completely
+     * filled the send request.
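+     *
+     * The error mapping after the flush below follows the contract stated
+     * above this function: a flush that fails with PSM_EP_NO_RESOURCES
+     * (no PIO/DMA room) is surfaced as PSM_TIMEOUT so the caller
+     * reschedules, keeping PSM_EP_NO_RESOURCES reserved for running out
+     * of scbs.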
+ */ + while (!tidsendc->is_complete) + { + if_pf (tidsendc->tid_list.tsess_tidcount && + (tidsendc->tid_idx >= tidsendc->tid_list.tsess_tidcount || + tidsendc->tid_idx < 0) ) + ips_expsend_tiderr(tidsendc); + + if ((scb = ips_scb_prepare_tid_sendctrl(flow, tidsendc)) == NULL) { + proto->stats.scb_exp_unavail_cnt++; + err = PSM_EP_NO_RESOURCES; + break; + } + else { + flow->fn.xfer.enqueue(flow, scb); + } + } + + if (!SLIST_EMPTY(&flow->scb_pend)) { /* Something to flush */ + int num_sent; + err_f = flow->fn.xfer.flush(flow, &num_sent); + + if (err != PSM_EP_NO_RESOURCES) { + /* PSM_EP_NO_RESOURCES is reserved for out-of-scbs */ + if (err_f == PSM_EP_NO_RESOURCES) + err = PSM_TIMEOUT; /* force a resend reschedule */ + else if (err_f == PSM_OK && num_sent > 0 && + !ips_ptl_recvq_isempty(protoexp->ptl)) + err = PSM_OK_NO_PROGRESS; /* force a rcvhdrq service */ + } + } + + return err; +} + +static +psm_error_t __recvpath +ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) +{ + struct ips_protoexp *protoexp = (struct ips_protoexp *) timer->context; + struct ips_tid_send_pend *phead = &protoexp->pend_sendq; + struct ips_tid_send_desc *tidsendc; + psm_error_t err = PSM_OK; + + while (!STAILQ_EMPTY(phead)) { + tidsendc = STAILQ_FIRST(phead); + + err = ips_tid_send_exp(tidsendc); + + if (tidsendc->is_complete) + STAILQ_REMOVE_HEAD(phead, next); + + if (err == PSM_OK) { + /* Was able to complete the send, keep going */ + +#if 0 + _IPATH_EXP("tidsess=%6d tid=%4d @ %3d size=%4d offset=%4d, next=%p\n", + tidsendc->descid.u32w0, + tidsendc->tid_list.tsess_list[tidsendc->tid_idx].tid, + tidsendc->tid_idx, + tidsendc->length, + tidsendc->length - tidsendc->remaining_bytes, + STAILQ_FIRST(phead) + ); +#endif + } + else if (err == PSM_EP_NO_RESOURCES) { + /* No more sendbufs available, sendbuf callback will requeue this + * timer */ + break; + } + else if (err == PSM_TIMEOUT) { + /* Always a case of try later: + * On PIO flow, means no send pio bufs available + * On DMA flow, means kernel can't queue request or would have to block + */ + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, + get_cycles() + protoexp->proto->timeout_send); + break; + } + else { + /* Forced to reschedule later so we can check receive queue */ + psmi_assert(err == PSM_OK_NO_PROGRESS); + psmi_timer_request(protoexp->proto->timerq, + &protoexp->timer_send, PSMI_TIMER_PRIO_1); + break; + } + } + + return PSM_OK; +} + +// Right now, in the kernel we are allowing for virtually non-contiguous pages, +// in a single call, and we are therefore locking one page at a time, but since +// the intended use of this routine is for a single group of +// virtually contiguous pages, that should change to improve +// performance. That means possibly changing the calling MPI code. +// Doing so gets rid of some of the loop stuff here, and in the driver, +// and allows for a single call to the core VM code in the kernel, +// rather than one per page, definitely improving performance. 
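+// As a ballpark (illustrative numbers): with 4KB tid pages, a 128KB
+// rendezvous window means 32 page-at-a-time locking calls per window
+// that could, in principle, collapse into a single call covering the
+// whole virtually contiguous range.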
+
+static
+psm_error_t __fastpath
+ips_tid_recv_alloc_frag(struct ips_protoexp *protoexp,
+			void *buf, uint32_t buflen,
+			ips_tid_session_list *tid_list,
+			uint64_t *ts_map)
+{
+  uint16_t unalignment;
+  uint32_t remaining_buffer_size = buflen;
+  uint32_t num_tids;
+  uint32_t num_tids_avail = ips_tid_num_available(&protoexp->tidc);
+  uint16_t tidids[IPS_TID_MAX_TIDS];
+  void *bufmap;
+  uint8_t *bufptr = (uint8_t *) buf;
+  const uint32_t page_size = ips_tid_page_size(&protoexp->tidc);
+  const uint32_t page_offset_mask = protoexp->tid_page_offset_mask;
+  int i;
+  psm_error_t err = PSM_OK;
+
+  /*
+   * The remaining_buffer_size calculation below does not work when
+   * buflen < 4 and buf is only byte-aligned: it can go negative.
+   * ips_tid_pendtids_timer_callback() therefore tries to avoid making
+   * nbytes_this (which becomes buflen here) just a few bytes.
+   */
+  if (buflen < 4) {
+    tid_list->tsess_unaligned_start = buflen;
+    tid_list->tsess_unaligned_end = 0;
+    remaining_buffer_size = 0;
+  } else {
+    tid_list->tsess_unaligned_start = unalignment =
+      ((uintptr_t) buf & 3) ? (4 - ((uintptr_t) buf & 3)) : 0;
+    remaining_buffer_size -= unalignment;
+    bufptr += unalignment;
+
+    tid_list->tsess_unaligned_end = unalignment =
+      remaining_buffer_size & 3;
+    remaining_buffer_size -= unalignment;
+  }
+
+  bufmap = bufptr;
+  psmi_assert_always(ips_tid_num_required(&protoexp->tidc, bufmap,
+			remaining_buffer_size) <= num_tids_avail);
+
+  tid_list->tsess_list[0].tid = 0;
+  tid_list->tsess_list[0].offset = 0;
+  tid_list->tsess_list[0].length = 0;
+
+  for (i = 0, num_tids = 0; remaining_buffer_size && i < num_tids_avail; i++) {
+    uint32_t page_off = (uintptr_t) bufptr & page_offset_mask;
+    uint32_t page_len = min(remaining_buffer_size, page_size - page_off);
+    tid_list->tsess_list[i].offset = page_off;
+    tid_list->tsess_list[i].length = page_len;
+    bufptr += page_len;
+    remaining_buffer_size -= page_len;
+    tidids[i] = 0; /* Ensure tidids[i] is never seen as uninitialized */
+    num_tids++;
+  }
+  psmi_assert_always(remaining_buffer_size == 0);
+
+  if (num_tids &&
+      (err = ips_tid_acquire(&protoexp->tidc,
+			     (void *) ((uintptr_t) bufmap &
+				       (uintptr_t) protoexp->tid_page_mask),
+			     num_tids, ts_map, tidids)))
+    goto fail;
+
+  tid_list->tsess_tidcount = num_tids;
+  for (i = 0; i < num_tids; i++)
+    tid_list->tsess_list[i].tid = tidids[i];
+
+  ips_dump_tids(tid_list, "Registered %d tids: ", num_tids);
+
+fail:
+  return err;
+}
+
+static
+void
+ips_tid_mpool_tidrecv_callback(void *context)
+{
+  struct ips_protoexp *protoexp = (struct ips_protoexp *) context;
+
+  if (!STAILQ_EMPTY(&protoexp->pend_getreqsq))
+    psmi_timer_request(protoexp->proto->timerq,
+		       &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1);
+
+  return;
+}
+
+static
+__fastpath
+struct ips_tid_recv_desc *
+ips_tid_recv_alloc(struct ips_protoexp *protoexp, ips_epaddr_t *ipsaddr,
+		   const struct ips_tid_get_request *getreq, uint32_t nbytes_this)
+{
+  struct ips_tid_recv_desc *tidrecvc;
+  psm_error_t err = PSM_OK;
+
+  tidrecvc = (struct ips_tid_recv_desc *)
+    psmi_mpool_get(protoexp->tid_desc_recv_pool);
+  if (tidrecvc == NULL)
+    return NULL;
+
+  tidrecvc->context = &protoexp->proto->ep->context;
+  tidrecvc->protoexp = protoexp;
+  tidrecvc->ipsaddr = ipsaddr;
+  tidrecvc->state = TIDRECVC_STATE_GRANT;
+  tidrecvc->buffer =
+    (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset);
+  tidrecvc->num_recv_hdrs = 0;
+  tidrecvc->recv_msglen = nbytes_this;
+  tidrecvc->tid_list.tsess_tidcount = 0;
+  tidrecvc->getreq = (struct ips_tid_get_request *) getreq;
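+
+  /* The rest of the receive descriptor is initialized below: a tidflow is
+   * reserved, seeded with a random 10-bit starting sequence number, and
+   * programmed into the hardware tidflow table before any TIDs are
+   * registered for the window. */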
tidrecvc->grant_cnt = 0; + tidrecvc->recv_framecnt = 0; + tidrecvc->flags = 0; + tidrecvc->tidflow_active_gen = IPS_TF_INVALID_GENERATION; + tidrecvc->ctrl_msg_queued = 0; + tidrecvc->cksum = 0xb5b5b5b5; + tidrecvc->stats.nSeqErr = 0; + tidrecvc->stats.nGenErr = 0; + tidrecvc->stats.nReXmit = 0; + tidrecvc->stats.nErrChkReceived = 0; + + if ((err = ips_tf_allocate(&protoexp->tfctrl, + &tidrecvc->tidflow_idx, + &tidrecvc->tidflow_active_gen))){ + /* Unable to get a tidflow for expected protocol. */ + psmi_mpool_put(tidrecvc); + /* XXX log this event */ + return NULL; + } + + tidrecvc->tidflow_genseq.flow = tidrecvc->tidflow_idx; + tidrecvc->tidflow_genseq.gen = tidrecvc->tidflow_active_gen; + tidrecvc->tidflow_genseq.seq = rand_r(&protoexp->tidflow_seed) & 0x3ff; + + ipath_tidflow_set_entry(tidrecvc->context->ctrl, + tidrecvc->tidflow_genseq.flow, + tidrecvc->tidflow_genseq.gen, + tidrecvc->tidflow_genseq.seq); + + tidrecvc->tidflow_nswap_gen = 0; + tidrecvc->tid_list.tsess_type = IPS_TID_SESSTYPE_MEMBER_LIST; + tidrecvc->tid_list.tsess_tidcount = 0; + tidrecvc->tid_list.tsess_tidlist_length = 0; + tidrecvc->tid_list.tsess_unaligned_start = 0; + tidrecvc->tid_list.tsess_unaligned_end = 0; + + tidrecvc->tid_list.tsess_descid._desc_idx = + psmi_mpool_get_obj_index(tidrecvc); + tidrecvc->tid_list.tsess_descid._desc_genc = + psmi_mpool_get_obj_gen_count(tidrecvc); + + tidrecvc->tid_list.tsess_seqno = getreq->tidgr_desc_seqno; + tidrecvc->tid_list.tsess_srcoff = getreq->tidgr_offset; + tidrecvc->tid_list.tsess_length = nbytes_this; + + psmi_timer_entry_init(&tidrecvc->timer_tidreq, + ips_tid_grant_timer_callback, tidrecvc); + + if (nbytes_this > 0) { + if ((err = ips_tid_recv_alloc_frag(protoexp, tidrecvc->buffer, + nbytes_this, &tidrecvc->tid_list, tidrecvc->ts_map))) { + tidrecvc->tid_list.tsess_tidcount = 0; + ips_tf_deallocate(&protoexp->tfctrl, tidrecvc->tidflow_idx); + psmi_mpool_put(tidrecvc); + /* XXX log me !!! 
*/
+      return NULL;
+    }
+    if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG)
+    {
+      int num_tids = tidrecvc->tid_list.tsess_tidcount;
+      int tid, i;
+      for (i = 0; i < num_tids; i++) {
+	tid = tidrecvc->tid_list.tsess_list[i].tid;
+	psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_FREE);
+	protoexp->tid_info[tid].tid = tid;
+	protoexp->tid_info[tid].state = TIDSTATE_USED;
+	protoexp->tid_info[tid].tidrecvc = tidrecvc;
+      }
+    }
+  }
+
+  /* This gets sent out as a control message, so we need to force 4-byte IB
+   * alignment */
+  tidrecvc->tid_list.tsess_tidlist_length = (uint16_t)
+    PSMI_ALIGNUP((sizeof(ips_tid_session_list) +
+		  (tidrecvc->tid_list.tsess_tidcount *
+		   sizeof(ips_tid_session_member))), 4);
+
+  _IPATH_EXP("alloc tidrecv=%d, ntid=%d, paylen=%d\n",
+	     tidrecvc->tid_list.tsess_descid._desc_idx,
+	     tidrecvc->tid_list.tsess_tidcount,
+	     tidrecvc->tid_list.tsess_tidlist_length);
+
+  return tidrecvc;
+}
+
+static
+psm_error_t __recvpath
+ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current)
+{
+  struct ips_protoexp *protoexp = (struct ips_protoexp *) timer->context;
+  struct ips_tid_get_pend *phead = &protoexp->pend_getreqsq;
+  struct ips_tid_get_request *getreq;
+  struct ips_tid_recv_desc *tidrecvc;
+  uint32_t nbytes_this, leftover;
+  uint64_t t_cyc;
+  uintptr_t bufptr;
+  psm_epaddr_t epaddr;
+  ptl_epaddr_t *ipsaddr;
+  psm_error_t err = PSM_OK;
+
+  while (!STAILQ_EMPTY(phead)) {
+    getreq = STAILQ_FIRST(phead);
+    epaddr = getreq->tidgr_epaddr;
+
+next_epaddr:
+    ipsaddr = epaddr->ptladdr;
+    protoexp = ipsaddr->proto->protoexp;
+    nbytes_this = min(getreq->tidgr_length - getreq->tidgr_offset,
+		      getreq->tidgr_rndv_winsz);
+    /*
+     * If the leftover would be less than half the window size, reduce
+     * nbytes_this by half; we want to avoid sending just a few bytes in a
+     * tid transaction.  For example, with a 128KB window a 160KB request
+     * goes out as 64KB + 96KB rather than 128KB + 32KB.
+     */
+    leftover = getreq->tidgr_length -
+	       (getreq->tidgr_offset + nbytes_this);
+    if (leftover && leftover < getreq->tidgr_rndv_winsz/2) {
+      nbytes_this /= 2;
+    }
+
+    bufptr = (uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset;
+
+    if ((ips_tid_num_required(&protoexp->tidc, (void *) bufptr, nbytes_this) >
+	 ips_tid_num_available(&protoexp->tidc)) ||
+	!ips_tf_available(&protoexp->tfctrl)) {
+      /* We're out of tids/tidflows, tid release will requeue the callback */
+      ;
+    }
+    else if ((tidrecvc = ips_tid_recv_alloc(protoexp, ipsaddr,
+					    getreq, nbytes_this)) != NULL) {
+
+      err = ips_proto_send_ctrl_message(&ipsaddr->
+					flows[protoexp->tid_ep_flow],
+					OPCODE_TIDS_GRANT,
+					&tidrecvc->ctrl_msg_queued,
+					&tidrecvc->tid_list);
+
+      if (err != PSM_EP_NO_RESOURCES) {
+	tidrecvc->grant_cnt++;
+	t_cyc = get_cycles() + protoexp->tid_to_cyc_min;
+      }
+      else
+	t_cyc = get_cycles() + protoexp->proto->timeout_send;
+
+      psmi_timer_request_always(protoexp->timerq,
+				&tidrecvc->timer_tidreq, t_cyc);
+
+      getreq->tidgr_offset += nbytes_this;
+      _IPATH_VDBG("GRANT tididx=%d.%d srcoff=%d nbytes=%d/%d\n",
+		  tidrecvc->tid_list.tsess_descid._desc_idx,
+		  getreq->tidgr_desc_seqno,
+		  getreq->tidgr_offset, nbytes_this, getreq->tidgr_length);
+
+      getreq->tidgr_desc_seqno++;
+      if (getreq->tidgr_offset == getreq->tidgr_length) {
+	getreq->tidgr_protoexp = NULL;
+	getreq->tidgr_epaddr = NULL;
+	STAILQ_REMOVE_HEAD(phead, tidgr_next);
+	continue;
+      }
+      epaddr = epaddr->mctxt_next;
+      goto next_epaddr;
+    }
+    else {
+      /* out of tidrecv descriptors.
The not-empty tidrecv mpool callback will + * cause us to requeue the getreq on the active timer queue */ + ; + } + + epaddr = epaddr->mctxt_next; + if (epaddr != getreq->tidgr_epaddr) goto next_epaddr; + break; + } + return PSM_OK; /* XXX err-broken */ +} + +static +psm_error_t __fastpath +ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) +{ + struct ips_tid_get_request *getreq = tidrecvc->getreq; + struct ips_protoexp *protoexp = tidrecvc->protoexp; + int tidcount = tidrecvc->tid_list.tsess_tidcount; + psm_error_t err = PSM_OK; + + psmi_assert(getreq != NULL); + + /* If checksum is enabled, make sure we have valid data for window */ + if (protoexp->proto->flags & IPS_PROTO_FLAG_CKSUM) { + uint32_t cksum = ips_crc_calculate(tidrecvc->recv_msglen, + (uint8_t*) tidrecvc->buffer, + 0xffffffff); + if (tidrecvc->cksum != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Rendezvous stats: nSeqErr: %d, nGenErr: %d, nReXmits: %d, nErrChkGen: %d. Aborting! \n", tidrecvc->cksum, cksum, __be16_to_cpu(tidrecvc->ipsaddr->tidgr_flow.path->epr_dlid), tidrecvc->stats.nSeqErr, tidrecvc->stats.nGenErr, tidrecvc->stats.nReXmit, tidrecvc->stats.nErrChkReceived); + ips_proto_dump_data(tidrecvc->buffer, tidrecvc->recv_msglen); + + /* TODO: In order to recover from this we need to restart the rendezvous + * window again. This requires modifying the sender to not complete the + * send locally till TID_RELEASE_CONFIRM is released - currently it + * locally completes before sending the TID_RELEASE message. + */ + } + } + + psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_DONE); + + if (tidcount > 0) { + if (protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) + { + int num_tids = tidrecvc->tid_list.tsess_tidcount; + int tid, i; + for (i = 0; i < num_tids; i++) { + tid = tidrecvc->tid_list.tsess_list[i].tid; + psmi_assert(protoexp->tid_info[tid].state == TIDSTATE_USED); + psmi_assert(protoexp->tid_info[tid].tidrecvc == tidrecvc); + protoexp->tid_info[tid].state = TIDSTATE_FREE; + } + } + + ips_dump_tids(&tidrecvc->tid_list, "Deregistered %d tids: ", + tidrecvc->tid_list.tsess_tidcount); + + if ((err = ips_tid_release(&tidrecvc->protoexp->tidc, + tidrecvc->ts_map, tidcount))) + goto fail; + + } + + getreq->tidgr_bytesdone += tidrecvc->recv_msglen; + + _IPATH_EXP("req=%p bytes=%d/%d\n", + getreq->tidgr_ucontext, + getreq->tidgr_bytesdone, + getreq->tidgr_length); + + tidrecvc->state = TIDRECVC_STATE_FREE; + psmi_mpool_put(tidrecvc); + + if (getreq->tidgr_bytesdone == getreq->tidgr_length) { + if (getreq->tidgr_callback) + getreq->tidgr_callback(getreq->tidgr_ucontext); + psmi_mpool_put(getreq); + } + + /* We just released some tids. 
If requests are waiting on tids to be + * freed, queue up the timer */ + if (tidcount > 0) { + if (getreq->tidgr_offset < getreq->tidgr_length) { +#if 0 + psmi_timer_request(getreq->tidgr_protoexp->timerq, + &getreq->tidgr_protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); +#endif + ips_tid_pendtids_timer_callback( + &getreq->tidgr_protoexp->timer_getreqs, 0); + } + + if (!STAILQ_EMPTY(&protoexp->pend_getreqsq)) { + psmi_timer_request(protoexp->timerq, + &protoexp->timer_getreqs, + PSMI_TIMER_PRIO_1); + } + } + +fail: + return err; +} + +int +__fastpath +ips_protoexp_tid_release(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_tid_recv_desc *tidrecvc; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t args[3]; + int rc = IPS_RECVHDRQ_CONTINUE; + + args[0] = p_hdr->data[0]; + args[1] = p_hdr->data[1]; + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) + _IPATH_ERROR("OPCODE_TIDS_RELEASE: ERROR: Index %d is out of range\n", + desc_id._desc_idx); + else { + ptl_arg_t desc_tidrecvc; + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + _IPATH_VDBG("desc_req:id=%d,gen=%d desc_tidc:id=%d,gen=%d\n", + desc_id._desc_idx, desc_id._desc_genc, + desc_tidrecvc._desc_idx, desc_tidrecvc._desc_genc); + + /* See if the reference is still live and valid */ + if (desc_tidrecvc.u64 == desc_id.u64) + ips_tid_recv_free(tidrecvc); + } + + /* Unconditionally echo back the confirmation. If the release is a dupe + * because a previous confirmation was lost, it still needs to be released + * at the other end. 
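+ * This is safe because the release/confirm exchange is idempotent: the
+ * generation count compared above lets a stale release be detected and
+ * ignored, while the confirmation itself carries no state.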
*/ + ips_proto_send_ctrl_message(&rcv_ev->ipsaddr->flows[protoexp->tid_ep_flow], + OPCODE_TIDS_RELEASE_CONFIRM, + &rcv_ev->ipsaddr->ctrl_msg_queued, + args); + return rc; +} + +int __fastpath +ips_protoexp_build_ctrl_message(struct ips_protoexp *protoexp, + struct ptl_epaddr *ipsaddr, + ptl_arg_t *pargs, + uint16_t *pkt_flags, uint8_t opcode, + void *payload) +{ + switch (opcode) { + case OPCODE_TIDS_GRANT: + { + ips_tid_session_list *tid_list = (ips_tid_session_list *) payload; + uint32_t desc_idx = tid_list->tsess_descid._desc_idx; + struct ips_tid_recv_desc *tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_idx); + if (tidrecvc == NULL) return -1; + + pargs[0].u32w0 = tidrecvc->getreq->tidgr_sendtoken; + pargs[0].u32w1 = tidrecvc->getreq->tidgr_length; + pargs[1].u32w0 = tidrecvc->tidflow_genseq.val; + + if (tidrecvc->grant_cnt >= protoexp->tid_to_intr && + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD && + !(tidrecvc->getreq->tidgr_flags & IPS_PROTOEXP_TIDGET_PEERWAIT)) + { + + *pkt_flags |= INFINIPATH_KPF_INTR; + protoexp->tid_intr_reqs++; + } + return tid_list->tsess_tidlist_length; + break; + } + + case OPCODE_TIDS_RELEASE: + case OPCODE_TIDS_RELEASE_CONFIRM: + case OPCODE_TIDS_GRANT_ACK: + { + ptl_arg_t *args = (ptl_arg_t *) payload; + pargs[0].u64w0 = args[0].u64w0; + pargs[1].u64w0 = args[1].u64w0; + if (opcode == OPCODE_TIDS_RELEASE) { + uint32_t release_cnt = args[2].u32w0; + if (release_cnt >= protoexp->tid_to_intr && + ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) + { + *pkt_flags |= INFINIPATH_KPF_INTR; + protoexp->tid_intr_reqs++; + } + } + return 0; + } + default: + return 0; + } +} + +void +__fastpath +ips_protoexp_handle_tiderr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_tid_recv_desc *tidrecvc; + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + int tid = IPS_HDR_TID(p_hdr); + + /* Expected sends not enabled */ + if (protoexp == NULL) + return; + + /* Not doing extra tid debugging or not really a tiderr */ + if (!(protoexp->tid_flags & IPS_PROTOEXP_FLAG_TID_DEBUG) || + !(rcv_ev->error_flags & INFINIPATH_RHF_H_TIDERR)) + return; + + if (tid >= IPS_TID_MAX_TIDS || rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) { + _IPATH_ERROR("Unexpected tid value %d or ptype %d is not expected " + "in tid debugging\n", tid, rcv_ev->ptype); + return; + } + + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc != NULL) + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + if (protoexp->tid_info[tid].state != TIDSTATE_USED) { + char buf[128]; + char *s = "invalid (not even in table)"; + if (tidrecvc != NULL) { + if (desc_tidrecvc._desc_idx == desc_id._desc_idx) { + if (desc_tidrecvc._desc_genc == desc_id._desc_genc) + s = "valid"; + else { + snprintf(buf, sizeof buf - 1, "valid session, but wrong " + "generation (gen=%d,received=%d)", + desc_tidrecvc._desc_genc, desc_id._desc_genc); + buf[sizeof buf - 1] = '\0'; + s = buf; + } + } + else { + snprintf(buf, sizeof buf - 1, "invalid session %d", + desc_id._desc_idx); + buf[sizeof buf - 1] = '\0'; + s = buf; + } + + if (protoexp->tid_info[tid].tidrecvc != tidrecvc) { + _IPATH_ERROR("tid %d not a known member of tidsess %d\n", tid, + desc_id._desc_idx); + } + } + + _IPATH_ERROR("tid %d is marked unused (session=%d): %s\n", 
tid,
+		 desc_id._desc_idx, s);
+  }
+  return;
+}
+
+void
+__fastpath
+ips_protoexp_handle_data_err(const struct ips_recvhdrq_event *rcv_ev)
+{
+  struct ips_tid_recv_desc *tidrecvc;
+  struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+  struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+  int hdr_err = rcv_ev->error_flags & INFINIPATH_RHF_H_IHDRERR;
+  uint8_t op_code = __be32_to_cpu(p_hdr->bth[0]) >> 24 & 0xFF;
+  char pktmsg[128];
+  char errmsg[256];
+
+  ips_proto_get_rhf_errstring(rcv_ev->error_flags, pktmsg, sizeof(pktmsg));
+
+  snprintf(errmsg, sizeof(errmsg),
+	   "%s pkt type opcode 0x%x at hd=0x%x %s\n",
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "Eager" :
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "Expected" :
+	   (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "Non-kd" :
+	   "",
+	   op_code, rcv_ev->recvq->state->hdrq_head, pktmsg);
+
+  if (!hdr_err) {
+    uint32_t tid_recv_sessid;
+    ptl_arg_t desc_id = p_hdr->data[0];
+    psmi_seqnum_t sequence_num;
+    uint32_t cur_flowgenseq, tfgen, tfseq;
+    uint16_t kdeth_cksum;
+
+    /* See if the KDETH checksum validates */
+    kdeth_cksum =
+      (uint16_t) IPATH_LRH_BTH +
+      (uint16_t) (__be16_to_cpu(p_hdr->lrh[2])) -
+      (uint16_t) ((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset)>>16) &
+		  LOWER_16_BITS) -
+      (uint16_t) (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) &
+		  LOWER_16_BITS) -
+      (uint16_t) __le16_to_cpu(p_hdr->iph.pkt_flags);
+
+    if (kdeth_cksum != __le16_to_cpu(p_hdr->iph.chksum)) {
+      _IPATH_EPDBG("Data Error Pkt With Invalid KDETH Checksum: Computed: 0x%04x, IPH_CKSUM: 0x%04x %s", kdeth_cksum, __le16_to_cpu(p_hdr->iph.chksum), errmsg);
+      return;
+    }
+
+    tid_recv_sessid = desc_id._desc_idx;
+    tidrecvc =
+      psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+				   tid_recv_sessid);
+
+    if_pf (tidrecvc == NULL) {
+      _IPATH_EPDBG("Data Error Pkt and Invalid Recv Handle: %s", errmsg);
+      return;
+    }
+
+    if_pf (psmi_mpool_get_obj_gen_count(tidrecvc) != desc_id._desc_genc) {
+      /* Print this at a very verbose level. Noisy links can have a few of
+       * these! */
+      _IPATH_VDBG("Data Error Pkt and Recv Generation Mismatch: %s", errmsg);
+      return; /* skip */
+    }
+
+    if (tidrecvc->state == TIDRECVC_STATE_DONE) {
+      _IPATH_EPDBG("Data Error Pkt for a Completed Rendezvous: %s", errmsg);
+      return; /* skip */
+    }
+
+    /* See if the CRC error was for a previous packet */
+    cur_flowgenseq = ipath_tidflow_get(tidrecvc->context->ctrl,
+				       tidrecvc->tidflow_idx);
+    tfgen = ipath_tidflow_get_genval(cur_flowgenseq);
+    tfseq = ipath_tidflow_get_seqnum(cur_flowgenseq);
+
+    sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+
+    if ((sequence_num.gen == tfgen) && (sequence_num.seq < tfseq)) {
+      /* Try to recover the flow by restarting from the previous known good
+       * sequence (possible if the packet with the CRC error is after the
+       * "known good PSN"; otherwise we can't restart the flow).
+       */
+      if (tidrecvc->tidflow_genseq.seq < sequence_num.seq)
+	return ips_protoexp_handle_tf_seqerr(rcv_ev);
+      else
+	_IPATH_EPDBG("ErrPkt: CRC Error for packet %d.%d. Currently at %d.%d. %s.\n", sequence_num.gen, sequence_num.seq, tfgen, tfseq, errmsg);
+    }
+    else {
+      /* Print this at a very verbose level */
+      _IPATH_VDBG("Data Error Packet. GenMismatch: %s. Tidrecvc: %p. Pkt Gen.Seq: %d.%d, TF Gen.Seq: %d.%d. %s\n", (sequence_num.gen != tfgen) ?
+		  "Yes" : "No", tidrecvc, sequence_num.gen, sequence_num.seq, tfgen, tfseq, errmsg);
+    }
+
+  }
+  else {
+    _IPATH_VDBG("HDR_ERROR: %s\n", errmsg);
+  }
+
+}
+
+psm_error_t
+__fastpath
+ips_protoexp_flow_newgen(struct ips_tid_recv_desc *tidrecvc)
+{
+  psmi_assert_always(tidrecvc->state != TIDRECVC_STATE_DONE);
+  ips_tfgen_allocate(&tidrecvc->protoexp->tfctrl,
+		     tidrecvc->tidflow_idx,
+		     &tidrecvc->tidflow_active_gen);
+
+  /* Update the tidflow table with the new generation number */
+  tidrecvc->tidflow_genseq.gen = tidrecvc->tidflow_active_gen;
+  ipath_tidflow_set_entry(tidrecvc->context->ctrl,
+			  tidrecvc->tidflow_genseq.flow,
+			  tidrecvc->tidflow_genseq.gen,
+			  tidrecvc->tidflow_genseq.seq);
+
+  /* Increment the swapped generation count for the tidflow */
+  tidrecvc->tidflow_nswap_gen++;
+  return PSM_OK;
+}
+
+void
+__fastpath
+ips_protoexp_handle_tf_seqerr(const struct ips_recvhdrq_event *rcv_ev)
+{
+  struct ips_protoexp *protoexp = rcv_ev->proto->protoexp;
+  struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+  struct ips_tid_recv_desc *tidrecvc;
+  ptl_arg_t desc_id = rcv_ev->p_hdr->hdr_data[0];
+  ptl_arg_t send_descid = rcv_ev->p_hdr->hdr_data[1];
+  ptl_arg_t desc_tidrecvc;
+  psmi_seqnum_t sequence_num;
+  ptl_arg_t args[3] = {};
+  psm_error_t err;
+
+  psmi_assert_always(protoexp != NULL);
+
+  desc_tidrecvc.u64 = 0;
+  tidrecvc = (struct ips_tid_recv_desc *)
+    psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool,
+				 desc_id._desc_idx);
+
+  if (tidrecvc != NULL)
+    psmi_mpool_get_obj_index_gen_count(tidrecvc,
+				       &desc_tidrecvc._desc_idx,
+				       &desc_tidrecvc._desc_genc);
+
+  if (tidrecvc && desc_tidrecvc.u64 == desc_id.u64) {
+
+    /* Update stats for sequence errors */
+    tidrecvc->stats.nSeqErr++;
+
+    if (tidrecvc->state != TIDRECVC_STATE_DONE) {
+
+      sequence_num.val = __be32_to_cpu(p_hdr->bth[2]);
+
+      /* Only care about a sequence error for the currently active
+       * generation */
+      if (tidrecvc->tidflow_active_gen == sequence_num.gen) {
+
+	/* For a sequence error we restart from where the last header
+	 * was successfully delivered for us, since this is the last
+	 * known good state for this flow. The PSM version of the flow
+	 * sequence is the "safe" sequence number to restart at.
+	 */
+
+	/* If we see a "large" number of swapped generations, we are losing
+	 * packets for this flow. Request throttling of the tidflow by
+	 * generating a BECN. With header suppression we will miss some FECN
+	 * packets on QLE73XX, hence keeping track of swapped generations is
+	 * another mechanism to do congestion control for tidflows.
+	 *
+	 * For mismatched sender/receiver/link speeds we can get into a
+	 * deadly embrace where minimal progress is made due to generation
+	 * mismatch errors. This can occur if we wrap around the generation
+	 * count without making progress. Hence in cases where the swapped
+	 * generation count is > 254, stop sending the BECN (and the NAK) so
+	 * the sender -> receiver pipeline is flushed with an error check and
+	 * things can sync up. This should be an extremely rare event.
+	 */
+
+	if_pf (tidrecvc->tidflow_nswap_gen >= 254)
+	  goto fail; /* Do not send a NAK. Let the error check kick in. */
+
+	if_pf ((tidrecvc->tidflow_nswap_gen > 4) &&
+	       (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) {
+	  _IPATH_CCADBG("Generating BECN.
Number of swapped generations: %d.\n", tidrecvc->tidflow_nswap_gen); + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + + /* Update stats for congestion encountered */ + if (rcv_ev->ipsaddr) + rcv_ev->ipsaddr->stats.congestion_pkts++; + } + + /* Swap generation for the flow. */ + err = ips_protoexp_flow_newgen(tidrecvc); + if (err != PSM_OK) + goto fail; + + /* NAK the tid flow. Note: We can generate the latest NAK for this flow + * based on the tidrecvc->tidflow_{active|passive}_gen fields. */ + args[0] = send_descid; + args[1] = tidrecvc->tid_list.tsess_descid; + args[2].u16w0 = sequence_num.gen; /* Older Gen to NAK */ + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + OPCODE_NAK, + &tidrecvc->ctrl_msg_queued, args); + + /* Update stats for retransmit */ + tidrecvc->stats.nReXmit++; + } + } /* tidrecvc->state != DONE */ + } + + fail: + return; +} + +void +__fastpath +ips_protoexp_handle_tf_generr(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_protoexp *protoexp = rcv_ev->proto->protoexp; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + int tid = IPS_HDR_TID(p_hdr); + struct ips_tid_recv_desc *tidrecvc; + psmi_assert(rcv_ev->p_hdr->data != NULL); + ptl_arg_t desc_id = rcv_ev->p_hdr->data[0]; + ptl_arg_t desc_tidrecvc; + + if (tid >= IPS_TID_MAX_TIDS || rcv_ev->ptype != RCVHQ_RCV_TYPE_EXPECTED) { + _IPATH_ERROR("Unexpected tid value %d or ptype %d is not expected " + "in tid debugging\n", tid, rcv_ev->ptype); + return; + } + + /* For a generation error our NAK crossed on the wire or this is a stale + * packet. Error recovery should sync things up again. Just drop this + * packet. + */ + desc_tidrecvc.u64 = 0; + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc != NULL) { + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + if (desc_tidrecvc.u64 == desc_id.u64) { + tidrecvc->stats.nGenErr++; /* Update stats for generation errors */ + + /* TODO_CCA: If packet faced congestion we may want to generate a CN + * packet to rate control sender. + */ + } + + } + +} diff --git a/ptl_ips/ips_proto_header.h b/ptl_ips/ips_proto_header.h new file mode 100644 index 0000000..3e3ee90 --- /dev/null +++ b/ptl_ips/ips_proto_header.h @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_PROTO_HEADER_H
+#define _IPS_PROTO_HEADER_H
+
+/* The actual size of the message header is determined by three parameters:
+ * IPS_HEADER_QUEUE_IWORDS (fixed at 5 by hardware)
+ *    InfiniBand words contain the LRH and BTH
+ * IPS_HEADER_QUEUE_HWORDS (fixed at 7 by the ips protocol)
+ *    IPS header words contain ips-protocol-specific data
+ * IPS_HEADER_QUEUE_UWORDS (variable sized, from 2 to 32)
+ *    Size depends on the target. The connect protocol always assumes 2
+ *    uwords, and post-connect communication will use a length determined
+ *    at connect time.
+ *
+ * The header message size is determined as IWORDS + HWORDS + UWORDS
+ */
+struct ips_message_header {
+  __be16 lrh[4];
+  __be32 bth[3];
+  /* fields below this point are in host byte order */
+  struct ipath_header iph;
+  __u8 sub_opcode;
+  __u8 flags;
+  __u16 commidx;
+  /* 24 bits. The upper 8 bits are available for other use */
+  union {
+    /* NOTE: always access src_context with the HEADER_SRCCONTEXT macros.
+     * The actual context value is split to preserve wire compatibility */
+    struct {
+      unsigned ack_seq_num:24;
+      unsigned src_context:4;
+      unsigned src_subcontext:2;
+      unsigned src_context_ext:2;
+    };
+    __u32 ack_seq_num_org;
+  };
+  __u8 flowid;
+  __u8 hdr_dlen; /* data length in header */
+
+  union {
+    struct {
+      __u16 mqhdr : 14;         /* PSM matched queues */
+      __u16 dst_subcontext : 2; /* Destination subcontext */
+    };
+    struct {                    /* for PSM Active Messages */
+      __u16 amhdr_hidx : 8;
+      __u16 amhdr_nargs : 3;
+      __u16 amhdr_flags : 3;    /* Reduced from 5 bits previously */
+    };
+    __u16 mqhdr_org;
+  };
+  /* Access to uwords */
+  union {
+    ptl_arg_t hdr_data[2];
+    ptl_arg_t data[0];
+    __u32 uwords[4];
+  };
+};
+
+#define IPS_HEADER_QUEUE_IWORDS 5 /* LRH+BTH (fixed) */
+
+/* These two define the same thing, but they exist in sizeof and as a constant
+ * for sanity checking */
+#define IPS_HEADER_QUEUE_IPS_PROTOCOL_WORDS 5
+#define IPS_HEADER_QUEUE_HWORDS 5
+
+/* Min is used by the connect protocol.
+ * Max bounds the size of the preallocated communication headers.
+ * Req is the current desired receive header queue size. The actual size is
+ * returned after userinit.
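+ *
+ * For example, the IPS_HEADER_MSGLEN macro below converts a receive header
+ * queue entry size (in 32-bit words, iwords excluded) back into bytes:
+ * IPS_HEADER_MSGLEN(12) = (5 + 12) << 2 = 68 bytes.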
*/ +#define IPS_HEADER_QUEUE_UWORDS_MIN 4 +#define IPS_HEADER_QUEUE_UWORDS_MAX 32 +#define IPS_HEADER_QUEUE_UWORDS_REQ 12 + +#define IPS_HEADER_QUEUE_PBC_WORDS 2 + +/* Figure out "real" size of ips_message_header given the size of the receive + * header queue entry */ +/* Actual message length includes iwords */ +#define IPS_HEADER_MSGLEN(rcvhdrq_size) \ + ((IPS_HEADER_QUEUE_IWORDS+(rcvhdrq_size))<<2) + +/* Old define */ +#define IPS_HEADER_QUEUE_WORDS \ + ((sizeof(struct ips_message_header) - \ + offsetof(struct ips_message_header, iph)) >> 2) + +/* sub OpCodes - ips */ +#define OPCODE_SEQ_DATA 0x01 +#define OPCODE_SEQ_CTRL 0x02 + +#define OPCODE_SEQ_MQ_DATA 0x03 +#define OPCODE_SEQ_MQ_CTRL 0x04 +#define OPCODE_SEQ_MQ_HDR 0x05 +#define OPCODE_SEQ_MQ_EXPTID 0x06 +#define OPCODE_SEQ_MQ_EXPTID_UNALIGNED 0x07 + +#define OPCODE_ACK 0x10 +#define OPCODE_NAK 0x11 + +#define OPCODE_ERR_CHK_OLD 0x20 +#define OPCODE_ERR_CHK_PLS 0x21 +#define OPCODE_ERR_CHK 0x22 /* error check with ip + pid */ +#define OPCODE_ERR_CHK_BAD 0x23 /* error check out of context */ +#define OPCODE_ERR_CHK_GEN 0x24 /* TF protocol error check */ + +/* Pre-2.0 startup */ +#define OPCODE_STARTUP 0x30 +#define OPCODE_STARTUP_ACK 0x31 +#define OPCODE_STARTUP_NAK 0x32 +#define OPCODE_STARTUP_EXT 0x34 +#define OPCODE_STARTUP_ACK_EXT 0x35 +#define OPCODE_STARTUP_NAK_EXT 0x36 +/* 2.0+ startup */ +#define OPCODE_CONNECT_REQUEST 0x60 +#define OPCODE_CONNECT_REPLY 0x61 +#define OPCODE_DISCONNECT_REQUEST 0x62 +#define OPCODE_DISCONNECT_REPLY 0x63 + +#define OPCODE_AM_REQUEST 0x70 +#define OPCODE_AM_REPLY 0x71 +#define OPCODE_AM_REQUEST_NOREPLY 0x72 + +#define OPCODE_TIDS_RELEASE 0x40 +#define OPCODE_TIDS_RELEASE_CONFIRM 0x41 +#define OPCODE_TIDS_GRANT 0x42 +#define OPCODE_TIDS_GRANT_ACK 0x43 + +#define OPCODE_CLOSE 0x50 +#define OPCODE_CLOSE_ACK 0x51 + +/* Explicit CCA related messages */ +#define OPCODE_FLOW_CCA_BECN 0x80 + +/* + * like OPCODE_CLOSE, but no complaint if other side has already closed. + * Used when doing abort(), MPI_Abort(), etc. + */ +#define OPCODE_ABORT 0x52 + +#endif /* _IPS_PROTO_HEADER_H */ diff --git a/ptl_ips/ips_proto_help.h b/ptl_ips/ips_proto_help.h new file mode 100644 index 0000000..96aa509 --- /dev/null +++ b/ptl_ips/ips_proto_help.h @@ -0,0 +1,759 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_PROTO_HELP_H
+#define _IPS_PROTO_HELP_H
+
+#include "ips_recvhdrq.h"
+#include "ips_proto.h"
+#include "ipserror.h"
+#include "psm_mq_internal.h" // psmi_mq_handle_tiny_envelope
+#include "ptl_ips.h"
+#include "ips_epstate.h"
+
+/* Some tunable compile-time options */
+#define IPS_TINY_PROCESS_MQTINY 1 /* whether mq processing of tiny pkts is
+				     done separately from non-tiny packets */
+
+PSMI_ALWAYS_INLINE(
+uint8_t
+ips_flow_gen_ackflags(ips_scb_t *scb, struct ips_flow *flow))
+{
+  uint32_t diff = (flow->protocol == PSM_PROTOCOL_TIDFLOW) ?
+    (flow->xmit_seq_num.seq - flow->xmit_ack_num.seq) :
+    (flow->xmit_seq_num.pkt - flow->xmit_ack_num.pkt);
+
+  /*
+   * This is currently disabled pending more experimentation. The goal
+   * is to eventually use the FLAG_INTR to tighten the control loop
+   * between two endpoints.
+   */
+#if 0
+  /* At every 64, request an ack w/ interrupt */
+  if ((diff & 0x3f) == 0)
+    scb->flags |= IPS_SEND_FLAG_ACK_REQ |
+		  ((flow->ipsaddr->flags & SESS_FLAG_HAS_RCVTHREAD) ?
+		   IPS_SEND_FLAG_INTR : 0);
+  /* At every 16, request an ack */
+  else
+#endif
+  if (((diff & flow->ack_interval) == 0) || (flow->credits == 1))
+    scb->flags |= IPS_SEND_FLAG_ACK_REQ;
+
+  /* The bottom 8 bits wind up in protocol header fields, other bits
+   * control other aspects of packet composition */
+  return (uint8_t) (scb->flags & IPS_SEND_FLAG_PROTO_OPTS);
+}
+
+PSMI_ALWAYS_INLINE(
+ptl_epaddr_flow_t ips_proto_flowid(struct ips_message_header *p_hdr))
+{
+  ptl_epaddr_flow_t flowidx = IPS_FLOWID2INDEX(p_hdr->flowid);
+  psmi_assert(flowidx < EP_FLOW_LAST);
+  return flowidx;
+}
+
+PSMI_ALWAYS_INLINE(
+void ips_kdeth_cksum(struct ips_message_header *p_hdr))
+{
+  /* Compute the KDETH checksum */
+  p_hdr->iph.chksum = __cpu_to_le16(
+    (uint16_t) IPATH_LRH_BTH +
+    (uint16_t) (__be16_to_cpu(p_hdr->lrh[2])) -
+    (uint16_t) ((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset)>>16) &
+		LOWER_16_BITS) -
+    (uint16_t) (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) &
+		LOWER_16_BITS) -
+    (uint16_t) __le16_to_cpu(p_hdr->iph.pkt_flags));
+}
+
+PSMI_ALWAYS_INLINE(
+int ips_do_cksum(struct ips_proto *proto,
+		 struct ips_message_header *p_hdr,
+		 void *payload,
+		 uint32_t paylen,
+		 uint32_t *cksum))
+{
+
+  if_pf ((proto->flags & IPS_PROTO_FLAG_CKSUM) &&
+	 (((__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) >> INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) == IPATH_EAGER_TID_ID) &&
+	 (p_hdr->mqhdr != MQ_MSG_DATA_BLK) &&
+	 (p_hdr->mqhdr != MQ_MSG_DATA_REQ_BLK)) {
+
+    uint16_t paywords;
+
+    /* Update the payload words in the header */
+    paywords = (sizeof(struct ips_message_header) +
+		paylen + PSM_CRC_SIZE_IN_BYTES) >> BYTE2WORD_SHIFT;
+    p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC);
+
+    /* Need to regenerate the KDETH checksum after updating the payload
+     * length */
+    ips_kdeth_cksum(p_hdr);
+
+    *cksum = 0xffffffff;
+
+    /* Checksum the header */
+    *cksum = ips_crc_calculate(sizeof(struct ips_message_header),
+			       (uint8_t*) p_hdr, *cksum);
+
+    /* Checksum the payload (if any) */
+    if (paylen) {
+      psmi_assert_always(payload);
+      *cksum = ips_crc_calculate(paylen, (uint8_t*) payload,
+				 *cksum);
+    }
+  }
+
+  return 0;
+}
+
+/* Get the pbc static rate value for a flow for a given message length */
+PSMI_ALWAYS_INLINE(
+uint32_t ips_proto_pbc_static_rate(struct ips_flow *flow,
+				   uint32_t msgLen))
+{
+  uint32_t rate = 0;
+
+  /* The PBC rate depends on the HCA type, as QLE73XX/QLE72XX have different
+   * mechanisms for static rate control. QLE71XX does not even have static
+   * rate control capability.
+   */
+
+  switch(flow->epinfo->ep_hca_type) {
+  case PSMI_HCA_TYPE_QLE73XX:
+    {
+
+      /* Rate = IPD * Time to transmit the packet. The rate value is
+       * programmed into the PBC, which counts down at 500 MHz, the TXE to
+       * IBC interface speed (Section 7.8.1). Since the time to transmit
+       * depends on our local link speed, we need to convert it into the
+       * clock frequency of the TXE in 500 MHz units. To transfer a message
+       * of MsgLen bytes at various local link rates we obtain:
+       *
+       * Link Rate (LinkWidth * LinkSpeed)    Cycle Count
+       *   SDR     (10 Gbit/sec)              (MsgLen >> 1)
+       *   DDR     (20 Gbit/sec)              (MsgLen >> 2)
+       *   QDR     (40 Gbit/sec)              (MsgLen >> 3)
+       *
+       * e.g. a 4096-byte message at QDR takes 4096 >> 3 = 512 cycles,
+       * i.e. about 1.02 usec at 500 MHz.
+       */
+      static uint8_t qle73xx_rate_divisor[IBTA_RATE_120_GBPS + 1] = {
+	[IBTA_RATE_2_5_GBPS] = 0,
+	[IBTA_RATE_5_GBPS] = 0,
+	[IBTA_RATE_10_GBPS] = 1,
+	[IBTA_RATE_20_GBPS] = 2,
+	[IBTA_RATE_30_GBPS] = 2,
+	[IBTA_RATE_40_GBPS] = 3
+      };
+
+      uint32_t time_to_send = (msgLen >>
+	  qle73xx_rate_divisor[flow->epinfo->ep_link_rate]);
+      /* IBTA CCA additionally has a shift field for finer grained control
+       * of the IPD (bits [14:15] in the CCT entry; for static rate control
+       * this value is always 0).
+       */
+      rate = (time_to_send >> flow->path->epr_cca_divisor) *
+	     (flow->path->epr_active_ipd);
+
+      /* For QLE73XX the rate is clamped to 0x3FFF */
+      rate = min(rate, 0x3FFF);
+    }
+    break;
+  case PSMI_HCA_TYPE_QLE72XX:
+    /* TODO_CCA: Implement for QLE72XX to take into account the PREVIOUS
+     * message's IPD for this flow/path.
+     */
+    rate = 0;
+    break;
+  default:
+    rate = 0;
+  }
+
+  return rate;
+}
+
+/* This is only used for SDMA cases; pbc is really a pointer to
+ * struct ips_pbc_header * or the equivalent un-named structure
+ * in ips_scb */
+PSMI_ALWAYS_INLINE(
+void ips_proto_pbc_update(struct ips_proto *proto,
+			  struct ips_flow *flow, uint32_t isCtrlMsg,
+			  union ipath_pbc *pbc, uint32_t hdrlen,
+			  void *payload, uint32_t paylen))
+{
+  struct ips_spio *ctrl = proto->spioc;
+  struct ips_message_header *p_hdr = (struct ips_message_header*) &pbc[1];
+  int vl = (__be16_to_cpu(p_hdr->lrh[0]) >> LRH_VL_SHIFT) & 0xf;
+  uint32_t static_rate = 0;
+
+  if_pf (!isCtrlMsg && flow->path->epr_active_ipd)
+    static_rate = ips_proto_pbc_static_rate(flow, hdrlen + paylen);
+
+  pbc->qword = 0ULL;
+  pbc->length = __cpu_to_le16( ((hdrlen + paylen) >> 2) + 1);
+  if (ctrl->portnum > 1)
+    pbc->pbcflags |= __cpu_to_le32(vl << __PBC_VLSHIFT |
+				   __PBC_IBPORT |
+				   static_rate);
+  else
+    pbc->pbcflags |= __cpu_to_le32(vl << __PBC_VLSHIFT |
+				   static_rate);
+
+  return;
+}
+
+/*
+ * Helpers to extract header information
+ */
+/* With QLE73XX/QLE72XX, we put context 16 in src_context_ext */
+#define IPS_HEADER_SRCCONTEXT_GET(msg_hdr) \
+	(((msg_hdr)->src_context) | ((msg_hdr)->src_context_ext<<4))
+
+#define IPS_HEADER_SRCCONTEXT_SET(msg_hdr,context) do { \
+	    (msg_hdr)->src_context = (context) & 0xf; \
+	    (msg_hdr)->src_context_ext = (context>>4) & 0x3; \
+	} while (0)
+
+PSMI_ALWAYS_INLINE(
+uint32_t ips_proto_dest_context_from_header(struct ips_proto *proto,
+					    struct ips_message_header *p_hdr))
+{
+  uint16_t hca_type;
+  uint32_t dest_context;
+
+  hca_type = PSMI_EPID_GET_HCATYPE(proto->ep->epid);
+
+  dest_context =
+    (__le32_to_cpu(p_hdr->iph.ver_context_tid_offset) >> INFINIPATH_I_CONTEXT_SHIFT) & INFINIPATH_I_CONTEXT_MASK;
+  switch(hca_type) {
+  case PSMI_HCA_TYPE_QLE73XX:
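+    /* Contexts above 15 don't fit in the 4-bit iph context field, so the
+     * fifth context bit travels in the low bit of BTH[1] and is folded
+     * back in here (cf. the IPS_HEADER_SRCCONTEXT macros above). */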
dest_context |= ((__be32_to_cpu(p_hdr->bth[1]) & 1) << 4); + break; + case PSMI_HCA_TYPE_QLE72XX: + /* Context 16 is special cased on QLE72XX */ + dest_context |= ((__be32_to_cpu(p_hdr->bth[1]) & 1) << 4); + if (dest_context == 0x1f) + dest_context = 16; + break; + case PSMI_HCA_TYPE_QLE71XX: + default: + /* This is a no-op. */ + break; + } + + return dest_context; +} + +PSMI_ALWAYS_INLINE( +void ips_proto_hdr(ips_scb_t *scb, + struct ips_epinfo *epinfo, + struct ips_epinfo_remote *epr, + struct ips_flow *flow, + uint32_t paywords, + uint32_t extra_bytes, + uint16_t kpf_flags, + uint8_t flags)) +{ + struct ips_message_header *p_hdr = &scb->ips_lrh; + + /* + * This scb has been used by this connection last time, + * so some of the header fields are already set. + */ + if (scb->flow == flow && scb->epaddr == flow->ipsaddr) { + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn); + p_hdr->flags = flags; + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + + /* check if extra bytes is changed */ + if (scb->extra_bytes != extra_bytes) { + p_hdr->bth[0] = + __cpu_to_be32((IPATH_OPCODE_USER1 << BTH_OPCODE_SHIFT) + + (extra_bytes << BTH_EXTRA_BYTE_SHIFT) + + flow->path->epr_pkey); + scb->extra_bytes = extra_bytes; + } + + /* If header is exactly the same */ + if (scb->tid == IPATH_EAGER_TID_ID && + scb->pkt_flags == kpf_flags && + scb->payload_bytes == scb->payload_size) { + return; + } + + /* context, version, and TID are already known to be in range, no + * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */ + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (scb->tid << INFINIPATH_I_TID_SHIFT) + + (scb->offset >> 2)); // convert from byte to word offset + + p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC); + p_hdr->iph.pkt_flags = __cpu_to_le16(kpf_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + scb->pkt_flags = kpf_flags; + scb->payload_bytes = scb->payload_size; + + return; + } + + p_hdr->lrh[0] = + __cpu_to_be16(IPATH_LRH_BTH | + (flow->sl << 4) | /* SL for flow */ + /* VL for flow */ (flow->path->proto->sl2vl[flow->sl] << LRH_VL_SHIFT)); + p_hdr->lrh[1] = flow->path->epr_dlid; + p_hdr->lrh[2] = __cpu_to_be16(paywords + SIZE_OF_CRC); + p_hdr->lrh[3] = flow->path->epr_slid; + + p_hdr->bth[0] = + __cpu_to_be32((IPATH_OPCODE_USER1 << BTH_OPCODE_SHIFT) + + (extra_bytes << BTH_EXTRA_BYTE_SHIFT) + + flow->path->epr_pkey); + p_hdr->bth[1] = __cpu_to_be32(epr->epr_qp); + p_hdr->bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn); + p_hdr->commidx = (uint16_t) epr->epr_commidx_to; + + /* context, version, and TID are already known to be in range, no + * masking needed; offset in low INFINIPATH_I_OFFSET_MASK bits */ + p_hdr->iph.ver_context_tid_offset = __cpu_to_le32( + (IPS_PROTO_VERSION << INFINIPATH_I_VERS_SHIFT) + + (epr->epr_pkt_context << INFINIPATH_I_CONTEXT_SHIFT) + + (scb->tid << INFINIPATH_I_TID_SHIFT) + + (scb->offset >> 2)); // convert from byte to word offset + p_hdr->iph.pkt_flags = __cpu_to_le16(kpf_flags); + + ips_kdeth_cksum(p_hdr); // Generate KDETH checksum + + p_hdr->flags = flags; + p_hdr->flowid = flow->flowid; + p_hdr->ack_seq_num = flow->recv_seq_num.psn; + IPS_HEADER_SRCCONTEXT_SET(p_hdr, epinfo->ep_context); + p_hdr->src_subcontext = epinfo->ep_subcontext; + p_hdr->dst_subcontext = epr->epr_subcontext; + + scb->extra_bytes = extra_bytes; + scb->pkt_flags = kpf_flags; + scb->payload_bytes = scb->payload_size; + scb->flow = flow; + 
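+	/* Caching the (flow, epaddr) pairing arms the fast path at the top
+	 * of ips_proto_hdr: the next send on this same flow can skip
+	 * rebuilding the header fields that have not changed. */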
scb->epaddr = flow->ipsaddr; + + return; +} + +/* + * Assumes that the following fields are already set in scb: + * payload + * payload_size + * flags + */ +PSMI_INLINE( +void +ips_scb_prepare_flow_inner(ips_scb_t *scb, + struct ips_epinfo *epinfo, + struct ips_epinfo_remote *epr, + struct ips_flow *flow)) +{ + uint32_t extra_bytes; + uint32_t tot_paywords; + uint16_t pkt_flags = IPS_EPSTATE_COMMIDX_PACK(epr->epr_commidx_to); + + extra_bytes = scb->payload_size & 3; + if (extra_bytes) { + extra_bytes = 4 - extra_bytes; + scb->payload_size += extra_bytes; + } + tot_paywords = (sizeof(struct ips_message_header) + scb->payload_size) + >> BYTE2WORD_SHIFT; + pkt_flags |= (scb->flags & IPS_SEND_FLAG_INTR) ? INFINIPATH_KPF_INTR : 0; + pkt_flags |= (scb->flags & IPS_SEND_FLAG_HDR_SUPPRESS) ? + INFINIPATH_KPF_HDRSUPP : 0; + + ips_proto_hdr(scb, epinfo, epr, flow, + tot_paywords, extra_bytes, + pkt_flags, ips_flow_gen_ackflags(scb, flow)); + + scb->ack_timeout = flow->path->epr_timeout_ack; + scb->abs_timeout = TIMEOUT_INFINITE; + scb->flags |= IPS_SEND_FLAG_PENDING; + + if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { + flow->xmit_seq_num.seq += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.seq--; + } else { + flow->xmit_seq_num.pkt += scb->nfrag; + scb->seq_num = flow->xmit_seq_num; + scb->seq_num.pkt--; + } + + return; +} + +PSMI_ALWAYS_INLINE( +psm_epid_t +ips_epid_from_phdr(const uint16_t lmc_mask, + const struct ips_message_header *p_hdr)) +{ + uint16_t lid = __be16_to_cpu(p_hdr->lrh[3]) & lmc_mask; + uint16_t context = (uint16_t) IPS_HEADER_SRCCONTEXT_GET(p_hdr); + uint16_t subcontext = (uint16_t) p_hdr->src_subcontext; + + return PSMI_EPID_PACK(lid, context, subcontext); +} + +PSMI_ALWAYS_INLINE( +void +ips_epaddr_stats_send(struct ptl_epaddr *ptladdr, uint8_t msgtype)) +{ + switch (msgtype) { + case OPCODE_ACK: + break; + case OPCODE_TIDS_GRANT: + ptladdr->stats.tids_grant_send++; + break; + case OPCODE_ERR_CHK: + case OPCODE_ERR_CHK_GEN: + ptladdr->stats.err_chk_send++; + break; + case OPCODE_NAK: + ptladdr->stats.nak_send++; + break; + case OPCODE_CONNECT_REQUEST: + ptladdr->stats.connect_req++; + break; + case OPCODE_DISCONNECT_REQUEST: + ptladdr->stats.disconnect_req++; + break; + default: + break; + } + return; +} + +/* + * Exported there solely for inlining is_expected_or_nak and mq_tiny handling + */ +extern +psm_error_t ips_proto_send_ctrl_message(struct ips_flow *flow, + uint8_t message_type, + uint32_t *msg_queue_mask, + void *payload); + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_ack(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_NAK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; /* ACK clears NAK */ + } + else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_ACK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_ACK; + } + else { + /* Coalesced ACKs disabled. 
Send ACK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } +} + +PSMI_ALWAYS_INLINE( +void +ips_proto_send_nak(struct ips_recvhdrq *recvq, struct ips_flow *flow)) +{ + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_COALESCE_ACKS) { + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; /* NAK clears ACK */ + } + else if (!(flow->flags & IPS_FLOW_FLAG_PENDING_NAK)) { + SLIST_INSERT_HEAD(&recvq->pending_acks, flow, next); + } + + flow->flags |= IPS_FLOW_FLAG_PENDING_NAK; + } + else { + /* Coalesced ACKs disabled. Send NAK immediately */ + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } +} + +/* return 1 if packet is next expected in flow + * return 0 if packet is not next expected in flow (and nak packet). + */ +PSMI_ALWAYS_INLINE( +int +ips_proto_is_expected_or_nak(struct ips_recvhdrq_event *rcv_ev)) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + psmi_seqnum_t sequence_num; + + psmi_assert((flowid == EP_FLOW_GO_BACK_N_PIO) || + (flowid == EP_FLOW_GO_BACK_N_DMA) || + (flowid == EP_FLOW_GO_BACK_N_AM_REQ) || + (flowid == EP_FLOW_GO_BACK_N_AM_RSP) + ); + + /* If packet faced congestion generate BECN in NAK. */ + if_pf ((rcv_ev->is_congested & IPS_RECV_EVENT_FECN) && + ((flow->cca_ooo_pkts & 0xf) == 0)) { + /* Generate a BECN for every 16th OOO packet marked with a FECN. */ + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + flow->cca_ooo_pkts++; + ipsaddr->stats.congestion_pkts++; + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; /* Clear FECN event */ + } + + sequence_num.val = __be32_to_cpu(p_hdr->bth[2]); + if_pf (flow->recv_seq_num.pkt != sequence_num.pkt) { + int16_t diff = (int16_t) (sequence_num.pkt - flow->last_seq_num.pkt); + + if (diff < 0) + return 0; + + flow->cca_ooo_pkts = diff; + if (flow->cca_ooo_pkts > flow->ack_interval) { + ipsaddr->stats.congestion_pkts++; + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + _IPATH_CCADBG("BECN Generation. Expected: %d, Got: %d.\n", flow->recv_seq_num.pkt, sequence_num.pkt); + } + flow->last_seq_num = sequence_num; + + if (!(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Queue/Send NAK to peer */ + ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + flow->cca_ooo_pkts = 0; + } + else if (flow->flags & IPS_FLOW_FLAG_GEN_BECN) { + /* Send Control message to throttle flow. Will clear flow flag and + * reset cca_ooo_pkts. + */ + ips_proto_send_ctrl_message(flow, OPCODE_FLOW_CCA_BECN, + &flow->ipsaddr->ctrl_msg_queued, + NULL); + } + + return 0; + } + else { + flow->flags &= ~IPS_FLOW_FLAG_NAK_SEND; + + flow->last_seq_num = sequence_num; + flow->recv_seq_num.pkt += 1; + flow->cca_ooo_pkts = 0; + return 1; + } +} + +/* + * Return value: + * 1: in order message; + * 0: out of order, no touch; + * -1: out of order, buffered in outoforder queue. 
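+ *
+ * The 16-bit per-peer message sequence number travels split across the
+ * wire (ips_mq_send_envelope splits mctxt_send_seqnum into the flow's
+ * xmit and recv .msg bytes); the first line of the function below
+ * reassembles it from the flow state and the high bits of ack_seq_num
+ * before comparing it against the master context's mctxt_recv_seqnum.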
+ */ +PSMI_ALWAYS_INLINE( +int +ips_proto_check_msg_order(psm_epaddr_t epaddr, + struct ips_flow *flow, struct ips_message_header *p_hdr)) +{ + uint16_t msg_seqnum = (uint16_t)(flow->last_seq_num.msg + + ((p_hdr->ack_seq_num>>8)&0xff00)); + + if (msg_seqnum != epaddr->mctxt_master->mctxt_recv_seqnum) { + flow->msg_ooo_toggle = !flow->msg_ooo_toggle; + + if (flow->msg_ooo_toggle) { + flow->recv_seq_num.pkt -= 1; + flow->msg_ooo_seqnum = msg_seqnum; + return 0; + } + + psmi_assert(msg_seqnum == flow->msg_ooo_seqnum); + return -1; + } + + flow->msg_ooo_toggle = 0; + epaddr->mctxt_master->mctxt_recv_seqnum++; + return 1; +} + +#if IPS_TINY_PROCESS_MQTINY +PSMI_ALWAYS_INLINE( +int +ips_proto_process_mq_tiny(const struct ips_recvhdrq_event *rcv_ev)) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + psm_epaddr_t epaddr = ipsaddr->epaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + + if (ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) { + ret = ips_proto_check_msg_order(epaddr, flow, p_hdr); + if (ret == 0) return IPS_RECVHDRQ_OOO; + if (ret == -1) { + psmi_mq_handle_envelope_outoforder(ipsaddr->proto->mq, + (uint16_t) p_hdr->mqhdr, + epaddr, flow->msg_ooo_seqnum, + p_hdr->data[0].u64, /* tag */ + epaddr->xmit_egrlong, /* place hold only */ + (uint32_t) p_hdr->hdr_dlen, + (void *) &p_hdr->data[1], + (uint32_t) p_hdr->hdr_dlen); + ret = IPS_RECVHDRQ_BREAK; + } else { + psmi_mq_handle_tiny_envelope( + ipsaddr->proto->mq, + epaddr, p_hdr->data[0].u64, /* tag */ + (void *) &p_hdr->data[1], + (uint32_t) p_hdr->hdr_dlen); + if (epaddr->mctxt_master->outoforder_c) { + psmi_mq_handle_outoforder_queue(epaddr->mctxt_master); + } + ret = IPS_RECVHDRQ_CONTINUE; + } + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + ips_proto_process_ack((struct ips_recvhdrq_event *) rcv_ev); + return ret; +} +#endif + +PSMI_INLINE( +int +ips_proto_process_packet(const struct ips_recvhdrq_event *rcv_ev)) +{ +#if IPS_TINY_PROCESS_MQTINY + if (rcv_ev->p_hdr->sub_opcode == OPCODE_SEQ_MQ_HDR) { + psmi_assert(rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER); + return ips_proto_process_mq_tiny(rcv_ev); + } + else +#endif + return ips_proto_process_packet_inner((struct ips_recvhdrq_event *) rcv_ev); +} + +#if PSMI_PLOCK_DISABLED + #define ips_ptladdr_lock(ipsaddr) \ + if (((ipsaddr)->flags & SESS_FLAG_LOCK_SESS)) \ + pthread_mutex_lock(&(ipsaddr)->sesslock) + + #define ips_ptladdr_unlock(ipsaddr) \ + if (((ipsaddr)->flags & SESS_FLAG_LOCK_SESS)) \ + pthread_mutex_unlock(&(ipsaddr)->sesslock) +#else + #define ips_ptladdr_lock(ipsaddr) + #define ips_ptladdr_unlock(ipsaddr) +#endif + +/* + * Breaks header encapsulation but needed in mq sends so we can pay + * "near-equal" attention to putting sends on the wire and servicing the + * receive queue. 
+ */
+
+PSMI_ALWAYS_INLINE(
+psm_error_t
+ips_recv_progress_if_busy(ptl_t *ptl, psm_error_t err))
+{
+  if (err == PSM_EP_NO_RESOURCES) {
+    ptl->ctl->ep_poll(ptl, 0);
+    return PSM_OK;
+  }
+  else
+    return err;
+}
+
+/* Find the next lowest power of two for a 32-bit number, e.g. 100 -> 64 */
+PSMI_ALWAYS_INLINE(
+unsigned int
+ips_next_low_pow2(unsigned int v))
+{
+
+  const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+  const unsigned int S[] = {1, 2, 4, 8, 16};
+  register unsigned int r = 1;
+  int i;
+
+  for (i = 4; i >= 0; i--)
+  {
+    if (v & b[i])
+    {
+      v >>= S[i];
+      r <<= S[i];
+    }
+  }
+
+  return r;
+}
+
+PSMI_ALWAYS_INLINE(
+ips_path_rec_t *ips_select_path(struct ips_proto *proto,
+				ips_path_type_t path_type,
+				ips_epaddr_t *ipsaddr))
+{
+  uint32_t path_idx;
+
+  if (proto->flags & IPS_PROTO_FLAG_PPOLICY_ADAPTIVE) {
+    /* If dispersive routes are configured then select the routes in round
+     * robin order. We may want to use congestion information to select the
+     * least loaded path.
+     */
+    path_idx = ipsaddr->epr.epr_next_path[path_type];
+    if (++ipsaddr->epr.epr_next_path[path_type] >=
+	ipsaddr->epr.epr_num_paths[path_type])
+      ipsaddr->epr.epr_next_path[path_type] = 0;
+  }
+  else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_DST)
+    path_idx = /* Key on the destination context */
+      ipsaddr->epr.epr_context % ipsaddr->epr.epr_num_paths[path_type];
+  else if (proto->flags & IPS_PROTO_FLAG_PPOLICY_STATIC_SRC)
+    path_idx = /* Key off the src context */
+      ipsaddr->proto->ep->context.base_info.spi_context % ipsaddr->epr.epr_num_paths[path_type];
+  else /* Base LID routed - Default in Infinipath 2.5 (Oct 09). */
+    path_idx = 0;
+
+  return ipsaddr->epr.epr_path[path_type][path_idx];
+}
+
+#endif /* _IPS_PROTO_HELP_H */
diff --git a/ptl_ips/ips_proto_internal.h b/ptl_ips/ips_proto_internal.h
new file mode 100644
index 0000000..8954ff3
--- /dev/null
+++ b/ptl_ips/ips_proto_internal.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#ifndef _IPS_PROTO_INTERNAL_H +#define _IPS_PROTO_INTERNAL_H + +#include "ips_proto_header.h" +#include "ips_expected_proto.h" +#include "ips_proto_help.h" + +/* + * Connect protocol. + * + * On receive, handled by upcalling into the connect interface. + * On send, handled by ips_proto by having connect compose the message. + */ +psm_error_t ips_proto_process_connect(struct ips_proto *proto, psm_epid_t epid, + uint8_t opcode, + struct ips_message_header *p_hdr, + void *payload, uint32_t paylen); +int ips_proto_build_connect_message(struct ips_proto *proto, + struct ips_proto_ctrl_message *msg, + ips_epaddr_t *ptladdr, uint8_t opcode, + void *payload); + +psm_error_t ips_proto_timer_ack_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_send_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_ctrlq_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_proto_timer_pendq_callback(struct psmi_timer *, uint64_t); +psm_error_t ips_cca_adjust_rate(ips_path_rec_t *path_rec, int cct_increment); +psm_error_t ips_cca_timer_callback(struct psmi_timer *current_timer, uint64_t current); +void +ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context); + +psm_error_t ips_proto_recv_init(struct ips_proto *proto); +psm_error_t ips_proto_recv_fini(struct ips_proto *proto); + +#define IPS_PROTO_MQ_CTS_MSGSIZE 64 + +#endif /* _IPS_PROTO_INTERNAL_H */ diff --git a/ptl_ips/ips_proto_mq.c b/ptl_ips/ips_proto_mq.c new file mode 100644 index 0000000..3297753 --- /dev/null +++ b/ptl_ips/ips_proto_mq.c @@ -0,0 +1,964 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define MQ_NUM_MTUS(size,mtu) (((size) + (mtu) - 1) / (mtu)) +#define MQ_EGRLONG_ENABLE_MULTIFLOW 0 + +PSMI_NEVER_INLINE( +ips_scb_t * __sendpath +ips_poll_scb(struct ips_proto *proto, + int npkts, int len, uint32_t flags, int istiny)) +{ + ips_scb_t *scb = NULL; + psmi_assert(npkts > 0); + psm_error_t err; + + proto->stats.scb_egr_unavail_cnt++; + + PSMI_BLOCKUNTIL(proto->ep,err, + ((scb = istiny ? + ips_scbctrl_alloc_tiny(&proto->scbc_egr) : + ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags)) != NULL)); + psmi_assert(scb != NULL); + return scb; +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_tiny(struct ips_proto *proto)) +{ + ips_scb_t* scb = ips_scbctrl_alloc_tiny(&proto->scbc_egr); + // common case should branch right through + if_pt (scb != NULL) + return scb; + else + return ips_poll_scb(proto, 1, 0, 0, 1); +} + +PSMI_ALWAYS_INLINE( +ips_scb_t * +mq_alloc_pkts(struct ips_proto *proto, int npkts, int len, uint32_t flags)) +{ + psmi_assert(npkts > 0); + ips_scb_t* scb = ips_scbctrl_alloc(&proto->scbc_egr, npkts, len, flags); + if_pt (scb != NULL) { + return scb; + } + else { + return ips_poll_scb(proto, npkts, len, flags, 0 /* not tiny scb */); + } +} + +static +int __recvpath +ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) +{ + psm_mq_req_t req = (psm_mq_req_t)reqp; + + req->send_msgoff += nbytes; + if (req->send_msgoff == req->send_msglen) { + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&req->mq->completed_q, req); + } + return IPS_RECVHDRQ_CONTINUE; +} + +static +int __recvpath +ips_proto_mq_rv_complete(void *reqp) +{ + psm_mq_req_t req = (psm_mq_req_t) reqp; + psmi_mq_handle_rts_complete(req); + + return IPS_RECVHDRQ_CONTINUE; +} + +static +void __recvpath +ips_proto_mq_rv_complete_exp(void *reqp) +{ + ips_proto_mq_rv_complete(reqp); + return; +} + +extern psm_error_t ips_ptl_poll(ptl_t *ptl, int _ignored); + +/* + * Mechanism to capture PIO-ing or DMA-ing the MQ message envelope + * + * Recoverable errors: + * PSM_OK: If PIO, envelope is sent. + * If DMA, all queued up packets on flow were flushed. + * + * Recoverable errors converted to PSM_OK just before return: + * PSM_OK_NO_PROGRESS: DMA-only, flushed 1 but not all queued packets. + * PSM_EP_NO_RESOURCES: + * If PIO, no pio available or cable currently pulled. + * If DMA, can be that no scb's available to handle unaligned packets + * or writev returned a recoverable error (no mem for + * descriptors, dma interrupted or no space left in dma queue). + * + * Unrecoverable errors (PIO or DMA). + * PSM_EP_DEVICE_FAILURE: Unexpected error calling writev(), chip failure, + * rxe/txe parity error. + * PSM_EP_NO_NETWORK: No network, no lid, ... 
+ */ +PSMI_ALWAYS_INLINE( +psm_error_t +ips_mq_send_envelope(struct ips_proto *proto, psm_epaddr_t mepaddr, + ips_epaddr_t *ipsaddr, struct ips_scb *scb, int do_flush)) +{ + psm_error_t err = PSM_OK; + struct ips_flow *flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + + if_pf (proto->flags & IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA) { + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; + + if_pt (ips_scb_length(scb)) /* For DMA envelope need local completion */ + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + } + + flow->xmit_seq_num.msg = mepaddr->mctxt_send_seqnum&0xff; + flow->recv_seq_num.msg = (mepaddr->mctxt_send_seqnum>>8)&0xff; + mepaddr->mctxt_send_seqnum++; + + flow->fn.xfer.enqueue(flow, scb); + + if ((flow->transfer == PSM_TRANSFER_PIO) || + (flow->transfer == PSM_TRANSFER_DMA && do_flush)) + err = flow->fn.xfer.flush(flow, NULL); + + if (do_flush) + err = ips_recv_progress_if_busy(ipsaddr->ptl, err); + + PSMI_BLOCKUNTIL(proto->ep,err, (scb->flags&IPS_SEND_FLAG_PENDING) == 0); + + /* As per the PSM error model (or lack thereof), PSM clients expect to see + * only PSM_OK as a recoverable error */ + if (err == PSM_EP_NO_RESOURCES || err == PSM_OK_NO_PROGRESS) + err = PSM_OK; + return err; +} + +/* + * We don't use message striping for middle message protocol, + * Tests on sandy-bridge two HCAs show lower bandwidth if + * message striping is used. + */ +void __sendpath +ips_mq_send_payload(psm_epaddr_t epaddr, psmi_egrid_t egrid, + void *ubuf, uint32_t len, uint32_t offset, + psm_mq_req_t req, uint32_t flags) +{ + psm_error_t err; + + ips_scb_t *scb; + uintptr_t buf = (uintptr_t) ubuf; + uint32_t nbytes_left = len; + uint32_t pktlen, frag_size; + ips_epaddr_t *ipsaddr; + struct ips_proto *proto; + int is_blocking = !!(req == NULL); + ptl_epaddr_flow_t flowid = + (flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA) ? + EP_FLOW_GO_BACK_N_DMA : EP_FLOW_GO_BACK_N_PIO; + struct ips_flow *flow; + + psmi_assert(len > 0); + ipsaddr = epaddr->ptladdr; + proto = ipsaddr->proto; + flow = &ipsaddr->flows[flowid]; + frag_size = flow->frag_size; + + if (!(flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA)) goto spio; + + psmi_assert(req != NULL); + pktlen = len; + /* The payload size is limited by the pbc.length field which is 16 bits in + * DWORD, including both message header and payload. This translates to + * less than 256K payload. So 128K is used. 
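+ * (Worked numbers, assuming the 16-bit field counts dwords: 0xFFFF dwords
+ * x 4 bytes = 262140 bytes for header plus payload, i.e. just under 256K;
+ * the 131072-byte cap below stays at roughly half that limit.)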
*/ + if (pktlen > 131072) pktlen = 131072; + + do { + scb = mq_alloc_pkts(proto, 1, 0, 0); + psmi_assert(scb != NULL); + +#if 0 + /* turn on to use single frag-size packet */ + pktlen = min(frag_size, nbytes_left); +#else + pktlen = min(pktlen, nbytes_left); +#endif + ips_scb_length(scb) = pktlen; + ips_scb_mqhdr(scb) = MQ_MSG_DATA_BLK; + ips_scb_mqparam(scb).u32w0 = egrid.egr_data; + ips_scb_mqparam(scb).u32w1 = offset; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_buffer(scb) = (void *) buf; + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + if (nbytes_left == 0) { + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + } else { + req->send_msgoff += pktlen; + } + + scb->nfrag = (pktlen + frag_size - 1) / frag_size; + scb->frag_size = frag_size; + + /* attach checksum if enabled, this matches what is done for tid-sdma */ + if (proto->flags & IPS_PROTO_FLAG_CKSUM && !nbytes_left) { + uint32_t cksum = 0xffffffff; + cksum = ips_crc_calculate(len, (uint8_t *)(buf-len), cksum); + scb->ips_lrh.data[0].u32w0 = cksum; + scb->ips_lrh.data[0].u32w1 = offset - len; + } + + flow->fn.xfer.enqueue(flow, scb); + + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + + if (nbytes_left == 0) { + err = flow->fn.xfer.flush(flow, NULL); + if (err == PSM_EP_NO_RESOURCES || err == PSM_OK_NO_PROGRESS) { + err = ips_recv_progress_if_busy + (ipsaddr->ptl, PSM_EP_NO_RESOURCES); + } + } + + } while (nbytes_left); + + return; + +spio: + do { +/* + * Each flow/proto uses its own scb. If a scb from one proto is + * used by another proto, there is a teardown problem, where + * a proto deallocates the scb still in use by another proto. + */ + pktlen = min(frag_size, nbytes_left); + scb = mq_alloc_pkts(proto, 1, pktlen, is_blocking ? 
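+                        /* blocking sends get an scb bounce buffer so the
+                         * caller's buffer can be reused on return, while
+                         * non-blocking sends transmit straight from the user
+                         * buffer and signal completion via callback later */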
IPS_SCB_FLAG_ADD_BUFFER : 0); + psmi_assert(scb != NULL); + + ips_scb_length(scb) = pktlen; + ips_scb_mqhdr(scb) = MQ_MSG_DATA; + ips_scb_mqparam(scb).u32w0 = egrid.egr_data; + ips_scb_mqparam(scb).u32w1 = offset; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + + _IPATH_VDBG("payload=%p, thislen=%d, frag_size=%d, nbytes_left=%d\n", + (void *) buf, pktlen, frag_size, nbytes_left); + if (!is_blocking) /* non-blocking, send from user's buffer */ + ips_scb_buffer(scb) = (void *) buf; + else /* blocking, copy to bounce buffer */ + psmi_mq_mtucpy(ips_scb_buffer(scb), (void *) buf, pktlen); + + buf += pktlen; + offset += pktlen; + nbytes_left -= pktlen; + + if (nbytes_left == 0) { /* last packet */ + if (!is_blocking) { + /* non-blocking mode, need completion */ + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + } else { + if (!is_blocking) { + req->send_msgoff += pktlen; + } + } + + flow->fn.xfer.enqueue(flow, scb); + + /* we need to flush the pending queue */ + err = flow->fn.xfer.flush(flow, NULL); + err = ips_recv_progress_if_busy(ipsaddr->ptl, err); + + } while (nbytes_left); + + return; +} + + +PSMI_ALWAYS_INLINE( +void +ips_shortcpy(void* vdest, const void* vsrc, uint32_t nchars) +) +{ +#ifdef __MIC__ + memcpy(vdest, vsrc, nchars); +#else + unsigned char *dest = vdest; + const unsigned char *src = vsrc; + + if(nchars>>2) + ipath_dwordcpy((uint32_t*)dest, (uint32_t*)src, nchars>>2); + dest += (nchars>>2)<<2; + src += (nchars>>2)<<2; + switch (nchars&0x03) { + case 3: *dest++ = *src++; + case 2: *dest++ = *src++; + case 1: *dest++ = *src++; + } +#endif + return; +} + +static __sendpath +psm_error_t +ips_ptl_mq_rndv(psm_mq_req_t req, psm_epaddr_t mepaddr, ips_epaddr_t *ipsaddr, + const void *buf, uint32_t len) +{ + ips_scb_t *scb; + psm_error_t err = PSM_OK; + struct ips_proto *proto = ipsaddr->proto; + + req->buf = (void *) buf; + req->buf_len = len; + req->send_msglen = len; + req->send_msgoff = 0; + req->recv_msgoff = 0; + req->rts_peer = ipsaddr->epaddr; + + scb = mq_alloc_tiny(proto); + + /* If the expected tid protocol is active, use it or else resort to + * eager-based r-v. */ + if (proto->protoexp != NULL) + ips_scb_mqhdr(scb) = req->type & MQE_TYPE_WAITING ? + MQ_MSG_RTS_WAIT : MQ_MSG_RTS; + else + ips_scb_mqhdr(scb) = MQ_MSG_RTS_EGR; + + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + + ips_scb_uwords(scb)[0].u64 = req->tag; + ips_scb_uwords(scb)[1].u32w0 = psmi_mpool_get_obj_index(req); + ips_scb_uwords(scb)[1].u32w1 = len; + + memset(&req->tid_grant, 0, sizeof(req->tid_grant)); + if ((err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE))) + goto fail; + + /* Assume that we already put a few rndv requests in flight. 
This helps + * for bibw microbenchmarks and doesn't hurt the 'blocking' case since + * we're going to poll anyway */ + psmi_poll_internal(ipsaddr->epaddr->ep, 1); + +fail: + _IPATH_VDBG("[rndv][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p/%d]: %s\n", + psmi_epaddr_get_name(proto->ep->epid), + psmi_epaddr_get_name(ipsaddr->epaddr->epid), buf, len, req->tag, req, + psmi_mpool_get_obj_index(req), + psm_error_get_string(err)); + + return err; +} + +psm_error_t __sendpath +ips_proto_mq_isend(psm_mq_t mq, psm_epaddr_t mepaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len, void *context, + psm_mq_req_t *req_o) +{ + uint8_t *buf = (uint8_t *) ubuf; + uint32_t pktlen = 0; + ips_scb_t *scb; + psm_epaddr_t epaddr = mepaddr->mctxt_current; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + struct ips_proto *proto = ipsaddr->proto; + uint32_t pad_write_bytes; + psm_error_t err = PSM_OK; + psm_mq_req_t req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND); + if_pf (req == NULL) + return PSM_NO_MEMORY; + + mepaddr->mctxt_current = epaddr->mctxt_next; + req->send_msglen = len; + req->tag = tag; + req->context = context; + + if (!flags && len <= MQ_IPATH_THRESH_TINY) { + scb = mq_alloc_tiny(proto); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_HDR; + ips_scb_hdr_dlen(scb) = len; + ips_scb_mqhdr(scb) = MQ_MSG_TINY; + ips_scb_mqtag(scb) = tag; + mq_copy_tiny((uint32_t *)&ips_scb_mqparam(scb), (uint32_t *)buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + /* We can mark this op complete since all the data is now copied + * into an SCB that remains live until it is remotely acked */ + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + _IPATH_VDBG("[itiny][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag, req); + *req_o = req; + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + return err; + } + else if (flags & PSM_MQ_FLAG_SENDSYNC) {/* skip eager accounting below */ + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + *req_o = req; + return err; + } + else if (len <= ipsaddr->epr.epr_piosize) { + uint32_t cksum_len = (proto->flags & IPS_PROTO_FLAG_CKSUM) ? 
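+        /* when send-side checksums are enabled the CRC bytes count toward
+         * the cache-line padding computed below, e.g. len 100 with no
+         * checksum gives pad = (64 - (100 & 63)) & 63 = 28 bytes */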
+ PSM_CRC_SIZE_IN_BYTES : 0; + + pad_write_bytes = ((PSM_CACHE_LINE_BYTES - + ((len + cksum_len) & (PSM_CACHE_LINE_BYTES-1))) & + (PSM_CACHE_LINE_BYTES-1)); + + if_pf ((pad_write_bytes + len) > ipsaddr->epr.epr_piosize) + pad_write_bytes = 0; + scb = mq_alloc_pkts(proto, 1, (len + pad_write_bytes), + IPS_SCB_FLAG_ADD_BUFFER); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_hdr_dlen(scb) = pad_write_bytes; + ips_scb_length(scb) = len + pad_write_bytes; + ips_scb_mqhdr(scb) = MQ_MSG_SHORT; + ips_scb_mqtag(scb) = tag; + ips_shortcpy (ips_scb_buffer(scb), buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + req->state = MQ_STATE_COMPLETE; + mq_qq_append(&mq->completed_q, req); + _IPATH_VDBG("[ishrt][%s->%s][b=%p][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag, req); + } + else if (len <= mq->ipath_thresh_rv) { + uint32_t proto_flags = proto->flags & IPS_PROTO_FLAG_MQ_MASK; + psmi_egrid_t egrid; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + /* directly send from user's buffer */ + ips_scb_buffer(scb) = buf; + + if (len < proto->iovec_thresh_eager) { + if (len <= 2 * ipsaddr->epr.epr_piosize) { + // split into 2 packets and round second down to dword multiple + pktlen = len - (((len >> 1) + 3) & ~0x3); + } + else { + pktlen = min(len, ipsaddr->epr.epr_piosize); + } + proto_flags &= ~IPS_PROTO_FLAG_MQ_EAGER_SDMA; + + /* + * since following packets are sent on the same flow, + * we only wait for completion for the last packet + */ + req->send_msgoff = pktlen; + } + else { + psmi_assert(proto_flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA); + /* send the unaligned bytes only, this is required by sdma. */ + pktlen = (uint32_t)((uintptr_t)buf & 0x3); + if (pktlen) pktlen = 4 - pktlen; + + /* send from user buffer, need completion */ + req->send_msgoff = 0; + if (pktlen) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + } + psmi_assert(pktlen <= ipsaddr->epr.epr_piosize); + + ips_scb_length(scb) = pktlen; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr(scb) = MQ_MSG_LONG; + ips_scb_mqtag(scb) = tag; + ips_scb_mqparam(scb).u32w1 = len; + + /* We need a new eager long message number */ + egrid.egr_data = ips_scb_mqparam(scb).u32w0 = + mepaddr->xmit_egrlong.egr_data; + mepaddr->xmit_egrlong.egr_msgno++; + + /* Send the envelope but don't flush if writev is enabled */ + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_FALSE); + ips_mq_send_payload(epaddr, egrid, + buf+pktlen, len-pktlen, pktlen, req, + proto_flags); + + _IPATH_VDBG("[ilong][%s->%s][b=%p][l=%d][m=%d][t=%"PRIx64"][req=%p]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, pktlen, len, tag, req); + } + else { /* skip eager accounting below */ + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + *req_o = req; + return err; + } + + *req_o = req; + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +__sendpath +psm_error_t +ips_proto_mq_send(psm_mq_t mq, psm_epaddr_t mepaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len) +{ + uint8_t *buf = (uint8_t *) ubuf; + uint32_t pktlen; + ips_scb_t *scb; + psm_epaddr_t epaddr = mepaddr->mctxt_current; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + uint32_t pad_write_bytes; + psm_error_t err = PSM_OK; + struct ips_proto *proto = ipsaddr->proto; + + mepaddr->mctxt_current = 
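+        /* rotate to the next context of this multi-context endpoint so
+         * successive sends are spread round-robin across contexts */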
epaddr->mctxt_next; + + if (flags == 0 && len <= MQ_IPATH_THRESH_TINY) { + scb = mq_alloc_tiny(proto); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_HDR; + ips_scb_hdr_dlen(scb) = len; + ips_scb_mqhdr(scb) = MQ_MSG_TINY; + ips_scb_mqtag(scb) = tag; + + mq_copy_tiny((uint32_t *)&ips_scb_mqparam(scb), (uint32_t *)buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + _IPATH_VDBG("[tiny][%s->%s][b=%p][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag); + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + return err; + } + else if ((flags & PSM_MQ_FLAG_SENDSYNC)) { + goto do_rendezvous; + } + else if (len <= ipsaddr->epr.epr_piosize) { + uint32_t cksum_len = (proto->flags & IPS_PROTO_FLAG_CKSUM) ? + PSM_CRC_SIZE_IN_BYTES : 0; + + pad_write_bytes = ((PSM_CACHE_LINE_BYTES - + ((len + cksum_len) & (PSM_CACHE_LINE_BYTES-1))) & + (PSM_CACHE_LINE_BYTES-1)); + + if_pf ((pad_write_bytes + len) > ipsaddr->epr.epr_piosize) + pad_write_bytes = 0; + + scb = mq_alloc_pkts(proto, 1, (len + pad_write_bytes), + IPS_SCB_FLAG_ADD_BUFFER); + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_hdr_dlen(scb) = pad_write_bytes; + ips_scb_length(scb) = len + pad_write_bytes; + ips_scb_mqhdr(scb) = MQ_MSG_SHORT; + ips_scb_mqtag(scb) = tag; + + ips_shortcpy (ips_scb_buffer(scb), buf, len); + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_TRUE); + _IPATH_VDBG("[shrt][%s->%s][b=%p][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, len, tag); + } + else if (len <= mq->ipath_thresh_rv) { + uint32_t proto_flags = proto->flags & IPS_PROTO_FLAG_MQ_MASK; + psmi_egrid_t egrid; + psm_mq_req_t req = NULL; + + if (len < proto->iovec_thresh_eager_blocking) { + if (len <= 2 * ipsaddr->epr.epr_piosize) { + // split into 2 packets and round second down to dword multiple + pktlen = len - (((len >> 1) + 3) & ~0x3); + } + else { + pktlen = min(len, ipsaddr->epr.epr_piosize); + } + proto_flags &= ~IPS_PROTO_FLAG_MQ_EAGER_SDMA; + + scb = mq_alloc_pkts(proto, 1, pktlen, IPS_SCB_FLAG_ADD_BUFFER); + /* In blocking mode, copy to scb bounce buffer */ + ips_shortcpy (ips_scb_buffer(scb), buf, pktlen); + } + else { + psmi_assert(proto_flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA); + /* send the unaligned bytes only, this is required by sdma. 
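+ * (e.g. a buffer whose address ends in ...03 has buf & 0x3 == 3, so
+ * pktlen = 4 - 3 = 1: that lone byte goes out first and the remaining
+ * payload starts dword-aligned for the sdma engine)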
*/ + pktlen = (uint32_t)((uintptr_t)buf & 0x3); + if (pktlen) pktlen = 4 - pktlen; + + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + req->type |= MQE_TYPE_WAITING; + req->send_msglen = len; + req->tag = tag; + + scb = mq_alloc_pkts(proto, 1, 0, 0); + /* directly send from user's buffer */ + ips_scb_buffer(scb) = buf; + + /* send from user buffer, need completion */ + req->send_msgoff = 0; + if (pktlen) { + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } + } + psmi_assert(pktlen <= ipsaddr->epr.epr_piosize); + + ips_scb_length(scb) = pktlen; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr(scb) = MQ_MSG_LONG; + ips_scb_mqtag(scb) = tag; + ips_scb_mqparam(scb).u32w1 = len; + + /* We need a new eager long message number */ + egrid.egr_data = ips_scb_mqparam(scb).u32w0 = + mepaddr->xmit_egrlong.egr_data; + mepaddr->xmit_egrlong.egr_msgno++; + + /* Send the envelope but don't flush if writev is enabled */ + err = ips_mq_send_envelope(proto, mepaddr, ipsaddr, scb, PSMI_FALSE); + ips_mq_send_payload(epaddr, egrid, + buf+pktlen, len-pktlen, pktlen, req, + proto_flags); + if (req) psmi_mq_wait_internal(&req); + + _IPATH_VDBG("[long][%s->%s][b=%p][l=%d][m=%d][t=%"PRIx64"]\n", + psmi_epaddr_get_name(mq->ep->epid), + psmi_epaddr_get_name(epaddr->epid), buf, pktlen, len, tag); + } + else { + psm_mq_req_t req; +do_rendezvous: + /* Block until we can get a req */ + PSMI_BLOCKUNTIL(mq->ep, err, + (req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND))); + req->type |= MQE_TYPE_WAITING; + req->tag = tag; + err = ips_ptl_mq_rndv(req, mepaddr, ipsaddr, ubuf, len); + if (err != PSM_OK) + return err; + psmi_mq_wait_internal(&req); + return err; /* skip accounting, done separately at completion time */ + } + + mq->stats.tx_num++; + mq->stats.tx_eager_num++; + mq->stats.tx_eager_bytes += len; + + return err; +} + +static +psm_error_t __recvpath +ips_proto_mq_rts_match_callback(psm_mq_req_t req, int was_posted) +{ + psm_epaddr_t epaddr = req->rts_peer; + ips_epaddr_t *ipsaddr = epaddr->ptladdr; + struct ips_proto *proto = ipsaddr->proto; + + /* We have a match. + * + * If we're doing eager-based r-v, just send back the sreq and length and + * have the sender complete the send. + * + */ + if (proto->protoexp == NULL) { /* only eager-based r-v so far */ + struct ips_pend_sends *pends = &proto->pend_sends; + struct ips_pend_sreq *sreq = psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + if (sreq == NULL) return PSM_NO_MEMORY; + sreq->type = IPS_PENDSEND_EAGER_REQ; + sreq->req = req; + + STAILQ_INSERT_TAIL(&pends->pendq, sreq, next); + psmi_timer_request(proto->timerq, &pends->timer, PSMI_TIMER_PRIO_1); + } + else { + ips_protoexp_tid_get_from_token( + proto->protoexp, req->buf, req->recv_msglen, epaddr, + req->rts_reqidx_peer, + req->type & MQE_TYPE_WAITING_PEER ? IPS_PROTOEXP_TIDGET_PEERWAIT : 0, + ips_proto_mq_rv_complete_exp, req); + } + + _IPATH_VDBG("req=%p, dest=%p, len=%d, recv_msglen=%d, stok=%p, expected=%s\n", + req, req->buf, req->buf_len, req->recv_msglen, + req->ptl_req_ptr, was_posted ? 
"YES" : "NO"); + + return PSM_OK; +} + +psm_error_t __recvpath +ips_proto_mq_push_eager_req(struct ips_proto *proto, psm_mq_req_t req) +{ + ips_scb_t *scb; + ptl_arg_t *args; + ips_epaddr_t *ipsaddr; + struct ips_flow *flow; + + scb = ips_scbctrl_alloc(&proto->scbc_egr, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + + args = (ptl_arg_t *) ips_scb_uwords(scb); + + args[0].u32w0 = req->rts_reqidx_peer; + args[0].u32w1 = psmi_mpool_get_obj_index(req); + args[1].u32w0 = req->recv_msglen; + req->egrid.egr_data = args[0].u32w1; + + ipsaddr = req->rts_peer->ptladdr; + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_CTS_EGR; + + if (req->recv_msglen == 0) { + ips_proto_mq_rv_complete(req); + } + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + return PSM_OK; +} + +psm_error_t __recvpath +ips_proto_mq_push_eager_data(struct ips_proto *proto, psm_mq_req_t req) +{ + uintptr_t buf = (uintptr_t) req->buf; + ips_epaddr_t *ipsaddr = req->rts_peer->ptladdr; + uint32_t nbytes_this; + uint32_t nbytes_left = req->send_msglen - req->recv_msgoff; + uint16_t frag_size; + struct ips_flow *flow; + ips_scb_t *scb; + + psmi_assert(nbytes_left > 0); + + if (!(proto->flags & IPS_PROTO_FLAG_MQ_EAGER_SDMA)) goto spio; + + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA]; + frag_size = flow->frag_size; + nbytes_this = 131072/8; + while (nbytes_left > 0) { + scb = ips_scbctrl_alloc(proto->scbc_rv, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + +#if 0 + /* turn on to use single frag-size packet */ + nbytes_this = min(frag_size, nbytes_left); +#else + nbytes_this = min(nbytes_this, nbytes_left); +#endif + + ips_scb_length(scb) = nbytes_this; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_DATA_REQ_BLK; + ips_scb_buffer(scb) = (void *)(buf + req->recv_msgoff); + ips_scb_mqparam(scb).u32w0 = req->rts_reqidx_peer; + ips_scb_mqparam(scb).u32w1 = req->recv_msgoff; + + if (nbytes_left == nbytes_this) { + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + } else { + req->send_msgoff += nbytes_this; + } + + scb->nfrag = (nbytes_this + frag_size - 1) / frag_size; + scb->frag_size = frag_size; + + /* attach checksum if enabled, this matches what is done for tid-sdma */ + if (proto->flags&IPS_PROTO_FLAG_CKSUM && nbytes_left==nbytes_this) { + uint32_t cksum = 0xffffffff; + cksum = ips_crc_calculate(req->send_msglen, req->buf, cksum); + scb->ips_lrh.data[0].u32w0 = cksum; + } + + ips_scb_flags(scb) |= IPS_SEND_FLAG_ACK_REQ; + ips_scb_flags(scb) |= IPS_SEND_FLAG_WAIT_SDMA; + SLIST_NEXT(scb, next) = NULL; + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + nbytes_left -= nbytes_this; + req->recv_msgoff += nbytes_this; + } + + return PSM_OK; + +spio: + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + frag_size = flow->frag_size; + while (nbytes_left > 0) { + scb = ips_scbctrl_alloc(proto->scbc_rv, 1, 0, 0); + if (scb == NULL) + return PSM_OK_NO_PROGRESS; + + nbytes_this = min(nbytes_left, frag_size); + ips_scb_length(scb) = nbytes_this; + ips_scb_subopcode(scb) = OPCODE_SEQ_MQ_CTRL; + ips_scb_mqhdr (scb) = MQ_MSG_DATA_REQ; + ips_scb_buffer(scb) = (void *)(buf + req->recv_msgoff); + ips_scb_mqparam(scb).u32w0 = req->rts_reqidx_peer; + ips_scb_mqparam(scb).u32w1 = req->recv_msgoff; + + ips_scb_cb(scb) = ips_proto_mq_eager_complete; + ips_scb_cb_param(scb) = req; + if (nbytes_left == nbytes_this) { + ips_scb_flags(scb) |= 
IPS_SEND_FLAG_ACK_REQ; + } +#if 0 + _IPATH_INFO("send req %p, off %d/%d, len %d, last=%s\n", + req, req->send_msgoff, req->send_msglen, nbytes_this, + nbytes_left == nbytes_this ? "YES" : "NO"); +#endif + SLIST_NEXT(scb, next) = NULL; + + flow->fn.xfer.enqueue(flow, scb); + flow->fn.xfer.flush(flow, NULL); + + nbytes_left -= nbytes_this; + req->recv_msgoff += nbytes_this; + } + + return PSM_OK; +} + +int __recvpath +ips_proto_mq_handle_cts(struct ips_proto *proto, ptl_arg_t *args) +{ + psm_mq_req_t req; + psm_mq_t mq = proto->ep->mq; + uint32_t reqidx, reqidx_peer; + struct ips_pend_sreq *sreq; + uint32_t msglen; + + reqidx = args[0].u32w0; + reqidx_peer = args[0].u32w1; + msglen = args[1].u32w0; + + req = psmi_mpool_find_obj_by_index(mq->sreq_pool, reqidx); + psmi_assert(req != NULL); + if (req == NULL) return IPS_RECVHDRQ_BREAK; + + if (msglen == 0) { + ips_proto_mq_rv_complete(req); + return IPS_RECVHDRQ_CONTINUE; + } + + sreq = psmi_mpool_get(proto->pend_sends_pool); + psmi_assert(sreq != NULL); + if (sreq == NULL) return IPS_RECVHDRQ_BREAK; + sreq->type = IPS_PENDSEND_EAGER_DATA; + sreq->req = req; + req->rts_reqidx_peer = reqidx_peer; + req->send_msglen = msglen; + req->send_msgoff = 0; + STAILQ_INSERT_TAIL(&proto->pend_sends.pendq, sreq, next); + /* Make sure it's processed by timer */ + psmi_timer_request(proto->timerq, &proto->pend_sends.timer, + PSMI_TIMER_PRIO_1); + + /* XXX Optimization here: If the 'req' is blocking in the MPI sense, we + * could choose to break out of the progress loop and make progress on it + * ASAP instead of continuing to process the receive queue */ + return IPS_RECVHDRQ_CONTINUE; +} + +int __recvpath +ips_proto_mq_handle_rts_envelope(psm_mq_t mq, int mode, psm_epaddr_t epaddr, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen) +{ + psm_mq_req_t req; + _IPATH_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long) tag, reqidx_peer, msglen); + int rc = psmi_mq_handle_rts(mq, tag, 0, msglen, epaddr, + ips_proto_mq_rts_match_callback, &req); + req->rts_reqidx_peer = reqidx_peer; + if (mode == MQ_MSG_RTS_WAIT) + req->type |= MQE_TYPE_WAITING_PEER; + + if (rc == MQ_RET_MATCH_OK) { + ips_proto_mq_rts_match_callback(req, 1); + /* XXX if blocking, break out of progress loop */ + } + + /* If no match, will be called when send actually matches */ + return IPS_RECVHDRQ_CONTINUE; +} + +int __recvpath +ips_proto_mq_handle_rts_envelope_outoforder(psm_mq_t mq, int mode, + psm_epaddr_t peer, uint16_t msg_seqnum, + uint64_t tag, uint32_t reqidx_peer, + uint32_t msglen) +{ + psm_mq_req_t req; + _IPATH_VDBG("tag=%llx reqidx_peer=%d, msglen=%d\n", + (long long) tag, reqidx_peer, msglen); + psmi_mq_handle_rts_outoforder(mq, tag, 0, msglen, + peer, msg_seqnum, + ips_proto_mq_rts_match_callback, &req); + req->rts_reqidx_peer = reqidx_peer; + if (mode == MQ_MSG_RTS_WAIT) + req->type |= MQE_TYPE_WAITING_PEER; + + /* If no match, will be called when send actually matches */ + return IPS_RECVHDRQ_CONTINUE; +} + diff --git a/ptl_ips/ips_proto_params.h b/ptl_ips/ips_proto_params.h new file mode 100644 index 0000000..62a4e0a --- /dev/null +++ b/ptl_ips/ips_proto_params.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PROTO_PARAMS_H +#define _IPS_PROTO_PARAMS_H + +/* Packet header formats */ +#define CRC_SIZE_IN_BYTES 4 +#define PCB_SIZE_IN_BYTES 8 +#define LRH_VL_SHIFT 12 +#define BTH_OPCODE_SHIFT 24 +#define BTH_EXTRA_BYTE_SHIFT 20 +#define BTH_BECN_SHIFT 30 +#define BTH_FECN_SHIFT 31 +#define BYTE2WORD_SHIFT 2 +#define LOWER_24_BITS 0xFFFFFF +#define LOWER_16_BITS 0xFFFF +#define LOWER_8_BITS 0xFF +#define MAX_VL_SUPPORTED 8 +#define PSM_CRC_SIZE_IN_BYTES 8 /* Change in ipath_user.h as well */ +#define PSM_CACHE_LINE_BYTES 64 +#define PSM_FLOW_CREDITS 64 + +#ifndef BITS_PER_BYTE +# define BITS_PER_BYTE 8 +#endif + +/* Send retransmission */ +#define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ + +#define IPS_PROTO_ERRCHK_MS_MIN_DEFAULT 8 /* in millisecs */ +#define IPS_PROTO_ERRCHK_MS_MAX_DEFAULT 32 /* in millisecs */ +#define IPS_PROTO_ERRCHK_FACTOR_DEFAULT 2 +#define PSM_TID_TIMEOUT_DEFAULT "8:32:2" /* update from above params */ + +#define IPS_HDR_TID(p_hdr) \ + ((__le32_to_cpu((p_hdr)->iph.ver_context_tid_offset) >> \ + INFINIPATH_I_TID_SHIFT) & INFINIPATH_I_TID_MASK) + +/* time conversion macros */ +#define us_2_cycles(us) nanosecs_to_cycles(1000ULL*(us)) +#define ms_2_cycles(ms) nanosecs_to_cycles(1000000ULL*(ms)) +#define sec_2_cycles(sec) nanosecs_to_cycles(1000000000ULL*(sec)) + +/* Per-flow flags */ +#define IPS_FLOW_FLAG_NAK_SEND 0x01 +#define IPS_FLOW_FLAG_WRITEV 0x02 +#define IPS_FLOW_FLAG_PENDING_ACK 0x04 +#define IPS_FLOW_FLAG_GEN_BECN 0x08 +#define IPS_FLOW_FLAG_CONGESTED 0x10 +#define IPS_FLOW_FLAG_PENDING_NAK 0x20 + +/* per-ipsaddr Flags (sess is ipsaddr) */ +#define SESS_FLAG_HAS_RCVTHREAD 0x2 +#define SESS_FLAG_LOCK_SESS 0x4 +#define SESS_FLAG_HAS_FLOWID 0x8 + +/* tid session expected send flags */ +#define EXP_SEND_FLAG_CLEAR_ALL 0x00 +#define EXP_SEND_FLAG_FREE_TIDS 0x01 + +#define TIMEOUT_INFINITE 0xFFFFFFFFFFFFFFFFULL /* 64 bit all-one's */ + +/* ips_scb_t flags, powers of 2, and disjoint from SEND_FLAG_* values. + * Only the lower 8 bytes are wire-protocol options */ +#define IPS_SEND_FLAG_NONE 0x00 +// Unused -- future use maybe. 
+//#define IPS_SEND_FLAG_ACK_REQ_INTR 0x02 /* request ack with intr */ +#define IPS_SEND_FLAG_ACK_REQ 0x04 /* request ack (normal) */ +#define IPS_SEND_FLAG_UNALIGNED_DATA 0x08 /* unaligned data in hdr */ +#define IPS_SEND_FLAG_HAS_CKSUM 0x10 /* Has checksum */ +#define IPS_SEND_FLAG_EXPECTED_DONE 0x20 /* Last expected packet */ +#define IPS_SEND_FLAG_CCA_BECN 0x40 /* BECN bit for congestion */ +#define IPS_SEND_FLAG_PROTO_OPTS 0xff + +#define IPS_SEND_FLAG_PENDING 0x0100 +#define IPS_SEND_FLAG_PERSISTENT 0x0200 +#define IPS_SEND_FLAG_INTR 0x0400 +#define IPS_SEND_FLAG_WAIT_SDMA 0x0800 +#define IPS_SEND_FLAG_HDR_SUPPRESS 0x1000 + +#define IPS_PROTO_FLAG_MQ_ENVELOPE_SDMA 0x01 +#define IPS_PROTO_FLAG_MQ_EAGER_SDMA 0x02 +#define IPS_PROTO_FLAG_MQ_EXPECTED_SDMA 0x04 +#define IPS_PROTO_FLAG_MQ_MASK 0x0f /* contains all MQ proto flags */ +#define IPS_PROTO_FLAG_CTRL_SDMA 0x10 + +/* Alias for use send dma for everything */ +#define IPS_PROTO_FLAGS_ALL_SDMA 0x17 + +#define IPS_PROTO_FLAG_CKSUM 0x20 +/* Coalesced ACKs (On by default) */ +#define IPS_PROTO_FLAG_COALESCE_ACKS 0x80 + +/* Use Path Record query (off by default) */ +#define IPS_PROTO_FLAG_QUERY_PATH_REC 0x100 + +/* Path selection policies: + * + * (a) Adaptive - Dynamically determine the least loaded paths using various + * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs. + * + * (b) Static schemes - + * (i) static_src - Use path keyed off source context + * (ii) static_dest - Use path keyed off destination context + * (iii) static_base - Use only the base lid path - default till Oct'09. + * + * The default is adaptive. If a zero lmc network is used then there exists + * just one path between endpoints the (b)(iii) case above. + * + */ + +#define IPS_PROTO_FLAG_PPOLICY_ADAPTIVE 0x200 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_SRC 0x400 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_DST 0x800 +#define IPS_PROTO_FLAG_PPOLICY_STATIC_BASE 0x1000 + +/* All static policies */ +#define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00 + +/* IBTA CCA Protocol support */ +#define IPS_PROTO_FLAG_CCA 0x2000 + +/* By default, we use dma in eager (based on PSM_MQ_EAGER_SDMA_SZ) and + * always use it in expected. + */ +#define IPS_PROTO_FLAGS_DEFAULT (IPS_PROTO_FLAG_MQ_EAGER_SDMA | \ + IPS_PROTO_FLAG_MQ_EXPECTED_SDMA | \ + IPS_PROTO_FLAG_COALESCE_ACKS) + +#define IPS_PROTOEXP_FLAG_ENABLED 0x01 /* default */ +//#define IPS_PROTOEXP_FLAG_NAKOPT 0x02 /* *not* default, broken */ +#define IPS_PROTOEXP_FLAG_TID_DEBUG 0x04 /* *not* default */ +#define IPS_PROTOEXP_FLAG_HDR_SUPP 0x08 /* Header suppression enabled */ + +#define IPS_PROTOEXP_FLAGS_DEFAULT (IPS_PROTOEXP_FLAG_ENABLED | \ + IPS_PROTOEXP_FLAG_HDR_SUPP) + +/* We have to get an MTU of at least 2K, or else this breaks some assumptions + * in the packets that handle tid descriptors + */ +#define IPS_PROTOEXP_MIN_MTU 2048 + +/* Bound on the number of packets to feed to send dma at a time. This ensures + * we don't "disappear" in the kernel for too long. + */ +#define IPS_SDMA_MAX_SCB 32 + +/* Fault injection, becomes parameters to psmi_faultinj_getspec so + * a comma-delimited list of + * "spec_name", num, denom + * Where num/denom means fault num out of every denom. + * The defines set 'denum' and assume that num is set to 1 + * + * These values are all defaults, each is overridable via + * PSM_FI_ in the environment (and yes, spec_name is in lowercase + * *in the environment* just to minimize it appearing in the wild). 
The format + * there is so the same thing except that one can set + * a specific seed to the random number generator. + */ +#if 1 +#define IPS_FAULTINJ_DMALOST 20 /* 1 every 20 dma writev get lost */ +#define IPS_FAULTINJ_PIOLOST 100 /* 1 every 100 pio writes get lost */ +#define IPS_FAULTINJ_PIOBUSY 10 /* 1 every 10 pio sends get busy */ +#define IPS_FAULTINJ_RECVLOST 200 /* 1 every 200 pkts dropped at recv */ +#else +#define IPS_FAULTINJ_DMALOST 500 /* 1 every 500 dma writev get lost */ +#define IPS_FAULTINJ_PIOLOST 3000 /* 1 every 3000 pio writes get lost */ +#define IPS_FAULTINJ_PIOBUSY 100 /* 1 every 100 pio sends get busy */ +#define IPS_FAULTINJ_RECVLOST 500 /* 1 every 500 pkts dropped at recv */ +#endif + +#endif /* _IPS_PROTO_PARAMS_H */ diff --git a/ptl_ips/ips_proto_recv.c b/ptl_ips/ips_proto_recv.c new file mode 100644 index 0000000..572c522 --- /dev/null +++ b/ptl_ips/ips_proto_recv.c @@ -0,0 +1,1547 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ipserror.h" +#include "ips_proto.h" +#include "ips_proto_internal.h" + +#define PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS 30 +static void ips_report_strays(struct ips_proto *proto); + +#define INC_TIME_SPEND(timer) + +#define _desc_idx u32w0 +#define _desc_genc u32w1 + +psm_error_t +ips_proto_recv_init(struct ips_proto *proto) +{ + uint32_t interval_secs; + union psmi_envvar_val env_stray; + + psmi_getenv("PSM_STRAY_WARNINTERVAL", + "min secs between stray process warnings", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val) PSM_STRAY_WARN_INTERVAL_DEFAULT_SECS, + &env_stray); + interval_secs = env_stray.e_uint; + if (interval_secs > 0) + proto->stray_warn_interval = sec_2_cycles(interval_secs); + else + proto->stray_warn_interval = 0; + + return PSM_OK; +} + +psm_error_t +ips_proto_recv_fini(struct ips_proto *proto) +{ + ips_report_strays(proto); + return PSM_OK; +} + +#define cycles_to_sec_f(cycles) \ + (((double)cycles_to_nanosecs(cycles)) / 1000000000.0) + +struct ips_stray_epid { + psm_epid_t epid; + uint32_t err_check_bad_sent; + uint32_t ipv4_addr; + uint32_t pid; + uint32_t num_messages; + uint64_t t_warn_next; + uint64_t t_first; + uint64_t t_last; +}; + +static +void +ips_report_strays(struct ips_proto *proto) +{ + struct ips_stray_epid *sepid; + struct psmi_eptab_iterator itor; + psmi_epid_itor_init(&itor, PSMI_EP_CROSSTALK); + double t_runtime = cycles_to_sec_f(proto->t_fini - proto->t_init); + + while ((sepid = psmi_epid_itor_next(&itor))) { + char ipbuf[INET_ADDRSTRLEN], *ip = NULL; + char bufpid[32]; + uint32_t lid = psm_epid_nid(sepid->epid); + double t_first = cycles_to_sec_f(sepid->t_first - proto->t_init); + double t_last = cycles_to_sec_f(sepid->t_last - proto->t_init); + if (sepid->ipv4_addr) + ip = (char *) + inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, sizeof ipbuf); + if (!ip) + snprintf(ipbuf, sizeof ipbuf, "%d (%x)", lid, lid); + + if (sepid->pid) + snprintf(bufpid, sizeof bufpid, "PID=%d", sepid->pid); + else + snprintf(bufpid, sizeof bufpid, "PID unknown"); + + _IPATH_INFO("Process %s on host %s=%s sent %d stray message(s) and " + "was told so %d time(s) (first stray message at %.1fs " + "(%d%%), last at %.1fs (%d%%) into application run)\n", + bufpid, ip ? "IP" : "LID", ipbuf, sepid->num_messages, + sepid->err_check_bad_sent, t_first, + (int) (t_first * 100.0 / t_runtime), t_last, + (int) (t_last * 100.0 / t_runtime)); + + psmi_epid_remove(PSMI_EP_CROSSTALK, sepid->epid); + psmi_free(sepid); + } + psmi_epid_itor_fini(&itor); + return; +} + +/* New scbs now available. If we have pending sends because we were out of + * scbs, put the pendq on the timerq so it can be processed. 
*/ +void +ips_proto_rv_scbavail_callback(struct ips_scbctrl *scbc, void *context) +{ + struct ips_proto *proto = (struct ips_proto *) context; + struct ips_pend_sreq *sreq = STAILQ_FIRST(&proto->pend_sends.pendq); + if (sreq != NULL) + psmi_timer_request(proto->timerq, + &proto->pend_sends.timer, PSMI_TIMER_PRIO_1); + return; +} + +psm_error_t __recvpath +ips_proto_timer_pendq_callback(struct psmi_timer *timer, uint64_t current) +{ + psm_error_t err = PSM_OK; + struct ips_pend_sends *pend_sends = + (struct ips_pend_sends *) timer->context; + struct ips_pendsendq *phead = &pend_sends->pendq; + struct ips_proto *proto = (struct ips_proto *) pend_sends->proto; + struct ips_pend_sreq *sreq; + + while (!STAILQ_EMPTY(phead)) { + sreq = STAILQ_FIRST(phead); + switch (sreq->type) { + case IPS_PENDSEND_EAGER_REQ: + err = ips_proto_mq_push_eager_req(proto, sreq->req); + break; + case IPS_PENDSEND_EAGER_DATA: + err = ips_proto_mq_push_eager_data(proto, sreq->req); + break; + + default: + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown pendq state %d\n", sreq->type); + } + + if (err == PSM_OK) { + STAILQ_REMOVE_HEAD(phead, next); + psmi_mpool_put(sreq); + } + else { /* out of scbs. wait for the next scb_avail callback */ + //printf("!!!!! breaking out of pendq progress\n"); + break; + } + } + + return err; +} + +static +int __recvpath +_process_mq(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev); + uint32_t msglen = paylen; + uint16_t mode = p_hdr->mqhdr; + psm_mq_req_t req; + psmi_egrid_t egrid; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + psm_epaddr_t epaddr = ipsaddr->epaddr; + psm_mq_t mq = rcv_ev->proto->mq; + ptl_arg_t *args; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + int ret = IPS_RECVHDRQ_CONTINUE; + + if (!ips_proto_is_expected_or_nak((struct ips_recvhdrq_event*) rcv_ev)) + goto skip_ack_req; + + _IPATH_VDBG("Rcvd ctrl packet %s length = %i, mode=%d, arg0=%llx arg1=%llx\n", + psmi_epaddr_get_name(epaddr->epid), + paylen, p_hdr->mqhdr, + (long long) p_hdr->data[0].u64, (long long) p_hdr->data[1].u64); + + if (mode <= MQ_MSG_RTS_WAIT) { + ret = ips_proto_check_msg_order(epaddr, flow, p_hdr); + if (ret == 0) return IPS_RECVHDRQ_OOO; + + if (mode <= MQ_MSG_LONG) { + egrid.egr_data = 0; + if (mode == MQ_MSG_SHORT) { + /* May have padded writes, account for it */ + paylen -= p_hdr->hdr_dlen; + msglen = paylen; + } + else if (mode == MQ_MSG_TINY) { + payload = (void *) &p_hdr->data[1]; + msglen = paylen = p_hdr->hdr_dlen; + } + else if (mode == MQ_MSG_LONG) { + msglen = p_hdr->data[1].u32w1; + if (ipsaddr->flags & SESS_FLAG_HAS_FLOWID) { + egrid.egr_data = p_hdr->data[1].u32w0; + _IPATH_VDBG("egrid-msglong is 0x%x\n", egrid.egr_data); + } + } + + if (ret == 1) + psmi_mq_handle_envelope( + mq, mode, epaddr, p_hdr->data[0].u64, /* tag */ + egrid, msglen, (void *) payload, paylen); + else + psmi_mq_handle_envelope_outoforder( + mq, mode, epaddr, flow->msg_ooo_seqnum, + p_hdr->data[0].u64, /* tag */ + egrid, msglen, (void *) payload, paylen); + } else { + args = (ptl_arg_t *) p_hdr->data; + if (ret == 1) + ips_proto_mq_handle_rts_envelope(mq, mode, epaddr, + args[0].u64, args[1].u32w0, args[1].u32w1); + else + ips_proto_mq_handle_rts_envelope_outoforder(mq, mode, + epaddr, flow->msg_ooo_seqnum, + args[0].u64, args[1].u32w0, args[1].u32w1); + } + + if (ret == 1) { + if 
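+            /* in-order delivery has caught up, so drain any envelopes that
+             * were queued out of order on the master context */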
(epaddr->mctxt_master->outoforder_c) { + psmi_mq_handle_outoforder_queue(epaddr->mctxt_master); + } + ret = IPS_RECVHDRQ_CONTINUE; + } else { + ret = IPS_RECVHDRQ_BREAK; + } + } else if (mode == MQ_MSG_DATA || mode == MQ_MSG_DATA_BLK) { + psm_mq_req_t req; + + req = STAILQ_FIRST(&epaddr->mctxt_master->egrlong); + while (req) { + if (req->egrid.egr_data == p_hdr->data[1].u32w0) break; + req = STAILQ_NEXT(req, nextq); + } + +/* + * Even with single context, since the header is sent via pio-flow, + * and data is sent via sdma-flow, data could be received first, + * thus causes req=NULL. + */ + if (req == NULL) { + flow->msg_ooo_toggle = !flow->msg_ooo_toggle; + if (flow->msg_ooo_toggle) { + flow->recv_seq_num.pkt -= 1; + return IPS_RECVHDRQ_OOO; + } + } else { + flow->msg_ooo_toggle = 0; + } + + psmi_mq_handle_data(req, epaddr, p_hdr->data[1].u32w0, + p_hdr->data[1].u32w1, payload, paylen); + + /* If checksum is enabled, this matches what is done for tid-sdma */ + /* if OOO and req is NULL, header is not received and we ignore chksum */ + if (rcv_ev->proto->flags & IPS_PROTO_FLAG_CKSUM && + mode == MQ_MSG_DATA_BLK && + req && req->state == MQ_STATE_COMPLETE) { + uint32_t cksum = ips_crc_calculate( + req->recv_msglen - p_hdr->data[0].u32w1, + (uint8_t *)req->buf + p_hdr->data[0].u32w1, + 0xffffffff); + if (p_hdr->data[0].u32w0 != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Aborting! \n", p_hdr->data[0].u32w0, cksum, __be16_to_cpu(flow->path->epr_dlid)); + ips_proto_dump_data(req->buf, req->recv_msglen); + } + } + + } else if (mode == MQ_MSG_DATA_REQ || mode == MQ_MSG_DATA_REQ_BLK) { + req = psmi_mpool_find_obj_by_index(mq->rreq_pool, + p_hdr->data[1].u32w0); + if (!req) goto skip_ack_req; + psmi_mq_handle_data(req, epaddr, p_hdr->data[1].u32w0, + p_hdr->data[1].u32w1, (void *) payload, paylen); + + /* If checksum is enabled, this matches what is done for tid-sdma */ + if (rcv_ev->proto->flags & IPS_PROTO_FLAG_CKSUM && + mode == MQ_MSG_DATA_REQ_BLK && + req->state == MQ_STATE_COMPLETE) { + uint32_t cksum = ips_crc_calculate( + req->recv_msglen, req->buf, 0xffffffff); + if (p_hdr->data[0].u32w0 != cksum) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Checksum mismatch. Expected: 0x%08x, Received: 0x%08x Source LID: %i. Aborting! 
\n", p_hdr->data[0].u32w0, cksum, __be16_to_cpu(flow->path->epr_dlid)); + ips_proto_dump_data(req->buf, req->recv_msglen); + } + } + + } else if (mode == MQ_MSG_CTS_EGR) { + args = p_hdr->data; + ips_proto_mq_handle_cts(rcv_ev->proto, args); + } else { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown frame mode %x", mode); + } + + if ((p_hdr->flags & IPS_SEND_FLAG_ACK_REQ) || + (flow->flags & IPS_FLOW_FLAG_GEN_BECN)) + ips_proto_send_ack((struct ips_recvhdrq *) rcv_ev->recvq, flow); + +skip_ack_req: + ips_proto_process_ack(rcv_ev); + + return ret; // skip +} + +PSMI_INLINE( +int between(int first_seq, int last_seq, int seq)) +{ + if (last_seq >= first_seq) { + if (seq < first_seq || seq > last_seq) { + return 0; + } + } else { + if (seq > last_seq && seq < first_seq) { + return 0; + } + } + return 1; +} + +PSMI_INLINE( +int pio_dma_ack_valid(struct ips_flow *flow, psmi_seqnum_t ack_seq_num, + uint32_t ack_window)) +{ + uint32_t first_pkt, last_pkt; + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; + + if (STAILQ_EMPTY(unackedq)) + return 0; + + first_pkt = flow->xmit_ack_num.pkt + 1; + last_pkt = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.pkt; + return between(first_pkt, last_pkt, ack_seq_num.pkt); +} + +PSMI_INLINE( +struct ips_flow* get_tidflow(ips_epaddr_t *ipsaddr, + struct ips_message_header *p_hdr, + psmi_seqnum_t ack_seq_num, + uint32_t ack_window)) +{ + struct ips_flow *flow; + struct ips_protoexp *protoexp = ipsaddr->proto->protoexp; + struct ips_tid_send_desc *tidsendc; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t desc_tidsendc; + uint32_t first_seq, last_seq; + struct ips_scb_unackedq *unackedq; + + tidsendc = (struct ips_tid_send_desc*) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_send_pool, + desc_id._desc_idx); + if (tidsendc == NULL) { + _IPATH_ERROR("OPCODE_ACK: Index %d is out of range in tidflow ack\n", desc_id._desc_idx); + return NULL; + } + + /* Ensure generation matches */ + psmi_mpool_get_obj_index_gen_count(tidsendc, + &desc_tidsendc._desc_idx, + &desc_tidsendc._desc_genc); + if (desc_tidsendc.u64 != desc_id.u64) + return NULL; + + /* Ensure ack is within window */ + flow = &tidsendc->tidflow; + unackedq = &flow->scb_unacked; + + /* No unacked scbs */ + if (STAILQ_EMPTY(unackedq)) + return NULL; + + first_seq = flow->xmit_ack_num.seq + 1; + last_seq = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.seq; + if (between(first_seq, last_seq, ack_seq_num.seq) == 0) { + return NULL; + } + + /* Generation for ack should match */ + if (STAILQ_FIRST(unackedq)->seq_num.gen != ack_seq_num.gen) + return NULL; + + return flow; +} + +/* NAK post process for tid flow */ +void ips_tidflow_nak_post_process(struct ips_flow *flow, + struct ips_message_header *p_hdr) +{ + + ips_scb_t *scb; + struct ips_scb_unackedq *unackedq = &flow->scb_unacked; +#ifdef PSM_DEBUG + psmi_seqnum_t new_flowgenseq; + + new_flowgenseq.val = p_hdr->data[1].u32w0; + /* Update any pending scb's to the new generation count. + * Note: flow->xmit_seq_num was updated to the new generation when the + * NAK was received. 
+ */
+    psmi_assert(STAILQ_FIRST(unackedq)->seq_num.flow==new_flowgenseq.flow);
+    psmi_assert(STAILQ_FIRST(unackedq)->seq_num.gen != new_flowgenseq.gen);
+    psmi_assert(STAILQ_FIRST(unackedq)->
+        seq_num.seq-STAILQ_FIRST(unackedq)->nfrag+1 == new_flowgenseq.seq);
+#endif
+
+    /* Update unacked scb's to use the new flowgenseq */
+    scb = STAILQ_FIRST(unackedq);
+    while (scb) {
+        scb->ips_lrh.bth[2] = __cpu_to_be32(flow->xmit_seq_num.psn);
+        flow->xmit_seq_num.seq += scb->nfrag;
+        scb->seq_num = flow->xmit_seq_num;
+        scb->seq_num.seq--;
+        scb = SLIST_NEXT(scb, next);
+    }
+
+}
+
+// process an incoming ack message. Separate function to allow
+// for better optimization by the compiler
+void __recvpath
+ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev)
+{
+    ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr;
+    struct ips_message_header *p_hdr = rcv_ev->p_hdr;
+    psmi_seqnum_t ack_seq_num, last_seq_num;
+    ips_scb_t *scb;
+    struct ips_proto *proto = ipsaddr->proto;
+    struct ips_flow *flow = NULL;
+    struct ips_scb_unackedq *unackedq;
+    struct ips_scb_pendlist *scb_pend;
+    psm_protocol_type_t protocol;
+    ptl_epaddr_flow_t flowid;
+
+    ips_ptladdr_lock(ipsaddr);
+
+    protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid);
+    flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid);
+    ack_seq_num.psn = p_hdr->ack_seq_num;
+
+    switch(protocol){
+    case PSM_PROTOCOL_GO_BACK_N:
+        flow = &ipsaddr->flows[flowid];
+        ack_seq_num.pkt -= 1;
+        if (!pio_dma_ack_valid(flow, ack_seq_num, proto->scb_max_inflight))
+            goto ret;
+        flow->xmit_ack_num = ack_seq_num;
+        break;
+    case PSM_PROTOCOL_TIDFLOW:
+        ack_seq_num.seq -= 1;
+        flow = get_tidflow(ipsaddr, p_hdr, ack_seq_num, proto->scb_max_inflight);
+        if (!flow) /* Invalid ack for flow */
+            goto ret;
+        flow->xmit_ack_num = ack_seq_num;
+        break;
+    default:
+        _IPATH_ERROR("OPCODE_ACK: Unknown flow type %d in ACK\n", flowid);
+        goto ret;
+    }
+
+    unackedq = &flow->scb_unacked;
+    scb_pend = &flow->scb_pend;
+    if (STAILQ_EMPTY(unackedq)) goto ret; // only for Klocwork scan.
+    last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
+
+    INC_TIME_SPEND(TIME_SPEND_USER2);
+
+    /* For tidflow we want to match the full flow/gen/seq; for go-back-n we
+       only match pkt#, since msg# is not known. msg# is the message envelope
+       number in the stream; you don't know whether the next packet carries
+       the old msg# or starts a new one.
+     */
+    /* first release all xmit buffers that have been received */
+    while ((protocol==PSM_PROTOCOL_GO_BACK_N) ?
+           between(STAILQ_FIRST(unackedq)->seq_num.pkt,
+                   last_seq_num.pkt, ack_seq_num.pkt) :
+           between(STAILQ_FIRST(unackedq)->seq_num.psn,
+                   last_seq_num.psn, ack_seq_num.psn)
+           ) {
+
+        /* take it out of the xmit queue and ..
*/ + scb = STAILQ_FIRST(unackedq); + STAILQ_REMOVE_HEAD(unackedq, nextq); + flow->scb_num_unacked--; + flow->credits++; + + if (scb == SLIST_FIRST(scb_pend)) { + flow->scb_num_pending--; + SLIST_REMOVE_HEAD(scb_pend, next); + } + + if (scb->flags & IPS_SEND_FLAG_WAIT_SDMA) + ips_proto_dma_wait_until(proto, scb->dma_ctr); + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->payload_size-scb->extra_bytes); + + if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames have been + * acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_cancel(proto->timerq, &flow->timer_send); + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window - all packets ACK'd */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + goto ret; + } + } + + /* CCA: If flow is congested adjust rate */ + if_pf (rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + else { + /* Increase congestion window if flow is not congested */ + if_pf (flow->cwin < proto->flow_credits) { + flow->credits += + min(flow->cwin << 1, proto->flow_credits) - flow->cwin; + flow->cwin = min(flow->cwin << 1, proto->flow_credits); + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + } + } + + /* Reclaimed some credits - attempt to flush flow */ + flow->fn.xfer.flush(flow, NULL); + + /* + * If the next packet has not even been put on the wire, cancel the + * retransmission timer since we're still presumably waiting on free + * pio bufs + */ + if (STAILQ_FIRST(unackedq)->abs_timeout == TIMEOUT_INFINITE) + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + +ret: + ips_ptladdr_unlock(ipsaddr); + return; +} + +// process an incoming nack message. 
Separate function to allow +// for better optimization by compiler +static void +_process_nak(struct ips_recvhdrq_event *rcv_ev) +{ + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + psmi_seqnum_t ack_seq_num, last_seq_num; + ips_scb_t *scb; + struct ips_proto *proto = ipsaddr->proto; + struct ips_flow *flow = NULL; + struct ips_scb_unackedq *unackedq; + struct ips_scb_pendlist *scb_pend; + psm_protocol_type_t protocol; + ptl_epaddr_flow_t flowid; + int num_resent = 0; + + ips_ptladdr_lock(ipsaddr); + + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + + INC_TIME_SPEND(TIME_SPEND_USER3); + + ack_seq_num.psn = p_hdr->ack_seq_num; + + switch(protocol){ + case PSM_PROTOCOL_GO_BACK_N: + flow = &ipsaddr->flows[flowid]; + if (!pio_dma_ack_valid(flow, ack_seq_num, proto->scb_max_inflight)) + goto ret; + ack_seq_num.pkt--; + flow->xmit_ack_num = ack_seq_num; + break; + case PSM_PROTOCOL_TIDFLOW: + flow = get_tidflow(ipsaddr, p_hdr, ack_seq_num, proto->scb_max_inflight); + if (!flow) + goto ret; /* Invalid ack for flow */ + ack_seq_num.seq--; + /* Update xmit seq num to the new flowgenseq */ + flow->xmit_seq_num = (psmi_seqnum_t)p_hdr->data[1].u32w0; + flow->xmit_ack_num = flow->xmit_seq_num; + flow->xmit_ack_num.seq--; + break; + default: + _IPATH_ERROR("OPCODE_NAK: Unknown flow type %d in ACK\n", flowid); + goto ret; + } + + unackedq = &flow->scb_unacked; + scb_pend = &flow->scb_pend; + if (STAILQ_EMPTY(unackedq)) goto ret; // only for Klockwork scan. + last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; + + ipsaddr->stats.nak_recv++; + + _IPATH_VDBG("got a nack %d on flow %d, " + "first is %d, last is %d\n", ack_seq_num.psn, + flowid, STAILQ_EMPTY(unackedq)?-1:STAILQ_FIRST(unackedq)->seq_num.psn, + STAILQ_EMPTY(unackedq)?-1:STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num.psn); + + /* For tidflow, we want to match all flow/gen/seq, + for gobackn, we only match pkt#, msg# is not known. + msg# is the message envelope number in the stream, + you don't know if the next packet has the old msg# + or starts a new msg#. + */ + /* first release all xmit buffer that has been receveid */ + while ((protocol==PSM_PROTOCOL_GO_BACK_N) ? + between(STAILQ_FIRST(unackedq)->seq_num.pkt, + last_seq_num.pkt, ack_seq_num.pkt) : + between(STAILQ_FIRST(unackedq)->seq_num.psn, + last_seq_num.psn, ack_seq_num.psn) + ) { + /* take it out of the xmit queue and .. 
*/ + scb = STAILQ_FIRST(unackedq); + STAILQ_REMOVE_HEAD(unackedq, nextq); + flow->scb_num_unacked--; + + if (scb->flags & IPS_SEND_FLAG_WAIT_SDMA) + ips_proto_dma_wait_until(proto, scb->dma_ctr); + + if (scb->callback) + (*scb->callback) (scb->cb_param, scb->payload_size-scb->extra_bytes); + + if (!(scb->flags & IPS_SEND_FLAG_PERSISTENT)) + ips_scbctrl_free(scb); + + /* set all index pointer to NULL if all frames has been acked */ + if (STAILQ_EMPTY(unackedq)) { + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + psmi_timer_cancel(proto->timerq, &flow->timer_send); + SLIST_FIRST(scb_pend) = NULL; + psmi_assert(flow->scb_num_pending == 0); + /* Reset congestion window if all packets acknowledged */ + flow->credits = flow->cwin = proto->flow_credits; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + flow->flags &= ~IPS_FLOW_FLAG_CONGESTED; + goto ret; + } + } + + psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ + + if (flow->fn.protocol.nak_post_process) + flow->fn.protocol.nak_post_process(flow, p_hdr); + + /* Always cancel ACK timer as we are going to restart the flow */ + psmi_timer_cancel(proto->timerq, &flow->timer_ack); + + /* What's now pending is all that was unacked */ + SLIST_FIRST(scb_pend) = STAILQ_FIRST(unackedq); + flow->scb_num_pending = flow->scb_num_unacked; + + /* If NAK with congestion bit set - delay re-transmitting and THEN adjust + * CCA rate. + */ + if_pf (rcv_ev->is_congested & IPS_RECV_EVENT_BECN) { + uint64_t offset; + + /* Clear congestion event and mark flow as congested */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + flow->flags |= IPS_FLOW_FLAG_CONGESTED; + + /* For congested flow use slow start i.e. reduce congestion window. + * For TIDFLOW we cannot reduce congestion window as peer expects + * header packets at regular intervals (protoexp->hdr_pkt_interval). 
+ */ + if (flow->protocol != PSM_PROTOCOL_TIDFLOW) + flow->credits = flow->cwin = 1; + else + flow->credits = flow->cwin; + + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* During congestion cancel send timer and delay retransmission by a + * random interval: half the scb's ack timeout scaled by a factor in + * [1, 2) drawn from rand() (dividing by RAND_MAX + 1.0 keeps the + * division in floating point) + */ + psmi_timer_cancel(proto->timerq, &flow->timer_send); + if (SLIST_FIRST(scb_pend)->ack_timeout != TIMEOUT_INFINITE) + offset = (SLIST_FIRST(scb_pend)->ack_timeout >> 1); + else + offset = 0; + psmi_timer_request(proto->timerq, &flow->timer_send, + (get_cycles() + + (uint64_t)(offset * (rand() / (RAND_MAX + 1.0) + 1.0)))); + } + else { + /* Reclaim all credits up to the congestion window only */ + flow->credits = flow->cwin; + flow->ack_interval = max((flow->credits >> 2) - 1, 1); + + /* Flush pending scb's */ + flow->fn.xfer.flush(flow, &num_resent); + ipsaddr->stats.send_rexmit += num_resent; + } + +ret: + ips_ptladdr_unlock(ipsaddr); + return; +} + +static void +_process_err_chk(struct ips_recvhdrq *recvq, ips_epaddr_t *ipsaddr, + struct ips_message_header *p_hdr) +{ + psmi_seqnum_t seq_num; + int16_t seq_off; + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_flow *flow = &ipsaddr->flows[flowid]; + + INC_TIME_SPEND(TIME_SPEND_USER4); + + ipsaddr->stats.err_chk_recv++; + + seq_num.val = __be32_to_cpu(p_hdr->bth[2]); + seq_off = (int16_t)(ipsaddr->flows[flowid].recv_seq_num.pkt - seq_num.pkt); + + if_pf (seq_off <= 0) { + _IPATH_VDBG("naking for seq=%d, off=%d on flowid %d\n", + seq_num.pkt, seq_off, flowid); + + if (seq_off < -flow->ack_interval) + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + + ips_proto_send_nak(recvq, flow); + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + } + else { + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &ipsaddr->ctrl_msg_queued, NULL); + } +} + +static void +_process_err_chk_gen(ips_epaddr_t *ipsaddr, struct ips_message_header *p_hdr) +{ + struct ips_protoexp *protoexp = ipsaddr->proto->protoexp; + struct ips_tid_recv_desc *tidrecvc; + psmi_seqnum_t err_seqnum; + ptl_arg_t desc_id = p_hdr->data[0]; + ptl_arg_t send_desc_id = p_hdr->data[1]; + ptl_arg_t desc_tidrecvc; + ptl_arg_t args[3] = {}; + int16_t seq_off; + uint8_t ack_type; + + INC_TIME_SPEND(TIME_SPEND_USER4); + + ipsaddr->stats.err_chk_recv++; + + /* Get the flowgenseq for err chk gen */ + err_seqnum.val = __be32_to_cpu(p_hdr->bth[2]); + + ips_ptladdr_lock(ipsaddr); + + /* Get receive descriptor */ + tidrecvc = (struct ips_tid_recv_desc *) + psmi_mpool_find_obj_by_index(protoexp->tid_desc_recv_pool, + desc_id._desc_idx); + + if (tidrecvc == NULL) { + _IPATH_DBG("ERR_CHK_GEN: invalid rendezvous handle\n"); + ips_ptladdr_unlock(ipsaddr); + return; + } + psmi_mpool_get_obj_index_gen_count(tidrecvc, + &desc_tidrecvc._desc_idx, + &desc_tidrecvc._desc_genc); + + if (desc_id.u64 != desc_tidrecvc.u64) { + /* Receive descriptor mismatch in time and space. + * Stale err chk gen, drop packet + */ + _IPATH_DBG("ERR_CHK_GEN: rendezvous handle generation mismatch. Pkt: 0x%08x, Current: 0x%08x\n", desc_id._desc_genc, desc_tidrecvc._desc_genc); + ips_ptladdr_unlock(ipsaddr); + return; + } + + psmi_assert(tidrecvc->tidflow_idx == err_seqnum.flow); + + /* Note: Do not read the tidflow table to determine the sequence to restart + * from. Always respond with the last known "good" packet that we received + * which is updated in protoexp_data(). 
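+ * In the code below, seq_off = tidflow_genseq.seq - err_seqnum.seq: + * seq_off <= 0 means the sender is asking about packets we never + * received (lost packets), so we NAK and, when the generation still + * matches the active one, swap to a freshly allocated generation; + * seq_off > 0 means we already received past that point (lost ACK), + * so we ACK unless the generation number is stale, in which case we + * NAK without allocating a new generation.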
+ */ + + /* Either lost packets or lost ack */ + seq_off = (int16_t) (tidrecvc->tidflow_genseq.seq - err_seqnum.seq); + + if (seq_off <= 0) { + ack_type = OPCODE_NAK; + + if (err_seqnum.gen == tidrecvc->tidflow_active_gen) { + /* Swap generations */ + psm_error_t err; + + /* Allocate new generation for the flow. */ + err = ips_protoexp_flow_newgen(tidrecvc); + if (err != PSM_OK) { + /* Out of generation. Drop packet and we will recover later. + * Release the ptladdr lock taken above before bailing out. */ + ips_ptladdr_unlock(ipsaddr); + return; + } + } + } + else { + ack_type = OPCODE_ACK; + + if (err_seqnum.gen != tidrecvc->tidflow_genseq.gen) + ack_type = OPCODE_NAK; /* NAK without allocating a new generation */ + } + + args[0] = send_desc_id; + args[1] = tidrecvc->tid_list.tsess_descid; + args[2].u16w0 = err_seqnum.gen; /* If NAK, generation number */ + + ips_ptladdr_unlock(ipsaddr); + + /* May want to generate a BECN if a lot of swapped generations */ + if_pf ((tidrecvc->tidflow_nswap_gen > 4) && + (protoexp->proto->flags & IPS_PROTO_FLAG_CCA)) { + _IPATH_CCADBG("ERR_CHK_GEN: Generating BECN. Number of swapped generations: %d.\n", tidrecvc->tidflow_nswap_gen); + /* Mark flow to generate BECN in control packet */ + tidrecvc->ipsaddr->tidgr_flow.flags |= IPS_FLOW_FLAG_GEN_BECN; + + /* Update stats for congestion encountered */ + ipsaddr->stats.congestion_pkts++; + } + + ips_proto_send_ctrl_message(&tidrecvc->ipsaddr->tidgr_flow, + ack_type, &tidrecvc->ctrl_msg_queued, args); + + /* Update stats for expected window */ + tidrecvc->stats.nErrChkReceived++; + if (ack_type == OPCODE_NAK) + tidrecvc->stats.nReXmit++; /* Update stats for retransmit (Sent a NAK) */ +} + +static void +parse_ip_or_lid(char *buf, size_t len, uint32_t ip, psm_epid_t epid) +{ + char ipbuf[INET_ADDRSTRLEN], *p; + in_addr_t in_loop = inet_addr("127.0.0.1"); + in_addr_t in_any = inet_addr("0.0.0.0"); + + p = (char *) inet_ntop(AF_INET, (const void *) &ip, ipbuf, sizeof ipbuf); + if (ip != in_loop && ip != in_any && p) + snprintf(buf, len-1, "IP %s", p); + else + snprintf(buf, len-1, "LID 0x%x", (int) psm_epid_nid(epid)); + buf[len-1] = '\0'; +} + +#define IPS_MAX_BOGUS_ERR_CHK_BAD 15 + +static void +_process_err_chk_bad(ips_epaddr_t *ipsaddr, struct ips_message_header *p_hdr) +{ + uint32_t ipv4_addr = p_hdr->data[0].u32w0; + uint32_t pid = __be32_to_cpu(p_hdr->data[0].u32w1); + union psmi_envvar_val env_stray; + char buf[32]; + psm_epid_t epid = ipsaddr->epaddr->epid; + + parse_ip_or_lid(buf, sizeof buf, ipv4_addr, epid); + + /* First make sure that we actually do have a connection to this lid+context, + * if not, we just ignore the err_chk_bad message since it might be some + * oddly timed packet */ + if (!ips_proto_isconnected(ipsaddr)) { + int lid = (int) psm_epid_nid(epid); + int context = (int) psm_epid_context(epid); + if (++ipsaddr->proto->num_bogus_warnings <= IPS_MAX_BOGUS_ERR_CHK_BAD) + psmi_syslog(ipsaddr->proto->ep, 1, LOG_INFO, + "PSM pid %d on host %s complains that I am a stray process but " + "I'm not even connected to LID %d context %d (ignoring %s\n", + pid, buf, lid, context, + ipsaddr->proto->num_bogus_warnings == IPS_MAX_BOGUS_ERR_CHK_BAD ? + "all future stray warning checks from unknown endpoints)." : + ")."); + return; + } + + /* At this point the bad error check is a real one, from a host we thought + * we were connected to. We only go through this path once. If + * PSM_STRAY_ENABLED=0, we'll print this warning once, if it's 1 we'll die. 
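+ * (PSM_STRAY_ENABLED is read through psmi_getenv below and defaults + * to yes; running with PSM_STRAY_ENABLED=0 in the environment + * downgrades the fatal PSM_EPID_NETWORK_ERROR into a one-time syslog + * notice.)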
+ */ + if (ipsaddr->proto->done_once++) + return; + + psmi_getenv("PSM_STRAY_ENABLED", "Enable stray process detection", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_YESNO, + (union psmi_envvar_val) 1, /* yes by default */ + &env_stray); + + if (env_stray.e_uint) + psmi_handle_error(PSMI_EP_NORETURN, PSM_EPID_NETWORK_ERROR, "PSM pid " + "%d on host %s has detected that I am a stray process, exiting.", + pid, buf); + else + psmi_syslog(ipsaddr->proto->ep, 1, LOG_INFO, "PSM pid " + "%d on host %s has detected that I am a stray process, " + "PSM_STRAY_ENABLED is off and future messages are ignored.", + pid, buf); + return; +} + +static void ips_bad_opcode(uint8_t op_code, struct ips_message_header *proto) +{ + _IPATH_DBG("Discarding message with bad opcode 0x%x\n", + op_code); + + if (infinipath_debug & __IPATH_DBG) { + ips_proto_show_header(proto, "received bad opcode"); + ips_proto_dump_frame(proto, sizeof(struct ips_message_header), + "Opcode error protocol header dump"); + } +} + +static void +_process_unknown_opcode(struct ips_proto *proto, + struct ips_message_header *protocol_header) +{ + proto->stats.unknown_packets++; + + switch (protocol_header->sub_opcode) { + /* A bunch of pre-PSM packets that we don't handle any more */ + case OPCODE_SEQ_DATA: + case OPCODE_SEQ_CTRL: + case OPCODE_STARTUP: + case OPCODE_STARTUP_EXT: + case OPCODE_STARTUP_ACK: + case OPCODE_STARTUP_ACK_EXT: + case OPCODE_STARTUP_NAK: + case OPCODE_STARTUP_NAK_EXT: + case OPCODE_CLOSE: + case OPCODE_ABORT: + case OPCODE_CLOSE_ACK: + break; + default: + ips_bad_opcode(protocol_header->sub_opcode, protocol_header); + break; + } +} + +PSMI_NEVER_INLINE( +int +_process_connect(const struct ips_recvhdrq_event *rcv_ev)) +{ + const uint16_t lmc_mask = ~((1 << rcv_ev->proto->epinfo.ep_lmc) - 1); + + return ips_proto_process_connect(rcv_ev->proto, + ips_epid_from_phdr(lmc_mask, rcv_ev->p_hdr), + rcv_ev->p_hdr->sub_opcode, + rcv_ev->p_hdr, + ips_recvhdrq_event_payload(rcv_ev), + ips_recvhdrq_event_paylen(rcv_ev)); +} + +// Return 1 if packet is ok. +// Return 0 if packet should be skipped +int +ips_proto_process_unknown(const struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + uint8_t ptype = rcv_ev->ptype; + const uint64_t current_count = get_cycles(); + struct ips_stray_epid *sepid; + struct ips_proto *proto = rcv_ev->proto; + psm_ep_t ep_err; + psm_epid_t epid; + char *pkt_type; + int opcode = (int) p_hdr->sub_opcode; + double t_elapsed; + ptl_epaddr_flow_t flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + const uint16_t lmc_mask = ~((1 << rcv_ev->proto->epinfo.ep_lmc) - 1); + + /* + * If the protocol is disabled or not yet enabled, no processing happens. + * We set t_init to 0 when disabling the protocol + */ + if (proto->t_init == 0) + return IPS_RECVHDRQ_CONTINUE; + + /* + * If lid is 0, something bad happened in queue processing + */ + epid = ips_epid_from_phdr(lmc_mask, p_hdr); + if (psm_epid_nid(epid) == 0ULL) { + proto->stats.lid_zero_errs++; + _IPATH_DBG("Skipping stray packet processing with LID=0\n"); + return IPS_RECVHDRQ_CONTINUE; + } + + /* Connect messages don't have to be from a known epaddr */ + switch (opcode) { + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + _process_connect(rcv_ev); + return IPS_RECVHDRQ_CONTINUE; + case OPCODE_ERR_CHK_BAD: /* ignore, old opcode */ + return IPS_RECVHDRQ_CONTINUE; + default: + break; + } + + /* Packet from "unknown" peer. 
Log the packet and payload if at an appropriate + * verbose level. + */ + { + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + ips_proto_dump_err_stats(proto); + + if (infinipath_debug & __IPATH_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, IPATH_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + } + + /* Other messages are definitely crosstalk. */ + /* out-of-context expected messages are always fatal */ + if (ptype == RCVHQ_RCV_TYPE_EXPECTED) { + ep_err = PSMI_EP_NORETURN; + pkt_type = "expected"; + } + else if (ptype == RCVHQ_RCV_TYPE_EAGER) { + ep_err = PSMI_EP_LOGEVENT; + pkt_type = "eager"; + } + else { + ep_err = PSMI_EP_NORETURN; + pkt_type = "unknown"; + } + + proto->stats.stray_packets++; + + /* If we have debug mode, print the complete packet every time */ + if (infinipath_debug & __IPATH_PKTDBG) + ips_proto_show_header(p_hdr, "invalid commidx"); + t_elapsed = (double) + cycles_to_nanosecs(get_cycles()-proto->t_init) / 1.0e9; + + sepid = (struct ips_stray_epid *) + psmi_epid_lookup(PSMI_EP_CROSSTALK, epid); + if (sepid == NULL) { /* Never seen crosstalk from this node, log it */ + sepid = (struct ips_stray_epid *) + psmi_calloc(proto->ep, UNDEFINED, 1, sizeof(struct ips_stray_epid)); + if (sepid == NULL) return 0; /* skip packet if no memory */ + psmi_epid_add(PSMI_EP_CROSSTALK, epid, (void *) sepid); + sepid->epid = epid; + if (proto->stray_warn_interval) + sepid->t_first = sepid->t_warn_next = current_count; + } + sepid->num_messages++; + sepid->t_last = current_count; + + /* If we're not going to warn the user and this is not a fatal + * packet, just skip it */ + if (sepid->t_warn_next > current_count && ep_err != PSMI_EP_NORETURN) + return 0; + + sepid->t_warn_next = current_count + proto->stray_warn_interval; + + if (p_hdr->sub_opcode == OPCODE_ERR_CHK) { + /* With the new err_check, we can print out extra information */ + char ipbuf[INET_ADDRSTRLEN], *ip = NULL; + sepid->ipv4_addr = p_hdr->data[0].u32w0; + sepid->pid = __be32_to_cpu(p_hdr->data[0].u32w1); + ip = (char *) inet_ntop(AF_INET, &sepid->ipv4_addr, ipbuf, sizeof ipbuf); + + /* If the IP and PID make sense, go ahead and print useful info and + * even reply with ERR_CHK_BAD. 
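+ * (The reply is sent on a throw-away, stack-allocated ipsaddr/flow + * assembled below from the stray packet's own header fields, since + * we hold no connection state for this sender.)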
If not, fall through and print the + * generic bad error message + */ + if (ip != NULL && sepid->pid) { + /* Make up a fake ipsaddr and reply */ + ips_epaddr_t ipsaddr_f; + psm_error_t err; + + /* debugging sanity, and catch bugs */ + memset(&ipsaddr_f, 0, sizeof(ips_epaddr_t)); + ipsaddr_f.epr.epr_context = IPS_HEADER_SRCCONTEXT_GET(p_hdr); + ipsaddr_f.epr.epr_subcontext = p_hdr->dst_subcontext; + ipsaddr_f.epr.epr_pkt_context = + ipsaddr_f.epr.epr_context & 0xf; + + /* Get path record for peer */ + err = proto->ibta.get_path_rec(proto, + proto->epinfo.ep_base_lid, + p_hdr->lrh[3], /* SLID */ + PSMI_HCA_TYPE_QLE73XX, + 3000, &ipsaddr_f); + if (err != PSM_OK) + goto fail; + + ipsaddr_f.epr.epr_qp = __be32_to_cpu(p_hdr->bth[1]); + ipsaddr_f.epr.epr_qp &= 0xffffff; /* QP is 24 bits */ + ipsaddr_f.ptl = (ptl_t *) -1; + ipsaddr_f.proto = proto; + /* Pretend the ctrlmsg is already queued, so it doesn't get queued + * in this fake (stack-allocated) ptladdr */ + ipsaddr_f.ctrl_msg_queued = ~0; + flowid = EP_FLOW_GO_BACK_N_PIO; + ips_flow_init(&ipsaddr_f.flows[flowid], NULL, + &ipsaddr_f, PSM_TRANSFER_PIO, + PSM_PROTOCOL_GO_BACK_N, IPS_PATH_LOW_PRIORITY, flowid); + + if (!ips_proto_send_ctrl_message(&ipsaddr_f.flows[flowid], + OPCODE_ERR_CHK_BAD, + &ipsaddr_f.ctrl_msg_queued, NULL)) { + sepid->err_check_bad_sent++; + _IPATH_VDBG("did reply with ERR_CHK_BAD\n"); + } + else + _IPATH_VDBG("did *NOT* reply with ERR_CHK_BAD\n"); + + fail: + psmi_handle_error(ep_err, PSM_EPID_NETWORK_ERROR, + "Received %d out-of-context %s message(s) from stray process " + "PID=%d running on host %s (LID 0x%x, ptype=0x%x, subop=0x%x, " + "elapsed=%.3fs) %s", + sepid->num_messages, pkt_type, sepid->pid, ip, + (int) psm_epid_nid(epid), ptype, opcode, t_elapsed, + (ep_err == PSMI_EP_NORETURN) ? "Aborting." : ""); + return 0; + } + } + + /* At this point we either have an OPCODE_ERR_CHK where we couldn't + * extract a valid ip and pid OR some other opcode */ + psmi_handle_error(ep_err, PSM_EPID_NETWORK_ERROR, + "Received out-of-context %s message(s) from a stray process " + "running on LID 0x%x ptype=0x%x subop=0x%x elapsed=%.3fs", + pkt_type, (int) psm_epid_nid(epid), ptype, opcode, t_elapsed); + + return 0; /* Always skip this packet unless the above call was a noreturn + * call */ +} + +/* get the error string as a number and a string */ +static void rhf_errnum_string(char *msg, size_t msglen, long err) +{ + int len; + char *errmsg; + + len = snprintf(msg, msglen, "RHFerror %lx: ", err); + if (len > 0 && len < msglen) { + errmsg = msg + len; + msglen -= len; + } + else + errmsg = msg; + *errmsg = 0; + ips_proto_get_rhf_errstring(err, errmsg, msglen); +} + +/* + * Error handling + */ +int __recvpath +ips_proto_process_packet_error(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_proto *proto = rcv_ev->proto; + int pkt_verbose_err = infinipath_debug & __IPATH_PKTDBG; + int tiderr = rcv_ev->error_flags & INFINIPATH_RHF_H_TIDERR; + int tf_seqerr = rcv_ev->error_flags & INFINIPATH_RHF_H_TFSEQERR; + int tf_generr = rcv_ev->error_flags & INFINIPATH_RHF_H_TFGENERR; + int data_err = rcv_ev->error_flags & + (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR | + INFINIPATH_RHF_H_PARITYERR | INFINIPATH_RHF_H_LENERR | + INFINIPATH_RHF_H_MTUERR | INFINIPATH_RHF_H_IHDRERR | + INFINIPATH_RHF_H_IBERR); + char pktmsg[128]; + + *pktmsg = 0; + /* + * Tid errors on eager pkts mean we get a headerq overflow, perfectly + * safe. Tid errors on expected or other packets mean trouble. 
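+ * (An eager TID error means the payload had nowhere to land but the + * header is still intact: the code below counts a header-queue + * overflow and, for MQ/AM packets at or beyond the expected sequence + * number, marks the flow congested and sends a NAK so the sender + * retransmits.)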
+ */ + if (tiderr && rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) { + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + + + /* Payload dropped - Determine flow for this header and see if + * we need to generate a NAK. + * + * ALL PACKET DROPS IN THIS CATEGORY CAN BE FLAGGED AS DROPPED DUE TO + * CONGESTION AS THE EAGER BUFFER IS FULL. + * + * Possible eager packet type: + * + * Ctrl Message - ignore + * MQ message - Can get flow and see if we need to NAK. + * AM message - Can get flow and see if we need to NAK. + */ + + proto->stats.hdr_overflow++; + if (data_err) + return 0; + + switch(p_hdr->sub_opcode) { + case OPCODE_SEQ_MQ_HDR: + case OPCODE_SEQ_MQ_CTRL: + case OPCODE_AM_REQUEST: + case OPCODE_AM_REQUEST_NOREPLY: + case OPCODE_AM_REPLY: + { + ptl_epaddr_flow_t flowid = ips_proto_flowid(p_hdr); + struct ips_epstate_entry *epstaddr; + struct ips_flow *flow; + psmi_seqnum_t sequence_num; + int16_t diff; + + /* Obtain ipsaddr for packet */ + epstaddr = ips_epstate_lookup(rcv_ev->recvq->epstate, + rcv_ev->p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(p_hdr->iph.pkt_flags)); + if_pf (epstaddr == NULL || epstaddr->epid != rcv_ev->epid) + return 0; /* Unknown packet - drop */ + + rcv_ev->ipsaddr = epstaddr->ipsaddr; + flow = &rcv_ev->ipsaddr->flows[flowid]; + sequence_num.val = __be32_to_cpu(p_hdr->bth[2]); + diff = (int16_t) (sequence_num.pkt - flow->recv_seq_num.pkt); + + if (diff >= 0 && !(flow->flags & IPS_FLOW_FLAG_NAK_SEND)) { + /* Mark flow as congested and attempt to generate NAK */ + flow->flags |= IPS_FLOW_FLAG_GEN_BECN; + rcv_ev->ipsaddr->stats.congestion_pkts++; + flow->last_seq_num = sequence_num; + + flow->flags |= IPS_FLOW_FLAG_NAK_SEND; + flow->cca_ooo_pkts = 0; + ips_proto_send_nak((struct ips_recvhdrq *) rcv_ev->recvq, flow); + } + + /* Safe to process ACKs from header */ + ips_proto_process_ack(rcv_ev); + } + break; + default: + break; + } + } + else if (tiderr) /* tid error, but not on an eager pkt */ + { + psm_ep_t ep_err = PSMI_EP_LOGEVENT; + int many_tids = 0, many_epids = 0; + uint32_t context_tid_off = + __le32_to_cpu(rcv_ev->p_hdr->iph.ver_context_tid_offset); + uint16_t tid, offset; + uint64_t t_now = get_cycles(); + + proto->tiderr_cnt++; + + /* Whether and how we will be logging this event */ + if (proto->tiderr_max > 0 && proto->tiderr_cnt >= proto->tiderr_max) + ep_err = PSMI_EP_NORETURN; + else if (proto->tiderr_warn_interval != UINT64_MAX && + proto->tiderr_tnext <= t_now) + proto->tiderr_tnext = get_cycles() + proto->tiderr_warn_interval; + else + ep_err = NULL; + + if (ep_err != NULL) { + if (proto->tiderr_context_tid_off != context_tid_off) { /* many tids */ + if (proto->tiderr_context_tid_off != 0) + many_tids = 1; + proto->tiderr_context_tid_off = context_tid_off; + } + + if (proto->tiderr_epid != rcv_ev->epid) { /* many epids */ + if (proto->tiderr_epid != 0) + many_epids = 1; + proto->tiderr_epid = rcv_ev->epid; + } + + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + + tid = (context_tid_off >> INFINIPATH_I_TID_SHIFT) & + INFINIPATH_I_TID_MASK; + offset = (context_tid_off>>INFINIPATH_I_OFFSET_SHIFT) & + INFINIPATH_I_OFFSET_MASK; + + psmi_handle_error(ep_err, PSM_EP_DEVICE_FAILURE, + "%s with tid=%d,offset=%d,count=%d " + "from %s%s %s %s", + many_tids ? "Multiple TID Errors" : "TID Error", + tid, offset, proto->tiderr_cnt, + psmi_epaddr_get_name(rcv_ev->epid), + many_epids ? " (and other hosts)" : "", + pktmsg, ep_err == PSMI_EP_NORETURN ? 
+ "(Terminating...)" : ""); + } + + if (proto->protoexp && rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) + ips_protoexp_handle_tiderr(rcv_ev); + } + else if (tf_generr) + ips_protoexp_handle_tf_generr(rcv_ev); + else if (tf_seqerr) + ips_protoexp_handle_tf_seqerr(rcv_ev); + else if (data_err) { + uint8_t op_code = __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; + + if (!pkt_verbose_err) { + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + _IPATH_DBG("Error %s pkt type opcode 0x%x at hd=0x%x %s\n", + (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER) ? "eager" : + (rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) ? "expected" : + (rcv_ev->ptype == RCVHQ_RCV_TYPE_NON_KD) ? "non-kd" : + "", + op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); + } + + if (proto->protoexp && rcv_ev->ptype == RCVHQ_RCV_TYPE_EXPECTED) + ips_protoexp_handle_data_err(rcv_ev); + } + else { /* not a tid or data error -- some other error */ + uint8_t op_code = __be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 24 & 0xFF; + + if (!pkt_verbose_err) + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + + /* else RHFerr decode printed below */ + _IPATH_DBG("Error pkt type 0x%x opcode 0x%x at hd=0x%x %s\n", + rcv_ev->ptype, op_code, rcv_ev->recvq->state->hdrq_head, pktmsg); + } + if (pkt_verbose_err) { + if(!*pktmsg) + rhf_errnum_string(pktmsg, sizeof(pktmsg), rcv_ev->error_flags); + ips_proto_show_header(rcv_ev->p_hdr, pktmsg); + } + + return 0; +} + +/* + * Only valid packets make it to this point. + */ +int __recvpath +ips_proto_process_packet_inner(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_message_header *p_hdr = rcv_ev->p_hdr; + ips_epaddr_t *ipsaddr = rcv_ev->ipsaddr; + int ret = IPS_RECVHDRQ_CONTINUE; + + /* NOTE: Fault injection will currently not work with hardware suppression + * on QLE73XX. See TODO below for reason why as we currently do not update + * the hardware tidflow table if FI is dropping the packet. + * + * TODO: We need to look into the packet before dropping it and + * if it's an expected packet AND we have hardware suppression then we + * need to update the hardware tidflow table and the associated tidrecvc + * state to fake having received a packet uptil some point in the window + * defined by the loss rate. This way the subsequent err chk will be NAKd + * and we can resync the flow with the sender. + * + * Note: For real errors the hardware generates seq/gen errors which are + * handled appropriately by the protocol. 
+ */ + + if_pf (PSMI_FAULTINJ_ENABLED()) { + PSMI_FAULTINJ_STATIC_DECL(fi_recv, "recvlost", 1, IPS_FAULTINJ_RECVLOST); + if (psmi_faultinj_is_fault(fi_recv)) + return ret; + } + + switch (rcv_ev->ptype) { + case RCVHQ_RCV_TYPE_EAGER: + #if 0 + _IPATH_VDBG("got packet from %d with opcode=%x, seqno=%d\n", + p_hdr->commidx, + p_hdr->sub_opcode, + __be32_to_cpu(p_hdr->bth[2])); + #endif + + switch ( p_hdr->sub_opcode ) { + case OPCODE_SEQ_MQ_HDR: + case OPCODE_SEQ_MQ_CTRL: + ret = _process_mq(rcv_ev); + break; + + case OPCODE_ACK: + ips_proto_process_ack(rcv_ev); + break; + + case OPCODE_NAK: + _process_nak(rcv_ev); + break; + + case OPCODE_AM_REQUEST: + case OPCODE_AM_REQUEST_NOREPLY: + case OPCODE_AM_REPLY: + ret = ips_proto_am(rcv_ev); + break; + case OPCODE_FLOW_CCA_BECN: + { + struct ips_proto *proto = ipsaddr->proto; + struct ips_flow *flow = NULL; + psm_protocol_type_t protocol; + ptl_epaddr_flow_t flowid; + + protocol = IPS_FLOWID_GET_PROTO(p_hdr->flowid); + flowid = IPS_FLOWID_GET_INDEX(p_hdr->flowid); + psmi_assert_always(protocol == PSM_PROTOCOL_GO_BACK_N); + flow = &ipsaddr->flows[flowid]; + + if ((flow->path->epr_ccti + + proto->cace[flow->path->epr_sl].ccti_increase) <= + proto->ccti_limit) { + ips_cca_adjust_rate(flow->path, + proto->cace[flow->path->epr_sl].ccti_increase); + /* Clear congestion event */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_BECN; + } + } + break; + + case OPCODE_ERR_CHK: + case OPCODE_ERR_CHK_OLD: + _process_err_chk((struct ips_recvhdrq *) rcv_ev->recvq, + ipsaddr, p_hdr); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_ERR_CHK_GEN: + _process_err_chk_gen(ipsaddr, p_hdr); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_ERR_CHK_PLS: /* skip for now */ + break; + + case OPCODE_ERR_CHK_BAD: + _process_err_chk_bad(ipsaddr, p_hdr); + break; + + case OPCODE_TIDS_GRANT: + ips_protoexp_tid_grant(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_GRANT_ACK: + ips_protoexp_tid_grant_ack(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_RELEASE: + ret = ips_protoexp_tid_release(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_TIDS_RELEASE_CONFIRM: + ips_protoexp_tid_release_ack(rcv_ev); + /* Ignore FECN bit since this is the control path */ + rcv_ev->is_congested &= ~IPS_RECV_EVENT_FECN; + break; + + case OPCODE_SEQ_MQ_EXPTID: + ips_protoexp_data(rcv_ev); + break; + + case OPCODE_SEQ_MQ_EXPTID_UNALIGNED: + ips_protoexp_recv_unaligned_data(rcv_ev); + break; + + case OPCODE_CONNECT_REQUEST: + case OPCODE_CONNECT_REPLY: + case OPCODE_DISCONNECT_REQUEST: + case OPCODE_DISCONNECT_REPLY: + _process_connect(rcv_ev); + break; + + default: /* skip unsupported opcodes */ + _process_unknown_opcode(rcv_ev->proto, p_hdr); + break; + } /* switch (op_code) */ + break; + + case RCVHQ_RCV_TYPE_EXPECTED: + ips_protoexp_data(rcv_ev); + break; + + default: /* unknown frame type */ + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Unknown frame type %x", rcv_ev->ptype); + break; + } /* switch (ptype) */ + + return ret; +} diff --git a/ptl_ips/ips_recvhdrq.c b/ptl_ips/ips_recvhdrq.c new file mode 100644 index 0000000..861b66c --- /dev/null +++ 
b/ptl_ips/ips_recvhdrq.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_recvhdrq.h" + +/* + * TUNABLES TUNABLES TUNABLES + */ + +/* + * Receive Queue progress optimizations + * + * The recvhdrq_progress function supports 2 chip features, so can be written + * to support 4 possible combinations in chip features (although only 3/4 are + * currently implemented in our chips). + * + * We can either support recvhdrq_progress by implementing the function in 4 + * ways and calling it through a function pointer + * (IPS_RCVHDRQ_THRU_FUNCTION_POINTER=1) or having one implementation that + * covers all possible combinations (IPS_RCVHDRQ_THRU_FUNCTION_POINTER=0). 
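+ * (Trade-off, as wired up below: with the function-pointer scheme the + * compiler can specialize ips_recvhdrq_progress_inner() so that the + * has_no_rtail test folds to a compile-time constant inside the hot + * loop, at the cost of one indirect call per progress pass; the single + * implementation instead keeps a runtime branch on the flag inside + * the loop.)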
+ */ +#define IPS_RCVHDRQ_THRU_FUNCTION_POINTER 1 + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +static psm_error_t ips_recvhdrq_progress_none(struct ips_recvhdrq *recvq); +static psm_error_t ips_recvhdrq_progress_nortail(struct ips_recvhdrq *recvq); +#endif + + +psm_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvq_params *hdrq_params, + const struct ips_recvq_params *egrq_params, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t runtime_flags, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state) +{ + const struct ipath_base_info *base_info = &context->base_info; + psm_error_t err = PSM_OK; + + memset(recvq, 0, sizeof(*recvq)); + recvq->proto = (struct ips_proto *) proto; + recvq->state = recvq_state; + recvq->context = context; + recvq->subcontext = subcontext; + /* These runtime flags may be different from the context's runtime flags since + * a receive queue may be initialised to represent a "software" receive + * queue (shared contexts) or a hardware receive queue */ + recvq->runtime_flags = runtime_flags; + recvq->hdrq = *hdrq_params; /* deep copy */ + pthread_spin_init(&recvq->hdrq_lock, PTHREAD_PROCESS_SHARED); + recvq->hdrq_rhf_off = base_info->spi_rhf_offset; + + if (recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) { + recvq->hdrq_rhf_notail = 1; + recvq->state->hdrq_rhf_seq = 1; + } + else { + recvq->hdrq_rhf_notail = 0; + recvq->state->hdrq_rhf_seq = 0; /* _seq is ignored */ + } + recvq->hdrq_elemlast = ((recvq->hdrq.elemcnt - 1) * recvq->hdrq.elemsz); + + recvq->egrq = *egrq_params; /* deep copy */ + recvq->egrq_buftable = + ips_recvq_egrbuf_table_alloc(context->ep, recvq->egrq.base_addr, + base_info->spi_rcv_egrchunksize, + recvq->egrq.elemcnt, recvq->egrq.elemsz); + if (recvq->egrq_buftable == NULL) { + err = psmi_handle_error(proto->ep, PSM_NO_MEMORY, + "Couldn't allocate memory for eager buffer index table"); + goto fail; + } + + recvq->epstate = epstate; + + /* NOTE: We should document PSM_RCVHDRCOPY is not available with QIB? */ + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER + /* Only either have NODMA RTAIL (for QLE73XX/QLE72XX) or just the vanilla + version for QLE71XX where RTAIL is DMA'd */ + if (recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) + recvq->progress_fn = ips_recvhdrq_progress_nortail; + else + recvq->progress_fn = ips_recvhdrq_progress_none; +#endif + + recvq->recvq_callbacks = *callbacks; /* deep copy */ + SLIST_INIT(&recvq->pending_acks); + + recvq->state->hdrq_head = 0; + recvq->state->rcv_egr_index_head = NO_EAGER_UPDATE; + recvq->state->num_hdrq_done = 0; + recvq->state->hdr_countdown = 0; + + { + union psmi_envvar_val env_hdr_update; + psmi_getenv("PSM_HEAD_UPDATE", + "header queue update interval (0 to update after all entries are processed). Default is 16", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, + (union psmi_envvar_val) 16, &env_hdr_update); + + /* Cap max header update interval to size of header/eager queue */ + recvq->state->head_update_interval = + min(env_hdr_update.e_uint, + min(recvq->hdrq.elemcnt-1, recvq->egrq.elemcnt-1)); + } + +fail: + return err; +} + +psm_error_t +ips_recvhdrq_fini(struct ips_recvhdrq *recvq) +{ + ips_recvq_egrbuf_table_free(recvq->egrq_buftable); + return PSM_OK; +} + +// flush the eager buffers, by setting the eager index head to the eager index tail +// if the eager buffer queue is full. 
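+// (Note: the helper below is compiled out with #if 0; the live recovery +// path is the hdr_countdown / false-egr-full tracking inside the main +// progress loop, which flushes the eager queue only after confirming the +// overflow was real.)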
+// +// Called when we had eager buffer overflows (ERR_TID/INFINIPATH_RHF_H_TIDERR +// was set in RHF errors), and no good eager packets were received, so +// that eager head wasn't advanced. +// + +#if 0 +static void ips_flush_egrq_if_required(struct ips_recvhdrq *recvq) +{ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + if ((head % egr_cnt) == ((tail+1)%egr_cnt)) { + _IPATH_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } + return; +} +#endif + +/* + * Helpers for ips_recvhdrq_progress. + */ + +static __inline__ int +_get_proto_subcontext(const struct ips_message_header *p_hdr) +{ + return p_hdr->dst_subcontext; +} + +/* ipath_opcode is not the ips-level opcode. */ +static __inline__ uint8_t +_get_proto_ipath_opcode(const struct ips_message_header *p_hdr) +{ + return __be32_to_cpu(p_hdr->bth[0]) >> BTH_OPCODE_SHIFT & 0xFF; +} + +/* Determine if FECN bit is set IBTA 1.2.1 CCA Annex A */ +static __inline__ uint8_t +_is_cca_fecn_set(const struct ips_message_header *p_hdr) +{ + return (__be32_to_cpu(p_hdr->bth[1]) >> BTH_FECN_SHIFT); +} + +/* Determine if BECN bit is set IBTA 1.2.1 CCA Annex A */ +static __inline__ uint8_t +_is_cca_becn_set(const struct ips_message_header *p_hdr) +{ + return (__be32_to_cpu(p_hdr->bth[1]) >> BTH_BECN_SHIFT) & 0x1; +} + +static __inline__ struct ips_message_header * +_get_proto_hdr_from_rhf(const uint32_t *rcv_hdr, const __le32 *rhf) +{ + return (struct ips_message_header *) (rcv_hdr + ipath_hdrget_offset(rhf)); +} + +static __inline__ struct ips_message_header * +_get_proto_hdr(const uint32_t *rcv_hdr) +{ + return (struct ips_message_header *) &rcv_hdr[2]; +} + +static __inline__ uint32_t +_get_rhf_seq(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr) +{ + return ipath_hdrget_seq((const __le32 *) rcv_hdr + recvq->hdrq_rhf_off); +} + +static __inline__ uint32_t +_get_rhf_len_in_bytes(struct ips_recvhdrq *recvq, const __u32 *rcv_hdr) +{ + return ipath_hdrget_length_in_bytes((const __le32*) rcv_hdr + recvq->hdrq_rhf_off); +} + +static __inline__ void +_dump_invalid_pkt(struct ips_recvhdrq_event *rcv_ev) +{ + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + + if (infinipath_debug & __IPATH_PKTDBG) { + ips_proto_dump_frame(rcv_ev->p_hdr, IPATH_MESSAGE_HDR_SIZE, "header"); + if (paylen) + ips_proto_dump_frame(payload, paylen, "data"); + } + +} + +static __inline__ void +_update_error_stats(struct ips_proto *proto, uint32_t err) +{ + + if (err & INFINIPATH_RHF_H_ICRCERR) + proto->error_stats.num_icrc_err++; + if (err & INFINIPATH_RHF_H_VCRCERR) + proto->error_stats.num_vcrc_err++; + if (err & INFINIPATH_RHF_H_PARITYERR) + proto->error_stats.num_ecc_err++; + if (err & INFINIPATH_RHF_H_LENERR) + proto->error_stats.num_len_err++; + if (err & INFINIPATH_RHF_H_MTUERR) + proto->error_stats.num_mtu_err++; + if (err & INFINIPATH_RHF_H_IHDRERR) + proto->error_stats.num_khdr_err++; + if (err & INFINIPATH_RHF_H_TIDERR) + proto->error_stats.num_tid_err++; + if (err & INFINIPATH_RHF_H_MKERR) + proto->error_stats.num_mk_err++; + if (err & INFINIPATH_RHF_H_IBERR) + proto->error_stats.num_ib_err++; +} + +static int +_check_headers(struct ips_recvhdrq_event *rcv_ev) +{ + struct ips_recvhdrq *recvq = (struct ips_recvhdrq*) rcv_ev->recvq; + struct ips_proto 
*proto = rcv_ev->proto; + uint32_t *lrh = (uint32_t*) rcv_ev->p_hdr; + const uint32_t *rcv_hdr = rcv_ev->rcv_hdr; + uint32_t dest_context; + const uint16_t pkt_dlid = __be16_to_cpu(rcv_ev->p_hdr->lrh[1]); + const uint16_t base_dlid = __be16_to_cpu(recvq->proto->epinfo.ep_base_lid); + + /* Check that the receive header queue entry has a sane sequence number */ + if (_get_rhf_seq(recvq, rcv_hdr) > LAST_RHF_SEQNO) { + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Invalid header queue entry! RHF Sequence in Hdrq Seq: %d, Recvq State Seq: %d. LRH[0]: 0x%08x, LRH[1] (PktCount): 0x%08x\n", _get_rhf_seq(recvq, rcv_hdr), recvq->state->hdrq_rhf_seq, lrh[0], lrh[1]); + return -1; + } + + /* Verify that the packet was destined for our context */ + dest_context = ips_proto_dest_context_from_header(proto, rcv_ev->p_hdr); + if_pf (dest_context != recvq->proto->epinfo.ep_context) { + + struct ips_recvhdrq_state *state = recvq->state; + + /* Packet not targeted at us. Drop packet and continue */ + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "ErrPkt: Received packet for context %d on context %d. Receive Header Queue offset: 0x%x. Exiting.\n", dest_context, recvq->proto->epinfo.ep_context, state->hdrq_head); + + return -1; + } + + + if_pf (rcv_ev->error_flags || + (_get_proto_ipath_opcode(rcv_ev->p_hdr) != IPATH_OPCODE_USER1)) { + + return 0; /* Error flags are a special case. Let main receive loop handle + * packet processing after we account for it. + */ + } + + /* Verify that rhf packet length matches the length in LRH */ + if_pf (_get_rhf_len_in_bytes(recvq, rcv_hdr) != + (__be16_to_cpu(rcv_ev->p_hdr->lrh[2]) << 2)) { + _IPATH_EPDBG("ErrPkt: RHF Packet Len (0x%x) does not match LRH (0x%x).\n", _get_rhf_len_in_bytes(recvq, rcv_hdr) >> 2, __be16_to_cpu(rcv_ev->p_hdr->lrh[2])); + + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + /* Verify that the DLID matches our local LID. */ + if_pf (!((base_dlid <= pkt_dlid) && + (pkt_dlid <= (base_dlid + (1 << recvq->proto->epinfo.ep_lmc))))) { + _IPATH_EPDBG("ErrPkt: DLID in LRH (0x%04x) does not match local LID (0x%04x) Skipping packet!\n", rcv_ev->p_hdr->lrh[1], recvq->proto->epinfo.ep_base_lid); + ips_proto_dump_err_stats(proto); + _dump_invalid_pkt(rcv_ev); + return -1; + } + + return 0; +} + +static __inline__ +int +do_pkt_cksum(struct ips_recvhdrq_event *rcv_ev) +{ + char *payload = ips_recvhdrq_event_payload(rcv_ev); + uint32_t paylen = ips_recvhdrq_event_paylen(rcv_ev) + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3); + uint32_t *ckptr; + uint32_t recv_cksum, cksum, dest_subcontext; + + /* With checksum every packet has a payload */ + psmi_assert_always(payload); + + ckptr = (uint32_t*) (payload + paylen); + recv_cksum = ckptr[0]; + + /* Calculate checksum hdr + payload (includes any padding words) */ + cksum = 0xffffffff; + cksum = ips_crc_calculate(IPATH_MESSAGE_HDR_SIZE, + (uint8_t*) rcv_ev->p_hdr, cksum); + if (paylen) + cksum = ips_crc_calculate(paylen, (uint8_t*) payload, cksum); + + if ((cksum != recv_cksum) || (ckptr[0] != ckptr[1])) { + struct ips_epstate_entry *epstaddr; + uint32_t lcontext; + uint32_t hd, tl; + + epstaddr = + ips_epstate_lookup(rcv_ev->recvq->epstate, rcv_ev->p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(rcv_ev->p_hdr->iph.pkt_flags)); + epstaddr = (epstaddr && epstaddr->ipsaddr) ? epstaddr : NULL; + + lcontext = + epstaddr ? 
epstaddr->ipsaddr->proto->epinfo.ep_context : -1; + + hd = rcv_ev->recvq->context->ctrl->__ipath_rcvhdrhead[0]; + tl = rcv_ev->recvq->context->ctrl->__ipath_rcvhdrhead[-2]; + + dest_subcontext = _get_proto_subcontext(rcv_ev->p_hdr); + + _IPATH_ERROR("ErrPkt: SharedContext: %s. Local Context: %i, Checksum mismatch from LID %d! Received Checksum: 0x%08x, Expected: 0x%08x & 0x%08x. Opcode: 0x%08x, Error Flag: 0x%08x. hdrq hd 0x%x tl 0x%x rhf 0x%x,%x, rhfseq 0x%x\n", (dest_subcontext != rcv_ev->recvq->subcontext) ? "Yes" : "No", lcontext, epstaddr ? __be16_to_cpu(epstaddr->ipsaddr->epr.epr_base_lid) : -1, cksum, ckptr[0], ckptr[1], rcv_ev->p_hdr->sub_opcode, rcv_ev->error_flags,hd, tl, rcv_ev->rhf[0], rcv_ev->rhf[1], _get_rhf_seq((struct ips_recvhdrq *) rcv_ev->recvq, rcv_ev->rcv_hdr)); + + /* Dump packet */ + _dump_invalid_pkt(rcv_ev); + return 0; /* Packet checksum error */ + } + + return 1; +} + +PSMI_ALWAYS_INLINE( +void +process_pending_acks(struct ips_recvhdrq *recvq)) +{ + /* If any pending acks, dispatch them now */ + while (!SLIST_EMPTY(&recvq->pending_acks)) { + struct ips_flow *flow = SLIST_FIRST(&recvq->pending_acks); + + SLIST_REMOVE_HEAD(&recvq->pending_acks, next); + SLIST_NEXT(flow, next) = NULL; + + if (flow->flags & IPS_FLOW_FLAG_PENDING_ACK) { + psmi_assert_always((flow->flags & IPS_FLOW_FLAG_PENDING_NAK) == 0); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_ACK; + ips_proto_send_ctrl_message(flow, OPCODE_ACK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } + else { + psmi_assert_always(flow->flags & IPS_FLOW_FLAG_PENDING_NAK); + + flow->flags &= ~IPS_FLOW_FLAG_PENDING_NAK; + ips_proto_send_ctrl_message(flow, OPCODE_NAK, + &flow->ipsaddr->ctrl_msg_queued, NULL); + } + + } + +} + +/* + * Core receive progress function + * + * recvhdrq_progress is the core function that services the receive header + * queue and optionally, the eager queue. At the lowest level, it identifies + * packets marked with errors by the chip and also detects and corrects when + * eager overflow conditions occur. At the highest level, it queries the + * 'epstate' interface to classify packets from "known" and "unknown" + * endpoints. In order to support shared contexts, it can also handle packets + * destined for other contexts (or "subcontexts"). + */ + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +PSMI_ALWAYS_INLINE( +psm_error_t +ips_recvhdrq_progress_inner(struct ips_recvhdrq *recvq, + const int has_no_rtail)) +#else +psm_error_t __recvpath +ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +#endif +{ + struct ips_recvhdrq_state *state = recvq->state; + const __le32 *rhf; + PSMI_CACHEALIGN struct ips_recvhdrq_event rcv_ev = { .proto = recvq->proto, + .recvq = recvq }; + + uint32_t num_hdrq_done = 0; + const int num_hdrq_todo = recvq->hdrq.elemcnt; + const uint32_t hdrq_elemsz = recvq->hdrq.elemsz; + uint32_t dest_subcontext; + + int ret = IPS_RECVHDRQ_CONTINUE; + int done = 0; + int do_hdr_update = 0; + const uint16_t lmc_mask = ~((1 << recvq->proto->epinfo.ep_lmc) - 1); + + /* Chip features */ +#if !IPS_RCVHDRQ_THRU_FUNCTION_POINTER + const int has_no_rtail = recvq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL; +#endif + + /* Both optional_eager and no_rtail features are in the same chip rev */ +#define has_optional_eagerbuf recvq->hdrq_rhf_off + + /* Returns whether the currently set 'rcv_hdr'/head is a readable entry */ +#define next_hdrq_is_ready() \ + (has_no_rtail ? \ + recvq->state->hdrq_rhf_seq == _get_rhf_seq(recvq, rcv_hdr) \ + : state->hdrq_head != hdrq_tail) + + const uint32_t hdrq_tail = has_no_rtail ? 
0 + : ips_recvq_tail_get(&recvq->hdrq); + const uint32_t *rcv_hdr = + (const uint32_t *) recvq->hdrq.base_addr + state->hdrq_head; + uint32_t tmp_hdrq_head; + + done = !next_hdrq_is_ready(); + + while (!done) + { + + rhf = (const __le32 *) rcv_hdr + recvq->hdrq_rhf_off; + rcv_ev.error_flags = ipath_hdrget_err_flags(rhf); + rcv_ev.ptype = ipath_hdrget_rcv_type(rhf); + rcv_ev.rhf = rhf; + rcv_ev.rcv_hdr = rcv_hdr; + rcv_ev.p_hdr = recvq->hdrq_rhf_off ? _get_proto_hdr_from_rhf(rcv_hdr, rhf) + : _get_proto_hdr(rcv_hdr); + rcv_ev.epid = ips_epid_from_phdr(lmc_mask, rcv_ev.p_hdr); + rcv_ev.has_cksum = + ((recvq->proto->flags & IPS_PROTO_FLAG_CKSUM) && + (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER) && + (rcv_ev.p_hdr->mqhdr != MQ_MSG_DATA_BLK) && + (rcv_ev.p_hdr->mqhdr != MQ_MSG_DATA_REQ_BLK)); + + if_pt (recvq->proto->flags & IPS_PROTO_FLAG_CCA) { + /* IBTA CCA handling: + * If FECN bit set handle IBTA CCA protocol. For the flow that + * suffered congestion we flag it to generate a control packet with + * the BECN bit set - This is currently an unsolicited ACK. + * + * For all MQ packets the FECN processing/BECN generation is done + * in the is_expected_or_nak function as each eager packet is + * inspected there. + * + * For TIDFLOW/Expected data transfers the FECN bit/BECN generation + * is done in protoexp_data. Since header suppression can result + * in even FECN packets being suppressed, the expected protocol + * generates additional BECN packets if a "large" number of generations + * are swapped without progress being made for receive. "Large" is + * set empirically to 4. + * + * FECN packets are ignored for all control messages (except ACKs + * and NAKs) since they indicate congestion on the control path which + * is not rate controlled. The CCA specification allows FECN on + * ACKs to be disregarded as well. + */ + rcv_ev.is_congested = + _is_cca_fecn_set(rcv_ev.p_hdr) & IPS_RECV_EVENT_FECN; + rcv_ev.is_congested |= + (_is_cca_becn_set(rcv_ev.p_hdr) << (IPS_RECV_EVENT_BECN - 1)); + } + else + rcv_ev.is_congested = 0; + + dest_subcontext = _get_proto_subcontext(rcv_ev.p_hdr); + + if_pf (_check_headers(&rcv_ev)) + goto skip_packet; + + if_pf (rcv_ev.error_flags || + (_get_proto_ipath_opcode(rcv_ev.p_hdr) != IPATH_OPCODE_USER1)) + { + + _update_error_stats(recvq->proto, rcv_ev.error_flags); + + if ((rcv_ev.error_flags & INFINIPATH_RHF_H_TIDERR) || + (rcv_ev.error_flags & INFINIPATH_RHF_H_TFSEQERR) || + (rcv_ev.error_flags & INFINIPATH_RHF_H_TFGENERR)) { + /* Subcontexts need to see expected tid errors */ + if (rcv_ev.ptype == RCVHQ_RCV_TYPE_EXPECTED && + dest_subcontext != recvq->subcontext) + goto subcontext_packet; + + recvq->recvq_callbacks.callback_error(&rcv_ev); + + if (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER) { + /* tiderr and eager, don't consider updating egr head */ + if (state->hdr_countdown == 0 && + state->rcv_egr_index_head == NO_EAGER_UPDATE) { + /* eager-full is not currently under tracing. */ + uint32_t egr_cnt = recvq->egrq.elemcnt; + const uint32_t etail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t ehead = ips_recvq_head_get(&recvq->egrq); + + if (ehead == ((etail+1)%egr_cnt)) { + /* eager is full, trace existing header entries */ + uint32_t hdr_size = recvq->hdrq_elemlast + hdrq_elemsz; + const uint32_t htail = ips_recvq_tail_get(&recvq->hdrq); + const uint32_t hhead = state->hdrq_head; + + state->hdr_countdown = (htail > hhead) ? 
+ (htail - hhead) : (htail + hdr_size - hhead); + } + } + goto skip_packet_no_egr_update; + } + } + else + recvq->recvq_callbacks.callback_error(&rcv_ev); + goto skip_packet; + } + + /* If checksum is enabled, verify that it is valid */ + if_pf (rcv_ev.has_cksum && !do_pkt_cksum(&rcv_ev)) + goto skip_packet; + + if (dest_subcontext == recvq->subcontext) { + /* Classify packet from a known or unknown endpoint */ + struct ips_epstate_entry *epstaddr; + + epstaddr = + ips_epstate_lookup(recvq->epstate, rcv_ev.p_hdr->commidx + + INFINIPATH_KPF_RESERVED_BITS(rcv_ev.p_hdr->iph.pkt_flags)); + if_pf (epstaddr == NULL || epstaddr->epid != rcv_ev.epid) { + rcv_ev.ipsaddr = NULL; + recvq->recvq_callbacks.callback_packet_unknown(&rcv_ev); + } + else { + rcv_ev.ipsaddr = epstaddr->ipsaddr; + ret = ips_proto_process_packet(&rcv_ev); + if (ret == IPS_RECVHDRQ_OOO) return PSM_OK_NO_PROGRESS; + } + } + else { +subcontext_packet: + /* If the destination is not our subcontext, process message + * as a subcontext message (shared contexts) */ + rcv_ev.ipsaddr = NULL; + + ret = recvq->recvq_callbacks.callback_subcontext(&rcv_ev, + dest_subcontext); + } + +skip_packet: + /* + * important to update rcv_egr_index_head iff + * 1. Packet was of type eager + * 2. Packet actually consumed an eagerbuf (post QLE72XX) + * 3. Packet was *not* an eager header with RHF_H_TIDERR to mark + * an eager overflow + */ + if (has_optional_eagerbuf ? ipath_hdrget_use_egr_buf(rhf) + : (rcv_ev.ptype == RCVHQ_RCV_TYPE_EAGER)) { + state->rcv_egr_index_head = ipath_hdrget_index(rhf); + /* a header entry is using an eager entry, stop tracing. */ + state->hdr_countdown = 0; + } + +skip_packet_no_egr_update: + /* Note that state->hdrq_head is sampled speculatively by the code + * in ips_ptl_shared_poll() when context sharing, so it is not safe + * for this shared variable to temporarily exceed the last element. */ + tmp_hdrq_head = state->hdrq_head + hdrq_elemsz; + if_pt (tmp_hdrq_head <= recvq->hdrq_elemlast) + state->hdrq_head = tmp_hdrq_head; + else + state->hdrq_head = 0; + + if_pf (has_no_rtail && ++recvq->state->hdrq_rhf_seq > LAST_RHF_SEQNO) + recvq->state->hdrq_rhf_seq = 1; + + state->num_hdrq_done++; + num_hdrq_done++; + rcv_hdr = (const uint32_t *) recvq->hdrq.base_addr + state->hdrq_head; + done = (!next_hdrq_is_ready() || (ret == IPS_RECVHDRQ_BREAK) || + (num_hdrq_done == num_hdrq_todo)); + + do_hdr_update = (state->head_update_interval ? + (state->num_hdrq_done == state->head_update_interval) : done); + if (do_hdr_update) { + ips_recvq_head_update(&recvq->hdrq, state->hdrq_head); + + /* Lazy update of egrq */ + if (state->rcv_egr_index_head != NO_EAGER_UPDATE) { + ips_recvq_head_update(&recvq->egrq, state->rcv_egr_index_head); + state->rcv_egr_index_head = NO_EAGER_UPDATE; + } + + /* Process any pending acks now that the eager/header queues are updated */ + process_pending_acks(recvq); + + /* Reset header queue entries processed */ + state->num_hdrq_done = 0; + } + + if (state->hdr_countdown > 0) { + /* a header entry is consumed. */ + state->hdr_countdown -= hdrq_elemsz; + if (state->hdr_countdown == 0) { + /* header entry count reaches zero. */ + const uint32_t tail = ips_recvq_tail_get(&recvq->egrq); + const uint32_t head = ips_recvq_head_get(&recvq->egrq); + uint32_t egr_cnt = recvq->egrq.elemcnt; + + /* Checks eager-full again. 
This is a real false-egr-full */ + if (head == ((tail+1)%egr_cnt)) { + ips_recvq_head_update(&recvq->egrq, tail); + _IPATH_DBG("eager array full after overflow, flushing " + "(head %llx, tail %llx)\n", + (long long)head, (long long)tail); + recvq->proto->stats.egr_overflow++; + } else + _IPATH_ERROR("PSM BUG: EgrOverflow: eager queue is not full\n"); + } + } + } + /* while (hdrq_entries_to_read) */ + + /* Process any pending acks before exiting */ + process_pending_acks(recvq); + + return num_hdrq_done ? PSM_OK : PSM_OK_NO_PROGRESS; +} + +#if IPS_RCVHDRQ_THRU_FUNCTION_POINTER +/* + * QLE71XX + */ +static +psm_error_t __recvpath +ips_recvhdrq_progress_none(struct ips_recvhdrq *recvq) +{ + const int has_no_rtail = 0; + return ips_recvhdrq_progress_inner(recvq, has_no_rtail); +} + +/* + * QLE72XX+ + */ +static +psm_error_t __recvpath +ips_recvhdrq_progress_nortail(struct ips_recvhdrq *recvq) +{ + const int has_no_rtail = 1; + return ips_recvhdrq_progress_inner(recvq, has_no_rtail); +} + +psm_error_t __recvpath +ips_recvhdrq_progress(struct ips_recvhdrq *recvq) +{ + /* Call the progress function with the right chip features. */ + return recvq->progress_fn(recvq); +} +#endif diff --git a/ptl_ips/ips_recvhdrq.h b/ptl_ips/ips_recvhdrq.h new file mode 100644 index 0000000..1e45f57 --- /dev/null +++ b/ptl_ips/ips_recvhdrq.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_proto_header.h" +#include "ips_proto_params.h" +#include "ips_recvq.h" + +#ifndef _IPS_RECVHDRQ_H +#define _IPS_RECVHDRQ_H + +struct ips_recvhdrq; +struct ips_recvhdrq_state; +struct ips_epstate; + +#define IPS_RECVHDRQ_CONTINUE 0 +#define IPS_RECVHDRQ_BREAK 1 +#define IPS_RECVHDRQ_OOO 2 /* out of order */ +#define IPS_RECVHDRQ_ELEMSZ_MAX 32 /* 128 bytes */ +#define LAST_RHF_SEQNO 13 + +/* CCA related receive events */ +#define IPS_RECV_EVENT_FECN 0x1 +#define IPS_RECV_EVENT_BECN 0x2 + +struct ips_recvhdrq_event { + struct ips_proto *proto; + const struct ips_recvhdrq *recvq; /* where message received */ + const uint32_t *rcv_hdr; /* rcv_hdr ptr */ + const __le32 *rhf; /* receive header flags */ + struct ips_message_header *p_hdr; /* protocol header in rcv_hdr */ + struct ptl_epaddr *ipsaddr; /* peer ipsaddr, if available */ + psm_epid_t epid; /* peer epid */ + uint32_t error_flags; /* error flags */ + uint8_t has_cksum; /* payload has cksum */ + uint8_t is_congested;/* Packet faced congestion */ + uint16_t ptype; /* packet type */ +}; + +struct ips_recvhdrq_callbacks { + int (*callback_packet_unknown)(const struct ips_recvhdrq_event *); + int (*callback_subcontext)(const struct ips_recvhdrq_event *, uint32_t subcontext); + int (*callback_error)(struct ips_recvhdrq_event *); +}; + +psm_error_t +ips_recvhdrq_init(const psmi_context_t *context, + const struct ips_epstate *epstate, + const struct ips_proto *proto, + const struct ips_recvq_params *hdrq_params, + const struct ips_recvq_params *egrq_params, + const struct ips_recvhdrq_callbacks *callbacks, + uint32_t flags, + uint32_t subcontext, + struct ips_recvhdrq *recvq, + struct ips_recvhdrq_state *recvq_state); + +psm_error_t +ips_recvhdrq_progress(struct ips_recvhdrq *recvq); + +psm_error_t +ips_recvhdrq_fini(struct ips_recvhdrq *recvq); + +/* + * Structure containing state for recvhdrq reading. This is logically + * part of ips_recvhdrq but needs to be separated out for context + * sharing so that it can be put in a shared memory page and hence + * be available to all processes sharing the context. Generally, do not + * put pointers in here since the address map of each process can be + * different. + */ +#define NO_EAGER_UPDATE ~0U +struct ips_recvhdrq_state +{ + uint32_t hdrq_head; /* software copy of head */ + uint32_t rcv_egr_index_head; /* software copy of eager index head*/ + uint32_t hdrq_rhf_seq; /* QLE73XX/QLE72XX last seq */ + uint32_t head_update_interval; /* Header update interval */ + uint32_t num_hdrq_done; /* Num header queue done */ + uint32_t hdr_countdown; /* for false-egr-full tracing */ +}; + +/* + * Structure to read from recvhdrq + */ +typedef psm_error_t (*ips_recvhdrq_progress_fn_t)(struct ips_recvhdrq *recvq); + +struct ips_recvhdrq +{ + struct ips_proto *proto; + const psmi_context_t *context; /* error handling, epid id, etc. 
*/ + ips_recvhdrq_progress_fn_t progress_fn; + struct ips_recvhdrq_state *state; + uint32_t context_flags; /* derived from base_info.spi_runtime_flags */ + uint32_t subcontext; /* messages that don't match subcontext call + * recv_callback_subcontext */ + + /* Header queue handling */ + pthread_spinlock_t hdrq_lock; /* Lock for thread-safe polling */ + uint32_t hdrq_rhf_off; /* QLE73XX/QLE72XX rhf offset */ + int hdrq_rhf_notail; /* rhf notail enabled */ + uint32_t hdrq_elemlast; /* last element precomputed */ + struct ips_recvq_params hdrq; + + /* Eager queue handling */ + void **egrq_buftable; /* table of eager idx-to-ptr */ + struct ips_recvq_params egrq; + + /* Lookup endpoints epid -> ptladdr (rank) */ + const struct ips_epstate *epstate; + + /* Callbacks to handle recvq events */ + struct ips_recvhdrq_callbacks recvq_callbacks; + + /* List of flows with pending acks for receive queue */ + SLIST_HEAD(pending_flows, ips_flow) pending_acks; + + uint32_t runtime_flags; + volatile __u64 *spi_status; +}; + +PSMI_INLINE( +int ips_recvhdrq_isempty(const struct ips_recvhdrq *recvq)) +{ + if (recvq->hdrq_rhf_notail) /* use rhf-based reads */ + return recvq->state->hdrq_rhf_seq != + ipath_hdrget_seq( + recvq->hdrq.base_addr + recvq->state->hdrq_head + + recvq->hdrq_rhf_off); + else + return ips_recvq_tail_get(&recvq->hdrq) == recvq->state->hdrq_head; +} + +PSMI_INLINE( +void *ips_recvhdrq_event_payload(const struct ips_recvhdrq_event *rcv_ev)) +{ + /* XXX return NULL if no eager buffer allocated */ + return ips_recvq_egr_index_2_ptr(rcv_ev->recvq->egrq_buftable, + ipath_hdrget_index(rcv_ev->rhf)); +} + +PSMI_INLINE( +int ips_recvhdrq_trylock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_trylock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +int ips_recvhdrq_lock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_lock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +int ips_recvhdrq_unlock(struct ips_recvhdrq *recvq)) +{ + int ret = pthread_spin_unlock(&recvq->hdrq_lock); + return !ret; +} + +PSMI_INLINE( +uint32_t ips_recvhdrq_event_paylen(const struct ips_recvhdrq_event *rcv_ev)) +{ + uint32_t cksum_len = rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0; + + return ipath_hdrget_length_in_bytes(rcv_ev->rhf) - + (sizeof(struct ips_message_header) + CRC_SIZE_IN_BYTES + cksum_len + + ((__be32_to_cpu(rcv_ev->p_hdr->bth[0]) >> 20) & 3)); /* padding */ +} + +#endif /* _IPS_RECVHDRQ_H */ + diff --git a/ptl_ips/ips_recvq.c b/ptl_ips/ips_recvq.c new file mode 100644 index 0000000..710320d --- /dev/null +++ b/ptl_ips/ips_recvq.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_recvq.h"
+
+/* We return a table of pointer indexes.
+ *
+ * From the point of view of the returned pointer, index -1 always points to
+ * the address to call psmi_free on (since we force page-alignment).
+ */
+void **
+ips_recvq_egrbuf_table_alloc(psm_ep_t ep, void *baseptr,
+			     uint32_t chunksize,
+			     uint32_t bufnum, uint32_t bufsize)
+{
+    unsigned i;
+    uint32_t bufperchunk = chunksize / bufsize;
+    void *ptr_alloc;
+    uintptr_t *buft;
+    uintptr_t base = (uintptr_t) baseptr;
+
+    ptr_alloc = psmi_malloc(ep, UNDEFINED,
+			    PSMI_PAGESIZE + sizeof(uintptr_t)*(bufnum+1));
+    if (ptr_alloc == NULL)
+	return NULL;
+    /* First pointer is to the actual allocated address, so we can free it but
+     * buft[1] is first on the page boundary
+     */
+    buft = (uintptr_t *) PSMI_ALIGNUP(ptr_alloc+1, PSMI_PAGESIZE);
+    buft[-1] = (uintptr_t) ptr_alloc;
+    for (i = 0; i < bufnum; i++)
+	buft[i] = base + chunksize * (i / bufperchunk)
+		       + bufsize * (i % bufperchunk);
+    return (void **) buft;
+}
+
+void
+ips_recvq_egrbuf_table_free(void **buftable)
+{
+    uintptr_t *buft = (uintptr_t *) buftable;
+    void *ptr_alloc = (void *) buft[-1];
+    psmi_free(ptr_alloc);
+}
diff --git a/ptl_ips/ips_recvq.h b/ptl_ips/ips_recvq.h
new file mode 100644
--- /dev/null
+++ b/ptl_ips/ips_recvq.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_RECVQ_H
+#define _IPS_RECVQ_H
+
+#include "psm_user.h"
+
+struct ips_recvq_params {
+    volatile __le32 *tail_register; /* location of tail */
+    volatile __le32 *head_register; /* location of head */
+    uint32_t *base_addr;	    /* base address of the queue */
+    uint32_t elemsz;		    /* element size, in 32-bit words */
+    uint32_t elemcnt;		    /* number of elements */
+};
+
+/* Helpers to manage the eager index-to-buffer-address table */
+void **ips_recvq_egrbuf_table_alloc(psm_ep_t ep, void *baseptr,
+				    uint32_t chunksize,
+				    uint32_t bufnum, uint32_t bufsize);
+void ips_recvq_egrbuf_table_free(void **buftable);
+
+PSMI_INLINE(
+void *ips_recvq_egr_index_2_ptr(void **egrq_buftable, int index))
+{
+    return egrq_buftable[index];
+}
+
+PSMI_INLINE(
+void ips_recvq_head_update(const struct ips_recvq_params *recvq, uint32_t newhead))
+{
+    *recvq->head_register = __cpu_to_le32(newhead);
+    return;
+}
+
+PSMI_INLINE(
+uint32_t ips_recvq_head_get(const struct ips_recvq_params *recvq))
+{
+    uint32_t res = __le32_to_cpu(*recvq->head_register);
+    ips_rmb();
+    return res;
+}
+
+PSMI_INLINE(
+void ips_recvq_tail_update(const struct ips_recvq_params *recvq, uint32_t newtail))
+{
+    *recvq->tail_register = __cpu_to_le32(newtail);
+    return;
+}
+
+PSMI_INLINE(
+uint32_t ips_recvq_tail_get(const struct ips_recvq_params *recvq))
+{
+    uint32_t res = __le32_to_cpu(*recvq->tail_register);
+    ips_rmb();
+    return res;
+}
+
+#endif /* _IPS_RECVQ_H */
diff --git a/ptl_ips/ips_scb.c b/ptl_ips/ips_scb.c
new file mode 100644
index 0000000..452e752
--- /dev/null
+++ b/ptl_ips/ips_scb.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "psm_user.h" +#include "ips_proto.h" +#include "ips_scb.h" + +psm_error_t +ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t scb_avail_callback, + void *scb_avail_context, + struct ips_scbctrl *scbc) +{ + int i; + struct ips_scb *scb; + size_t scb_size; + size_t alloc_sz; + uintptr_t base, imm_base; + psm_ep_t ep = context->ep; + //scbc->context = context; + psm_error_t err = PSM_OK; + + psmi_assert_always(numscb > 0); + scbc->sbuf_num = scbc->sbuf_num_cur = numbufs; + SLIST_INIT(&scbc->sbuf_free); + scbc->sbuf_buf_size = bufsize; + scbc->sbuf_buf_base = NULL; + scbc->sbuf_buf_alloc = NULL; + scbc->sbuf_buf_last = NULL; + + /* send buffers are not mandatory but when allocating them, make sure they + * are on a page boundary */ + if (numbufs > 0) { + struct ips_scbbuf *sbuf; + int redzone = PSM_VALGRIND_REDZONE_SZ; + + /* If the allocation requested is a page and we have redzones we have + * to allocate 2 pages so we end up using a redzone of 2048 bytes. + * + * if the allocation is not 4096, we relax that requirement and keep + * the redzones PSM_VALGRIND_REDZONE_SZ + */ + if (redzone > 0 && bufsize % PSMI_PAGESIZE == 0) + redzone = PSMI_PAGESIZE / 2; + bufsize += 2 * redzone; + bufsize = PSMI_ALIGNUP(bufsize, 64); + + alloc_sz = numbufs * bufsize + redzone + PSMI_PAGESIZE; + scbc->sbuf_buf_alloc = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->sbuf_buf_alloc == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + base = (uintptr_t)scbc->sbuf_buf_alloc; + base = PSMI_ALIGNUP(base + redzone, PSMI_PAGESIZE); + scbc->sbuf_buf_base = (void *)base; + scbc->sbuf_buf_last = (void *)(base + bufsize * (numbufs-1)); + _IPATH_VDBG("sendbufs=%d, (redzone=%d|size=%d|redzone=%d),base=[%p..%p)\n", + numbufs, redzone, bufsize-2*redzone, redzone, + (void *) scbc->sbuf_buf_base, (void *) scbc->sbuf_buf_last); + + for (i = 0; i < numbufs; i++) { + sbuf = (struct ips_scbbuf *) (base + bufsize * i); + SLIST_NEXT(sbuf, next) = NULL; + SLIST_INSERT_HEAD(&scbc->sbuf_free, sbuf, next); + } + + VALGRIND_CREATE_MEMPOOL(scbc->sbuf_buf_alloc, + 0, + /* Should be undefined but we stuff a next + * pointer in the buffer */ + PSM_VALGRIND_MEM_DEFINED); + } + + imm_base = 0; + scbc->scb_imm_size = imm_size; + if (scbc->scb_imm_size) { + scbc->scb_imm_size = PSMI_ALIGNUP(imm_size, 64); + alloc_sz = numscb * scbc->scb_imm_size + 64; + scbc->scb_imm_buf = + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->scb_imm_buf == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + imm_base = PSMI_ALIGNUP(scbc->scb_imm_buf, 64); + } + else + scbc->scb_imm_buf = NULL; + + scbc->scb_num = scbc->scb_num_cur = numscb; + SLIST_INIT(&scbc->scb_free); + scb_size = sizeof(struct ips_scb) + 2*PSM_VALGRIND_REDZONE_SZ; + scb_size = PSMI_ALIGNUP(scb_size, 64); + alloc_sz = numscb * scb_size + PSM_VALGRIND_REDZONE_SZ + 64; + scbc->scb_base = (void *) + psmi_calloc(ep, NETWORK_BUFFERS, 1, alloc_sz); + if (scbc->scb_base 
== NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + base = (uintptr_t)scbc->scb_base; + base = PSMI_ALIGNUP(base + PSM_VALGRIND_REDZONE_SZ, 64); + for (i = 0; i < numscb; i++) { + scb = (struct ips_scb *)(base + i * scb_size); + scb->scbc = scbc; + if (scbc->scb_imm_buf) + scb->imm_payload = (void*)(imm_base + (i * scbc->scb_imm_size)); + else + scb->imm_payload = NULL; + + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + } + scbc->scb_avail_callback = scb_avail_callback; + scbc->scb_avail_context = scb_avail_context; + + /* It would be nice to mark the scb as undefined but we pre-initialize the + * "next" pointer and valgrind would see this as a violation. + */ + VALGRIND_CREATE_MEMPOOL(scbc, PSM_VALGRIND_REDZONE_SZ, + PSM_VALGRIND_MEM_DEFINED); + +fail: + return err; +} + +psm_error_t +ips_scbctrl_fini(struct ips_scbctrl *scbc) +{ + if (scbc->scb_base != NULL) { + psmi_free(scbc->scb_base); + VALGRIND_DESTROY_MEMPOOL(scbc); + } + if (scbc->sbuf_buf_alloc) { + VALGRIND_DESTROY_MEMPOOL(scbc->sbuf_buf_alloc); + psmi_free(scbc->sbuf_buf_alloc); + } + return PSM_OK; +} + +int +ips_scbctrl_bufalloc(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + + psmi_assert_always(scbc->sbuf_num > 0); + psmi_assert_always(!((scb->payload >= scbc->sbuf_buf_base) && + (scb->payload <= scbc->sbuf_buf_last))); + if (SLIST_EMPTY(&scbc->sbuf_free)) + return 0; + else { + psmi_assert(scbc->sbuf_num_cur); + scb->payload = SLIST_FIRST(&scbc->sbuf_free); + scb->payload_size = scbc->sbuf_buf_size; + scbc->sbuf_num_cur--; + + /* If under memory pressure request ACK for packet to reclaim + * credits. + */ + if (scbc->sbuf_num_cur < (scbc->sbuf_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + + VALGRIND_MEMPOOL_ALLOC(scbc->sbuf_buf_alloc, scb->payload, + scb->payload_size); + SLIST_REMOVE_HEAD(&scbc->sbuf_free, next); + return 1; + } +} + +int +ips_scbctrl_avail(struct ips_scbctrl *scbc) +{ + return (!SLIST_EMPTY(&scbc->scb_free) && scbc->sbuf_num_cur > 0); +} + +ips_scb_t * +ips_scbctrl_alloc(struct ips_scbctrl *scbc, int scbnum, int len, uint32_t flags) +{ + ips_scb_t *scb, *scb_head = NULL; + + psmi_assert(flags & IPS_SCB_FLAG_ADD_BUFFER ? 
(scbc->sbuf_num>0) : 1); + + while (scbnum--) { + if (SLIST_EMPTY(&scbc->scb_free)) + break; + scb = SLIST_FIRST(&scbc->scb_free); + scb->flags = 0; /* Need to set this here as bufalloc may request + * an ACK under memory pressure + */ + VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb)); + + if (flags & IPS_SCB_FLAG_ADD_BUFFER) { + if (len > scbc->scb_imm_size) { + if (!ips_scbctrl_bufalloc(scb)) + break; + } + else { /* Attach immediate buffer */ + scb->payload = scb->imm_payload; + scb->payload_size = scbc->scb_imm_size; + psmi_assert(scb->payload); + } + } + else { + scb->payload = NULL; + scb->payload_size = 0; + } + + scb->tid = IPATH_EAGER_TID_ID; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->ips_lrh.mqhdr = 0; + scb->offset = 0; + scb->nfrag = 1; + scb->frag_size = 0; + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = scb_head; + scb_head = scb; + } + return scb_head; +} + +void +ips_scbctrl_free(ips_scb_t *scb) +{ + struct ips_scbctrl *scbc = scb->scbc; + if (scbc->sbuf_num && (scb->payload >= scbc->sbuf_buf_base) && + (scb->payload <= scbc->sbuf_buf_last)) { + scbc->sbuf_num_cur++; + SLIST_INSERT_HEAD(&scbc->sbuf_free, scb->sbuf, next); + VALGRIND_MEMPOOL_FREE(scbc->sbuf_buf_alloc, scb->payload); + } + + scb->payload = NULL; + scb->tidsendc = NULL; + scb->payload_size = 0; + scbc->scb_num_cur++; + if (SLIST_EMPTY(&scbc->scb_free)) { + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + if (scbc->scb_avail_callback != NULL) + scbc->scb_avail_callback(scbc, scbc->scb_avail_context); + } + else + SLIST_INSERT_HEAD(&scbc->scb_free, scb, next); + + VALGRIND_MEMPOOL_FREE(scbc, scb); + return; +} + +ips_scb_t * +ips_scbctrl_alloc_tiny(struct ips_scbctrl *scbc) +{ + ips_scb_t *scb; + if (SLIST_EMPTY(&scbc->scb_free)) + return NULL; + scb = SLIST_FIRST(&scbc->scb_free); + + VALGRIND_MEMPOOL_ALLOC(scbc, scb, sizeof(struct ips_scb)); + SLIST_REMOVE_HEAD(&scbc->scb_free, next); + SLIST_NEXT(scb, next) = NULL; + + scb->payload = NULL; + scb->payload_size = 0; + scb->flags = 0; + scb->tid = IPATH_EAGER_TID_ID; + scb->tidsendc = NULL; + scb->callback = NULL; + scb->nfrag = 1; + scb->frag_size = 0; + + scbc->scb_num_cur--; + if (scbc->scb_num_cur < (scbc->scb_num >> 1)) + scb->flags |= IPS_SEND_FLAG_ACK_REQ; + return scb; +} + diff --git a/ptl_ips/ips_scb.h b/ptl_ips/ips_scb.h new file mode 100644 index 0000000..f7fb148 --- /dev/null +++ b/ptl_ips/ips_scb.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_SCB_H +#define _IPS_SCB_H + +#include "psm_user.h" +#include "ips_proto_header.h" + +/* ips_alloc_scb flags */ +#define IPS_SCB_FLAG_NONE 0x0 +#define IPS_SCB_FLAG_ADD_BUFFER 0x1 + +/* macros to update scb */ +#define ips_scb_mqhdr(scb) scb->ips_lrh.mqhdr +#define ips_scb_mqtag(scb) scb->ips_lrh.data[0].u64w0 +#define ips_scb_mqparam(scb) scb->ips_lrh.data[1] +#define ips_scb_uwords(scb) scb->ips_lrh.data +#define ips_scb_subopcode(scb) scb->ips_lrh.sub_opcode +#define ips_scb_buffer(scb) scb->payload +#define ips_scb_length(scb) scb->payload_size +#define ips_scb_flags(scb) scb->flags +#define ips_scb_dma_ctr(scb) scb->dma_ctr +#define ips_scb_epaddr(scb) scb->epaddr +#define ips_scb_cb(scb) scb->callback +#define ips_scb_cb_param(scb) scb->cb_param +#define ips_scb_hdr_dlen(scb) scb->ips_lrh.hdr_dlen + +struct ips_scbbuf; +struct ips_scb; +struct ips_scbctrl; +struct ips_tid_send_desc; + +typedef void (*ips_scbctrl_avail_callback_fn_t)(struct ips_scbctrl *, + void *context); + +STAILQ_HEAD(ips_scb_stailq, ips_scb); +SLIST_HEAD(ips_scb_slist, ips_scb); + +struct ips_scbctrl { + //const psmi_context_t *context; + + /* Send control blocks for each send */ + uint32_t scb_num; + uint32_t scb_num_cur; + SLIST_HEAD(scb_free, ips_scb) scb_free; + void *scb_base; + ips_scbctrl_avail_callback_fn_t scb_avail_callback; + void *scb_avail_context; + + /* Immediate data for send buffers */ + uint32_t scb_imm_size; + void *scb_imm_buf; + + /* + * Send buffers (or bounce buffers) to keep user data if we need to + * retransmit. 
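+ * A send's payload lives either in the scb's immediate-data area (for
+ * lengths up to scb_imm_size) or in one of these bounce buffers, so the
+ * data stays valid until the peer acknowledges it; see
+ * ips_scbctrl_alloc() in ips_scb.c.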
+ */ + uint32_t sbuf_num; + uint32_t sbuf_num_cur; + SLIST_HEAD(sbuf_free, ips_scbbuf) sbuf_free; + void *sbuf_buf_alloc; + uint32_t sbuf_buf_size; + void *sbuf_buf_base; + void *sbuf_buf_last; +}; + +struct ips_scbbuf { + SLIST_ENTRY(ips_scbbuf) next; +}; + +typedef struct ips_scb ips_scb_t; + +struct ips_scb { + union { + SLIST_ENTRY(ips_scb) next; + STAILQ_ENTRY(ips_scb) nextq; + }; + union { + void *payload; + struct ips_scbbuf *sbuf; + }; + uint64_t ack_timeout; /* in cycles */ + uint64_t abs_timeout; /* in cycles */ + + /* Used when composing packet */ + psmi_seqnum_t seq_num; + uint32_t payload_size; + uint32_t extra_bytes; + uint32_t cksum; + uint32_t flags; + uint32_t dma_ctr; + uint32_t payload_bytes; + uint16_t pkt_flags; + uint16_t tid; + uint16_t offset; + uint16_t nfrag; + uint16_t frag_size; + + struct ips_flow *flow; + struct ptl_epaddr *epaddr; + struct ips_tid_send_desc *tidsendc; + void *tsess; + uint16_t tsess_length; + + + struct ips_scbctrl *scbc; + void *imm_payload; + + union { + int (*callback) (void *, uint32_t); + psm_am_completion_fn_t completion_am; + }; + void *cb_param; + + struct { + union ipath_pbc pbc; + struct ips_message_header ips_lrh; + } PSMI_CACHEALIGN; +}; + +void ips_scbctrl_free(ips_scb_t *scb); +int ips_scbctrl_bufalloc(ips_scb_t *scb); +int ips_scbctrl_avail(struct ips_scbctrl *scbc); +ips_scb_t * ips_scbctrl_alloc(struct ips_scbctrl *scbc, + int scbnum, int len, uint32_t flags); +ips_scb_t * ips_scbctrl_alloc_tiny(struct ips_scbctrl *scbc); + +psm_error_t ips_scbctrl_init(const psmi_context_t *context, + uint32_t numscb, uint32_t numbufs, + uint32_t imm_size, uint32_t bufsize, + ips_scbctrl_avail_callback_fn_t, void *avail_context, + struct ips_scbctrl *); +psm_error_t ips_scbctrl_fini(struct ips_scbctrl *); + +psm_error_t ips_scbctrl_writev(struct ips_scb_slist *slist, int fd); + +#endif /* _IPS_SCB_H */ diff --git a/ptl_ips/ips_spio.c b/ptl_ips/ips_spio.c new file mode 100644 index 0000000..2c3c985 --- /dev/null +++ b/ptl_ips/ips_spio.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* included header files */ +#include +#include +#include +#include +#include + +#include "ips_proto.h" +#include "ips_proto_internal.h" +#include "ips_spio.h" +#include "ipserror.h" /* ips error codes */ +#include "ips_proto_params.h" +#include "ipath_byteorder.h" + + +#define SPIO_INUSE_MASK 0xAAAAAAAAAAAAAAAAULL +#define SPIO_CHECK_MASK 0x5555555555555555ULL + +/* Report PIO stalls every 20 seconds at the least */ +#define SPIO_STALL_WARNING_INTERVAL (nanosecs_to_cycles(20e9)) +#define SPIO_MAX_CONSECUTIVE_SEND_FAIL (1<<20) /* 1M */ +/* RESYNC_CONSECUTIVE_SEND_FAIL has to be a multiple of MAX_CONSECUTIVE */ +#define SPIO_RESYNC_CONSECUTIVE_SEND_FAIL (1<<4) /* 16 */ + +static void spio_report_stall(struct ips_spio *ctrl, + uint64_t t_cyc_now, + uint64_t send_failures); + +static void spio_handle_stall(struct ips_spio *ctrl, + uint64_t send_failures); + +static inline +uint64_t +ips_spio_read_avail_index(struct ips_spio *ctrl, int index) +{ + if (ctrl->runtime_flags & IPATH_RUNTIME_PIO_REGSWAPPED && index > 3) { + return __le64_to_cpu(ctrl->spio_avail_addr[index ^ 1]); + } + else + return __le64_to_cpu(ctrl->spio_avail_addr[index]); +} + +psm_error_t +ips_spio_init(const struct psmi_context *context, const struct ptl *ptl, + struct ips_spio *ctrl) +{ + psm_error_t err = PSM_OK; + const struct ipath_base_info *base_info = &context->base_info; + unsigned wc_unordered; + char *order_str = "undefined"; + int i, last_shadow_index; + int num_shadow_index = sizeof(ctrl->spio_avail_shadow) / + sizeof(ctrl->spio_avail_shadow[0]); + + ctrl->ptl = ptl; + ctrl->context = context; + /* Copy runtime flags */ + ctrl->runtime_flags = ptl->runtime_flags; + ctrl->unit_id = context->ep->unit_id; + ctrl->portnum = context->ep->portnum; + pthread_spin_init(&ctrl->spio_lock, 0); + ctrl->spio_avail_addr = + (__le64 *)(ptrdiff_t)base_info->spi_pioavailaddr; + ctrl->spio_buffer_base = + (uint32_t *)(ptrdiff_t)base_info->spi_piobufbase; + ctrl->spio_sendbuf_status = + (unsigned long *)(ptrdiff_t)base_info->spi_sendbuf_status; + + ctrl->spio_buffer_spacing = base_info->spi_pioalign >> 2; + ctrl->spio_first_buffer = ctrl->spio_current_buffer = + base_info->spi_pioindex; + ctrl->spio_last_buffer = + ctrl->spio_first_buffer + base_info->spi_piocnt - 1; + ctrl->spio_num_of_buffer = base_info->spi_piocnt; + + ctrl->spio_consecutive_failures = 0; + ctrl->spio_num_stall = 0ULL; + ctrl->spio_next_stall_warning = 0ULL; + ctrl->spio_last_stall_cyc = 0ULL; + ctrl->spio_init_cyc = get_cycles(); + + last_shadow_index = ctrl->spio_last_buffer / 32; + last_shadow_index += (ctrl->spio_last_buffer % 32) ? 
1 : 0; + if (last_shadow_index > num_shadow_index) + { + err = psmi_handle_error(ctrl->context->ep, PSM_EP_DEVICE_FAILURE, + "Number of buffer avail registers is wrong; " + "have %u, expected %u (1st %u, piocnt %u, last %u)", + last_shadow_index, + (uint32_t)(sizeof(ctrl->spio_avail_shadow) / + sizeof(ctrl->spio_avail_shadow[0])), + base_info->spi_pioindex, ctrl->spio_last_buffer, + base_info->spi_piocnt); + goto fail; + } + + /* update the shadow copy with the current contents of hardware + * available registers */ + for (i = 0; i < num_shadow_index; i++) + ctrl->spio_avail_shadow[i] = ips_spio_read_avail_index(ctrl, i); + + /* Figure out the type of ordering we require for pio writes. Update the + * routine we use for copies according to the type of pio write required */ + wc_unordered = base_info->spi_runtime_flags; + wc_unordered &= IPATH_RUNTIME_FORCE_WC_ORDER; + + if (base_info->spi_runtime_flags & IPATH_RUNTIME_SPECIAL_TRIGGER) { + /* For now all PIO packets are < 2K and use the 2K trigger function. */ + ctrl->spio_copy_fn = ipath_write_pio_special_trigger2k; + order_str = "natural CPU (w/ 2k special trigger)"; + } + else { + switch ( wc_unordered ) { + case 0: +#ifdef __MIC__ + ctrl->spio_copy_fn = getenv("IPATH_MIC_DWORD_PIO")? + ipath_write_pio:ipath_write_pio_vector; +#else + ctrl->spio_copy_fn = ipath_write_pio; +#endif + order_str = "natural CPU"; + break; + + case IPATH_RUNTIME_FORCE_WC_ORDER: + default: // any other non-zero + ctrl->spio_copy_fn = ipath_write_pio_force_order; + order_str = "forced"; + break; + } + } + + _IPATH_PRDBG("PIO copy uses %s ordering\n", order_str); + +fail: + return err; +} + +psm_error_t +ips_spio_fini(struct ips_spio *ctrl) +{ + spio_report_stall(ctrl, get_cycles(), 0ULL); + return PSM_OK; +} + +static +void +spio_report_stall(struct ips_spio *ctrl, uint64_t t_cyc_now, + uint64_t send_failures) +{ + int last, i; + size_t off = 0; + char buf[1024]; + + if (ctrl->spio_num_stall == 0) + return; + + last = ctrl->spio_last_buffer/32; + + if (send_failures > 0) { + char bufctr[128]; + uint64_t tx_stat, rx_stat; + int ret; + + off = snprintf(buf, sizeof buf - 1, + "PIO Send Bufs context %d with %d bufs from %d to %d. PIO avail regs: ", + (int) psm_epid_context(ctrl->context->epid), + ctrl->spio_num_of_buffer, ctrl->spio_first_buffer, + ctrl->spio_last_buffer); + + for (i = 0; i < 8; i++) { + uint64_t avail = ips_spio_read_avail_index(ctrl, i); + off += snprintf(buf+off, sizeof buf - off - 1, " <%d>=(%llx) ", + i, (long long) avail); + } + off += snprintf(buf+off, sizeof buf - off - 1, ". 
PIO shadow regs: "); + for (i = ctrl->spio_first_buffer/32; i <= last; i++) { + off += snprintf(buf+off, sizeof buf - off - 1, " <%d>=(%llx) ", + i, (long long)ctrl->spio_avail_shadow[i]); + } + buf[off] = '\0'; + + /* In case ipathfs isn't running */ + ret = infinipath_get_single_portctr(ctrl->unit_id, ctrl->portnum, + "TxPkt", &tx_stat); + if (ret != -1) { + ret = infinipath_get_single_portctr(ctrl->unit_id, + ctrl->portnum, "RxPkt", + &rx_stat); + if (ret != -1) { + snprintf(bufctr, sizeof bufctr - 1, + "(TxPktCnt=%llu,RxPktCnt=%llu)", + (unsigned long long) tx_stat, + (unsigned long long) rx_stat); + bufctr[sizeof bufctr - 1] = '\0'; + } else + bufctr[0] = '\0'; + } else + bufctr[0] = '\0'; + _IPATH_DBG("PIO Send Stall after at least %.2fM failed send attempts " + "(elapsed=%.3fs, last=%.3fs, pio_stall_count=%lld) %s %s\n", + send_failures / 1e6, + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc), + (unsigned long long) ctrl->spio_num_stall, + bufctr[0] != '\0' ? bufctr : "", buf); + } + else { + _IPATH_DBG( + "PIO Send Stall Summary: count=%llu, last=%.3fs, elapsed=%.3fs", + (unsigned long long) ctrl->spio_num_stall, + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_init_cyc), + PSMI_CYCLES_TO_SECSF(t_cyc_now - ctrl->spio_last_stall_cyc)); + } + + return; +} + +static void +spio_handle_stall(struct ips_spio *ctrl, + uint64_t send_failures) +{ + uint64_t t_cyc_now = get_cycles(); + int i, last; + + /* We handle the pio-stall every time but only report something every 20 + * seconds. We print a summary at the end while closing the device */ + ctrl->spio_num_stall++; + ctrl->spio_num_stall_total++; + + if (ctrl->spio_next_stall_warning <= t_cyc_now) { + /* If context status is ok (i.e. no cables pulled or anything) */ + if (psmi_context_check_status(ctrl->context) == PSM_OK) + spio_report_stall(ctrl, t_cyc_now, send_failures); + ctrl->spio_next_stall_warning = + get_cycles() + SPIO_STALL_WARNING_INTERVAL; + } + + /* re-initialize our shadow from the real registers; by this time, + * we know the hardware has to have done the update. + * Also, kernel check may have changed things. 
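+ * Where a shadow generation (check) bit agrees with the hardware copy,
+ * the hardware's busy bit is adopted below; where the two disagree, an
+ * update is still in flight and the shadow's busy bit is kept.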
+ */ + last = ctrl->spio_last_buffer/32; + for (i = 0; i <= last; i++) { + uint64_t mask, avail, shadow_avail; + + avail = ips_spio_read_avail_index(ctrl, i); + shadow_avail = ctrl->spio_avail_shadow[i]; + mask = (~(avail ^ shadow_avail) & SPIO_CHECK_MASK) << 1; + shadow_avail &= ~mask; /* clear all possible in-use bits */ + shadow_avail |= (avail & mask); + ctrl->spio_avail_shadow[i] = shadow_avail; + } + + ctrl->spio_last_stall_cyc = t_cyc_now; + + return; +} + +/* + * Update our shadow of the PIO available bitfield at index 'index' + */ +static void __sendpath +spio_update_shadow(struct ips_spio *ctrl, int index) +{ + register uint64_t mask, avail, shadow_avail; + + if_pf (*ctrl->spio_sendbuf_status) { + __u64 event_mask; + struct ips_proto *proto = (struct ips_proto*) &ctrl->ptl->proto; + + /* Get event mask for PSM to process */ + event_mask = (uint64_t) *ctrl->spio_sendbuf_status; + + /* First ack the driver the receipt of the events */ + _IPATH_VDBG("Acking event(s) 0x%"PRIx64" to qib driver.\n", (uint64_t) event_mask); + ipath_event_ack(ctrl->context->ctrl, event_mask); + + if (event_mask & IPATH_EVENT_DISARM_BUFS) { + /* Just acking event has disarmed all buffers */ + _IPATH_VDBG("Disarm of send buffers completed.\n"); + } + + if (event_mask & IPATH_EVENT_LINKDOWN) { + /* A link down event can clear the LMC and SL2VL change as those + * events are implicitly handled in the link up/down event handler. + */ + event_mask &= ~(IPATH_EVENT_LMC_CHANGE | IPATH_EVENT_SL2VL_CHANGE); + ips_ibta_link_updown_event(proto); + _IPATH_VDBG("Link down detected.\n"); + } + + if (event_mask & IPATH_EVENT_LID_CHANGE) { + /* Display a warning that LID change has occurred during the run. This + * is not supported in the current implementation and in general is + * bad for the SM to re-assign LIDs during a run. + */ + int lid, olid; + + lid = + ipath_get_port_lid(proto->ep->context.base_info.spi_unit, + proto->ep->context.base_info.spi_port); + olid = PSMI_EPID_GET_LID(ctrl->context->epid); + + _IPATH_INFO("Warning! LID change detected during run. Old LID: %x, New Lid: %x\n", olid, lid); + } + + if (event_mask & IPATH_EVENT_LMC_CHANGE) { + _IPATH_INFO("Fabric LMC changed.\n"); + } + + if (event_mask & IPATH_EVENT_SL2VL_CHANGE) { + _IPATH_INFO("SL2VL mapping changed for port.\n"); + ips_ibta_init_sl2vl_table(proto); + } + } + + index &= 0x7; // max spio_avail_shadow[] index. + avail = ips_spio_read_avail_index(ctrl, index); + + do { + shadow_avail = ctrl->spio_avail_shadow[index]; + mask = (~(avail ^ shadow_avail) & SPIO_CHECK_MASK) << 1; + shadow_avail &= ~mask; /* clear all possible in-use bits */ + shadow_avail |= (avail & mask); + } +#ifndef PSMI_USE_THREADS + while (0); + ctrl->spio_avail_shadow[index] = shadow_avail; +#else + while (ips_cswap(...)); +#endif +} + +static void +spio_handle_resync(struct ips_spio *ctrl, + uint64_t consecutive_send_failed) +{ + if (ctrl->runtime_flags & IPATH_RUNTIME_FORCE_PIOAVAIL) + ipath_force_pio_avail_update(ctrl->context->ctrl); + if (!(consecutive_send_failed & (SPIO_MAX_CONSECUTIVE_SEND_FAIL - 1))) + spio_handle_stall(ctrl, consecutive_send_failed); +} + +/* + * This function attempts to write a packet to a PIO. + * + * Recoverable errors: + * PSM_OK: Packet triggered through PIO. + * PSM_EP_NO_RESOURCES: No PIO bufs available or cable pulled. + * + * Unrecoverable errors: + * PSM_EP_NO_NETWORK: No network, no lid, ... + * PSM_EP_DEVICE_FAILURE: Chip failures, rxe/txe parity, etc. 
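+ *
+ * PSM_EP_NO_RESOURCES is reported only after every PIO send buffer has
+ * been scanned once without finding a free one (the 'tries' loop below),
+ * so the caller may simply retry the frame later.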
+ */
+psm_error_t __sendpath
+ips_spio_transfer_frame(struct ips_spio *ctrl, struct ips_flow *flow,
+			void *header, void *payload, int length,
+			uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum)
+{
+    uint32_t *current_pio_buffer;
+    const uint64_t toggle_bits = 3ULL;
+    psm_error_t err = PSM_OK;
+    int tries;
+    int do_lock = (ctrl->runtime_flags & PSMI_RUNTIME_RCVTHREAD);
+    struct ipath_pio_params pio_params;
+    struct ips_message_header *p_hdr = (struct ips_message_header*) header;
+
+    if (do_lock)
+	pthread_spin_lock(&ctrl->spio_lock);
+
+    if_pf (PSMI_FAULTINJ_ENABLED()) {
+	PSMI_FAULTINJ_STATIC_DECL(fi_lost, "piosend", 1, IPS_FAULTINJ_PIOLOST);
+	PSMI_FAULTINJ_STATIC_DECL(fi_busy, "piobusy", 1, IPS_FAULTINJ_PIOBUSY);
+	if (psmi_faultinj_is_fault(fi_lost)) {
+	    if (do_lock)
+		pthread_spin_unlock(&ctrl->spio_lock);
+	    return PSM_OK;
+	}
+	else if (psmi_faultinj_is_fault(fi_busy))
+	    goto fi_busy;
+	/* else fall through normal processing path, i.e. no faults */
+    }
+
+    if (ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] &
+	(1ULL << ((ctrl->spio_current_buffer % 32) * 2 + 1)))
+    {
+	/*
+	 * If the busy bit was already set, we couldn't get the pio buf.
+	 * Update our shadow copy.
+	 */
+	spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+
+	tries = ctrl->spio_num_of_buffer;
+
+	while (tries && (ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] &
+	       (1ULL << ((ctrl->spio_current_buffer % 32) * 2 + 1))))
+	{
+	    /* advance spio_current_buffer to next buffer */
+	    if (++ctrl->spio_current_buffer > ctrl->spio_last_buffer) {
+		ctrl->spio_current_buffer = ctrl->spio_first_buffer;
+		spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+	    }
+	    else if ((ctrl->spio_current_buffer % 32) == 0)
+		spio_update_shadow(ctrl, ctrl->spio_current_buffer / 32);
+	    tries--;
+	}
+
+	if_pf (!tries) {
+	    /* Check unit status */
+fi_busy:
+	    if ((err = psmi_context_check_status(ctrl->context)) == PSM_OK) {
+		if (0 == (++ctrl->spio_consecutive_failures &
+			  (SPIO_RESYNC_CONSECUTIVE_SEND_FAIL-1)))
+		    spio_handle_resync(ctrl, ctrl->spio_consecutive_failures);
+		err = PSM_EP_NO_RESOURCES;
+	    }
+	    /* If the cable is pulled we don't count it as a consecutive
+	     * failure, we just treat it as though no send pio was available */
+	    else if (err == PSM_OK_NO_PROGRESS)
+		err = PSM_EP_NO_RESOURCES;
+	    /* else something bad happened in check_status */
+	    if (do_lock)
+		pthread_spin_unlock(&ctrl->spio_lock);
+	    return err;
+	}
+    }
+    if (ctrl->spio_num_stall)	/* now able to send, so clear if set */
+	ctrl->spio_num_stall = 0;
+
+    /* Claim the buffer: XORing 0x3 into this buffer's two-bit slot flips
+     * its generation bit and sets its busy bit (which the scan above
+     * found clear). */
+    ctrl->spio_avail_shadow[ctrl->spio_current_buffer / 32] ^=
+	(toggle_bits << ((ctrl->spio_current_buffer % 32) * 2));
+
+    current_pio_buffer = (uint32_t *) ctrl->spio_buffer_base +
+	(ctrl->spio_buffer_spacing *
+	 (ctrl->spio_current_buffer - ctrl->spio_first_buffer));
+
+    /* advance spio_current_buffer to next buffer */
+    if (++ctrl->spio_current_buffer > ctrl->spio_last_buffer)
+	ctrl->spio_current_buffer = ctrl->spio_first_buffer;
+
+    ctrl->spio_consecutive_failures = 0;
+
+    if (do_lock)
+	pthread_spin_unlock(&ctrl->spio_lock);
+
+    pio_params.length = length;
+    pio_params.vl = (__be16_to_cpu(p_hdr->lrh[0]) >> LRH_VL_SHIFT) & 0xf;
+    pio_params.port = ctrl->portnum;
+    pio_params.cksum_is_valid = cksum_valid;
+    pio_params.cksum = cksum;
+
+    /* For matched send/receive rates and control messages IPD is not
+     * required.
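+     * (IPD is the InfiniBand inter-packet delay: when the path carries
+     * an active IPD value, a static rate is programmed into the PBC for
+     * bulk data below, while control messages go out unthrottled.)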
+ */ + if_pf (!isCtrlMsg && flow->path->epr_active_ipd) + pio_params.rate = + ips_proto_pbc_static_rate(flow, + (length + sizeof(struct ips_message_header))); + else + pio_params.rate = 0; + + /* Copy buffer using PIO */ + ctrl->spio_copy_fn(current_pio_buffer, &pio_params, header, payload); + + return PSM_OK; +} // ips_spio_transfer_frame() + diff --git a/ptl_ips/ips_spio.h b/ptl_ips/ips_spio.h new file mode 100644 index 0000000..2ba7cea --- /dev/null +++ b/ptl_ips/ips_spio.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IPS_SPIO_H +#define IPS_SPIO_H + +#include "psm_user.h" + +struct ips_spio; +struct ptl; + +psm_error_t ips_spio_init(const psmi_context_t *context, + const struct ptl *ptl, + struct ips_spio *ctrl); +psm_error_t ips_spio_transfer_frame(struct ips_spio *ctrl,struct ips_flow *flow, + void *header, void *payload, int length, + uint32_t isCtrlMsg, + uint32_t cksum_valid, uint32_t cksum); +psm_error_t ips_spio_fini(struct ips_spio *ctrl); + +struct ips_spio +{ + const struct ptl *ptl; + const psmi_context_t *context; + uint32_t runtime_flags; + int unit_id; + uint16_t portnum; + pthread_spinlock_t spio_lock; + + /* pio copy routine */ + void (*spio_copy_fn)(volatile uint32_t *, + const struct ipath_pio_params *pioparm, void *, void *); + + volatile __le64 *spio_avail_addr __attribute__((aligned(64))); + volatile uint32_t *spio_buffer_base; + volatile unsigned long *spio_sendbuf_status; + + uint32_t spio_buffer_spacing; + uint32_t spio_first_buffer; + uint32_t spio_last_buffer; + uint32_t spio_current_buffer; + uint32_t spio_num_of_buffer; + + uint64_t spio_avail_shadow[8] __attribute__((aligned(64))); + + uint32_t spio_consecutive_failures; + uint64_t spio_num_stall; + uint64_t spio_num_stall_total; + uint64_t spio_next_stall_warning; + uint64_t spio_last_stall_cyc; + uint64_t spio_init_cyc; + +}; + +#endif /* IPS_SPIO_H */ diff --git a/ptl_ips/ips_stats.h b/ptl_ips/ips_stats.h new file mode 100644 index 0000000..2bc4afd --- /dev/null +++ b/ptl_ips/ips_stats.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. 
All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_STATS_H +#define _IPS_STATS_H + +struct psm_epaddr; /* for non-PSM clients */ + +/* Old stats */ +typedef +struct { + uint64_t err_chk_send; + uint64_t err_chk_recv; + uint64_t send_failed; + uint64_t recv_dropped; + union { + uint64_t recv_copied; /* obsolete */ + uint64_t nak_sent; + }; + uint64_t nak_recv; + uint64_t total_send_eager; + uint64_t total_send_exp; + uint64_t acks_sent; + uint64_t retransmits; + uint64_t recv_matched; + uint64_t recv_unmatched; + uint64_t scb_alloc_yields; +} ips_sess_stat; + +int ips_get_stat(struct psm_epaddr *epaddr, ips_sess_stat *stats); + +#endif /* _IPS_STATS_H */ diff --git a/ptl_ips/ips_subcontext.c b/ptl_ips/ips_subcontext.c new file mode 100644 index 0000000..7299d39 --- /dev/null +++ b/ptl_ips/ips_subcontext.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ips_subcontext.h" +#include "ptl_ips.h" + +psm_error_t +ips_subcontext_ureg_get(ptl_t *ptl, const psmi_context_t *context, + struct ips_subcontext_ureg **uregp, + uint32_t subcontext_cnt) +{ + psm_error_t err = PSM_OK; + const struct ipath_base_info *base_info = &context->base_info; + uint64_t *all_subcontext_uregbase = (uint64_t *) (uintptr_t) + base_info->spi_subctxt_uregbase; + unsigned pagesize = getpagesize(); + int i; + psmi_assert_always(all_subcontext_uregbase != NULL); + for (i = 0; i < INFINIPATH_MAX_SUBCONTEXT; i++) { + struct ips_subcontext_ureg *subcontext_ureg = + (struct ips_subcontext_ureg *) &all_subcontext_uregbase[_IPATH_UregMax*8]; + *uregp++ = (i < subcontext_cnt) ? subcontext_ureg : NULL; + all_subcontext_uregbase += pagesize / sizeof(uint64_t); + } + return err; +} + +psm_error_t +ips_subcontext_ureg_initialize(ptl_t *ptl, uint32_t subcontext, + struct ips_subcontext_ureg *uregp) +{ + psm_error_t err = PSM_OK; + memset(uregp, 0, sizeof(*uregp)); + if (subcontext == 0) { + if (pthread_spin_init(&uregp->context_lock, + PTHREAD_PROCESS_SHARED) != 0) { + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Couldn't initialize process-shared spin lock"); + } + } + return err; +} diff --git a/ptl_ips/ips_subcontext.h b/ptl_ips/ips_subcontext.h new file mode 100644 index 0000000..d69f6e9 --- /dev/null +++ b/ptl_ips/ips_subcontext.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __IPS_SUBCONTEXT_H +#define __IPS_SUBCONTEXT_H + +#include "psm_user.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" + +/* This data structure is allocated in ureg page of each subcontext process */ + +struct ips_subcontext_ureg { + pthread_spinlock_t context_lock; /* only used in master ureg */ + struct ips_recvhdrq_state recvq_state; /* only used in master ureg */ + struct ips_writehdrq_state writeq_state; /* used in all ureg pages */ +}; + +psm_error_t +ips_subcontext_ureg_get(ptl_t *ptl, const psmi_context_t *context, + struct ips_subcontext_ureg **uregp, + uint32_t subcontext_cnt); + +psm_error_t +ips_subcontext_ureg_initialize(ptl_t *ptl, uint32_t subcontext, + struct ips_subcontext_ureg *uregp); + +#endif diff --git a/ptl_ips/ips_tid.c b/ptl_ips/ips_tid.c new file mode 100644 index 0000000..eb77ed8 --- /dev/null +++ b/ptl_ips/ips_tid.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ips_tid.h" + +psm_error_t ips_ptl_handle_check_unit_status(psm_ep_t ep, int ips_rc); + +psm_error_t +ips_tid_init(struct ips_tid *tidc, const psmi_context_t *context) +{ + const struct ipath_base_info *base_info = &context->base_info; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL, + NULL, &tidc->tid_num_total), + }; + + tidc->context = context; + tidc->tid_num_max = base_info->spi_tidcnt; + tidc->tid_num_avail = base_info->spi_tidcnt; + tidc->tid_pagesz = base_info->spi_tid_maxsize; + + tidc->tid_num_total = 0; + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_TIDS, + entries, + PSMI_STATS_HOWMANY(entries), + tidc); +} + +psm_error_t +ips_tid_fini(struct ips_tid *tidc) +{ + return PSM_OK; +} + +psm_error_t +ips_tid_acquire(struct ips_tid *tidc, const void *buf, + int ntids, ips_tidmap_t tid_map, + uint16_t *tid_array) +{ + psm_error_t err = PSM_OK; + int rc; + + psmi_assert((uintptr_t)buf % tidc->tid_pagesz == 0); + psmi_assert(ntids <= tidc->tid_num_avail); + + rc = ipath_update_tid(tidc->context->ctrl, ntids, + (uint64_t)(uintptr_t) tid_array, + (uint64_t)(uintptr_t) buf, + (uint64_t)(uintptr_t) tid_map); + + if (rc != 0) { + /* We're still going to fail but check unit status */ + err = psmi_err_only(psmi_context_check_status(tidc->context)); + if (err == PSM_OK) /* okay, but something else is still wrong */ + err = psmi_handle_error(tidc->context->ep, PSM_EP_DEVICE_FAILURE, + "Failed to update %d tids", + ntids); + goto fail; + } + + tidc->tid_num_total += ntids; + tidc->tid_num_avail -= ntids; + +fail: + return err; +} + +psm_error_t +ips_tid_release(struct ips_tid *tidc, ips_tidmap_t tidmap, int ntids) +{ + psm_error_t err = PSM_OK; + + if (ipath_free_tid(tidc->context->ctrl, ntids, + (uint64_t) (uintptr_t) tidmap)) { + err = PSM_EP_DEVICE_FAILURE; + goto fail; + } + + tidc->tid_num_avail += ntids; + +fail: + return err; +} + diff --git a/ptl_ips/ips_tid.h b/ptl_ips/ips_tid.h new file mode 100644 index 0000000..92170c5 --- /dev/null +++ b/ptl_ips/ips_tid.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* included header files */ + +#ifndef _IPS_TID_H +#define _IPS_TID_H + +#include "psm_user.h" + +#define IPS_TID_MAX_TIDS 512 +#define IPS_TID_ALIGNMENT 4 + +typedef uint64_t ips_tidmap_t[IPS_TID_MAX_TIDS/64]; + +struct ips_tid { + const psmi_context_t *context; + + uint32_t tid_num_max; + uint32_t tid_num_avail; + uint32_t tid_pagesz; + + uint64_t tid_num_total; +}; + +psm_error_t ips_tid_init(struct ips_tid *tidc, const psmi_context_t *context); +psm_error_t ips_tid_fini(struct ips_tid *tidc); + +/* Acquiring tids. + * Buffer base has to be aligned on ips_tid_page_size() boundary + * Buffer base+length has to be aligned on IPS_TID_ALIGNMENT boundary + */ +psm_error_t +ips_tid_acquire(struct ips_tid *tidc, + const void *buf, /* input buffer, aligned to page_size */ + int ntids, /* input number of tids */ + ips_tidmap_t tidmap, /* output tidmap */ + uint16_t *tid_array); /* output tidarray, */ + +psm_error_t +ips_tid_release(struct ips_tid *tidc, + ips_tidmap_t tidmap, /* input tidmap */ + int ntids); /* intput number of tids to release */ +PSMI_INLINE( +psm_error_t +ips_tid_num_available(struct ips_tid *tidc)) +{ + return tidc->tid_num_avail; +} + +PSMI_INLINE( +int +ips_tid_num_required(struct ips_tid *tidc, void *bufi, uint32_t length)) +{ + uintptr_t buf = (uintptr_t) bufi; + const uint32_t page_size = tidc->tid_pagesz; + + return (PSMI_ALIGNUP(buf + length, page_size) - + PSMI_ALIGNDOWN(buf, page_size)) / page_size; +} + +PSMI_INLINE( +uint32_t +ips_tid_page_size(struct ips_tid *tidc)) +{ + return tidc->tid_pagesz; +} + +#endif /* _IPS_TID_H */ diff --git a/ptl_ips/ips_tidflow.c b/ptl_ips/ips_tidflow.c new file mode 100644 index 0000000..d769233 --- /dev/null +++ b/ptl_ips/ips_tidflow.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_tidflow.h"
+
+psm_error_t ips_tf_init(const psmi_context_t *context,
+			struct ips_tfctrl *tfctrl,
+			int start_flowidx,
+			int end_flowidx,
+			ips_tf_avail_cb_fn_t cb,
+			void *cb_context)
+{
+    int tf_idx;
+    int num_flows = end_flowidx - start_flowidx;
+
+#if TF_ADD
+    struct psmi_stats_entry entries[] = {
+	PSMI_STATS_DECL("tidflow update count", MPSPAWN_STATS_REDUCTION_ALL,
+			NULL, &tfctrl->tf_num_total),
+    };
+#endif
+
+    psmi_assert_always(num_flows > 0);
+
+    tfctrl->context = context;
+    tfctrl->tf_start_idx = start_flowidx;
+    tfctrl->tf_end_idx = end_flowidx;
+    tfctrl->tf_num_max = num_flows;
+    tfctrl->tf_num_avail = num_flows;
+    tfctrl->tf_num_total = 0;
+    tfctrl->tf_avail_cb = cb;
+    tfctrl->tf_avail_context = cb_context;
+
+    SLIST_INIT(&tfctrl->tf_avail);
+
+    for (tf_idx = start_flowidx; tf_idx < end_flowidx; tf_idx++) {
+	/* Update flow state */
+	tfctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED;
+	tfctrl->tf[tf_idx].tf_idx = tf_idx;
+	tfctrl->tf[tf_idx].next_gen = IPS_TF_INVALID_GENERATION + 1;
+
+	SLIST_NEXT(&tfctrl->tf[tf_idx], next) = NULL;
+	SLIST_INSERT_HEAD(&tfctrl->tf_avail, &tfctrl->tf[tf_idx], next);
+
+	/* Use tidflow reset here because we may want to emulate hardware
+	 * suppression on the QLE73XX: tidflow_set_entry enables the header
+	 * suppression engine, while reset does not.
+	 */
+	ipath_tidflow_reset(context->ctrl, tf_idx);
+    }
+
+#if TF_ADD
+    /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */
+    return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
+				    PSMI_STATSTYPE_TIDS,
+				    entries,
+				    PSMI_STATS_HOWMANY(entries),
+				    tfctrl);
+#else
+    return PSM_OK;
+#endif
+}
+
+psm_error_t ips_tf_fini(struct ips_tfctrl *tfctrl)
+{
+    return PSM_OK;
+}
+
+/* Allocate a tidflow */
+psm_error_t ips_tf_allocate(struct ips_tfctrl *tfctrl,
+			    uint32_t *tf_idx,
+			    uint32_t *tf_gen)
+{
+    struct ips_tf *tf;
+
+    if (!tfctrl->tf_num_avail) {
+	*tf_idx = IPS_TF_INVALID;
+	*tf_gen = IPS_TF_INVALID_GENERATION;
+	return PSM_EP_NO_RESOURCES;
+    }
+
+    psmi_assert(!SLIST_EMPTY(&tfctrl->tf_avail));
+
+    tf = SLIST_FIRST(&tfctrl->tf_avail);
+    SLIST_REMOVE_HEAD(&tfctrl->tf_avail, next);
+
+    psmi_assert(tf->state == TF_STATE_DEALLOCATED);
+
+    tf->state = TF_STATE_ALLOCATED;
+
+    tfctrl->tf_num_avail--;
+    tfctrl->tf_num_total++;
+
+    *tf_idx = tf->tf_idx;
+    *tf_gen = tf->next_gen;
+
+    tf->next_gen++;
+    if (tf->next_gen == IPS_TF_INVALID_GENERATION)
+	tf->next_gen++;
+
+    psmi_assert(*tf_gen != IPS_TF_INVALID_GENERATION);
+    psmi_assert_always(*tf_gen <= IPS_TF_MAX_GENERATION);
+
+    return PSM_OK;
+}
+
+/* Deallocate a tidflow */
+psm_error_t ips_tf_deallocate(struct ips_tfctrl *tfctrl, uint32_t tf_idx)
+{
+    struct ips_tf *tf;
+
+    psmi_assert_always(tf_idx < tfctrl->tf_end_idx);
+
+    tf = &tfctrl->tf[tf_idx];
+    psmi_assert(tf->state == TF_STATE_ALLOCATED);
+    tf->state = TF_STATE_DEALLOCATED;
+
+    /* Mark invalid generation for flow (stale packets will be dropped) */
+    ipath_tidflow_set_entry(tfctrl->context->ctrl,
+			    tf_idx, IPS_TF_INVALID_GENERATION, 0);
+
+    SLIST_NEXT(tf, next) = NULL;
+    SLIST_INSERT_HEAD(&tfctrl->tf_avail, tf, next);
+
+    /* If an available callback is registered invoke it */
+    if ((tfctrl->tf_num_avail++ == 0) && tfctrl->tf_avail_cb)
+	tfctrl->tf_avail_cb(tfctrl, tfctrl->tf_avail_context);
+
+    return PSM_OK;
+}
+
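+/*
+ * Illustrative sketch (not part of the imported sources): the
+ * generation-recycling rule used by ips_tf_allocate() and
+ * ips_tfgen_allocate(), reduced to a self-contained program.  All
+ * "demo_" names are hypothetical.
+ *
+ *	#include <assert.h>
+ *	#include <stdint.h>
+ *
+ *	#define DEMO_INVALID_GEN 0	// mirrors IPS_TF_INVALID_GENERATION
+ *
+ *	// Hand out the current generation, then advance it, skipping the
+ *	// reserved invalid value exactly as the functions above do.
+ *	static uint8_t demo_next_gen(uint8_t *next_gen)
+ *	{
+ *	    uint8_t gen = (*next_gen)++;
+ *	    if (*next_gen == DEMO_INVALID_GEN)
+ *		(*next_gen)++;
+ *	    return gen;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *	    uint8_t ng = DEMO_INVALID_GEN + 1;
+ *	    int i;
+ *	    // Even across 8-bit wraparound, a handed-out generation is
+ *	    // never the invalid value, so stale packets for a reused
+ *	    // flow index can always be recognized and dropped.
+ *	    for (i = 0; i < 1000; i++)
+ *		assert(demo_next_gen(&ng) != DEMO_INVALID_GEN);
+ *	    return 0;
+ *	}
+ */
+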
+/* Allocate a generation for a flow */ +psm_error_t ips_tfgen_allocate(struct ips_tfctrl *tfctrl, + uint32_t tf_idx, + uint32_t *tfgen) +{ + struct ips_tf *tf; + int ret = PSM_OK; + + psmi_assert_always(tf_idx < tfctrl->tf_end_idx); + + tf = &tfctrl->tf[tf_idx]; + psmi_assert(tf->state == TF_STATE_ALLOCATED); + + *tfgen = tf->next_gen; + + tf->next_gen++; + if (tf->next_gen == IPS_TF_INVALID_GENERATION) + tf->next_gen++; + + return ret; +} + diff --git a/ptl_ips/ips_tidflow.h b/ptl_ips/ips_tidflow.h new file mode 100644 index 0000000..ac8e737 --- /dev/null +++ b/ptl_ips/ips_tidflow.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef _IPS_TIDFLOW_H
+#define _IPS_TIDFLOW_H
+
+#include "psm_user.h"
+
+#define IPS_TF_MAX_GENERATION     256
+#define IPS_TF_INVALID            (~0U)
+#define IPS_TF_INVALID_GENERATION 0
+
+#define IPS_TF_PSN_PACK(flow,gen,seq) \
+    ( ((((uint64_t)flow)&0x1f)<<19) | \
+      ((((uint64_t)gen)&INFINIPATH_TF_GENVAL_MASK)<<INFINIPATH_TF_GENVAL_SHIFT) | \
+      ((((uint64_t)seq)&INFINIPATH_TF_SEQNUM_MASK)<<INFINIPATH_TF_SEQNUM_SHIFT) )
+
+#define IPS_TF_PSN_UNPACK(tfval,flow,gen,seq) do { \
+      (flow) = ((tfval)>>19) & 0x1f; \
+      (gen) = ((tfval)>>INFINIPATH_TF_GENVAL_SHIFT) & INFINIPATH_TF_GENVAL_MASK; \
+      (seq) = ((tfval)>>INFINIPATH_TF_SEQNUM_SHIFT) & INFINIPATH_TF_SEQNUM_MASK; \
+    } while (0)
+
+#define IPS_TF_INC_SEQ(tfval) \
+    tfval = (tfval & ~INFINIPATH_TF_SEQNUM_MASK) | ((ipath_tidflow_get_seqnum(tfval) + 1) & INFINIPATH_TF_SEQNUM_MASK)
+
+struct ips_tfctrl;
+
+typedef void (*ips_tf_avail_cb_fn_t)(struct ips_tfctrl *,
+                                     void *context);
+typedef enum {
+    TF_STATE_INVALID     = 0,
+    TF_STATE_ALLOCATED   = 1,
+    TF_STATE_DEALLOCATED = 2
+} tf_state_t;
+
+struct ips_tf {
+    SLIST_ENTRY(ips_tf) next;
+
+    tf_state_t state;
+
+    uint32_t tf_idx;
+
+    uint32_t next_gen:8;
+    uint32_t pad:24;
+};
+
+struct ips_tfctrl {
+    const psmi_context_t *context;
+
+    uint32_t tf_start_idx;
+    uint32_t tf_end_idx;
+
+    uint32_t tf_num_max;
+    uint32_t tf_num_avail;
+
+    uint32_t tf_num_total;
+
+    ips_tf_avail_cb_fn_t tf_avail_cb;
+    void *tf_avail_context;
+
+    SLIST_HEAD(tf_free, ips_tf) tf_avail;
+
+    struct ips_tf tf[INFINIPATH_TF_NFLOWS];
+};
+
+PSMI_ALWAYS_INLINE(
+int
+ips_tf_available(struct ips_tfctrl *tfctrl))
+{
+    return tfctrl->tf_num_avail;
+}
+
+psm_error_t ips_tf_init(const psmi_context_t *context,
+                        struct ips_tfctrl *tfctrl,
+                        int start_flowidx,
+                        int end_flowidx,
+                        ips_tf_avail_cb_fn_t cb,
+                        void *cb_context);
+psm_error_t ips_tf_fini(struct ips_tfctrl *tfctrl);
+
+/* Allocate a tidflow */
+psm_error_t ips_tf_allocate(struct ips_tfctrl *tfctrl,
+                            uint32_t *tf_idx,
+                            uint32_t *tf_gen);
+
+/* Deallocate a tidflow */
+psm_error_t ips_tf_deallocate(struct ips_tfctrl *tfctrl, uint32_t tf_idx);
+
+/* Allocate a generation for a flow */
+psm_error_t ips_tfgen_allocate(struct ips_tfctrl *tfctrl,
+                               uint32_t tf_idx,
+                               uint32_t *tfgen);
+
+#endif
diff --git a/ptl_ips/ips_writehdrq.c b/ptl_ips/ips_writehdrq.c
new file mode 100644
index 0000000..2fc097b
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ips_writehdrq.h"
+
+psm_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+                   const struct ips_recvq_params *hdrq_params,
+                   const struct ips_recvq_params *egrq_params,
+                   struct ips_writehdrq *writeq,
+                   struct ips_writehdrq_state *state,
+                   uint32_t runtime_flags)
+{
+    const struct ipath_base_info *base_info = &context->base_info;
+
+    memset(writeq, 0, sizeof(*writeq));
+    writeq->context = context;
+    writeq->state = state;
+    writeq->hdrq = *hdrq_params;        /* deep copy */
+    writeq->hdrq_elemlast = ((writeq->hdrq.elemcnt - 1) * writeq->hdrq.elemsz);
+    writeq->egrq = *egrq_params;        /* deep copy */
+    writeq->egrq_buftable =
+        ips_recvq_egrbuf_table_alloc(context->ep, writeq->egrq.base_addr,
+                                     base_info->spi_rcv_egrchunksize,
+                                     writeq->egrq.elemcnt,
+                                     writeq->egrq.elemsz);
+    writeq->runtime_flags = runtime_flags;
+    writeq->hdrq_rhf_off = base_info->spi_rhf_offset;
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        writeq->state->hdrq_rhf_seq = 1;
+        /*
+         * Copy the header without its RHF (the last two words): readers
+         * must not see the RHF until the writer can atomically write an
+         * updated RHF.
+         */
+        writeq->hdrq_hdr_copysz = (writeq->hdrq.elemsz - 2) * sizeof(uint32_t);
+        /*
+         * Check that the RHF is 8-byte aligned, as required for atomic RHF
+         * updates, by looking at the RHF of the second header.
+         */
+        psmi_assert_always(
+            !((uintptr_t)(writeq->hdrq.base_addr +
+                          writeq->hdrq.elemsz + writeq->hdrq_rhf_off) & 0x7));
+    }
+    else {
+        writeq->hdrq_hdr_copysz = writeq->hdrq.elemsz * sizeof(uint32_t);
+        writeq->state->hdrq_rhf_seq = 0;    /* _seq is ignored */
+    }
+    writeq->state->enabled = 1;
+    return PSM_OK;
+}
+
+psm_error_t
+ips_writehdrq_fini(struct ips_writehdrq *writeq)
+{
+    ips_recvq_egrbuf_table_free(writeq->egrq_buftable);
+    return PSM_OK;
+}
diff --git a/ptl_ips/ips_writehdrq.h b/ptl_ips/ips_writehdrq.h
new file mode 100644
index 0000000..25e91d7
--- /dev/null
+++ b/ptl_ips/ips_writehdrq.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2013. Intel Corporation. All rights reserved.
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPS_WRITEHDRQ_H
+#define _IPS_WRITEHDRQ_H
+
+#include "psm_user.h"
+#include "ips_recvhdrq.h"
+#include "ips_recvq.h"
+#include "psm_mq_internal.h"
+
+/*
+ * Structure containing state for writehdrq writing. This is logically
+ * part of ips_writehdrq but needs to be separated out for context
+ * sharing so that it can be put in a shared memory page and hence
+ * be available to all processes sharing the port. Generally, do not
+ * put pointers in here since the address map of each process can be
+ * different.
+ */
+struct ips_writehdrq_state
+{
+    uint32_t hdrq_rhf_seq;      /* last seq */
+    uint32_t enabled;           /* enables writing */
+};
+
+struct ips_writehdrq
+{
+    const psmi_context_t *context;
+    struct ips_writehdrq_state *state;
+    struct ips_recvq_params hdrq;
+    uint32_t hdrq_elemlast;
+    uint32_t hdrq_rhf_off;      /* rhf offset */
+    uint32_t hdrq_hdr_copysz;
+    struct ips_recvq_params egrq;
+    void **egrq_buftable;       /* table of eager idx-to-ptr */
+    uint32_t runtime_flags;
+};
+
+psm_error_t
+ips_writehdrq_init(const psmi_context_t *context,
+                   const struct ips_recvq_params *hdrq_params,
+                   const struct ips_recvq_params *egrq_params,
+                   struct ips_writehdrq *writeq,
+                   struct ips_writehdrq_state *state,
+                   uint32_t runtime_flags);
+
+psm_error_t
+ips_writehdrq_fini(struct ips_writehdrq *writeq);
+
+PSMI_ALWAYS_INLINE(
+void
+ips_writehdrq_write_rhf_atomic(uint32_t *rhf_dest, uint32_t *rhf_src))
+{
+#if WORDSIZE == 64
+    /*
+     * In 64-bit mode, we check in init that the rhf will always be 8-byte
+     * aligned
+     */
+    *((uint64_t *)rhf_dest) = *((uint64_t *)rhf_src);
+#else
+    /*
+     * In 32-bit mode, we ensure that word 0 always gets written before word 1
+     */
+    rhf_dest[0] = rhf_src[0];
+    ips_wmb();
+    rhf_dest[1] = rhf_src[1];
+#endif
+    return;
+}
+
+PSMI_INLINE(
+int
+ips_writehdrq_append(struct ips_writehdrq *writeq,
+                     const struct ips_recvhdrq_event *rcv_ev))
+{
+    const uint32_t *rcv_hdr = rcv_ev->rcv_hdr;
+    uint32_t write_hdr_head;
+    uint32_t write_hdr_tail;
+    uint32_t *write_hdr;
+    uint32_t *write_rhf;
+    char *write_payload = NULL;
+    uint32_t next_write_hdr_tail;
+    uint32_t rcv_paylen;
+    union {
+        uint32_t u32[2];
+        uint64_t u64;
+    } rhf;
+    int result = IPS_RECVHDRQ_CONTINUE;
+
+    /* Drop packet if write header queue is disabled */
+    if (!writeq->state->enabled) {
+        result = IPS_RECVHDRQ_BREAK;
+        goto done;
+    }
+
+    write_hdr_head = ips_recvq_head_get(&writeq->hdrq);
+    write_hdr_tail = ips_recvq_tail_get(&writeq->hdrq);
+    write_hdr = writeq->hdrq.base_addr + write_hdr_tail;
+    write_rhf = write_hdr + writeq->hdrq_rhf_off;
+
+    /* Drop packet if write header queue is full */
+    next_write_hdr_tail = write_hdr_tail + writeq->hdrq.elemsz;
+    if (next_write_hdr_tail > writeq->hdrq_elemlast)
+        next_write_hdr_tail = 0;
+    if (next_write_hdr_tail == write_hdr_head) {
+        result = IPS_RECVHDRQ_BREAK;
+        goto done;
+    }
+
+    /*
+     * If NODMA_RTAIL, don't let the consumer see the RHF until it's ready.
+     * We copy the source rhf and operate on it until we are ready to
+     * atomically update it for the reader.
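+     *
+     * (Sketch of the resulting update order, for illustration only:
+     *     rhf.u64 = *(uint64_t *) rcv_ev->rhf;        take a private copy
+     *     ...patch eager index / error flags in the copy...
+     *     ipath_hdrset_seq(write_rhf, rhf_seq);       stamp the expected seq
+     *     ips_writehdrq_write_rhf_atomic(dest, write_rhf);
+     * so a reader polling the queue never observes a half-written RHF.
+     * Here dest stands for write_hdr + writeq->hdrq_rhf_off.)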
+ */
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        write_rhf = &rhf.u32[0];
+        rhf.u64 = *((uint64_t *) rcv_ev->rhf);
+    }
+
+    /* Copy the data if this is an eager packet */
+    rcv_paylen = ips_recvhdrq_event_paylen(rcv_ev);
+    rcv_paylen += (rcv_ev->has_cksum ? PSM_CRC_SIZE_IN_BYTES : 0);
+
+    if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER && rcv_paylen > 0)
+    {
+        uint32_t write_egr_tail = ips_recvq_tail_get(&writeq->egrq);
+        uint32_t next_write_egr_tail;
+
+        /* Check whether the write eager queue is full */
+        next_write_egr_tail = write_egr_tail + 1;
+        if (next_write_egr_tail >= writeq->egrq.elemcnt)
+            next_write_egr_tail = 0;
+        if (next_write_egr_tail == ips_recvq_head_get(&writeq->egrq)) {
+            /* Eager queue is full: drop the payload, but still deliver the
+             * header to the subcontext's header queue, marked with TIDERR
+             * (eager overflow), so the reader stays in sync. */
+            psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+            ipath_hdrset_err_flags(write_rhf, INFINIPATH_RHF_H_TIDERR);
+
+            /* Fix up the header with the current subcontext eager index */
+            ipath_hdrset_index(write_rhf, write_egr_tail);
+
+            result = IPS_RECVHDRQ_BREAK;
+        }
+        else {
+            if (rcv_paylen) {
+                const char *rcv_payload = ips_recvhdrq_event_payload(rcv_ev);
+
+                /* Use the pre-calculated address from the look-up table */
+                write_payload = ips_recvq_egr_index_2_ptr(
+                    writeq->egrq_buftable, write_egr_tail);
+
+                psmi_mq_mtucpy(write_payload, rcv_payload, rcv_paylen);
+            }
+
+            /* Copy the header to the subcontext's header queue */
+            psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+            /* Fix up the header with the subcontext's eager index */
+            ipath_hdrset_index((uint32_t *) write_rhf, write_egr_tail);
+
+            /* Update the eager buffer tail pointer */
+            ips_recvq_tail_update(&writeq->egrq, next_write_egr_tail);
+        }
+    }
+    else {
+        /* Copy the header to the subcontext's header queue */
+        psmi_mq_mtucpy(write_hdr, rcv_hdr, writeq->hdrq_hdr_copysz);
+
+        /* Record the current eager tail in the header; this also handles
+         * the eager-with-no-payload case */
+        if (rcv_ev->ptype == RCVHQ_RCV_TYPE_EAGER)
+            ipath_hdrset_index((uint32_t *) write_rhf,
+                               ips_recvq_tail_get(&writeq->egrq));
+    }
+
+    /* Ensure previous writes are visible before writing rhf seq or tail */
+    ips_wmb();
+
+    if (writeq->runtime_flags & IPATH_RUNTIME_NODMA_RTAIL) {
+        /* We accumulated a few changes to the RHF and now want to make it
+         * atomically visible for the reader.
+         */
+        uint32_t rhf_seq = writeq->state->hdrq_rhf_seq;
+        ipath_hdrset_seq((uint32_t *) write_rhf, rhf_seq);
+        if (rhf_seq >= LAST_RHF_SEQNO)
+            writeq->state->hdrq_rhf_seq = 1;
+        else
+            writeq->state->hdrq_rhf_seq = rhf_seq + 1;
+
+        /* Now write the new rhf */
+        ips_writehdrq_write_rhf_atomic(write_hdr + writeq->hdrq_rhf_off, write_rhf);
+    }
+
+    /* The tail must be updated regardless of IPATH_RUNTIME_NODMA_RTAIL
+     * since this tail is also used to keep track of where
+     * ips_writehdrq_append will write to next. For subcontexts there is
+     * no separate shadow copy of the tail. */
+    ips_recvq_tail_update(&writeq->hdrq, next_write_hdr_tail);
+
+done:
+    return result;
+}
+
+#endif /* _IPS_WRITEHDRQ_H */
diff --git a/ptl_ips/ipserror.c b/ptl_ips/ipserror.c
new file mode 100644
index 0000000..1c84d47
--- /dev/null
+++ b/ptl_ips/ipserror.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* IPS - Interconnect Protocol Stack */
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include "ipserror.h"
+
+char *ips_err_str(int ips_error)
+{
+    static char err_str[128];
+
+    switch (ips_error) {
+    case IPS_RC_OK:
+        return "OK!";
+
+    case IPS_RC_ERROR:
+        return "general error";
+
+    case IPS_RC_PENDING:
+        return "request pending";
+
+    case IPS_RC_EXIST:
+        return "entry exists";
+
+    case IPS_RC_MAX_ENTRIES_EXCEEDED:
+        return "maximum number of entries has been exceeded";
+
+    case IPS_RC_NOT_ENOUGH_BUFFERS:
+        return "not enough buffers to complete request";
+
+    case IPS_RC_NO_FREE_MEM:
+        return "no free memory";
+
+    case IPS_RC_NAME_LOOKUP_FAILED:
+        return "name lookup failed";
+
+    case IPS_RC_PARAM_ERROR:
+        return "invalid parameter";
+
+    case IPS_RC_UNKNOWN_DEVICE:
+        return "unknown device";
+
+    case IPS_RC_DEVICE_INIT_FAILED:
+        return "device init failed";
+
+    case IPS_RC_DATA_TRUNCATED:
+        return "data truncated";
+
+    case IPS_RC_INVALID_RANK:
+        return "invalid rank";
+
+    case IPS_RC_INVALID_OPCODE:
+        return "invalid op code";
+
+    case IPS_RC_PEER_NOT_READY:
+        return "peer is not ready";
+
+    case IPS_RC_PEER_CLOSED:
+        return "peer is closed";
+
+    case IPS_RC_DEST_EQUAL_LOCAL_RANK:
+        return "src and dest ranks are equal";
+
+    case IPS_RC_DEVICE_ERROR:
+        return "InfiniPath hardware not found, hardware problem, or disabled";
+
+    case IPS_RC_NETWORK_DOWN:
+        return "The link is down";
+
+    case IPS_RC_NOT_ENOUGH_FREE_TIDS:
+        return "Not enough free TIDs to complete request";
+
+    case IPS_RC_NO_RESOURCE_AVAILABLE:
+        return "Internal resources exhausted";
+
+    case IPS_RC_HW_UPDATE_FAILED:
+        return "Failed TID update for rendezvous, allocation problem";
+
+    case IPS_RC_PARTITION_ERROR:
+        return "One or more nodes are on a different partition";
+
+    case IPS_RC_RUN_ERROR:
+        return "One or more nodes are still running the previous job";
+
+    case IPS_RC_ALREADY_OPEN:
+        return "Open/init has already been called";
+
+    case IPS_RC_WAS_CLOSED:
+        return "Close has already been called";
+
+    case IPS_RC_DEST_EQUAL_LOCAL_LID:
+        return "src and dest LIDs are equal";
+
+    case IPS_RC_BUFFER_ALIGMENT_ERROR:
+        return "Buffer start address is not 32-bit aligned";
+
+    case IPS_RC_LENGTH_ALIGMENT_ERROR:
+        return "Buffer length is not a whole number of 32-bit words";
+
+    case IPS_RC_INVALID_DATA_LENGTH:
+        return "invalid data length";
+
+ case IPS_RC_BUSY: + return "Device is busy"; + + case IPS_RC_INIT_TIMEOUT_EXPIRED: + return "Could not connect to other nodes"; + + case IPS_RC_NO_PORTS_AVAILABLE: + return "All InfiniPath ports are in use."; + + /* Performance Counters codes */ + case IPS_RCPERF_INIT_FAILED: + return "Initialization of performance counters failed"; + + case IPS_RCPERF_EVENT_SETUP_FAILED: + return "Setting performance counter events failed"; + + case IPS_RCPERF_REG_DEFAULT_SET: + return "Default event set for one of the counters"; + + case IPS_RCPERF_UNSUPPORTED_CPU: + return "This CPU type is not supported"; + + case IPS_RCPERF_REG_GET_FAILED: + return "Failed to get register value for event"; + + case IPS_RCPERF_SET_EVENT_STR_FAILED: + return "Failed to find event description"; + + case IPS_RCPERF_INVALID_REGISTER: + return "Register index out of range of available counters"; + + case IPS_RC_SYSERR: // we hope errno hasn't changed since this was set... + snprintf(err_str, sizeof err_str, "System error: %s", strerror(errno)); + return err_str; + + default: + snprintf(err_str, sizeof err_str, "Error code %i: ", ips_error); + return err_str; + } +} diff --git a/ptl_ips/ipserror.h b/ptl_ips/ipserror.h new file mode 100644 index 0000000..57f35de --- /dev/null +++ b/ptl_ips/ipserror.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * interface to InfiniPath Interconnect Protocol Stack + * + * This file contains the function prototypes of the interconnect protocol + * stack. It should be included in all the clients of the stack, such as MPI. 
+ */ + +#ifndef ipserror_h +#define ipserror_h + +#ifdef __cplusplus +extern "C" { +#endif + +/* Return codes */ +#define IPS_RC_OK 0 +#define IPS_RC_ERROR (-1) +#define IPS_RC_PENDING (-2) +#define IPS_RC_EXIST (-3) +#define IPS_RC_MAX_ENTRIES_EXCEEDED (-4) +#define IPS_RC_NOT_ENOUGH_BUFFERS (-100) +#define IPS_RC_NO_FREE_MEM (-101) +#define IPS_RC_NAME_LOOKUP_FAILED (-102) +#define IPS_RC_PARAM_ERROR (-103) +#define IPS_RC_UNKNOWN_DEVICE (-104) +#define IPS_RC_DEVICE_INIT_FAILED (-105) +#define IPS_RC_DATA_TRUNCATED (-106) +#define IPS_RC_INVALID_RANK (-107) +#define IPS_RC_INVALID_OPCODE (-108) +#define IPS_RC_PEER_NOT_READY (-109) +#define IPS_RC_PEER_CLOSED (-110) +#define IPS_RC_DEST_EQUAL_LOCAL_RANK (-111) +#define IPS_RC_DEVICE_ERROR (-112) +#define IPS_RC_NETWORK_DOWN (-113) +#define IPS_RC_NOT_ENOUGH_FREE_TIDS (-114) +#define IPS_RC_NO_RESOURCE_AVAILABLE (-115) +#define IPS_RC_HW_UPDATE_FAILED (-116) +#define IPS_RC_PARTITION_ERROR (-117) +#define IPS_RC_RUN_ERROR (-118) +#define IPS_RC_ALREADY_OPEN (-119) +#define IPS_RC_WAS_CLOSED (-120) +#define IPS_RC_DEST_EQUAL_LOCAL_LID (-121) +#define IPS_RC_BUFFER_ALIGMENT_ERROR (-122) +#define IPS_RC_LENGTH_ALIGMENT_ERROR (-123) +#define IPS_RC_INVALID_DATA_LENGTH (-124) +#define IPS_RC_BUSY (-125) +#define IPS_RC_INIT_TIMEOUT_EXPIRED (-126) +#define IPS_RC_NO_PORTS_AVAILABLE (-127) +#define IPS_RC_TRANSFER_INCOMPLETE (-128) +#define IPS_RC_SYSERR (-129) // errno has meaning, if no further errors since this error +#define IPS_RC_STARTUP_ERR (-130) + +/* Performance Counters Error Codes */ +#define IPS_RCPERF_INIT_FAILED (-200) +#define IPS_RCPERF_EVENT_SETUP_FAILED (-201) +#define IPS_RCPERF_REG_DEFAULT_SET (-202) +#define IPS_RCPERF_UNSUPPORTED_CPU (-203) +#define IPS_RCPERF_REG_GET_FAILED (-204) +#define IPS_RCPERF_SET_EVENT_STR_FAILED (-205) +#define IPS_RCPERF_INVALID_REGISTER (-206) + + char *ips_err_str(int); + +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif diff --git a/ptl_ips/ptl.c b/ptl_ips/ptl.c new file mode 100644 index 0000000..0c874d8 --- /dev/null +++ b/ptl_ips/ptl.c @@ -0,0 +1,860 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* This file implements the PSM PTL for ips */ +#include "psm_user.h" +#include "ptl_ips.h" +#include "ipserror.h" + +int ips_ptl_recvq_isempty(const struct ptl *ptl); + +#define PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS 250 + +static +int +ips_subcontext_ignore(const struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) +{ + return IPS_RECVHDRQ_CONTINUE; +} + +static +int +ips_subcontext_process(const struct ips_recvhdrq_event *rcv_ev, uint32_t subcontext) +{ + struct ptl_shared *recvshc = rcv_ev->proto->ptl->recvshc; + if_pt (subcontext != recvshc->subcontext && + subcontext < recvshc->subcontext_cnt) { + return ips_writehdrq_append(&recvshc->writeq[subcontext], rcv_ev); + } + else { + _IPATH_VDBG("Drop pkt for subcontext %d out of %d (I am %d) : errors 0x%x\n", + (int) subcontext, (int) recvshc->subcontext_cnt, + (int) recvshc->subcontext, (unsigned) rcv_ev->error_flags); + return IPS_RECVHDRQ_BREAK; + } +} + +static +void +recvhdrq_hw_params(const psmi_context_t *context, + struct ips_recvq_params *hdrq, + struct ips_recvq_params *egrq, + int is_shared_context, int subcontext) +{ + const struct ipath_base_info *base_info = &context->base_info; + + hdrq->elemcnt = base_info->spi_rcvhdr_cnt; + hdrq->elemsz = base_info->spi_rcvhdrent_size; + + egrq->elemsz = base_info->spi_rcv_egrbufsize; /* bytes */ + egrq->elemcnt = base_info->spi_tidegrcnt; /* words */ + + if (!is_shared_context) { + volatile uint64_t *uregbase = /* HW registers */ + (volatile uint64_t *) (uintptr_t) base_info->spi_uregbase; + hdrq->base_addr = (uint32_t *)(uintptr_t) base_info->spi_rcvhdr_base; + hdrq->head_register = (volatile __le32 *) &uregbase[ur_rcvhdrhead]; + hdrq->tail_register = (volatile __le32 *) (uintptr_t) + base_info->spi_rcvhdr_tailaddr; + egrq->base_addr = (void *) (uintptr_t) base_info->spi_rcv_egrbufs; + egrq->head_register = (volatile __le32 *) + &uregbase[ur_rcvegrindexhead]; + egrq->tail_register = (volatile __le32 *) + &uregbase[ur_rcvegrindextail]; + } + else { + /* Subcontexts mimic the HW registers but use different addresses + * to avoid cache contention. 
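+ * (Resulting layout, as computed by the arithmetic below: subcontext i
+ * gets one page of mirrored uregs at spi_subctxt_uregbase + i * pagesize,
+ * a page-aligned header-queue slice at spi_subctxt_rcvhdr_base +
+ * i * hdrsize, and an eager-buffer slice at spi_subctxt_rcvegrbuf +
+ * i * egrsize.)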
*/ + volatile uint64_t *subcontext_uregbase; + uint32_t *rcv_hdr; + void *rcv_egr; + unsigned pagesize = getpagesize(); + unsigned hdrsize, egrsize; + unsigned i = pagesize - 1; + hdrsize = (base_info->spi_rcvhdr_cnt * sizeof(uint32_t) * + base_info->spi_rcvhdrent_size + i) & ~i; + egrsize = base_info->spi_rcv_egrbuftotlen; + subcontext_uregbase = (uint64_t *) + (((uintptr_t) base_info->spi_subctxt_uregbase) + + (pagesize * subcontext)); + rcv_hdr = (uint32_t *) + (((uintptr_t) base_info->spi_subctxt_rcvhdr_base + + (hdrsize * subcontext))); + rcv_egr = (void *) + (((uintptr_t) base_info->spi_subctxt_rcvegrbuf + + (egrsize * subcontext))); + hdrq->base_addr = (uint32_t *) rcv_hdr; + hdrq->head_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvhdrhead * 8]; + hdrq->tail_register = (volatile __le32 *) (uintptr_t) + &subcontext_uregbase[ur_rcvhdrtail * 8]; + egrq->base_addr = rcv_egr; + egrq->head_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvegrindexhead * 8]; + egrq->tail_register = (volatile __le32 *) + &subcontext_uregbase[ur_rcvegrindextail * 8]; + } +} + +static psm_error_t shrecvq_init(ptl_t *ptl, const psmi_context_t *context); +static psm_error_t shrecvq_fini(ptl_t *ptl); + +static +size_t +ips_ptl_sizeof(void) +{ + return sizeof(ptl_t); +} + +static +int +ips_ptl_epaddr_stats_num(void) +{ + return sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); +} + +static +int +ips_ptl_epaddr_stats_init(char **desc, uint16_t *flags) +{ + int num_stats = sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); + int i; + + /* All stats are uint64_t */ + for (i = 0; i < num_stats; i++) + flags[i] = MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_SKIP_IF_ZERO; + + desc[0] = "errchecks sent"; + desc[1] = "errchecks recv"; + desc[2] = "naks sent"; + desc[3] = "naks recv"; + desc[4] = "connect reqs sent"; + desc[5] = "disconnect reqs sent"; + desc[6] = "tid grants sent"; + desc[7] = "tid grants recv"; + desc[8] = "send rexmit"; + desc[9] = "congestion packets"; + + return num_stats; +} + +int +ips_ptl_epaddr_stats_get(psm_epaddr_t epaddr, uint64_t *stats_o) +{ + struct ptl_epaddr *ipsaddr = epaddr->ptladdr; + int i, num_stats = sizeof(struct ptl_epaddr_stats) / sizeof (uint64_t); + uint64_t *stats_i = (uint64_t *) &ipsaddr->stats; + + for (i = 0; i < num_stats; i++) + stats_o[i] = stats_i[i]; + + return num_stats; +} + +static psm_error_t +psmi_context_check_status_callback(struct psmi_timer *t, uint64_t current) +{ + struct ptl *ptl = (struct ptl *) t->context; + const uint64_t current_count = get_cycles(); + psm_error_t err; + + err = psmi_context_check_status(ptl->context); + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + return err; +} + +static +psm_error_t +ips_ptl_init(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + psm_error_t err = PSM_OK; + uint32_t num_of_send_bufs = ep->ipath_num_sendbufs; + uint32_t num_of_send_desc = ep->ipath_num_descriptors; + uint32_t imm_size = ep->ipath_imm_size; + const psmi_context_t *context = &ep->context; + const struct ipath_user_info *user_info = &context->user_info; + const int enable_shcontexts = (user_info->spu_subcontext_cnt > 0); + const uint64_t current_count = get_cycles(); + + /* Preconditions */ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + psmi_assert_always(ep->ipath_num_sendbufs > 0); + + memset(ptl, 0, sizeof(struct ptl)); + + ptl->ep = ep; /* back pointer */ + ptl->epid = ep->epid; /* cache 
epid */ + ptl->epaddr = ep->epaddr; /* cache a copy */ + ptl->ctl = ctl; + ptl->context = context; + ptl->runtime_flags = context->runtime_flags; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = enable_shcontexts ? ips_ptl_shared_poll : ips_ptl_poll; + ctl->ep_connect = ips_ptl_connect; + ctl->ep_disconnect = ips_ptl_disconnect; + ctl->mq_send = ips_proto_mq_send; + ctl->mq_isend = ips_proto_mq_isend; + + ctl->am_short_request = ips_am_short_request; + ctl->am_short_reply = ips_am_short_reply; + + ctl->epaddr_stats_num = ips_ptl_epaddr_stats_num; + ctl->epaddr_stats_init = ips_ptl_epaddr_stats_init; + ctl->epaddr_stats_get = ips_ptl_epaddr_stats_get; + + /* + * Runtime flags in 'ptl' are different from runtime flags in 'context'. + * In 'context', runtime flags reflect what the driver is capable of. + * In 'ptl', runtime flags reflect the features we can or want to use in + * the driver's supported runtime flags. + */ + + /* + * This timer is to be used to check the context's status at every + * PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS. This is useful to detect when + * the link transitions from the DOWN state to the UP state. We can thus + * stop aggregating link failure messages once we detect that the link is + * up. + */ + psmi_timer_entry_init(&ptl->status_timer, + psmi_context_check_status_callback, ptl); + + /* cache the context's status timeout in cycles */ + ptl->status_cyc_timeout = + ms_2_cycles(PSMI_CONTEXT_STATUS_CHECK_INTERVAL_MSECS); + + /* + * Retransmissions and pending operations are kept in a timer structure + * (queue). The timerq is shared to various internal IPS interfaces so + * that they too may schedule events on the timer queue. The timerq is + * drained in the progress function. + */ + if ((err = psmi_timer_init(&ptl->timerq))) + goto fail; + + /* start the context's status timer */ + psmi_timer_request_always(&ptl->timerq, &ptl->status_timer, + current_count + ptl->status_cyc_timeout); + + /* + * Hardware send pio used by eager and control messages. + */ + if ((err = ips_spio_init(context, ptl, &ptl->spioc))) + goto fail; + + /* + * Epstate maps endpoint ids (epid integers) to ipsaddr (structs). Mappings + * are added/removed by the connect portion of the ips protocol and lookup + * is made by the receive queue processing component. + */ + if ((err = ips_epstate_init(&ptl->epstate, context))) + goto fail; + + /* + * Actual ips protocol handling. + */ + if ((err = ips_proto_init(context, ptl, num_of_send_bufs, num_of_send_desc, + imm_size, &ptl->timerq, &ptl->epstate, + &ptl->spioc, &ptl->proto))) + goto fail; + + /* + * Hardware receive hdr/egr queue, services incoming packets and issues + * callbacks for protocol handling in proto_recv. It uses the epstate + * interface to determine if a packet is known or unknown. + */ + if (!enable_shcontexts) { + struct ips_recvhdrq_callbacks recvq_callbacks; + struct ips_recvq_params hdrq, egrq; + recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0); + recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; + recvq_callbacks.callback_subcontext = ips_subcontext_ignore; + recvq_callbacks.callback_error = ips_proto_process_packet_error; + if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &hdrq, &egrq, &recvq_callbacks, + ptl->runtime_flags, 0, + &ptl->recvq, &ptl->recvq_state))) + goto fail; + } + + /* + * Software receive hdr/egr queue, used in shared contexts. 
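+ * (Each process polls the shared hardware queue and forwards packets that
+ * belong to sibling subcontexts through ips_writehdrq_append(); see
+ * ips_subcontext_process() above and the ptl_shared notes in ptl_ips.h.)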
+ */
+    if (enable_shcontexts && (err = shrecvq_init(ptl, context)))
+        goto fail;
+
+    /*
+     * Receive thread, always initialized but does not necessarily create a
+     * pthread.
+     */
+    if ((err = ips_ptl_rcvthread_init(ptl, &ptl->recvq)))
+        goto fail;
+fail:
+    return err;
+}
+
+static
+psm_error_t
+ips_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_in)
+{
+    const struct ipath_user_info *user_info = &ptl->context->user_info;
+    const int enable_shcontexts = (user_info->spu_subcontext_cnt > 0);
+    psm_error_t err = PSM_OK;
+
+    if ((err = ips_proto_fini(&ptl->proto, force, timeout_in)))
+        goto fail;
+
+    /* We have to cancel the thread after terminating the protocol because
+     * connect/disconnect packets use interrupts and the kernel doesn't
+     * like to have no pollers waiting */
+    if ((err = ips_ptl_rcvthread_fini(ptl)))
+        goto fail;
+
+    if ((err = ips_epstate_fini(&ptl->epstate)))
+        goto fail;
+
+    if ((err = ips_spio_fini(&ptl->spioc)))
+        goto fail;
+
+    if ((err = psmi_timer_fini(&ptl->timerq)))
+        goto fail;
+
+    if (!enable_shcontexts && (err = ips_recvhdrq_fini(&ptl->recvq)))
+        goto fail;
+
+    if (enable_shcontexts && (err = shrecvq_fini(ptl)))
+        goto fail;
+
+fail:
+    return err;
+}
+
+static
+psm_error_t
+ips_ptl_optctl(const void *core_obj, int optname,
+               void *optval, uint64_t *optlen, int get)
+{
+    psm_error_t err = PSM_OK;
+
+    switch (optname) {
+    case PSM_IB_OPT_EP_SL:
+    {
+        /* Core object is psm_epaddr */
+        psm_epaddr_t epaddr = (psm_epaddr_t) core_obj;
+        ips_epaddr_t *ipsaddr = epaddr->ptladdr;
+
+        /* If the endpoint does not use IB, ignore for set, complain for get */
+        if (epaddr->ptlctl->ep_connect != ips_ptl_connect) {
+            if (get)
+                err = psmi_handle_error(PSMI_EP_LOGEVENT,
+                                        PSM_PARAM_ERR, "Invalid EP transport");
+            goto exit_fn;
+        }
+
+        /* Sanity check option length */
+        if (*optlen < sizeof(uint8_t)) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Option value length error");
+            *optlen = sizeof(uint8_t);
+            goto exit_fn;
+        }
+
+        if (get) {
+            /* Get returns the SL for the PIO flow */
+            *((uint8_t *) optval) =
+                (uint8_t) ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].sl;
+        }
+        else {
+            uint16_t new_sl;
+
+            /* Sanity check that the SL is within range */
+            new_sl = (uint16_t) *(uint8_t *) optval;
+            if (new_sl > 15) {
+                err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                        "Invalid SL value %u. 0 <= SL <= 15.", new_sl);
+                goto exit_fn;
+            }
+
+            /* Set new SL for all flows */
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_DMA].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_REQ].sl = new_sl;
+            ipsaddr->flows[EP_FLOW_GO_BACK_N_AM_RSP].sl = new_sl;
+        }
+    }
+    break;
+    case PSM_IB_OPT_DF_SL:
+    {
+        /* Set default SL to be used by an endpoint for all communication */
+        /* Core object is psm_ep */
+        psm_ep_t ep = (psm_ep_t) core_obj;
+
+        /* Make sure ep is specified */
+        if (!ep) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Invalid PSM Endpoint");
+            goto exit_fn;
+        }
+
+        /* Sanity check option length */
+        if (*optlen < sizeof(uint8_t)) {
+            err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                    "Option value length error");
+            *optlen = sizeof(uint8_t);
+            goto exit_fn;
+        }
+
+        if (get) {
+            *((uint8_t *) optval) = ep->ptl_ips.ptl->proto.epinfo.ep_sl;
+        }
+        else {
+            uint16_t new_sl;
+
+            /* Sanity check that the SL is within range */
+            new_sl = (uint16_t) *(uint8_t *) optval;
+            if (new_sl > 15) {
+                err = psmi_handle_error(PSMI_EP_LOGEVENT, PSM_PARAM_ERR,
+                                        "Invalid SL value %u.
0 <= SL <= 15.",new_sl); + goto exit_fn; + } + + ep->ptl_ips.ptl->proto.epinfo.ep_sl = (uint8_t) new_sl; + } + } + break; + default: + err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown PSM_IB option %u.", optname); + } + + exit_fn: + return err; +} + +static +psm_error_t +ips_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + return ips_ptl_optctl(component_obj, optname, (void*) optval, &optlen, 0); +} + +static +psm_error_t +ips_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + return ips_ptl_optctl(component_obj, optname, optval, optlen, 1); +} + +psm_error_t __recvpath +ips_ptl_poll(ptl_t *ptl, int _ignored) +{ + const uint64_t current_count = get_cycles(); + const int do_lock = PSMI_PLOCK_DISABLED && + (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD); + psm_error_t err = PSM_OK_NO_PROGRESS; + psm_error_t err2; + + if (!ips_recvhdrq_isempty(&ptl->recvq)) { + if (do_lock && !ips_recvhdrq_trylock(&ptl->recvq)) + return err; + err = ips_recvhdrq_progress(&ptl->recvq); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM_OK_NO_PROGRESS) + return err2; + else + return err; + } + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + * + * It's safe to look at the timer without holding the lock because it's not + * incorrect to be wrong some of the time. + */ + if (psmi_timer_is_expired(&(ptl->timerq), current_count)) { + if (do_lock) + ips_recvhdrq_lock(&ptl->recvq); + err = psmi_timer_process_expired(&(ptl->timerq), current_count); + if (do_lock) + ips_recvhdrq_unlock(&ptl->recvq); + } + + return err; +} + +PSMI_INLINE( +int +ips_try_lock_shared_context (struct ptl_shared *recvshc)) +{ + return pthread_spin_trylock(recvshc->context_lock); +} + +PSMI_INLINE( +void +ips_lock_shared_context (struct ptl_shared *recvshc)) +{ + pthread_spin_lock(recvshc->context_lock); +} + +PSMI_INLINE( +void +ips_unlock_shared_context (struct ptl_shared *recvshc)) +{ + pthread_spin_unlock(recvshc->context_lock); +} + +psm_error_t __recvpath +ips_ptl_shared_poll(ptl_t *ptl, int _ignored) +{ + const uint64_t current_count = get_cycles(); + psm_error_t err = PSM_OK_NO_PROGRESS; + psm_error_t err2; + struct ptl_shared *recvshc = ptl->recvshc; + psmi_assert(recvshc != NULL); + + /* The following header queue checks are speculative (but safe) + * until this process has acquired the lock. The idea is to + * minimize lock contention due to processes spinning on the + * shared context. */ + if (ips_recvhdrq_isempty(&recvshc->recvq)) { + if (!ips_recvhdrq_isempty(&ptl->recvq) && + ips_try_lock_shared_context(recvshc) == 0) { + /* check that subcontext is empty while under lock to avoid + * re-ordering of incoming packets (since packets from + * hardware context will be processed immediately). 
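+ * (The lock-free emptiness tests above only decide whether to try the
+ * lock; the deciding re-check of recvshc->recvq happens here, under
+ * context_lock, so a packet forwarded by another process cannot be
+ * overtaken by one pulled directly from the hardware queue.)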
*/ + if_pt (ips_recvhdrq_isempty(&recvshc->recvq)) { + err = ips_recvhdrq_progress(&ptl->recvq); + } + ips_unlock_shared_context(recvshc); + } + } + + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + + if (!ips_recvhdrq_isempty(&recvshc->recvq)) { + err2 = ips_recvhdrq_progress(&recvshc->recvq); + if (err2 != PSM_OK_NO_PROGRESS) { + err = err2; + } + } + + if_pf (err > PSM_OK_NO_PROGRESS) + return err; + + /* + * Process timer expirations after servicing receive queues (some packets + * may have been acked, some requests-to-send may have been queued). + */ + err2 = psmi_timer_process_if_expired(&(ptl->timerq), current_count); + if (err2 != PSM_OK_NO_PROGRESS) + err = err2; + + return err; +} + +int __recvpath +ips_ptl_recvq_isempty(const ptl_t *ptl) +{ + struct ptl_shared *recvshc = ptl->recvshc; + + if (recvshc != NULL && !ips_recvhdrq_isempty(&recvshc->recvq)) + return 0; + return ips_recvhdrq_isempty(&ptl->recvq); +} + +/* + * Legacy ips_get_stat -- do nothing. + */ +int ips_get_stat(psm_epaddr_t epaddr, ips_sess_stat * stats) +{ + memset(stats, 0, sizeof (ips_sess_stat)); + return 0; +} + +static +psm_error_t +shrecvq_init(ptl_t *ptl, const psmi_context_t *context) +{ + const struct ipath_base_info *base_info = &context->base_info; + const struct ipath_user_info *user_info = &context->user_info; + struct ips_recvhdrq_callbacks recvq_callbacks; + struct ips_recvq_params hdrq, egrq; + psm_error_t err = PSM_OK; + struct ptl_shared *recvshc; + int i; + + psmi_assert_always(user_info->spu_subcontext_cnt > 0); + + recvshc = (struct ptl_shared *) + psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ptl_shared)); + if (recvshc == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + + ptl->recvshc = recvshc; + recvshc->ptl = ptl; + + /* Initialize recvshc fields */ + recvshc->subcontext = base_info->spi_subcontext; + recvshc->subcontext_cnt = user_info->spu_subcontext_cnt; + psmi_assert_always(recvshc->subcontext_cnt <= INFINIPATH_MAX_SUBCONTEXT); + psmi_assert_always(recvshc->subcontext < recvshc->subcontext_cnt); + + if ((err = ips_subcontext_ureg_get(ptl, context, recvshc->subcontext_ureg, + recvshc->subcontext_cnt))) + goto fail; + if ((err = ips_subcontext_ureg_initialize( + ptl, recvshc->subcontext, recvshc->subcontext_ureg[recvshc->subcontext]))) + goto fail; + recvshc->context_lock = &recvshc->subcontext_ureg[0]->context_lock; + + /* Initialize (shared) hardware context recvq (ptl->recvq) */ + /* NOTE: uses recvq in ptl structure for shared h/w context */ + recvhdrq_hw_params(context, &hdrq, &egrq, 0, 0); + recvq_callbacks.callback_packet_unknown = ips_proto_process_unknown; + recvq_callbacks.callback_subcontext = ips_subcontext_process; + recvq_callbacks.callback_error = ips_proto_process_packet_error; + if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto, + &hdrq, &egrq, &recvq_callbacks, + ptl->runtime_flags, recvshc->subcontext, + &ptl->recvq, + &recvshc->subcontext_ureg[0]->recvq_state))) { + goto fail; + } + + /* Initialize software subcontext (recvshc->recvq). Subcontexts do */ + /* not require the rcvhdr copy feature. 
*/
+    recvhdrq_hw_params(context, &hdrq, &egrq, 1, recvshc->subcontext);
+    recvq_callbacks.callback_subcontext = ips_subcontext_ignore;
+    if ((err = ips_recvhdrq_init(context, &ptl->epstate, &ptl->proto,
+                                 &hdrq, &egrq, &recvq_callbacks,
+                                 ptl->runtime_flags & ~IPATH_RUNTIME_RCVHDR_COPY,
+                                 recvshc->subcontext,
+                                 &recvshc->recvq,
+                                 &recvshc->recvq_state))) {
+        goto fail;
+    }
+
+    /* Initialize each recvshc->writeq for shared contexts */
+    for (i = 0; i < recvshc->subcontext_cnt; i++) {
+        recvhdrq_hw_params(context, &hdrq, &egrq, 1, i);
+        if ((err = ips_writehdrq_init(context, &hdrq, &egrq,
+                                      &recvshc->writeq[i],
+                                      &recvshc->subcontext_ureg[i]->writeq_state,
+                                      ptl->runtime_flags & ~IPATH_RUNTIME_RCVHDR_COPY))) {
+            goto fail;
+        }
+    }
+
+    if (err == PSM_OK)
+        _IPATH_DBG("Context sharing in use: lid %d, context %d, sub-context %d\n",
+                   (int) psm_epid_nid(ptl->epid), base_info->spi_context,
+                   recvshc->subcontext);
+fail:
+    return err;
+}
+
+static
+psm_error_t
+shrecvq_fini(ptl_t *ptl)
+{
+    psm_error_t err = PSM_OK;
+    int i;
+
+    /* disable my write header queue before deallocation */
+    i = ptl->recvshc->subcontext;
+    ptl->recvshc->subcontext_ureg[i]->writeq_state.enabled = 0;
+
+    if ((err = ips_recvhdrq_fini(&ptl->recvq)))
+        goto fail;
+
+    if ((err = ips_recvhdrq_fini(&ptl->recvshc->recvq)))
+        goto fail;
+
+    for (i = 0; i < ptl->recvshc->subcontext_cnt; i++) {
+        if ((err = ips_writehdrq_fini(&ptl->recvshc->writeq[i]))) {
+            goto fail;
+        }
+    }
+
+    psmi_free(ptl->recvshc);
+
+fail:
+    return err;
+}
+
+psm_error_t
+ips_ptl_connect(ptl_t *ptl, int numep, const psm_epid_t *array_of_epid,
+                const int *array_of_epid_mask, psm_error_t *array_of_errors,
+                psm_epaddr_t *array_of_epaddr, uint64_t timeout_in)
+{
+    psm_error_t err;
+    psm_ep_t ep;
+    psm_epid_t *epid_array = NULL;
+    psm_error_t *error_array = NULL;
+    psm_epaddr_t *epaddr_array = NULL;
+    int *mask_array = NULL;
+    int i, count;
+
+    PSMI_PLOCK_ASSERT();
+    err = ips_proto_connect(&ptl->proto, numep, array_of_epid,
+                            array_of_epid_mask, array_of_errors,
+                            array_of_epaddr, timeout_in);
+    if (err) return err;
+
+    psmi_assert_always(ptl->ep->mctxt_master == ptl->ep);
+    if (ptl->ep->mctxt_next == ptl->ep) return err;
+
+    /* Make the additional multi-context connections. */
+    epid_array = (psm_epid_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_epid_t)*numep);
+    mask_array = (int *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(int)*numep);
+    error_array = (psm_error_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_error_t)*numep);
+    epaddr_array = (psm_epaddr_t *)
+        psmi_malloc(ptl->ep, UNDEFINED, sizeof(psm_epaddr_t)*numep);
+    if (!epid_array || !mask_array || !error_array || !epaddr_array) {
+        err = PSM_NO_MEMORY;
+        goto fail;
+    }
+
+    count = 0;
+    ep = ptl->ep->mctxt_next;
+    while (ep != ep->mctxt_master) {
+
+        /* Set up the mask array and epid array. */
+        for (i = 0; i < numep; i++) {
+            if (array_of_epid_mask[i]
+                && array_of_errors[i] == PSM_OK
+                && count < array_of_epaddr[i]->mctxt_epcount) {
+                if (ep->gid_hi != array_of_epaddr[i]->mctxt_gidhi[count]) {
+                    mask_array[i] = 0;
+                    _IPATH_INFO("Subnet ID mismatch, ignore...\n");
+                } else {
+                    mask_array[i] = 1;
+                    epid_array[i] = array_of_epaddr[i]->mctxt_epid[count];
+                }
+            } else {
+                mask_array[i] = 0;
+            }
+        }
+
+        /* Make the real protocol connections. */
+        err = ips_proto_connect(&ep->ptl_ips.ptl->proto, numep, epid_array,
+                                mask_array, error_array,
+                                epaddr_array, timeout_in);
+        if (err) goto fail;
+
+        /* Make the epaddr linklist for this peer.
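+         * (Illustrative note: PSM_MCTXT_APPEND is assumed to link the
+         * per-rail epaddrs into a ring headed by the master epaddr;
+         * mctxt_current then picks the rail on which new traffic starts,
+         * and the randomization below spreads that starting rail across
+         * peers.)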
*/
+        for (i = 0; i < numep; i++) {
+            if (!mask_array[i]) continue;
+
+            /* In a rare case, the peer may exit psm_ep_connect() and send a
+             * message before we link this epaddr below.  The message is then
+             * received by this epaddr while its mctxt_master is still itself
+             * (the PSM_MCTXT_APPEND macro below links it and changes the
+             * master), so this epaddr's mctxt_recv_seqnum is incremented
+             * instead of the master's.  When that happens we must fold this
+             * mctxt_recv_seqnum into the master's; otherwise the message
+             * sequence number does not match the master's mctxt_recv_seqnum
+             * and the code hangs.  This only happens on the last rail of a
+             * multi-rail setup.
+             */
+            if (epaddr_array[i]->mctxt_recv_seqnum) {
+                array_of_epaddr[i]->mctxt_recv_seqnum +=
+                    epaddr_array[i]->mctxt_recv_seqnum;
+                epaddr_array[i]->mctxt_recv_seqnum = 0;
+            }
+
+            PSM_MCTXT_APPEND(array_of_epaddr[i], epaddr_array[i]);
+
+            /* Randomize the rail on which traffic starts */
+            if ((random() % (count + 2)) == 0) {
+                array_of_epaddr[i]->mctxt_current = epaddr_array[i];
+            }
+
+            /* Bump the number of slave connections made so far */
+            array_of_epaddr[i]->mctxt_nsconn++;
+        }
+
+        count++;
+        ep = ep->mctxt_next;
+    }
+
+fail:
+    if (epid_array) psmi_free(epid_array);
+    if (mask_array) psmi_free(mask_array);
+    if (error_array) psmi_free(error_array);
+    if (epaddr_array) psmi_free(epaddr_array);
+
+    return err;
+}
+
+psm_error_t
+ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+                   const psm_epaddr_t array_of_epaddr[],
+                   const int array_of_epaddr_mask[],
+                   psm_error_t array_of_errors[], uint64_t timeout_in)
+{
+    psm_error_t err;
+
+    fprintf(stderr, "Aiee! ips_proto_disconnect() called.\n");
+    PSMI_PLOCK_ASSERT();
+    err = ips_proto_disconnect(&ptl->proto, force, numep, array_of_epaddr,
+                               array_of_epaddr_mask, array_of_errors,
+                               timeout_in);
+    return err;
+}
+
+/* Only symbol we expose out of here */
+struct ptl_ctl_init
+psmi_ptl_ips = {
+    ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, ips_ptl_getopt
+};
diff --git a/ptl_ips/ptl_fwd.h b/ptl_ips/ptl_fwd.h
new file mode 100644
index 0000000..08d4c53
--- /dev/null
+++ b/ptl_ips/ptl_fwd.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PTL_FWD_IPS_H +#define _PTL_FWD_IPS_H +#include "ptl.h" + +typedef struct ptl_epaddr ips_epaddr_t; + +/* Symbol in ips ptl */ +struct ptl_ctl_init psmi_ptl_ips; +#endif /* _PTL_FWD_IPS_H */ diff --git a/ptl_ips/ptl_ips.h b/ptl_ips/ptl_ips.h new file mode 100644 index 0000000..5643064 --- /dev/null +++ b/ptl_ips/ptl_ips.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPS_PTL_H +#define _IPS_PTL_H + +#include "psm_user.h" +#include "psm_mq_internal.h" + +#include "ips_proto_params.h" +#include "ips_proto.h" +#include "ips_spio.h" +#include "ips_recvhdrq.h" +#include "ips_writehdrq.h" +#include "ips_epstate.h" +#include "ips_stats.h" +#include "ips_subcontext.h" + +struct ptl_shared; + +/* + * PTL at the ips level (for InfiniPath) + * + * This PTL structure glues all the ips components together. + * + * * ips timer, shared by various components, allows each component to + * schedule time-based expiration callbacks on the timerq. 
+ *   * HW receive queue
+ *   * send control block to handle eager messages
+ *   * instantiation of the ips protocol
+ *   * endpoint state, to map endpoint indexes into structures
+ *
+ * Receive-side
+ *
+ *           ----[ proto ]
+ *          /     ^     ^
+ *         |      |     |
+ *         |   packet packet
+ *         |   known  unknown
+ *  add_endpt \      /
+ *         |   |    |
+ *         `----> [epstate]
+ *                  ^
+ *                  |
+ *            lookup_endpt
+ *                  |
+ *               [recvq]
+ *                  |
+ *                poll
+ *
+ */
+/* Updates to this struct must be reflected in PTL_IPS_SIZE in ptl_fwd.h */
+/* IPS knows it functions as a PTL whenever ptl->ep is non-NULL */
+struct ptl {
+    psm_ep_t ep;                    /* back ptr */
+    psm_epid_t epid;                /* cached from ep */
+    psm_epaddr_t epaddr;            /* cached from ep */
+    ips_epaddr_t *ipsaddr;          /* cached from epaddr */
+    ptl_ctl_t *ctl;                 /* cached from init */
+    const psmi_context_t *context;  /* cached from init */
+
+    struct ips_spio spioc;          /* PIO send control */
+    struct ips_proto proto;         /* protocol instance: timerq, epstate, spio */
+
+    /* Receive header queue and receive queue processing */
+    uint32_t runtime_flags;
+    struct psmi_timer_ctrl timerq;
+    struct ips_epstate epstate;     /* map incoming packets */
+    struct ips_recvhdrq_state recvq_state;
+    struct ips_recvhdrq recvq;      /* HW recvq: epstate, proto */
+
+    /* timer to check the context's status */
+    struct psmi_timer status_timer;
+
+    /* context's status check timeout in cycles -- cached */
+    uint64_t status_cyc_timeout;
+
+    /* Shared-context state */
+    struct ptl_shared *recvshc;
+
+    /* Rcv thread context */
+    struct ptl_rcvthread *rcvthread;
+};
+
+/*
+ * State for shared contexts.
+ *
+ * In shared mode, the hardware queue is serviced by more than one process.
+ * Each process also mirrors the hardware queue in software (represented by
+ * an ips_recvhdrq).  For packets we service in the hardware queue that are
+ * not destined for us, we write them into the other processes' receive
+ * queues (represented by an ips_writehdrq).
+ */
+struct ptl_shared {
+    ptl_t *ptl;                 /* backptr to main ptl */
+    uint32_t subcontext;
+    uint32_t subcontext_cnt;
+
+    pthread_spinlock_t *context_lock;
+    struct ips_subcontext_ureg *subcontext_ureg[INFINIPATH_MAX_SUBCONTEXT];
+    struct ips_recvhdrq recvq;  /* subcontext receive queue */
+    struct ips_recvhdrq_state recvq_state; /* subcontext receive queue state */
+    struct ips_writehdrq writeq[INFINIPATH_MAX_SUBCONTEXT]; /* peer subcontexts */
+};
+
+/*
+ * Connect/disconnect are wrappers around psm proto's connect/disconnect,
+ * mostly to abstract away PSM-specific stuff from ips internal structures
+ */
+psm_error_t ips_ptl_connect(ptl_t *ptl, int numep,
+                            const psm_epid_t *array_of_epid,
+                            const int *array_of_epid_mask,
+                            psm_error_t *array_of_errors,
+                            psm_epaddr_t *array_of_epaddr,
+                            uint64_t timeout_in);
+
+psm_error_t ips_ptl_disconnect(ptl_t *ptl, int force, int numep,
+                               const psm_epaddr_t array_of_epaddr[],
+                               const int array_of_epaddr_mask[],
+                               psm_error_t array_of_errors[],
+                               uint64_t timeout_in);
+
+/*
+ * Generic Poll function for ips-level ptl
+ */
+psm_error_t ips_ptl_poll(ptl_t *ptl, int _ignored);
+psm_error_t ips_ptl_shared_poll(ptl_t *ptl, int _ignored);
+
+/*
+ * Support for receive thread
+ */
+psm_error_t ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq);
+psm_error_t ips_ptl_rcvthread_fini(ptl_t *ptl);
+
+#endif /* _IPS_PTL_H */
diff --git a/ptl_ips/ptl_rcvthread.c b/ptl_ips/ptl_rcvthread.c
new file mode 100644
index 0000000..5c30c7a
--- /dev/null
+++ b/ptl_ips/ptl_rcvthread.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2006-2012. QLogic Corporation.
All rights reserved.
+ * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *   - Redistributions of source code must retain the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials
+ *     provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/poll.h>
+
+#include "ptl_ips.h"
+#include "ips_proto.h"
+#include "ips_proto_internal.h"
+#include "ips_recvhdrq.h"
+
+/* All frequencies are in polls per second */
+#define RCVTHREAD_TO_MIN_FREQ   10      /* min of 10 polls per sec */
+#define RCVTHREAD_TO_MAX_FREQ   100     /* max of 100 polls per sec */
+#define RCVTHREAD_TO_SHIFT      1
+
+struct ptl_rcvthread;
+
+static void *ips_ptl_pollintr(void *recvthreadc);
+static psm_error_t rcvthread_initstats(ptl_t *ptl);
+static psm_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc);
+
+struct ptl_rcvthread {
+    const psmi_context_t *context;
+    const ptl_t *ptl;
+    struct ips_recvhdrq *recvq;
+
+    pthread_t hdrq_threadid;
+    uint64_t t_start_cyc;
+    int pipefd[2];
+
+    /* stats and some for scheduling */
+    uint64_t pollcnt;
+    uint64_t pollcnt_to;
+    uint64_t pollcyc;
+    uint64_t pollok;
+
+    /* For scheduling interrupt thread */
+    int timeout_period_min;
+    int timeout_period_max;
+    int timeout_shift;
+    uint64_t pollok_last;
+    uint64_t pollcnt_last;
+    uint32_t last_timeout;
+};
+
+/*
+ * The receive thread knows about the ptl interface, so it can muck with it
+ * directly.
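+ *
+ * (Scheduling summary: the timeout fields above implement an adaptive poll
+ * period; rcvthread_next_timeout() below shrinks last_timeout by
+ * timeout_shift bits after a productive interval and grows it after an
+ * idle one, roughly bounded by timeout_period_min and timeout_period_max.)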
+ */ +psm_error_t +ips_ptl_rcvthread_init(ptl_t *ptl, struct ips_recvhdrq *recvq) +{ + psm_error_t err = PSM_OK; + struct ptl_rcvthread *rcvc; + + ptl->rcvthread = + psmi_calloc(ptl->ep, UNDEFINED, 1, sizeof(struct ptl_rcvthread)); + if (ptl->rcvthread == NULL) { + err = PSM_NO_MEMORY; + goto fail; + } + rcvc = ptl->rcvthread; + + rcvc->recvq = recvq; + rcvc->ptl = ptl; + rcvc->context = ptl->context; + rcvc->t_start_cyc = get_cycles(); + + if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + + if ((err = rcvthread_initsched(rcvc))) + goto fail; + + /* Create a pipe so we can synchronously terminate the thread */ + if (pipe(rcvc->pipefd) != 0) { + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Cannot create a pipe for receive thread: %s\n", + strerror(errno)); + goto fail; + } + + if (pthread_create(&rcvc->hdrq_threadid, NULL, + ips_ptl_pollintr, ptl->rcvthread)) + { + close(rcvc->pipefd[0]); + close(rcvc->pipefd[1]); + err = psmi_handle_error(ptl->ep, PSM_EP_DEVICE_FAILURE, + "Cannot start receive thread: %s\n", strerror(errno)); + goto fail; + } + + } + + if ((err = rcvthread_initstats(ptl))) + goto fail; + +fail: + return err; +} + +psm_error_t +ips_ptl_rcvthread_fini(ptl_t *ptl) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) ptl->rcvthread; + uint64_t t_now; + double t_cancel_us; + psm_error_t err = PSM_OK; + + PSMI_PLOCK_ASSERT(); + + if (ptl->rcvthread == NULL) + return err; + + if (ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD) { + t_now = get_cycles(); + + /* Disable interrupts then kill the receive thread */ + if (psmi_context_interrupt_isenabled((psmi_context_t *) ptl->context)) + if ((err = psmi_context_interrupt_set((psmi_context_t *) ptl->context, 0))) + goto fail; + + /* Close the pipe so we can have the thread synchronously exit. + On Linux just closing the pipe does not wake up the receive + thread. 
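The shutdown handshake in the code above and below is worth seeing in isolation: the fini path disables interrupts, writes a token into the pipe, closes the write end, and only then joins the thread, so the thread exits at a known point in its poll loop rather than being cancelled mid-update. The following is a minimal standalone sketch of that idiom; all names are illustrative and none of this is code from this source tree:

    /* Pipe-based synchronous thread shutdown, reduced to its essentials. */
    #include <poll.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int pipefd[2];

    static void *rcv_worker(void *arg)
    {
        struct pollfd pfd = { .fd = pipefd[0], .events = POLLIN, .revents = 0 };
        for (;;) {
            int ret = poll(&pfd, 1, 55 /* ms; adapted at runtime in the real thread */);
            if (ret > 0 && pfd.revents) {   /* token or POLLHUP: exit request */
                close(pipefd[0]);
                return NULL;
            }
            /* ret == 0: timed out; the real thread would service the recvq here */
        }
    }

    int main(void)
    {
        pthread_t tid;
        uint64_t token = 1;

        if (pipe(pipefd) != 0 || pthread_create(&tid, NULL, rcv_worker, NULL) != 0)
            return 1;
        /* Write a token first -- as the comment above notes, closing the pipe
         * alone may not wake a thread blocked in poll(). */
        if (write(pipefd[1], &token, sizeof token) == -1)
            perror("write");
        close(pipefd[1]);
        pthread_join(tid, NULL);
        puts("receive thread exited synchronously");
        return 0;
    }

This is presumably why the code prefers a pipe over pthread_cancel(): the worker only ever leaves its loop between poll iterations, so no lock or queue is left half-updated.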
+ */ + if (write(rcvc->pipefd[1], (const void*) &t_now, + sizeof(uint64_t)) == -1 || + close(rcvc->pipefd[1]) == -1) { + _IPATH_VDBG("unable to close pipe to receive thread cleanly\n"); + } + pthread_join(rcvc->hdrq_threadid, NULL); + t_cancel_us = (double) cycles_to_nanosecs(get_cycles() - t_now) / 1e3; + + _IPATH_PRDBG("rcvthread poll success %lld/%lld times, " + "thread cancelled in %.3f us\n", (long long) rcvc->pollok, + (long long) rcvc->pollcnt, t_cancel_us); + + } + + psmi_free(ptl->rcvthread); + +fail: + return err; +} + +psm_error_t +rcvthread_initsched(struct ptl_rcvthread *rcvc) +{ + union psmi_envvar_val env_to; + char buf[192]; + char *rcv_freq = buf; + int no_timeout = 0; + int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, + RCVTHREAD_TO_SHIFT }; + snprintf(buf, sizeof buf - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + buf[sizeof buf - 1] = '\0'; + + if (!psmi_getenv("PSM_RCVTHREAD_FREQ", + "Thread timeouts (per sec) ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val) rcv_freq, &env_to)) + { + /* not using default values */ + int nparsed = psmi_parse_str_tuples(env_to.e_str, 3, tvals); + int invalid = 0; + + if (nparsed < 1 || (nparsed > 0 && tvals[0] == 0) || + (nparsed > 1 && tvals[1] == 0)) + { + no_timeout = 1; + } + else { + if (nparsed > 0 && tvals[0] > 1000) + invalid = 1; + if (nparsed > 1 && (tvals[1] > 1000 || tvals[1] < tvals[0])) + invalid = 1; + if (nparsed > 2 && tvals[2] > 10) + invalid = 1; + } + + if (invalid) { + _IPATH_INFO("Overriding invalid request for RcvThread frequency" + " settings of %s to be <%d:%d:%d>\n", + env_to.e_str, RCVTHREAD_TO_MIN_FREQ, + RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); + tvals[0] = RCVTHREAD_TO_MIN_FREQ; + tvals[1] = RCVTHREAD_TO_MAX_FREQ; + tvals[2] = RCVTHREAD_TO_SHIFT; + } + } + + if (no_timeout) { + rcvc->last_timeout = -1; + _IPATH_PRDBG("PSM_RCVTHREAD_FREQ set to only interrupt " + "(no timeouts)\n"); + } + else { + /* Convert freq to period in microseconds (for poll()) */ + rcvc->timeout_period_max = 1000 / tvals[0]; + rcvc->timeout_period_min = 1000 / tvals[1]; + rcvc->timeout_shift = tvals[2]; + /* Start in the middle of min and max */ + rcvc->last_timeout = (rcvc->timeout_period_min + + rcvc->timeout_period_max) / 2; + _IPATH_PRDBG("PSM_RCVTHREAD_FREQ converted to period " + "min=%dms,max=%dms,shift=%d\n", + rcvc->timeout_period_min, rcvc->timeout_period_max, + rcvc->timeout_shift); + } + return PSM_OK; +} + +static +int +rcvthread_next_timeout(struct ptl_rcvthread *rcvc) +{ + uint64_t pollok_diff = rcvc->pollok - rcvc->pollok_last; + + if (pollok_diff > 0) { + if (rcvc->last_timeout > rcvc->timeout_period_min) + /* By default, be less aggressive, but there's a more aggressive + * alternative if need be */ +#if 1 + rcvc->last_timeout >>= rcvc->timeout_shift; +#else + rcvc->last_timeout = rcvc->timeout_period_min; +#endif + } + else { /* we had less progress */ + if (rcvc->last_timeout < rcvc->timeout_period_max) + rcvc->last_timeout <<= rcvc->timeout_shift; + } + + rcvc->pollok_last = rcvc->pollok; + rcvc->pollcnt_last = rcvc->pollcnt; + return (int) rcvc->last_timeout; +} + +extern int ips_in_rcvthread; + +/* + * Receiver thread support. + * + * By default, polling in the driver asks the chip to generate an interrupt on + * every packet. 
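The rcvthread_next_timeout() logic above is a simple multiplicative backoff: while timed-out polls keep finding work, the period is shifted down toward timeout_period_min; while they find nothing, it is shifted up toward timeout_period_max, with the bounds and shift taken from the PSM_RCVTHREAD_FREQ min:max:shift tuple. A hedged standalone sketch of just that policy (hypothetical names, not the library's API):

    #include <stdint.h>
    #include <stdio.h>

    struct rcv_sched {
        int min_ms, max_ms, shift;   /* derived from PSM_RCVTHREAD_FREQ */
        uint32_t timeout_ms;         /* current poll() timeout */
        uint64_t ok, ok_last;        /* successful-progress counters */
    };

    static int next_timeout(struct rcv_sched *s)
    {
        if (s->ok > s->ok_last) {            /* progress since last check: poll sooner */
            if (s->timeout_ms > (uint32_t) s->min_ms)
                s->timeout_ms >>= s->shift;
        } else {                             /* idle: back off */
            if (s->timeout_ms < (uint32_t) s->max_ms)
                s->timeout_ms <<= s->shift;
        }
        s->ok_last = s->ok;
        return (int) s->timeout_ms;
    }

    int main(void)
    {
        /* Defaults of 10..100 polls/sec give periods of 100..10 ms; start mid-way. */
        struct rcv_sched s = { 10, 100, 1, (10 + 100) / 2, 0, 0 };
        for (int i = 0; i < 6; i++) {
            if (i < 2)
                s.ok++;                      /* pretend the first polls made progress */
            printf("poll timeout -> %d ms\n", next_timeout(&s));
        }
        return 0;
    }

Note that, as in the original, the bound is checked before the shift is applied, so a single step can land slightly outside the [min,max] window; with the default shift of 1 this is harmless.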
When the driver supports POLLURG we can switch the poll mode + * to one that requests interrupts only for packets that contain an urgent bit + * (and optionally enable interrupts for hdrq overflow events). When poll + * returns an event, we *try* to make progress on the receive queue but simply + * go back to sleep if we notice that the main thread is already making + * progress. + */ +static +void * +ips_ptl_pollintr(void *rcvthreadc) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) rcvthreadc; + struct ips_recvhdrq *recvq = rcvc->recvq; + psmi_context_t *context = (psmi_context_t *) rcvc->context; + int fd_dev = context->fd; + int fd_pipe = rcvc->pipefd[0]; + psm_ep_t ep = context->ep; + struct pollfd pfd[2]; + int ret; + int next_timeout = rcvc->last_timeout; + uint64_t t_cyc; + psm_error_t err; + + /* No reason to have many of these, keep this as a backup in case the + * recvhdrq init function is misused */ + psmi_assert_always((recvq->runtime_flags & PSMI_RUNTIME_RCVTHREAD)); + + /* Switch driver to a mode where it can interrupt on urgent packets */ + if (psmi_context_interrupt_set((psmi_context_t *) + rcvc->context, 1) == PSM_EP_NO_RESOURCES) { + _IPATH_PRDBG("ipath_poll_type feature not present in driver, turning " + "off internal progress thread\n"); + return NULL; + } + + _IPATH_PRDBG("Enabled communication thread on URG packets\n"); + + while (1) { + pfd[0].fd = fd_dev; + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = fd_pipe; + pfd[1].events = POLLIN; + pfd[1].revents = 0; + + ret = poll(pfd, 2, next_timeout); + t_cyc = get_cycles(); + + if_pf (ret < 0) { + if (errno == EINTR) + _IPATH_DBG("got signal, keep polling\n"); + else + psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, + "Receive thread poll() error: %s", strerror(errno)); + } + else if (pfd[1].revents) { + /* Any type of event on this fd means exit, should be POLLHUP */ + _IPATH_DBG("close thread: revents=0x%x\n", pfd[1].revents); + close(fd_pipe); + break; + } + else { + rcvc->pollcnt++; + + if (ret == 0 || pfd[0].revents & (POLLIN | POLLERR)) { + if (PSMI_PLOCK_DISABLED) { + /* We do this check without acquiring the lock, no sense to + * adding the overhead and it doesn't matter if we're + * wrong. */ + if (ips_recvhdrq_isempty(recvq)) + continue; + if (!ips_recvhdrq_trylock(recvq)) + continue; + err = ips_recvhdrq_progress(recvq); + if (err == PSM_OK) + rcvc->pollok++; + else + rcvc->pollcyc += get_cycles() - t_cyc; + ips_recvhdrq_unlock(recvq); + } + else if (!PSMI_PLOCK_TRY()) { + /* If we time out, we service shm and ipath. If not, we + * assume to have received an ipath interrupt and service + * only ipath. + */ + err = psmi_poll_internal(ep, + ret == 0 ? 
PSMI_TRUE : PSMI_FALSE); + + if (err == PSM_OK) { + rcvc->pollok++; + /* + if (rcvc->pollok % 1000 == 0 && rcvc->pollok >= 1000) + _IPATH_INFO("pollok = %lld\n", (unsigned long long)rcvc->pollok); + */ + } + else + rcvc->pollcyc += get_cycles() - t_cyc; + PSMI_PUNLOCK(); + } + } + + if (ret == 0) { /* change timeout only on timed out poll */ + rcvc->pollcnt_to++; + next_timeout = rcvthread_next_timeout(rcvc); + } + } + } + + return NULL; +} + +static uint64_t +rcvthread_stats_pollok(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) context; + double ratio = 0.0; + uint64_t ratio_u; + if (rcvc->pollcnt > 0) + ratio = (double) rcvc->pollok * 100.0 / rcvc->pollcnt; + memcpy(&ratio_u, &ratio, sizeof(uint64_t)); + return ratio_u; +} + +static uint64_t +rcvthread_stats_pollcyc(void *context) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) context; + /* log in milliseconds */ + return (uint64_t) ((double) cycles_to_nanosecs(rcvc->pollcyc) / 1.0e6); +} + +static psm_error_t +rcvthread_initstats(ptl_t *ptl) +{ + struct ptl_rcvthread *rcvc = (struct ptl_rcvthread *) ptl->rcvthread; + struct psmi_stats_entry entries[] = { + PSMI_STATS_DECL("intrthread schedule count", + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt), + PSMI_STATS_DECL("intrthread schedule success (%)", + MPSPAWN_STATS_REDUCTION_ALL | + MPSPAWN_STATS_TYPE_DOUBLE, + rcvthread_stats_pollok, NULL), + PSMI_STATS_DECL("intrthread timeout count", + MPSPAWN_STATS_REDUCTION_ALL | MPSPAWN_STATS_SKIP_IF_ZERO, + NULL, &rcvc->pollcnt_to), + PSMI_STATS_DECL("intrthread wasted time (ms)", + MPSPAWN_STATS_REDUCTION_ALL, + rcvthread_stats_pollcyc, NULL) + }; + + /* If we don't want a thread, make sure we still initialize the counters + * but set them to NaN instead */ + if (!(ptl->runtime_flags & PSMI_RUNTIME_RCVTHREAD)) { + int i; + static uint64_t ctr_nan = MPSPAWN_NAN; + for (i = 0; i < (int) PSMI_STATS_HOWMANY(entries); i++) { + entries[i].getfn = NULL; + entries[i].u.val = &ctr_nan; + } + } + + return psmi_stats_register_type(PSMI_STATS_NO_HEADING, + PSMI_STATSTYPE_RCVTHREAD, + entries, + PSMI_STATS_HOWMANY(entries), + rcvc); +} diff --git a/ptl_self/Makefile b/ptl_self/Makefile new file mode 100644 index 0000000..3b41f54 --- /dev/null +++ b/ptl_self/Makefile @@ -0,0 +1,45 @@ +# Copyright (c) 2006-2010. QLogic Corporation. All rights reserved. +# Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + +include $(top_srcdir)/buildflags.mak +INCLUDES += -I$(top_srcdir) + +${TARGLIB}-objs := ptl.o + +all: ${${TARGLIB}-objs} + +%.o: %.c + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + +clean: + rm -f *.o + diff --git a/ptl_self/ptl.c b/ptl_self/ptl.c new file mode 100644 index 0000000..bac2d58 --- /dev/null +++ b/ptl_self/ptl.c @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2013. Intel Corporation. All rights reserved. + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file implements the PSM PTL for self (loopback) + */ + +#include "psm_user.h" +#include "psm_mq_internal.h" + +struct ptl { + psm_ep_t ep; + psm_epid_t epid; + psm_epaddr_t epaddr; + ptl_ctl_t *ctl; +}; + +static +psm_error_t __fastpath +ptl_handle_rtsmatch(psm_mq_req_t recv_req, int was_posted) +{ + psm_mq_req_t send_req = (psm_mq_req_t) recv_req->ptl_req_ptr; + + if (recv_req->recv_msglen > 0) { + PSM_VALGRIND_DEFINE_MQ_RECV(recv_req->buf, recv_req->buf_len, + recv_req->recv_msglen); + VALGRIND_MAKE_MEM_DEFINED(send_req->buf, send_req->buf_len); + VALGRIND_MAKE_MEM_DEFINED(send_req->buf, recv_req->recv_msglen); + + psmi_mq_mtucpy(recv_req->buf, send_req->buf, recv_req->recv_msglen); + } + + psmi_mq_handle_rts_complete(recv_req); + + /* If the send is already marked complete, that's because it was internally + * buffered. 
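A send can be "already complete" here only because self_mq_send_testwait() (below) bought the blocked sender out of the deadlock: it copied the payload into a system buffer and marked the request complete, while the request object itself stays alive until this match fires. The trick in miniature, with hypothetical names and plain malloc/free standing in for the psmi_* system-buffer calls of the real code:

    #include <assert.h>
    #include <stdlib.h>
    #include <string.h>

    struct self_send {
        void  *buf;        /* user buffer, later the system copy */
        size_t len;
        int    complete;   /* what a blocking send waits on */
        int    buffered;   /* buf now points at a copy we own */
    };

    /* Receive not posted yet and the sender is blocking: buffer and complete. */
    static int buffer_send(struct self_send *s)
    {
        if (s->buf != NULL && s->len > 0) {
            void *copy = malloc(s->len);
            if (copy == NULL)
                return -1;                 /* PSM_NO_MEMORY upstream */
            memcpy(copy, s->buf, s->len);
            s->buf = copy;
            s->buffered = 1;
        }
        s->complete = 1;                   /* sender may return; request stays live */
        return 0;
    }

    /* The matching receive finally arrives: drain and free the copy. */
    static void match_recv(struct self_send *s, void *dst)
    {
        if (s->len > 0)
            memcpy(dst, s->buf, s->len);
        if (s->buffered)
            free(s->buf);                  /* psmi_mq_sysbuf_free() upstream */
    }

    int main(void)
    {
        char payload[] = "hello", out[8] = { 0 };
        struct self_send s = { payload, sizeof payload, 0, 0 };

        assert(buffer_send(&s) == 0 && s.complete);
        payload[0] = 'X';                  /* sender reuses its buffer... */
        match_recv(&s, out);
        assert(strcmp(out, "hello") == 0); /* ...receiver still sees the copy */
        return 0;
    }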
*/
+    if (send_req->state == MQ_STATE_COMPLETE) {
+        psmi_mq_stats_rts_account(send_req);
+        if (send_req->buf != NULL && send_req->send_msglen > 0)
+            psmi_mq_sysbuf_free(send_req->mq, send_req->buf);
+        psmi_mq_req_free(send_req); /* req was left "live" even though the
+                                     * sender was told that the send was done */
+    }
+    else
+        psmi_mq_handle_rts_complete(send_req);
+
+    _IPATH_VDBG("[self][complete][b=%p][sreq=%p][rreq=%p]\n",
+                recv_req->buf, send_req, recv_req);
+    return PSM_OK;
+}
+
+static
+psm_error_t
+self_mq_send_testwait(psm_mq_req_t *ireq, int istest, psm_mq_status_t *status)
+{
+    uint8_t *ubuf;
+    psm_mq_req_t req = *ireq;
+
+    PSMI_PLOCK_ASSERT();
+
+    /* We're waiting on a send request, and the matching receive has not been
+     * posted yet. This is a deadlock condition in MPI but we accommodate it
+     * here in the "self ptl" by using system-allocated memory.
+     */
+    req->testwait_callback = NULL; /* no more calls here */
+
+    ubuf = req->buf;
+    if (ubuf != NULL && req->send_msglen > 0) {
+        req->buf = psmi_mq_sysbuf_alloc(req->mq, req->send_msglen);
+        if (req->buf == NULL)
+            return PSM_NO_MEMORY;
+        psmi_mq_mtucpy(req->buf, ubuf, req->send_msglen);
+    }
+
+    /* Mark it complete but don't free the req, it's freed when the receiver
+     * does the match */
+    req->state = MQ_STATE_COMPLETE;
+    *ireq = PSM_MQ_REQINVALID;
+
+    if (status != NULL)
+        mq_status_copy(req, status);
+    return PSM_OK;
+}
+
+/* Self is different. We do everything as rendezvous. */
+static
+psm_error_t __fastpath
+self_mq_isend(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags,
+              uint64_t tag, const void *ubuf, uint32_t len, void *context,
+              psm_mq_req_t *req_o)
+{
+    psm_mq_req_t send_req;
+    psm_mq_req_t recv_req;
+    int rc;
+
+    send_req = psmi_mq_req_alloc(mq, MQE_TYPE_SEND);
+    if_pf (send_req == NULL)
+        return PSM_NO_MEMORY;
+
+    rc = psmi_mq_handle_rts(mq, tag, (uintptr_t) ubuf, len, epaddr,
+                            ptl_handle_rtsmatch, &recv_req);
+    send_req->buf = (void *) ubuf;
+    send_req->send_msglen = len;
+    send_req->context = context;
+    recv_req->ptl_req_ptr = (void *) send_req;
+    if (rc == MQ_RET_MATCH_OK)
+        ptl_handle_rtsmatch(recv_req, 1);
+    else
+        send_req->testwait_callback = self_mq_send_testwait;
+
+    _IPATH_VDBG("[self][b=%p][m=%d][t=%"PRIx64"][match=%s][req=%p]\n",
+                ubuf, len, tag, rc == MQ_RET_MATCH_OK ?
"YES" : "NO", send_req); + *req_o = send_req; + return PSM_OK; +} + +static __fastpath +psm_error_t +self_mq_send(psm_mq_t mq, psm_epaddr_t epaddr, uint32_t flags, + uint64_t tag, const void *ubuf, uint32_t len) +{ + psm_error_t err; + psm_mq_req_t req; + err = self_mq_isend(mq,epaddr,flags,tag,ubuf,len,NULL,&req); + psmi_mq_wait_internal(&req); + return err; +} + +static +psm_error_t +self_connect(ptl_t *ptl, + int numep, + const psm_epid_t array_of_epid[], + const int array_of_epid_mask[], + psm_error_t array_of_errors[], + psm_epaddr_t array_of_epaddr[], + uint64_t timeout_ns) +{ + psmi_assert_always(ptl->epaddr != NULL); + psm_epaddr_t epaddr; + psm_error_t err = PSM_OK; + int i; + + PSMI_PLOCK_ASSERT(); + + for (i = 0; i < numep; i++) { + if (!array_of_epid_mask[i]) + continue; + + if (array_of_epid[i] == ptl->epid) { + epaddr = psmi_epid_lookup(ptl->ep, ptl->epid); + psmi_assert_always(epaddr == NULL); + array_of_epaddr[i] = ptl->epaddr; + array_of_epaddr[i]->ptl = ptl; + array_of_epaddr[i]->ptlctl = ptl->ctl; + array_of_epaddr[i]->epid = ptl->epid; + array_of_epaddr[i]->ep = ptl->ep; + if (psmi_epid_set_hostname(psm_epid_nid(ptl->epid), + psmi_gethostname(), 0)) { + err = PSM_NO_MEMORY; + goto fail; + } + psmi_epid_add(ptl->ep, ptl->epid, ptl->epaddr); + array_of_errors[i] = PSM_OK; + } + else { + array_of_epaddr[i] = NULL; + array_of_errors[i] = PSM_EPID_UNREACHABLE; + } + } + +fail: + return err; +} + +#if 0 +static +psm_error_t +self_disconnect(ptl_t *ptl, int numep, + const psm_epaddr_t array_of_epaddr[], + int array_of_epaddr_mask[], + int force, uint64_t timeout_ns) +{ + int i; + for (i = 0; i < numep; i++) { + if (array_of_epaddr_mask[i] == 0) + continue; + + if (array_of_epaddr[i] == ptl->epaddr) + array_of_epaddr_mask[i] = 1; + else + array_of_epaddr_mask[i] = 0; + } + return PSM_OK; +} +#endif + +static +size_t +self_ptl_sizeof(void) +{ + return sizeof(ptl_t); +} + +static +psm_error_t +self_ptl_init(const psm_ep_t ep, ptl_t *ptl, ptl_ctl_t *ctl) +{ + psmi_assert_always(ep != NULL); + psmi_assert_always(ep->epaddr != NULL); + psmi_assert_always(ep->epid != 0); + + ptl->ep = ep; + ptl->epid = ep->epid; + ptl->epaddr = ep->epaddr; + ptl->ctl = ctl; + ep->epaddr->mctxt_prev = ep->epaddr; + ep->epaddr->mctxt_next = ep->epaddr; + ep->epaddr->mctxt_master = ep->epaddr; + + memset(ctl, 0, sizeof(*ctl)); + /* Fill in the control structure */ + ctl->ptl = ptl; + ctl->ep_poll = NULL; + ctl->ep_connect = self_connect; + ctl->ep_disconnect = NULL; + + ctl->mq_send = self_mq_send; + ctl->mq_isend = self_mq_isend; + + /* No stats in self */ + ctl->epaddr_stats_num = NULL; + ctl->epaddr_stats_init = NULL; + ctl->epaddr_stats_get = NULL; + + return PSM_OK; +} + +static +psm_error_t +self_ptl_fini(ptl_t *ptl, int force, uint64_t timeout_ns) +{ + return PSM_OK; /* nothing to do */ +} + +static +psm_error_t +self_ptl_setopt(const void *component_obj, int optname, + const void *optval, uint64_t optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown SELF ptl option %u.", optname); +} + +static +psm_error_t +self_ptl_getopt(const void *component_obj, int optname, + void *optval, uint64_t *optlen) +{ + /* No options for SELF PTL at the moment */ + return psmi_handle_error(NULL, PSM_PARAM_ERR, "Unknown SELF ptl option %u.", optname); +} + +/* Only symbol we expose out of here */ +struct ptl_ctl_init +psmi_ptl_self = { + self_ptl_sizeof, self_ptl_init, self_ptl_fini,self_ptl_setopt,self_ptl_getopt +}; diff --git a/ptl_self/ptl_fwd.h 
b/ptl_self/ptl_fwd.h new file mode 100644 index 0000000..ff79c7e --- /dev/null +++ b/ptl_self/ptl_fwd.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2006-2012. QLogic Corporation. All rights reserved. + * Copyright (c) 2003-2006, PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _PTL_FWD_SELF_H +#define _PTL_FWD_SELF_H + +/* Symbol in am ptl */ +struct ptl_ctl_init psmi_ptl_self; + +#endif + -- cgit v1.2.3 From 0f8c608a1b128c42b75aa8fac35d3df4a3d57e57 Mon Sep 17 00:00:00 2001 From: Christoph Biedl Date: Sun, 16 Oct 2022 04:18:17 -0700 Subject: Import infinipath-psm_3.3+20.604758e7-6.2.debian.tar.xz [dgit import tarball infinipath-psm 3.3+20.604758e7-6.2 infinipath-psm_3.3+20.604758e7-6.2.debian.tar.xz] --- README.source | 58 +++ changelog | 110 ++++++ compat | 1 + control | 56 +++ copyright | 79 ++++ gbp.conf | 4 + libpsm-infinipath1-dev.install | 2 + libpsm-infinipath1.install | 2 + libpsm-infinipath1.postinst.in | 10 + libpsm-infinipath1.postrm.in | 13 + libpsm-infinipath1.prerm.in | 13 + libpsm-infinipath1.symbols | 432 +++++++++++++++++++++ .../0001-Fix-truncation-warnings-with-gcc7.patch | 45 +++ ...-sysmacros.h-to-avoid-warning-about-minor.patch | 31 ++ patches/0003-gcc8.patch | 29 ++ patches/0004-gcc-11-warning.patch | 22 ++ patches/series | 4 + rules | 61 +++ source/format | 1 + source/options | 1 + watch | 3 + 21 files changed, 977 insertions(+) create mode 100644 README.source create mode 100644 changelog create mode 100644 compat create mode 100644 control create mode 100644 copyright create mode 100644 gbp.conf create mode 100644 libpsm-infinipath1-dev.install create mode 100644 libpsm-infinipath1.install create mode 100644 libpsm-infinipath1.postinst.in create mode 100644 libpsm-infinipath1.postrm.in create mode 100644 libpsm-infinipath1.prerm.in create mode 100644 libpsm-infinipath1.symbols create mode 100644 patches/0001-Fix-truncation-warnings-with-gcc7.patch create mode 100644 patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch create mode 100644 patches/0003-gcc8.patch create mode 100644 patches/0004-gcc-11-warning.patch create mode 100644 patches/series create mode 100755 rules create mode 100644 source/format create mode 100644 source/options create 
mode 100644 watch

diff --git a/README.source b/README.source
new file mode 100644
index 0000000..fd5b4ef
--- /dev/null
+++ b/README.source
@@ -0,0 +1,58 @@
+This package is maintained from the upstream git repository located at
+https://github.com/intel/psm.git
+using the DEP-14 (http://dep.debian.net/deps/dep14/) layout/workflow.
+
+New versions should usually be built from the debian/master branch.
+There is a d/gbp.conf, so building with 'gbp buildpackage' is supported.
+The upstream branch is master (there are old upstream branches called upstream
+and upstream-branch that were used for intermediate package versions but are now
+obsolete).
+
+Patches are managed within the patch-queue/debian/master branch. This branch
+should always be based on the (upstream) master branch. The contents of the
+d/patches directory are then auto-generated using:
+$ gbp pq export
+
+To clone this repo use:
+$ gbp clone --pristine-tar git@salsa.debian.org:hpc-team/infinipath-psm.git
+
+To build the package after cloning:
+$ gbp buildpackage
+
+To be able to receive new upstream releases/commits, after cloning, you need to
+add the upstream repo address as a 'git remote' as follows:
+$ git remote add upstream https://github.com/intel/psm.git
+Then you can pull upstream changes as follows:
+$ git branch master
+$ git pull upstream
+
+Handling new upstream releases (e.g. for new release 3.3+20.604758e7):
+- Set an upstream tag (Until upstream tags their releases, use the latest
+  upstream commit and tag it according to the scheme 3.3+<n>.<sha1>,
+  where n should be incremented by 1 for a new debian release
+  with upstream changes and sha1 is the short form of the SHA-1 object name of
+  that commit):
+  $ git tag upstream/3.3+20.604758e7
+- Rebase patch-queue branch (if patches currently exist)
+  $ git checkout patch-queue/debian/master
+  $ git rebase -i upstream/3.3+20.604758e7
+  Review patches, possibly fix conflicts, when done:
+  $ gbp pq export
+  This puts you into the debian/master branch automatically. Review the changes
+  and commit.
+- Merge the new upstream release into the debian/master branch:
+  $ git checkout debian/master (probably you're already there)
+  $ git merge upstream/3.3+20.604758e7
+- Adjust debian files for new release ...
+- Set debian release tag when done (assuming debian version 3.3+20.604758e7-1)
+  $ git tag debian/3.3+20.604758e7-1
+- Build new package including new pristine-tar generation:
+  $ gbp buildpackage --git-pristine-tar --git-pristine-tar-commit \
+    --git-compression=xz
+- When all is fine, push the new version
+  $ git push --all
+  $ git push --tags
+
+Roland Fehrenbacher
+
+ -- Roland Fehrenbacher , Wed, 27 Dec 2017 17:44:26 +0000

diff --git a/changelog b/changelog
new file mode 100644
index 0000000..7e666a1
--- /dev/null
+++ b/changelog
@@ -0,0 +1,110 @@
+infinipath-psm (3.3+20.604758e7-6.2) unstable; urgency=medium
+
+  * Non-maintainer upload
+  * Work around FTBFS with gcc-12. Closes: #984057
+
+ -- Christoph Biedl  Sun, 16 Oct 2022 13:18:17 +0200
+
+infinipath-psm (3.3+20.604758e7-6.1) unstable; urgency=medium
+
+  * Non-maintainer upload.
+  * Work around FTBFS with gcc-10 (Closes: #957359)
+
+ -- Paul Gevers  Sun, 03 Jan 2021 08:42:58 +0100
+
+infinipath-psm (3.3+20.604758e7-6) unstable; urgency=medium
+
+  * Fix ftbfs with GCC-8 (Closes: #897774). Thanks to Reiner Herrmann
+    for the patch.
+ - add patch 0003-gcc8.patch + + -- Mehdi Dogguy Thu, 29 Nov 2018 23:53:33 +0100 + +infinipath-psm (3.3+20.604758e7-5) unstable; urgency=medium + + * Fix postrm maintainer script to avoid leaving an unowned file + (Closes: #886925) and add a postrm script to handle other cases. + Thanks to Andreas Beckmann for filing the bugreport and putting + relevant references in the bugreport. + + -- Mehdi Dogguy Sun, 14 Jan 2018 11:29:47 +0100 + +infinipath-psm (3.3+20.604758e7-4) unstable; urgency=medium + + * Add myself to Uploaders + * Run wrap-and-sort + * Mark symbol ipath_dwordcpy_safe@Base as amd64 specific (Closes: #886359) + * Provide libpsm_infinipath.so.1 as an alternative + + -- Mehdi Dogguy Thu, 04 Jan 2018 22:43:15 +0100 + +infinipath-psm (3.3+20.604758e7-3) unstable; urgency=medium + + * Make 'Debian HPC Team' new maintainer + * Fix lintian warnings (copyright + Priority) + * Add i386 specific symbols file + + -- Roland Fehrenbacher Thu, 04 Jan 2018 13:12:09 +0100 + +infinipath-psm (3.3+20.604758e7-1) unstable; urgency=medium + + * New upstream release (Add upstream fixes up to git commit 604758e). + * Add patch for gcc 7 compilation (Closes: #853451). + * Remove patches applied upstream. + * Update Vcs-Git/Browser to point to new salsa.debian.org repo. + * Update Standards-Version to 4.1.3, no changes required. + * Update copyright file. + + -- Roland Fehrenbacher Fri, 29 Dec 2017 11:47:37 +0100 + +infinipath-psm (3.3+19.g67c0807.open-2) UNRELEASED; urgency=medium + + * Improve architecture detection. (Closes: #807149) + + -- Ana Beatriz Guerrero Lopez Mon, 04 Apr 2016 10:18:09 +0200 + +infinipath-psm (3.3+19.g67c0807.open-1) unstable; urgency=medium + + * Team upload. + * New upstream release. + * Add 0001-Initialize-variables.patch to fix -Werror=maybe-uninitialized + errors. + * Update symbols file. + * Update Standards-Version to 3.9.7, no changes required. + * Update Vcs-Browser to use HTTPS. + * Add 0002-Fix-a-few-typos.patch fixing a few typo reported by lintian. + + -- Ana Beatriz Guerrero Lopez Fri, 01 Apr 2016 00:54:00 +0200 + +infinipath-psm (3.3+7.gec1d6d2-3) unstable; urgency=low + + * d/control: Make libpsm-infinipath1-dev conflict with libion-dev + + * Bug fix: "libpsm-infinipath1-dev and libion-dev: error when trying to + install together", thanks to Ralf Treinen (Closes: #807300). + + -- Roland Fehrenbacher Tue, 08 Dec 2015 15:36:24 +0100 + +infinipath-psm (3.3+7.gec1d6d2-2) unstable; urgency=medium + + * Add symbols file d/libpsm-infinipath1.symbols + + -- Roland Fehrenbacher Tue, 08 Dec 2015 13:20:59 +0000 + +infinipath-psm (3.3+7.gec1d6d2-1) unstable; urgency=low + + * Upstream integrated previous Debian patches + + -- Roland Fehrenbacher Mon, 07 Dec 2015 19:29:57 +0100 + +infinipath-psm (3.3+7.g05f6f14.open-2) unstable; urgency=low + + * Revert to xz compression + + -- Roland Fehrenbacher Tue, 01 Dec 2015 12:38:18 +0100 + +infinipath-psm (3.3+7.g05f6f14.open-1) unstable; urgency=medium + + * Initial release. 
(Closes: #806524) + + -- Roland Fehrenbacher Sat, 28 Nov 2015 13:49:53 +0100 diff --git a/compat b/compat new file mode 100644 index 0000000..ec63514 --- /dev/null +++ b/compat @@ -0,0 +1 @@ +9 diff --git a/control b/control new file mode 100644 index 0000000..1f5653f --- /dev/null +++ b/control @@ -0,0 +1,56 @@ +Source: infinipath-psm +Section: libs +Priority: optional +Maintainer: Debian HPC Team +Uploaders: Roland Fehrenbacher , + Mehdi Dogguy +Build-Depends: debhelper (>= 9), + dpkg-dev (>= 1.13.19), + uuid-dev +Standards-Version: 4.1.3 +Homepage: https://github.com/intel/psm +Vcs-Git: https://salsa.debian.org/hpc-team/infinipath-psm +Vcs-Browser: https://salsa.debian.org/hpc-team/infinipath-psm + +Package: libpsm-infinipath1 +Architecture: amd64 i386 +Depends: ${misc:Depends}, + ${shlibs:Depends} +Description: PSM Messaging library for Intel Truescale adapters + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package contains the shared libraries. + +Package: libpsm-infinipath1-dev +Section: libdevel +Architecture: amd64 i386 +Depends: libpsm-infinipath1 (= ${binary:Version}), + ${misc:Depends} +Conflicts: libion-dev +Description: Development files for libpsm-infinipath1 + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package is needed to compile programs against libpsm-infinipath1. + It contains the header files and links needed for compiling. + +Package: libpsm-infinipath1-dbg +Section: debug +Priority: optional +Architecture: amd64 i386 +Depends: libpsm-infinipath1 (= ${binary:Version}), + ${misc:Depends} +Description: Debugging symbols for libpsm-infinipath1 + The PSM Messaging API, or PSM API, is Intel's (formerly QLogic's) low-level, + user-level communication interface for the Truescale family of products. PSM + users can use mechanisms necessary to implement higher-level communication + interfaces in parallel environments. + . + This package contains the debugging symbols associated with + libpsm-infinipath1. They will automatically be used by gdb for debugging + libpsm-infinipath1-related issues. diff --git a/copyright b/copyright new file mode 100644 index 0000000..d97bebf --- /dev/null +++ b/copyright @@ -0,0 +1,79 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: infinipath-psm +Upstream-Contact: Mike Marciniszyn +Source: https://github.com/intel/psm + +Files: * +Copyright: Copyright (c) 2012, 2017. Intel Corporation. + Copyright (c) 2005, 2006. QLogic Corporation. +License: BSD-2-clause or GPL-2 + +Files: debian/* +Copyright: 2015-2017 Q-Leap Networks GmbH + 2016 Ana Beatriz Guerrero Lopez +License: GPL-2+ + This package is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + . + This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + On Debian systems, the complete text of the GNU General + Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". + +License: GPL-2 + This program is free software; you can redistribute it + and/or modify it under the terms of the GNU General Public + License version 2 as published by the Free Software Foundation. + . + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + PURPOSE. See the GNU General Public License for more + details. + . + You should have received a copy of the GNU General Public + License along with this package; if not, write to the Free + Software Foundation, Inc., 51 Franklin St, Fifth Floor, + Boston, MA 02110-1301 USA + . + On Debian systems, the full text of the GNU General Public + License version 2 can be found in the file + /usr/share/common-licenses/GPL-2. + +License: BSD-2-clause + The BSD 2-Clause License + . + Copyright (c) 2007 Cisco, Inc. All rights reserved. + . + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + . + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. diff --git a/gbp.conf b/gbp.conf new file mode 100644 index 0000000..5b6c22c --- /dev/null +++ b/gbp.conf @@ -0,0 +1,4 @@ +[DEFAULT] +upstream-branch = master +debian-branch = debian/master +ignore-branch = False diff --git a/libpsm-infinipath1-dev.install b/libpsm-infinipath1-dev.install new file mode 100644 index 0000000..f3800aa --- /dev/null +++ b/libpsm-infinipath1-dev.install @@ -0,0 +1,2 @@ +usr/include/* +usr/lib/*/lib*.so diff --git a/libpsm-infinipath1.install b/libpsm-infinipath1.install new file mode 100644 index 0000000..6e62d0c --- /dev/null +++ b/libpsm-infinipath1.install @@ -0,0 +1,2 @@ +usr/lib/*/libinfinipath*.so.* +usr/lib/libpsm1/* diff --git a/libpsm-infinipath1.postinst.in b/libpsm-infinipath1.postinst.in new file mode 100644 index 0000000..8a2e3a3 --- /dev/null +++ b/libpsm-infinipath1.postinst.in @@ -0,0 +1,10 @@ +#! 
/bin/sh
+
+set -e
+
+update-alternatives --install /usr/lib/@DEB_HOST_MULTIARCH@/libpsm_infinipath.so.@PSM_LIB_MAJOR@ libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+    /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@ 40
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.postrm.in b/libpsm-infinipath1.postrm.in
new file mode 100644
index 0000000..d7ec507
--- /dev/null
+++ b/libpsm-infinipath1.postrm.in
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" = "remove" ] || [ "$1" = "disappear" ]
+then
+    update-alternatives --remove libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+        /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@
+fi
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.prerm.in b/libpsm-infinipath1.prerm.in
new file mode 100644
index 0000000..47d57e0
--- /dev/null
+++ b/libpsm-infinipath1.prerm.in
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+set -e
+
+if [ "$1" != "remove" ]
+then
+    update-alternatives --remove libpsm_infinipath.so.@PSM_LIB_MAJOR@ \
+        /usr/lib/libpsm1/libpsm_infinipath.so.@PSM_LIB_VERSION@
+fi
+
+#DEBHELPER#
+
+exit 0

diff --git a/libpsm-infinipath1.symbols b/libpsm-infinipath1.symbols
new file mode 100644
index 0000000..505543e
--- /dev/null
+++ b/libpsm-infinipath1.symbols
@@ -0,0 +1,432 @@
+libinfinipath.so.4 libpsm-infinipath1 #MINVER#
+ __ipath_dbgout@Base 3.3+7.gec1d6d2
+ __ipath_malloc_no_mmap@Base 3.3+7.gec1d6d2
+ __ipath_mylabel@Base 3.3+7.gec1d6d2
+ __ipath_pico_per_cycle@Base 3.3+7.gec1d6d2
+ infinipath_debug@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port_names@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_port_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit_names@Base 3.3+7.gec1d6d2
+ infinipath_get_ctrs_unit_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_next_name@Base 3.3+7.gec1d6d2
+ infinipath_get_single_portctr@Base 3.3+7.gec1d6d2
+ infinipath_get_single_stat@Base 3.3+7.gec1d6d2
+ infinipath_get_single_unitctr@Base 3.3+7.gec1d6d2
+ infinipath_get_stats@Base 3.3+7.gec1d6d2
+ infinipath_get_stats_names@Base 3.3+7.gec1d6d2
+ infinipath_get_stats_names_count@Base 3.3+7.gec1d6d2
+ infinipath_get_unit_flash@Base 3.3+7.gec1d6d2
+ infinipath_lookup_stat@Base 3.3+7.gec1d6d2
+ infinipath_put_unit_flash@Base 3.3+7.gec1d6d2
+ infinipath_release_names@Base 3.3+7.gec1d6d2
+ ipath_armlaunch_ctrl@Base 3.3+7.gec1d6d2
+ ipath_check_unit_status@Base 3.3+7.gec1d6d2
+ ipath_cmd_assign_context@Base 3.3+7.gec1d6d2
+ ipath_cmd_user_init@Base 3.3+7.gec1d6d2
+ ipath_cmd_wait_for_packet@Base 3.3+7.gec1d6d2
+ ipath_cmd_write@Base 3.3+7.gec1d6d2
+ ipath_cmd_writev@Base 3.3+7.gec1d6d2
+ ipath_context_close@Base 3.3+7.gec1d6d2
+ ipath_context_open@Base 3.3+7.gec1d6d2
+ ipath_disarm_bufs@Base 3.3+7.gec1d6d2
+ ipath_dwordcpy@Base 3.3+7.gec1d6d2
+ (arch=amd64)ipath_dwordcpy_safe@Base 3.3+7.gec1d6d2
+ ipath_event_ack@Base 3.3+7.gec1d6d2
+ ipath_flash_csum@Base 3.3+7.gec1d6d2
+ ipath_flush_egr_bufs@Base 3.3+7.gec1d6d2
+ ipath_force_pio_avail_update@Base 3.3+7.gec1d6d2
+ ipath_free_tid_err@Base 3.3+7.gec1d6d2
+ ipath_get_cc_settings_bin@Base 3.3+7.gec1d6d2
+ ipath_get_cc_table_bin@Base 3.3+7.gec1d6d2
+ ipath_get_mylabel@Base 3.3+7.gec1d6d2
+ ipath_get_num_contexts@Base 3.3+7.gec1d6d2
+ ipath_get_num_units@Base 3.3+7.gec1d6d2
+ ipath_get_port_gid@Base 3.3+7.gec1d6d2
+ ipath_get_port_lid@Base 3.3+7.gec1d6d2
+ ipath_get_port_lmc@Base 3.3+7.gec1d6d2
+ ipath_get_port_rate@Base 3.3+7.gec1d6d2
+ ipath_get_port_sl2vl@Base 3.3+7.gec1d6d2
+ ipath_hideous_ioctl_emulator@Base 3.3+7.gec1d6d2
+ 
ipath_ipathfs_open@Base 3.3+7.gec1d6d2 + ipath_ipathfs_path@Base 3.3+7.gec1d6d2 + ipath_ipathfs_rd@Base 3.3+7.gec1d6d2 + ipath_ipathfs_read@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_open@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_rd@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_read@Base 3.3+7.gec1d6d2 + ipath_ipathfs_unit_write@Base 3.3+7.gec1d6d2 + ipath_manage_rcvq@Base 3.3+7.gec1d6d2 + ipath_mmap64@Base 3.3+7.gec1d6d2 + ipath_poll_type@Base 3.3+7.gec1d6d2 + ipath_sdma_complete@Base 3.3+7.gec1d6d2 + ipath_sdma_inflight@Base 3.3+7.gec1d6d2 + ipath_set_mylabel@Base 3.3+7.gec1d6d2 + ipath_set_pkey@Base 3.3+7.gec1d6d2 + ipath_sysfs_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_path@Base 3.3+7.gec1d6d2 + ipath_sysfs_path_len@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_port_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_open@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_printf@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_read@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_read_s64@Base 3.3+7.gec1d6d2 + ipath_sysfs_unit_write@Base 3.3+7.gec1d6d2 + ipath_syslog@Base 3.3+7.gec1d6d2 + ipath_touch_mmap@Base 3.3+7.gec1d6d2 + ipath_update_tid_err@Base 3.3+7.gec1d6d2 + ipath_userinit@Base 3.3+7.gec1d6d2 + ipath_vsyslog@Base 3.3+7.gec1d6d2 + ipath_wait_for_device@Base 3.3+7.gec1d6d2 + ipath_wait_for_packet@Base 3.3+7.gec1d6d2 + ipath_write_pio@Base 3.3+7.gec1d6d2 + ipath_write_pio_force_order@Base 3.3+7.gec1d6d2 + ipath_write_pio_special_trigger2k@Base 3.3+7.gec1d6d2 + ipath_write_pio_special_trigger4k@Base 3.3+7.gec1d6d2 +libpsm_infinipath.so.1 libpsm-infinipath1 #MINVER# + __psm_am_get_parameters@Base 3.3+7.gec1d6d2 + __psm_am_register_handlers@Base 3.3+7.gec1d6d2 + __psm_am_reply_short@Base 3.3+7.gec1d6d2 + __psm_am_request_short@Base 3.3+7.gec1d6d2 + __psm_ep_close@Base 3.3+7.gec1d6d2 + __psm_ep_connect@Base 3.3+7.gec1d6d2 + __psm_ep_epid_lookup@Base 3.3+7.gec1d6d2 + __psm_ep_epid_share_memory@Base 3.3+7.gec1d6d2 + __psm_ep_num_devunits@Base 3.3+7.gec1d6d2 + __psm_ep_open@Base 3.3+7.gec1d6d2 + __psm_ep_open_internal@Base 3.3+7.gec1d6d2 + __psm_ep_open_opts_get_defaults@Base 3.3+7.gec1d6d2 + __psm_ep_query@Base 3.3+7.gec1d6d2 + __psm_epaddr_getctxt@Base 3.3+7.gec1d6d2 + __psm_epaddr_setctxt@Base 3.3+7.gec1d6d2 + __psm_epaddr_setlabel@Base 3.3+7.gec1d6d2 + __psm_epid_context@Base 3.3+7.gec1d6d2 + __psm_epid_nid@Base 3.3+7.gec1d6d2 + __psm_epid_port@Base 3.3+7.gec1d6d2 + __psm_error_defer@Base 3.3+7.gec1d6d2 + __psm_error_get_string@Base 3.3+7.gec1d6d2 + __psm_error_register_handler@Base 3.3+7.gec1d6d2 + __psm_finalize@Base 3.3+7.gec1d6d2 + __psm_getopt@Base 3.3+7.gec1d6d2 + __psm_init@Base 3.3+7.gec1d6d2 + __psm_map_nid_hostname@Base 3.3+7.gec1d6d2 + __psm_mq_cancel@Base 3.3+7.gec1d6d2 + __psm_mq_finalize@Base 3.3+7.gec1d6d2 + __psm_mq_get_stats@Base 3.3+7.gec1d6d2 + __psm_mq_getopt@Base 3.3+7.gec1d6d2 + __psm_mq_init@Base 3.3+7.gec1d6d2 + __psm_mq_ipeek@Base 3.3+7.gec1d6d2 + __psm_mq_iprobe@Base 3.3+7.gec1d6d2 + __psm_mq_irecv@Base 3.3+7.gec1d6d2 + __psm_mq_isend@Base 3.3+7.gec1d6d2 + __psm_mq_send@Base 3.3+7.gec1d6d2 + __psm_mq_setopt@Base 3.3+7.gec1d6d2 + __psm_mq_test@Base 3.3+7.gec1d6d2 + __psm_mq_wait@Base 3.3+7.gec1d6d2 + __psm_poll@Base 3.3+7.gec1d6d2 + __psm_setopt@Base 3.3+7.gec1d6d2 + __psm_uuid_generate@Base 3.3+7.gec1d6d2 + __psmi_poll_internal@Base 3.3+7.gec1d6d2 + 
__psmi_poll_noop@Base 3.3+7.gec1d6d2 + ips_am_short_reply@Base 3.3+7.gec1d6d2 + ips_am_short_request@Base 3.3+7.gec1d6d2 + ips_cca_adjust_rate@Base 3.3+7.gec1d6d2 + ips_cca_timer_callback@Base 3.3+7.gec1d6d2 + ips_crc_calculate@Base 3.3+7.gec1d6d2 + ips_dma_transfer_frame@Base 3.3+7.gec1d6d2 + ips_epstate_add@Base 3.3+7.gec1d6d2 + ips_epstate_del@Base 3.3+7.gec1d6d2 + ips_epstate_fini@Base 3.3+7.gec1d6d2 + ips_epstate_init@Base 3.3+7.gec1d6d2 + ips_err_str@Base 3.3+7.gec1d6d2 + ips_flow_init@Base 3.3+7.gec1d6d2 + ips_get_stat@Base 3.3+7.gec1d6d2 + ips_ibta_fini@Base 3.3+7.gec1d6d2 + ips_ibta_init@Base 3.3+7.gec1d6d2 + ips_ibta_init_sl2vl_table@Base 3.3+7.gec1d6d2 + ips_ibta_link_updown_event@Base 3.3+7.gec1d6d2 + ips_mq_send_payload@Base 3.3+7.gec1d6d2 + ips_opp_init@Base 3.3+7.gec1d6d2 + ips_proto_am@Base 3.3+7.gec1d6d2 + ips_proto_am_fini@Base 3.3+7.gec1d6d2 + ips_proto_am_init@Base 3.3+7.gec1d6d2 + ips_proto_build_connect_message@Base 3.3+7.gec1d6d2 + ips_proto_connect@Base 3.3+7.gec1d6d2 + ips_proto_disconnect@Base 3.3+7.gec1d6d2 + ips_proto_dma_wait_until@Base 3.3+7.gec1d6d2 + ips_proto_dump_data@Base 3.3+7.gec1d6d2 + ips_proto_dump_err_stats@Base 3.3+7.gec1d6d2 + ips_proto_dump_frame@Base 3.3+7.gec1d6d2 + ips_proto_fini@Base 3.3+7.gec1d6d2 + ips_proto_flow_enqueue@Base 3.3+7.gec1d6d2 + ips_proto_flow_flush_dma@Base 3.3+7.gec1d6d2 + ips_proto_flow_flush_pio@Base 3.3+7.gec1d6d2 + ips_proto_get_rhf_errstring@Base 3.3+7.gec1d6d2 + ips_proto_init@Base 3.3+7.gec1d6d2 + ips_proto_isconnected@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_cts@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_rts_envelope@Base 3.3+7.gec1d6d2 + ips_proto_mq_handle_rts_envelope_outoforder@Base 3.3+7.gec1d6d2 + ips_proto_mq_isend@Base 3.3+7.gec1d6d2 + ips_proto_mq_push_eager_data@Base 3.3+7.gec1d6d2 + ips_proto_mq_push_eager_req@Base 3.3+7.gec1d6d2 + ips_proto_mq_send@Base 3.3+7.gec1d6d2 + ips_proto_process_ack@Base 3.3+7.gec1d6d2 + ips_proto_process_connect@Base 3.3+7.gec1d6d2 + ips_proto_process_packet_error@Base 3.3+7.gec1d6d2 + ips_proto_process_packet_inner@Base 3.3+7.gec1d6d2 + ips_proto_process_unknown@Base 3.3+7.gec1d6d2 + ips_proto_recv_fini@Base 3.3+7.gec1d6d2 + ips_proto_recv_init@Base 3.3+7.gec1d6d2 + ips_proto_rv_scbavail_callback@Base 3.3+7.gec1d6d2 + ips_proto_send_ctrl_message@Base 3.3+7.gec1d6d2 + ips_proto_show_header@Base 3.3+7.gec1d6d2 + ips_proto_timer_ack_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_ctrlq_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_pendq_callback@Base 3.3+7.gec1d6d2 + ips_proto_timer_send_callback@Base 3.3+7.gec1d6d2 + ips_protoexp_build_ctrl_message@Base 3.3+7.gec1d6d2 + ips_protoexp_data@Base 3.3+7.gec1d6d2 + ips_protoexp_fini@Base 3.3+7.gec1d6d2 + ips_protoexp_flow_newgen@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_data_err@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tf_generr@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tf_seqerr@Base 3.3+7.gec1d6d2 + ips_protoexp_handle_tiderr@Base 3.3+7.gec1d6d2 + ips_protoexp_init@Base 3.3+7.gec1d6d2 + ips_protoexp_recv_unaligned_data@Base 3.3+7.gec1d6d2 + ips_protoexp_scb_inflight@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_get_from_token@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_grant@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_grant_ack@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_release@Base 3.3+7.gec1d6d2 + ips_protoexp_tid_release_ack@Base 3.3+7.gec1d6d2 + ips_ptl_connect@Base 3.3+7.gec1d6d2 + ips_ptl_disconnect@Base 3.3+7.gec1d6d2 + ips_ptl_epaddr_stats_get@Base 3.3+7.gec1d6d2 + ips_ptl_poll@Base 3.3+7.gec1d6d2 + ips_ptl_rcvthread_fini@Base 3.3+7.gec1d6d2 + 
ips_ptl_rcvthread_init@Base 3.3+7.gec1d6d2 + ips_ptl_recvq_isempty@Base 3.3+7.gec1d6d2 + ips_ptl_shared_poll@Base 3.3+7.gec1d6d2 + ips_recvhdrq_fini@Base 3.3+7.gec1d6d2 + ips_recvhdrq_init@Base 3.3+7.gec1d6d2 + ips_recvhdrq_progress@Base 3.3+7.gec1d6d2 + ips_recvq_egrbuf_table_alloc@Base 3.3+7.gec1d6d2 + ips_recvq_egrbuf_table_free@Base 3.3+7.gec1d6d2 + ips_scbctrl_alloc@Base 3.3+7.gec1d6d2 + ips_scbctrl_alloc_tiny@Base 3.3+7.gec1d6d2 + ips_scbctrl_avail@Base 3.3+7.gec1d6d2 + ips_scbctrl_bufalloc@Base 3.3+7.gec1d6d2 + ips_scbctrl_fini@Base 3.3+7.gec1d6d2 + ips_scbctrl_free@Base 3.3+7.gec1d6d2 + ips_scbctrl_init@Base 3.3+7.gec1d6d2 + ips_spio_fini@Base 3.3+7.gec1d6d2 + ips_spio_init@Base 3.3+7.gec1d6d2 + ips_spio_transfer_frame@Base 3.3+7.gec1d6d2 + ips_subcontext_ureg_get@Base 3.3+7.gec1d6d2 + ips_subcontext_ureg_initialize@Base 3.3+7.gec1d6d2 + ips_tf_allocate@Base 3.3+7.gec1d6d2 + ips_tf_deallocate@Base 3.3+7.gec1d6d2 + ips_tf_fini@Base 3.3+7.gec1d6d2 + ips_tf_init@Base 3.3+7.gec1d6d2 + ips_tfgen_allocate@Base 3.3+7.gec1d6d2 + ips_tid_acquire@Base 3.3+7.gec1d6d2 + ips_tid_fini@Base 3.3+7.gec1d6d2 + ips_tid_init@Base 3.3+7.gec1d6d2 + ips_tid_release@Base 3.3+7.gec1d6d2 + ips_tid_send_exp@Base 3.3+7.gec1d6d2 + ips_tidflow_nak_post_process@Base 3.3+7.gec1d6d2 + ips_writehdrq_fini@Base 3.3+7.gec1d6d2 + ips_writehdrq_init@Base 3.3+7.gec1d6d2 + kcopy_abi@Base 3.3+7.gec1d6d2 + kcopy_get@Base 3.3+7.gec1d6d2 + kcopy_put@Base 3.3+7.gec1d6d2 + knem_get@Base 3.3+7.gec1d6d2 + knem_open_device@Base 3.3+7.gec1d6d2 + knem_put@Base 3.3+7.gec1d6d2 + knem_register_region@Base 3.3+7.gec1d6d2 + psm_am_get_parameters@Base 3.3+7.gec1d6d2 + psm_am_register_handlers@Base 3.3+7.gec1d6d2 + psm_am_reply_short@Base 3.3+7.gec1d6d2 + psm_am_request_short@Base 3.3+7.gec1d6d2 + psm_ep_close@Base 3.3+7.gec1d6d2 + psm_ep_connect@Base 3.3+7.gec1d6d2 + psm_ep_epid_lookup@Base 3.3+7.gec1d6d2 + psm_ep_epid_share_memory@Base 3.3+7.gec1d6d2 + psm_ep_num_devunits@Base 3.3+7.gec1d6d2 + psm_ep_open@Base 3.3+7.gec1d6d2 + psm_ep_open_opts_get_defaults@Base 3.3+7.gec1d6d2 + psm_ep_query@Base 3.3+7.gec1d6d2 + psm_epaddr_getctxt@Base 3.3+7.gec1d6d2 + psm_epaddr_setctxt@Base 3.3+7.gec1d6d2 + psm_epaddr_setlabel@Base 3.3+7.gec1d6d2 + psm_epid_context@Base 3.3+7.gec1d6d2 + psm_epid_nid@Base 3.3+7.gec1d6d2 + psm_epid_port@Base 3.3+7.gec1d6d2 + psm_error_defer@Base 3.3+7.gec1d6d2 + psm_error_get_string@Base 3.3+7.gec1d6d2 + psm_error_register_handler@Base 3.3+7.gec1d6d2 + psm_finalize@Base 3.3+7.gec1d6d2 + psm_getopt@Base 3.3+7.gec1d6d2 + psm_init@Base 3.3+7.gec1d6d2 + psm_map_nid_hostname@Base 3.3+7.gec1d6d2 + psm_mq_cancel@Base 3.3+7.gec1d6d2 + psm_mq_finalize@Base 3.3+7.gec1d6d2 + psm_mq_get_stats@Base 3.3+7.gec1d6d2 + psm_mq_getopt@Base 3.3+7.gec1d6d2 + psm_mq_init@Base 3.3+7.gec1d6d2 + psm_mq_ipeek@Base 3.3+7.gec1d6d2 + psm_mq_iprobe@Base 3.3+7.gec1d6d2 + psm_mq_irecv@Base 3.3+7.gec1d6d2 + psm_mq_isend@Base 3.3+7.gec1d6d2 + psm_mq_send@Base 3.3+7.gec1d6d2 + psm_mq_setopt@Base 3.3+7.gec1d6d2 + psm_mq_test@Base 3.3+7.gec1d6d2 + psm_mq_wait@Base 3.3+7.gec1d6d2 + psm_poll@Base 3.3+7.gec1d6d2 + psm_setopt@Base 3.3+7.gec1d6d2 + psm_uuid_generate@Base 3.3+7.gec1d6d2 + psmi_abort_handler@Base 3.3+7.gec1d6d2 + psmi_allhandlers@Base 3.3+7.gec1d6d2 + psmi_am_getopt@Base 3.3+7.gec1d6d2 + psmi_am_handler@Base 3.3+7.gec1d6d2 + psmi_am_init_internal@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_data@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_rtsdone@Base 3.3+7.gec1d6d2 + psmi_am_mq_handler_rtsmatch@Base 
3.3+7.gec1d6d2 + psmi_am_reqq_add@Base 3.3+7.gec1d6d2 + psmi_am_reqq_drain@Base 3.3+7.gec1d6d2 + psmi_am_reqq_init@Base 3.3+7.gec1d6d2 + psmi_am_setopt@Base 3.3+7.gec1d6d2 + psmi_amsh_am_short_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_am_short_request@Base 3.3+7.gec1d6d2 + psmi_amsh_generic@Base 3.3+7.gec1d6d2 + psmi_amsh_long_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_long_request@Base 3.3+7.gec1d6d2 + psmi_amsh_short_reply@Base 3.3+7.gec1d6d2 + psmi_amsh_short_request@Base 3.3+7.gec1d6d2 + psmi_calloc_internal@Base 3.3+7.gec1d6d2 + psmi_context_check_status@Base 3.3+7.gec1d6d2 + psmi_context_close@Base 3.3+7.gec1d6d2 + psmi_context_interrupt_isenabled@Base 3.3+7.gec1d6d2 + psmi_context_interrupt_set@Base 3.3+7.gec1d6d2 + psmi_context_open@Base 3.3+7.gec1d6d2 + psmi_core_getopt@Base 3.3+7.gec1d6d2 + psmi_core_setopt@Base 3.3+7.gec1d6d2 + psmi_crc@Base 3.3+7.gec1d6d2 + psmi_cycles_left@Base 3.3+7.gec1d6d2 + psmi_diags@Base 3.3+7.gec1d6d2 + psmi_ep_device_is_enabled@Base 3.3+7.gec1d6d2 + psmi_epaddr_get_hostname@Base 3.3+7.gec1d6d2 + psmi_epaddr_get_name@Base 3.3+7.gec1d6d2 + psmi_epaddr_kcopy_pid@Base 3.3+7.gec1d6d2 + psmi_epid_add@Base 3.3+7.gec1d6d2 + psmi_epid_fini@Base 3.3+7.gec1d6d2 + psmi_epid_hca_type@Base 3.3+7.gec1d6d2 + psmi_epid_init@Base 3.3+7.gec1d6d2 + psmi_epid_itor_fini@Base 3.3+7.gec1d6d2 + psmi_epid_itor_init@Base 3.3+7.gec1d6d2 + psmi_epid_itor_next@Base 3.3+7.gec1d6d2 + psmi_epid_lookup@Base 3.3+7.gec1d6d2 + psmi_epid_remove@Base 3.3+7.gec1d6d2 + psmi_epid_set_hostname@Base 3.3+7.gec1d6d2 + psmi_epid_sl@Base 3.3+7.gec1d6d2 + psmi_epid_subcontext@Base 3.3+7.gec1d6d2 + psmi_epid_table@Base 3.3+7.gec1d6d2 + psmi_errhandler_global@Base 3.3+7.gec1d6d2 + psmi_error_cmp@Base 3.3+7.gec1d6d2 + psmi_error_syslog_level@Base 3.3+7.gec1d6d2 + psmi_faultinj_enabled@Base 3.3+7.gec1d6d2 + psmi_faultinj_fini@Base 3.3+7.gec1d6d2 + psmi_faultinj_getspec@Base 3.3+7.gec1d6d2 + psmi_faultinj_init@Base 3.3+7.gec1d6d2 + psmi_faultinj_is_fault@Base 3.3+7.gec1d6d2 + psmi_faultinj_outfile@Base 3.3+7.gec1d6d2 + psmi_faultinj_verbose@Base 3.3+7.gec1d6d2 + psmi_free_internal@Base 3.3+7.gec1d6d2 + psmi_get_hca_type@Base 3.3+7.gec1d6d2 + psmi_get_ipv4addr@Base 3.3+7.gec1d6d2 + psmi_getenv@Base 3.3+7.gec1d6d2 + psmi_gethostname@Base 3.3+7.gec1d6d2 + psmi_getpagesize@Base 3.3+7.gec1d6d2 + psmi_handle_error@Base 3.3+7.gec1d6d2 + psmi_infinipath_revision@Base 3.3+7.gec1d6d2 + psmi_isinitialized@Base 3.3+7.gec1d6d2 + psmi_log_memstats@Base 3.3+7.gec1d6d2 + psmi_malloc_internal@Base 3.3+7.gec1d6d2 + psmi_memcpyo@Base 3.3+7.gec1d6d2 + psmi_mpool_create@Base 3.3+7.gec1d6d2 + psmi_mpool_destroy@Base 3.3+7.gec1d6d2 + psmi_mpool_find_obj_by_index@Base 3.3+7.gec1d6d2 + psmi_mpool_get@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_gen_count@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_index@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_index_gen_count@Base 3.3+7.gec1d6d2 + psmi_mpool_get_obj_info@Base 3.3+7.gec1d6d2 + psmi_mpool_put@Base 3.3+7.gec1d6d2 + psmi_mq_free@Base 3.3+7.gec1d6d2 + psmi_mq_handle_data@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope_outoforder@Base 3.3+7.gec1d6d2 + psmi_mq_handle_envelope_unexpected@Base 3.3+7.gec1d6d2 + psmi_mq_handle_outoforder_queue@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts_complete@Base 3.3+7.gec1d6d2 + psmi_mq_handle_rts_outoforder@Base 3.3+7.gec1d6d2 + psmi_mq_initialize_defaults@Base 3.3+7.gec1d6d2 + psmi_mq_malloc@Base 3.3+7.gec1d6d2 + psmi_mq_mtucpy@Base 3.3+7.gec1d6d2 + 
psmi_mq_register_unexpected_callback@Base 3.3+7.gec1d6d2 + psmi_mq_req_alloc@Base 3.3+7.gec1d6d2 + psmi_mq_req_fini@Base 3.3+7.gec1d6d2 + psmi_mq_req_init@Base 3.3+7.gec1d6d2 + psmi_mq_stats_register@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_alloc@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_fini@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_free@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_getinfo@Base 3.3+7.gec1d6d2 + psmi_mq_sysbuf_init@Base 3.3+7.gec1d6d2 + psmi_mq_wait_internal@Base 3.3+7.gec1d6d2 + psmi_opened_endpoint@Base 3.3+7.gec1d6d2 + psmi_opened_endpoint_count@Base 3.3+7.gec1d6d2 + psmi_parse_memmode@Base 3.3+7.gec1d6d2 + psmi_parse_mpool_env@Base 3.3+7.gec1d6d2 + psmi_parse_str_tuples@Base 3.3+7.gec1d6d2 + psmi_poll_internal@Base 3.3+7.gec1d6d2 + psmi_poll_noop@Base 3.3+7.gec1d6d2 + psmi_progress_lock@Base 3.3+7.gec1d6d2 + psmi_protocol_fn@Base 3.3+7.gec1d6d2 + psmi_ptl_amsh@Base 3.3+7.gec1d6d2 + psmi_ptl_ips@Base 3.3+7.gec1d6d2 + psmi_ptl_self@Base 3.3+7.gec1d6d2 + psmi_sharedcontext_params@Base 3.3+19.g67c0807.open + psmi_shm_attach@Base 3.3+7.gec1d6d2 + psmi_shm_detach@Base 3.3+7.gec1d6d2 + psmi_shm_mq_rv_thresh@Base 3.3+7.gec1d6d2 + psmi_stats_deregister_all@Base 3.3+7.gec1d6d2 + psmi_stats_memory@Base 3.3+7.gec1d6d2 + psmi_stats_register@Base 3.3+7.gec1d6d2 + psmi_stats_register_type@Base 3.3+7.gec1d6d2 + psmi_strdup_internal@Base 3.3+7.gec1d6d2 + psmi_syslog@Base 3.3+7.gec1d6d2 + psmi_timer_cancel_inner@Base 3.3+7.gec1d6d2 + psmi_timer_entry_init@Base 3.3+7.gec1d6d2 + psmi_timer_fini@Base 3.3+7.gec1d6d2 + psmi_timer_init@Base 3.3+7.gec1d6d2 + psmi_timer_process_expired@Base 3.3+7.gec1d6d2 + psmi_timer_request_always@Base 3.3+7.gec1d6d2 + psmi_uuid_compare@Base 3.3+7.gec1d6d2 + psmi_uuid_parse@Base 3.3+7.gec1d6d2 + psmi_uuid_unparse@Base 3.3+7.gec1d6d2 + psmi_verno_client@Base 3.3+7.gec1d6d2 + psmi_verno_isinteroperable@Base 3.3+7.gec1d6d2 + psmi_xfer_fn@Base 3.3+7.gec1d6d2 diff --git a/patches/0001-Fix-truncation-warnings-with-gcc7.patch b/patches/0001-Fix-truncation-warnings-with-gcc7.patch new file mode 100644 index 0000000..6cb6b32 --- /dev/null +++ b/patches/0001-Fix-truncation-warnings-with-gcc7.patch @@ -0,0 +1,45 @@ +From: Roland Fehrenbacher +Date: Thu, 28 Dec 2017 19:56:31 +0100 +Subject: Fix truncation warnings with gcc7 +MIME-Version: 1.0 +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +This patch was originally created by +Johannes Brandstätter see +https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=853451 +--- + psm_ep.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/psm_ep.c b/psm_ep.c +index 6857895..2bd0ed4 100644 +--- a/psm_ep.c ++++ b/psm_ep.c +@@ -978,7 +978,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + int i, num_rails = 0; + char *uname = "IPATH_UNIT"; + char *pname = "IPATH_PORT"; +- char uvalue[4], pvalue[4]; ++ char uvalue[4], pvalue[6]; + int devid_enabled[PTL_MAX_INIT]; + union psmi_envvar_val devs; + +@@ -1010,7 +1010,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + /* If multi-rail is used, set the first ep unit/port */ + if (num_rails > 0) { + snprintf(uvalue, 4, "%1d", units[0]); +- snprintf(pvalue, 4, "%1d", ports[0]); ++ snprintf(pvalue, 6, "%1d", ports[0]); + setenv(uname, uvalue, 1); + setenv(pname, pvalue, 1); + } +@@ -1038,7 +1038,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op + if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + for (i = 1; i < num_rails; i++) { + 
+ snprintf(uvalue, 4, "%1d", units[i]);
+- snprintf(pvalue, 4, "%1d", ports[i]);
++ snprintf(pvalue, 6, "%1d", ports[i]);
+ setenv(uname, uvalue, 1);
+ setenv(pname, pvalue, 1);
+
diff --git a/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch b/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
new file mode 100644
index 0000000..b0a69bb
--- /dev/null
+++ b/patches/0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
@@ -0,0 +1,31 @@
+From: Roland Fehrenbacher
+Date: Fri, 29 Dec 2017 12:19:43 +0100
+Subject: Include <sys/sysmacros.h> to avoid warning about minor
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+
+ipath_proto.c: In function ‘ipath_userinit’:
+ipath_proto.c:539:13: error: In the GNU C Library, "minor" is defined
+ by <sys/sysmacros.h>. For historical compatibility, it is
+ currently defined by <sys/types.h> as well, but we plan to
+ remove this soon. To use "minor", include <sys/sysmacros.h>
+ directly. If you did not intend to use a system-defined macro
+ "minor", you should undefine it after including <sys/types.h>. [-Werror]
+ spctrl->spc_dev.spd_type = minor(st.st_rdev);
+---
+ ipath/ipath_proto.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c
+index 5f9365f..a943c1d 100644
+--- a/ipath/ipath_proto.c
++++ b/ipath/ipath_proto.c
+@@ -37,6 +37,7 @@
+ // level infinipath protocol code.
+
+ #include <sys/types.h>
++#include <sys/sysmacros.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <stdio.h>
diff --git a/patches/0003-gcc8.patch b/patches/0003-gcc8.patch
new file mode 100644
index 0000000..b405d18
--- /dev/null
+++ b/patches/0003-gcc8.patch
@@ -0,0 +1,29 @@
+Author: Reiner Herrmann
+Description: Fix build with gcc 8
+ - psm_utils.c: reserve enough memory for both input strings and the fixed part
+ - psm_ep.c: e has sufficient space to copy including the NULL terminator,
+   which fixes a warning about truncation of the input string
+Bug-Debian: https://bugs.debian.org/897774
+
+--- a/psm_ep.c
++++ b/psm_ep.c
+@@ -1349,7 +1349,7 @@
+
+ b_new = (char *) devstr;
+ e = b_new + len;
+- strncpy(e, devstring, len-1);
++ strncpy(e, devstring, len);
+ e[len-1] = '\0';
+ ee = e + len;
+ i = 0;
+--- a/psm_utils.c
++++ b/psm_utils.c
+@@ -955,7 +955,7 @@
+
+ union psmi_envvar_val env_fi;
+ char fvals_str[128];
+ char fname[128];
+- char fdesc[256];
++ char fdesc[300];
+
+ snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom);
+ fvals_str[sizeof fvals_str - 1] = '\0';
diff --git a/patches/0004-gcc-11-warning.patch b/patches/0004-gcc-11-warning.patch
new file mode 100644
index 0000000..da556f5
--- /dev/null
+++ b/patches/0004-gcc-11-warning.patch
@@ -0,0 +1,22 @@
+Description: Disable warning in the cmpxchgl wrapper
+Author: Christoph Biedl
+Origin: no # upstream is dead
+Bug-Debian: https://bugs.debian.org/984057
+Last-Update: 2022-10-16
+
+--- a/include/linux-i386/sysdep.h
++++ b/include/linux-i386/sysdep.h
+@@ -106,10 +106,13 @@
+ uint32_t prev;
+ struct xchg_dummy { uint32_t a[100]; };
+
++#pragma GCC diagnostic push
++#pragma GCC diagnostic ignored "-Warray-bounds"
+ asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
+ : "=a"(prev)
+ : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old)
+ : "memory");
++#pragma GCC diagnostic pop
+
+ return prev;
+ }
diff --git a/patches/series b/patches/series
new file mode 100644
index 0000000..df69493
--- /dev/null
+++ b/patches/series
@@ -0,0 +1,4 @@
+0001-Fix-truncation-warnings-with-gcc7.patch
+0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
+0003-gcc8.patch
+0004-gcc-11-warning.patch
diff --git a/rules b/rules
new file mode 100755
index 0000000..d110e9a
--- /dev/null
+++ b/rules
@@ -0,0 +1,61 @@
+#!/usr/bin/make -f
+
+include /usr/share/dpkg/buildflags.mk
+include /usr/share/dpkg/pkg-info.mk
+include /usr/share/dpkg/architecture.mk
+
+export DEB_CFLAGS_MAINT_APPEND = -fcommon
+
+ifeq ($(DEB_HOST_ARCH),amd64)
+  ARCH := x86_64
+else
+  ARCH := $(DEB_HOST_ARCH)
+endif
+
+PSM_LIB_MAJOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MAJOR.*0x0\?\([1-9a-f]\?[0-9a-f]\+\).*/\1/p' $(CURDIR)/psm.h`)
+PSM_LIB_MINOR := $(shell printf "%d" `sed -n 's/^\#define.*PSM_VERNO_MINOR.*\(0x[0-9a-f]\+\).*/\1/p' $(CURDIR)/psm.h`)
+PSM_LIB_VERSION := ${PSM_LIB_MAJOR}.${PSM_LIB_MINOR}
+
+
+MAKE_OPTIONS := INSTALL_PREFIX=/usr libdir=/usr/lib/$(DEB_HOST_MULTIARCH) \
+	PSM_HAVE_SCIF=0 USE_PSM_UUID=0 arch=$(ARCH)
+
+%:
+	dh $@ --parallel
+
+debian/%.postinst: debian/%.postinst.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+debian/%.prerm: debian/%.prerm.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+debian/%.postrm: debian/%.postrm.in
+	sed -e 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' \
+	    -e 's/@PSM_LIB_VERSION@/${PSM_LIB_VERSION}/g' \
+	    -e 's/@PSM_LIB_MAJOR@/${PSM_LIB_MAJOR}/g' \
+	    $< > $@
+
+override_dh_auto_build: debian/libpsm-infinipath1.postinst debian/libpsm-infinipath1.prerm debian/libpsm-infinipath1.postrm
+	$(MAKE) $(MAKE_OPTIONS)
+
+override_dh_strip:
+	dh_strip --dbg-package=libpsm-infinipath1-dbg
+
+override_dh_auto_install:
+	$(MAKE) install $(MAKE_OPTIONS) DESTDIR=$$PWD/debian/tmp
+	mkdir debian/tmp/usr/lib/libpsm1/
+	mv debian/tmp/usr/lib/*/libpsm_infinipath.so.${PSM_LIB_VERSION} debian/tmp/usr/lib/libpsm1/
+
+override_dh_auto_test:
+
+override_dh_auto_clean:
+	$(MAKE) $(MAKE_OPTIONS) distclean
+	-rm -f include/linux-i386/linux-i386 include/linux-ppc/linux-ppc
+	-[ ! -f debian/libpsm-infinipath1.postinst ] || rm debian/libpsm-infinipath1.postinst
+	-[ ! -f debian/libpsm-infinipath1.prerm ] || rm debian/libpsm-infinipath1.prerm
diff --git a/source/format b/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/source/options b/source/options
new file mode 100644
index 0000000..b7bc1f2
--- /dev/null
+++ b/source/options
@@ -0,0 +1 @@
+compression = "xz"
diff --git a/watch b/watch
new file mode 100644
index 0000000..4755a19
--- /dev/null
+++ b/watch
@@ -0,0 +1,3 @@
+version=3
+
+https://www.openfabrics.org/downloads/infinipath-psm/infinipath-psm-(.*)\.(?:tar.gz|tar.bz2|tar.xz)
--
cgit v1.2.3

From a581310d680f034f10b43e84008e449056cdca8e Mon Sep 17 00:00:00 2001
From: Roland Fehrenbacher
Date: Thu, 28 Dec 2017 19:56:31 +0100
Subject: Fix truncation warnings with gcc7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch was originally created by
Johannes Brandstätter see
https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=853451

Gbp-Pq: Name 0001-Fix-truncation-warnings-with-gcc7.patch
---
 psm_ep.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/psm_ep.c b/psm_ep.c
index 6857895..2bd0ed4 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -978,7 +978,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 int i, num_rails = 0;
 char *uname = "IPATH_UNIT";
 char *pname = "IPATH_PORT";
- char uvalue[4], pvalue[4];
+ char uvalue[4], pvalue[6];
 int devid_enabled[PTL_MAX_INIT];
 union psmi_envvar_val devs;

@@ -1010,7 +1010,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 /* If multi-rail is used, set the first ep unit/port */
 if (num_rails > 0) {
 snprintf(uvalue, 4, "%1d", units[0]);
- snprintf(pvalue, 4, "%1d", ports[0]);
+ snprintf(pvalue, 6, "%1d", ports[0]);
 setenv(uname, uvalue, 1);
 setenv(pname, pvalue, 1);
 }

@@ -1038,7 +1038,7 @@ __psm_ep_open(psm_uuid_t const unique_job_key, struct psm_ep_open_opts const *op
 if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
 for (i = 1; i < num_rails; i++) {
 snprintf(uvalue, 4, "%1d", units[i]);
- snprintf(pvalue, 4, "%1d", ports[i]);
+ snprintf(pvalue, 6, "%1d", ports[i]);
 setenv(uname, uvalue, 1);
 setenv(pname, pvalue, 1);
--
cgit v1.2.3

From 4bf9edab66c99f9a3e0b411ff739ee56bb0b99df Mon Sep 17 00:00:00 2001
From: Roland Fehrenbacher
Date: Fri, 29 Dec 2017 12:19:43 +0100
Subject: Include <sys/sysmacros.h> to avoid warning about minor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ipath_proto.c: In function ‘ipath_userinit’:
ipath_proto.c:539:13: error: In the GNU C Library, "minor" is defined
 by <sys/sysmacros.h>. For historical compatibility, it is
 currently defined by <sys/types.h> as well, but we plan to
 remove this soon. To use "minor", include <sys/sysmacros.h>
 directly. If you did not intend to use a system-defined macro
 "minor", you should undefine it after including <sys/types.h>. [-Werror]
 spctrl->spc_dev.spd_type = minor(st.st_rdev);

Gbp-Pq: Name 0002-Include-sys-sysmacros.h-to-avoid-warning-about-minor.patch
---
 ipath/ipath_proto.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ipath/ipath_proto.c b/ipath/ipath_proto.c
index 5f9365f..a943c1d 100644
--- a/ipath/ipath_proto.c
+++ b/ipath/ipath_proto.c
@@ -37,6 +37,7 @@
 // level infinipath protocol code.

 #include <sys/types.h>
+#include <sys/sysmacros.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdio.h>
--
cgit v1.2.3

From 5e44e7f07b0e7932e681cdea081f3bf73bd97efd Mon Sep 17 00:00:00 2001
From: Reiner Herrmann
Date: Sun, 16 Oct 2022 04:18:17 -0700
Subject: Fix build with gcc 8

Bug-Debian: https://bugs.debian.org/897774

- psm_utils.c: reserve enough memory for both input strings and the fixed part
- psm_ep.c: e has sufficient space to copy including the NULL terminator,
  which fixes a warning about truncation of the input string

Gbp-Pq: Name 0003-gcc8.patch
---
 psm_ep.c    | 2 +-
 psm_utils.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/psm_ep.c b/psm_ep.c
index 2bd0ed4..113a4d3 100644
--- a/psm_ep.c
+++ b/psm_ep.c
@@ -1349,7 +1349,7 @@ psmi_parse_devices(int devices[PTL_MAX_INIT], const char *devstring)

 b_new = (char *) devstr;
 e = b_new + len;
- strncpy(e, devstring, len-1);
+ strncpy(e, devstring, len);
 e[len-1] = '\0';
 ee = e + len;
 i = 0;
diff --git a/psm_utils.c b/psm_utils.c
index c8651fe..ebaeda6 100644
--- a/psm_utils.c
+++ b/psm_utils.c
@@ -955,7 +955,7 @@ psmi_faultinj_getspec(char *spec_name, int num, int denom)
 union psmi_envvar_val env_fi;
 char fvals_str[128];
 char fname[128];
- char fdesc[256];
+ char fdesc[300];

 snprintf(fvals_str, sizeof fvals_str - 1, "%d:%d:1", num, denom);
 fvals_str[sizeof fvals_str - 1] = '\0';
--
cgit v1.2.3

From 77e6e6932c73bceef03452d80e1e6d9d835083b1 Mon Sep 17 00:00:00 2001
From: Christoph Biedl
Date: Sun, 16 Oct 2022 04:18:17 -0700
Subject: Disable warning in the cmpxchgl wrapper

Origin: no # upstream is dead
Bug-Debian: https://bugs.debian.org/984057
Last-Update: 2022-10-16

Gbp-Pq: Name 0004-gcc-11-warning.patch
---
 include/linux-i386/sysdep.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux-i386/sysdep.h b/include/linux-i386/sysdep.h
index ef99d1d..55ce91e 100644
--- a/include/linux-i386/sysdep.h
+++ b/include/linux-i386/sysdep.h
@@ -106,10 +106,13 @@ static __inline__ uint32_t ips_cmpxchg(volatile uint32_t *ptr,
 uint32_t prev;
 struct xchg_dummy { uint32_t a[100]; };

+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
 : "=a"(prev)
 : "q"(new), "m"(*(struct xchg_dummy *)ptr), "0"(old)
 : "memory");
+#pragma GCC diagnostic pop

 return prev;
 }
--
cgit v1.2.3
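
The four Debian patches above each silence one diagnostic introduced by a newer gcc. The short sketches that follow are illustrative only: they reproduce each warning and its fix outside the PSM sources, and every identifier and value in them is a stand-in rather than code from this package.

Patch 0001 deals with gcc 7's -Wformat-truncation: formatted with "%1d", an int can need up to eleven characters plus the terminating NUL, so gcc cannot prove that the original pvalue[4] buffer is large enough and flags the snprintf calls. Widening the buffer and the matching bound to 6 leaves room for any port number actually produced. A minimal reproduction, assuming a hypothetical worst-case port value:

    /* sketch: demonstrates -Wformat-truncation on gcc >= 7;
     * not PSM code, the values are made up */
    #include <stdio.h>

    int main(void)
    {
        int port = 65535;            /* hypothetical worst case */
        char small[4], wide[6];

        /* gcc 7 warns here: the result may need more than
         * 3 characters plus the terminating NUL */
        snprintf(small, sizeof small, "%1d", port);

        /* 6 bytes hold any value up to 99999 untruncated */
        snprintf(wide, sizeof wide, "%1d", port);

        printf("truncated: \"%s\"  widened: \"%s\"\n", small, wide);
        return 0;
    }

Compiled with gcc -Wformat-truncation, only the first snprintf is reported; the program prints "655" for the truncated copy and "65535" for the widened one.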
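
Patch 0002 follows a glibc 2.25 header change: major() and minor() now live in <sys/sysmacros.h>, and relying on the historical definitions pulled in via <sys/types.h> produces the deprecation message quoted in the commit, which -Werror turns fatal. A minimal sketch of the corrected usage (not the PSM code; /dev/null stands in for the InfiniPath device node):

    #include <sys/types.h>
    #include <sys/sysmacros.h>   /* the include the patch adds */
    #include <sys/stat.h>
    #include <stdio.h>

    int main(void)
    {
        struct stat st;

        /* minor() extracts the minor device number, as in the
         * ipath_userinit() use on st.st_rdev quoted above */
        if (stat("/dev/null", &st) == 0)
            printf("major=%u minor=%u\n",
                   major(st.st_rdev), minor(st.st_rdev));
        return 0;
    }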
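
Patch 0003 quiets gcc 8's -Wstringop-truncation in psm_ep.c: a strncpy bound of len-1 looks to gcc like the source string's terminator may be dropped silently. Since the destination really holds len bytes, copying len bytes and then terminating explicitly expresses the intent without the warning; the fdesc change in psm_utils.c is the sibling fix, growing a buffer so that formatting the two 128-byte input strings plus the fixed text provably fits. A hypothetical helper showing the strncpy pattern (copy_devstring and its shape are illustrative, not the psmi_parse_devices() code):

    #include <stdlib.h>
    #include <string.h>

    static char *copy_devstring(const char *devstring)
    {
        size_t len = strlen(devstring) + 1;   /* room for the NUL */
        char *e = malloc(len);

        if (e != NULL) {
            strncpy(e, devstring, len);       /* copies the NUL too */
            e[len - 1] = '\0';                /* defensive termination */
        }
        return e;
    }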
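
Patch 0004 handles a gcc 11 false positive: the cmpxchgl wrapper casts ptr to struct xchg_dummy * purely so the "m" constraint tells the compiler the asm may touch more than four bytes, and -Warray-bounds objects to that oversized access. With upstream dead, the packaging fix is a suppression scoped by push/pop to the one statement that needs it. A self-contained x86 sketch of the same pattern (names differ from sysdep.h, and LOCK_PREFIX is spelled out):

    #include <stdint.h>

    static inline uint32_t cmpxchg32(volatile uint32_t *ptr,
                                     uint32_t oldval, uint32_t newval)
    {
        uint32_t prev;
        /* oversized dummy so the "m" constraint covers the memory
         * the asm may inspect; this is what trips -Warray-bounds */
        struct xchg_dummy { uint32_t a[100]; };

    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Warray-bounds"
        __asm__ __volatile__("lock; cmpxchgl %1,%2"
                             : "=a"(prev)
                             : "q"(newval), "m"(*(struct xchg_dummy *)ptr),
                               "0"(oldval)
                             : "memory");
    #pragma GCC diagnostic pop

        return prev;   /* equals oldval iff the swap succeeded */
    }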
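
One subtlety in the rules file above: the soname version is not hard-coded but scraped from psm.h at build time. The sed expressions pull the hexadecimal values out of the PSM_VERNO_MAJOR and PSM_VERNO_MINOR defines, and printf "%d" converts them to decimal. A hypothetical pair of defines showing what the expressions match (stand-in values; the real ones live in psm.h):

    /* stand-ins for the psm.h lines debian/rules scrapes */
    #define PSM_VERNO_MAJOR 0x01   /* printf "%d" -> PSM_LIB_MAJOR = 1  */
    #define PSM_VERNO_MINOR 0x10   /* printf "%d" -> PSM_LIB_MINOR = 16 */

With these stand-in values PSM_LIB_VERSION expands to 1.16, the suffix override_dh_auto_install expects on libpsm_infinipath.so.${PSM_LIB_VERSION} when it moves the library into /usr/lib/libpsm1/.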