diff --git a/GNUmakefile.in b/GNUmakefile.in
index c33b5d9b..6900886f 100644
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -13,6 +13,7 @@ AR = ar
 CC = @CC@
 CXX = @CXX@
 CPPFLAGS := -std=c++17
+MTFLAGS := -L${LD_LIBRARY_PATH} -lwh -lpthread
 DEPSDIR := .deps
 DEPCFLAGS = -MD -MF $(DEPSDIR)/$*.d -MP
 LIBS = @LIBS@ $(MASSTREEDIR)/libjson.a $(LIBMALLOC) -lpthread -lm -lnuma
@@ -366,6 +367,12 @@ unit-hashtable: $(OBJ)/unit-hashtable.o $(STO_DEPS)
 unit-dboindex: $(OBJ)/unit-dboindex.o $(INDEX_DEPS)
 	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $< $(INDEX_DEPS) $(LDFLAGS) $(LIBS)
 
+unit-dboindexmasstrie: $(OBJ)/unit-dboindexmasstrie.o $(INDEX_DEPS)
+	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $< $(INDEX_DEPS) $(LDFLAGS) $(LIBS)
+
+unit-test_MTrie: $(OBJ)/unit-test_MTrie.o $(INDEX_DEPS)
+	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $< $(INDEX_DEPS) $(LDFLAGS) $(LIBS) $(MTFLAGS)
+
 unit-mvcc-access-all: $(OBJ)/unit-mvcc-access-all.o $(INDEX_DEPS) $(XXHASH_OBJ)
 	$(CXX) $(CXXFLAGS) $(OPTFLAGS) -o $@ $< $(INDEX_DEPS) $(XXHASH_OBJ) $(LDFLAGS) $(LIBS)
diff --git a/MassTrie-beta/MassTrie.hh b/MassTrie-beta/MassTrie.hh
new file mode 100644
index 00000000..53cfd776
--- /dev/null
+++ b/MassTrie-beta/MassTrie.hh
@@ -0,0 +1,318 @@
+#pragma once
+
+// NOTE(review): the original include targets were stripped during extraction;
+// restored to the headers the code actually uses (printf/malloc/memcpy/
+// INT_MAX/uintptr_t/iostream) -- confirm against the upstream file.
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <climits>
+#include <cstdint>
+
+#include "wormhole/lib.h"
+#include "wormhole/kv.h"
+#include "wormhole/wh.h"
+
+#define NUM_THREADS 64 // currently unused here; presumably for the test harness
+#define MAX_SIZE 64    // capacity (bytes) of the key/value scratch buffers
+
+using namespace std;
+
+//~~~~~~~~~CLASS MASSTRIE~~~~~~~~~~~~~~
+// Thin RAII wrapper around a wormhole ordered index mapping byte-string keys
+// to byte-string values.
+// NOTE(review): NOT thread-safe -- a single wormref, a single iterator and
+// the shared scratch buffers / `r` flag are reused by every call; confirm
+// callers are single-threaded. Copying an instance would double-free the
+// handles -- do not copy MassTrie objects.
+class MassTrie
+{
+
+public:
+    // constructor: build the wormhole, take a reference, create an iterator,
+    // and allocate the two MAX_SIZE scratch buffers reused by
+    // get/find_closest/delete_all.
+    MassTrie()
+    {
+        wh = wh_create();
+        ref = wh_ref(this->wh);
+        iter = wh_iter_create(this->ref);
+        this->kbuf_out = malloc(MAX_SIZE);
+        this->vbuf_out = malloc(MAX_SIZE);
+        r = false;
+    }
+
+    // destructor: tear down in reverse order of construction, then release
+    // the scratch buffers.
+    ~MassTrie()
+    {
+        wh_iter_destroy(this->iter);
+        wh_unref(this->ref);
+        wh_clean(this->wh);
+        wh_destroy(this->wh);
+        free(kbuf_out);
+        free(vbuf_out);
+    }
+
+    //~~~~~~~~~MASSTRIE FUNCTIONS~~~~~~~~~~~~~~
+
+    // put function - insert (or overwrite) key -> value; returns wh_put's result
+    bool put(const void *key, int klen, const void *value, int vlen)
+    {
+        return (wh_put(this->ref, key, klen, value, vlen));
+    }
+
+    // get function - look `key` up; on success the value is copied into the
+    // shared vbuf_out scratch buffer and a pointer to it is returned, else
+    // nullptr. The result is owned by this object and is clobbered by the
+    // next get/find_closest/delete_all call. (`ref` is taken as a parameter
+    // to preserve the original interface; callers pass this->ref.)
+    void *get(struct wormref *const ref, const void *key, int klen)
+    {
+        u32 vlen_out = 0;
+        // BUGFIX: sizeof(vbuf_out) is the size of a void* (8 bytes), which
+        // silently truncated every value; the buffer really holds MAX_SIZE.
+        r = wh_get(ref, key, klen, vbuf_out, MAX_SIZE, &vlen_out);
+        return r ? vbuf_out : nullptr;
+    }
+
+    // delete function - remove `key`; returns wh_del's result
+    bool del(const void *key, int klen)
+    {
+        return (wh_del(this->ref, key, klen));
+    }
+
+    // probe function - returns true if key exists, false otherwise
+    bool probe(const void *key, int klen)
+    {
+        r = (wh_probe(this->ref, key, klen));
+        return r;
+    }
+
+    // finds the key currently in the MassTrie whose stored pointer value is
+    // numerically closest to the pointer value encoded at `key`.
+    // Returns a malloc'd MAX_SIZE copy of that key (caller frees), or NULL
+    // when the index is empty.
+    // NOTE(review): assumes every key encodes a uintptr_t in its first
+    // sizeof(uintptr_t) bytes -- TODO confirm against callers.
+    void *find_closest(const void *key)
+    {
+        u32 klen_out = 0;
+        u32 vlen_out = 0;
+        uintptr_t target = 0;
+        memcpy(&target, key, sizeof(target));
+        uintptr_t best = UINTPTR_MAX; // smallest distance seen so far
+        void *res = NULL;
+
+        wh_iter_seek(this->iter, NULL, 0); // seek to the head
+
+        while (wh_iter_valid(this->iter))
+        {
+            r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out);
+
+            if (r)
+            {
+                // BUGFIX: the old code measured |(long)kbuf_out - (long)key|,
+                // i.e. the distance between the two buffer ADDRESSES, which is
+                // identical for every entry (and `int min` could overflow).
+                // Compare the pointer value stored IN the key bytes instead,
+                // with overflow-safe unsigned distance math.
+                uintptr_t curr = 0;
+                memcpy(&curr, kbuf_out, sizeof(curr));
+                const uintptr_t dist = (curr > target) ? (curr - target) : (target - curr);
+
+                if (dist < best)
+                {
+                    // allocate the result lazily, exactly once
+                    if (!res)
+                        res = malloc(MAX_SIZE);
+
+                    // error handling
+                    if (res == NULL)
+                    {
+                        printf("Error! memory not allocated.");
+                        exit(1);
+                    }
+
+                    best = dist;
+                    memcpy(res, kbuf_out, MAX_SIZE); // keep the closest key so far
+                }
+            }
+            else
+            {
+                printf("ERROR!\n");
+            }
+
+            wh_iter_skip1(this->iter);
+
+            // BUGFIX: sizeof(kbuf_out/vbuf_out) is sizeof(void*) == 8; wipe
+            // the whole MAX_SIZE scratch area between iterations.
+            memset(kbuf_out, 0, MAX_SIZE);
+            memset(vbuf_out, 0, MAX_SIZE);
+        }
+
+        return res;
+    }
+
+    // deletes all entries from the MassTrie
+    void delete_all()
+    {
+        u32 klen_out = 0;
+        u32 vlen_out = 0;
+
+        wh_iter_seek(this->iter, NULL, 0); // seek to the head
+
+        while (wh_iter_valid(this->iter))
+        {
+            r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out);
+
+            if (r)
+            {
+                // BUGFIX: was del(kbuf_out, sizeof(kbuf_out)) -- i.e. 8 bytes
+                // of a void* -- so any key whose length differed was never
+                // removed; use the real length reported by the iterator.
+                this->del(kbuf_out, (int)klen_out);
+            }
+            else
+            {
+                printf("ERROR!\n");
+            }
+
+            // Deleting the key the iterator is parked on can invalidate it;
+            // re-seek to the (new) head rather than skip1, which could skip
+            // entries after a delete. Everything before the head is gone.
+            wh_iter_seek(this->iter, NULL, 0);
+
+            memset(kbuf_out, 0, MAX_SIZE);
+            memset(vbuf_out, 0, MAX_SIZE);
+        }
+    }
+
+    // data members (owning raw handles -- see the no-copy note on the class)
+    struct wormhole *wh;        // the underlying wormhole index
+    struct wormref *ref;        // per-instance reference used by every call
+    struct wormhole_iter *iter; // shared iterator (find_closest/delete_all)
+    void *kbuf_out;             // MAX_SIZE key scratch buffer
+    void *vbuf_out;             // MAX_SIZE value scratch buffer
+    bool r;                     // last wormhole call's success flag (scratch)
+
+}; // class MassTrie
+
+/**
+// override the << operation (disabled in the original; kept for reference)
+ostream &operator<<(ostream &os, MassTrie *m)
+{
+    u32 klen_out = 0;
+    char kbuf_out[MAX_SIZE] = {};
+    u32 vlen_out = 0;
+    char vbuf_out[MAX_SIZE] = {};
+    bool r;
+
+    wh_iter_seek(m->iter, NULL, 0); // seek to the head
+    printf("wh_iter_seek \"\"\n");
+    while (wh_iter_valid(m->iter)) {
+        r = wh_iter_peek(m->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out);
+        if (r) {
+            os << "wh_iter_peek: key = " << kbuf_out << " , klen = " << klen_out
+               << " , value = " << vbuf_out << " , vlen = " << vlen_out << endl;
+        }
+        wh_iter_skip1(m->iter);
+        memset(kbuf_out, 0, sizeof(kbuf_out));
+        memset(vbuf_out, 0, sizeof(vbuf_out));
+    }
+    return os;
+}
+**/
diff --git a/MassTrie-beta/wormhole/LICENSE b/MassTrie-beta/wormhole/LICENSE
new file mode 100644
index 00000000..f288702d
--- /dev/null
+++ b/MassTrie-beta/wormhole/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+ + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. 
For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/MassTrie-beta/wormhole/Makefile b/MassTrie-beta/wormhole/Makefile new file mode 100644 index 00000000..f00e6b59 --- /dev/null +++ b/MassTrie-beta/wormhole/Makefile @@ -0,0 +1,45 @@ +# Makefile +# rules (always with .out) +# SRC-X.out += abc # extra source: abc.c +# MOD-X.out += abc # extra module: abc.c abc.h +# ASM-X.out += abc # extra assembly: abc.S +# DEP-X.out += abc # extra dependency: abc +# FLG-X.out += -finline # extra flags +# LIB-X.out += abc # extra -labc options + +# X.out : xyz.h xyz.c # for extra dependences that are to be compiled/linked. + +# X => X.out +TARGETS += easydemo concbench stresstest +# X => X.c only +SOURCES += +# X => X.S only +ASSMBLY += +# X => X.c X.h +MODULES += lib kv wh +# X => X.h +HEADERS += ctypes + +FLG += +LIB += m + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),FreeBSD) +LIB += execinfo +endif + +# when $ make FORKER_PAPI=y +ifeq ($(strip $(FORKER_PAPI)),y) +LIB += papi +FLG += -DFORKER_PAPI +endif + +bin : libwh.so +libwh.so : Makefile Makefile.common lib.c lib.h kv.c kv.h wh.c wh.h wh.strip + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) -shared -fPIC) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ lib.c kv.c wh.c $(ALLLIB) + strip --strip-all --discard-all @wh.strip $@ + + +include Makefile.common diff --git a/MassTrie-beta/wormhole/Makefile.common b/MassTrie-beta/wormhole/Makefile.common new file mode 100644 index 00000000..ecd761e7 --- /dev/null +++ b/MassTrie-beta/wormhole/Makefile.common @@ -0,0 +1,216 @@ +#usage: include Makefile.common at the end of your Makefile + +# no builtin rules/vars (CC, CXX, etc. 
are still defined but will be empty) +MAKEFLAGS += -r -R + +HDR = $(addsuffix .h,$(MODULES) $(HEADERS)) +SRC = $(addsuffix .c,$(MODULES) $(SOURCES)) +ASM = $(addsuffix .S,$(ASSMBLY)) +OBJ = $(addsuffix .o,$(MODULES) $(SOURCES) $(ASSEMBLY)) +DEP = Makefile.common Makefile $(HDR) $(EXTERNDEP) $(EXTERNSRC) +BIN = $(addsuffix .out,$(TARGETS)) +DIS = $(addsuffix .dis,$(TARGETS)) + +# clang: +# EXTRA="-Rpass=loop-vectorize" # IDs loops that were successfully V-ed +# EXTRA="-Rpass-missed=loop-vectorize" # IDs loops that failed V +# EXTRA="-Rpass-analysis=loop-vectorize" # IDs the statements that caused V to fail +# EXTRA="-Rpass=\ *" # remarks for all passes +# other passes: https://llvm.org/docs/Passes.html + +O ?= rg + +# predefined OPT: make O={rg,r,0g,3g,p,0s,3s,cov,mc,hc,wn,stk} +ifeq ($O,rg) # make O=rg +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector +else ifeq ($O,r) # make O=r (for release) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector +else ifeq ($O,ns) # make O=ns (no signal handlers) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector -DNOSIGNAL +else ifeq ($O,0g) # make O=0g +OPT ?= -g3 -O0 -fno-inline +else ifeq ($O,2g) # make O=2g +OPT ?= -g3 -O2 +else ifeq ($O,3g) # make O=3g +OPT ?= -g3 -O3 -flto -fno-inline +else ifeq ($O,p) # make O=p (profiling: rg+noinline) +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector -fno-inline +else ifeq ($O,0s) # make O=0s (address sanitizer) +OPT ?= -g3 -O0 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,3s) # make O=3s (address sanitizer) +OPT ?= -g3 -O3 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,t) # make O=0t (thread sanitizer) +OPT ?= -g3 -O1 -fno-inline -fsanitize=thread -fno-stack-protector +else ifeq ($O,cov) # make O=cov (for gcov) +OPT ?= -g3 -DNDEBUG -O0 --coverage +CCC = gcc +else ifeq ($O,mc) # make O=mc (for valgrind memcheck) +OPT ?= -g3 -O1 -fno-inline -DHEAPCHECKING +ARCH 
?= broadwell +else ifeq ($O,hc) # make O=hc (for gperftools heapcheck) +OPT ?= -g3 -O1 -fno-inline +LIB += tcmalloc +else ifeq ($O,wn) # more warning +OPT ?= -g3 -O3 -Wvla -Wformat=2 -Wconversion -Wstrict-prototypes -Wmissing-prototypes +else ifeq ($O,stk) # check stack usage with gcc +OPT ?= -g3 -O3 -DNDEBUG -fstack-usage +CCC = gcc +endif + +# malloc: g:glibc, t:tcmalloc, j:jemalloc +M ?= g + +ifeq ($M,t) + LIB += tcmalloc + FLG += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free +else ifeq ($M,j) + LIB += jemalloc +endif + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CHECK_S := -D__linux__ + LIB += rt +else ifeq ($(UNAME_S),FreeBSD) + CHECK_S := -D__FreeBSD__ + FLG += -I/usr/local/include -L/usr/local/lib + LIB += rt + LIB += execinfo + TPUT := /usr/local/bin/tput +else ifeq ($(UNAME_S),Darwin) + CHECK_S := -D__APPLE__ -D__MACH__ + # do nothing +else + $(error "Supported Platforms: Linux, FreeBSD, Darwin") +endif +TPUT ?= tput + +CCC ?= clang +CSTD = -std=gnu18 +XCC ?= clang++ +XSTD = -std=gnu++17 + +UNAME_M := $(shell uname -m) +ifeq ($(UNAME_M),aarch64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),arm64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),x86_64) + CHECK_M := -D__x86_64__ + ARCH ?= native +else ifeq ($(UNAME_M),amd64) # freebsd + CHECK_M := -D__x86_64__ + ARCH ?= native +else + $(error "Supported Platforms: aarch64, x86_64") +endif + +TUNE ?= native + +NBI += memcpy memmove memcmp + +# minimal requirement on x86_64: -march=nehalem +# minimal requirement on aarch64: -march=armv8-a+crc +FLG += -march=$(ARCH) -mtune=$(TUNE) +FLG += -pthread -Wall -Wextra -Wshadow #-Weverything +FLG += $(addprefix -fno-builtin-,$(NBI)) +FLG += $(OPT) + +ifneq ($(shell $(CCC) --version 2>/dev/null | grep clang),) +FLG += -ferror-limit=3 +CCCTYPE := clang +else ifneq ($(shell $(CCC) --version 
2>/dev/null | grep gcc),) +FLG += -fmax-errors=3 +FLG += -Wno-unknown-pragmas +CCCTYPE := gcc +else + $(error "Supported Compilers: clang, gcc") +endif + +ifeq ($(CCCTYPE),clang) + CCINST = /usr/lib/clang/$(shell $(CCC) --version 2>/dev/null | awk '/^clang/ { print $$3 }') + CCINC = $(CCINST)/include +else ifeq ($(CCCTYPE),gcc) + CCINST = /usr/lib/gcc/$(shell $(CCC) -dumpmachine)/$(shell $(CCC) -dumpversion) + CCINC = $(CCINST)/include $(CCINST)/include-fixed +endif +CCINC = /usr/include /usr/local/include + +ifneq ($(shell find $(CCINC) -name backtrace-supported.h 2>/dev/null),) + LIB += backtrace + FLG += -DBACKTRACE +endif + +ifneq ($(shell find $(CCINC) -name liburing.h 2>/dev/null),) + LIB += uring + FLG += -DLIBURING +endif + + +uniq = $(if $1,$(firstword $1) $(call uniq,$(filter-out $(firstword $1),$1))) +magentatxt := $(shell $(TPUT) setaf 5) +greentxt := $(shell $(TPUT) setaf 2) +bluetxt := $(shell $(TPUT) setaf 4) +normaltxt := $(shell $(TPUT) sgr0) + +.PHONY : bin dis def clean cleanx check tags + +bin : $(BIN) +dis : $(DIS) bin +.DEFAULT_GOAL = bin +.SECONDEXPANSION: + +ifeq ($(J),o) +# DANGER. Don't use unless it works! 
+# build from .o files but target-specific flags are missing in %.o : %.x +%.out : %.o $(OBJ) $$(addsuffix .o,$$(SRC-$$@) $$(MOD-$$@) $$(ASM-$$@)) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ $^ $(ALLLIB) +# +else # default: all-in-one command +%.out : %.c $(SRC) $(ASM) $(DEP) $$(DEP-$$@) $$(addsuffix .c,$$(SRC-$$@) $$(MOD-$$@)) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) $$(addsuffix .S,$$(ASM-$$@)) + $(eval ALLSRC := $(SRC) $(addsuffix .c,$(SRC-$@) $(MOD-$@)) $(ASM) $(addsuffix .S,$(ASM-$@))) + $(eval UNIQSRC := $(call uniq,$(ALLSRC))) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + @printf '$(bluetxt)$@$(magentatxt) <= $(greentxt)$< $(UNIQSRC)$(normaltxt)\n' + $(CCC) $(ALLFLG) -o $@ $< $(UNIQSRC) $(ALLLIB) +# +endif + + +%.dis : %.out + objdump -SlwtC $< 1>$@ 2>/dev/null + +%.o : %.cc $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(XCC) $(XSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.S $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.s : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) -S -o $@ -c $< + +def : + $(CCC) $(FLG) -dM -E - = "h"; the iter will be placed on "hello" + r = wh_iter_valid(iter); // r == true; You should always check if iter is valid after a seek() and skip() + r = wh_iter_peek(iter, buf, 6, &len_out, NULL, 0, NULL); // only need the key: will get "hello" and 5 + r = wh_iter_peek(iter, NULL, 0, NULL, buf, 6, &len_out); // only need the value: will get "world!" 
and 6 + // (you can also get both key and value using one call with two buffers) + wh_iter_skip1(iter); // skip the current key; equivalent to wh_iter_skip(iter, 1); + r = wh_iter_valid(iter); // r == false; already passed the end of the dataset + wh_iter_park(iter); // an iter may hold locks; It's a good manner to "park" the iter before sleep. + sleep(10); // not interacting with the wormhole instance. + wh_iter_seek(iter, NULL, 0); // need to do another seek to reactivate the iter + r = wh_iter_valid(iter); // r == true; on the zero-sized key now + wh_iter_destroy(iter); // now we're done with the iter + wh_del(ref, "hello", 5); // delete a key + wh_del(ref, NULL, NULL); // delete the zero-sized key + wh_unref(ref); // the current thread is no longer interested in accessing the index + wh_destroy(wh); // fully destroy the index; all references should have been released before calling this +} +``` + +## Integer keys + +Wormhole supports binary keys, which means you don't need to print integers into text when using Wormhole to index integer keys. +Here are some quick examples for using Wormhole as an integer-key index. A little-endian CPU is assumed. + +```C +{ + // 32-bit unsigned integer keys + u32 key = __builtin_bswap32(1000); // reverse byte order of key 1000 + wh_put(ref, &key, 4, NULL, 0); + key = __builtin_bswap32(2000); // reverse byte order of key 2000 +    wh_put(ref, &key, 4, NULL, 0); + struct wormhole_iter * iter = wh_iter_create(ref); + key = __builtin_bswap32(999); + wh_iter_seek(iter, &key, 4); // seek 999 + u32 key_out, len_out; + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 1000 in key_out in reversed byte order + wh_iter_skip1(iter); + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 2000 in key_out in reversed byte order +} +``` + +# Advanced APIs + +If the simple and thread-safe `wh_*` interface already meets your performance requirements, You don't need to read the following sections. 
+Using the `wormhole_*` and `whunsafe_*` APIs can maximize the efficiency of your code with a roughly 5%-10% speedup. +However, inefficient use of these APIs, such as repeatedly calling malloc() to prepare the key buffer, can easily hurt the performance. + +## `struct kv` and `struct kref` + +There are a handful of helper functions (`kv_*` and `kref_*` functions) at the beginning of wh.h. +It's worth noting that the *key's hash* (`hash` of `struct kv` and `hash32` of `struct kref`) +must be up-to-date before passed to wormhole. +The `kv_refill*` helper functions internally update the hash after filling the kv contents. +In a more general case, `kv_update_hash` directly updates a `struct kv`'s hash. +Similarly, `kref_refill_hash32()` calculates the 32-bit hash for `struct kref`. +Performing the hash calculation at the client side can achieve the best efficiency on the server (the index operations). + +## The Wormhole API + +`concbench.c` and `stresstest.c` are examples of how to use a Wormhole index. +There are three sets of Wormhole API: `whsafe`, `wormhole`, and `whunsafe`. +* `whsafe`: The *worry-free* thread-safe API. If you use Wormhole in a concurrent environment and want minimal complexity in your code, you should use `whsafe`. +* `wormhole`: The standard thread-safe API. It offers better efficiency than `whsafe` but requires some extra effort for blocking prevention. +* `whunsafe`: the thread-unsafe API. It offers the best speed and efficiency but does not perform internal concurrency control. +External synchronization should be employed when accessing `whunsafe` in a concurrent environment. + +The functions of each API can be found near the end of `wh.c` (search `kvmap_api_whsafe`, `kvmap_api_wormhole`, and `kvmap_api_whunsafe`). +Note that each API contains a mix of `whsafe_*`, `wormhole_*`, and `whunsafe_*` functions. + +### The `whsafe` API +The `whsafe` API functions are listed in the `kvmap_api_whsafe` structure in `wh.c`. 
The API consists of a mix of `wormhole_*` and `whsafe_*` functions. + +The index operations (GET, SET, UPDATE, DEL, PROBE, INPLACE, MERGE, and SCAN (`wormhole_iter_*` functions)) are all *thread safe*. +A thread needs to hold a reference of the index (_wormref_) to perform safe index operations. + +An example of using point-query operations using the `whsafe` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = whsafe_ref(wh); + for (...) { + whsafe_put(ref, ...); + whsafe_get(ref, ...); + whsafe_del(ref, ...); + ... // other safe operations + } + ... // other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = whsafe_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + whsafe_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + whsafe_iter_park(iter); // Release resources to avoid blocking other threads + // it's now safe to do something such as sleep() or waitpid() + // ... start using the iterator again + whsafe_iter_seek(iter, key2); + // ... other iter operations + whsafe_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### The `wormhole` API +Similar to `whsafe`, `wormhole` is also thread safe. It's often faster than `whsafe` but requires extra caution when using it. + +An example of using point-query operations using the `wormhole` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = wormhole_ref(wh); + for (...) { + wormhole_put(ref, ...); + wormhole_get(ref, ...); + wormhole_del(ref, ...); + ... // other safe operations + } + ... 
// other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = wormhole_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + wormhole_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + wormhole_iter_park(iter); // Release resources to avoid blocking other threads + while (condition not met) { // See below for explanation + wormhole_refresh_qstate(ref); + } + // ... start using the iterator again + wormhole_iter_seek(iter, key2); + // ... other iter operations + wormhole_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### Avoid blocking writers when using the `wormhole` API +Wormhole internally uses QSBR RCU to synchronize readers/writers so every holder of a reference (`ref`) +needs to actively perform index operations. +An ref-holder, if not actively performing index operations, may block a writer thread that is performing split/merge operations. +(because of not periodically announcing its quiescent state). +If a ref-holder is about to become inactive from Wormhole's perspective (doing something else or just sleeping), +it is recommended that the holder temporarily releases the `ref` before entering the inactive status (such as calling `sleep(10)`), +and reactivate the `ref` before performing the next index operation. 
+ +```C +{ + // assume we already have an active ref + wormhole_park(ref); // this will avoid blocking any other threads + sleep(10); + wormhole_resume(ref); // this will reactivate the ref + // continue to perform index operations +} +``` + +A common scenario of dead-locking is acquiring locks with an active wormhole reference, +The following example could cause deadlock between two threads. + +```C +// Thread A has an active ref and try to lock() +{ + struct wormref * ref = wormhole_ref(wh); + lock(just_a_lock); // << block here forever +} + +// Thread B already acquired the lock and wants to insert a key to wh +{ + lock(just_a_lock); + wormhole_put(ref, kv); << block here forever +} +``` + +To avoid this scenario, thread A should either call `wormhole_park(ref)` before acquiring the lock, or keep updating the qstate of the ref: +```C +// Solution A.1: use wormhole_park() +{ + struct wormref * ref = wormhole_ref(wh); + wormhole_park(ref); + lock(just_a_lock); + wormhole_resume(ref); // can use ref afterward +} + +// Solution A.2: use try_lock and wormhole_refresh_qstate() +{ + struct wormref * ref = wormhole_ref(wh); + while (!try_lock(just_a_lock)) { + wormhole_refresh_qstate(ref); + } + // continue to use ref +} +``` + +The above issues with QSBR are specific to the `wormhole` API. `whsafe` does not have these issues. + +### The `whunsafe` API +A set of *thread-unsafe* functions are also provided. See the functions with _prefix_ `whunsafe`. +The thread-unsafe functions don't use the reference (_wormref_). +Simply feed them with the pointer to the wormhole index: + +```C +{ + wh = whunsafe_create(NULL); + for (...) { + whunsafe_put(wh, ...); + whunsafe_get(wh, ...); + whunsafe_del(wh, ...); + ... // other unsafe operations + } + ... // other unsafe operations + wormhole_destroy(wh); +} +``` + +### In-place update with user-defined function +`wormhole_inp` executes a user-defined function on an existing key-value item. 
+If the key does not exist, a NULL pointer will be passed to the user-defined function. +A simple example would be incrementing a counter stored in a key-value pair. + +```C +{ + // user-defined in-place update function + void myadd1(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + (*pvalue)++; + } + } + + // create the counter + u64 zero = 0; + struct kv * tmp = kv_create("counter", 7, &zero, 8); // malloc-ed + wormhole_put(ref, tmp); + + // perform +1 on the stored value + struct kref kref = kv_ref(tmp); // create a kref of tmp + wormhole_inp(ref, &kref, myadd1, NULL); +} +``` + +Note that the user-defined function should ONLY change the value's content, and nothing else. +Otherwise, the index can be corrupted. +A similar mechanism is also provided for iterators (`wormhole_iter_inp`). + +The inplace function can also be used to retrieve key-value data. For example: + +```C +{ + void inplace_getu64(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + *(u64 *)priv = *pvalue; + } else { + *(u64 *)priv = 0; + } + } + ... + struct kref kref = ... + u64 val; + wormhole_inp(ref, &kref, inplace_getu64, &val); +} +``` + +### `merge`: atomic Read-Modify-Write +The `wormhole_merge` and `whsafe_merge` functions perform atomic Read-Modify-Write (RMW) operations. +In a RMW operation, if the search key is found, the KV pair will be passed to a user-defined callback function `uf` (short for user function). +Otherwise, a NULL pointer is passed to `uf`. +`uf` could update the KV in-place if it does not require any memory reallocation. +In such a case, `uf` should return the KV's pointer back and the merge function will do nothing else. +If `uf` want to replace the KV with something new, it should return a pointer that is different than the original KV pointer. +The `uf` should not make memory allocation by itself. 
+Instead, the `merge` function will copy the returned KV and replace the existing KV with the newly created one. +`uf` should not return NULL unless the key was not found. + +### Iterator +The `wormhole_iter_{seek,peek,skip,next,inp}` functions provide range-search functionalities. +If the search key does not exist, the `seek` operation will put the cursor on the item that is greater than the search-key. +`next` will return the item under the current cursor and move the cursor forward. +`peek` is similar but does not move the cursor. For example, with keys `{1,3,5}`, `seek(2); r = next()` will see `r == 3`. + +Currently Wormhole does not provide `seek_for_less_equal()` and `prev()` for backward scanning. This feature will be added in the future. + +# Memory management + +By default, Wormhole manages all the key-value data internally and only copies to or from a user-supplied +buffer (a `struct kv` object). +This draws a clear boundary in the memory space between the index structure and its users. +After a call to any of the index operations, the caller can immediately free +the buffer holding the key-reference or the key-value data. +This also allows users to use stack-allocated variables to interact with Wormhole. + +The memory manager of the internal key-value objects can be customized when creating a new Wormhole (see `wormhole_create`). +The customization will _only_ affect the internal `struct kv` objects. +Actually, the memory manager can be configured to directly use the caller's `struct kv` object and store it in Wormhole. +This `struct kvmap_mm` structure shows an example: + +```C +{ + const struct kvmap_mm kvmap_mm_ualloc { + .in = kvmap_mm_in_noop, // in wormhole_put(), store caller's kv in wh + .out = kvmap_mm_out_dup, // but still make a copy in wormhole_get() + .free = kvmap_mm_free_free, // call free() for delete/update + }; + ... + struct wormhole * wh = wormhole_create(&kvmap_mm_ualloc); + struct wormref * ref = wormhole_ref(wh); + ... 
+ struct kv * newkv = malloc(size); + ... + wormhole_put(ref, newkv); + // Don't free newkv! it's now managed by wh +} +``` + +Each of the in/out/free functions can be freely customized. +A few `kvmap_mm_*` functions are already provided for common scenarios. +`kvmap_mm_ndf` is identical to the `kvmap_mm_ualloc` structure in the above example. + +## Hugepages +Wormhole uses hugepages when available. To reserve some hugepages in Linux (10000 * 2MB): + + # echo 10000 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +# Tuning + +A few macros in `wh.c` can be tuned. + +* `WH_SLABLEAF_SIZE` controls the slab size for leaf node allocation. +The default is `((1lu << 21))` (2MB slabs). If 1GB hugepages are available, `WH_SLABLEAF_SIZE` can be set to `((1lu << 30))` to utilize 1GB hugepages. +Using 1GB hugepages can improve search performance on a large dataset. + +* `WH_KPN` controls "Keys Per (leaf-)Node". The default value is 128. +Compared to the default, `WH_KPN=256` can offer 5-10%+ higher point query and update speed. +However, range queries prefer a smaller node size such as 64. + + +* `QSBR_STATES_NR` and `QSBR_SHARDS_NR` control the capacity (number of active references) of the QSBR RCU. +The product of the two values is the capacity. For efficiency, `QSBR_STATES_NR` can be set to 23, 39, and 55, and `QSBR_SHARDS_NR` must be 2^n, n<=6. +The defaults are 23 and 32, respectively. The QSBR registry can run out of space if there are a few hundred of threads, which is not a problem in practice. + +# Limitations + +## Key Patterns +A **split** operation will fail when **129** (`WH_KPN + 1`) keys share a common prefix of 65535+ bytes. +In Wormhole, the maximum _anchor-key_ length is 65535 (2^16) bytes, which is shorter than the maximum key-length (2^32). + +## Memory Allocation +Insertions/updates can fail and return false when a memory allocation fails. +On memory-allocation failure, the hash-table expansion function will block and wait for available memory. 
 + +# Performance +Some benchmarking results with some real-world datasets: See [this](https://github.com/wuxb45/wormhole/issues/5) page for more information. + +![Concurrent GET](https://user-images.githubusercontent.com/564235/112712778-704d7200-8e9f-11eb-9f4d-795de46772d1.png) diff --git a/MassTrie-beta/wormhole/README.txt b/MassTrie-beta/wormhole/README.txt new file mode 100644 index 00000000..e70108ef --- /dev/null +++ b/MassTrie-beta/wormhole/README.txt @@ -0,0 +1,31 @@ +To set up the project: + +If you're not already in the folder 'wormhole', perform: + +1. cd wormhole + +Once you're there, set the variable LD_LIBRARY_PATH to the +current working directory using: + +2. setenv LD_LIBRARY_PATH `pwd` + +You can check (optionally) that this operation was executed properly using: + +3. echo $LD_LIBRARY_PATH + + +Then, do: + +4. cd sto + +5. ./bootstrap.sh + +6. ./configure + +To run the test file do: + +7. make unit-test_MTrie + +Then run it using: + +8. ./unit-test_MTrie diff --git a/MassTrie-beta/wormhole/concbench.c b/MassTrie-beta/wormhole/concbench.c new file mode 100644 index 00000000..f18abde9 --- /dev/null +++ b/MassTrie-beta/wormhole/concbench.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018-2019 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "lib.h" +#include "kv.h" +#include "wh.h" + +atomic_uint_least64_t __seqno = 0; +u64 __nth = 0; +struct kv ** __samples = NULL; +u64 __nkeys = 0; +atomic_uint_least64_t __tot = 0; +u64 __endtime = 0; + + static void * +kv_load_worker(struct wormhole * const wh) +{ + srandom_u64(time_nsec() * time_nsec()); + struct wormref * const ref = wormhole_ref(wh); + const u64 seq = atomic_fetch_add(&__seqno, 1); + const u64 n0 = __nkeys / __nth * seq; + const u64 nz = (seq == (__nth - 1)) ? 
__nkeys : (__nkeys / __nth * (seq + 1)); + printf("load worker %lu %lu\n", n0, nz); + for (u64 i = n0; i < nz; i++) + wormhole_put(ref, __samples[i]); + wormhole_unref(ref); + return NULL; +} + + static void * +kv_probe_worker(struct wormhole * const wh) +{ + struct wormref * const ref = wormhole_ref(wh); + struct kv * next = __samples[random_u64() % __nkeys]; + u64 rnext = random_u64() % __nkeys; + u64 count = 0; + u64 succ = 0; +#define BATCH ((10000)) + do { + for (u64 i = 0; i < BATCH; i++) { + // reading kv samples leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = __samples[rnext]; + __builtin_prefetch(next, 0, 0); + __builtin_prefetch(((u8 *)next) + 64, 0, 0); + rnext = random_u64() % __nkeys; + __builtin_prefetch(&(__samples[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const struct kref kref = kv_kref(key); + if (wormhole_probe(ref, &kref)) + succ++; + } + count += BATCH; + } while (time_nsec() < __endtime); + if (count != succ) + printf("count %lu success %lu\n", count, succ); + (void)atomic_fetch_add(&__tot, count); + wormhole_unref(ref); + return NULL; +} + + int +main(int argc, char ** argv) +{ + if (argc < 3) { + printf("usage: <#keys> <#threads>\n"); + printf(" Get words.txt: wget https://github.com/dwyl/english-words/raw/master/words.txt\n"); + printf(" Example: %s words.txt 1000000 4\n", argv[0]); + printf(" Better to use only one numa node with numactl -N 0\n"); + printf(" Better to run X thread on X cores\n"); + return 0; + } + + char ** const words = malloc(sizeof(char *) * 1000000); // or `wc -l words.txt` + u64 nr_words = 0; + char * buf = malloc(8192); + size_t bufsize = 8192; + FILE * const fwords = fopen(argv[1], "r"); + if (fwords == NULL) { + printf("open words file failed\n"); + return 0; + } + + // read all words to words + while (getline(&buf, &bufsize, fwords) > 0) { + buf[strlen(buf)-1] = 
'\0'; + words[nr_words] = strdup(buf); + nr_words++; + } + fclose(fwords); + + // generate keys + const u64 nkeys = strtoull(argv[2], NULL, 10); + struct kv ** const samples = malloc(sizeof(struct kv *) * nkeys); + char * ss[6]; + for (u64 i = 0; i < nkeys; i++) { + for (u64 j = 0; j < 6; j++) + ss[j] = words[random() % nr_words]; + sprintf(buf, "%s %s %s %s %s %s!", ss[0], ss[1], ss[2], ss[3], ss[4], ss[5]); + samples[i] = kv_create_str(buf, NULL, 0); + } + // free words & buf + for (u64 i = 0; i < nr_words; i++) + free(words[i]); + free(words); + free(buf); + + // load (4) + __samples = samples; + __nkeys = nkeys; + struct wormhole * const wh = wormhole_create(NULL); + __nth = 4; + const u64 dtl = thread_fork_join(4, (void *)kv_load_worker, false, (void *)wh); + printf("load x4 %.2lf mops\n", ((double)nkeys) * 1e3 / ((double)dtl)); + + const u64 nth = strtoull(argv[3], NULL, 10); + printf("probe with %lu threads. each round takes 3 seconds\n", nth); + for (u64 i = 0; i < 3; i++) { + __tot = 0; + __endtime = time_nsec() + 3e9; // 3 sec + const u64 dt = thread_fork_join(nth, (void *)kv_probe_worker, false, (void *)wh); + const double mops = ((double)__tot) * 1e3 / ((double)dt); + printf("probe x%lu %.2lf mops\n", nth, mops); + sleep(1); + } + + // final clean up for valgrind + for (u64 i = 0; i < nkeys; i++) + free(samples[i]); + free(samples); + wormhole_destroy(wh); + return 0; +} diff --git a/MassTrie-beta/wormhole/concbench.out b/MassTrie-beta/wormhole/concbench.out new file mode 100644 index 00000000..ee87ca31 Binary files /dev/null and b/MassTrie-beta/wormhole/concbench.out differ diff --git a/MassTrie-beta/wormhole/ctypes.h b/MassTrie-beta/wormhole/ctypes.h new file mode 100644 index 00000000..314ca5dc --- /dev/null +++ b/MassTrie-beta/wormhole/ctypes.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +// C types only; C++ source code don't use this + +#include +#include + +/* C11 atomic types */ +typedef atomic_bool abool; + +typedef atomic_uchar au8; +typedef atomic_ushort au16; +typedef atomic_uint au32; +typedef atomic_ulong au64; +static_assert(sizeof(au8) == 1, "sizeof(au8)"); +static_assert(sizeof(au16) == 2, "sizeof(au16)"); +static_assert(sizeof(au32) == 4, "sizeof(au32)"); +static_assert(sizeof(au64) == 8, "sizeof(au64)"); + +typedef atomic_char as8; +typedef atomic_short as16; +typedef atomic_int as32; +typedef atomic_long as64; +static_assert(sizeof(as8) == 1, "sizeof(as8)"); +static_assert(sizeof(as16) == 2, "sizeof(as16)"); +static_assert(sizeof(as32) == 4, "sizeof(as32)"); +static_assert(sizeof(as64) == 8, "sizeof(as64)"); + +// shorten long names +#define MO_RELAXED memory_order_relaxed +#define MO_CONSUME memory_order_consume +#define MO_ACQUIRE memory_order_acquire +#define MO_RELEASE memory_order_release +#define MO_ACQ_REL memory_order_acq_rel +#define MO_SEQ_CST memory_order_seq_cst diff --git a/MassTrie-beta/wormhole/easydemo.c b/MassTrie-beta/wormhole/easydemo.c new file mode 100644 index 00000000..f095a6ac --- /dev/null +++ b/MassTrie-beta/wormhole/easydemo.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#define _GNU_SOURCE +#include + +#include "lib.h" +#include "kv.h" +#include "wh.h" + + int +main(int argc, char ** argv) +{ + (void)argc; + (void)argv; + struct wormhole * const wh = wh_create(); + struct wormref * const ref = wh_ref(wh); + + bool r; + + r = wh_put(ref, "wormhole", 8, "easy", 4); + printf("wh_put wormhole easy %c\n", r?'T':'F'); + + r = wh_put(ref, "time_travel", 11, "impossible", 10); + printf("wh_put time_travel impossible %c\n", r?'T':'F'); + + r = wh_del(ref, "time_travel", 11); + printf("wh_del time_travel %c\n", r?'T':'F'); + + r = wh_probe(ref, "time_travel", 11); + printf("wh_probe time_travel %c\n", r?'T':'F'); + + u32 klen_out = 0; + char kbuf_out[8] = {}; + u32 vlen_out = 0; + char vbuf_out[8] = {}; + r = wh_get(ref, "wormhole", 8, vbuf_out, 8, &vlen_out); + printf("wh_get wormhole %c %u %.*s\n", r?'T':'F', vlen_out, vlen_out, vbuf_out); + + // in a concurrent environment, the kvmap_api_wormhole need park&resume when a thread is about to go idle + // don't need park&resume if you're using the default kvmap_api_whsafe in whwh.c! 
+ wh_park(ref); + usleep(10); + wh_resume(ref); + + // prepare a few keys for range ops + wh_put(ref, "00", 2, "0_value", 7); + wh_put(ref, "11", 2, "1_value", 7); + wh_put(ref, "22", 2, "2_value", 7); + + struct wormhole_iter * const iter = wh_iter_create(ref); + + wh_iter_seek(iter, NULL, 0); // seek to the head + printf("wh_iter_seek \"\"\n"); + while (wh_iter_valid(iter)) { + r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, vbuf_out, 8, &vlen_out); + if (r) { + printf("wh_iter_peek klen=%u key=%.*s vlen=%u value=%.*s\n", + klen_out, klen_out, kbuf_out, vlen_out, vlen_out, vbuf_out); + } else { + printf("ERROR!\n"); + } + wh_iter_skip1(iter); + } + + // call iter_park if you will go idle but want to use the iter later + // don't need to call iter_park if you're actively using iter + wh_iter_park(iter); + usleep(10); + + wh_iter_seek(iter, "0", 1); + printf("wh_iter_seek \"0\"\n"); + // this time we don't want to copy the value + r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, NULL, 0, NULL); + if (r){ + printf("wh_iter_peek klen=%u key=%.*s\n", klen_out, klen_out, kbuf_out); + } else { + printf("ERROR: iter_peek failed\n"); + } + + wh_iter_destroy(iter); + // there must be no active iter when calling unref() + wh_unref(ref); + + // unsafe operations: should have released all references + wh_clean(wh); // just for demonstration + wh_destroy(wh); // destroy also calls clean interally + return 0; +} diff --git a/MassTrie-beta/wormhole/easydemo.out b/MassTrie-beta/wormhole/easydemo.out new file mode 100644 index 00000000..32521210 Binary files /dev/null and b/MassTrie-beta/wormhole/easydemo.out differ diff --git a/MassTrie-beta/wormhole/kv.c b/MassTrie-beta/wormhole/kv.c new file mode 100644 index 00000000..a1720e88 --- /dev/null +++ b/MassTrie-beta/wormhole/kv.c @@ -0,0 +1,1131 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#define _GNU_SOURCE + +// headers {{{ +#include // static_assert +#include +#include "lib.h" +#include "ctypes.h" +#include "kv.h" +// }}} headers + +// crc32c {{{ + inline u32 +kv_crc32c(const void * const ptr, u32 len) +{ + return crc32c_inc((const u8 *)ptr, len, KV_CRC32C_SEED); +} + + inline u64 +kv_crc32c_extend(const u32 lo) +{ + const u64 hi = (u64)(~lo); + return (hi << 32) | ((u64)lo); +} +// }}} crc32c + +// kv {{{ + +// size {{{ + inline size_t +kv_size(const struct kv * const kv) +{ + return sizeof(*kv) + kv->klen + kv->vlen; +} + + inline size_t +kv_size_align(const struct kv * const kv, const u64 align) +{ + debug_assert(align && ((align & (align - 1)) == 0)); + return (sizeof(*kv) + kv->klen + kv->vlen + (align - 1)) & (~(align - 1)); +} + + inline size_t +key_size(const struct kv *const key) +{ + return sizeof(*key) + key->klen; +} + + inline size_t +key_size_align(const struct kv *const key, const u64 align) +{ + debug_assert(align && ((align & (align - 1)) == 0)); + return (sizeof(*key) + key->klen + (align - 1)) & (~(align - 1)); +} +// }}} size + +// construct {{{ + inline void +kv_update_hash(struct kv * const kv) +{ + const u32 lo = kv_crc32c((const void *)kv->kv, kv->klen); + kv->hash = kv_crc32c_extend(lo); +} + + inline void +kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen) +{ + debug_assert((vlen == 0) || value); + memcpy(&(kv->kv[kv->klen]), value, vlen); + kv->vlen = vlen; +} + + inline void +kv_refill(struct kv * const kv, const void * const key, const u32 klen, + const void * const value, const u32 vlen) +{ + debug_assert(kv); + kv->klen = klen; + memcpy(&(kv->kv[0]), key, klen); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_str(struct kv * const kv, const char * const key, + const void * const value, const u32 vlen) +{ + kv_refill(kv, key, (u32)strlen(key), value, vlen); +} + + inline void +kv_refill_str_str(struct kv * const kv, const char * const key, + 
const char * const value) +{ + kv_refill(kv, key, (u32)strlen(key), value, (u32)strlen(value)); +} + +// the u64 key is filled in big-endian byte order for correct ordering + inline void +kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen) +{ + kv->klen = sizeof(u64); + *(u64 *)(kv->kv) = __builtin_bswap64(key); // bswap on little endian + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen) +{ + kv->klen = 8; + strhex_32(kv->kv, hex); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen) +{ + kv->klen = 16; + strhex_64(kv->kv, hex); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex64_klen(struct kv * const kv, const u64 hex, + const u32 klen, const void * const value, const u32 vlen) +{ + strhex_64(kv->kv, hex); + if (klen > 16) { + kv->klen = klen; + memset(kv->kv + 16, '!', klen - 16); + } else { + kv->klen = 16; + } + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_kref(struct kv * const kv, const struct kref * const kref) +{ + kv->klen = kref->len; + kv->vlen = 0; + kv->hash = kv_crc32c_extend(kref->hash32); + memmove(kv->kv, kref->ptr, kref->len); +} + + inline void +kv_refill_kref_v(struct kv * const kv, const struct kref * const kref, + const void * const value, const u32 vlen) +{ + kv->klen = kref->len; + kv->vlen = vlen; + kv->hash = kv_crc32c_extend(kref->hash32); + memmove(kv->kv, kref->ptr, kref->len); + memcpy(kv->kv + kv->klen, value, vlen); +} + + inline struct kref +kv_kref(const struct kv * const key) +{ + return (struct kref){.ptr = key->kv, .len = key->klen, .hash32 = key->hashlo}; +} + + inline struct kv * +kv_create(const void * const key, const u32 klen, const void * const value, const u32 
vlen) +{ + struct kv * const kv = malloc(sizeof(*kv) + klen + vlen); + if (kv) + kv_refill(kv, key, klen, value, vlen); + return kv; +} + + inline struct kv * +kv_create_str(const char * const key, const void * const value, const u32 vlen) +{ + return kv_create(key, (u32)strlen(key), value, vlen); +} + + inline struct kv * +kv_create_str_str(const char * const key, const char * const value) +{ + return kv_create(key, (u32)strlen(key), value, (u32)strlen(value)); +} + + inline struct kv * +kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen) +{ + return kv_create(kref->ptr, kref->len, value, vlen); +} + +static struct kv __kv_null = {}; + +__attribute__((constructor)) + static void +kv_null_init(void) +{ + kv_update_hash(&__kv_null); +} + + inline const struct kv * +kv_null(void) +{ + return &__kv_null; +} +// }}} construct + +// dup {{{ + inline struct kv * +kv_dup(const struct kv * const kv) +{ + if (kv == NULL) + return NULL; + + const size_t sz = kv_size(kv); + struct kv * const new = malloc(sz); + if (new) + memcpy(new, kv, sz); + return new; +} + + inline struct kv * +kv_dup_key(const struct kv * const kv) +{ + if (kv == NULL) + return NULL; + + const size_t sz = key_size(kv); + struct kv * const new = malloc(sz); + if (new) { + memcpy(new, kv, sz); + new->vlen = 0; + } + return new; +} + + inline struct kv * +kv_dup2(const struct kv * const from, struct kv * const to) +{ + if (from == NULL) + return NULL; + const size_t sz = kv_size(from); + struct kv * const new = to ? to : malloc(sz); + if (new) + memcpy(new, from, sz); + return new; +} + + inline struct kv * +kv_dup2_key(const struct kv * const from, struct kv * const to) +{ + if (from == NULL) + return NULL; + const size_t sz = key_size(from); + struct kv * const new = to ? 
to : malloc(sz); + if (new) { + memcpy(new, from, sz); + new->vlen = 0; + } + return new; +} + + inline struct kv * +kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen) +{ + if (from == NULL) + return NULL; + debug_assert(plen <= from->klen); + const size_t sz = key_size(from) - from->klen + plen; + struct kv * const new = to ? to : malloc(sz); + if (new) { + new->klen = plen; + memcpy(new->kv, from->kv, plen); + new->vlen = 0; + kv_update_hash(new); + } + return new; +} +// }}} dup + +// compare {{{ + static inline int +klen_compare(const u32 len1, const u32 len2) +{ + if (len1 < len2) + return -1; + else if (len1 > len2) + return 1; + else + return 0; +} + +// compare whether the two keys are identical +// optimistic: do not check hash + inline bool +kv_match(const struct kv * const key1, const struct kv * const key2) +{ + //cpu_prefetch0(((u8 *)key2) + 64); + //return (key1->hash == key2->hash) + // && (key1->klen == key2->klen) + // && (!memcmp(key1->kv, key2->kv, key1->klen)); + return (key1->klen == key2->klen) && (!memcmp(key1->kv, key2->kv, key1->klen)); +} + +// compare whether the two keys are identical +// check hash first +// pessimistic: return false quickly if their hashes mismatch + inline bool +kv_match_hash(const struct kv * const key1, const struct kv * const key2) +{ + return (key1->hash == key2->hash) + && (key1->klen == key2->klen) + && (!memcmp(key1->kv, key2->kv, key1->klen)); +} + + inline bool +kv_match_full(const struct kv * const kv1, const struct kv * const kv2) +{ + return (kv1->kvlen == kv2->kvlen) + && (!memcmp(kv1, kv2, sizeof(*kv1) + kv1->klen + kv1->vlen)); +} + + bool +kv_match_kv128(const struct kv * const sk, const u8 * const kv128) +{ + debug_assert(sk); + debug_assert(kv128); + + u32 klen128 = 0; + u32 vlen128 = 0; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(kv128, &klen128), &vlen128); + (void)vlen128; + return (sk->klen == klen128) && (!memcmp(sk->kv, pdata, klen128)); +} + 
+ inline int +kv_compare(const struct kv * const kv1, const struct kv * const kv2) +{ + const u32 len = kv1->klen < kv2->klen ? kv1->klen : kv2->klen; + const int cmp = memcmp(kv1->kv, kv2->kv, (size_t)len); + return cmp ? cmp : klen_compare(kv1->klen, kv2->klen); +} + +// for qsort and bsearch + static int +kv_compare_ptrs(const void * const p1, const void * const p2) +{ + const struct kv * const * const pp1 = (typeof(pp1))p1; + const struct kv * const * const pp2 = (typeof(pp2))p2; + return kv_compare(*pp1, *pp2); +} + + int +kv_k128_compare(const struct kv * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + inline void +kv_qsort(struct kv ** const kvs, const size_t nr) +{ + qsort(kvs, nr, sizeof(kvs[0]), kv_compare_ptrs); +} + +// return the length of longest common prefix of the two keys + inline u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2) +{ + const u32 max = (key1->klen < key2->klen) ? key1->klen : key2->klen; + return memlcp(key1->kv, key2->kv, max); +} + +// return the length of longest common prefix of the two keys with a known lcp0 + inline u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0) +{ + const u32 max = (key1->klen < key2->klen) ? 
key1->klen : key2->klen; + debug_assert(max >= lcp0); + return lcp0 + memlcp(key1->kv+lcp0, key2->kv+lcp0, max-lcp0); +} +// }}} + +// psort {{{ + static inline void +kv_psort_exchange(struct kv ** const kvs, const u64 i, const u64 j) +{ + if (i != j) { + struct kv * const tmp = kvs[i]; + kvs[i] = kvs[j]; + kvs[j] = tmp; + } +} + + static u64 +kv_psort_partition(struct kv ** const kvs, const u64 lo, const u64 hi) +{ + if (lo >= hi) + return lo; + + const u64 p = (lo+hi) >> 1; + kv_psort_exchange(kvs, lo, p); + u64 i = lo; + u64 j = hi + 1; + do { + while (kv_compare(kvs[++i], kvs[lo]) < 0 && i < hi); + while (kv_compare(kvs[--j], kvs[lo]) > 0); + if (i >= j) + break; + kv_psort_exchange(kvs, i, j); + } while (true); + kv_psort_exchange(kvs, lo, j); + return j; +} + + static void +kv_psort_rec(struct kv ** const kvs, const u64 lo, const u64 hi, const u64 tlo, const u64 thi) +{ + if (lo >= hi) + return; + const u64 c = kv_psort_partition(kvs, lo, hi); + + if (c > tlo) // go left + kv_psort_rec(kvs, lo, c-1, tlo, thi); + + if (c < thi) // go right + kv_psort_rec(kvs, c+1, hi, tlo, thi); +} + + inline void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi) +{ + debug_assert(tlo <= thi); + debug_assert(thi < nr); + kv_psort_rec(kvs, 0, nr-1, tlo, thi); +} +// }}} psort + +// ptr {{{ + inline void * +kv_vptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[kv->klen])); +} + + inline void * +kv_kptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[0])); +} + + inline const void * +kv_vptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[kv->klen])); +} + + inline const void * +kv_kptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[0])); +} +// }}} ptr + +// print {{{ +// cmd "KV" K and V can be 's': string, 'x': hex, 'd': dec, or else for not printing. 
+// n for newline after kv + void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out) +{ + debug_assert(cmd); + const u32 klen = kv->klen; + fprintf(out, "#%016lx k[%3u]", kv->hash, klen); + + switch(cmd[0]) { + case 's': fprintf(out, " %.*s", klen, kv->kv); break; + case 'x': str_print_hex(out, kv->kv, klen); break; + case 'd': str_print_dec(out, kv->kv, klen); break; + default: break; + } + + const u32 vlen = kv->vlen; + switch (cmd[1]) { + case 's': fprintf(out, " v[%4u] %.*s", vlen, vlen, kv->kv+klen); break; + case 'x': fprintf(out, " v[%4u]", vlen); str_print_hex(out, kv->kv+klen, vlen); break; + case 'd': fprintf(out, " v[%4u]", vlen); str_print_dec(out, kv->kv+klen, vlen); break; + default: break; + } + if (strchr(cmd, 'n')) + fprintf(out, "\n"); +} +// }}} print + +// mm {{{ + struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv; +} + +// copy-out + struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out) +{ + (void)out; + return kv; +} + + void +kvmap_mm_free_noop(struct kv * const kv, void * const priv) +{ + (void)kv; + (void)priv; +} + +// copy-in + struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv_dup(kv); +} + +// copy-out + struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out) +{ + return kv_dup2(kv, out); +} + + void +kvmap_mm_free_free(struct kv * const kv, void * const priv) +{ + (void)priv; + free(kv); +} + +const struct kvmap_mm kvmap_mm_dup = { + .in = kvmap_mm_in_dup, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +const struct kvmap_mm kvmap_mm_ndf = { + .in = kvmap_mm_in_noop, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +// }}} mm + +// kref {{{ + inline void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = 0; +} + + inline void 
+kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = kv_crc32c(ptr, len); +} + + inline void +kref_update_hash32(struct kref * const kref) +{ + kref->hash32 = kv_crc32c(kref->ptr, kref->len); +} + + inline void +kref_ref_kv(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv->hashlo; +} + + inline void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv_crc32c(kv->kv, kv->klen); +} + + inline bool +kref_match(const struct kref * const k1, const struct kref * const k2) +{ + return (k1->len == k2->len) && (!memcmp(k1->ptr, k2->ptr, k1->len)); +} + +// match a kref and a key + inline bool +kref_kv_match(const struct kref * const kref, const struct kv * const k) +{ + return (kref->len == k->klen) && (!memcmp(kref->ptr, k->kv, kref->len)); +} + + inline int +kref_compare(const struct kref * const kref1, const struct kref * const kref2) +{ + const u32 len = kref1->len < kref2->len ? kref1->len : kref2->len; + const int cmp = memcmp(kref1->ptr, kref2->ptr, (size_t)len); + return cmp ? cmp : klen_compare(kref1->len, kref2->len); +} + +// compare a kref and a key + inline int +kref_kv_compare(const struct kref * const kref, const struct kv * const k) +{ + debug_assert(kref); + debug_assert(k); + const u32 len = kref->len < k->klen ? kref->len : k->klen; + const int cmp = memcmp(kref->ptr, k->kv, (size_t)len); + return cmp ? cmp : klen_compare(kref->len, k->klen); +} + + inline u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2) +{ + const u32 max = (k1->len < k2->len) ? k1->len : k2->len; + return memlcp(k1->ptr, k2->ptr, max); +} + + inline u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv) +{ + const u32 max = (kref->len < kv->klen) ? 
kref->len : kv->klen; + return memlcp(kref->ptr, kv->kv, max); +} + +// klen, key, ... + inline int +kref_k128_compare(const struct kref * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +// klen, vlen, key, ... + inline int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +static struct kref __kref_null = {.hash32 = KV_CRC32C_SEED}; + + inline const struct kref * +kref_null(void) +{ + return &__kref_null; +} +// }}} kref + +// kvref {{{ + inline void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv) +{ + ref->kptr = kv->kv; + ref->vptr = kv->kv + kv->klen; + ref->hdr = *kv; +} + + struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen + ref->hdr.vlen; + struct kv * const new = to ? to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + memcpy(new->kv + new->klen, ref->vptr, new->vlen); + return new; +} + + struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen; + struct kv * const new = to ? 
to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + return new; +} + + int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv) +{ + const u32 len = ref->hdr.klen < kv->klen ? ref->hdr.klen : kv->klen; + const int cmp = memcmp(ref->kptr, kv->kv, (size_t)len); + return cmp ? cmp : klen_compare(ref->hdr.klen, kv->klen); +} +// }}} kvref + +// kv128 {{{ +// estimate the encoded size + inline size_t +kv128_estimate_kv(const struct kv * const kv) +{ + return vi128_estimate_u32(kv->klen) + vi128_estimate_u32(kv->vlen) + kv->klen + kv->vlen; +} + +// create a kv128 from kv + u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize) +{ + u8 * const ptr = out ? out : malloc(kv128_estimate_kv(kv)); + if (!ptr) + return NULL; + + u8 * const pdata = vi128_encode_u32(vi128_encode_u32(ptr, kv->klen), kv->vlen); + memcpy(pdata, kv->kv, kv->klen + kv->vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + kv->klen + kv->vlen; + return ptr; // return the head of the encoded kv128 +} + +// dup kv128 to a kv + struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + struct kv * const ret = out ? 
out : malloc(sizeof(struct kv) + klen + vlen); + if (ret) + kv_refill(ret, pdata, klen, pdata + klen, vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + klen + vlen; + return ret; // return the kv +} + + inline size_t +kv128_size(const u8 * const ptr) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + return ((size_t)(pdata - ptr)) + klen + vlen; +} +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// registry {{{ +// increase MAX if need more +#define KVMAP_API_MAX ((32)) +static struct kvmap_api_reg kvmap_api_regs[KVMAP_API_MAX]; +static u64 kvmap_api_regs_nr = 0; + + void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api) +{ + if (kvmap_api_regs_nr < KVMAP_API_MAX) { + kvmap_api_regs[kvmap_api_regs_nr].nargs = nargs; + kvmap_api_regs[kvmap_api_regs_nr].name = name; + kvmap_api_regs[kvmap_api_regs_nr].args_msg = args_msg; + kvmap_api_regs[kvmap_api_regs_nr].create = create; + kvmap_api_regs[kvmap_api_regs_nr].api = api; + kvmap_api_regs_nr++; + } else { + fprintf(stderr, "%s failed to register [%s]\n", __func__, name); + } +} + void +kvmap_api_helper_message(void) +{ + fprintf(stderr, "%s Usage: api ...\n", __func__); + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + fprintf(stderr, "%s example: api %s %s\n", __func__, + kvmap_api_regs[i].name, kvmap_api_regs[i].args_msg); + } +} + + int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out) +{ + // "api" "name" "arg1", ... 
+ if (argc < 2 || strcmp(argv[0], "api") != 0) + return -1; + + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + const struct kvmap_api_reg * const reg = &kvmap_api_regs[i]; + if (0 != strcmp(argv[1], reg->name)) + continue; + + if ((argc - 2) < reg->nargs) + return -1; + + void * const map = reg->create(argv[1], mm, argv + 2); // skip "api" "name" + if (map) { + *api_out = reg->api; + *map_out = map; + return 2 + reg->nargs; + } else { + return -1; + } + } + + // no match + return -1; +} +// }}} registry + +// misc {{{ + void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv) +{ + // steal the kv pointer out so we don't need a dangerous get_key_interanl() + if (priv) + *(struct kv **)priv = kv; +} + + inline void * +kvmap_ref(const struct kvmap_api * const api, void * const map) +{ + return api->ref ? api->ref(map) : map; +} + +// return the original map pointer; usually unused by caller + inline void * +kvmap_unref(const struct kvmap_api * const api, void * const ref) +{ + return api->unref ? 
api->unref(ref) : ref; +} +// }}} misc + +// kvmap_kv_op {{{ + inline struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out) +{ + const struct kref kref = kv_kref(key); + return api->get(ref, &kref, out); +} + + inline bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv) +{ + return api->put(ref, kv); +} + + inline bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->del(ref, &kref); +} + + inline bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpw(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_merge_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->merge(ref, &kref, uf, priv); +} + + inline u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end) +{ + const struct kref kref0 = kv_kref(start); + if (end) { + const struct kref krefz = kv_kref(end); + return api->delr(ref, &kref0, &krefz); + } else { + return api->delr(ref, &kref0, NULL); + } +} + + inline void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * 
const key) +{ + const struct kref kref = kv_kref(key); + api->iter_seek(iter, &kref); +} +// }}} kvmap_kv_op + +// kvmap_raw_op {{{ + inline struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->get(ref, &kref, out); +} + + inline bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->probe(ref, &kref); +} + + inline bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->del(ref, &kref); +} + + inline bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpw(ref, &kref, uf, priv); +} + + inline void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? 
kv_crc32c(ptr, len) : 0}; + api->iter_seek(iter, &kref); +} +// }}}} kvmap_raw_op + +// dump {{{ + u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd) +{ + void * const ref = kvmap_ref(api, map); + void * const iter = api->iter_create(ref); + api->iter_seek(iter, kref_null()); + u64 i = 0; + while (api->iter_valid(iter)) { + struct kvref kvref; + api->iter_kvref(iter, &kvref); + dprintf(fd, "%010lu [%3u] %.*s [%u]\n", i, kvref.hdr.klen, kvref.hdr.klen, kvref.kptr, kvref.hdr.vlen); + i++; + api->iter_skip1(iter); + } + api->iter_destroy(iter); + kvmap_unref(api, ref); + return i; +} +// }}} dump + +// kv64 {{{ +struct kv64 { // internal only + struct kv kv; + u64 key_be; // must be in big endian + u64 value; +}; + + inline bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out) +{ + struct kv64 keybuf, kvout; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + struct kv * const ret = api->get(ref, &kref, &kvout.kv); + if (ret) { + *out = kvout.value; + return true; + } else { + return false; + } +} + + inline bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value) +{ + struct kv64 kv; + kv.key_be = __builtin_bswap64(key); + kv.value = value; + kv.kv.klen = sizeof(key); + kv.kv.vlen = sizeof(value); + if (api->hashkey) + kv_update_hash(&kv.kv); + + return api->put(ref, &kv.kv); +} + + inline bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); 
+ kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->del(ref, &kref); +} + + inline void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + api->iter_seek(iter, &kref); +} + + inline bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const value_out) +{ + struct kv64 kvout; + struct kv * const ret = api->iter_peek(iter, &kvout.kv); + if (key_out) + *key_out = __builtin_bswap64(kvout.key_be); // to LE + if (value_out) + *value_out = kvout.value; + return ret != NULL; +} +// }}} kv64 + +// }}} kvmap + +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/kv.h b/MassTrie-beta/wormhole/kv.h new file mode 100644 index 00000000..1e251e58 --- /dev/null +++ b/MassTrie-beta/wormhole/kv.h @@ -0,0 +1,554 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// crc32c {{{ +#define KV_CRC32C_SEED ((0xDEADBEEFu)) + + extern u32 +kv_crc32c(const void * const ptr, u32 len); + + extern u64 +kv_crc32c_extend(const u32 crc32c); +// }}} crc32c + +// kv {{{ + +// struct {{{ +/* + * Some internal union names can be ignored: + * struct kv { + * u32 klen; + * u32 vlen; + * u64 hash; + * u8 kv[]; + * }; + */ +struct kv { + union { // the first u64 + u64 kvlen; + struct { + u32 klen; + union { u32 vlen; u32 refcnt; }; + }; + }; + union { + u64 hash; // hashvalue of the key + u64 priv; // can hide a value here if hash is not used + void * privptr; + struct { u32 hashlo; u32 hashhi; }; // little endian + struct { u32 privlo; u32 privhi; }; + }; + u8 kv[0]; // len(kv) == klen + vlen +} __attribute__((packed)); + +struct kref { + u32 len; + union { u32 hash32; u32 priv; }; + const u8 * ptr; +} __attribute__((packed)); + +struct kvref { + const u8 * kptr; // read-only + const u8 * vptr; // read-only + struct kv hdr; // hdr.kv[] is invalid +}; +// }}} struct + +// kv {{{ +typedef int (*kv_kv_cmp_func)(const struct kv *, const struct kv *); + + extern size_t +kv_size(const struct kv * const kv); + + extern size_t +kv_size_align(const struct kv * const kv, const u64 align); + + extern size_t +key_size(const struct kv * const key); + + extern size_t +key_size_align(const struct kv * const key, const u64 align); + + extern void +kv_update_hash(struct kv * const kv); + + extern void +kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen); + + extern void +kv_refill(struct kv * const kv, const void * const key, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_str(struct kv * const kv, const char * const key, + const void * const value, const u32 vlen); + + extern void +kv_refill_str_str(struct kv * const kv, const char * const key, + const char * const value); + +// the u64 key is filled in big-endian byte order + 
extern void +kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen); + + extern void +kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64_klen(struct kv * const kv, const u64 hex, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_kref(struct kv * const kv, const struct kref * const kref); + + extern void +kv_refill_kref_v(struct kv * const kv, const struct kref * const kref, + const void * const value, const u32 vlen); + + extern struct kref +kv_kref(const struct kv * const key); + + extern struct kv * +kv_create(const void * const key, const u32 klen, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str(const char * const key, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str_str(const char * const key, const char * const value); + + extern struct kv * +kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen); + +// a static kv with klen == 0 + extern const struct kv * +kv_null(void); + + extern struct kv * +kv_dup(const struct kv * const kv); + + extern struct kv * +kv_dup_key(const struct kv * const kv); + + extern struct kv * +kv_dup2(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen); + + extern bool +kv_match(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_hash(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_full(const struct kv * const kv1, const struct kv * const kv2); + + extern bool +kv_match_kv128(const struct kv * const sk, 
const u8 * const kv128); + + extern int +kv_compare(const struct kv * const kv1, const struct kv * const kv2); + + extern int +kv_k128_compare(const struct kv * const sk, const u8 * const k128); + + extern int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128); + + extern void +kv_qsort(struct kv ** const kvs, const size_t nr); + + extern u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2); + + extern u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0); + + extern void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi); + + extern void * +kv_vptr(struct kv * const kv); + + extern void * +kv_kptr(struct kv * const kv); + + extern const void * +kv_vptr_c(const struct kv * const kv); + + extern const void * +kv_kptr_c(const struct kv * const kv); + + extern void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out); +// }}} kv + +// mm {{{ +typedef struct kv * (* kvmap_mm_in_func)(struct kv * kv, void * priv); +typedef struct kv * (* kvmap_mm_out_func)(struct kv * kv, struct kv * out); +typedef void (* kvmap_mm_free_func)(struct kv * kv, void * priv); + +// manage internal kv data of kvmap +struct kvmap_mm { + // to create a private copy of "kv" + // see put() functions + kvmap_mm_in_func in; + // to duplicate a private copy of "kv" to "out" + // see get() and iter_peek() functions + kvmap_mm_out_func out; + // to free a kv + // see del() and put() functions + kvmap_mm_free_func free; + void * priv; +}; + + extern struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out); + + extern void +kvmap_mm_free_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out); + + extern void 
+kvmap_mm_free_free(struct kv * const kv, void * const priv); + +// the default mm +extern const struct kvmap_mm kvmap_mm_dup; // in:Dup, out:Dup, free:Free +extern const struct kvmap_mm kvmap_mm_ndf; // in:Noop, out:Dup, free:Free +// }}} mm + +// ref {{{ +typedef int (*kref_kv_cmp_func)(const struct kref *, const struct kv *); + +// ptr and len only + extern void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len); + +// this calculates hash32 + extern void +kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len); + + extern void +kref_update_hash32(struct kref * const kref); + + extern void +kref_ref_kv(struct kref * const kref, const struct kv * const kv); + + extern void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv); + + extern bool +kref_match(const struct kref * const k1, const struct kref * const k2); + + extern bool +kref_kv_match(const struct kref * const kref, const struct kv * const k); + + extern int +kref_compare(const struct kref * const kref1, const struct kref * const kref2); + + extern int +kref_kv_compare(const struct kref * const kref, const struct kv * const k); + + extern u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2); + + extern u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv); + + extern int +kref_k128_compare(const struct kref * const sk, const u8 * const k128); + + extern int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128); + + extern const struct kref * +kref_null(void); + + extern void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv); + + extern struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to); + + extern struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to); + + extern int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv); +// }}} ref + +// kv128 {{{ + extern size_t +kv128_estimate_kv(const 
struct kv * const kv); + + extern u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize); + + extern struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize); + + extern size_t +kv128_size(const u8 * const ptr); +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// kvmap_api {{{ +typedef void (* kv_inp_func)(struct kv * const curr, void * const priv); + +// the merge function should: +// 1: return NULL if the origin kv is not changed at all +// 2: return kv0 if updates has been applied in-place +// 3: return a different kv if the original kv must be replaced +// In an in-memory kvmap, 2==1 and no further action is needed +// In a persistent kv store with a memtable, 2 will need an insertion if kv0 is not from the memtable +typedef struct kv * (* kv_merge_func)(struct kv * const kv0, void * const priv); + +struct kvmap_api { + // feature bits + bool hashkey; // true: caller needs to provide correct hash in kv/kref + bool ordered; // true: has iter_seek + bool threadsafe; // true: support thread_safe access + bool readonly; // true: no put() and del() + bool irefsafe; // true: iter's kref/kvref can be safely accessed after iter_seek/iter_skip/iter_park + bool unique; // provide unique keys, especially for iterators + bool refpark; // ref has park() and resume() + bool async; // XXX for testing KVell + + // put (aka put/upsert): return true on success; false on error + // mm.in() controls how things move into the kvmap; the default mm make a copy with malloc() + // mm.free() controls how old kv get disposed when replaced + bool (* put) (void * const ref, struct kv * const kv); + // get: search and return a kv if found, or NULL if not + // with the default mm: malloc() if out == NULL; otherwise, use out as buffer + // with custom kvmap_mm: mm.out() controls buffer; use with caution + // caller should use the returned ptr even if out is provided + struct kv * (* get) (void * const ref, const struct 
kref * const key, struct kv * const out); + // probe: return true on found, false on not found + bool (* probe) (void * const ref, const struct kref * const key); + // del: return true on something deleted, false on not found + // mm.free() controls how old kv get disposed when replaced + bool (* del) (void * const ref, const struct kref * const key); + // inp: inplace operation if key exists; otherwise return false; uf() is always executed even with NULL key + // inpr/inpw acquires r/w locks respectively. + // Note that in inpw() you can only change the value. + bool (* inpr) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + bool (* inpw) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + // merge: put+callback on old/new keys; another name: read-modify-write + // return true if successfull; return false on error + bool (* merge) (void * const ref, const struct kref * const key, kv_merge_func uf, void * const priv); + // delete-range: delete all keys from start (inclusive) to end (exclusive) + u64 (* delr) (void * const ref, const struct kref * const start, const struct kref * const end); + // make everything persist; for persistent maps only + void (* sync) (void * const ref); + + // general guidelines for thread-safe iters: + // - it is assumed that the key under the cursor is locked/freezed/immutable + // - once created one must call iter_seek to make it valid + // - the ownership of ref is given to the iter so ref should not be used until iter_destroy + // - creating and use more than one iter based on a ref can cause deadlocks + void * (* iter_create) (void * const ref); + // move the cursor to the first key >= search-key; + void (* iter_seek) (void * const iter, const struct kref * const key); + // check if the cursor points to a valid key + bool (* iter_valid) (void * const iter); + // return the current key; copy to out if (out != NULL) + // mm.out() controls copy-out + struct kv * 
(* iter_peek) (void * const iter, struct kv * const out); + // similar to peek but does not copy; return false if iter is invalid + bool (* iter_kref) (void * const iter, struct kref * const kref); + // similar to iter_kref but also provide the value + bool (* iter_kvref) (void * const iter, struct kvref * const kvref); + // iter_retain makes kref or kvref of the current iter remain valid until released + // the returned opaque pointer should be provided when releasing the hold + u64 (* iter_retain) (void * const iter); + void (* iter_release) (void * const iter, const u64 opaque); + // skip one element + void (* iter_skip1) (void * const iter); + // skip nr elements + void (* iter_skip) (void * const iter, const u32 nr); + // iter_next == iter_peek + iter_skip1 + struct kv * (* iter_next) (void * const iter, struct kv * const out); + // perform inplace opeation if the current key is valid; return false if no current key + // the uf() is always executed even with NULL key + bool (* iter_inp) (void * const iter, kv_inp_func uf, void * const priv); + // invalidate the iter to release any resources or locks + // afterward, must call seek() again before accessing data + void (* iter_park) (void * const iter); + // destroy iter + void (* iter_destroy) (void * const iter); + + // misc: + // create refs for maps if required; always use use kvmap_ref() and kvmap_unref() + // if there are ref/unref functions, ref-ptr should be used as map for all kv operations + void * (* ref) (void * map); + // return the original map + void * (* unref) (void * ref); + // pause access without unref; must call resume later before access index again + void (* park) (void * ref); + // resume access of ref; must be paired with a park() + void (* resume) (void * ref); + + // UNSAFE functions: + // empty the map + void (* clean) (void * map); + // erase everything + void (* destroy) (void * map); + // for debugging + void (* fprint) (void * map, FILE * const out); +}; + +// registry +struct 
kvmap_api_reg { + int nargs; // number of arguments after name + const char * name; + const char * args_msg; // see ...helper_message + // multiple apis may share one create function + // arguments: name (e.g., "rdb"), mm (usually NULL), the remaining args + void * (*create)(const char *, const struct kvmap_mm *, char **); + const struct kvmap_api * api; +}; + +// call this function to register a kvmap_api + extern void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api); + + extern void +kvmap_api_helper_message(void); + + extern int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out); +// }}} kvmap_api + +// helpers {{{ + extern void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv); + + extern void * +kvmap_ref(const struct kvmap_api * const api, void * const map); + + extern void * +kvmap_unref(const struct kvmap_api * const api, void * const ref); + + extern struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out); + + extern bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv); + + extern bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + 
const struct kv * const key, kv_merge_func uf, void * const priv); + + extern u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end); + + extern void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * const key); + + extern struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out); + + extern bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr); + + extern u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd); + + extern bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out); + + extern bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value); + + extern bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key); + + extern bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const 
value_out); +// }}} helpers + +// }}} kvmap + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/lib.c b/MassTrie-beta/wormhole/lib.c new file mode 100644 index 00000000..06d45f6d --- /dev/null +++ b/MassTrie-beta/wormhole/lib.c @@ -0,0 +1,3026 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include "lib.h" +#include "ctypes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // va_start + +#if defined(__linux__) +#include +#include // malloc_usable_size +#elif defined(__APPLE__) && defined(__MACH__) +#include +#include +#elif defined(__FreeBSD__) +#include +#include +#endif // OS + +#if defined(__FreeBSD__) +#include +#endif +// }}} headers + +// math {{{ + inline u64 +mhash64(const u64 v) +{ + return v * 11400714819323198485lu; +} + + inline u32 +mhash32(const u32 v) +{ + return v * 2654435761u; +} + +// From Daniel Lemire's blog (2013, lemire.me) + u64 +gcd64(u64 a, u64 b) +{ + if (a == 0) + return b; + if (b == 0) + return a; + + const u32 shift = (u32)__builtin_ctzl(a | b); + a >>= __builtin_ctzl(a); + do { + b >>= __builtin_ctzl(b); + if (a > b) { + const u64 t = b; + b = a; + a = t; + } + b = b - a; + } while (b); + return a << shift; +} +// }}} math + +// random {{{ +// Lehmer's generator is 2x faster than xorshift +/** + * D. H. Lehmer, Mathematical methods in large-scale computing units. + * Proceedings of a Second Symposium on Large Scale Digital Calculating + * Machinery; + * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146. + * + * P L'Ecuyer, Tables of linear congruential generators of different sizes and + * good lattice structure. Mathematics of Computation of the American + * Mathematical + * Society 68.225 (1999): 249-260. 
+ */ +struct lehmer_u64 { + union { + u128 v128; + u64 v64[2]; + }; +}; + +static __thread struct lehmer_u64 rseed_u128 = {.v64 = {4294967291, 1549556881}}; + + static inline u64 +lehmer_u64_next(struct lehmer_u64 * const s) +{ + const u64 r = s->v64[1]; + s->v128 *= 0xda942042e4dd58b5lu; + return r; +} + + static inline void +lehmer_u64_seed(struct lehmer_u64 * const s, const u64 seed) +{ + s->v128 = (((u128)(~seed)) << 64) | (seed | 1); + (void)lehmer_u64_next(s); +} + + inline u64 +random_u64(void) +{ + return lehmer_u64_next(&rseed_u128); +} + + inline void +srandom_u64(const u64 seed) +{ + lehmer_u64_seed(&rseed_u128, seed); +} + + inline double +random_double(void) +{ + // random between [0.0 - 1.0] + const u64 r = random_u64(); + return ((double)r) * (1.0 / ((double)(~0lu))); +} +// }}} random + +// timing {{{ + inline u64 +time_nsec(void) +{ + struct timespec ts; + // MONO_RAW is 5x to 10x slower than MONO + clock_gettime(CLOCK_MONOTONIC, &ts); + return ((u64)ts.tv_sec) * 1000000000lu + ((u64)ts.tv_nsec); +} + + inline double +time_sec(void) +{ + const u64 nsec = time_nsec(); + return ((double)nsec) * 1.0e-9; +} + + inline u64 +time_diff_nsec(const u64 last) +{ + return time_nsec() - last; +} + + inline double +time_diff_sec(const double last) +{ + return time_sec() - last; +} + +// need char str[64] + void +time_stamp(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F %T %z", &nowtm); +} + + void +time_stamp2(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F-%H-%M-%S%z", &nowtm); +} +// }}} timing + +// cpucache {{{ + inline void +cpu_pause(void) +{ +#if defined(__x86_64__) + _mm_pause(); +#elif defined(__aarch64__) + // nop +#endif +} + + inline void +cpu_mfence(void) +{ + atomic_thread_fence(MO_SEQ_CST); +} + +// compiler fence + inline void +cpu_cfence(void) +{ + 
atomic_thread_fence(MO_ACQ_REL); +} + + inline void +cpu_prefetch0(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 0); +} + + inline void +cpu_prefetch1(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 1); +} + + inline void +cpu_prefetch2(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 2); +} + + inline void +cpu_prefetch3(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 3); +} + + inline void +cpu_prefetchw(const void * const ptr) +{ + __builtin_prefetch(ptr, 1, 0); +} +// }}} cpucache + +// crc32c {{{ + inline u32 +crc32c_u8(const u32 crc, const u8 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u8(crc, v); +#elif defined(__aarch64__) + return __crc32cb(crc, v); +#endif +} + + inline u32 +crc32c_u16(const u32 crc, const u16 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u16(crc, v); +#elif defined(__aarch64__) + return __crc32ch(crc, v); +#endif +} + + inline u32 +crc32c_u32(const u32 crc, const u32 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u32(crc, v); +#elif defined(__aarch64__) + return __crc32cw(crc, v); +#endif +} + + inline u32 +crc32c_u64(const u32 crc, const u64 v) +{ +#if defined(__x86_64__) + return (u32)_mm_crc32_u64(crc, v); +#elif defined(__aarch64__) + return (u32)__crc32cd(crc, v); +#endif +} + + inline u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc) +{ + if (nr == 1) + return crc32c_u8(crc, buf[0]); + + crc = crc32c_u16(crc, *(u16 *)buf); + return (nr == 2) ? crc : crc32c_u8(crc, buf[2]); +} + + inline u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc) +{ + //debug_assert((nr & 3) == 0); + const u32 nr8 = nr >> 3; +#pragma nounroll + for (u32 i = 0; i < nr8; i++) + crc = crc32c_u64(crc, ((u64*)buf)[i]); + + if (nr & 4u) + crc = crc32c_u32(crc, ((u32*)buf)[nr8<<1]); + return crc; +} + + u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc) +{ + crc = crc32c_inc_x4(buf, nr, crc); + const u32 nr123 = nr & 3u; + return nr123 ? 
crc32c_inc_123(buf + nr - nr123, nr123, crc) : crc; +} +// }}} crc32c + +// debug {{{ + void +debug_break(void) +{ + usleep(100); +} + +static u64 * debug_watch_u64 = NULL; + + static void +watch_u64_handler(const int sig) +{ + (void)sig; + const u64 v = debug_watch_u64 ? (*debug_watch_u64) : 0; + fprintf(stderr, "[USR1] %lu (0x%lx)\n", v, v); +} + + void +watch_u64_usr1(u64 * const ptr) +{ + debug_watch_u64 = ptr; + struct sigaction sa = {}; + sa.sa_handler = watch_u64_handler; + sigemptyset(&(sa.sa_mask)); + sa.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for SIGUSR1\n"); + } else { + fprintf(stderr, "to watch> kill -s SIGUSR1 %d\n", getpid()); + } +} + +static void * debug_bt_state = NULL; +#if defined(BACKTRACE) && defined(__linux__) +// TODO: get exec path on MacOS and FreeBSD + +#include +static char debug_filepath[1024] = {}; + + static void +debug_bt_error_cb(void * const data, const char * const msg, const int errnum) +{ + (void)data; + if (msg) + dprintf(2, "libbacktrace: %s %s\n", msg, strerror(errnum)); +} + + static int +debug_bt_print_cb(void * const data, const uintptr_t pc, + const char * const file, const int lineno, const char * const func) +{ + u32 * const plevel = (typeof(plevel))data; + if (file || func || lineno) { + dprintf(2, "[%u]0x%012lx " TERMCLR(35) "%s" TERMCLR(31) ":" TERMCLR(34) "%d" TERMCLR(0)" %s\n", + *plevel, pc, file ? file : "???", lineno, func ? 
func : "???"); + } else if (pc) { + dprintf(2, "[%u]0x%012lx ??\n", *plevel, pc); + } + (*plevel)++; + return 0; +} + +__attribute__((constructor)) + static void +debug_backtrace_init(void) +{ + const ssize_t len = readlink("/proc/self/exe", debug_filepath, 1023); + // disable backtrace + if (len < 0 || len >= 1023) + return; + + debug_filepath[len] = '\0'; + debug_bt_state = backtrace_create_state(debug_filepath, 1, debug_bt_error_cb, NULL); +} +#endif // BACKTRACE + + static void +debug_wait_gdb(void * const bt_state) +{ + if (bt_state) { +#if defined(BACKTRACE) + dprintf(2, "Backtrace :\n"); + u32 level = 0; + backtrace_full(debug_bt_state, 1, debug_bt_print_cb, debug_bt_error_cb, &level); +#endif // BACKTRACE + } else { // fallback to execinfo if no backtrace or initialization failed + void *array[64]; + const int size = backtrace(array, 64); + dprintf(2, "Backtrace (%d):\n", size - 1); + backtrace_symbols_fd(array + 1, size - 1, 2); + } + + abool v = true; + char timestamp[32]; + time_stamp(timestamp, 32); + char threadname[32] = {}; + thread_get_name(pthread_self(), threadname, 32); + strcat(threadname, "(!!)"); + thread_set_name(pthread_self(), threadname); + char hostname[32]; + gethostname(hostname, 32); + + const char * const pattern = "[Waiting GDB] %1$s %2$s @ %3$s\n" + " Attach me: " TERMCLR(31) "sudo -Hi gdb -p %4$d" TERMCLR(0) "\n"; + char buf[256]; + sprintf(buf, pattern, timestamp, threadname, hostname, getpid()); + write(2, buf, strlen(buf)); + + // to continue: gdb> set var v = 0 + // to kill from shell: $ kill %pid; kill -CONT %pid + + // uncomment this line to surrender the shell on error + // kill(getpid(), SIGSTOP); // stop burning cpu, once + + static au32 nr_waiting = 0; + const u32 seq = atomic_fetch_add_explicit(&nr_waiting, 1, MO_RELAXED); + if (seq == 0) { + sprintf(buf, "/run/user/%u/.debug_wait_gdb_pid", getuid()); + const int pidfd = open(buf, O_CREAT|O_TRUNC|O_WRONLY, 00644); + if (pidfd >= 0) { + dprintf(pidfd, "%u", getpid()); + 
close(pidfd); + } + } + +#pragma nounroll + while (atomic_load_explicit(&v, MO_CONSUME)) + sleep(1); +} + +#ifndef NDEBUG + void +debug_assert(const bool v) +{ + if (!v) + debug_wait_gdb(debug_bt_state); +} +#endif + +__attribute__((noreturn)) + void +debug_die(void) +{ + debug_wait_gdb(debug_bt_state); + exit(0); +} + +__attribute__((noreturn)) + void +debug_die_perror(void) +{ + perror(NULL); + debug_die(); +} + +#if !defined(NOSIGNAL) +// signal handler for wait_gdb on fatal errors + static void +wait_gdb_handler(const int sig, siginfo_t * const info, void * const context) +{ + (void)info; + (void)context; + char buf[64] = "[SIGNAL] "; + strcat(buf, strsignal(sig)); + write(2, buf, strlen(buf)); + debug_wait_gdb(NULL); +} + +// setup hooks for catching fatal errors +__attribute__((constructor)) + static void +debug_init(void) +{ + void * stack = pages_alloc_4kb(16); + //fprintf(stderr, "altstack %p\n", stack); + stack_t ss = {.ss_sp = stack, .ss_flags = 0, .ss_size = PGSZ*16}; + if (sigaltstack(&ss, NULL)) + fprintf(stderr, "sigaltstack failed\n"); + + struct sigaction sa = {.sa_sigaction = wait_gdb_handler, .sa_flags = SA_SIGINFO | SA_ONSTACK}; + sigemptyset(&(sa.sa_mask)); + const int fatals[] = {SIGSEGV, SIGFPE, SIGILL, SIGBUS, 0}; + for (int i = 0; fatals[i]; i++) { + if (sigaction(fatals[i], &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for %s\n", strsignal(fatals[i])); + fflush(stderr); + } + } +} + +__attribute__((destructor)) + static void +debug_exit(void) +{ + // to get rid of valgrind warnings + stack_t ss = {.ss_flags = SS_DISABLE}; + stack_t oss = {}; + sigaltstack(&ss, &oss); + if (oss.ss_sp) + pages_unmap(oss.ss_sp, PGSZ * 16); +} +#endif // !defined(NOSIGNAL) + + void +debug_dump_maps(FILE * const out) +{ + FILE * const in = fopen("/proc/self/smaps", "r"); + char * line0 = yalloc(1024); + size_t size0 = 1024; + while (!feof(in)) { + const ssize_t r1 = getline(&line0, &size0, in); + if (r1 < 0) break; + fprintf(out, "%s", 
line0); + } + free(line0); + fflush(out); + fclose(in); +} + +static pid_t perf_pid = 0; + +#if defined(__linux__) +__attribute__((constructor)) + static void +debug_perf_init(void) +{ + const pid_t ppid = getppid(); + char tmp[256] = {}; + sprintf(tmp, "/proc/%d/cmdline", ppid); + FILE * const fc = fopen(tmp, "r"); + const size_t nr = fread(tmp, 1, sizeof(tmp) - 1, fc); + fclose(fc); + // look for "perf record" + if (nr < 12) + return; + tmp[nr] = '\0'; + for (u64 i = 0; i < nr; i++) + if (tmp[i] == 0) + tmp[i] = ' '; + + char * const perf = strstr(tmp, "perf record"); + if (perf) { + fprintf(stderr, "%s: perf detected\n", __func__); + perf_pid = ppid; + } +} +#endif // __linux__ + + bool +debug_perf_switch(void) +{ + if (perf_pid > 0) { + kill(perf_pid, SIGUSR2); + return true; + } else { + return false; + } +} +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + bool +alloc_fail(void) +{ +#define ALLOCFAIL_RECP ((64lu)) +#define ALLOCFAIL_MAGIC ((ALLOCFAIL_RECP / 3lu)) + return ((random_u64() % ALLOCFAIL_RECP) == ALLOCFAIL_MAGIC); +} + +#ifdef MALLOCFAIL +extern void * __libc_malloc(size_t size); + void * +malloc(size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_malloc(size); +} + +extern void * __libc_calloc(size_t nmemb, size_t size); + void * +calloc(size_t nmemb, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_calloc(nmemb, size); +} + +extern void *__libc_realloc(void *ptr, size_t size); + + void * +realloc(void *ptr, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_realloc(ptr, size); +} +#endif // MALLOC_FAIL +#endif // ALLOC_FAIL + + void * +xalloc(const size_t align, const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, align, size) == 0) ? 
p : NULL; +} + +// alloc cache-line aligned address + void * +yalloc(const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, 64, size) == 0) ? p : NULL; +} + + void ** +malloc_2d(const size_t nr, const size_t size) +{ + const size_t size1 = nr * sizeof(void *); + const size_t size2 = nr * size; + void ** const mem = malloc(size1 + size2); + u8 * const mem2 = ((u8 *)mem) + size1; + for (size_t i = 0; i < nr; i++) + mem[i] = mem2 + (i * size); + return mem; +} + + inline void ** +calloc_2d(const size_t nr, const size_t size) +{ + void ** const ret = malloc_2d(nr, size); + memset(ret[0], 0, nr * size); + return ret; +} + + inline void +pages_unmap(void * const ptr, const size_t size) +{ +#ifndef HEAPCHECKING + munmap(ptr, size); +#else + (void)size; + free(ptr); +#endif +} + + void +pages_lock(void * const ptr, const size_t size) +{ + static bool use_mlock = true; + if (use_mlock) { + const int ret = mlock(ptr, size); + if (ret != 0) { + use_mlock = false; + fprintf(stderr, "%s: mlock disabled\n", __func__); + } + } +} + +#ifndef HEAPCHECKING + static void * +pages_do_alloc(const size_t size, const int flags) +{ + // vi /etc/security/limits.conf + // * - memlock unlimited + void * const p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (p == MAP_FAILED) + return NULL; + + pages_lock(p, size); + return p; +} + +#if defined(__linux__) && defined(MAP_HUGETLB) + +#if defined(MAP_HUGE_SHIFT) +#define PAGES_FLAGS_1G ((MAP_HUGETLB | (30 << MAP_HUGE_SHIFT))) +#define PAGES_FLAGS_2M ((MAP_HUGETLB | (21 << MAP_HUGE_SHIFT))) +#else // MAP_HUGE_SHIFT +#define PAGES_FLAGS_1G ((MAP_HUGETLB)) +#define PAGES_FLAGS_2M ((MAP_HUGETLB)) +#endif // MAP_HUGE_SHIFT + +#else +#define PAGES_FLAGS_1G ((0)) +#define PAGES_FLAGS_2M ((0)) +#endif // __linux__ + +#endif // HEAPCHECKING + + inline void * +pages_alloc_1gb(const size_t nr_1gb) +{ + const u64 sz = nr_1gb << 30; +#ifndef HEAPCHECKING + return 
pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_1G); +#else + void * const p = xalloc(1lu << 21, sz); // Warning: valgrind fails with 30 + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_2mb(const size_t nr_2mb) +{ + const u64 sz = nr_2mb << 21; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_2M); +#else + void * const p = xalloc(1lu << 21, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_4kb(const size_t nr_4kb) +{ + const size_t sz = nr_4kb << 12; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS); +#else + void * const p = xalloc(1lu << 12, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + // 1gb huge page: at least 0.25GB + if (try_1gb) { + if (size >= (1lu << 28)) { + const size_t nr_1gb = bits_round_up(size, 30) >> 30; + void * const p1 = pages_alloc_1gb(nr_1gb); + if (p1) { + *size_out = nr_1gb << 30; + return p1; + } + } + } + + // 2mb huge page: at least 0.5MB + if (size >= (1lu << 19)) { + const size_t nr_2mb = bits_round_up(size, 21) >> 21; + void * const p2 = pages_alloc_2mb(nr_2mb); + if (p2) { + *size_out = nr_2mb << 21; + return p2; + } + } + + const size_t nr_4kb = bits_round_up(size, 12) >> 12; + void * const p3 = pages_alloc_4kb(nr_4kb); + if (p3) + *size_out = nr_4kb << 12; + return p3; +} +// }}} mm + +// process/thread {{{ +static u32 process_ncpu; +#if defined(__FreeBSD__) +typedef cpuset_t cpu_set_t; +#elif defined(__APPLE__) && defined(__MACH__) +typedef u64 cpu_set_t; +#define CPU_SETSIZE ((64)) +#define CPU_COUNT(__cpu_ptr__) (__builtin_popcountl(*__cpu_ptr__)) +#define CPU_ISSET(__cpu_idx__, __cpu_ptr__) (((*__cpu_ptr__) >> __cpu_idx__) & 1lu) +#define CPU_ZERO(__cpu_ptr__) ((*__cpu_ptr__) = 0) +#define CPU_SET(__cpu_idx__, 
__cpu_ptr__) ((*__cpu_ptr__) |= (1lu << __cpu_idx__)) +#define CPU_CLR(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) &= ~(1lu << __cpu_idx__)) +#define pthread_attr_setaffinity_np(...) ((void)0) +#endif + +__attribute__((constructor)) + static void +process_init(void) +{ + // Linux's default is 1024 cpus + process_ncpu = (u32)sysconf(_SC_NPROCESSORS_CONF); + if (process_ncpu > CPU_SETSIZE) { + fprintf(stderr, "%s: can use only %zu cores\n", + __func__, (size_t)CPU_SETSIZE); + process_ncpu = CPU_SETSIZE; + } + thread_set_name(pthread_self(), "main"); +} + + static inline int +thread_getaffinity_set(cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_getaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + *cpuset = (1lu << process_ncpu) - 1; + return (int)process_ncpu; // TODO +#endif // OS +} + + static inline int +thread_setaffinity_set(const cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_setaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + (void)cpuset; // TODO + return 0; +#endif // OS +} + + void +thread_get_name(const pthread_t pt, char * const name, const size_t len) +{ +#if defined(__linux__) + pthread_getname_np(pt, name, len); +#elif defined(__FreeBSD__) + pthread_get_name_np(pt, name, len); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)len; + strcpy(name, "unknown"); // TODO +#endif // OS +} + + void +thread_set_name(const pthread_t pt, const char * const name) +{ +#if defined(__linux__) + pthread_setname_np(pt, name); +#elif defined(__FreeBSD__) + pthread_set_name_np(pt, name); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)name; // TODO +#endif // OS +} + +// kB + long 
+process_get_rss(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + return rs.ru_maxrss; +} + + u32 +process_affinity_count(void) +{ + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return process_ncpu; + + const u32 nr = (u32)CPU_COUNT(&set); + return nr ? nr : process_ncpu; +} + + u32 +process_getaffinity_list(const u32 max, u32 * const cores) +{ + memset(cores, 0, max * sizeof(cores[0])); + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return 0; + + const u32 nr_affinity = (u32)CPU_COUNT(&set); + const u32 nr = nr_affinity < max ? nr_affinity : max; + u32 j = 0; + for (u32 i = 0; i < process_ncpu; i++) { + if (CPU_ISSET(i, &set)) + cores[j++] = i; + + if (j >= nr) + break; + } + return j; +} + + void +thread_setaffinity_list(const u32 nr, const u32 * const list) +{ + cpu_set_t set; + CPU_ZERO(&set); + for (u32 i = 0; i < nr; i++) + if (list[i] < process_ncpu) + CPU_SET(list[i], &set); + thread_setaffinity_set(&set); +} + + void +thread_pin(const u32 cpu) +{ + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(cpu % process_ncpu, &set); + thread_setaffinity_set(&set); +} + + u64 +process_cpu_time_usec(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + const u64 usr = (((u64)rs.ru_utime.tv_sec) * 1000000lu) + ((u64)rs.ru_utime.tv_usec); + const u64 sys = (((u64)rs.ru_stime.tv_sec) * 1000000lu) + ((u64)rs.ru_stime.tv_usec); + return usr + sys; +} + +struct fork_join_info { + u32 total; + u32 ncores; + u32 * cores; + void *(*func)(void *); + bool args; + union { + void * arg1; + void ** argn; + }; + union { + struct { au32 ferr, jerr; }; + au64 xerr; + }; +}; + +// DON'T CHANGE! 
+#define FORK_JOIN_RANK_BITS ((16)) // 16 +#define FORK_JOIN_MAX ((1u << FORK_JOIN_RANK_BITS)) + +/* + * fj(6): T0 + * / \ + * T0 T4 + * / \ / + * T0 T2 T4 + * / \ / \ / \ + * t0 t1 t2 t3 t4 t5 + */ + +// recursive tree fork-join + static void * +thread_do_fork_join_worker(void * const ptr) +{ + struct entry13 fjp = {.ptr = ptr}; + // GCC: Without explicitly casting from fjp.fji (a 45-bit u64 value), + // the high bits will get truncated, which is always CORRECT in gcc. + // Don't use gcc. + struct fork_join_info * const fji = u64_to_ptr(fjp.e3); + const u32 rank = (u32)fjp.e1; + + const u32 nchild = (u32)__builtin_ctz(rank ? rank : bits_p2_up_u32(fji->total)); + debug_assert(nchild <= FORK_JOIN_RANK_BITS); + pthread_t tids[FORK_JOIN_RANK_BITS]; + if (nchild) { + cpu_set_t set; + CPU_ZERO(&set); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Joinable by default + // fork top-down + for (u32 i = nchild - 1; i < nchild; i--) { + const u32 cr = rank + (1u << i); // child's rank + if (cr >= fji->total) + continue; // should not break + const u32 core = fji->cores[(cr < fji->ncores) ? cr : (cr % fji->ncores)]; + CPU_SET(core, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + fjp.e1 = (u16)cr; + const int r = pthread_create(&tids[i], &attr, thread_do_fork_join_worker, fjp.ptr); + CPU_CLR(core, &set); + if (unlikely(r)) { // fork failed + memset(&tids[0], 0, sizeof(tids[0]) * (i+1)); + u32 nmiss = (1u << (i + 1)) - 1; + if ((rank + nmiss) >= fji->total) + nmiss = fji->total - 1 - rank; + (void)atomic_fetch_add_explicit(&fji->ferr, nmiss, MO_RELAXED); + break; + } + } + pthread_attr_destroy(&attr); + } + + char thname0[16]; + char thname1[16]; + thread_get_name(pthread_self(), thname0, 16); + snprintf(thname1, 16, "%.8s_%u", thname0, rank); + thread_set_name(pthread_self(), thname1); + + void * const ret = fji->func(fji->args ? 
fji->argn[rank] : fji->arg1); + + thread_set_name(pthread_self(), thname0); + // join bottom-up + for (u32 i = 0; i < nchild; i++) { + const u32 cr = rank + (1u << i); // child rank + if (cr >= fji->total) + break; // safe to break + if (tids[i]) { + const int r = pthread_join(tids[i], NULL); + if (unlikely(r)) { // error + //fprintf(stderr, "pthread_join %u..%u = %d: %s\n", rank, cr, r, strerror(r)); + (void)atomic_fetch_add_explicit(&fji->jerr, 1, MO_RELAXED); + } + } + } + return ret; +} + + u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx) +{ + if (unlikely(nr > FORK_JOIN_MAX)) { + fprintf(stderr, "%s reduce nr to %u\n", __func__, FORK_JOIN_MAX); + nr = FORK_JOIN_MAX; + } + + u32 cores[CPU_SETSIZE]; + u32 ncores = process_getaffinity_list(process_ncpu, cores); + if (unlikely(ncores == 0)) { // force to use all cores + ncores = process_ncpu; + for (u32 i = 0; i < process_ncpu; i++) + cores[i] = i; + } + if (unlikely(nr == 0)) + nr = ncores; + + // the compiler does not know fji can change since we cast &fji into fjp + struct fork_join_info fji = {.total = nr, .cores = cores, .ncores = ncores, + .func = func, .args = args, .arg1 = argx}; + const struct entry13 fjp = entry13(0, (u64)(&fji)); + + // save current affinity + cpu_set_t set0; + thread_getaffinity_set(&set0); + + // master thread shares thread0's core + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(fji.cores[0], &set); + thread_setaffinity_set(&set); + + const u64 t0 = time_nsec(); + (void)thread_do_fork_join_worker(fjp.ptr); + const u64 dt = time_diff_nsec(t0); + + // restore original affinity + thread_setaffinity_set(&set0); + + // check and report errors (unlikely) + if (atomic_load_explicit(&fji.xerr, MO_CONSUME)) + fprintf(stderr, "%s errors: fork %u join %u\n", __func__, fji.ferr, fji.jerr); + return dt; +} + + int +thread_create_at(const u32 cpu, pthread_t * const thread, + void *(*start_routine) (void *), void * const arg) +{ + const u32 cpu_id = (cpu < 
process_ncpu) ? cpu : (cpu % process_ncpu); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu_id, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + const int r = pthread_create(thread, &attr, start_routine, arg); + pthread_attr_destroy(&attr); + return r; +} +// }}} process/thread + +// locking {{{ + +// spinlock {{{ +#if defined(__linux__) +#define SPINLOCK_PTHREAD +#endif // __linux__ + +#if defined(SPINLOCK_PTHREAD) +static_assert(sizeof(pthread_spinlock_t) <= sizeof(spinlock), "spinlock size"); +#else // SPINLOCK_PTHREAD +static_assert(sizeof(au32) <= sizeof(spinlock), "spinlock size"); +#endif // SPINLOCK_PTHREAD + + void +spinlock_init(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_init(p, PTHREAD_PROCESS_PRIVATE); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} + + inline void +spinlock_lock(spinlock * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!spinlock_trylock(lock)) + corr_yield(); +#else // CORR +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_lock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; +#pragma nounroll + do { + if (atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0) + return; +#pragma nounroll + do { + cpu_pause(); + } while (atomic_load_explicit(p, MO_CONSUME)); + } while (true); +#endif // SPINLOCK_PTHREAD +#endif // CORR +} + + inline bool +spinlock_trylock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + return !pthread_spin_trylock(p); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + return atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0; +#endif // SPINLOCK_PTHREAD 
+} + + inline void +spinlock_unlock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_unlock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} +// }}} spinlock + +// pthread mutex {{{ +static_assert(sizeof(pthread_mutex_t) <= sizeof(mutex), "mutexlock size"); + inline void +mutex_init(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_init(p, NULL); +} + + inline void +mutex_lock(mutex * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!mutex_trylock(lock)) + corr_yield(); +#else + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_lock(p); // return value ignored +#endif +} + + inline bool +mutex_trylock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + return !pthread_mutex_trylock(p); // return value ignored +} + + inline void +mutex_unlock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_unlock(p); // return value ignored +} + + inline void +mutex_deinit(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_destroy(p); +} +// }}} pthread mutex + +// rwdep {{{ +// poor man's lockdep for rwlock +// per-thread lock list +// it calls debug_die() when local double-(un)locking is detected +// cyclic dependencies can be manually identified by looking at the two lists below in gdb +#ifdef RWDEP +#define RWDEP_NR ((16)) +__thread const rwlock * rwdep_readers[RWDEP_NR] = {}; +__thread const rwlock * rwdep_writers[RWDEP_NR] = {}; + + static void +rwdep_check(const rwlock * const lock) +{ + debug_assert(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) + debug_die(); + if (rwdep_writers[i] == lock) + debug_die(); + } +} +#endif // RWDEP + + static void +rwdep_lock_read(const rwlock * const lock) +{ +#ifdef RWDEP + 
rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == NULL) { + rwdep_readers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_read(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) { + rwdep_readers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_lock_write(const rwlock * const lock) +{ +#ifdef RWDEP + rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == NULL) { + rwdep_writers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_write(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == lock) { + rwdep_writers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} +// }}} rwlockdep + +// rwlock {{{ +typedef au32 lock_t; +typedef u32 lock_v; +static_assert(sizeof(lock_t) == sizeof(lock_v), "lock size"); +static_assert(sizeof(lock_t) <= sizeof(rwlock), "lock size"); + +#define RWLOCK_WSHIFT ((sizeof(lock_t) * 8 - 1)) +#define RWLOCK_WBIT ((((lock_v)1) << RWLOCK_WSHIFT)) + + inline void +rwlock_init(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + atomic_store_explicit(pvar, 0, MO_RELEASE); +} + + inline bool +rwlock_trylock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } else { + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; + } +} + + inline bool +rwlock_trylock_read_lp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) { + cpu_pause(); + return false; + } + return rwlock_trylock_read(lock); +} + +// actually nr + 1 + inline bool 
+rwlock_trylock_read_nr(rwlock * const lock, u16 nr) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + +#pragma nounroll + do { // someone already locked; wait for a little while + cpu_pause(); + if ((atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + } while (nr--); + + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; +} + + inline void +rwlock_lock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_read(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT); + } while (true); +} + + inline void +rwlock_unlock_read(rwlock * const lock) +{ + rwdep_unlock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, 1, MO_RELEASE); +} + + inline bool +rwlock_trylock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if ((v0 == 0) && atomic_compare_exchange_weak_explicit(pvar, &v0, RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + return true; + } else { + return false; + } +} + +// actually nr + 1 + inline bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME)); + } while (true); +} + + inline bool 
+rwlock_trylock_write_hp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if (v0 >> RWLOCK_WSHIFT) + return false; + + if (atomic_compare_exchange_weak_explicit(pvar, &v0, v0|RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + // WBIT successfully marked; must wait for readers to leave + if (v0) { // saw active readers +#pragma nounroll + while (atomic_load_explicit(pvar, MO_CONSUME) != RWLOCK_WBIT) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } + } + return true; + } else { + return false; + } +} + + inline bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write_hp(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write_hp(rwlock * const lock) +{ +#pragma nounroll + while (!rwlock_trylock_write_hp(lock)) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } +} + + inline void +rwlock_unlock_write(rwlock * const lock) +{ + rwdep_unlock_write(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, RWLOCK_WBIT, MO_RELEASE); +} + + inline void +rwlock_write_to_read(rwlock * const lock) +{ + rwdep_unlock_write(lock); + rwdep_lock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + // +R -W + atomic_fetch_add_explicit(pvar, ((lock_v)1) - RWLOCK_WBIT, MO_ACQ_REL); +} + +#undef RWLOCK_WSHIFT +#undef RWLOCK_WBIT +// }}} rwlock + +// }}} locking + +// coroutine {{{ + +// asm {{{ +#if defined(__x86_64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((6)) + +// for switch/exit: pass a return value to the target +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error 
Supported platforms: Linux/FreeBSD/Apple +#endif // OS + "push %rbp; push %rbx; push %r12;" + "push %r13; push %r14; push %r15;" + "mov %rsp, (%rdi);" + "mov %rsi, %rsp;" + "pop %r15; pop %r14; pop %r13;" + "pop %r12; pop %rbx; pop %rbp;" + "mov %rdx, %rax;" + "retq;" + ); + +#elif defined(__aarch64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((20)) +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "sub x8, sp, 160;" + "str x8, [x0];" + "stp x30, x19, [x8]; ldp x30, x19, [x1];" + "stp x20, x21, [x8, 16]; ldp x20, x21, [x1, 16];" + "stp x22, x23, [x8, 32]; ldp x22, x23, [x1, 32];" + "stp x24, x25, [x8, 48]; ldp x24, x25, [x1, 48];" + "stp x26, x27, [x8, 64]; ldp x26, x27, [x1, 64];" + "stp x28, x29, [x8, 80]; ldp x28, x29, [x1, 80];" + "stp d8, d9, [x8, 96]; ldp d8, d9, [x1, 96];" + "stp d10, d11, [x8, 112]; ldp d10, d11, [x1, 112];" + "stp d12, d13, [x8, 128]; ldp d12, d13, [x1, 128];" + "stp d14, d15, [x8, 144]; ldp d14, d15, [x1, 144];" + "add sp, x1, 160;" + "mov x0, x2;" + "br x30;" + ); + +extern void co_entry_aarch64(void); +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_entry_aarch64;" + ".type co_entry_aarch64, @function;" + "co_entry_aarch64:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_entry_aarch64;" + "_co_entry_aarch64:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "ldr x8, [sp, 0];" + "blr x8;" + "ldr x8, [sp, 8];" + "blr x8;" + "ldr x8, [sp, 16];" + "blr x8;" + ); +#else +#error supported CPUs: x86_64 or AArch64 +#endif // co_switch_stack x86_64 and aarch64 +// }}} asm + +// co {{{ +struct co { + u64 rsp; + void * priv; + u64 * host; // set host to NULL to exit + size_t stksz; 
+}; + +// not atomic: no concurrent access +// volatile: avoid caching of co_curr +static __thread struct co * volatile co_curr = NULL; // NULL in host + +// the stack sits under the struct co + static void +co_init(struct co * const co, void * func, void * priv, u64 * const host, + const u64 stksz, void * func_exit) +{ + debug_assert((stksz & 0x3f) == 0); // a multiple of 64 bytes + u64 * rsp = ((u64 *)co) - 4; + rsp[0] = (u64)func; + rsp[1] = (u64)func_exit; + rsp[2] = (u64)debug_die; + rsp[3] = 0; + + rsp -= CO_CONTEXT_SIZE; + +#if defined(__aarch64__) + rsp[0] = (u64)co_entry_aarch64; +#endif + + co->rsp = (u64)rsp; + co->priv = priv; + co->host = host; + co->stksz = stksz; +} + + static void +co_exit0(void) +{ + co_exit(0); +} + + struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct co); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct co * const co = (typeof(co))(mem + stksz); + co_init(co, func, priv, host, stksz, co_exit0); + return co; +} + + inline void +co_reuse(struct co * const co, void * func, void * priv, u64 * const host) +{ + co_init(co, func, priv, host, co->stksz, co_exit0); +} + + inline struct co * +co_fork(void * func, void * priv) +{ + return co_curr ? co_create(co_curr->stksz, func, priv, co_curr->host) : NULL; +} + + inline void * +co_priv(void) +{ + return co_curr ? co_curr->priv : NULL; +} + +// the host calls this to enter a coroutine. 
+ inline u64 +co_enter(struct co * const to, const u64 retval) +{ + debug_assert(co_curr == NULL); // must entry from the host + debug_assert(to && to->host); + u64 * const save = to->host; + co_curr = to; + const u64 ret = co_switch_stack(save, to->rsp, retval); + co_curr = NULL; + return ret; +} + +// switch from a coroutine to another coroutine +// co_curr must be valid +// the target will resume and receive the retval + inline u64 +co_switch_to(struct co * const to, const u64 retval) +{ + debug_assert(co_curr); + debug_assert(co_curr != to); + debug_assert(to && to->host); + struct co * const save = co_curr; + co_curr = to; + return co_switch_stack(&(save->rsp), to->rsp, retval); +} + +// switch from a coroutine to the host routine +// co_yield is now a c++ keyword... + inline u64 +co_back(const u64 retval) +{ + debug_assert(co_curr); + struct co * const save = co_curr; + co_curr = NULL; + return co_switch_stack(&(save->rsp), *(save->host), retval); +} + +#ifdef CO_STACK_CHECK + static void +co_stack_check(const u8 * const mem, const u64 stksz) +{ + const u64 * const mem64 = (typeof(mem64))mem; + const u64 size64 = stksz / sizeof(u64); + for (u64 i = 0; i < size64; i++) { + if (mem64[i] != 0x5c5c5c5c5c5c5c5clu) { + fprintf(stderr, "%s co stack usage: %lu/%lu\n", __func__, stksz - (i * sizeof(u64)), stksz); + break; + } + } +} +#endif // CO_STACK_CHECK + +// return to host and set host to NULL +__attribute__((noreturn)) + void +co_exit(const u64 retval) +{ + debug_assert(co_curr); +#ifdef CO_STACK_CHECK + const u64 stksz = co_curr->stksz; + u8 * const mem = ((u8 *)co_curr) - stksz; + co_stack_check(mem, stksz); +#endif // CO_STACK_CHECK + const u64 hostrsp = *(co_curr->host); + co_curr->host = NULL; + struct co * const save = co_curr; + co_curr = NULL; + (void)co_switch_stack(&(save->rsp), hostrsp, retval); + // return to co_enter + debug_die(); +} + +// host is set to NULL on exit + inline bool +co_valid(struct co * const co) +{ + return co->host != NULL; +} + 
+// return NULL on host + inline struct co * +co_self(void) +{ + return co_curr; +} + + inline void +co_destroy(struct co * const co) +{ + u8 * const mem = ((u8 *)co) - co->stksz; + free(mem); +} +// }}} co + +// corr {{{ +struct corr { + struct co co; + struct corr * next; + struct corr * prev; +}; + +// initial and link guest to the run-queue + struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, host, stksz, corr_exit); + co->next = co; + co->prev = co; + return co; +} + + struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, prev->co.host, stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; + return co; +} + + inline void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host) +{ + co_init(&(co->co), func, priv, host, co->co.stksz, corr_exit); + co->next = co; + co->prev = co; +} + + inline void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev) +{ + co_init(&(co->co), func, priv, prev->co.host, co->co.stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; +} + + inline void +corr_enter(struct corr * const co) +{ + 
(void)co_enter(&(co->co), 0); +} + + inline void +corr_yield(void) +{ + struct corr * const curr = (typeof(curr))co_curr; + if (curr && (curr->next != curr)) + (void)co_switch_to(&(curr->next->co), 0); +} + +__attribute__((noreturn)) + inline void +corr_exit(void) +{ + debug_assert(co_curr); +#ifdef CO_STACK_CHECK + const u64 stksz = co_curr->stksz; + const u8 * const mem = ((u8 *)(co_curr)) - stksz; + co_stack_check(mem, stksz); +#endif // CO_STACK_CHECK + + struct corr * const curr = (typeof(curr))co_curr; + if (curr->next != curr) { // have more corr + struct corr * const next = curr->next; + struct corr * const prev = curr->prev; + next->prev = prev; + prev->next = next; + curr->next = NULL; + curr->prev = NULL; + curr->co.host = NULL; // invalidate + (void)co_switch_to(&(next->co), 0); + } else { // the last corr + co_exit0(); + } + debug_die(); +} + + inline void +corr_destroy(struct corr * const co) +{ + co_destroy(&(co->co)); +} +// }}} corr + +// }}} co + +// bits {{{ + inline u32 +bits_reverse_u32(const u32 v) +{ + const u32 v2 = __builtin_bswap32(v); + const u32 v3 = ((v2 & 0xf0f0f0f0u) >> 4) | ((v2 & 0x0f0f0f0fu) << 4); + const u32 v4 = ((v3 & 0xccccccccu) >> 2) | ((v3 & 0x33333333u) << 2); + const u32 v5 = ((v4 & 0xaaaaaaaau) >> 1) | ((v4 & 0x55555555u) << 1); + return v5; +} + + inline u64 +bits_reverse_u64(const u64 v) +{ + const u64 v2 = __builtin_bswap64(v); + const u64 v3 = ((v2 & 0xf0f0f0f0f0f0f0f0lu) >> 4) | ((v2 & 0x0f0f0f0f0f0f0f0flu) << 4); + const u64 v4 = ((v3 & 0xcccccccccccccccclu) >> 2) | ((v3 & 0x3333333333333333lu) << 2); + const u64 v5 = ((v4 & 0xaaaaaaaaaaaaaaaalu) >> 1) | ((v4 & 0x5555555555555555lu) << 1); + return v5; +} + + inline u64 +bits_rotl_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v << sh) | (v >> (64 - sh)); +} + + inline u64 +bits_rotr_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v >> sh) | (v << (64 - sh)); +} + + inline u32 +bits_rotl_u32(const u32 v, const u8 n) +{ + 
const u8 sh = n & 0x1f; + return (v << sh) | (v >> (32 - sh)); +} + + inline u32 +bits_rotr_u32(const u32 v, const u8 n) +{ + const u8 sh = n & 0x1f; + return (v >> sh) | (v << (32 - sh)); +} + + inline u64 +bits_p2_up_u64(const u64 v) +{ + // clz(0) is undefined + return (v > 1) ? (1lu << (64 - __builtin_clzl(v - 1lu))) : v; +} + + inline u32 +bits_p2_up_u32(const u32 v) +{ + // clz(0) is undefined + return (v > 1) ? (1u << (32 - __builtin_clz(v - 1u))) : v; +} + + inline u64 +bits_p2_down_u64(const u64 v) +{ + return v ? (1lu << (63 - __builtin_clzl(v))) : v; +} + + inline u32 +bits_p2_down_u32(const u32 v) +{ + return v ? (1u << (31 - __builtin_clz(v))) : v; +} + + inline u64 +bits_round_up(const u64 v, const u8 power) +{ + return (v + (1lu << power) - 1lu) >> power << power; +} + + inline u64 +bits_round_up_a(const u64 v, const u64 a) +{ + return (v + a - 1) / a * a; +} + + inline u64 +bits_round_down(const u64 v, const u8 power) +{ + return v >> power << power; +} + + inline u64 +bits_round_down_a(const u64 v, const u64 a) +{ + return v / a * a; +} +// }}} bits + +// vi128 {{{ +#if defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH __attribute__ ((fallthrough)) +#else +#define FALLTHROUGH ((void)0) +#endif /* __GNUC__ >= 7 */ + + inline u32 +vi128_estimate_u32(const u32 v) +{ + static const u8 t[] = {5,5,5,5, + 4,4,4,4,4,4,4, 3,3,3,3,3,3,3, + 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clz(v)] : 2; + // 0 -> [0x80 0x00] the first byte is non-zero + + // nz bit range -> enc length offset in t[] + // 0 -> 2 special case + // 1 to 7 -> 1 31 to 25 + // 8 to 14 -> 2 24 to 18 + // 15 to 21 -> 3 17 to 11 + // 22 to 28 -> 4 10 to 4 + // 29 to 32 -> 5 3 to 0 +} + + u8 * +vi128_encode_u32(u8 * dst, u32 v) +{ + switch (vi128_estimate_u32(v)) { + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + + const u8 * +vi128_decode_u32(const u8 * src, u32 * const out) +{ + debug_assert(*src); + u32 r = 0; + for (u32 shift = 0; shift < 32; shift += 7) { + const u8 byte = *(src++); + r |= (((u32)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + + inline u32 +vi128_estimate_u64(const u64 v) +{ + static const u8 t[] = {10, + 9,9,9,9,9,9,9, 8,8,8,8,8,8,8, 7,7,7,7,7,7,7, + 6,6,6,6,6,6,6, 5,5,5,5,5,5,5, 4,4,4,4,4,4,4, + 3,3,3,3,3,3,3, 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clzl(v)] : 2; +} + +// return ptr after the generated bytes + u8 * +vi128_encode_u64(u8 * dst, u64 v) +{ + switch (vi128_estimate_u64(v)) { + case 10: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 9: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 8: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 7: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 6: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + +// return ptr after the consumed bytes + const u8 * +vi128_decode_u64(const u8 * src, u64 * const out) +{ + u64 r = 0; + for (u32 shift = 0; shift < 64; shift += 7) { + const u8 byte = *(src++); + r |= (((u64)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + +#undef FALLTHROUGH +// }}} vi128 + +// misc {{{ + inline struct entry13 +entry13(const u16 e1, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + return (struct entry13){.v64 = (e3 << 16) | e1}; +} + + inline void +entry13_update_e3(struct entry13 * const e, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + *e = entry13(e->e1, e3); +} + + inline void * +u64_to_ptr(const u64 v) +{ + return (void *)v; +} + + inline u64 +ptr_to_u64(const void * const ptr) +{ + return (u64)ptr; +} + +// portable malloc_usable_size + inline size_t +m_usable_size(void * const ptr) +{ +#if defined(__linux__) || defined(__FreeBSD__) + const size_t sz = malloc_usable_size(ptr); +#elif defined(__APPLE__) && defined(__MACH__) + const size_t sz = malloc_size(ptr); +#endif // OS + +#ifndef HEAPCHECKING + // valgrind and asan may return unaligned 
usable size + debug_assert((sz & 0x7lu) == 0); +#endif // HEAPCHECKING + + return sz; +} + + inline size_t +fdsize(const int fd) +{ + struct stat st; + st.st_size = 0; + if (fstat(fd, &st) != 0) + return 0; + + if (S_ISBLK(st.st_mode)) { +#if defined(__linux__) + ioctl(fd, BLKGETSIZE64, &st.st_size); +#elif defined(__APPLE__) && defined(__MACH__) + u64 blksz = 0; + u64 nblks = 0; + ioctl(fd, DKIOCGETBLOCKSIZE, &blksz); + ioctl(fd, DKIOCGETBLOCKCOUNT, &nblks); + st.st_size = (ssize_t)(blksz * nblks); +#elif defined(__FreeBSD__) + ioctl(fd, DIOCGMEDIASIZE, &st.st_size); +#endif // OS + } + + return (size_t)st.st_size; +} + + u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max) +{ + const u32 max64 = max & (~7u); + u32 clen = 0; + while (clen < max64) { + const u64 v1 = *(const u64 *)(p1+clen); + const u64 v2 = *(const u64 *)(p2+clen); + const u64 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctzl(x) >> 3); + + clen += sizeof(u64); + } + + if ((clen + sizeof(u32)) <= max) { + const u32 v1 = *(const u32 *)(p1+clen); + const u32 v2 = *(const u32 *)(p2+clen); + const u32 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctz(x) >> 3); + + clen += sizeof(u32); + } + + while ((clen < max) && (p1[clen] == p2[clen])) + clen++; + return clen; +} + +static double logger_t0 = 0.0; + +__attribute__((constructor)) + static void +logger_init(void) +{ + logger_t0 = time_sec(); +} + +__attribute__ ((format (printf, 2, 3))) + void +logger_printf(const int fd, const char * const fmt, ...) 
+{ + char buf[4096]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + dprintf(fd, "%010.3lf %08x %s", time_diff_sec(logger_t0), crc32c_u64(0x12345678, (u64)pthread_self()), buf); +} +// }}} misc + +// astk {{{ +// atomic stack +struct acell { struct acell * next; }; + +// extract ptr from m value + static inline struct acell * +astk_ptr(const u64 m) +{ + return (struct acell *)(m >> 16); +} + +// calculate the new magic + static inline u64 +astk_m1(const u64 m0, struct acell * const ptr) +{ + return ((m0 + 1) & 0xfffflu) | (((u64)ptr) << 16); +} + +// calculate the new magic + static inline u64 +astk_m1_unsafe(struct acell * const ptr) +{ + return ((u64)ptr) << 16; +} + + static bool +astk_try_push(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1(m0, first); + return atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_RELEASE, MO_RELAXED); +} + + static void +astk_push_safe(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + while (!astk_try_push(pmagic, first, last)); +} + + static void +astk_push_unsafe(au64 * const pmagic, struct acell * const first, + struct acell * const last) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1_unsafe(first); + atomic_store_explicit(pmagic, m1, MO_RELAXED); +} + +//// can fail for two reasons: (1) NULL: no available object; (2) ~0lu: contention +// static void * +//astk_try_pop(au64 * const pmagic) +//{ +// u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); +// struct acell * const ret = astk_ptr(m0); +// if (ret == NULL) +// return NULL; +// +// const u64 m1 = astk_m1(m0, ret->next); +// if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) +// return ret; +// else +// return (void *)(~0lu); +//} + + static void * 
+astk_pop_safe(au64 * const pmagic) +{ + do { + u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + struct acell * const ret = astk_ptr(m0); + if (ret == NULL) + return NULL; + + const u64 m1 = astk_m1(m0, ret->next); + if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) + return ret; + } while (true); +} + + static void * +astk_pop_unsafe(au64 * const pmagic) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + struct acell * const ret = astk_ptr(m0); + if (ret == NULL) + return NULL; + + const u64 m1 = astk_m1_unsafe(ret->next); + atomic_store_explicit(pmagic, m1, MO_RELAXED); + return (void *)ret; +} + + static void * +astk_peek_unsafe(au64 * const pmagic) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + return astk_ptr(m0); +} +// }}} astk + +// slab {{{ +#define SLAB_OBJ0_OFFSET ((64)) +struct slab { + au64 magic; // hi 48: ptr, lo 16: seq + u64 padding1[7]; + + // 2nd line + struct acell * head_active; // list of blocks in use or in magic + struct acell * head_backup; // list of unused full blocks + u64 nr_ready; // UNSAFE only! 
number of objects under magic + u64 padding2[5]; + + // 3rd line const + u64 obj_size; // const: aligned size of each object + u64 blk_size; // const: size of each memory block + u64 objs_per_slab; // const: number of objects in a slab + u64 obj0_offset; // const: offset of the first object in a block + u64 padding3[4]; + + // 4th line + union { + mutex lock; + u64 padding4[8]; + }; +}; +static_assert(sizeof(struct slab) == 256, "sizeof(struct slab) != 256"); + + static void +slab_add(struct slab * const slab, struct acell * const blk, const bool is_safe) +{ + // insert into head_active + blk->next = slab->head_active; + slab->head_active = blk; + + u8 * const base = ((u8 *)blk) + slab->obj0_offset; + struct acell * iter = (typeof(iter))base; // [0] + for (u64 i = 1; i < slab->objs_per_slab; i++) { + struct acell * const next = (typeof(next))(base + (i * slab->obj_size)); + iter->next = next; + iter = next; + } + + // base points to the first block; iter points to the last block + if (is_safe) { // other threads can poll magic + astk_push_safe(&slab->magic, (struct acell *)base, iter); + } else { // unsafe + astk_push_unsafe(&slab->magic, (struct acell *)base, iter); + slab->nr_ready += slab->objs_per_slab; + } +} + +// critical section; call with lock + static bool +slab_expand(struct slab * const slab, const bool is_safe) +{ + struct acell * const old = slab->head_backup; + if (old) { // pop old from backup and add + slab->head_backup = old->next; + slab_add(slab, old, is_safe); + } else { // more core + size_t blk_size; + struct acell * const new = pages_alloc_best(slab->blk_size, true, &blk_size); + (void)blk_size; + if (new == NULL) + return false; + + slab_add(slab, new, is_safe); + } + return true; +} + +// return 0 on failure; otherwise, obj0_offset + static u64 +slab_check_sizes(const u64 obj_size, const u64 blk_size) +{ + // obj must be non-zero and 8-byte aligned + // blk must be at least of page size and power of 2 + if ((!obj_size) || (obj_size % 8lu) 
|| (blk_size < 4096lu) || (blk_size & (blk_size - 1))) + return 0; + + // each slab should have at least one object + const u64 obj0_offset = (obj_size & (obj_size - 1)) ? SLAB_OBJ0_OFFSET : obj_size; + if (obj0_offset >= blk_size || (blk_size - obj0_offset) < obj_size) + return 0; + + return obj0_offset; +} + + static void +slab_init_internal(struct slab * const slab, const u64 obj_size, const u64 blk_size, const u64 obj0_offset) +{ + memset(slab, 0, sizeof(*slab)); + slab->obj_size = obj_size; + slab->blk_size = blk_size; + slab->objs_per_slab = (blk_size - obj0_offset) / obj_size; + debug_assert(slab->objs_per_slab); // >= 1 + slab->obj0_offset = obj0_offset; + mutex_init(&(slab->lock)); +} + + struct slab * +slab_create(const u64 obj_size, const u64 blk_size) +{ + const u64 obj0_offset = slab_check_sizes(obj_size, blk_size); + if (!obj0_offset) + return NULL; + + struct slab * const slab = yalloc(sizeof(*slab)); + if (slab == NULL) + return NULL; + + slab_init_internal(slab, obj_size, blk_size, obj0_offset); + return slab; +} + +// unsafe + bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr) +{ + while (slab->nr_ready < nr) + if (!slab_expand(slab, false)) + return false; + return true; +} + + void * +slab_alloc_unsafe(struct slab * const slab) +{ + void * ret = astk_pop_unsafe(&slab->magic); + if (ret == NULL) { + if (!slab_expand(slab, false)) + return NULL; + ret = astk_pop_unsafe(&slab->magic); + } + debug_assert(ret); + slab->nr_ready--; + return ret; +} + + void * +slab_alloc_safe(struct slab * const slab) +{ + void * ret = astk_pop_safe(&slab->magic); + if (ret) + return ret; + + mutex_lock(&slab->lock); + do { + ret = astk_pop_safe(&slab->magic); // may already have new objs + if (ret) + break; + if (!slab_expand(slab, true)) + break; + } while (true); + mutex_unlock(&slab->lock); + return ret; +} + + void +slab_free_unsafe(struct slab * const slab, void * const ptr) +{ + debug_assert(ptr); + astk_push_unsafe(&slab->magic, ptr, ptr); + 
slab->nr_ready++; +} + + void +slab_free_safe(struct slab * const slab, void * const ptr) +{ + astk_push_safe(&slab->magic, ptr, ptr); +} + +// UNSAFE + void +slab_free_all(struct slab * const slab) +{ + slab->magic = 0; + slab->nr_ready = 0; // backup does not count + + if (slab->head_active) { + struct acell * iter = slab->head_active; + while (iter->next) + iter = iter->next; + // now iter points to the last blk + iter->next = slab->head_backup; // active..backup + slab->head_backup = slab->head_active; // backup gets all + slab->head_active = NULL; // empty active + } +} + +// unsafe + u64 +slab_get_nalloc(struct slab * const slab) +{ + struct acell * iter = slab->head_active; + u64 n = 0; + while (iter) { + n++; + iter = iter->next; + } + n *= slab->objs_per_slab; + + iter = astk_peek_unsafe(&slab->magic); + while (iter) { + n--; + iter = iter->next; + } + return n; +} + + static void +slab_deinit(struct slab * const slab) +{ + debug_assert(slab); + struct acell * iter = slab->head_active; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } + iter = slab->head_backup; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } +} + + void +slab_destroy(struct slab * const slab) +{ + slab_deinit(slab); + free(slab); +} +// }}} slab + +// string {{{ +static union { u16 v16; u8 v8[2]; } strdec_table[100]; + +__attribute__((constructor)) + static void +strdec_init(void) +{ + for (u8 i = 0; i < 100; i++) { + const u8 hi = (typeof(hi))('0' + (i / 10)); + const u8 lo = (typeof(lo))('0' + (i % 10)); + strdec_table[i].v8[0] = hi; + strdec_table[i].v8[1] = lo; + } +} + +// output 10 bytes + void +strdec_32(void * const out, const u32 v) +{ + u32 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 4; i <= 4; i--) { // x5 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100u; + } +} + +// output 20 bytes + void +strdec_64(void * const out, const u64 
v) +{ + u64 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 9; i <= 9; i--) { // x10 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100; + } +} + +static const u8 strhex_table_16[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + +#if defined(__x86_64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + + const m128 tmp = _mm_set_epi64x((s64)(v>>4), (s64)v); // mm want s64 + const m128 hilo = _mm_and_si128(tmp, _mm_set1_epi8(0xf)); + const m128 bin = _mm_shuffle_epi8(hilo, _mm_load_si128((void *)mask1)); + const m128 str = _mm_shuffle_epi8(_mm_load_si128((const void *)strhex_table_16), bin); + return str; +} +#elif defined(__aarch64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + u64 v2[2] = {v, v>>4}; + const m128 tmp = vld1q_u8((u8 *)v2); + const m128 hilo = vandq_u8(tmp, vdupq_n_u8(0xf)); + const m128 bin = vqtbl1q_u8(hilo, vld1q_u8(mask1)); + const m128 str = vqtbl1q_u8(vld1q_u8(strhex_table_16), bin); + return str; +} +#else +static u16 strhex_table_256[256]; + +__attribute__((constructor)) + static void +strhex_init(void) +{ + for (u64 i = 0; i < 256; i++) + strhex_table_256[i] = (((u16)strhex_table_16[i & 0xf]) << 8) | (strhex_table_16[i>>4]); +} +#endif // __x86_64__ + +// output 8 bytes + void +strhex_32(void * const out, u32 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper((u64)v); + _mm_storel_epi64(out, _mm_srli_si128(str, 8)); +#elif defined(__aarch64__) + const m128 str = strhex_helper((u64)v); + vst1q_lane_u64(out, vreinterpretq_u64_u8(str), 1); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 4; i++) { + ptr[3-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// output 16 bytes // buffer must be aligned by 16B + void +strhex_64(void * const out, u64 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper(v); 
+ _mm_storeu_si128(out, str); +#elif defined(__aarch64__) + const m128 str = strhex_helper(v); + vst1q_u8(out, str); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 8; i++) { + ptr[7-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// string to u64 + inline u64 +a2u64(const void * const str) +{ + return strtoull(str, NULL, 10); +} + + inline u32 +a2u32(const void * const str) +{ + return (u32)strtoull(str, NULL, 10); +} + + inline s64 +a2s64(const void * const str) +{ + return strtoll(str, NULL, 10); +} + + inline s32 +a2s32(const void * const str) +{ + return (s32)strtoll(str, NULL, 10); +} + + void +str_print_hex(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 3; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + buf[i*3] = ' '; + buf[i*3+1] = strhex_table_16[ptr[i]>>4]; + buf[i*3+2] = strhex_table_16[ptr[i] & 0xf]; + } + fwrite(buf, strsz, 1, out); + free(buf); +} + + void +str_print_dec(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 4; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + const u8 v = ptr[i]; + buf[i*4] = ' '; + const u8 v1 = v / 100u; + const u8 v23 = v % 100u; + buf[i*4+1] = (u8)'0' + v1; + buf[i*4+2] = (u8)'0' + (v23 / 10u); + buf[i*4+3] = (u8)'0' + (v23 % 10u); + } + fwrite(buf, strsz, 1, out); + free(buf); +} + +// returns a NULL-terminated list of string tokens. +// After use you only need to free the returned pointer (char **). 
+ char ** +strtoks(const char * const str, const char * const delim) +{ + if (str == NULL) + return NULL; + size_t nptr_alloc = 32; + char ** tokens = malloc(sizeof(tokens[0]) * nptr_alloc); + if (tokens == NULL) + return NULL; + const size_t bufsize = strlen(str) + 1; + char * const buf = malloc(bufsize); + if (buf == NULL) + goto fail_buf; + + memcpy(buf, str, bufsize); + char * saveptr = NULL; + char * tok = strtok_r(buf, delim, &saveptr); + size_t ntoks = 0; + while (tok) { + if (ntoks >= nptr_alloc) { + nptr_alloc += 32; + char ** const r = realloc(tokens, sizeof(tokens[0]) * nptr_alloc); + if (r == NULL) + goto fail_realloc; + + tokens = r; + } + tokens[ntoks] = tok; + ntoks++; + tok = strtok_r(NULL, delim, &saveptr); + } + tokens[ntoks] = NULL; + const size_t nptr = ntoks + 1; // append a NULL + const size_t rsize = (sizeof(tokens[0]) * nptr) + bufsize; + char ** const r = realloc(tokens, rsize); + if (r == NULL) + goto fail_realloc; + + tokens = r; + char * const dest = (char *)(&(tokens[nptr])); + memcpy(dest, buf, bufsize); + for (u64 i = 0; i < ntoks; i++) + tokens[i] += (dest - buf); + + free(buf); + return tokens; + +fail_realloc: + free(buf); +fail_buf: + free(tokens); + return NULL; +} + + u32 +strtoks_count(const char * const * const toks) +{ + if (!toks) + return 0; + u32 n = 0; + while (toks[n++]); + return n; +} +// }}} string + +// qsbr {{{ +#define QSBR_STATES_NR ((23)) // shard capacity; valid values are 3*8-1 == 23; 5*8-1 == 39; 7*8-1 == 55 +#define QSBR_SHARD_BITS ((5)) // 2^n shards +#define QSBR_SHARD_NR (((1u) << QSBR_SHARD_BITS)) +#define QSBR_SHARD_MASK ((QSBR_SHARD_NR - 1)) + +struct qsbr_ref_real { +#ifdef QSBR_DEBUG + pthread_t ptid; // 8 + u32 status; // 4 + u32 nbt; // 4 (number of backtrace frames) +#define QSBR_DEBUG_BTNR ((14)) + void * backtrace[QSBR_DEBUG_BTNR]; +#endif + au64 qstate; // user updates it + au64 * pptr; // internal only + struct qsbr_ref_real * park; +}; + +static_assert(sizeof(struct qsbr_ref) == sizeof(struct 
qsbr_ref_real), "sizeof qsbr_ref"); + +// Quiescent-State-Based Reclamation RCU +struct qsbr { + struct qsbr_ref_real target; + u64 padding0[5]; + struct qshard { + au64 bitmap; + au64 ptrs[QSBR_STATES_NR]; + } shards[QSBR_SHARD_NR]; +}; + + struct qsbr * +qsbr_create(void) +{ + struct qsbr * const q = yalloc(sizeof(*q)); + memset(q, 0, sizeof(*q)); + return q; +} + + static inline struct qshard * +qsbr_shard(struct qsbr * const q, void * const ptr) +{ + const u32 sid = crc32c_u64(0, (u64)ptr) & QSBR_SHARD_MASK; + debug_assert(sid < QSBR_SHARD_NR); + return &(q->shards[sid]); +} + + static inline void +qsbr_write_qstate(struct qsbr_ref_real * const ref, const u64 v) +{ + atomic_store_explicit(&ref->qstate, v, MO_RELAXED); +} + + bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + qsbr_write_qstate(ref, 0); + + do { + u64 bits = atomic_load_explicit(&shard->bitmap, MO_CONSUME); + const u32 pos = (u32)__builtin_ctzl(~bits); + if (unlikely(pos >= QSBR_STATES_NR)) + return false; + + const u64 bits1 = bits | (1lu << pos); + if (atomic_compare_exchange_weak_explicit(&shard->bitmap, &bits, bits1, MO_ACQUIRE, MO_RELAXED)) { + atomic_store_explicit(&shard->ptrs[pos], (u64)ref, MO_RELAXED); + //shard->ptrs[pos] = ref; + + ref->pptr = &(shard->ptrs[pos]); + ref->park = &q->target; +#ifdef QSBR_DEBUG + ref->ptid = (u64)pthread_self(); + ref->tid = 0; + ref->status = 1; + ref->nbt = backtrace(ref->backtrace, QSBR_DEBUG_BTNR); +#endif + return true; + } + } while (true); +} + + void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + const u32 pos = (u32)(ref->pptr - shard->ptrs); + debug_assert(pos < QSBR_STATES_NR); + debug_assert(shard->bitmap & (1lu << pos)); + + atomic_store_explicit(&shard->ptrs[pos], 
(u64)(&q->target), MO_RELAXED); + //shard->ptrs[pos] = &q->target; + (void)atomic_fetch_and_explicit(&shard->bitmap, ~(1lu << pos), MO_RELEASE); +#ifdef QSBR_DEBUG + ref->tid = 0; + ref->ptid = 0; + ref->status = 0xffff; // unregistered + ref->nbt = 0; +#endif + ref->pptr = NULL; + // wait for qsbr_wait to leave if it's working on the shard + while (atomic_load_explicit(&shard->bitmap, MO_CONSUME) >> 63) + cpu_pause(); +} + + inline void +qsbr_update(struct qsbr_ref * const qref, const u64 v) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + debug_assert((*ref->pptr) == (u64)ref); // must be unparked + // rcu update does not require release or acquire order + qsbr_write_qstate(ref, v); +} + + inline void +qsbr_park(struct qsbr_ref * const qref) +{ + cpu_cfence(); + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref->park, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xfff; // parked +#endif +} + + inline void +qsbr_resume(struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xf; // resumed +#endif + cpu_cfence(); +} + +// waiters needs external synchronization + void +qsbr_wait(struct qsbr * const q, const u64 target) +{ + cpu_cfence(); + qsbr_write_qstate(&q->target, target); + u64 cbits = 0; // check-bits; each bit corresponds to a shard + u64 bms[QSBR_SHARD_NR]; // copy of all bitmap + // take an unsafe snapshot of active users + for (u32 i = 0; i < QSBR_SHARD_NR; i++) { + bms[i] = atomic_load_explicit(&q->shards[i].bitmap, MO_CONSUME); + if (bms[i]) + cbits |= (1lu << i); // set to 1 if [i] has ptrs + } + + while (cbits) { + for (u64 ctmp = cbits; ctmp; ctmp &= (ctmp - 1)) { + // shard id + const u32 i = (u32)__builtin_ctzl(ctmp); + struct qshard * const shard = &(q->shards[i]); + const u64 bits1 = atomic_fetch_or_explicit(&(shard->bitmap), 1lu << 63, MO_ACQUIRE); + for (u64 
bits = bms[i]; bits; bits &= (bits - 1)) { + const u64 bit = bits & -bits; // extract lowest bit + if ((bits1 & bit) == 0) { + bms[i] &= ~bit; + } else { + au64 * pptr = &(shard->ptrs[__builtin_ctzl(bit)]); + struct qsbr_ref_real * const ptr = (typeof(ptr))atomic_load_explicit(pptr, MO_RELAXED); + if (atomic_load_explicit(&(ptr->qstate), MO_CONSUME) == target) + bms[i] &= ~bit; + } + } + (void)atomic_fetch_and_explicit(&(shard->bitmap), ~(1lu << 63), MO_RELEASE); + if (bms[i] == 0) + cbits &= ~(1lu << i); + } +#if defined(CORR) + corr_yield(); +#endif + } + debug_assert(cbits == 0); + cpu_cfence(); +} + + void +qsbr_destroy(struct qsbr * const q) +{ + if (q) + free(q); +} +#undef QSBR_STATES_NR +#undef QSBR_BITMAP_NR +// }}} qsbr + +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/lib.h b/MassTrie-beta/wormhole/lib.h new file mode 100644 index 00000000..40a2f40d --- /dev/null +++ b/MassTrie-beta/wormhole/lib.h @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */
+#pragma once
+
+// includes {{{
+// C headers
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// POSIX headers
+#include <fcntl.h>
+#include <pthread.h>
+#include <unistd.h>
+
+// Linux headers
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+// SIMD
+#if defined(__x86_64__)
+#include <x86intrin.h>
+#elif defined(__aarch64__)
+#include <arm_acle.h>
+#include <arm_neon.h>
+#endif
+// }}} includes
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// types {{{
+#ifndef typeof
+#define typeof __typeof__
+#endif
+#ifndef asm
+#define asm __asm__
+#endif
+typedef char s8;
+typedef short s16;
+typedef int s32;
+typedef long s64;
+typedef __int128_t s128;
+static_assert(sizeof(s8) == 1, "sizeof(s8)");
+static_assert(sizeof(s16) == 2, "sizeof(s16)");
+static_assert(sizeof(s32) == 4, "sizeof(s32)");
+static_assert(sizeof(s64) == 8, "sizeof(s64)");
+static_assert(sizeof(s128) == 16, "sizeof(s128)");
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+typedef unsigned long u64;
+typedef __uint128_t u128;
+static_assert(sizeof(u8) == 1, "sizeof(u8)");
+static_assert(sizeof(u16) == 2, "sizeof(u16)");
+static_assert(sizeof(u32) == 4, "sizeof(u32)");
+static_assert(sizeof(u64) == 8, "sizeof(u64)");
+static_assert(sizeof(u128) == 16, "sizeof(u128)");
+
+#if defined(__x86_64__)
+typedef __m128i m128;
+#if defined(__AVX2__)
+typedef __m256i m256;
+#endif // __AVX2__
+#if defined(__AVX512F__)
+typedef __m512i m512;
+#endif // __AVX512F__
+#elif defined(__aarch64__)
+typedef uint8x16_t m128;
+#else
+#error Need x86_64 or AArch64.
+#endif +// }}} types + +// defs {{{ +#define likely(____x____) __builtin_expect(____x____, 1) +#define unlikely(____x____) __builtin_expect(____x____, 0) + +// ansi colors +// 3X:fg; 4X:bg; 9X:light fg; 10X:light bg; +// X can be one of the following colors: +// 0:black; 1:red; 2:green; 3:yellow; +// 4:blue; 5:magenta; 6:cyan; 7:white; +#define TERMCLR(____code____) "\x1b[" #____code____ "m" +// }}} defs + +// const {{{ +#define PGBITS ((12)) +#define PGSZ ((1lu << PGBITS)) +// }}} const + +// math {{{ + extern u64 +mhash64(const u64 v); + + extern u32 +mhash32(const u32 v); + + extern u64 +gcd64(u64 a, u64 b); +// }}} math + +// random {{{ + extern u64 +random_u64(void); + + extern void +srandom_u64(const u64 seed); + + extern double +random_double(void); +// }}} random + +// timing {{{ + extern u64 +time_nsec(void); + + extern double +time_sec(void); + + extern u64 +time_diff_nsec(const u64 last); + + extern double +time_diff_sec(const double last); + + extern void +time_stamp(char * str, const size_t size); + + extern void +time_stamp2(char * str, const size_t size); +// }}} timing + +// cpucache {{{ + extern void +cpu_pause(void); + + extern void +cpu_mfence(void); + + extern void +cpu_cfence(void); + + extern void +cpu_prefetch0(const void * const ptr); + + extern void +cpu_prefetch1(const void * const ptr); + + extern void +cpu_prefetch2(const void * const ptr); + + extern void +cpu_prefetch3(const void * const ptr); + + extern void +cpu_prefetchw(const void * const ptr); +// }}} cpucache + +// crc32c {{{ + extern u32 +crc32c_u8(const u32 crc, const u8 v); + + extern u32 +crc32c_u16(const u32 crc, const u16 v); + + extern u32 +crc32c_u32(const u32 crc, const u32 v); + + extern u32 +crc32c_u64(const u32 crc, const u64 v); + +// 1 <= nr <= 3 + extern u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc); + +// nr % 4 == 0 + extern u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc); + + extern u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc); +// }}} crc32c + 
+// debug {{{ + extern void +debug_break(void); + + extern void +debug_backtrace(void); + + extern void +watch_u64_usr1(u64 * const ptr); + +#ifndef NDEBUG + extern void +debug_assert(const bool v); +#else +#define debug_assert(expr) ((void)0) +#endif + +__attribute__((noreturn)) + extern void +debug_die(void); + +__attribute__((noreturn)) + extern void +debug_die_perror(void); + + extern void +debug_dump_maps(FILE * const out); + + extern bool +debug_perf_switch(void); +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + extern bool +alloc_fail(void); +#endif + + extern void * +xalloc(const size_t align, const size_t size); + + extern void * +yalloc(const size_t size); + + extern void ** +malloc_2d(const size_t nr, const size_t size); + + extern void ** +calloc_2d(const size_t nr, const size_t size); + + extern void +pages_unmap(void * const ptr, const size_t size); + + extern void +pages_lock(void * const ptr, const size_t size); + +/* hugepages */ +// force posix allocators: -DVALGRIND_MEMCHECK + extern void * +pages_alloc_4kb(const size_t nr_4kb); + + extern void * +pages_alloc_2mb(const size_t nr_2mb); + + extern void * +pages_alloc_1gb(const size_t nr_1gb); + + extern void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out); +// }}} mm + +// process/thread {{{ + extern void +thread_get_name(const pthread_t pt, char * const name, const size_t len); + + extern void +thread_set_name(const pthread_t pt, const char * const name); + + extern long +process_get_rss(void); + + extern u32 +process_affinity_count(void); + + extern u32 +process_getaffinity_list(const u32 max, u32 * const cores); + + extern void +thread_setaffinity_list(const u32 nr, const u32 * const list); + + extern void +thread_pin(const u32 cpu); + + extern u64 +process_cpu_time_usec(void); + +// if args == true, argx is void ** +// if args == false, argx is void * + extern u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx); + + extern int 
+thread_create_at(const u32 cpu, pthread_t * const thread, void *(*start_routine) (void *), void * const arg); +// }}} process/thread + +// locking {{{ +typedef union { + u32 opaque; +} spinlock; + + extern void +spinlock_init(spinlock * const lock); + + extern void +spinlock_lock(spinlock * const lock); + + extern bool +spinlock_trylock(spinlock * const lock); + + extern void +spinlock_unlock(spinlock * const lock); + +typedef union { + u32 opaque; +} rwlock; + + extern void +rwlock_init(rwlock * const lock); + + extern bool +rwlock_trylock_read(rwlock * const lock); + +// low-priority reader-lock; use with trylock_write_hp + extern bool +rwlock_trylock_read_lp(rwlock * const lock); + + extern bool +rwlock_trylock_read_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_read(rwlock * const lock); + + extern void +rwlock_unlock_read(rwlock * const lock); + + extern bool +rwlock_trylock_write(rwlock * const lock); + + extern bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write(rwlock * const lock); + +// writer has higher priority; new readers are blocked + extern bool +rwlock_trylock_write_hp(rwlock * const lock); + + extern bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write_hp(rwlock * const lock); + + extern void +rwlock_unlock_write(rwlock * const lock); + + extern void +rwlock_write_to_read(rwlock * const lock); + +typedef union { + u64 opqaue[8]; +} mutex; + + extern void +mutex_init(mutex * const lock); + + extern void +mutex_lock(mutex * const lock); + + extern bool +mutex_trylock(mutex * const lock); + + extern void +mutex_unlock(mutex * const lock); + + extern void +mutex_deinit(mutex * const lock); +// }}} locking + +// coroutine {{{ +extern u64 co_switch_stack(u64 * const saversp, const u64 newrsp, const u64 retval); + +struct co; + + extern struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern void 
+co_reuse(struct co * const co, void * func, void * priv, u64 * const host); + + extern struct co * +co_fork(void * func, void * priv); + + extern void * +co_priv(void); + + extern u64 +co_enter(struct co * const to, const u64 retval); + + extern u64 +co_switch_to(struct co * const to, const u64 retval); + + extern u64 +co_back(const u64 retval); + + extern void +co_exit(const u64 retval); + + extern bool +co_valid(struct co * const co); + + extern struct co * +co_self(void); + + extern void +co_destroy(struct co * const co); + +struct corr; + + extern struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev); + + extern void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host); + + extern void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev); + + extern void +corr_enter(struct corr * const co); + + extern void +corr_yield(void); + + extern void +corr_exit(void); + + extern void +corr_destroy(struct corr * const co); +// }}} coroutine + +// bits {{{ + extern u32 +bits_reverse_u32(const u32 v); + + extern u64 +bits_reverse_u64(const u64 v); + + extern u64 +bits_rotl_u64(const u64 v, const u8 n); + + extern u64 +bits_rotr_u64(const u64 v, const u8 n); + + extern u32 +bits_rotl_u32(const u32 v, const u8 n); + + extern u32 +bits_rotr_u32(const u32 v, const u8 n); + + extern u64 +bits_p2_up_u64(const u64 v); + + extern u32 +bits_p2_up_u32(const u32 v); + + extern u64 +bits_p2_down_u64(const u64 v); + + extern u32 +bits_p2_down_u32(const u32 v); + + extern u64 +bits_round_up(const u64 v, const u8 power); + + extern u64 +bits_round_up_a(const u64 v, const u64 a); + + extern u64 +bits_round_down(const u64 v, const u8 power); + + extern u64 +bits_round_down_a(const u64 v, const u64 a); +// }}} bits + +// vi128 {{{ + extern u32 +vi128_estimate_u32(const u32 v); + + 
extern u8 * +vi128_encode_u32(u8 * dst, u32 v); + + extern const u8 * +vi128_decode_u32(const u8 * src, u32 * const out); + + extern u32 +vi128_estimate_u64(const u64 v); + + extern u8 * +vi128_encode_u64(u8 * dst, u64 v); + + extern const u8 * +vi128_decode_u64(const u8 * src, u64 * const out); +// }}} vi128 + +// misc {{{ +// TODO: only works on little endian? +struct entry13 { // what a beautiful name + union { + u16 e1; + struct { // easy for debugging + u64 e1_64:16; + u64 e3:48; + }; + u64 v64; + void * ptr; + }; +}; + +static_assert(sizeof(struct entry13) == 8, "sizeof(entry13) != 8"); + +// directly access read .e1 and .e3 +// directly write .e1 +// use entry13_update() to update the entire entry + + extern struct entry13 +entry13(const u16 e1, const u64 e3); + + extern void +entry13_update_e3(struct entry13 * const e, const u64 e3); + + extern void * +u64_to_ptr(const u64 v); + + extern u64 +ptr_to_u64(const void * const ptr); + + extern size_t +m_usable_size(void * const ptr); + + extern size_t +fdsize(const int fd); + + extern u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max); + +__attribute__ ((format (printf, 2, 3))) + extern void +logger_printf(const int fd, const char * const fmt, ...); +// }}} misc + +// slab {{{ +struct slab; + + extern struct slab * +slab_create(const u64 obj_size, const u64 blk_size); + + extern bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr); + + extern void * +slab_alloc_unsafe(struct slab * const slab); + + extern void * +slab_alloc_safe(struct slab * const slab); + + extern void +slab_free_unsafe(struct slab * const slab, void * const ptr); + + extern void +slab_free_safe(struct slab * const slab, void * const ptr); + + extern void +slab_free_all(struct slab * const slab); + + extern u64 +slab_get_nalloc(struct slab * const slab); + + extern void +slab_destroy(struct slab * const slab); +// }}} slab + +// string {{{ +// XXX strdec_ and strhex_ functions does not append the trailing '\0' to 
the output string +// size of out should be >= 10 + extern void +strdec_32(void * const out, const u32 v); + +// size of out should be >= 20 + extern void +strdec_64(void * const out, const u64 v); + +// size of out should be >= 8 + extern void +strhex_32(void * const out, const u32 v); + +// size of out should be >= 16 + extern void +strhex_64(void * const out, const u64 v); + + extern u64 +a2u64(const void * const str); + + extern u32 +a2u32(const void * const str); + + extern s64 +a2s64(const void * const str); + + extern s32 +a2s32(const void * const str); + + extern void +str_print_hex(FILE * const out, const void * const data, const u32 len); + + extern void +str_print_dec(FILE * const out, const void * const data, const u32 len); + +// user should free returned ptr (and nothing else) after use + extern char ** +strtoks(const char * const str, const char * const delim); + + extern u32 +strtoks_count(const char * const * const toks); +// }}} string + +// qsbr {{{ +// QSBR vs EBR (Quiescent-State vs Epoch Based Reclaimation) +// QSBR: readers just use qsbr_update -> qsbr_update -> ... repeatedly +// EBR: readers use qsbr_update -> qsbr_park -> qsbr_resume -> qsbr_update -> ... 
+// The advantage of EBR is qsbr_park can happen much earlier than the next qsbr_update +// The disadvantage is the extra cost, a pair of park/resume is used in every iteration +struct qsbr; +struct qsbr_ref { +#ifdef QSBR_DEBUG + u64 debug[16]; +#endif + u64 opaque[3]; +}; + + extern struct qsbr * +qsbr_create(void); + +// every READER accessing the shared data must first register itself with the qsbr + extern bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref); + + extern void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref); + +// For READER: mark the beginning of critical section; like rcu_read_lock() + extern void +qsbr_update(struct qsbr_ref * const qref, const u64 v); + +// temporarily stop access the shared data to avoid blocking writers +// READER can use qsbr_park (like rcu_read_unlock()) in conjunction with qsbr_update +// qsbr_park is roughly equivalent to qsbr_unregister, but faster + extern void +qsbr_park(struct qsbr_ref * const qref); + +// undo the effect of qsbr_park; must use it between qsbr_park and qsbr_update +// qsbr_resume is roughly equivalent to qsbr_register, but faster + extern void +qsbr_resume(struct qsbr_ref * const qref); + +// WRITER: wait until all the readers have announced v=target with qsbr_update + extern void +qsbr_wait(struct qsbr * const q, const u64 target); + + extern void +qsbr_destroy(struct qsbr * const q); +// }}} qsbr + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/libwh.so b/MassTrie-beta/wormhole/libwh.so new file mode 100644 index 00000000..2ecd7e7e Binary files /dev/null and b/MassTrie-beta/wormhole/libwh.so differ diff --git a/MassTrie-beta/wormhole/stresstest.c b/MassTrie-beta/wormhole/stresstest.c new file mode 100644 index 00000000..93fb6f05 --- /dev/null +++ b/MassTrie-beta/wormhole/stresstest.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016-2020 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#define _GNU_SOURCE + +#include "lib.h" +#include "kv.h" +#include "wh.h" +#include "ctypes.h" + +struct stress_info { + u64 nkeys; + u32 nloader; + u32 nunldr; + u32 nth; + u32 cpt; + bool has_iter; + + au64 seqno; + struct kv ** keys; + + const struct kvmap_api * api; + void * map; + au64 tot; + au64 wfail; + u64 endtime; +}; + + static void * +stress_load_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + void * const ref = kvmap_ref(si->api, si->map); + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nloader * seq; + const u64 nz = (seq == (si->nloader - 1)) ? si->nkeys : (si->nkeys / si->nloader * (seq + 1)); + //printf("load worker %lu %lu\n", n0, nz-1); + + char * buf = malloc(128); + debug_assert(buf); + u64 * buf64 = (typeof(buf64))buf; + for (u64 i = n0; i < nz; i++) { + const u32 klen = (u32)(random_u64() & 0x3flu) + 8; + const u32 klen8 = (klen + 7) >> 3; + /* + buf64[0] = bswap_64(i); // little endian + for (u64 j = 1; j < klen8; j++) + buf64[j] = random_u64(); + */ + const u64 rkey = random_u64(); + for (u32 j = 0; j < klen8; j++) + buf64[j] = (rkey >> j) & 0x0101010101010101lu; + + si->keys[i] = kv_create(buf, klen, buf, 8); + if (si->keys[i] == NULL) + exit(0); + kvmap_kv_put(si->api, ref, si->keys[i]); + } + free(buf); + kvmap_unref(si->api, ref); + return NULL; +} + + static void * +stress_unload_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nunldr * seq; + const u64 nz = (seq == (si->nunldr - 1)) ? 
si->nkeys : (si->nkeys / si->nunldr * (seq + 1)); + + void * const ref = kvmap_ref(si->api, si->map); + for (u64 i = n0; i < nz; i++) { + kvmap_kv_del(si->api, ref, si->keys[i]); + free(si->keys[i]); + } + kvmap_unref(si->api, ref); + return NULL; +} + + static void +stress_inp_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + } +} + + static struct kv * +stress_merge_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + return kv0; + } else { + u64 * ptr = kv_vptr((struct kv *)priv); + *ptr = 0; + return priv; + } +} + + static void +stress_func(struct stress_info * const si) +{ + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + const struct kvmap_api * const api = si->api; + void * ref = kvmap_ref(api, si->map); + struct kv * next = si->keys[random_u64() % si->nkeys]; + u64 rnext = random_u64() % si->nkeys; + struct kv * const tmp = malloc(128); + struct kref tmpkref; + struct kvref tmpkvref; + debug_assert(tmp); + void * iter = NULL; + if (api->iter_park) { + iter = api->iter_create(ref); + api->iter_park(iter); + } + u64 wfail1 = 0; + u64 nops = 0; +#define BATCHSIZE ((4096)) + do { + for (u64 i = 0; i < BATCHSIZE; i++) { + // reading kv keys leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = si->keys[rnext]; + cpu_prefetch0(next); + cpu_prefetch0(((u8 *)next) + 64); + rnext = random_u64() % si->nkeys; + cpu_prefetch0(&(si->keys[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const u64 r = random_u64() % 16; + switch (r) { + case 0: + kvmap_kv_probe(api, ref, key); + break; + case 1: + kvmap_kv_get(api, ref, key, tmp); + break; + case 2: + if (si->has_iter) { + if (api->iter_park == NULL) + iter = api->iter_create(ref); + debug_assert(iter); + 
kvmap_kv_iter_seek(api, iter, key); + api->iter_next(iter, tmp); + api->iter_peek(iter, tmp); + api->iter_skip(iter, 2); + // this is unsafe; only reader's lock is acquired + if (api->iter_inp) + api->iter_inp(iter, stress_inp_plus1, NULL); + // kref + if (api->iter_kref) + api->iter_kref(iter, &tmpkref); + // kvref + if (api->iter_kvref) + api->iter_kvref(iter, &tmpkvref); + // done + if (api->iter_park) + api->iter_park(iter); + else + api->iter_destroy(iter); + } + break; + case 3: + if (api->refpark) { + api->park(ref); + api->resume(ref); + } + break; + case 4: + if (api->iter_park) + api->iter_destroy(iter); + (void)kvmap_unref(api, ref); + ref = kvmap_ref(api, si->map); + if (api->iter_park) + iter = api->iter_create(ref); + break; + case 5: + if (api->merge) { + kv_dup2_key(key, tmp); + tmp->vlen = 8; + kvmap_kv_merge(api, ref, key, stress_merge_plus1, tmp); + } + break; + case 6: + if ((random_u64() & 0x7fffu) == 0x22 && api->delr) + (void)kvmap_kv_delr(api, ref, si->keys[rnext], (rnext + 10) < si->nkeys ? 
si->keys[rnext + 10] : NULL); + else + kvmap_kv_probe(api, ref, key); + break; + case 7: case 8: case 9: + (void)kvmap_kv_del(api, ref, key); + break; + case 10: case 11: + if (api->inpw) + kvmap_kv_inpw(api, ref, key, stress_inp_plus1, NULL); + break; + case 12: case 13: case 14: case 15: + if (!kvmap_kv_put(api, ref, key)) + wfail1++; + break; + default: + break; + } + } + nops += BATCHSIZE; + } while (time_nsec() < si->endtime); + si->wfail += wfail1; + if (api->iter_park) + api->iter_destroy(iter); + kvmap_unref(api, ref); + free(tmp); + si->tot += nops; +} + + static void +stress_co_worker(void) +{ + struct stress_info * const si = (typeof(si))co_priv(); + debug_assert(si); + stress_func(si); +} + + static void * +stress_thread_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + if (si->cpt) { + u64 hostrsp = 0; + struct corr * crs[32]; + do { // to work smoothly with ALLOCFAIL + crs[0] = corr_create(16*PGSZ, stress_co_worker, si, &hostrsp); + } while (crs[0] == NULL); + for (u32 j = 1; j < si->cpt; j++) { + do { // to work smoothly with ALLOCFAIL + crs[j] = corr_link(16*PGSZ, stress_co_worker, si, crs[j-1]); + } while (crs[j] == NULL); + } + + corr_enter(crs[0]); + for (u32 j = 0; j < si->cpt; j++) + corr_destroy(crs[j]); + } else { + stress_func(si); + } + return NULL; +} + + int +main(int argc, char ** argv) +{ + struct stress_info si = {.nkeys = 10000, .nloader = 1, .nunldr = 1, .nth = 1, .cpt = 0}; + argc--; + argv++; + int n = -1; + if ((n = kvmap_api_helper(argc, argv, NULL, &si.api, &si.map)) < 0) { + fprintf(stderr, "usage: api ... 
[<#keys>=10000 [<#load-threads>=1 [<#unload-threads>=1 [<#threads>=1 [<#co-per-thread>=0 (disabled) [=1 [=1]]]]]]]\n"); + kvmap_api_helper_message(); + exit(0); + } + argc -= n; + argv += n; + + const bool has_point = si.api->get && si.api->probe && si.api->del && si.api->put; + if (!has_point) { + fprintf(stderr, "api not supported\n"); + exit(0); + } + if (!si.api->inpw) + fprintf(stderr, "api->inpw function not found: ignored\n"); + if (!si.api->merge) + fprintf(stderr, "api->merge function not found: ignored\n"); + if (!si.api->delr) + fprintf(stderr, "api->delr function not found: ignored\n"); + + si.has_iter = si.api->iter_create && si.api->iter_seek && si.api->iter_peek && + si.api->iter_skip && si.api->iter_next && si.api->iter_destroy; + if (!si.has_iter) + fprintf(stderr, "iter functions not complete: ignored\n"); + + // generate keys + if (argc >= 1) + si.nkeys = a2u64(argv[0]); + si.keys = malloc(sizeof(struct kv *) * si.nkeys); + debug_assert(si.keys); + if (argc >= 2) + si.nloader = a2u32(argv[1]); + if (argc >= 3) + si.nunldr = a2u32(argv[2]); + if (argc >= 4) + si.nth = a2u32(argv[3]); + if (argc >= 5) + si.cpt = a2u32(argv[4]); + if (si.cpt > 32) + si.cpt = 32; +#if !defined(CORR) + if (si.cpt > 1) + fprintf(stderr, TERMCLR(35) "CORR not enabled. Compile with -DCORR to enable it.\n" TERMCLR(0)); +#endif // CORR + const u64 nr = (argc >= 6) ? a2u64(argv[5]) : 1; // default 1 + const u64 ne = (argc >= 7) ? 
a2u64(argv[6]) : 1; // default 1 + printf("stresstest: nkeys %lu ldr %u uldr %u th %u cpt %u r %lu e %lu\n", + si.nkeys, si.nloader, si.nunldr, si.nth, si.cpt, nr, ne); + + for (u64 e = 0; e < ne; e++) { + si.seqno = 0; + const u64 dtl = thread_fork_join(si.nloader, (void *)stress_load_worker, false, &si); + printf("load th %u mops %.2lf\n", si.nloader, ((double)si.nkeys) * 1e3 / ((double)dtl)); + if (si.api->fprint) + si.api->fprint(si.map, stdout); + + debug_perf_switch(); + for (u64 r = 0; r < nr; r++) { + si.tot = 0; + si.wfail = 0; + si.endtime = time_nsec() + 2000000000lu; + const u64 dt = thread_fork_join(si.nth, (void *)stress_thread_worker, false, &si); + const double mops = ((double)si.tot) * 1e3 / ((double)dt); + char ts[64]; + time_stamp(ts, 64); + const long rss = process_get_rss(); + printf("%s e %lu r %lu th %u cpt %u tot %lu mops %.2lf rss %ldkB wfail %lu\n", + ts, e, r, si.nth, si.cpt, si.tot, mops, rss, si.wfail); + debug_perf_switch(); + } + si.seqno = 0; + if (si.nunldr == 0) { // use clean + const u64 t0 = time_nsec(); + si.api->clean(si.map); + const u64 dtu = time_diff_nsec(t0); + for (u64 i = 0; i < si.nkeys; i++) + free(si.keys[i]); + printf("clean mops %.2lf\n", ((double)si.nkeys) *1e3 / ((double)dtu)); + } else { + const u64 dtu = thread_fork_join(si.nunldr, (void *)stress_unload_worker, false, &si); + printf("unload th %u mops %.2lf\n", si.nunldr, ((double)si.nkeys) *1e3 / ((double)dtu)); + } + } + + free(si.keys); + si.api->destroy(si.map); + return 0; +} diff --git a/MassTrie-beta/wormhole/stresstest.out b/MassTrie-beta/wormhole/stresstest.out new file mode 100644 index 00000000..874d359c Binary files /dev/null and b/MassTrie-beta/wormhole/stresstest.out differ diff --git a/MassTrie-beta/wormhole/wh.c b/MassTrie-beta/wormhole/wh.c new file mode 100644 index 00000000..1d31e231 --- /dev/null +++ b/MassTrie-beta/wormhole/wh.c @@ -0,0 +1,3876 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. 
No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include // static_assert +#include "lib.h" +#include "ctypes.h" +#include "kv.h" +#include "wh.h" +// }}} headers + +// def {{{ +#define WH_HMAPINIT_SIZE ((1u << 12)) // 10: 16KB/64KB 12: 64KB/256KB 14: 256KB/1MB +#define WH_SLABMETA_SIZE ((1lu << 21)) // 2MB + +#ifndef HEAPCHECKING +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB is ok +#else +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB for valgrind +#endif + +#define WH_KPN ((128u)) // keys per node; power of 2 +#define WH_HDIV (((1u << 16)) / WH_KPN) +#define WH_MID ((WH_KPN >> 1)) // ideal cut point for split, the closer the better +#define WH_BKT_NR ((8)) +#define WH_KPN2 ((WH_KPN + WH_KPN)) + +#define WH_KPN_MRG (((WH_KPN + WH_MID) >> 1 )) // 3/4 + +// FO is fixed at 256. Don't change it +#define WH_FO ((256u)) // index fan-out +// number of bits in a bitmap +#define WH_BMNR ((WH_FO >> 6)) // number of u64 +// }}} def + +// struct {{{ +struct wormmeta { + struct entry13 k13; // kref+klen + struct entry13 l13; // lmost+bitmin+bitmax + struct entry13 r13; // rmost+hash32_lo + struct entry13 p13; // lpath+hash32_hi + u64 bitmap[0]; // 4 if bitmin != bitmax +}; +static_assert(sizeof(struct wormmeta) == 32, "sizeof(wormmeta) != 32"); + +struct wormkv64 { u64 key; void * ptr; }; // u64 keys (whu64) + +struct wormleaf { + // first line + rwlock leaflock; + spinlock sortlock; // to protect the seemingly "read-only" iter_seek + au64 lv; // version (dont use the first u64) + struct wormleaf * prev; // prev leaf + struct wormleaf * next; // next leaf + struct kv * anchor; + + u32 nr_sorted; + u32 nr_keys; + u64 reserved[2]; + + struct entry13 hs[WH_KPN]; // sorted by hashes + u8 ss[WH_KPN]; // sorted by keys +}; + +struct wormslot { u16 t[WH_BKT_NR]; }; +static_assert(sizeof(struct wormslot) == 16, "sizeof(wormslot) != 16"); + +struct wormmbkt { struct wormmeta * e[WH_BKT_NR]; }; +static_assert(sizeof(struct wormmbkt) == 64, 
"sizeof(wormmbkt) != 64"); + +struct wormhmap { + au64 hv; + struct wormslot * wmap; + struct wormmbkt * pmap; + u32 mask; + u32 maxplen; + u64 msize; + + struct slab * slab1; + struct slab * slab2; + struct kv * pbuf; +}; +static_assert(sizeof(struct wormhmap) == 64, "sizeof(wormhmap) != 64"); + +struct wormhole { + // 1 line + union { + au64 hmap_ptr; // safe + struct wormhmap * hmap; // unsafe + }; + u64 padding0[6]; + struct wormleaf * leaf0; // usually not used + // 1 line + struct kvmap_mm mm; + struct qsbr * qsbr; + struct slab * slab_leaf; + struct kv * pbuf; + u32 leaftype; + u32 padding1; + // 2 lines + struct wormhmap hmap2[2]; + // fifth line + rwlock metalock; + u32 padding2[15]; +}; + +struct wormhole_iter { + struct wormref * ref; // safe-iter only + struct wormhole * map; + struct wormleaf * leaf; + u32 is; +}; + +struct wormref { + struct wormhole * map; + struct qsbr_ref qref; +}; +// }}} struct + +// helpers {{{ + +// meta {{{ + static inline struct kv * +wormmeta_keyref_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->k13.e3); +} + + static inline u16 +wormmeta_klen_load(const struct wormmeta * const meta) +{ + return meta->k13.e1; +} + + static inline struct wormleaf * +wormmeta_lmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->l13.e3 & (~0x3flu)); +} + + static inline u32 +wormmeta_bitmin_load(const struct wormmeta * const meta) +{ + return (u32)(meta->l13.v64 & 0x1fflu); +} + + static inline u32 +wormmeta_bitmax_load(const struct wormmeta * const meta) +{ + return (u32)((meta->l13.v64 >> 9) & 0x1fflu); +} + + static inline u32 +wormmeta_hash32_load(const struct wormmeta * const meta) +{ + return ((u32)meta->r13.e1) | (((u32)meta->p13.e1) << 16); +} + + static inline struct wormleaf * +wormmeta_rmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->r13.e3); +} + + static inline struct wormleaf * +wormmeta_lpath_load(const struct wormmeta * const meta) +{ + return 
u64_to_ptr(meta->p13.e3); +} + +// internal + static inline void +wormmeta_lpath_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->p13, ptr_to_u64(leaf)); +} + +// also updates leaf_klen_eq and + static inline void +wormmeta_lmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + const u64 minmax = meta->l13.v64 & 0x3fffflu; + meta->l13.v64 = (((u64)leaf) << 16) | minmax; + + const bool leaf_klen_eq = leaf->anchor->klen == wormmeta_klen_load(meta); + wormmeta_lpath_store(meta, leaf_klen_eq ? leaf : leaf->prev); +} + + static inline void +wormmeta_bitmin_store(struct wormmeta * const meta, const u32 bitmin) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x1fflu)) | bitmin; +} + + static inline void +wormmeta_bitmax_store(struct wormmeta * const meta, const u32 bitmax) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x3fe00lu)) | (bitmax << 9); +} + + static inline void +wormmeta_rmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->r13, ptr_to_u64(leaf)); +} + +// for wormmeta_alloc + static void +wormmeta_init(struct wormmeta * const meta, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + keyref->refcnt++; // shared + + const u32 plen = keyref->klen; + debug_assert(plen <= UINT16_MAX); + meta->k13 = entry13((u16)plen, ptr_to_u64(keyref)); + meta->l13.v64 = (ptr_to_u64(lrmost) << 16) | (bit << 9) | bit; + + const u32 hash32 = keyref->hashlo; + meta->r13 = entry13((u16)hash32, ptr_to_u64(lrmost)); + + const bool leaf_klen_eq = alen == plen; + meta->p13 = entry13((u16)(hash32 >> 16), ptr_to_u64(leaf_klen_eq ? 
lrmost : lrmost->prev)); +} +// }}} meta + +// meta-bitmap {{{ + static inline bool +wormmeta_bm_test(const struct wormmeta * const meta, const u32 id) +{ + debug_assert(id < WH_FO); + const u32 bitmin = wormmeta_bitmin_load(meta); + const u32 bitmax = wormmeta_bitmax_load(meta); + if (bitmin == bitmax) { // half node + return bitmin == id; + } else { // full node + return (bool)((meta->bitmap[id >> 6u] >> (id & 0x3fu)) & 1lu); + } +} + +// meta must be a full node + static void +wormmeta_bm_set(struct wormmeta * const meta, const u32 id) +{ + // need to replace meta + u64 * const ptr = &(meta->bitmap[id >> 6u]); + const u64 bit = 1lu << (id & 0x3fu); + if ((*ptr) & bit) + return; + + (*ptr) |= bit; + + // min + if (id < wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, id); + + // max + const u32 oldmax = wormmeta_bitmax_load(meta); + if (oldmax == WH_FO || id > oldmax) + wormmeta_bitmax_store(meta, id); +} + +// find the lowest bit > id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_gt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ~((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + + while (++ix < WH_BMNR) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + } + + return WH_FO; +} + +// find the highest bit that is lower than the id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_lt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + + while (ix--) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + } + + return WH_FO; +} + +// meta must be a full node + static inline void +wormmeta_bm_clear(struct wormmeta * const meta, const u32 id) +{ + debug_assert(wormmeta_bitmin_load(meta) < 
wormmeta_bitmax_load(meta)); + meta->bitmap[id >> 6u] &= (~(1lu << (id & 0x3fu))); + + // min + if (id == wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, wormmeta_bm_gt(meta, id)); + + // max + if (id == wormmeta_bitmax_load(meta)) + wormmeta_bitmax_store(meta, wormmeta_bm_lt(meta, id)); +} +// }}} meta-bitmap + +// key/prefix {{{ + static inline u16 +wormhole_pkey(const u32 hash32) +{ + const u16 pkey0 = ((u16)hash32) ^ ((u16)(hash32 >> 16)); + return pkey0 ? pkey0 : 1; +} + + static inline u32 +wormhole_bswap(const u32 hashlo) +{ + return __builtin_bswap32(hashlo); +} + + static inline bool +wormhole_key_meta_match(const struct kv * const key, const struct wormmeta * const meta) +{ + return (key->klen == wormmeta_klen_load(meta)) + && (!memcmp(key->kv, wormmeta_keyref_load(meta)->kv, key->klen)); +} + +// called by get_kref_slot + static inline bool +wormhole_kref_meta_match(const struct kref * const kref, + const struct wormmeta * const meta) +{ + return (kref->len == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, wormmeta_keyref_load(meta)->kv, kref->len)); +} + +// called from meta_down ... 
get_kref1_slot +// will access rmost, prefetching is effective here + static inline bool +wormhole_kref1_meta_match(const struct kref * const kref, + const struct wormmeta * const meta, const u8 cid) +{ + const u8 * const keybuf = wormmeta_keyref_load(meta)->kv; + const u32 plen = kref->len; + return ((plen + 1) == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, keybuf, plen)) + && (keybuf[plen] == cid); +} + +// warning: be careful with buffer overflow + static inline void +wormhole_prefix(struct kv * const pfx, const u32 klen) +{ + pfx->klen = klen; + kv_update_hash(pfx); +} + +// for split + static inline void +wormhole_prefix_inc1(struct kv * const pfx) +{ + pfx->hashlo = crc32c_u8(pfx->hashlo, pfx->kv[pfx->klen]); + pfx->klen++; +} + +// meta_lcp only + static inline void +wormhole_kref_inc(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} + +// meta_lcp only + static inline void +wormhole_kref_inc_123(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc_123(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} +// }}} key/prefix + +// alloc {{{ + static inline struct kv * +wormhole_alloc_akey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_akey(struct kv * const akey) +{ + free(akey); +} + + static inline struct kv * +wormhole_alloc_mkey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_mkey(struct kv * const mkey) +{ + free(mkey); +} + + static struct wormleaf * +wormleaf_alloc(struct wormhole * const map, struct wormleaf * const prev, + struct wormleaf * const next, struct kv * const anchor) +{ + struct wormleaf * const leaf = 
slab_alloc_safe(map->slab_leaf); + if (leaf == NULL) + return NULL; + + rwlock_init(&(leaf->leaflock)); + spinlock_init(&(leaf->sortlock)); + + // keep the old version; new version will be assigned by split functions + //leaf->lv = 0; + + leaf->prev = prev; + leaf->next = next; + leaf->anchor = anchor; + + leaf->nr_keys = 0; + leaf->nr_sorted = 0; + + // hs requires zero init. + memset(leaf->hs, 0, sizeof(leaf->hs[0]) * WH_KPN); + return leaf; +} + + static void +wormleaf_free(struct slab * const slab, struct wormleaf * const leaf) +{ + debug_assert(leaf->leaflock.opaque == 0); + wormhole_free_akey(leaf->anchor); + slab_free_safe(slab, leaf); +} + + static struct wormmeta * +wormmeta_alloc(struct wormhmap * const hmap, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + debug_assert(alen <= UINT16_MAX); + debug_assert(lrmost && keyref); + + struct wormmeta * const meta = slab_alloc_unsafe(hmap->slab1); + if (meta == NULL) + return NULL; + + wormmeta_init(meta, lrmost, keyref, alen, bit); + return meta; +} + + static inline bool +wormhole_slab_reserve(struct wormhole * const map, const u32 nr) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return false; +#endif + for (u32 i = 0; i < 2; i++) { + if (!(map->hmap2[i].slab1 && map->hmap2[i].slab2)) + continue; + if (!slab_reserve_unsafe(map->hmap2[i].slab1, nr)) + return false; + if (!slab_reserve_unsafe(map->hmap2[i].slab2, nr)) + return false; + } + return true; +} + + static void +wormmeta_keyref_release(struct wormmeta * const meta) +{ + struct kv * const keyref = wormmeta_keyref_load(meta); + debug_assert(keyref->refcnt); + keyref->refcnt--; + if (keyref->refcnt == 0) + wormhole_free_mkey(keyref); +} + + static void +wormmeta_free(struct wormhmap * const hmap, struct wormmeta * const meta) +{ + wormmeta_keyref_release(meta); + slab_free_unsafe(hmap->slab1, meta); +} +// }}} alloc + +// lock {{{ + static void +wormleaf_lock_write(struct wormleaf * const leaf, struct wormref * 
const ref)
{
  // fast path: uncontended trylock avoids the park/resume round trip
  if (!rwlock_trylock_write(&(leaf->leaflock))) {
    wormhole_park(ref);
    rwlock_lock_write(&(leaf->leaflock));
    wormhole_resume(ref);
  }
}

// Acquire a leaf's read lock; parks the reference if it must block.
  static void
wormleaf_lock_read(struct wormleaf * const leaf, struct wormref * const ref)
{
  if (!rwlock_trylock_read(&(leaf->leaflock))) {
    wormhole_park(ref);
    rwlock_lock_read(&(leaf->leaflock));
    wormhole_resume(ref);
  }
}

  static void
wormleaf_unlock_write(struct wormleaf * const leaf)
{
  rwlock_unlock_write(&(leaf->leaflock));
}

  static void
wormleaf_unlock_read(struct wormleaf * const leaf)
{
  rwlock_unlock_read(&(leaf->leaflock));
}

// Acquire the map-wide metadata write lock; parks the reference if blocking.
  static void
wormhmap_lock(struct wormhole * const map, struct wormref * const ref)
{
  if (!rwlock_trylock_write(&(map->metalock))) {
    wormhole_park(ref);
    rwlock_lock_write(&(map->metalock));
    wormhole_resume(ref);
  }
}

  static inline void
wormhmap_unlock(struct wormhole * const map)
{
  rwlock_unlock_write(&(map->metalock));
}
// }}} lock

// hmap-version {{{
// Return the sibling of hmap in the map's two-element hmap2 array.
  static inline struct wormhmap *
wormhmap_switch(struct wormhole * const map, struct wormhmap * const hmap)
{
  return (hmap == map->hmap2) ?
(hmap + 1) : (hmap - 1);
}

// Load the currently-published hmap pointer (acquire pairs with the
// release in wormhmap_store).
  static inline struct wormhmap *
wormhmap_load(struct wormhole * const map)
{
  return (struct wormhmap *)atomic_load_explicit(&(map->hmap_ptr), MO_ACQUIRE);
}

// Publish a new current hmap (release store).
  static inline void
wormhmap_store(struct wormhole * const map, struct wormhmap * const hmap)
{
  atomic_store_explicit(&(map->hmap_ptr), (u64)hmap, MO_RELEASE);
}

  static inline u64
wormhmap_version_load(const struct wormhmap * const hmap)
{
  // no concurrent access
  return atomic_load_explicit(&(hmap->hv), MO_ACQUIRE);
}

  static inline void
wormhmap_version_store(struct wormhmap * const hmap, const u64 v)
{
  atomic_store_explicit(&(hmap->hv), v, MO_RELEASE);
}

  static inline u64
wormleaf_version_load(struct wormleaf * const leaf)
{
  return atomic_load_explicit(&(leaf->lv), MO_CONSUME);
}

  static inline void
wormleaf_version_store(struct wormleaf * const leaf, const u64 v)
{
  atomic_store_explicit(&(leaf->lv), v, MO_RELEASE);
}
// }}} hmap-version

// co {{{
// Prefetch a pmap bucket; a no-op under CORR (coroutine) builds where the
// prefetch+yield happens at the access site instead.
  static inline void
wormhmap_prefetch_pmap(const struct wormhmap * const hmap, const u32 idx)
{
#if defined(CORR)
  (void)hmap;
  (void)idx;
#else
  cpu_prefetch0(&(hmap->pmap[idx]));
#endif
}

// Fetch the i-th meta pointer of bucket mid; under CORR, prefetch it and
// yield so another coroutine runs while the cache line loads.
  static inline struct wormmeta *
wormhmap_get_meta(const struct wormhmap * const hmap, const u32 mid, const u32 i)
{
  struct wormmeta * const meta = hmap->pmap[mid].e[i];
#if defined(CORR)
  cpu_prefetch0(meta);
  corr_yield();
#endif
  return meta;
}

// Prefetch the hs entries around the key's home slot (pkey/WH_HDIV) before
// the in-leaf search touches them.
  static inline void
wormleaf_prefetch(struct wormleaf * const leaf, const u32 hashlo)
{
  const u32 i = wormhole_pkey(hashlo) / WH_HDIV;
#if defined(CORR)
  cpu_prefetch0(leaf);
  cpu_prefetch0(&(leaf->hs[i-4]));
  cpu_prefetch0(&(leaf->hs[i+4]));
  corr_yield();
#else
  cpu_prefetch0(&(leaf->hs[i]));
#endif
}

// kref-vs-kv match; under CORR prefetch the kv's lines and yield before
// running the actual comparison.
  static inline bool
wormhole_kref_kv_match(const struct kref * const key, const struct kv * const curr)
{
#if defined(CORR)
  const u8 * const ptr = (typeof(ptr))curr;
  cpu_prefetch0(ptr);
  cpu_prefetch0(ptr
+ 64);
  if (key->len > 56) {
    cpu_prefetch0(ptr + 128);
    cpu_prefetch0(ptr + 192);
  }
  corr_yield();
#endif
  return kref_kv_match(key, curr);
}

// Report quiescence (version v) to QSBR; under CORR also yield.
  static inline void
wormhole_qsbr_update_pause(struct wormref * const ref, const u64 v)
{
  qsbr_update(&ref->qref, v);
#if defined(CORR)
  corr_yield();
#endif
}
// }}} co

// }}} helpers

// hmap {{{
// hmap is the MetaTrieHT of Wormhole
// Allocate pmap (meta pointers) and wmap (16-bit tag slots) as one mapping:
// pmap at offset 0, wmap at offset psize. Initial table has
// WH_HMAPINIT_SIZE buckets (mask is size-1, so size must be a power of 2).
  static bool
wormhmap_init(struct wormhmap * const hmap, struct kv * const pbuf)
{
  const u64 wsize = sizeof(hmap->wmap[0]) * WH_HMAPINIT_SIZE;
  const u64 psize = sizeof(hmap->pmap[0]) * WH_HMAPINIT_SIZE;
  u64 msize = wsize + psize;
  u8 * const mem = pages_alloc_best(msize, true, &msize);
  if (mem == NULL)
    return false;

  hmap->pmap = (typeof(hmap->pmap))mem;
  hmap->wmap = (typeof(hmap->wmap))(mem + psize);
  hmap->msize = msize;
  hmap->mask = WH_HMAPINIT_SIZE - 1;
  wormhmap_version_store(hmap, 0);
  hmap->maxplen = 0;
  hmap->pbuf = pbuf;
  return true;
}

// Unmap the hmap arrays; safe on a partially-initialized hmap (NULL pmap).
  static inline void
wormhmap_deinit(struct wormhmap * const hmap)
{
  if (hmap->pmap) {
    pages_unmap(hmap->pmap, hmap->msize);
    hmap->pmap = NULL;
    hmap->wmap = NULL;
  }
}

// All-zero 128-bit vector (used to find empty tag slots).
  static inline m128
wormhmap_zero(void)
{
#if defined(__x86_64__)
  return _mm_setzero_si128();
#elif defined(__aarch64__)
  return vdupq_n_u8(0);
#endif
}

// Broadcast the 16-bit partial key into all 8 lanes for SIMD tag compare.
  static inline m128
wormhmap_m128_pkey(const u16 pkey)
{
#if defined(__x86_64__)
  return _mm_set1_epi16((short)pkey);
#elif defined(__aarch64__)
  return vreinterpretq_u8_u16(vdupq_n_u16(pkey));
#endif
}

// Compare the slot's 8 u16 tags against skey; the returned mask has TWO
// bits set per matching lane (x86 movemask is per-byte; the NEON path
// builds the same 2-bits-per-lane layout via mbits), so callers step by
// bit pairs (i2>>1, mask ^= 3u<<i2).
  static inline u32
wormhmap_match_mask(const struct wormslot * const s, const m128 skey)
{
#if defined(__x86_64__)
  const m128 sv = _mm_load_si128((const void *)s);
  return (u32)_mm_movemask_epi8(_mm_cmpeq_epi16(skey, sv));
#elif defined(__aarch64__)
  const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s
  const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000
  static const
uint16x8_t mbits = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000};
  return (u32)vaddvq_u16(vandq_u16(cmp, mbits));
#endif
}

// True when any tag in the slot equals skey (cheaper than a full mask).
  static inline bool
wormhmap_match_any(const struct wormslot * const s, const m128 skey)
{
#if defined(__x86_64__)
  return wormhmap_match_mask(s, skey) != 0;
#elif defined(__aarch64__)
  const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s
  const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000
  return vaddvq_u32(vreinterpretq_u32_u16(cmp)) != 0;
#endif
}

// meta_lcp only
// Tag-only membership probe over both candidate buckets. May report a
// false positive (tags can collide); never a false negative.
  static inline bool
wormhmap_peek(const struct wormhmap * const hmap, const u32 hash32)
{
  const m128 sk = wormhmap_m128_pkey(wormhole_pkey(hash32));
  const u32 midx = hash32 & hmap->mask;
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  return wormhmap_match_any(&(hmap->wmap[midx]), sk)
    || wormhmap_match_any(&(hmap->wmap[midy]), sk);
}

// Scan one bucket for a meta whose full key matches; the 2-bits-per-entry
// mask layout drives the i2>>1 index and the (3u << i2) pair clear.
  static inline struct wormmeta *
wormhmap_get_slot(const struct wormhmap * const hmap, const u32 mid,
    const m128 skey, const struct kv * const key)
{
  u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey);
  while (mask) {
    const u32 i2 = (u32)__builtin_ctz(mask);
    struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1);
    if (likely(wormhole_key_meta_match(key, meta)))
      return meta;
    mask ^= (3u << i2);
  }
  return NULL;
}

// Cuckoo lookup by full kv key: primary bucket first, then the secondary
// (byte-swapped hash) bucket.
  static struct wormmeta *
wormhmap_get(const struct wormhmap * const hmap, const struct kv * const key)
{
  const u32 hash32 = key->hashlo;
  const u32 midx = hash32 & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midx);
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midy);
  const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32));

  struct wormmeta * const r = wormhmap_get_slot(hmap, midx, skey, key);
  if (r)
    return r;
  return wormhmap_get_slot(hmap, midy, skey, key);
}

// for meta_lcp only
  static inline struct wormmeta *
// Bucket scan keyed by a kref (key prefix reference) instead of a full kv.
wormhmap_get_kref_slot(const struct wormhmap * const hmap, const u32 mid,
    const m128 skey, const struct kref * const kref)
{
  u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey);
  while (mask) {
    const u32 i2 = (u32)__builtin_ctz(mask);
    struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1);
    if (likely(wormhole_kref_meta_match(kref, meta)))
      return meta;

    mask ^= (3u << i2);
  }
  return NULL;
}

// for meta_lcp only
// Two-bucket cuckoo lookup by kref; same shape as wormhmap_get.
  static inline struct wormmeta *
wormhmap_get_kref(const struct wormhmap * const hmap, const struct kref * const kref)
{
  const u32 hash32 = kref->hash32;
  const u32 midx = hash32 & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midx);
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midy);
  const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32));

  struct wormmeta * const r = wormhmap_get_kref_slot(hmap, midx, skey, kref);
  if (r)
    return r;
  return wormhmap_get_kref_slot(hmap, midy, skey, kref);
}

// for meta_down only
// Bucket scan matching the prefix extended by one byte (cid).
  static inline struct wormmeta *
wormhmap_get_kref1_slot(const struct wormhmap * const hmap, const u32 mid,
    const m128 skey, const struct kref * const kref, const u8 cid)
{
  u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey);
  while (mask) {
    const u32 i2 = (u32)__builtin_ctz(mask);
    struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1);
    //cpu_prefetch0(wormmeta_rmost_load(meta)); // will access
    if (likely(wormhole_kref1_meta_match(kref, meta, cid)))
      return meta;

    mask ^= (3u << i2);
  }
  return NULL;
}

// for meta_down only
// Lookup of kref + one extra byte; the hash is extended incrementally with
// crc32c_u8 rather than rehashing the whole prefix.
  static inline struct wormmeta *
wormhmap_get_kref1(const struct wormhmap * const hmap,
    const struct kref * const kref, const u8 cid)
{
  const u32 hash32 = crc32c_u8(kref->hash32, cid);
  const u32 midx = hash32 & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midx);
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midy);
  const m128 skey =
wormhmap_m128_pkey(wormhole_pkey(hash32));

  struct wormmeta * const r = wormhmap_get_kref1_slot(hmap, midx, skey, kref, cid);
  if (r)
    return r;
  return wormhmap_get_kref1_slot(hmap, midy, skey, kref, cid);
}

// Number of occupied entries in a bucket: position of the first zero tag
// (entries are kept contiguous), or 8 when the bucket is full.
  static inline u32
wormhmap_slot_count(const struct wormslot * const slot)
{
  const u32 mask = wormhmap_match_mask(slot, wormhmap_zero());
  return mask ? ((u32)__builtin_ctz(mask) >> 1) : 8;
}

// After an expansion: move entries parked at their secondary bucket back to
// their primary bucket when it has room, shortening future lookups.
  static inline void
wormhmap_squeeze(const struct wormhmap * const hmap)
{
  struct wormslot * const wmap = hmap->wmap;
  struct wormmbkt * const pmap = hmap->pmap;
  const u32 mask = hmap->mask;
  const u64 nrs64 = ((u64)(hmap->mask)) + 1; // must use u64; u32 can overflow
  for (u64 si64 = 0; si64 < nrs64; si64++) { // # of buckets
    const u32 si = (u32)si64;
    u32 ci = wormhmap_slot_count(&(wmap[si]));
    for (u32 ei = ci - 1; ei < WH_BKT_NR; ei--) { // counts down; u32 wrap past 0 ends the loop
      struct wormmeta * const meta = pmap[si].e[ei];
      const u32 sj = wormmeta_hash32_load(meta) & mask; // first hash
      if (sj == si)
        continue;

      // move
      const u32 ej = wormhmap_slot_count(&(wmap[sj]));
      if (ej < WH_BKT_NR) { // has space at home location
        wmap[sj].t[ej] = wmap[si].t[ei];
        pmap[sj].e[ej] = pmap[si].e[ei];
        const u32 ni = ci - 1;
        if (ei < ni) { // plug the hole with the current last entry
          wmap[si].t[ei] = wmap[si].t[ni];
          pmap[si].e[ei] = pmap[si].e[ni];
        }
        wmap[si].t[ni] = 0;
        pmap[si].e[ni] = NULL;
        ci--;
      }
    }
  }
}

// Double the table and rehash every entry into the new arrays; blocks
// (sleep-wait) if memory is unavailable.
  static void
wormhmap_expand(struct wormhmap * const hmap)
{
  // sync expand
  const u32 mask0 = hmap->mask;
  if (mask0 == UINT32_MAX)
    debug_die();
  const u32 nr0 = mask0 + 1;
  const u32 mask1 = mask0 + nr0;
  const u64 nr1 = ((u64)nr0) << 1; // must use u64; u32 can overflow
  const u64 wsize = nr1 * sizeof(hmap->wmap[0]);
  const u64 psize = nr1 * sizeof(hmap->pmap[0]);
  u64 msize = wsize + psize;
  u8 * mem = pages_alloc_best(msize, true, &msize);
  if (mem == NULL) {
    // We are at a very deep call stack from wormhole_put().
// Gracefully handling the failure requires lots of changes.
    // Currently we simply wait for available memory
    // TODO: gracefully return with insertion failure
    char ts[64];
    time_stamp(ts, 64);
    fprintf(stderr, "%s %s sleep-wait for memory allocation %lukB\n",
        __func__, ts, msize >> 10);
    do {
      sleep(1);
      mem = pages_alloc_best(msize, true, &msize);
    } while (mem == NULL);
    time_stamp(ts, 64);
    fprintf(stderr, "%s %s memory allocation done\n", __func__, ts);
  }

  struct wormhmap hmap1 = *hmap;
  hmap1.pmap = (typeof(hmap1.pmap))mem;
  hmap1.wmap = (typeof(hmap1.wmap))(mem + psize);
  hmap1.msize = msize;
  hmap1.mask = mask1;

  const struct wormslot * const wmap0 = hmap->wmap;
  const struct wormmbkt * const pmap0 = hmap->pmap;

  // rehash: an entry found at its primary bucket keeps the primary hash;
  // one parked at its secondary bucket is re-placed via the bswap hash
  for (u32 s = 0; s < nr0; s++) {
    const struct wormmbkt * const bkt = &pmap0[s];
    for (u32 i = 0; (i < WH_BKT_NR) && bkt->e[i]; i++) {
      const struct wormmeta * const meta = bkt->e[i];
      const u32 hash32 = wormmeta_hash32_load(meta);
      const u32 idx0 = hash32 & mask0;
      const u32 idx1 = ((idx0 == s) ? hash32 : wormhole_bswap(hash32)) & mask1;

      const u32 n = wormhmap_slot_count(&(hmap1.wmap[idx1]));
      debug_assert(n < 8);
      hmap1.wmap[idx1].t[n] = wmap0[s].t[i];
      hmap1.pmap[idx1].e[n] = bkt->e[i];
    }
  }
  pages_unmap(hmap->pmap, hmap->msize);
  hmap->pmap = hmap1.pmap;
  hmap->wmap = hmap1.wmap;
  hmap->msize = hmap1.msize;
  hmap->mask = hmap1.mask;
  wormhmap_squeeze(hmap);
}

// Cuckoo insertion into bucket mid0: append when there is room; otherwise
// try to evict a resident entry to its alternate bucket (recursion bounded
// by depth). Returns false when no placement was found.
  static bool
wormhmap_cuckoo(struct wormhmap * const hmap, const u32 mid0,
    struct wormmeta * const e0, const u16 s0, const u32 depth)
{
  const u32 ii = wormhmap_slot_count(&(hmap->wmap[mid0]));
  if (ii < WH_BKT_NR) {
    hmap->wmap[mid0].t[ii] = s0;
    hmap->pmap[mid0].e[ii] = e0;
    return true;
  } else if (depth == 0) {
    return false;
  }

  // depth > 0
  struct wormmbkt * const bkt = &(hmap->pmap[mid0]);
  u16 * const sv = &(hmap->wmap[mid0].t[0]);
  for (u32 i = 0; i < WH_BKT_NR; i++) {
    const struct wormmeta * const meta = bkt->e[i];
    debug_assert(meta);
    const u32 hash32 = wormmeta_hash32_load(meta);

    const u32 midx = hash32 & hmap->mask;
    const u32 midy = wormhole_bswap(hash32) & hmap->mask;
    const u32 midt = (midx != mid0) ? midx : midy;
    if (midt != mid0) { // possible
      // no penalty if moving someone back to its 1st hash location
      const u32 depth1 = (midt == midx) ?
depth : (depth - 1);
      if (wormhmap_cuckoo(hmap, midt, bkt->e[i], sv[i], depth1)) {
        bkt->e[i] = e0;
        sv[i] = s0;
        return true;
      }
    }
  }
  return false;
}

// Insert a meta node: cuckoo-place at either bucket with increasing effort;
// expand the table and retry when all placements fail.
  static void
wormhmap_set(struct wormhmap * const hmap, struct wormmeta * const meta)
{
  const u32 hash32 = wormmeta_hash32_load(meta);
  const u32 midx = hash32 & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midx);
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  wormhmap_prefetch_pmap(hmap, midy);
  const u16 pkey = wormhole_pkey(hash32);
  // insert with cuckoo
  if (likely(wormhmap_cuckoo(hmap, midx, meta, pkey, 1)))
    return;
  if (wormhmap_cuckoo(hmap, midy, meta, pkey, 1))
    return;
  if (wormhmap_cuckoo(hmap, midx, meta, pkey, 2))
    return;

  // expand
  wormhmap_expand(hmap);

  wormhmap_set(hmap, meta); // retry in the doubled table
}

// Remove meta from bucket mid if present; the last occupied entry fills the
// hole so entries remain contiguous (slot_count invariant).
  static bool
wormhmap_del_slot(struct wormhmap * const hmap, const u32 mid,
    const struct wormmeta * const meta, const m128 skey)
{
  u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey);
  while (mask) {
    const u32 i2 = (u32)__builtin_ctz(mask);
    const struct wormmeta * const meta1 = hmap->pmap[mid].e[i2>>1];
    if (likely(meta == meta1)) {
      const u32 i = i2 >> 1;
      const u32 j = wormhmap_slot_count(&(hmap->wmap[mid])) - 1;
      hmap->wmap[mid].t[i] = hmap->wmap[mid].t[j];
      hmap->pmap[mid].e[i] = hmap->pmap[mid].e[j];
      hmap->wmap[mid].t[j] = 0;
      hmap->pmap[mid].e[j] = NULL;
      return true;
    }
    mask -= (3u << i2);
  }
  return false;
}

// Delete meta from whichever of its two buckets currently holds it.
  static bool
wormhmap_del(struct wormhmap * const hmap, const struct wormmeta * const meta)
{
  const u32 hash32 = wormmeta_hash32_load(meta);
  const u32 midx = hash32 & hmap->mask;
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32));
  return wormhmap_del_slot(hmap, midx, meta, skey)
    || wormhmap_del_slot(hmap, midy, meta, skey);
}

// Replace the pointer to `old` with `new` in bucket mid; tags are left
// untouched (old and new share the same hash).
  static bool
wormhmap_replace_slot(struct wormhmap * const hmap, const u32 mid,
    const struct wormmeta * const old, const
m128 skey, struct wormmeta * const new)
{
  u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey);
  while (mask) {
    const u32 i2 = (u32)__builtin_ctz(mask);
    struct wormmeta ** const pslot = &hmap->pmap[mid].e[i2>>1];
    if (likely(old == *pslot)) {
      *pslot = new;
      return true;
    }
    mask -= (3u << i2);
  }
  return false;
}

// Swap old for new in place; the hash (and thus bucket/tag) is taken from
// old, so new must hash identically.
  static bool
wormhmap_replace(struct wormhmap * const hmap, const struct wormmeta * const old, struct wormmeta * const new)
{
  const u32 hash32 = wormmeta_hash32_load(old);
  const u32 midx = hash32 & hmap->mask;
  const u32 midy = wormhole_bswap(hash32) & hmap->mask;
  const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32));
  return wormhmap_replace_slot(hmap, midx, old, skey, new)
    || wormhmap_replace_slot(hmap, midy, old, skey, new);
}
// }}} hmap

// create {{{
// it's unsafe
// Build the initial leaf (anchored at the empty key) and register its meta
// node in every active hmap. Each failure path unwinds what was allocated.
  static bool
wormhole_create_leaf0(struct wormhole * const map)
{
  const bool sr = wormhole_slab_reserve(map, 1);
  if (unlikely(!sr))
    return false;

  // create leaf of empty key
  struct kv * const anchor = wormhole_alloc_akey(0);
  if (anchor == NULL)
    return false;
  kv_dup2(kv_null(), anchor);

  struct wormleaf * const leaf0 = wormleaf_alloc(map, NULL, NULL, anchor);
  if (leaf0 == NULL) {
    wormhole_free_akey(anchor);
    return false;
  }

  struct kv * const mkey = wormhole_alloc_mkey(0);
  if (mkey == NULL) {
    wormleaf_free(map->slab_leaf, leaf0);
    return false;
  }

  wormhole_prefix(mkey, 0);
  mkey->refcnt = 0;
  // create meta of empty key
  for (u32 i = 0; i < 2; i++) {
    if (map->hmap2[i].slab1) { // skip the inactive hmap in the 1-hmap variant
      struct wormmeta * const m0 = wormmeta_alloc(&map->hmap2[i], leaf0, mkey, 0, WH_FO);
      debug_assert(m0); // already reserved enough
      wormhmap_set(&(map->hmap2[i]), m0);
    }
  }

  map->leaf0 = leaf0;
  return true;
}

// Shared constructor; nh is the number of hmaps to initialize (2 for the
// thread-safe map, 1 for the single-threaded "unsafe" variant). All error
// paths funnel through `fail` for a full unwind.
  static struct wormhole *
wormhole_create_internal(const struct kvmap_mm * const mm, const u32 nh)
{
  struct wormhole * const map = yalloc(sizeof(*map));
  if (map == NULL)
    return NULL;
memset(map, 0, sizeof(*map));
  // mm: caller-provided memory-management callbacks, or the default dup mm
  map->mm = mm ? (*mm) : kvmap_mm_dup;

  // pbuf for meta-merge
  map->pbuf = yalloc(1lu << 16); // 64kB
  if (map->pbuf == NULL)
    goto fail;

  // hmap
  for (u32 i = 0; i < nh; i++) {
    struct wormhmap * const hmap = &map->hmap2[i];
    if (!wormhmap_init(hmap, map->pbuf))
      goto fail;

    hmap->slab1 = slab_create(sizeof(struct wormmeta), WH_SLABMETA_SIZE);
    if (hmap->slab1 == NULL)
      goto fail;

    // slab2 objects carry an extra WH_BMNR-word bitmap after the meta
    hmap->slab2 = slab_create(sizeof(struct wormmeta) + (sizeof(u64) * WH_BMNR), WH_SLABMETA_SIZE);
    if (hmap->slab2 == NULL)
      goto fail;
  }

  // leaf slab
  map->slab_leaf = slab_create(sizeof(struct wormleaf), WH_SLABLEAF_SIZE);
  if (map->slab_leaf == NULL)
    goto fail;

  // qsbr
  map->qsbr = qsbr_create();
  if (map->qsbr == NULL)
    goto fail;

  // leaf0
  if (!wormhole_create_leaf0(map))
    goto fail;

  rwlock_init(&(map->metalock));
  wormhmap_store(map, &map->hmap2[0]);
  return map;

fail:
  // unwind everything; the destroy/deinit calls tolerate partially-built state
  if (map->qsbr)
    qsbr_destroy(map->qsbr);

  if (map->slab_leaf)
    slab_destroy(map->slab_leaf);

  for (u32 i = 0; i < nh; i++) {
    struct wormhmap * const hmap = &map->hmap2[i];
    if (hmap->slab1)
      slab_destroy(hmap->slab1);
    if (hmap->slab2)
      slab_destroy(hmap->slab2);
    wormhmap_deinit(hmap);
  }

  if (map->pbuf)
    free(map->pbuf);

  free(map);
  return NULL;
}

// Public constructor: thread-safe wormhole (two hmap versions).
  struct wormhole *
wormhole_create(const struct kvmap_mm * const mm)
{
  return wormhole_create_internal(mm, 2);
}

// Public constructor: single-threaded (unsafe) variant (one hmap).
  struct wormhole *
whunsafe_create(const struct kvmap_mm * const mm)
{
  return wormhole_create_internal(mm, 1);
}
// }}} create

// jump {{{

// lcp {{{
// search in the hash table for the Longest Prefix Match of the search key
// The corresponding wormmeta node is returned and the LPM is recorded in kref
  static struct wormmeta *
wormhole_meta_lcp(const struct wormhmap * const hmap, struct kref * const kref, const u32 klen)
{
  // invariant: lo <= lcp < (lo + gd)
  // ending condition: gd == 1
  u32 gd =
(hmap->maxplen < klen ? hmap->maxplen : klen) + 1u;
  u32 lo = 0;
  u32 loh = KV_CRC32C_SEED;

  // Phase 1: search on the prefix length using the cheap (possibly
  // false-positive) wormhmap_peek; the prefix hash is extended
  // incrementally with crc32c as lo grows.
#define META_LCP_GAP_1 ((7u))
  while (META_LCP_GAP_1 < gd) {
    const u32 inc = gd >> 3 << 2; // x4
    const u32 hash32 = crc32c_inc_x4(kref->ptr + lo, inc, loh);
    if (wormhmap_peek(hmap, hash32)) {
      loh = hash32;
      lo += inc;
      gd -= inc;
    } else {
      gd = inc;
    }
  }

  while (1 < gd) {
    const u32 inc = gd >> 1;
    const u32 hash32 = crc32c_inc_123(kref->ptr + lo, inc, loh);
    if (wormhmap_peek(hmap, hash32)) {
      loh = hash32;
      lo += inc;
      gd -= inc;
    } else {
      gd = inc;
    }
  }
#undef META_LCP_GAP_1

  kref->hash32 = loh;
  kref->len = lo;
  struct wormmeta * ret = wormhmap_get_kref(hmap, kref);
  if (likely(ret != NULL)) // peek never false-positived: done
    return ret;

  // Phase 2: a peek was a false positive; redo with real lookups,
  // narrowing within [0, lo]
  gd = lo;
  lo = 0;
  loh = KV_CRC32C_SEED;

#define META_LCP_GAP_2 ((5u))
  while (META_LCP_GAP_2 < gd) {
    const u32 inc = (gd * 3) >> 2;
    wormhole_kref_inc(kref, lo, loh, inc);
    struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref);
    if (tmp) {
      loh = kref->hash32;
      lo += inc;
      gd -= inc;
      ret = tmp;
      // if the next byte is a child of this node the LPM is longer
      if (wormmeta_bm_test(tmp, kref->ptr[lo])) {
        loh = crc32c_u8(loh, kref->ptr[lo]);
        lo++;
        gd--;
        ret = NULL;
      } else {
        gd = 1;
        break;
      }
    } else {
      gd = inc;
    }
  }

  while (1 < gd) {
    const u32 inc = (gd * 3) >> 2;
    wormhole_kref_inc_123(kref, lo, loh, inc);
    struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref);
    if (tmp) {
      loh = kref->hash32;
      lo += inc;
      gd -= inc;
      ret = tmp;
      if (wormmeta_bm_test(tmp, kref->ptr[lo])) {
        loh = crc32c_u8(loh, kref->ptr[lo]);
        lo++;
        gd--;
        ret = NULL;
      } else {
        break;
      }
    } else {
      gd = inc;
    }
  }
#undef META_LCP_GAP_2

  if (kref->len != lo) {
    kref->hash32 = loh;
    kref->len = lo;
  }
  if (ret == NULL)
    ret = wormhmap_get_kref(hmap, kref);
  debug_assert(ret);
  return ret;
}
// }}} lcp

// down {{{
// Descend from the LPM meta node to the target leaf using the first
// unmatched key byte and the node's child bitmap.
  static struct wormleaf *
wormhole_meta_down(const struct wormhmap * const hmap, const struct kref * const lcp,
const struct wormmeta * const meta, const u32 klen)
{
  if (likely(lcp->len < klen)) { // partial match
    const u32 id0 = lcp->ptr[lcp->len]; // first byte beyond the LPM
    if (wormmeta_bitmin_load(meta) > id0) { // no left, don't care about right.
      return wormmeta_lpath_load(meta);
    } else if (wormmeta_bitmax_load(meta) < id0) { // has left sibling but no right sibling
      return wormmeta_rmost_load(meta);
    } else { // has both (expensive)
      return wormmeta_rmost_load(wormhmap_get_kref1(hmap, lcp, (u8)wormmeta_bm_lt(meta, id0)));
    }
  } else { // lcp->len == klen
    return wormmeta_lpath_load(meta);
  }
}
// }}} down

// jump-rw {{{
// Full jump: LPM search, then meta-down to the candidate leaf for key.
  static struct wormleaf *
wormhole_jump_leaf(const struct wormhmap * const hmap, const struct kref * const key)
{
  struct kref kref = {.ptr = key->ptr};
  debug_assert(kv_crc32c(key->ptr, key->len) == key->hash32);

  const struct wormmeta * const meta = wormhole_meta_lcp(hmap, &kref, key->len);
  return wormhole_meta_down(hmap, &kref, meta, key->len);
}

// Jump and read-lock the target leaf. If the leaf's version is newer than
// the hmap version observed before the jump, the metadata changed under us
// and the whole jump is retried.
  static struct wormleaf *
wormhole_jump_leaf_read(struct wormref * const ref, const struct kref * const key)
{
  struct wormhole * const map = ref->map;
#pragma nounroll
  do {
    const struct wormhmap * const hmap = wormhmap_load(map);
    const u64 v = wormhmap_version_load(hmap);
    qsbr_update(&ref->qref, v);
    struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key);
    wormleaf_prefetch(leaf, key->hash32);
#pragma nounroll
    do {
      if (rwlock_trylock_read_nr(&(leaf->leaflock), 64)) {
        if (wormleaf_version_load(leaf) <= v)
          return leaf; // leaf unchanged since the jump: success
        wormleaf_unlock_read(leaf);
        break; // stale leaf: redo the jump
      }
      // v1 is loaded before lv; if lv <= v, can update v1 without redo jump
      const u64 v1 = wormhmap_version_load(wormhmap_load(map));
      if (wormleaf_version_load(leaf) > v)
        break;
      wormhole_qsbr_update_pause(ref, v1);
    } while (true);
  } while (true);
}

// Write-lock flavor of the jump; structure identical to the read version.
  static struct wormleaf *
wormhole_jump_leaf_write(struct wormref * const ref, const struct kref * const key)
{
  struct wormhole * const map = ref->map;
#pragma nounroll
  do {
    const struct wormhmap * const hmap = wormhmap_load(map);
    const u64 v = wormhmap_version_load(hmap);
    qsbr_update(&ref->qref, v);
    struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key);
    wormleaf_prefetch(leaf, key->hash32);
#pragma nounroll
    do {
      if (rwlock_trylock_write_nr(&(leaf->leaflock), 64)) {
        if (wormleaf_version_load(leaf) <= v)
          return leaf; // leaf unchanged since the jump: success
        wormleaf_unlock_write(leaf);
        break; // stale leaf: redo the jump
      }
      // v1 is loaded before lv; if lv <= v, can update v1 without redo jump
      const u64 v1 = wormhmap_version_load(wormhmap_load(map));
      if (wormleaf_version_load(leaf) > v)
        break;
      wormhole_qsbr_update_pause(ref, v1);
    } while (true);
  } while (true);
}
// }}} jump-rw

// }}} jump

// leaf-read {{{
// kv pointer stored at hash-slot index ih (the e3 field holds the pointer).
  static inline struct kv *
wormleaf_kv_at_ih(const struct wormleaf * const leaf, const u32 ih)
{
  return u64_to_ptr(leaf->hs[ih].e3);
}

// kv pointer at sorted position is; ss maps sorted order -> hs index.
  static inline struct kv *
wormleaf_kv_at_is(const struct wormleaf * const leaf, const u32 is)
{
  return u64_to_ptr(leaf->hs[leaf->ss[is]].e3);
}

// Prefetch the whole ss array, one cache line per 64 entries.
  static inline void
wormleaf_prefetch_ss(const struct wormleaf * const leaf)
{
  for (u32 i = 0; i < WH_KPN; i+=64)
    cpu_prefetch0(&leaf->ss[i]);
}

// leaf must have been sorted
// return the key at [i] as if k1 has been inserted into leaf; i <= leaf->nr_sorted
  static const struct kv *
wormleaf_kv_at_is1(const struct wormleaf * const leaf, const u32 i, const u32 is1, const struct kv * const k1)
{
  debug_assert(leaf->nr_keys == leaf->nr_sorted);
  debug_assert(is1 <= leaf->nr_sorted);
  if (i < is1)
    return wormleaf_kv_at_is(leaf, i);
  else if (i > is1)
    return wormleaf_kv_at_is(leaf, i-1);
  else // i == is1
    return k1;
}



// fast point-lookup
// returns WH_KPN if not found
// hs entries cluster around the home slot pkey/WH_HDIV; probe it first,
// then scan outward while neighbor pkeys remain plausible.
  static u32
wormleaf_match_hs(const struct wormleaf * const leaf, const struct kref * const key)
{
  const u16 pkey = wormhole_pkey(key->hash32);
  const u32 i0 = pkey / WH_HDIV;
  const struct entry13 * const hs = leaf->hs;

  if
(hs[i0].e1 == pkey) {
    struct kv * const curr = u64_to_ptr(hs[i0].e3);
    if (likely(wormhole_kref_kv_match(key, curr)))
      return i0;
  }
  if (hs[i0].e1 == 0)
    return WH_KPN; // home slot empty: the key cannot be in this leaf

  // search left
  u32 i = i0 - 1;
  while (i < WH_KPN) { // u32 wrap-around below 0 terminates the loop
    if (hs[i].e1 == pkey) {
      struct kv * const curr = u64_to_ptr(hs[i].e3);
      if (likely(wormhole_kref_kv_match(key, curr)))
        return i;
    } else if (hs[i].e1 < pkey) { // entries left of home are <= pkey
      break;
    }
    i--;
  }

  // search right
  i = i0 + 1;
  while (i < WH_KPN) {
    if (hs[i].e1 == pkey) {
      struct kv * const curr = u64_to_ptr(hs[i].e3);
      if (likely(wormhole_kref_kv_match(key, curr)))
        return i;
    } else if ((hs[i].e1 > pkey) || (hs[i].e1 == 0)) {
      break;
    }
    i++;
  }


  // not found
  return WH_KPN;
}

// search for an existing entry in hs
// Same bidirectional probe as match_hs, but compares whole entry13 values
// (v64) instead of key contents.
  static u32
wormleaf_search_ih(const struct wormleaf * const leaf, const struct entry13 e)
{
  const u16 pkey = e.e1;
  const u32 i0 = pkey / WH_HDIV;
  const struct entry13 * const hs = leaf->hs;
  const struct entry13 e0 = hs[i0];

  if (e0.v64 == e.v64)
    return i0;

  if (e0.e1 == 0)
    return WH_KPN;

  // search left
  u32 i = i0 - 1;
  while (i < WH_KPN) {
    const struct entry13 ei = hs[i];
    if (ei.v64 == e.v64) {
      return i;
    } else if (ei.e1 < pkey) {
      break;
    }
    i--;
  }

  // search right
  i = i0 + 1;
  while (i < WH_KPN) {
    const struct entry13 ei = hs[i];
    if (ei.v64 == e.v64) {
      return i;
    } else if ((ei.e1 > pkey) || (ei.e1 == 0)) {
      break;
    }
    i++;
  }

  // not found
  return WH_KPN;
}

// search for an existing entry in ss
// SIMD scan of ss for the byte ih; the entry must exist (debug_die when the
// scan falls through without a hit).
  static u32
wormleaf_search_is(const struct wormleaf * const leaf, const u8 ih)
{
#if defined(__x86_64__)
  // TODO: avx512
#if defined(__AVX2__)
  const m256 i1 = _mm256_set1_epi8((char)ih);
  for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) {
    const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i));
    const u32 mask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(sv, i1));
    if (mask)
      return i + (u32)__builtin_ctz(mask);
  }
#else // SSE4.2
  const m128 i1 = _mm_set1_epi8((char)ih);
  for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) {
    const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i));
    const u32 mask = (u32)_mm_movemask_epi8(_mm_cmpeq_epi8(sv, i1));
    if (mask)
      return i + (u32)__builtin_ctz(mask);
  }
#endif // __AVX2__
#elif defined(__aarch64__)
  static const m128 vtbl = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15};
  static const uint16x8_t mbits = {0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};
  const m128 i1 = vdupq_n_u8(ih);
  for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) {
    const m128 cmp = vceqq_u8(vld1q_u8(leaf->ss+i), i1); // cmpeq => 0xff or 0x00
    const m128 cmp1 = vqtbl1q_u8(cmp, vtbl); // reorder
    const u32 mask = (u32)vaddvq_u16(vandq_u8(vreinterpretq_u16_u8(cmp1), mbits));
    if (mask)
      return i + (u32)__builtin_ctz(mask);
  }
#endif // __x86_64__
  debug_die();
}

// assumes there in no duplicated keys
// search the first key that is >= the given key
// return 0 .. nr_sorted
  static u32
wormleaf_search_ss(const struct wormleaf * const leaf, const struct kref * const key)
{
  u32 lo = 0;
  u32 hi = leaf->nr_sorted;
  // first loop: binary search, prefetching both possible next probes
  while ((lo + 2) < hi) {
    const u32 i = (lo + hi) >> 1;
    const struct kv * const curr = wormleaf_kv_at_is(leaf, i);
    cpu_prefetch0(curr);
    cpu_prefetch0(leaf->hs + leaf->ss[(lo + i) >> 1]);
    cpu_prefetch0(leaf->hs + leaf->ss[(i + 1 + hi) >> 1]);
    const int cmp = kref_kv_compare(key, curr);
    debug_assert(cmp != 0);
    if (cmp < 0)
      hi = i;
    else
      lo = i + 1;
  }

  // finish the last 1-2 steps without prefetch
  while (lo < hi) {
    const u32 i = (lo + hi) >> 1;
    const struct kv * const curr = wormleaf_kv_at_is(leaf, i);
    const int cmp = kref_kv_compare(key, curr);
    debug_assert(cmp != 0);
    if (cmp < 0)
      hi = i;
    else
      lo = i + 1;
  }
  return lo;
}

// Sorted position of key: exact hs hit -> its ss index; miss -> binary
// search for the first greater key.
  static u32
wormleaf_seek(const struct wormleaf * const leaf, const struct kref * const key)
{
  debug_assert(leaf->nr_sorted == leaf->nr_keys);
  wormleaf_prefetch_ss(leaf); // effective for both hit and miss
  const u32 ih = wormleaf_match_hs(leaf, key);
  if (ih < WH_KPN) { // hit
    return wormleaf_search_is(leaf, (u8)ih);
  } else { // miss, binary search for gt
    return wormleaf_search_ss(leaf, key);
  }
}

// same to search_sorted but the target is very likely beyond the end
  static u32
wormleaf_seek_end(const struct wormleaf * const leaf, const struct kref * const key)
{
  debug_assert(leaf->nr_keys == leaf->nr_sorted);
  if (leaf->nr_sorted) {
    const int cmp = kref_kv_compare(key, wormleaf_kv_at_is(leaf, leaf->nr_sorted-1));
    if (cmp > 0)
      return leaf->nr_sorted;
    else if (cmp == 0)
      return leaf->nr_sorted - 1;
    else
      return wormleaf_seek(leaf, key);
  } else {
    return 0;
  }
}
// }}} leaf-read

// leaf-write {{{
// In-place merge of the two sorted runs ss[0..n1) and ss[n1..n1+n2); the
// smaller run is buffered in et[] (min(n1,n2) < WH_KPN/2). Duplicate keys
// are impossible here (debug_die on equality).
  static void
wormleaf_sort_m2(struct wormleaf * const leaf, const u32 n1, const u32 n2)
{
  if (n1 == 0 || n2 == 0)
    return; // no need to sort

  u8 * const ss = leaf->ss;
  u8 et[WH_KPN/2]; // min(n1,n2) < KPN/2
  if (n1 <= n2) { // merge left
memcpy(et, &(ss[0]), sizeof(ss[0]) * n1);
    u8 * eo = ss;
    u8 * e1 = et; // size == n1
    u8 * e2 = &(ss[n1]); // size == n2
    const u8 * const z1 = e1 + n1;
    const u8 * const z2 = e2 + n2;
    while ((e1 < z1) && (e2 < z2)) {
      const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2));
      if (cmp < 0)
        *(eo++) = *(e1++);
      else if (cmp > 0)
        *(eo++) = *(e2++);
      else
        debug_die();

      if (eo == e2)
        break; // finish early
    }
    if (eo < e2) // copy the remaining buffered entries back
      memcpy(eo, e1, sizeof(*eo) * (size_t)(e2 - eo));
  } else {
    memcpy(et, &(ss[n1]), sizeof(ss[0]) * n2);
    u8 * eo = &(ss[n1 + n2 - 1]); // merge backwards
    u8 * e1 = &(ss[n1 - 1]); // size == n1
    u8 * e2 = &(et[n2 - 1]); // size == n2
    const u8 * const z1 = e1 - n1;
    const u8 * const z2 = e2 - n2;
    while ((e1 > z1) && (e2 > z2)) {
      const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2));
      if (cmp < 0)
        *(eo--) = *(e2--);
      else if (cmp > 0)
        *(eo--) = *(e1--);
      else
        debug_die();

      if (eo == e1)
        break;
    }
    if (eo > e1) // copy the remaining buffered entries back
      memcpy(e1 + 1, et, sizeof(*eo) * (size_t)(eo - e1));
  }
}

// qsort_r comparators for ss entries; the two variants exist because the
// glibc and BSD/Apple qsort_r ABIs place the context argument (priv == the
// leaf) at opposite ends of the parameter list.
#if defined(__linux__)
  static int
wormleaf_ss_cmp(const void * const p1, const void * const p2, void * priv)
{
  const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1);
  const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2);
  return kv_compare(k1, k2);
}
#else // (FreeBSD and APPLE only)
  static int
wormleaf_ss_cmp(void * priv, const void * const p1, const void * const p2)
{
  const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1);
  const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2);
  return kv_compare(k1, k2);
}
#endif // __linux__

// Sort ss[i0 .. i0+nr) by key using the platform's qsort_r.
  static inline void
wormleaf_sort_range(struct wormleaf * const leaf, const u32 i0, const u32 nr)
{
#if defined(__linux__)
  qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), wormleaf_ss_cmp, leaf);
#else // (FreeBSD and APPLE only)
qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), leaf, wormleaf_ss_cmp); +#endif // __linux__ +} + +// make sure all keys are sorted in a leaf node + static void +wormleaf_sync_sorted(struct wormleaf * const leaf) +{ + const u32 s = leaf->nr_sorted; + const u32 n = leaf->nr_keys; + if (s == n) + return; + + wormleaf_sort_range(leaf, s, n - s); + // merge-sort inplace + wormleaf_sort_m2(leaf, s, n - s); + leaf->nr_sorted = n; +} + +// shift a sequence of entries on hs and update the corresponding ss values + static void +wormleaf_shift_inc(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from+1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_add_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_add_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = 
vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vaddq_u8(sv, add1)); + } +#endif // __x86_64__ +} + + static void +wormleaf_shift_dec(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from-1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_sub_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += 16) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_sub_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vsubq_u8(sv, add1)); + } +#endif // __x86_64__ +} + +// insert hs and also shift ss + static u32 +wormleaf_insert_hs(struct wormleaf * const leaf, const struct entry13 e) +{ + struct entry13 * const hs = leaf->hs; + const u16 pkey = e.e1; + const u32 i0 = 
pkey / WH_HDIV;
+ if (hs[i0].e1 == 0) { // insert
+ hs[i0] = e;
+ return i0;
+ }
+
+ // find left-most insertion point
+ u32 i = i0;
+ while (i && hs[i-1].e1 && (hs[i-1].e1 >= pkey))
+ i--;
+ while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 < pkey)) // stop at >= or empty
+ i++;
+ const u32 il = --i; // i in [0, KPN]
+
+ // find left empty slot
+ if (i > (i0 - 1))
+ i = i0 - 1;
+ while ((i < WH_KPN) && hs[i].e1)
+ i--;
+ const u32 el = i; // el < i0 or el is invalid (>= KPN)
+
+ // find right-most insertion point.
+ i = il + 1;
+ while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 == pkey))
+ i++;
+ const u32 ir = i; // ir >= il, in [0, KPN]
+
+ // find right empty slot
+ if (i < (i0 + 1))
+ i = i0 + 1;
+ while ((i < WH_KPN) && hs[i].e1)
+ i++;
+ const u32 er = i; // er > i0 or er is invalid (>= KPN)
+
+ // el <= il < ir <= er (if < WH_KPN)
+ const u32 dl = (el < WH_KPN) ? (il - el) : WH_KPN;
+ const u32 dr = (er < WH_KPN) ? (er - ir) : WH_KPN;
+ if (dl <= dr) { // push left
+ debug_assert(dl < WH_KPN);
+ if (dl)
+ wormleaf_shift_dec(leaf, el, el+1, dl);
+ hs[il] = e;
+ return il;
+ } else {
+ debug_assert(dr < WH_KPN);
+ if (dr)
+ wormleaf_shift_inc(leaf, ir+1, ir, dr);
+ hs[ir] = e;
+ return ir;
+ }
+}
+
+ static void
+wormleaf_insert_e13(struct wormleaf * const leaf, const struct entry13 e)
+{
+ // insert to hs and fix all existing is
+ const u32 ih = wormleaf_insert_hs(leaf, e);
+ debug_assert(ih < WH_KPN);
+ // append the new is
+ leaf->ss[leaf->nr_keys] = (u8)ih;
+ // fix nr
+ leaf->nr_keys++;
+}
+
+ static void
+wormleaf_insert(struct wormleaf * const leaf, const struct kv * const new)
+{
+ debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen)));
+ debug_assert(leaf->nr_keys < WH_KPN);
+
+ // insert
+ const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new));
+ const u32 nr0 = leaf->nr_keys;
+ wormleaf_insert_e13(leaf, e);
+
+ // optimize for seq insertion
+ if (nr0 == leaf->nr_sorted) {
+ if (nr0) {
+ const struct kv * const kvn 
= wormleaf_kv_at_is(leaf, nr0 - 1); + if (kv_compare(new, kvn) > 0) + leaf->nr_sorted = nr0 + 1; + } else { + leaf->nr_sorted = 1; + } + } +} + + static void +wormleaf_pull_ih(struct wormleaf * const leaf, const u32 ih) +{ + struct entry13 * const hs = leaf->hs; + // try left + u32 i = ih - 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) > i)) + i--; + + if ((++i) < ih) { + wormleaf_shift_inc(leaf, i+1, i, ih - i); + leaf->hs[i].v64 = 0; + return; + } + + // try right + i = ih + 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) < i)) + i++; + + if ((--i) > ih) { + wormleaf_shift_dec(leaf, ih, ih+1, i - ih); + hs[i].v64 = 0; + } + // hs[ih] may still be 0 +} + +// internal only + static struct kv * +wormleaf_remove(struct wormleaf * const leaf, const u32 ih, const u32 is) +{ + // ss + leaf->ss[is] = leaf->ss[leaf->nr_keys - 1]; + if (leaf->nr_sorted > is) + leaf->nr_sorted = is; + + // ret + struct kv * const victim = wormleaf_kv_at_ih(leaf, ih); + // hs + leaf->hs[ih].v64 = 0; + leaf->nr_keys--; + // use magnet + wormleaf_pull_ih(leaf, ih); + return victim; +} + +// remove key from leaf but do not call free + static struct kv * +wormleaf_remove_ih(struct wormleaf * const leaf, const u32 ih) +{ + // remove from ss + const u32 is = wormleaf_search_is(leaf, (u8)ih); + debug_assert(is < leaf->nr_keys); + return wormleaf_remove(leaf, ih, is); +} + + static struct kv * +wormleaf_remove_is(struct wormleaf * const leaf, const u32 is) +{ + return wormleaf_remove(leaf, leaf->ss[is], is); +} + +// for delr (delete-range) + static void +wormleaf_delete_range(struct wormhole * const map, struct wormleaf * const leaf, + const u32 i0, const u32 end) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + for (u32 i = end; i > i0; i--) { + const u32 ir = i - 1; + struct kv * const victim = wormleaf_remove_is(leaf, ir); + map->mm.free(victim, map->mm.priv); + } +} + +// return the old kv; the caller should free the old kv + static struct kv * 
+wormleaf_update(struct wormleaf * const leaf, const u32 ih, const struct kv * const new) +{ + debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen))); + // search entry in ss (is) + struct kv * const old = wormleaf_kv_at_ih(leaf, ih); + debug_assert(old); + + entry13_update_e3(&leaf->hs[ih], (u64)new); + return old; +} +// }}} leaf-write + +// leaf-split {{{ +// It only works correctly in cut_search +// quickly tell if a cut between k1 and k2 can achieve a specific anchor-key length + static bool +wormhole_split_cut_alen_check(const u32 alen, const struct kv * const k1, const struct kv * const k2) +{ + debug_assert(k2->klen >= alen); + return (k1->klen < alen) || (k1->kv[alen - 1] != k2->kv[alen - 1]); +} + +// return the number of keys that should go to leaf1 +// assert(r > 0 && r <= nr_keys) +// (1) r < is1, anchor key is ss[r-1]:ss[r] +// (2) r == is1: anchor key is ss[r-1]:new +// (3) r == is1+1: anchor key is new:ss[r-1] (ss[r-1] is the ss[r] on the logically sorted array) +// (4) r > is1+1: anchor key is ss[r-2]:ss[r-1] (ss[r-2] is the [r-1] on the logically sorted array) +// edge cases: +// (case 2) is1 == nr_keys: r = nr_keys; ss[r-1]:new +// (case 3) is1 == 0, r == 1; new:ss[0] +// return 1..WH_KPN + static u32 +wormhole_split_cut_search1(struct wormleaf * const leaf, u32 l, u32 h, const u32 is1, const struct kv * const new) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + debug_assert(leaf->nr_keys); + debug_assert(l < h && h <= leaf->nr_sorted); + + const struct kv * const kl0 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const kh0 = wormleaf_kv_at_is1(leaf, h, is1, new); + const u32 alen = kv_key_lcp(kl0, kh0) + 1; + if (unlikely(alen > UINT16_MAX)) + return WH_KPN2; + + const u32 target = leaf->next ? 
WH_MID : WH_KPN_MRG; + while ((l + 1) < h) { + const u32 m = (l + h + 1) >> 1; + if (m <= target) { // try right + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, m, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, h, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + l = m; + else + h = m; + } else { // try left + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, m, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + h = m; + else + l = m; + } + } + return h; +} + + static void +wormhole_split_leaf_move1(struct wormleaf * const leaf1, struct wormleaf * const leaf2, + const u32 cut, const u32 is1, const struct kv * const new) +{ + const u32 nr_keys = leaf1->nr_keys; + const struct entry13 e1 = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + struct entry13 es[WH_KPN]; + + if (cut <= is1) { // e1 goes to leaf2 + // leaf2 + for (u32 i = cut; i < is1; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + wormleaf_insert_e13(leaf2, e1); + + for (u32 i = is1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + } else { // e1 goes to leaf1 + // leaf2 + for (u32 i = cut - 1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < is1; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + es[is1] = e1; + + for (u32 i = is1 + 1; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i - 1]]; + } + + leaf2->nr_sorted = leaf2->nr_keys; + + memset(leaf1->hs, 0, sizeof(leaf1->hs[0]) * WH_KPN); + leaf1->nr_keys = 0; + for (u32 i = 0; i < cut; i++) + wormleaf_insert_e13(leaf1, es[i]); + leaf1->nr_sorted = cut; + debug_assert((leaf1->nr_sorted + leaf2->nr_sorted) == (nr_keys + 1)); +} + +// create an anchor for leaf-split + static struct kv * +wormhole_split_alloc_anchor(const struct kv * const key1, const 
struct kv * const key2) +{ + const u32 alen = kv_key_lcp(key1, key2) + 1; + debug_assert(alen <= key2->klen); + + struct kv * const anchor = wormhole_alloc_akey(alen); + if (anchor) + kv_refill(anchor, key2->kv, alen, NULL, 0); + return anchor; +} + +// leaf1 is locked +// split leaf1 into leaf1+leaf2; insert new into leaf1 or leaf2, return leaf2 + static struct wormleaf * +wormhole_split_leaf(struct wormhole * const map, struct wormleaf * const leaf1, struct kv * const new) +{ + wormleaf_sync_sorted(leaf1); + struct kref kref_new; + kref_ref_kv(&kref_new, new); + const u32 is1 = wormleaf_search_ss(leaf1, &kref_new); // new should be inserted at [is1] + const u32 cut = wormhole_split_cut_search1(leaf1, 0, leaf1->nr_keys, is1, new); + if (unlikely(cut == WH_KPN2)) + return NULL; + + // anchor of leaf2 + debug_assert(cut && (cut <= leaf1->nr_keys)); + const struct kv * const key1 = wormleaf_kv_at_is1(leaf1, cut - 1, is1, new); + const struct kv * const key2 = wormleaf_kv_at_is1(leaf1, cut, is1, new); + struct kv * const anchor2 = wormhole_split_alloc_anchor(key1, key2); + if (unlikely(anchor2 == NULL)) // anchor alloc failed + return NULL; + + // create leaf2 with anchor2 + struct wormleaf * const leaf2 = wormleaf_alloc(map, leaf1, leaf1->next, anchor2); + if (unlikely(leaf2 == NULL)) { + wormhole_free_akey(anchor2); + return NULL; + } + + // split_hmap will unlock the leaf nodes; must move now + wormhole_split_leaf_move1(leaf1, leaf2, cut, is1, new); + // leaf1 and leaf2 should be sorted after split + debug_assert(leaf1->nr_keys == leaf1->nr_sorted); + debug_assert(leaf2->nr_keys == leaf2->nr_sorted); + + return leaf2; +} +// }}} leaf-split + +// leaf-merge {{{ +// MERGE is the only operation that deletes a leaf node (leaf2). +// It ALWAYS merges the right node into the left node even if the left is empty. +// This requires both of their writer locks to be acquired. +// This allows iterators to safely probe the next node (but not backwards). 
+// In other words, if either the reader or the writer lock of node X has been acquired: +// X->next (the pointer) cannot be changed by any other thread. +// X->next cannot be deleted. +// But the content in X->next can still be changed. + static bool +wormleaf_merge(struct wormleaf * const leaf1, struct wormleaf * const leaf2) +{ + debug_assert((leaf1->nr_keys + leaf2->nr_keys) <= WH_KPN); + const bool leaf1_sorted = leaf1->nr_keys == leaf1->nr_sorted; + + for (u32 i = 0; i < leaf2->nr_keys; i++) + wormleaf_insert_e13(leaf1, leaf2->hs[leaf2->ss[i]]); + if (leaf1_sorted) + leaf1->nr_sorted += leaf2->nr_sorted; + return true; +} + +// for undoing insertion under split_meta failure; leaf2 is still local +// remove the new key; merge keys in leaf2 into leaf1; free leaf2 + static void +wormleaf_split_undo(struct wormhole * const map, struct wormleaf * const leaf1, + struct wormleaf * const leaf2, struct kv * const new) +{ + if (new) { + const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + const u32 im1 = wormleaf_search_ih(leaf1, e); + if (im1 < WH_KPN) { + (void)wormleaf_remove_ih(leaf1, im1); + } else { // not found in leaf1; search leaf2 + const u32 im2 = wormleaf_search_ih(leaf2, e); + debug_assert(im2 < WH_KPN); + (void)wormleaf_remove_ih(leaf2, im2); + } + } + // this merge must succeed + if (!wormleaf_merge(leaf1, leaf2)) + debug_die(); + // Keep this to avoid triggering false alarm in wormleaf_free + leaf2->leaflock.opaque = 0; + wormleaf_free(map->slab_leaf, leaf2); +} +// }}} leaf-merge + +// get/probe {{{ + struct kv * +wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 i = wormleaf_match_hs(leaf, key); + struct kv * const tmp = (i < WH_KPN) ? 
ref->map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL; + wormleaf_unlock_read(leaf); + return tmp; +} + + struct kv * +whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out) +{ + wormhole_resume(ref); + struct kv * const ret = wormhole_get(ref, key, out); + wormhole_park(ref); + return ret; +} + + struct kv * +whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 i = wormleaf_match_hs(leaf, key); + return (i < WH_KPN) ? map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL; +} + + bool +wormhole_probe(struct wormref * const ref, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 i = wormleaf_match_hs(leaf, key); + wormleaf_unlock_read(leaf); + return i < WH_KPN; +} + + bool +whsafe_probe(struct wormref * const ref, const struct kref * const key) +{ + wormhole_resume(ref); + const bool r = wormhole_probe(ref, key); + wormhole_park(ref); + return r; +} + + bool +whunsafe_probe(struct wormhole * const map, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + return wormleaf_match_hs(leaf, key) < WH_KPN; +} +// }}} get/probe + +// meta-split {{{ +// duplicate from meta1; only has one bit but will soon add a new bit + static struct wormmeta * +wormmeta_expand(struct wormhmap * const hmap, struct wormmeta * const meta1) +{ + struct wormmeta * const meta2 = slab_alloc_unsafe(hmap->slab2); + if (meta2 == NULL) + return NULL; + + memcpy(meta2, meta1, sizeof(*meta1)); + for (u32 i = 0; i < WH_BMNR; i++) + meta2->bitmap[i] = 0; + const u32 bitmin = wormmeta_bitmin_load(meta1); + debug_assert(bitmin == wormmeta_bitmax_load(meta1)); + debug_assert(bitmin < WH_FO); + // set the only bit + meta2->bitmap[bitmin >> 6u] |= (1lu << (bitmin & 0x3fu)); + + wormhmap_replace(hmap, meta1, meta2); + 
slab_free_unsafe(hmap->slab1, meta1);
+ return meta2;
+}
+
+ static struct wormmeta *
+wormmeta_bm_set_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id)
+{
+ debug_assert(id < WH_FO);
+ const u32 bitmin = wormmeta_bitmin_load(meta);
+ const u32 bitmax = wormmeta_bitmax_load(meta);
+ if (bitmin < bitmax) { // already in full size
+ wormmeta_bm_set(meta, id);
+ return meta;
+ } else if (id == bitmin) { // do nothing
+ return meta;
+ } else if (bitmin == WH_FO) { // add the first bit
+ wormmeta_bitmin_store(meta, id);
+ wormmeta_bitmax_store(meta, id);
+ return meta;
+ } else { // need to expand
+ struct wormmeta * const meta2 = wormmeta_expand(hmap, meta);
+ wormmeta_bm_set(meta2, id);
+ return meta2;
+ }
+}
+
+// get-or-create the meta node for mkey; set its child bit and adjust lmost/rmost for the new leaf
+ static void
+wormmeta_split_touch(struct wormhmap * const hmap, struct kv * const mkey,
+ struct wormleaf * const leaf, const u32 alen)
+{
+ struct wormmeta * meta = wormhmap_get(hmap, mkey);
+ if (meta) {
+ if (mkey->klen < alen)
+ meta = wormmeta_bm_set_helper(hmap, meta, mkey->kv[mkey->klen]);
+ if (wormmeta_lmost_load(meta) == leaf->next)
+ wormmeta_lmost_store(meta, leaf);
+ else if (wormmeta_rmost_load(meta) == leaf->prev)
+ wormmeta_rmost_store(meta, leaf);
+ } else { // create new node
+ const u32 bit = (mkey->klen < alen) ? 
mkey->kv[mkey->klen] : WH_FO; + meta = wormmeta_alloc(hmap, leaf, mkey, alen, bit); + debug_assert(meta); + wormhmap_set(hmap, meta); + } +} + + static void +wormmeta_lpath_update(struct wormhmap * const hmap, const struct kv * const a1, const struct kv * const a2, + struct wormleaf * const lpath) +{ + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(a2, pbuf); + + // only need to update a2's own branch + u32 i = kv_key_lcp(a1, a2) + 1; + debug_assert(i <= pbuf->klen); + wormhole_prefix(pbuf, i); + while (i < a2->klen) { + debug_assert(i <= hmap->maxplen); + struct wormmeta * const meta = wormhmap_get(hmap, pbuf); + debug_assert(meta); + wormmeta_lpath_store(meta, lpath); + + i++; + wormhole_prefix_inc1(pbuf); + } +} + +// for leaf1, a leaf2 is already linked at its right side. +// this function updates the meta-map by moving leaf1 and hooking leaf2 at correct positions + static void +wormmeta_split(struct wormhmap * const hmap, struct wormleaf * const leaf, + struct kv * const mkey) +{ + // left branches + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + u32 i = next ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + + // save klen + const u32 mklen = mkey->klen; + wormhole_prefix(mkey, i); + do { + wormmeta_split_touch(hmap, mkey, leaf, alen); + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(mkey); + } while (true); + + // adjust maxplen; i is the plen of the last _touch() + if (i > hmap->maxplen) + hmap->maxplen = i; + debug_assert(i <= UINT16_MAX); + + // restore klen + mkey->klen = mklen; + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, leaf); +} + +// all locks will be released before returning + static bool +wormhole_split_meta(struct wormref * const ref, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + struct wormhole * const map = ref->map; + // metalock + wormhmap_lock(map, ref); + + // check slab reserve + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + + // link + struct wormleaf * const leaf1 = leaf2->prev; + leaf1->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + // update versions + const u64 v1 = wormhmap_version_load(hmap0) + 1; + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_split(hmap1, leaf2, mkey); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + wormleaf_unlock_write(leaf1); + wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_split(hmap0, leaf2, mkey); + + wormhmap_unlock(map); + + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + +// all locks (metalock + leaflocks) will be released before 
returning +// leaf1->lock (write) is already taken + static bool +wormhole_split_insert(struct wormref * const ref, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(ref->map, leaf1, new); + if (unlikely(leaf2 == NULL)) { + wormleaf_unlock_write(leaf1); + return false; + } + + rwlock_lock_write(&(leaf2->leaflock)); + const bool rsm = wormhole_split_meta(ref, leaf2); + if (unlikely(!rsm)) { + // undo insertion & merge; free leaf2 + wormleaf_split_undo(ref->map, leaf1, leaf2, new); + wormleaf_unlock_write(leaf1); + } + return rsm; +} + + static bool +whunsafe_split_meta(struct wormhole * const map, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + // link + leaf2->prev->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_split(&(map->hmap2[i]), leaf2, mkey); + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + + static bool +whunsafe_split_insert(struct wormhole * const map, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(map, leaf1, new); + if (unlikely(leaf2 == NULL)) + return false; + + const bool rsm = whunsafe_split_meta(map, leaf2); + if (unlikely(!rsm)) // undo insertion, merge, free leaf2 + wormleaf_split_undo(map, leaf1, leaf2, new); + + return rsm; +} +// }}} meta-split + +// meta-merge {{{ +// now it only contains one bit + static struct wormmeta * +wormmeta_shrink(struct wormhmap * const hmap, struct wormmeta * const meta2) +{ + debug_assert(wormmeta_bitmin_load(meta2) == wormmeta_bitmax_load(meta2)); + struct wormmeta * 
const meta1 = slab_alloc_unsafe(hmap->slab1); + if (meta1 == NULL) + return NULL; + + memcpy(meta1, meta2, sizeof(*meta1)); + + wormhmap_replace(hmap, meta2, meta1); + slab_free_unsafe(hmap->slab2, meta2); + return meta1; +} + + static void +wormmeta_bm_clear_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id) +{ + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) { + debug_assert(wormmeta_bitmin_load(meta) < WH_FO); + wormmeta_bitmin_store(meta, WH_FO); + wormmeta_bitmax_store(meta, WH_FO); + } else { // has more than 1 bit + wormmeta_bm_clear(meta, id); + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) + wormmeta_shrink(hmap, meta); + } +} + +// all locks held + static void +wormmeta_merge(struct wormhmap * const hmap, struct wormleaf * const leaf) +{ + // leaf->next is the new next after merge, which can be NULL + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(leaf->anchor, pbuf); + u32 i = (prev && next) ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + wormhole_prefix(pbuf, i); + struct wormmeta * parent = NULL; + do { + debug_assert(i <= hmap->maxplen); + struct wormmeta * meta = wormhmap_get(hmap, pbuf); + if (wormmeta_lmost_load(meta) == wormmeta_rmost_load(meta)) { // delete single-child + debug_assert(wormmeta_lmost_load(meta) == leaf); + const u32 bitmin = wormmeta_bitmin_load(meta); + wormhmap_del(hmap, meta); + wormmeta_free(hmap, meta); + if (parent) { + wormmeta_bm_clear_helper(hmap, parent, pbuf->kv[i-1]); + parent = NULL; + } + if (bitmin == WH_FO) // no child + break; + } else { // adjust lmost rmost + if (wormmeta_lmost_load(meta) == leaf) + wormmeta_lmost_store(meta, next); + else if (wormmeta_rmost_load(meta) == leaf) + wormmeta_rmost_store(meta, prev); + parent = meta; + } + + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(pbuf); + } while (true); + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, prev); +} + +// all locks (metalock + two leaflock) will be released before returning +// merge leaf2 to leaf1, removing all metadata to leaf2 and leaf2 itself + static void +wormhole_meta_merge(struct wormref * const ref, struct wormleaf * const leaf1, + struct wormleaf * const leaf2, const bool unlock_leaf1) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + struct wormhole * const map = ref->map; + + wormhmap_lock(map, ref); + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + const u64 v1 = wormhmap_version_load(hmap0) + 1; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_merge(hmap1, leaf2); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + if (unlock_leaf1) + wormleaf_unlock_write(leaf1); + 
wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_merge(hmap0, leaf2); + // leaf2 is now safe to be removed + wormleaf_free(map->slab_leaf, leaf2); + wormhmap_unlock(map); +} + +// caller must acquire leaf->wlock and next->wlock +// all locks will be released when this function returns + static bool +wormhole_meta_leaf_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * const next = leaf->next; + debug_assert(next); + + // double check + if ((leaf->nr_keys + next->nr_keys) <= WH_KPN) { + if (wormleaf_merge(leaf, next)) { + wormhole_meta_merge(ref, leaf, next, true); + return true; + } + } + // merge failed but it's fine + wormleaf_unlock_write(leaf); + wormleaf_unlock_write(next); + return false; +} + + static void +whunsafe_meta_leaf_merge(struct wormhole * const map, struct wormleaf * const leaf1, + struct wormleaf * const leaf2) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + if (!wormleaf_merge(leaf1, leaf2)) + return; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_merge(&(map->hmap2[i]), leaf2); + wormleaf_free(map->slab_leaf, leaf2); +} +// }}} meta-merge + +// put {{{ + bool +wormhole_put(struct wormref * const ref, struct kv * const kv) +{ + // we always allocate a new item on SET + // future optimizations may perform in-place update + struct wormhole * const map = ref->map; + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { + struct kv * const old = wormleaf_update(leaf, im, new); + wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just 
insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_put(struct wormref * const ref, struct kv * const kv) +{ + wormhole_resume(ref); + const bool r = wormhole_put(ref, kv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_put(struct wormhole * const map, struct kv * const kv) +{ + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { // overwrite + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormhole * const map = ref->map; + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) { // no replacement + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + struct kv * const old = wormleaf_update(leaf, im, new); + 
wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) { // nothing to be inserted + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_merge(ref, kref, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) // nothing to be inserted + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) // mm error + return false; + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + 
const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} +// }}} put + +// inplace {{{ + bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_read(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_read(leaf); + return false; + } +} + + bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_write(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_write(leaf); + return false; + } +} + + bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpr(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpw(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // overwrite + uf(wormleaf_kv_at_ih(leaf, im), priv); + return true; + } else { + uf(NULL, priv); + return false; + } +} +// }}} put + +// del {{{ + static void +wormhole_del_try_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * 
const next = leaf->next; + if (next && ((leaf->nr_keys == 0) || ((leaf->nr_keys + next->nr_keys) < WH_KPN_MRG))) { + // try merge, it may fail if size becomes larger after locking + wormleaf_lock_write(next, ref); + (void)wormhole_meta_leaf_merge(ref, leaf); + // locks are already released; immediately return + } else { + wormleaf_unlock_write(leaf); + } +} + + bool +wormhole_del(struct wormref * const ref, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // found + struct kv * const kv = wormleaf_remove_ih(leaf, im); + wormhole_del_try_merge(ref, leaf); + debug_assert(kv); + // free after releasing locks + struct wormhole * const map = ref->map; + map->mm.free(kv, map->mm.priv); + return true; + } else { + wormleaf_unlock_write(leaf); + return false; + } +} + + bool +whsafe_del(struct wormref * const ref, const struct kref * const key) +{ + wormhole_resume(ref); + const bool r = wormhole_del(ref, key); + wormhole_park(ref); + return r; +} + + static void +whunsafe_del_try_merge(struct wormhole * const map, struct wormleaf * const leaf) +{ + const u32 n0 = leaf->prev ? leaf->prev->nr_keys : WH_KPN; + const u32 n1 = leaf->nr_keys; + const u32 n2 = leaf->next ? 
leaf->next->nr_keys : WH_KPN; + + if ((leaf->prev && (n1 == 0)) || ((n0 + n1) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf->prev, leaf); + } else if ((leaf->next && (n1 == 0)) || ((n1 + n2) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf, leaf->next); + } +} + + bool +whunsafe_del(struct wormhole * const map, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // found + struct kv * const kv = wormleaf_remove_ih(leaf, im); + debug_assert(kv); + + whunsafe_del_try_merge(map, leaf); + map->mm.free(kv, map->mm.priv); + return true; + } + return false; +} + + u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + struct wormleaf * const leafa = wormhole_jump_leaf_write(ref, start); + wormleaf_sync_sorted(leafa); + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) { // do nothing if end < start + wormleaf_unlock_write(leafa); + return 0; + } + u64 ndel = iaz - ia; + struct wormhole * const map = ref->map; + wormleaf_delete_range(map, leafa, ia, iaz); + if (leafa->nr_keys > ia) { // end hit; done + wormhole_del_try_merge(ref, leafa); + return ndel; + } + + while (leafa->next) { + struct wormleaf * const leafx = leafa->next; + wormleaf_lock_write(leafx, ref); + // two leaf nodes locked + wormleaf_sync_sorted(leafx); + const u32 iz = end ? 
wormleaf_seek_end(leafx, end) : leafx->nr_keys; + ndel += iz; + wormleaf_delete_range(map, leafx, 0, iz); + if (leafx->nr_keys == 0) { // removed all + // must hold leaf1's lock for the next iteration + wormhole_meta_merge(ref, leafa, leafx, false); + } else { // partially removed; done + (void)wormhole_meta_leaf_merge(ref, leafa); + return ndel; + } + } + wormleaf_unlock_write(leafa); + return ndel; +} + + u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + wormhole_resume(ref); + const u64 ret = wormhole_delr(ref, start, end); + wormhole_park(ref); + return ret; +} + + u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end) +{ + // first leaf + struct wormhmap * const hmap = map->hmap; + struct wormleaf * const leafa = wormhole_jump_leaf(hmap, start); + wormleaf_sync_sorted(leafa); + // last leaf + struct wormleaf * const leafz = end ? wormhole_jump_leaf(hmap, end) : NULL; + + // select start/end on leafa + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? 
wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) + return 0; + + wormleaf_delete_range(map, leafa, ia, iaz); + u64 ndel = iaz - ia; + + if (leafa == leafz) { // one node only + whunsafe_del_try_merge(map, leafa); + return ndel; + } + + // 0 or more nodes between leafa and leafz + while (leafa->next != leafz) { + struct wormleaf * const leafx = leafa->next; + ndel += leafx->nr_keys; + for (u32 i = 0; i < leafx->nr_keys; i++) + map->mm.free(wormleaf_kv_at_is(leafx, i), map->mm.priv); + leafx->nr_keys = 0; + leafx->nr_sorted = 0; + whunsafe_meta_leaf_merge(map, leafa, leafx); + } + // delete the smaller keys in leafz + if (leafz) { + wormleaf_sync_sorted(leafz); + const u32 iz = wormleaf_seek_end(leafz, end); + wormleaf_delete_range(map, leafz, 0, iz); + ndel += iz; + whunsafe_del_try_merge(map, leafa); + } + return ndel; +} +// }}} del + +// iter {{{ +// safe iter: safe sort with read-lock acquired +// unsafe iter: allow concurrent seek/skip + static void +wormhole_iter_leaf_sync_sorted(struct wormleaf * const leaf) +{ + if (unlikely(leaf->nr_keys != leaf->nr_sorted)) { + spinlock_lock(&(leaf->sortlock)); + wormleaf_sync_sorted(leaf); + spinlock_unlock(&(leaf->sortlock)); + } +} + + struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = ref; + iter->map = ref->map; + iter->leaf = NULL; + iter->is = 0; + return iter; +} + + static void +wormhole_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) { + struct wormref * const ref = iter->ref; + wormleaf_lock_read(next, ref); + wormleaf_unlock_read(iter->leaf); + + wormhole_iter_leaf_sync_sorted(next); + } else { + wormleaf_unlock_read(iter->leaf); + } + iter->leaf = next; + iter->is = 0; + if 
(!wormhole_iter_valid(iter)) + return; + } +} + + void +wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + debug_assert(key); + if (iter->leaf) + wormleaf_unlock_read(iter->leaf); + + struct wormleaf * const leaf = wormhole_jump_leaf_read(iter->ref, key); + wormhole_iter_leaf_sync_sorted(leaf); + + iter->leaf = leaf; + iter->is = wormleaf_seek(leaf, key); + wormhole_iter_fix(iter); +} + + void +whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + wormhole_resume(iter->ref); + wormhole_iter_seek(iter, key); +} + + bool +wormhole_iter_valid(struct wormhole_iter * const iter) +{ + return iter->leaf != NULL; +} + + static struct kv * +wormhole_iter_current(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + debug_assert(iter->is < iter->leaf->nr_sorted); + struct kv * const kv = wormleaf_kv_at_is(iter->leaf, iter->is); + return kv; + } + return NULL; +} + + struct kv * +wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + struct kv * const ret = iter->map->mm.out(kv, out); + return ret; + } + return NULL; +} + + bool +wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + kref_ref_kv(kref, kv); + return true; + } + return false; +} + + bool +wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + kvref_ref_kv(kvref, kv); + return true; + } + return false; +} + + void +wormhole_iter_skip1(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + iter->is++; + wormhole_iter_fix(iter); + } +} + + void +wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + u32 todo = nr; + while (todo && wormhole_iter_valid(iter)) { + const u32 cap = iter->leaf->nr_sorted - 
iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + wormhole_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + wormhole_iter_skip1(iter); + return ret; +} + + bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + struct kv * const kv = wormhole_iter_current(iter); + uf(kv, priv); // call uf even if (kv == NULL) + return kv != NULL; +} + + void +wormhole_iter_park(struct wormhole_iter * const iter) +{ + if (iter->leaf) { + wormleaf_unlock_read(iter->leaf); + iter->leaf = NULL; + } +} + + void +whsafe_iter_park(struct wormhole_iter * const iter) +{ + wormhole_iter_park(iter); + wormhole_park(iter->ref); +} + + void +wormhole_iter_destroy(struct wormhole_iter * const iter) +{ + if (iter->leaf) + wormleaf_unlock_read(iter->leaf); + free(iter); +} + + void +whsafe_iter_destroy(struct wormhole_iter * const iter) +{ + wormhole_park(iter->ref); + wormhole_iter_destroy(iter); +} +// }}} iter + +// unsafe iter {{{ + struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = NULL; + iter->map = map; + iter->leaf = NULL; + iter->is = 0; + whunsafe_iter_seek(iter, kref_null()); + return iter; +} + + static void +whunsafe_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) + wormhole_iter_leaf_sync_sorted(next); + iter->leaf = next; + iter->is = 0; + if (!wormhole_iter_valid(iter)) + return; + } +} + + void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(iter->map->hmap, 
key); + wormhole_iter_leaf_sync_sorted(leaf); + + iter->leaf = leaf; + iter->is = wormleaf_seek(leaf, key); + whunsafe_iter_fix(iter); +} + + void +whunsafe_iter_skip1(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + iter->is++; + whunsafe_iter_fix(iter); + } +} + + void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + u32 todo = nr; + while (todo && wormhole_iter_valid(iter)) { + const u32 cap = iter->leaf->nr_sorted - iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + whunsafe_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + whunsafe_iter_skip1(iter); + return ret; +} + + void +whunsafe_iter_destroy(struct wormhole_iter * const iter) +{ + free(iter); +} +// }}} unsafe iter + +// misc {{{ + struct wormref * +wormhole_ref(struct wormhole * const map) +{ + struct wormref * const ref = malloc(sizeof(*ref)); + if (ref == NULL) + return NULL; + ref->map = map; + if (qsbr_register(map->qsbr, &(ref->qref)) == false) { + free(ref); + return NULL; + } + return ref; +} + + struct wormref * +whsafe_ref(struct wormhole * const map) +{ + struct wormref * const ref = wormhole_ref(map); + if (ref) + wormhole_park(ref); + return ref; +} + + struct wormhole * +wormhole_unref(struct wormref * const ref) +{ + struct wormhole * const map = ref->map; + qsbr_unregister(map->qsbr, &(ref->qref)); + free(ref); + return map; +} + + inline void +wormhole_park(struct wormref * const ref) +{ + qsbr_park(&(ref->qref)); +} + + inline void +wormhole_resume(struct wormref * const ref) +{ + qsbr_resume(&(ref->qref)); +} + + inline void +wormhole_refresh_qstate(struct wormref * const ref) +{ + qsbr_update(&(ref->qref), wormhmap_version_load(wormhmap_load(ref->map))); +} + + static void +wormhole_clean_hmap(struct wormhole * const map) +{ + for (u32 x = 0; x < 2; x++) { + if 
(map->hmap2[x].pmap == NULL) + continue; + struct wormhmap * const hmap = &(map->hmap2[x]); + const u64 nr_slots = ((u64)(hmap->mask)) + 1; + struct wormmbkt * const pmap = hmap->pmap; + for (u64 s = 0; s < nr_slots; s++) { + struct wormmbkt * const slot = &(pmap[s]); + for (u32 i = 0; i < WH_BKT_NR; i++) + if (slot->e[i]) + wormmeta_keyref_release(slot->e[i]); + } + + slab_free_all(hmap->slab1); + slab_free_all(hmap->slab2); + memset(hmap->pmap, 0, hmap->msize); + hmap->maxplen = 0; + } +} + + static void +wormhole_free_leaf_keys(struct wormhole * const map, struct wormleaf * const leaf) +{ + const u32 nr = leaf->nr_keys; + for (u32 i = 0; i < nr; i++) { + void * const curr = wormleaf_kv_at_is(leaf, i); + debug_assert(curr); + map->mm.free(curr, map->mm.priv); + } + wormhole_free_akey(leaf->anchor); +} + + static void +wormhole_clean_helper(struct wormhole * const map) +{ + wormhole_clean_hmap(map); + for (struct wormleaf * leaf = map->leaf0; leaf; leaf = leaf->next) + wormhole_free_leaf_keys(map, leaf); + slab_free_all(map->slab_leaf); + map->leaf0 = NULL; +} + +// unsafe + void +wormhole_clean(struct wormhole * const map) +{ + wormhole_clean_helper(map); + wormhole_create_leaf0(map); +} + + void +wormhole_destroy(struct wormhole * const map) +{ + wormhole_clean_helper(map); + for (u32 i = 0; i < 2; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (hmap->slab1) + slab_destroy(hmap->slab1); + if (hmap->slab2) + slab_destroy(hmap->slab2); + wormhmap_deinit(hmap); + } + qsbr_destroy(map->qsbr); + slab_destroy(map->slab_leaf); + free(map->pbuf); + free(map); +} + + void +wormhole_fprint(struct wormhole * const map, FILE * const out) +{ + const u64 nr_slab_ul = slab_get_nalloc(map->slab_leaf); + const u64 nr_slab_um11 = slab_get_nalloc(map->hmap2[0].slab1); + const u64 nr_slab_um12 = slab_get_nalloc(map->hmap2[0].slab2); + const u64 nr_slab_um21 = map->hmap2[1].slab1 ? 
slab_get_nalloc(map->hmap2[1].slab1) : 0; + const u64 nr_slab_um22 = map->hmap2[1].slab2 ? slab_get_nalloc(map->hmap2[1].slab2) : 0; + fprintf(out, "%s L-SLAB %lu M-SLAB [0] %lu+%lu [1] %lu+%lu\n", + __func__, nr_slab_ul, nr_slab_um11, nr_slab_um12, nr_slab_um21, nr_slab_um22); +} +// }}} misc + +// api {{{ +const struct kvmap_api kvmap_api_wormhole = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .refpark = true, + .put = (void *)wormhole_put, + .get = (void *)wormhole_get, + .probe = (void *)wormhole_probe, + .del = (void *)wormhole_del, + .inpr = (void *)wormhole_inpr, + .inpw = (void *)wormhole_inpw, + .merge = (void *)wormhole_merge, + .delr = (void *)wormhole_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)wormhole_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)wormhole_iter_park, + .iter_destroy = (void *)wormhole_iter_destroy, + .ref = (void *)wormhole_ref, + .unref = (void *)wormhole_unref, + .park = (void *)wormhole_park, + .resume = (void *)wormhole_resume, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whsafe = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .put = (void *)whsafe_put, + .get = (void *)whsafe_get, + .probe = (void *)whsafe_probe, + .del = (void *)whsafe_del, + .inpr = (void *)whsafe_inpr, + .inpw = (void *)whsafe_inpw, + .merge = (void *)whsafe_merge, + .delr = (void *)whsafe_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)whsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, 
+ .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)whsafe_iter_park, + .iter_destroy = (void *)whsafe_iter_destroy, + .ref = (void *)whsafe_ref, + .unref = (void *)wormhole_unref, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whunsafe = { + .hashkey = true, + .ordered = true, + .unique = true, + .put = (void *)whunsafe_put, + .get = (void *)whunsafe_get, + .probe = (void *)whunsafe_probe, + .del = (void *)whunsafe_del, + .inpr = (void *)whunsafe_inp, + .inpw = (void *)whunsafe_inp, + .merge = (void *)whunsafe_merge, + .delr = (void *)whunsafe_delr, + .iter_create = (void *)whunsafe_iter_create, + .iter_seek = (void *)whunsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)whunsafe_iter_skip1, + .iter_skip = (void *)whunsafe_iter_skip, + .iter_next = (void *)whunsafe_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_destroy = (void *)whunsafe_iter_destroy, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + + static void * +wormhole_kvmap_api_create(const char * const name, const struct kvmap_mm * const mm, char ** args) +{ + (void)args; + if ((!strcmp(name, "wormhole")) || (!strcmp(name, "whsafe"))) { + return wormhole_create(mm); + } else if (!strcmp(name, "whunsafe")) { + return whunsafe_create(mm); + } else { + return NULL; + } +} + +__attribute__((constructor)) + static void +wormhole_kvmap_api_init(void) +{ + kvmap_api_register(0, "wormhole", "", 
wormhole_kvmap_api_create, &kvmap_api_wormhole); + kvmap_api_register(0, "whsafe", "", wormhole_kvmap_api_create, &kvmap_api_whsafe); + kvmap_api_register(0, "whunsafe", "", wormhole_kvmap_api_create, &kvmap_api_whunsafe); +} +// }}} api + +// wh {{{ +// Users often don't enjoy dealing with struct kv/kref and just want to use plain buffers. +// No problem! +// This example library shows you how to use Wormhole efficiently in the most intuitive way. + +// Use the worry-free api +static const struct kvmap_api * const wh_api = &kvmap_api_whsafe; + +// You can change the wh_api to kvmap_api_wormhole with a one-line replacement +// The standard Wormhole api can give you ~5% boost; read README for thread-safety tips +//static const struct kvmap_api * const wh_api = &kvmap_api_wormhole; + + struct wormhole * +wh_create(void) +{ + // kvmap_mm_ndf (kv.h) will let the caller allocate the kv when inserting + // This can avoid a memcpy if the caller does not have the data in a struct kv + return wormhole_create(&kvmap_mm_ndf); +} + + struct wormref * +wh_ref(struct wormhole * const wh) +{ + return wh_api->ref(wh); +} + + void +wh_unref(struct wormref * const ref) +{ + (void)wh_api->unref(ref); +} + + void +wh_park(struct wormref * const ref) +{ + if (wh_api->park) + wh_api->park(ref); +} + + void +wh_resume(struct wormref * const ref) +{ + if (wh_api->resume) + wh_api->resume(ref); +} + + void +wh_clean(struct wormhole * const map) +{ + wh_api->clean(map); +} + + void +wh_destroy(struct wormhole * const map) +{ + wh_api->destroy(map); +} + +// Do set/put with explicit kv buffers + bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen) +{ + struct kv * const newkv = kv_create(kbuf, klen, vbuf, vlen); + if (newkv == NULL) + return false; + // must use with kvmap_mm_ndf (see below) + // the newkv will be saved in the Wormhole and freed by Wormhole when upon deletion + return wh_api->put(ref, newkv); +} + +// 
delete a key + bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->del(ref, &kref); +} + +// test if the key exists in Wormhole + bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->probe(ref, &kref); +} + +// for wh_get() +struct wh_inp_info { void * vbuf_out; u32 * vlen_out; u32 vbuf_size; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecessary memory copying + static void +wh_inp_copy_value(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_inp_info * const info = (typeof(info))priv; + // copy the value data out + const u32 copy_size = info->vbuf_size < curr->vlen ? info->vbuf_size : curr->vlen; + memcpy(info->vbuf_out, kv_vptr_c(curr), copy_size); + // copy the vlen out + *info->vlen_out = curr->vlen; + } +} + +// returns a boolean value indicating whether the key is found.
+// the value's data will be written to *vlen_out and vbuf_out if the key is found +// if vbuf_size < vlen, then only the first vbuf_size bytes is copied to the buffer +// a small vbuf_size can be used to reduce memcpy cost when only the first a few bytes are needed + bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + struct wh_inp_info info = {vbuf_out, vlen_out, vbuf_size}; + // use the inplace read function to get the value if it exists + return wh_api->inpr(ref, &kref, wh_inp_copy_value, &info); +} + + bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpr(ref, &kref, uf, priv); +} + +// inplace update KV's value with a user-defined hook function +// the update should only modify the data in the value; It should not change the value size + bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpw(ref, &kref, uf, priv); +} + +// merge existing KV with updates with a user-defined hook function + bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->merge(ref, &kref, uf, priv); +} + +// remove a range of KVs from start (inclusive) to end (exclusive); [start, end) + u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end) +{ + struct kref kref_start, kref_end; + kref_ref_hash32(&kref_start, kbuf_start, klen_start); + kref_ref_hash32(&kref_end, kbuf_end, klen_end); + return 
wh_api->delr(ref, &kref_start, &kref_end); +} + + struct wormhole_iter * +wh_iter_create(struct wormref * const ref) +{ + return wh_api->iter_create(ref); +} + + void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + wh_api->iter_seek(iter, &kref); +} + + bool +wh_iter_valid(struct wormhole_iter * const iter) +{ + return wh_api->iter_valid(iter); +} + +// for wh_iter_peek() +// the out ptrs must be provided in pairs; use a pair of NULLs to ignore the key or value +struct wh_iter_inp_info { void * kbuf_out; void * vbuf_out; u32 kbuf_size; u32 vbuf_size; u32 * klen_out; u32 * vlen_out; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecessary memory copying + static void +inp_copy_kv_cb(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_iter_inp_info * const info = (typeof(info))priv; + + // copy the key + if (info->kbuf_out) { // it assumes klen_out is also not NULL + // copy the key data out + const u32 clen = curr->klen < info->kbuf_size ? curr->klen : info->kbuf_size; + memcpy(info->kbuf_out, kv_kptr_c(curr), clen); + // copy the klen out + *info->klen_out = curr->klen; + } + + // copy the value + if (info->vbuf_out) { // it assumes vlen_out is also not NULL + // copy the value data out + const u32 clen = curr->vlen < info->vbuf_size ?
curr->vlen : info->vbuf_size; + memcpy(info->vbuf_out, kv_vptr_c(curr), clen); + // copy the vlen out + *info->vlen_out = curr->vlen; + } + } +} + +// seek is similar to get + bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct wh_iter_inp_info info = {kbuf_out, vbuf_out, kbuf_size, vbuf_size, klen_out, vlen_out}; + return wh_api->iter_inp(iter, inp_copy_kv_cb, &info); +} + + void +wh_iter_skip1(struct wormhole_iter * const iter) +{ + wh_api->iter_skip1(iter); +} + + void +wh_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + wh_api->iter_skip(iter, nr); +} + + bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + return wh_api->iter_inp(iter, uf, priv); +} + + void +wh_iter_park(struct wormhole_iter * const iter) +{ + wh_api->iter_park(iter); +} + + void +wh_iter_destroy(struct wormhole_iter * const iter) +{ + wh_api->iter_destroy(iter); +} +// }}} wh + +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/wh.h b/MassTrie-beta/wormhole/wh.h new file mode 100644 index 00000000..bd17b38d --- /dev/null +++ b/MassTrie-beta/wormhole/wh.h @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +struct wormhole; +struct wormref; + +// wormhole {{{ +// the wh created by wormhole_create() can work with all of safe/unsafe operations. + extern struct wormhole * +wormhole_create(const struct kvmap_mm * const mm); + +// the wh created by whunsafe_create() can only work with the unsafe operations. 
+ extern struct wormhole * +whunsafe_create(const struct kvmap_mm * const mm); + + extern struct kv * +wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +wormhole_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +wormhole_put(struct wormref * const ref, struct kv * const kv); + + extern bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref); + + extern void +wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern bool +wormhole_iter_valid(struct wormhole_iter * const iter); + + extern struct kv * +wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref); + + extern bool +wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref); + + extern void +wormhole_iter_skip1(struct wormhole_iter * const iter); + + extern void +wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wormhole_iter_park(struct wormhole_iter * const iter); + + extern void 
+wormhole_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +wormhole_ref(struct wormhole * const map); + + extern struct wormhole * +wormhole_unref(struct wormref * const ref); + + extern void +wormhole_park(struct wormref * const ref); + + extern void +wormhole_resume(struct wormref * const ref); + + extern void +wormhole_refresh_qstate(struct wormref * const ref); + +// clean with more threads + extern void +wormhole_clean_th(struct wormhole * const map, const u32 nr_threads); + + extern void +wormhole_clean(struct wormhole * const map); + + extern void +wormhole_destroy(struct wormhole * const map); + +// safe API (no need to refresh qstate) + + extern struct kv * +whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +whsafe_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +whsafe_put(struct wormref * const ref, struct kv * const kv); + + extern bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + +// use wormhole_iter_create + extern void +whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern struct kv * +whsafe_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + +// use wormhole_iter_valid +// use wormhole_iter_peek +// use wormhole_iter_kref +// use wormhole_iter_kvref +// use wormhole_iter_skip1 +// use wormhole_iter_skip +// use wormhole_iter_next +// use 
wormhole_iter_inp + + extern void +whsafe_iter_park(struct wormhole_iter * const iter); + + extern void +whsafe_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +whsafe_ref(struct wormhole * const map); + +// use wormhole_unref + +// unsafe API + + extern struct kv * +whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out); + + extern bool +whunsafe_probe(struct wormhole * const map, const struct kref * const key); + + extern bool +whunsafe_put(struct wormhole * const map, struct kv * const kv); + + extern bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whunsafe_del(struct wormhole * const map, const struct kref * const key); + + extern u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map); + + extern void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + +// unsafe iter_valid: use wormhole_iter_valid +// unsafe iter_peek: use wormhole_iter_peek +// unsafe iter_kref: use wormhole_iter_kref + + extern void +whunsafe_iter_skip1(struct wormhole_iter * const iter); + + extern void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out); + +// unsafe iter_inp: use wormhole_iter_inp + + extern void +whunsafe_iter_destroy(struct wormhole_iter * const iter); + + extern void +wormhole_fprint(struct wormhole * const map, FILE * const out); + +extern const struct kvmap_api kvmap_api_wormhole; +extern const struct kvmap_api kvmap_api_whsafe; +extern const struct kvmap_api kvmap_api_whunsafe; +// }}} 
wormhole + +// wh {{{ + extern struct wormhole * +wh_create(void); + + extern struct wormref * +wh_ref(struct wormhole * const wh); + + extern void +wh_unref(struct wormref * const ref); + + extern void +wh_park(struct wormref * const ref); + + extern void +wh_resume(struct wormref * const ref); + + extern void +wh_clean(struct wormhole * const map); + + extern void +wh_destroy(struct wormhole * const map); + + extern bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen); + + extern bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv); + + extern u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end); + + extern struct wormhole_iter * +wh_iter_create(struct wormref * const ref); + + extern void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen); + + extern bool +wh_iter_valid(struct wormhole_iter * const iter); + + extern bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern void +wh_iter_skip1(struct wormhole_iter * const iter); + + extern void 
+wh_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wh_iter_park(struct wormhole_iter * const iter); + + extern void +wh_iter_destroy(struct wormhole_iter * const iter); +// }}} wh + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/MassTrie-beta/wormhole/wh.py b/MassTrie-beta/wormhole/wh.py new file mode 100644 index 00000000..e744cec8 --- /dev/null +++ b/MassTrie-beta/wormhole/wh.py @@ -0,0 +1,192 @@ +#!/usr/bin/python3 + +# +# Copyright (c) 2016--2021 Wu, Xingbo +# +# All rights reserved. No warranty, explicit or implicit, provided. +# + +import msgpack +from ctypes import * # CDLL and c_xxx types + +# libwh {{{ +# Change this path when necessary +libwh = CDLL("./libwh.so") + +# create +libwh.wh_create.argtypes = [] +libwh.wh_create.restype = c_void_p + +# close (no return value) +libwh.wh_destroy.argtypes = [c_void_p] + +# ref +libwh.wh_ref.argtypes = [c_void_p] +libwh.wh_ref.restype = c_void_p + +# unref +libwh.wh_unref.argtypes = [c_void_p] + +# put +libwh.wh_put.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint] +libwh.wh_put.restype = c_bool + +# get +libwh.wh_get.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint, c_void_p] +libwh.wh_get.restype = c_bool + +# probe +libwh.wh_probe.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_probe.restype = c_bool + +# del +libwh.wh_del.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_del.restype = c_bool + +# iter_create +libwh.wh_iter_create.argtypes = [c_void_p] +libwh.wh_iter_create.restype = c_void_p + +# iter_seek +libwh.wh_iter_seek.argtypes = [c_void_p, c_char_p, c_uint] + +# iter_valid +libwh.wh_iter_valid.argtypes = [c_void_p] +libwh.wh_iter_valid.restype = c_bool + +# iter_skip1 +libwh.wh_iter_skip1.argtypes = [c_void_p] + +# iter_skip +libwh.wh_iter_skip.argtypes = [c_void_p, c_uint] + +# iter_peek +libwh.wh_iter_peek.argtypes = [c_void_p, c_char_p, 
c_uint, c_void_p, c_char_p, c_uint, c_void_p] +libwh.wh_iter_peek.restype = c_bool + +# iter_park +libwh.wh_iter_park.argtypes = [c_void_p] + +# iter_destroy +libwh.wh_iter_destroy.argtypes = [c_void_p] +# }}} libwh + +# class {{{ +class Wh: + def __init__(self, maxklen=256, maxvlen=8192): + self.whptr = libwh.wh_create() + self.kbufsz = maxklen + self.vbufsz = maxvlen + + # user must call explicitly + def destroy(self): + libwh.wh_destroy(self.whptr) + + def ref(self): + return WhRef(self.whptr, self.kbufsz, self.vbufsz) + +class WhRef: + def __init__(self, whptr, kbufsz, vbufsz): + self.refptr = libwh.wh_ref(whptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.vbuf = create_string_buffer(self.vbufsz) + + # user must call explicitly + def unref(self): + libwh.wh_unref(self.refptr) + + def iter(self): + return WhIter(self.refptr, self.kbufsz, self.vbufsz) + + # key: python string; value: any (hierarchical) python object + def put(self, key, value): + binkey = key.encode() + binvalue = msgpack.packb(value) + return libwh.wh_put(self.refptr, binkey, c_uint(len(binkey)), binvalue, c_uint(len(binvalue))) + + # return the value as a python object + def get(self, key): + binkey = key.encode() + vlen = c_uint() + ret = libwh.wh_get(self.refptr, binkey, len(binkey), self.vbuf, self.vbufsz, byref(vlen)) + if ret and vlen.value <= self.vbufsz: + return msgpack.unpackb(self.vbuf.value) + else: + return None + + def delete(self, key): + binkey = key.encode() + return libwh.wh_del(self.refptr, binkey, c_uint(len(binkey))) + + def probe(self, key): + binkey = key.encode() + return libwh.wh_probe(self.refptr, binkey, c_uint(len(binkey))) + +class WhIter: + def __init__(self, refptr, kbufsz, vbufsz): + self.iptr = libwh.wh_iter_create(refptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.kbuf = create_string_buffer(kbufsz) + self.vbuf = create_string_buffer(vbufsz) + + # user must call explicitly + def destroy(self): + libwh.wh_iter_destroy(self.iptr) + + def 
seek(self, key): + if key is None: + libwh.wh_iter_seek(self.iptr, None, c_uint(0)) + else: + binkey = key.encode() + libwh.wh_iter_seek(self.iptr, binkey, c_uint(len(binkey))) + + def valid(self): + return libwh.wh_iter_valid(self.iptr) + + def skip1(self): + libwh.wh_iter_skip1(self.iptr) + + def skip(self, nr): + libwh.wh_iter_skip(self.iptr, c_uint(nr)) + + # return (key, value) pair or None + def peek(self): + klen = c_uint() + vlen = c_uint() + ret = libwh.wh_iter_peek(self.iptr, self.kbuf, self.kbufsz, byref(klen), self.vbuf, self.vbufsz, byref(vlen)) + if ret and klen.value <= self.kbufsz and vlen.value <= self.vbufsz: + self.kbuf[klen.value] = b'\x00' + return (self.kbuf.value.decode(), klen.value, msgpack.unpackb(self.vbuf.value), vlen.value) + else: + return None + +# }}} class + +# examples +wh1 = Wh(32, 1024) +ref1 = wh1.ref() # take a ref for kv operations + +ref1.put("Hello", "pywh") +ref1.put("key1", "value1") +ref1.put("key2", "value2") +ref1.put("key3", {"xxx":"valuex", "yyy":"valuey"}) +ref1.delete("key2") + +rget = ref1.get("Hello") +print(rget) + +# don't use ref when iterating +iter1 = ref1.iter() +iter1.seek(None) +while iter1.valid(): + r = iter1.peek() + print(r) + iter1.skip1() + +iter1.destroy() # must destroy all iters before unref +ref1.unref() # must unref all refs before close() +wh1.destroy() + +# vim:fdm=marker diff --git a/MassTrie-beta/wormhole/wh.strip b/MassTrie-beta/wormhole/wh.strip new file mode 100644 index 00000000..e7b3971f --- /dev/null +++ b/MassTrie-beta/wormhole/wh.strip @@ -0,0 +1,161 @@ +-K key_size +-K key_size_align +-K kref_compare +-K kref_kv_compare +-K kref_kv_match +-K kref_lcp +-K kref_match +-K kref_null +-K kref_ref_hash32 +-K kref_ref_kv +-K kref_ref_kv_hash32 +-K kref_ref_raw +-K kref_update_hash32 +-K kv_compare +-K kv_compare_ptrs +-K kv_crc32c +-K kv_crc32c_extend +-K kv_create +-K kv_create_kref +-K kv_create_str +-K kv_create_str_str +-K kv_dup +-K kv_dup2 +-K kv_dup2_key +-K kv_dup2_key_prefix +-K 
kv_dup_key +-K kv_key_lcp +-K kv_kptr +-K kv_kptr_c +-K kv_kref +-K kvmap_api_whsafe +-K kvmap_api_whunsafe +-K kvmap_api_wormhole +-K kvmap_dump_keys +-K kvmap_inp_steal_kv +-K kvmap_kv_del +-K kvmap_kv_delr +-K kvmap_kv_get +-K kvmap_kv_inpr +-K kvmap_kv_inpw +-K kvmap_kv_iter_seek +-K kvmap_kv_merge +-K kvmap_kv_probe +-K kvmap_kv_put +-K kvmap_mm_dup +-K kvmap_mm_free_free +-K kvmap_mm_free_noop +-K kvmap_mm_in_dup +-K kvmap_mm_in_noop +-K kvmap_mm_ndf +-K kvmap_mm_out_dup +-K kvmap_mm_out_noop +-K kvmap_raw_del +-K kvmap_raw_get +-K kvmap_raw_inpr +-K kvmap_raw_inpw +-K kvmap_raw_iter_seek +-K kvmap_raw_probe +-K kvmap_ref +-K kvmap_unref +-K kv_match +-K kv_match_full +-K kv_null +-K kv_print +-K kv_qsort +-K kvref_dup2_key +-K kvref_dup2_kv +-K kv_refill +-K kv_refill_hex32 +-K kv_refill_hex64 +-K kv_refill_hex64_klen +-K kv_refill_kref +-K kv_refill_kref_v +-K kv_refill_str +-K kv_refill_str_str +-K kv_refill_u64 +-K kv_refill_value +-K kvref_kv_compare +-K kvref_ref_kv +-K kv_size +-K kv_size_align +-K kv_update_hash +-K kv_vptr +-K kv_vptr_c +-K wh_clean +-K wh_create +-K wh_del +-K wh_delr +-K wh_destroy +-K wh_get +-K wh_inpr +-K wh_inpw +-K wh_iter_create +-K wh_iter_destroy +-K wh_iter_inp +-K wh_iter_park +-K wh_iter_peek +-K wh_iter_seek +-K wh_iter_skip +-K wh_iter_valid +-K wh_merge +-K wh_park +-K wh_probe +-K wh_ref +-K wh_resume +-K whsafe_del +-K whsafe_delr +-K whsafe_get +-K whsafe_inpr +-K whsafe_inpw +-K whsafe_iter_destroy +-K whsafe_iter_park +-K whsafe_iter_seek +-K whsafe_merge +-K whsafe_probe +-K whsafe_ref +-K whsafe_put +-K wh_put +-K wh_unref +-K whunsafe_create +-K whunsafe_del +-K whunsafe_delr +-K whunsafe_get +-K whunsafe_inp +-K whunsafe_iter_create +-K whunsafe_iter_destroy +-K whunsafe_iter_next +-K whunsafe_iter_seek +-K whunsafe_iter_skip +-K whunsafe_merge +-K whunsafe_probe +-K whunsafe_put +-K wormhole_clean +-K wormhole_create +-K wormhole_del +-K wormhole_delr +-K wormhole_destroy +-K wormhole_fprint +-K wormhole_get 
+-K wormhole_inpr +-K wormhole_inpw +-K wormhole_iter_create +-K wormhole_iter_destroy +-K wormhole_iter_inp +-K wormhole_iter_kref +-K wormhole_iter_kvref +-K wormhole_iter_next +-K wormhole_iter_park +-K wormhole_iter_peek +-K wormhole_iter_seek +-K wormhole_iter_skip +-K wormhole_iter_valid +-K wormhole_kvmap_api_create +-K wormhole_merge +-K wormhole_park +-K wormhole_probe +-K wormhole_ref +-K wormhole_refresh_qstate +-K wormhole_resume +-K wormhole_put +-K wormhole_unref diff --git a/README.md b/README.md index 8f74a3fe..48ce0c53 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ $ sudo apt install g++-7 1. Clone the git repository ```bash -$ git clone https://github.com/readablesystems/sto.git +$ git clone -b masstrie https://github.com/roeeash/sto.git $ cd sto ``` @@ -63,7 +63,31 @@ $ cd sto $ git submodule update --init --recursive ``` -3. Execute configuration scripts +3. Set system variables +```bash +$ cd MassTrie-beta/wormhole +$ export LD_LIBRARY_PATH=`pwd` +$ cd ../ +$ cd ../ +``` +OR if you're on the tcsh shell + +```bash +$ cd MassTrie-beta/wormhole +$ setenv LD_LIBRARY_PATH=`pwd` +$ cd ../ +$ cd ../ +``` + +4. Additional system setup: +If you do not have autoconf and python-is-python3 installed, run: + +```bash +$ sudo apt install python-is-python3 +$ sudo apt install autoconf +``` + +5. Execute configuration scripts ```bash $ ./bootstrap.sh $ ./configure @@ -74,7 +98,7 @@ enable it for STO by running `./configure CC=gcc-7 CXX=g++-7`. (Note: if you use macOS you should probably run `./configure CXX='clang++ -stdlib=libc++'`) -4. Build +6. Build ```bash $ make -jN # launch N parallel build jobs ``` @@ -90,6 +114,19 @@ by continuous integration. - `make micro_bench`: Build the array-based microbenchmark. - `make clean`: You know what it does. + +7. 
Build (specifically for benchmark files) + +```bash +$ make -jN unit-test_MTrie # launch N parallel build jobs +$ ./unit-test_MTrie +``` + +```bash +$ make -jN unit-dboindex # launch N parallel build jobs +$ ./unit-dboindex +``` + See [Wiki](https://github.com/readablesystems/sto/wiki) for advanced buid options. ## IDE Support & cmake diff --git a/benchmark/DB_oindexMTrie.hh b/benchmark/DB_oindexMTrie.hh new file mode 100644 index 00000000..d454287b --- /dev/null +++ b/benchmark/DB_oindexMTrie.hh @@ -0,0 +1,3053 @@ +#pragma once + + + +#include "DB_index.hh" + + + +#include "../MassTrie-beta/MassTrie.hh" + + + +namespace bench { + +template + +class MTrie_ordered_index : public TObject { + +public: + + typedef K key_type; + + typedef V value_type; + + typedef commutators::Commutator comm_type; + + + + //typedef typename get_occ_version::type occ_version_type; + + typedef typename get_version::type version_type; + + + + using accessor_t = typename index_common::accessor_t; + + + + static constexpr typename version_type::type invalid_bit = TransactionTid::user_bit; + + static constexpr TransItem::flags_type insert_bit = TransItem::user0_bit; + + static constexpr TransItem::flags_type delete_bit = TransItem::user0_bit << 1u; + + static constexpr TransItem::flags_type row_update_bit = TransItem::user0_bit << 2u; + + static constexpr TransItem::flags_type row_cell_bit = TransItem::user0_bit << 3u; + + static constexpr uintptr_t internode_bit = 1; + + // TicToc node version bit + + static constexpr uintptr_t ttnv_bit = 1 << 1u; + + + + typedef typename value_type::NamedColumn NamedColumn; + + typedef IndexValueContainer value_container_type; + + + + static constexpr bool value_is_small = is_small::value; + + + + static constexpr bool index_read_my_write = DBParams::RdMyWr; + + + + struct internal_elem { + + key_type key; + + value_container_type row_container; + + bool deleted; + + + + internal_elem(const key_type& k, const value_type& v, bool valid) + + : key(k), + + 
row_container((valid ? Sto::initialized_tid() : (Sto::initialized_tid() | invalid_bit)), + + !valid, v), + + deleted(false) {} + + + + version_type& version() { + + return row_container.row_version(); + + } + + + + bool valid() { + + return !(version().value() & invalid_bit); + + } + + }; + + + + struct table_params : public Masstree::nodeparams<15,15> { + + typedef internal_elem* value_type; + + typedef Masstree::value_print value_print_type; + + typedef threadinfo threadinfo_type; + + + + static constexpr bool track_nodes = (DBParams::NodeTrack && DBParams::TicToc); + + typedef std::conditional_t aux_tracker_type; + + }; + + + + typedef Masstree::Str Str; + + typedef Masstree::basic_table table_type; + + typedef Masstree::unlocked_tcursor unlocked_cursor_type; + + typedef Masstree::tcursor cursor_type; + + typedef Masstree::leaf leaf_type; + + + + typedef typename table_type::node_type node_type; + + typedef typename unlocked_cursor_type::nodeversion_value_type nodeversion_value_type; + + + + typedef MassTrie* MTrie_table_type; + + + + using column_access_t = typename split_version_helpers>::column_access_t; + + using item_key_t = typename split_version_helpers>::item_key_t; + + template + + static constexpr auto column_to_cell_accesses + + = split_version_helpers>::template column_to_cell_accesses; + + template + + static constexpr auto extract_item_list + + = split_version_helpers>::template extract_item_list; + + + + typedef std::tuple sel_return_type; + + typedef std::tuple ins_return_type; + + typedef std::tuple del_return_type; + + typedef std::tuple> sel_split_return_type; + + + + static __thread typename table_params::threadinfo_type *ti; + + + + MTrie_ordered_index(size_t init_size) { + + this->table_init(); + + (void)init_size; + + } + + MTrie_ordered_index() { + + this->table_init(); + + } + + + + void table_init() { + + if (ti == nullptr) + + ti = threadinfo::make(threadinfo::TI_MAIN, -1); + + //table_.initialize(*ti); + + key_gen_ = 0; + + + + 
//MTrie init + + if(!this->MTrie_table) + + this->MTrie_table = new MassTrie(); + + + + } + + + + static void thread_init() { + + if (ti == nullptr) + + ti = threadinfo::make(threadinfo::TI_PROCESS, TThread::id()); + + Transaction::tinfo[TThread::id()].trans_start_callback = []() { + + ti->rcu_start(); + + }; + + Transaction::tinfo[TThread::id()].trans_end_callback = []() { + + ti->rcu_stop(); + + }; + + } + + + + uint64_t gen_key() { + + return fetch_and_add(&key_gen_, 1); + + } + + + +#if 0 + + sel_return_type + + select_row(const key_type& key, RowAccess acc) { + + unlocked_cursor_type lp(table_, key); + + bool found = lp.find_unlocked(*ti); + + internal_elem *e = lp.value(); + + if (found) { + + return select_row(reinterpret_cast(e), acc); + + } else { + + if (!register_internode_version(lp.node(), lp.full_version_value())) + + return {false, false, 0, UniRecordAccessor(nullptr)}; + + return {true, false, 0, UniRecordAccessor(nullptr)}; + + } + + } + +#endif + + + + sel_split_return_type + + select_split_row(const key_type& key, std::initializer_list accesses) { + + + + + + + + bool MTrie_found =MTrie_table->probe(&key,sizeof(key)) ; + + + + + + + + + + + + if (MTrie_found) { + + + + void * vbuf_out = MTrie_table->get(MTrie_table->ref,&key,sizeof(key)); + + + + internal_elem ** e = reinterpret_cast(vbuf_out); + + + + return select_split_row(reinterpret_cast(*e), accesses); + + } + + + + + + //else, if key is not in MassTrie + + + + void * res = MTrie_table->find_closest(&key); + + + + + + + + void * vbuf_out = MTrie_table->get(MTrie_table->ref,res,sizeof(res)); + + + + internal_elem ** e = reinterpret_cast(vbuf_out); + + + + + + //cout<<" register_internode_version(*e) = "<(nullptr) + + }; + + } + + + +#if 0 + + sel_return_type + + select_row(uintptr_t rid, RowAccess access) { + + auto e = reinterpret_cast(rid); + + bool ok = true; + + TransProxy row_item = Sto::item(this, item_key_t::row_item_key(e)); + + + + if (is_phantom(e, row_item)) + + goto abort; + + + 
+ if (index_read_my_write) { + + if (has_delete(row_item)) { + + return sel_return_type(true, false, 0, nullptr); + + } + + if (has_row_update(row_item)) { + + value_type *vptr; + + if (has_insert(row_item)) + + vptr = &e->row_container.row; + + else + + vptr = row_item.template raw_write_value(); + + return sel_return_type(true, true, rid, vptr); + + } + + } + + + + switch (access) { + + case RowAccess::UpdateValue: + + ok = version_adapter::select_for_update(row_item, e->version()); + + row_item.add_flags(row_update_bit); + + break; + + case RowAccess::ObserveExists: + + case RowAccess::ObserveValue: + + ok = row_item.observe(e->version()); + + break; + + default: + + break; + + } + + + + if (!ok) + + goto abort; + + + + return sel_return_type(true, true, rid, &(e->row_container.row)); + + + + abort: + + return sel_return_type(false, false, 0, nullptr); + + } + +#endif + + + + sel_split_return_type + + select_split_row(uintptr_t rid, std::initializer_list accesses) { + + auto e = reinterpret_cast(rid); + + TransProxy row_item = Sto::item(this, item_key_t::row_item_key(e)); + + + + // Translate from column accesses to cell accesses + + // all buffered writes are only stored in the wdata_ of the row item (to avoid redundant copies) + + auto cell_accesses = column_to_cell_accesses(accesses); + + + + std::array cell_items {}; + + bool any_has_write; + + bool ok; + + std::tie(any_has_write, cell_items) = extract_item_list(cell_accesses, this, e); + + + + if (is_phantom(e, row_item)) + + goto abort; + + + + if (index_read_my_write) { + + if (has_delete(row_item)) { + + return {true, false, 0, UniRecordAccessor(nullptr)}; + + } + + if (any_has_write || has_row_update(row_item)) { + + value_type *vptr; + + if (has_insert(row_item)) + + vptr = &e->row_container.row; + + else + + vptr = row_item.template raw_write_value(); + + return {true, true, rid, UniRecordAccessor(vptr)}; + + } + + } + + + + ok = access_all(cell_accesses, cell_items, e->row_container); + + if (!ok) + 
+ goto abort; + + + + return {true, true, rid, UniRecordAccessor(&(e->row_container.row))}; + + + + abort: + + return {false, false, 0, UniRecordAccessor(nullptr)}; + + } + + + + void update_row(uintptr_t rid, value_type *new_row) { + + auto e = reinterpret_cast(rid); + + auto row_item = Sto::item(this, item_key_t::row_item_key(e)); + + if (value_is_small) { + + row_item.acquire_write(e->version(), *new_row); + + } else { + + row_item.acquire_write(e->version(), new_row); + + } + + } + + + + void update_row(uintptr_t rid, const comm_type &comm) { + + assert(&comm); + + auto row_item = Sto::item(this, item_key_t::row_item_key(reinterpret_cast(rid))); + + row_item.add_commute(comm); + + } + + + + // insert assumes common case where the row doesn't exist in the table + + // if a row already exists, then use select (FOR UPDATE) instead + + ins_return_type + + insert_row(const key_type& key, value_type *vptr, bool overwrite = false) { + + //cursor_type lp(table_, key); + + //bool found = lp.find_insert(*ti); + + + + bool MTrie_found =MTrie_table->probe(&key,sizeof(&key)) ; + + + + + + if (MTrie_found) { + + // NB: the insert method only manipulates the row_item. It is possible + + // this insert is overwriting some previous updates on selected columns + + // The expected behavior is that this row-level operation should overwrite + + // all changes made by previous updates (in the same transaction) on this + + // row. We achieve this by granting this row_item a higher priority. + + // During the install phase, if we notice that the row item has already + + // been locked then we simply ignore installing any changes made by cell items. + + // It should be trivial for a cell item to find the corresponding row item + + // and figure out if the row-level version is locked. 
+ + void * vbuf_out = MTrie_table->get(MTrie_table->ref,&key,sizeof(key)); + + + + internal_elem ** e = reinterpret_cast(vbuf_out); + + + + internal_elem *MTrie_e = *e; + + //lp.finish(0, *ti); + + + + TransProxy row_item = Sto::item(this, item_key_t::row_item_key(MTrie_e)); + + + + if (is_phantom(MTrie_e, row_item)) + + goto abort; + + + + if (index_read_my_write) { + + if (has_delete(row_item)) { + + auto proxy = row_item.clear_flags(delete_bit).clear_write(); + + + + if (value_is_small) + + proxy.add_write(*vptr); + + else + + proxy.add_write(vptr); + + + + return ins_return_type(true, false); + + } + + } + + + + if (overwrite) { + + bool ok; + + if (value_is_small) + + ok = version_adapter::select_for_overwrite(row_item, MTrie_e->version(), *vptr); + + else + + ok = version_adapter::select_for_overwrite(row_item, MTrie_e->version(), vptr); + + if (!ok) + + goto abort; + + if (index_read_my_write) { + + if (has_insert(row_item)) { + + copy_row(MTrie_e, vptr); + + } + + } + + } else { + + // observes that the row exists, but nothing more + + if (!row_item.observe(MTrie_e->version())) + + goto abort; + + } + + + + } else { + + + + + + + + auto e = new internal_elem(key, vptr ? 
*vptr : value_type(), + + false /*!valid*/); + + + + //put in mtrie table + + MTrie_table->put(&key,sizeof(&key), + + reinterpret_cast(&e),sizeof(reinterpret_cast(&e))); + + + + + + //cout<<"MTrie_table ="<version()); + + row_item.add_flags(insert_bit); + + + + // update the node version already in the read set and modified by split + + //if (!update_internode_version(node, orig_nv, new_nv)) + + // goto abort; + + } + + + + return ins_return_type(true, MTrie_found); + + + + abort: + + return ins_return_type(false, false); + + } + + + + del_return_type + + delete_row(const key_type& key) { + + + + bool MTrie_found =MTrie_table->probe(&key,sizeof(&key)); + + + + //cout<<"MTrie_found in delete = "<get(MTrie_table->ref,&key,sizeof(key)); + + + + internal_elem ** MTrie_e = reinterpret_cast(vbuf_out); + + + + internal_elem *e = *MTrie_e; + + + + TransProxy row_item = Sto::item(this, item_key_t::row_item_key(e)); + + + + if (is_phantom(e, row_item)) { + + goto abort; + + } + + + + if (index_read_my_write) { + + if (has_delete(row_item)) + + return del_return_type(true, false); + + + + if (!e->valid() && has_insert(row_item)) { + + row_item.add_flags(delete_bit); + + return del_return_type(true, true); + + } + + } + + + + // Register a TicToc write to the leaf node when necessary. 
+ + //ttnv_register_node_write(lp.node()); + + + + + + // select_for_update will register an observation and set the write bit of + + // the TItem + + if (!version_adapter::select_for_update(row_item, e->version())) { + + goto abort; + + } + + fence(); + + if (e->deleted) { + + goto abort; + + } + + row_item.add_flags(delete_bit); + + + + + + + + } else { + + + + //else, if key is not in MassTrie + + void * res = MTrie_table->find_closest(&key); + + + + bool r ; + + + + + + + + + + void * vbuf_out = MTrie_table->get(MTrie_table->ref,res,sizeof(res)); + + + + internal_elem ** e = reinterpret_cast(vbuf_out); + + + + r=register_internode_version(*e); + + + + if (!r) { + + goto abort; + + } + + + + } + + + + + + return del_return_type(true, MTrie_found); + + + + abort: + + return del_return_type(false, false); + + } + + + + template + + bool range_scan(const key_type& begin, const key_type& end, Callback callback, + + std::initializer_list accesses, bool phantom_protection = true, int limit = -1) { + + //instructed to be an empty function + + return true; + + } + + + + template + + bool range_scan(const key_type& begin, const key_type& end, Callback callback, + + RowAccess access, bool phantom_protection = true, int limit = -1) { + + //instructed to be an empty function + + return true; + + } + + + + value_type *nontrans_get(const key_type& k) { + + + + bool MTrie_found =MTrie_table->probe(&k,sizeof(&k)); + + + + //cout<<"MTrie found = "<< MTrie_found<get(MTrie_table->ref,&k,sizeof(k)); + + + + + + internal_elem **MTrie_e = reinterpret_cast(vbuf_out); + + + + + + + + return &((*MTrie_e)->row_container.row); + + } else + + return nullptr; + + } + + + + void nontrans_put(const key_type& k, const value_type& v) { + + + + + + bool MTrie_found =MTrie_table->probe(&k,sizeof(&k)) ; + + + + + + + + if (MTrie_found ) { + + + + + + void * vbuf_out = MTrie_table->get(MTrie_table->ref,&k,sizeof(k)); + + + + + + internal_elem **e = reinterpret_cast(vbuf_out); + + + + + + + + if 
(value_is_small) + + (*e)->row_container.row = v; + + else + + copy_row(*e, &v); + + + + //put in MTrie_table + + MTrie_table->put(&k,sizeof(k), + + reinterpret_cast(e),sizeof(reinterpret_cast(e))); + + + + + + } else { + + + + + + + + + + + + + + internal_elem *e = new internal_elem(k, v, true); + + + + + + + + //put in MTrie_table + + MTrie_table->put(&k,sizeof(k), + + reinterpret_cast(&e),sizeof(reinterpret_cast(&e))); + + + + + + //cout<<"MTrie_table = "<(n)->get_aux_tracker()); + + } + + } + + auto key = item.key(); + + auto e = key.internal_elem_ptr(); + + if (key.is_row_item()) + + return txn.try_lock(item, e->version()); + + else + + return txn.try_lock(item, e->row_container.version_at(key.cell_num())); + + } + + + + bool check(TransItem& item, Transaction& txn) override { + + if (is_internode(item)) { + + node_type *n = get_internode_address(item); + + auto curr_nv = static_cast(n)->full_version_value(); + + auto read_nv = item.template read_value(); + + return (curr_nv == read_nv); + + } else { + + if constexpr (table_params::track_nodes) { + + if (is_ttnv(item)) { + + auto n = get_internode_address(item); + + return static_cast(n)->get_aux_tracker()->cp_check_version(txn, item); + + } + + } + + auto key = item.key(); + + auto e = key.internal_elem_ptr(); + + if (key.is_row_item()) + + return e->version().cp_check_version(txn, item); + + else + + return e->row_container.version_at(key.cell_num()).cp_check_version(txn, item); + + } + + } + + + + void install(TransItem& item, Transaction& txn) override { + + assert(!is_internode(item)); + + + + if constexpr (table_params::track_nodes) { + + if (is_ttnv(item)) { + + auto n = get_internode_address(item); + + txn.set_version_unlock(*static_cast(n)->get_aux_tracker(), item); + + return; + + } + + } + + + + auto key = item.key(); + + auto e = key.internal_elem_ptr(); + + + + if (key.is_row_item()) { + + //assert(e->version.is_locked()); + + if (has_delete(item)) { + + assert(e->valid() && !e->deleted); + + 
e->deleted = true; + + txn.set_version(e->version()); + + return; + + } + + + + if (!has_insert(item)) { + + if (item.has_commute()) { + + comm_type &comm = item.write_value(); + + if (has_row_update(item)) { + + copy_row(e, comm); + + } else if (has_row_cell(item)) { + + e->row_container.install_cell(comm); + + } + + } else { + + value_type *vptr; + + if (value_is_small) { + + vptr = &(item.write_value()); + + } else { + + vptr = item.write_value(); + + } + + + + if (has_row_update(item)) { + + if (value_is_small) { + + e->row_container.row = *vptr; + + } else { + + copy_row(e, vptr); + + } + + } else if (has_row_cell(item)) { + + // install only the difference part + + // not sure if works when there are more than 1 minor version fields + + // should still work + + e->row_container.install_cell(0, vptr); + + } + + } + + } + + txn.set_version_unlock(e->version(), item); + + } else { + + // skip installation if row-level update is present + + auto row_item = Sto::item(this, item_key_t::row_item_key(e)); + + if (!has_row_update(row_item)) { + + if (row_item.has_commute()) { + + comm_type &comm = row_item.template write_value(); + + assert(&comm); + + e->row_container.install_cell(comm); + + } else { + + value_type *vptr; + + if (value_is_small) + + vptr = &(row_item.template raw_write_value()); + + else + + vptr = row_item.template raw_write_value(); + + + + e->row_container.install_cell(key.cell_num(), vptr); + + } + + } + + + + txn.set_version_unlock(e->row_container.version_at(key.cell_num()), item); + + } + + } + + + + void unlock(TransItem& item) override { + + assert(!is_internode(item)); + + if constexpr (table_params::track_nodes) { + + if (is_ttnv(item)) { + + auto n = get_internode_address(item); + + static_cast(n)->get_aux_tracker()->cp_unlock(item); + + return; + + } + + } + + auto key = item.key(); + + auto e = key.internal_elem_ptr(); + + if (key.is_row_item()) + + e->version().cp_unlock(item); + + else + + 
e->row_container.version_at(key.cell_num()).cp_unlock(item); + + } + + + + void cleanup(TransItem& item, bool committed) override { + + if (committed ? has_delete(item) : has_insert(item)) { + + auto key = item.key(); + + assert(key.is_row_item()); + + internal_elem *e = key.internal_elem_ptr(); + + bool ok = _remove(e->key); + + + + + + if (!ok) { + + std::cout << "committed=" << committed << ", " + + << "has_delete=" << has_delete(item) << ", " + + << "has_insert=" << has_insert(item) << ", " + + << "locked_at_commit=" << item.locked_at_commit() << std::endl; + + always_assert(false, "insert-bit exclusive ownership violated"); + + } + + item.clear_needs_unlock(); + + } + + } + + + +protected: + + template + + class range_scanner { + + public: + + range_scanner(const Str upper, NodeCallback ncb, ValueCallback vcb, int limit) : + + boundary_(upper), boundary_compar_(false), scan_succeeded_(true), limit_(limit), scancount_(0), + + node_callback_(ncb), value_callback_(vcb) {} + + + + template + + void check(const ITER& iter, const KEY& key) { + + int min = std::min(boundary_.length(), key.prefix_length()); + + int cmp = memcmp(boundary_.data(), key.full_string().data(), min); + + if (!Reverse) { + + if (cmp < 0 || (cmp == 0 && boundary_.length() <= key.prefix_length())) + + boundary_compar_ = true; + + else if (cmp == 0) { + + uint64_t last_ikey = iter.node()->ikey0_[iter.permutation()[iter.permutation().size() - 1]]; + + uint64_t slice = string_slice::make_comparable(boundary_.data() + key.prefix_length(), + + std::min(boundary_.length() - key.prefix_length(), 8)); + + boundary_compar_ = (slice <= last_ikey); + + } + + } else { + + if (cmp >= 0) + + boundary_compar_ = true; + + } + + } + + + + template + + void visit_leaf(const ITER& iter, const Masstree::key& key, threadinfo&) { + + if (!node_callback_(iter.node(), iter.full_version_value())) { + + scan_succeeded_ = false; + + } + + if (this->boundary_) { + + check(iter, key); + + } + + } + + + + bool 
visit_value(const Masstree::key& key, internal_elem *e, threadinfo&) { + + if (this->boundary_compar_) { + + if ((Reverse && (boundary_ >= key.full_string())) || + + (!Reverse && (boundary_ <= key.full_string()))) + + return false; + + } + + bool visited = false; + + bool count = true; + + if (!value_callback_(key.full_string(), e, visited, count)) { + + scan_succeeded_ = false; + + if (count) {++scancount_;} + + return false; + + } else { + + if (!visited) + + scan_succeeded_ = false; + + if (count) {++scancount_;} + + if (limit_ > 0 && scancount_ >= limit_) { + + return false; + + } + + return visited; + + } + + } + + + + Str boundary_; + + bool boundary_compar_; + + bool scan_succeeded_; + + int limit_; + + int scancount_; + + + + NodeCallback node_callback_; + + ValueCallback value_callback_; + + }; + + + +private: + + MTrie_table_type MTrie_table; + + //table_type table_; + + uint64_t key_gen_; + + + + + + static bool + + access_all(std::array& cell_accesses, std::array& cell_items, value_container_type& row_container) { + + for (size_t idx = 0; idx < cell_accesses.size(); ++idx) { + + auto& access = cell_accesses[idx]; + + auto proxy = TransProxy(*Sto::transaction(), *cell_items[idx]); + + if (static_cast(access) & static_cast(access_t::read)) { + + if (!proxy.observe(row_container.version_at(idx))) + + return false; + + } + + if (static_cast(access) & static_cast(access_t::write)) { + + if (!proxy.acquire_write(row_container.version_at(idx))) + + return false; + + if (proxy.item().key().is_row_item()) { + + proxy.item().add_flags(row_cell_bit); + + } + + } + + } + + return true; + + } + + + + static bool has_insert(const TransItem& item) { + + return (item.flags() & insert_bit) != 0; + + } + + static bool has_delete(const TransItem& item) { + + return (item.flags() & delete_bit) != 0; + + } + + static bool has_row_update(const TransItem& item) { + + return (item.flags() & row_update_bit) != 0; + + } + + static bool has_row_cell(const TransItem& item) { + + 
return (item.flags() & row_cell_bit) != 0; + + } + + static bool is_phantom(internal_elem *e, const TransItem& item) { + + return (!e->valid() && !has_insert(item)); + + } + + + + bool register_internode_version(node_type *node, unlocked_cursor_type& cursor) { + + if constexpr (table_params::track_nodes) { + + return ttnv_register_node_read_with_snapshot(node, *cursor.get_aux_tracker()); + + } else { + + TransProxy item = Sto::item(this, get_internode_key(node)); + + if constexpr (DBParams::Opaque) { + + return item.add_read_opaque(cursor.full_version_value()); + + } else { + + return item.add_read(cursor.full_version_value()); + + } + + } + + } + + + + bool register_internode_version(internal_elem * e) { + + TransProxy row_item = Sto::item(this, item_key_t::row_item_key(e)); + + return row_item.add_read(e->version()); + + + + } + + + + // Used in scan helpers to track leaf node timestamps for phantom protection. + + bool scan_track_node_version(node_type *node, nodeversion_value_type nodeversion) { + + if constexpr (table_params::track_nodes) { + + (void)nodeversion; + + return ttnv_register_node_read(node); + + } else { + + TransProxy item = Sto::item(this, get_internode_key(node)); + + if constexpr (DBParams::Opaque) { + + return item.add_read_opaque(nodeversion); + + } else { + + return item.add_read(nodeversion); + + } + + } + + } + + + + bool update_internode_version(node_type *node, + + nodeversion_value_type prev_nv, nodeversion_value_type new_nv) { + + ttnv_register_node_write(node); + + TransProxy item = Sto::item(this, get_internode_key(node)); + + if (!item.has_read()) { + + return true; + + } + + if (prev_nv == item.template read_value()) { + + item.update_read(prev_nv, new_nv); + + return true; + + } + + return false; + + } + + + + void ttnv_register_node_write(node_type* node) { + + (void)node; + + if constexpr (table_params::track_nodes) { + + static_assert(DBParams::TicToc, "Node tracking requires TicToc."); + + always_assert(node->isleaf(), 
"Tracking non-leaf node!!"); + + auto tt_item = Sto::item(this, get_ttnv_key(node)); + + tt_item.acquire_write(*static_cast(node)->get_aux_tracker()); + + } + + } + + + + bool ttnv_register_node_read_with_snapshot(node_type* node, typename table_params::aux_tracker_type& snapshot) { + + (void)node; (void)snapshot; + + if constexpr (table_params::track_nodes) { + + static_assert(DBParams::TicToc, "Node tracking requires TicToc."); + + always_assert(node->isleaf(), "Tracking non-leaf node!!"); + + auto tt_item = Sto::item(this, get_ttnv_key(node)); + + return tt_item.observe(*static_cast(node)->get_aux_tracker(), snapshot); + + } else { + + return true; + + } + + } + + + + bool ttnv_register_node_read(node_type* node) { + + (void)node; + + if constexpr (table_params::track_nodes) { + + static_assert(DBParams::TicToc, "Node tracking requires TicToc."); + + always_assert(node->isleaf(), "Tracking non-leaf node!!"); + + auto tt_item = Sto::item(this, get_ttnv_key(node)); + + return tt_item.observe(*static_cast(node)->get_aux_tracker()); + + } else { + + return true; + + } + + } + + + + bool _remove(const key_type& key) { + + //cursor_type lp(table_, key); + + //bool found = lp.find_locked(*ti); + + + + bool MTrie_found =MTrie_table->probe(&key,sizeof(key)) ; + + + + + + if (MTrie_found) { + + + + void * vbuf_out = MTrie_table->get(MTrie_table->ref,&key,sizeof(key)); + + + + internal_elem ** e = reinterpret_cast(vbuf_out); + + + + + + internal_elem *el = *e; + + + + //remove from MTrie_table + + MTrie_table->del(&el->key,sizeof(&el->key)); + + + + Transaction::rcu_delete(el); + + } else { + + // XXX is this correct? 
+ + + + } + + return MTrie_found; + + } + + + + static uintptr_t get_internode_key(node_type* node) { + + return reinterpret_cast(node) | internode_bit; + + } + + static bool is_internode(TransItem& item) { + + return (item.key() & internode_bit) != 0; + + } + + static node_type *get_internode_address(TransItem& item) { + + if (is_internode(item)) { + + return reinterpret_cast(item.key() & ~internode_bit); + + } else if (is_ttnv(item)) { + + return reinterpret_cast(item.key() & ~ttnv_bit); + + } + + assert(false); + + return nullptr; + + } + + + + static uintptr_t get_ttnv_key(node_type* node) { + + return reinterpret_cast(node) | ttnv_bit; + + } + + static bool is_ttnv(TransItem& item) { + + return (item.key() & ttnv_bit); + + } + + + + static void copy_row(internal_elem *e, comm_type &comm) { + + comm.operate(e->row_container.row); + + } + + static void copy_row(internal_elem *e, const value_type *new_row) { + + if (new_row == nullptr) + + return; + + e->row_container.row = *new_row; + + } + +}; + + + +template + +__thread typename MTrie_ordered_index::table_params::threadinfo_type* MTrie_ordered_index::ti; + + + +template + +class MTrie_mvcc_ordered_index : public TObject { + +public: + + typedef K key_type; + + typedef V value_type; + + typedef commutators::Commutator comm_type; + + + + static constexpr bool Commute = DBParams::Commute; + + + + static constexpr TransItem::flags_type insert_bit = TransItem::user0_bit; + + static constexpr TransItem::flags_type delete_bit = TransItem::user0_bit << 1u; + + static constexpr TransItem::flags_type row_update_bit = TransItem::user0_bit << 2u; + + static constexpr TransItem::flags_type row_cell_bit = TransItem::user0_bit << 3u; + + static constexpr uintptr_t internode_bit = 1; + + + + typedef typename value_type::NamedColumn NamedColumn; + + + + static constexpr bool index_read_my_write = DBParams::RdMyWr; + + + + typedef typename index_common::MvInternalElement internal_elem; + + + + struct table_params : public 
Masstree::nodeparams<15,15> { + + typedef internal_elem* value_type; + + typedef Masstree::value_print value_print_type; + + typedef threadinfo threadinfo_type; + + }; + + + + typedef Masstree::Str Str; + + typedef Masstree::basic_table table_type; + + typedef Masstree::unlocked_tcursor unlocked_cursor_type; + + typedef Masstree::tcursor cursor_type; + + typedef Masstree::leaf leaf_type; + + + + typedef typename table_type::node_type node_type; + + typedef typename unlocked_cursor_type::nodeversion_value_type nodeversion_value_type; + + + + typedef MassTrie* MTrie_table_type; + + + + using accessor_t = typename index_common::accessor_t; + + + + typedef std::tuple sel_return_type; + + typedef std::tuple ins_return_type; + + typedef std::tuple del_return_type; + + typedef std::tuple> sel_split_return_type; + + + + using index_t = MTrie_mvcc_ordered_index; + + using column_access_t = typename split_version_helpers::column_access_t; + + using item_key_t = typename split_version_helpers::item_key_t; + + template static constexpr auto mvcc_column_to_cell_accesses = + + split_version_helpers::template mvcc_column_to_cell_accesses; + + template static constexpr auto extract_item_list = + + split_version_helpers::template extract_item_list; + + using MvSplitAccessAll = typename split_version_helpers::template MvSplitAccessAll>; + + + + static __thread typename table_params::threadinfo_type *ti; + + + + MTrie_mvcc_ordered_index(size_t init_size) { + + this->table_init(); + + (void)init_size; + + } + + MTrie_mvcc_ordered_index() { + + this->table_init(); + + } + + + + void table_init() { + + static_assert(DBParams::Opaque, "MVCC must operate in opaque mode."); + + if (ti == nullptr) + + ti = threadinfo::make(threadinfo::TI_MAIN, -1); + + table_.initialize(*ti); + + key_gen_ = 0; + + } + + + + static void thread_init() { + + if (ti == nullptr) + + ti = threadinfo::make(threadinfo::TI_PROCESS, TThread::id()); + + Transaction::tinfo[TThread::id()].trans_start_callback = []() { + 
+ ti->rcu_start(); + + }; + + Transaction::tinfo[TThread::id()].trans_end_callback = []() { + + ti->rcu_stop(); + + }; + + } + + + + uint64_t gen_key() { + + return fetch_and_add(&key_gen_, 1); + + } + + + + sel_return_type + + select_row(const key_type& key, RowAccess acc) { + + unlocked_cursor_type lp(table_, key); + + bool found = lp.find_unlocked(*ti); + + internal_elem *e = lp.value(); + + if (found) { + + return select_row(reinterpret_cast(e), acc); + + } else { + + if (!register_internode_version(lp.node(), lp.full_version_value())) + + goto abort; + + return sel_return_type(true, false, 0, nullptr); + + } + + + + abort: + + return sel_return_type(false, false, 0, nullptr); + + } + + + + sel_return_type + + select_row(const key_type& key, std::initializer_list accesses) { + + unlocked_cursor_type lp(table_, key); + + bool found = lp.find_unlocked(*ti); + + internal_elem *e = lp.value(); + + if (found) { + + return select_row(reinterpret_cast(e), accesses); + + } else { + + if (!register_internode_version(lp.node(), lp.full_version_value())) + + return sel_return_type(false, false, 0, nullptr); + + return sel_return_type(true, false, 0, nullptr); + + } + + + + return sel_return_type(false, false, 0, nullptr); + + } + + + + // Split version select row + + sel_split_return_type + + select_split_row(const key_type& key, std::initializer_list accesses) { + + unlocked_cursor_type lp(table_, key); + + bool found = lp.find_unlocked(*ti); + + internal_elem *e = lp.value(); + + + + + + if (found) { + + return select_splits(reinterpret_cast(e), accesses); + + } else { + + return { + + register_internode_version(lp.node(), lp.full_version_value()), + + false, + + 0, + + SplitRecordAccessor({ nullptr }) + + }; + + } + + } + + + + sel_split_return_type + + select_splits(uintptr_t rid, std::initializer_list accesses) { + + using split_params = SplitParams; + + auto e = reinterpret_cast(rid); + + auto cell_accesses = mvcc_column_to_cell_accesses(accesses); + + bool found; + 
+ auto result = MvSplitAccessAll::run_select(&found, cell_accesses, this, e); + + return {true, found, rid, SplitRecordAccessor(result)}; + + } + + + + void update_row(uintptr_t rid, value_type* new_row) { + + // Update entire row using overwrite. + + // In timestamp-split tables, this will add a write set item to each "cell item". + + MvSplitAccessAll::run_update(this, reinterpret_cast(rid), new_row); + + } + + + + void update_row(uintptr_t rid, const comm_type &comm) { + + // Update row using commutatively. + + // In timestamp-split tables, this will add a commutator to each "cell item". The + + // per-cell commutators should be supplied by the user (defined for each split) and + + // they should be subclasses of the row commutator. + + // Internally this run_update() implementation below uses a down-cast to convert + + // row commutators to cell commutators. + + MvSplitAccessAll::run_update(this, reinterpret_cast(rid), comm); + + } + + + + // insert assumes common case where the row doesn't exist in the table + + // if a row already exists, then use select (FOR UPDATE) instead + + ins_return_type + + insert_row(const key_type& key, value_type *vptr, bool overwrite = false) { + + cursor_type lp(table_, key); + + bool found = lp.find_insert(*ti); + + bool should_abort = false; + + internal_elem *e; + + if (!found) { + + e = new internal_elem(this, key); + + lp.value() = e; + + + + node_type *node; + + nodeversion_value_type orig_nv; + + nodeversion_value_type new_nv; + + + + bool split_right = (lp.node() != lp.original_node()); + + if (split_right) { + + node = lp.original_node(); + + orig_nv = lp.original_version_value(); + + new_nv = lp.updated_version_value(); + + } else { + + node = lp.node(); + + orig_nv = lp.previous_full_version_value(); + + new_nv = lp.next_full_version_value(1); + + } + + + + fence(); + + lp.finish(1, *ti); + + //fence(); + + + + // update the node version already in the read set and modified by split + + should_abort = 
!update_internode_version(node, orig_nv, new_nv); + + } else { + + e = lp.value(); + + lp.finish(0, *ti); + + } + + + + if (!should_abort) { + + // NB: the insert method only manipulates the row_item. It is possible + + // this insert is overwriting some previous updates on selected columns + + // The expected behavior is that this row-level operation should overwrite + + // all changes made by previous updates (in the same transaction) on this + + // row. We achieve this by granting this row_item a higher priority. + + // During the install phase, if we notice that the row item has already + + // been locked then we simply ignore installing any changes made by cell items. + + // It should be trivial for a cell item to find the corresponding row item + + // and figure out if the row-level version is locked. + + + + // Use cell-id 0 to represent the row item. + + auto row_item = Sto::item(this, item_key_t(e, 0)); + + + + auto h = e->template chain_at<0>()->find(txn_read_tid()); + + found = !h->status_is(DELETED); + + if (is_phantom(h, row_item)) { + + MvAccess::read(row_item, h); + + auto val_ptrs = TxSplitInto(vptr); + + for (size_t cell_id = 0; cell_id < SplitParams::num_splits; ++cell_id) { + + TransProxy cell_item = Sto::item(this, item_key_t(e, cell_id)); + + cell_item.add_write(val_ptrs[cell_id]); + + cell_item.add_flags(insert_bit); + + } + + return ins_return_type(true, false); + + } + + + + if (index_read_my_write) { + + if (has_delete(row_item)) { + + auto proxy = row_item.clear_flags(delete_bit).clear_write(); + + proxy.add_write(*vptr); + + return ins_return_type(true, false); + + } + + } + + + + if (overwrite) { + + for (size_t i = 0; i < SplitParams::num_splits; ++i) { + + auto item = Sto::item(this, item_key_t(e, i)); + + item.add_write(); + + } + + this->update_row(reinterpret_cast(e), vptr); + + } else { + + // TODO: This now acts like a full read of the value + + // at rtid. Once we add predicates we can change it to + + // something else. 
+ + MvAccess::read(row_item, h); + + } + + + + return ins_return_type(true, found); + + } + + + + return ins_return_type(false, false); + + } + + + + del_return_type + + delete_row(const key_type& key) { + + unlocked_cursor_type lp(table_, key); + + bool found = lp.find_unlocked(*ti); + + if (found) { + + internal_elem *e = lp.value(); + + // Use cell 0 to probe for existence of the row. + + auto row_item = Sto::item(this, item_key_t(e, 0)); + + auto h = e->template chain_at<0>()->find(txn_read_tid()); + + + + if (is_phantom(h, row_item)) { + + MvAccess::read(row_item, h); + + return del_return_type(true, false); + + } + + + + if (index_read_my_write) { + + if (has_delete(row_item)) + + return del_return_type(true, false); + + if (h->status_is(DELETED) && has_insert(row_item)) { + + for (size_t i = 0; i < SplitParams::num_splits; i++) { + + auto item = Sto::item(this, item_key_t(e, i)); + + item.add_flags(delete_bit); + + } + + return del_return_type(true, true); + + } + + } + + + + MvAccess::read(row_item, h); + + if (h->status_is(DELETED)) + + return del_return_type(true, false); + + for (size_t i = 0; i < SplitParams::num_splits; i++) { + + auto item = Sto::item(this, item_key_t(e, i)); + + item.add_write(0); + + item.add_flags(delete_bit); + + } + + } else { + + if (!register_internode_version(lp.node(), lp.full_version_value())) + + goto abort; + + } + + + + return del_return_type(true, found); + + + + abort: + + return del_return_type(false, false); + + } + + + + template + + bool range_scan(const key_type& begin, const key_type& end, Callback callback, + + std::initializer_list accesses, + + bool phantom_protection = true, int limit = -1) { + + //instructed to be an empty function + + return true; + + } + + + + template + + bool range_scan(const key_type& begin, const key_type& end, Callback callback, + + RowAccess access, bool phantom_protection = true, int limit = -1) { + + //instructed to be an empty function + + return true; + + } + + + + bool 
nontrans_get(const key_type& k, value_type* value_out) { + + unlocked_cursor_type lp(table_, k); + + bool found = lp.find_unlocked(*ti); + + if (found) { + + internal_elem *e = lp.value(); + + MvSplitAccessAll::run_nontrans_get(value_out, e); + + return true; + + } else { + + return false; + + } + + } + + + + void nontrans_put(const key_type& k, const value_type& v) { + + cursor_type lp(table_, k); + + bool found = lp.find_insert(*ti); + + if (found) { + + internal_elem *e = lp.value(); + + MvSplitAccessAll::run_nontrans_put(v, e); + + lp.finish(0, *ti); + + } else { + + internal_elem *e = new internal_elem(this, k); + + MvSplitAccessAll::run_nontrans_put(v, e); + + lp.value() = e; + + lp.finish(1, *ti); + + } + + } + + + + template + + bool lock_impl_per_chain(TransItem& item, Transaction& txn, MvObject* chain) { + + return mvcc_chain_operations::lock_impl_per_chain(item, txn, chain); + + } + + template + + bool check_impl_per_chain(TransItem& item, Transaction& txn, MvObject* chain) { + + return mvcc_chain_operations::check_impl_per_chain(item, txn, chain); + + } + + template + + void install_impl_per_chain(TransItem& item, Transaction& txn, MvObject* chain, void (*dcb)(void*)) { + + mvcc_chain_operations::install_impl_per_chain(item, txn, chain, dcb); + + } + + template + + void cleanup_impl_per_chain(TransItem& item, bool committed, MvObject* chain) { + + mvcc_chain_operations::cleanup_impl_per_chain(item, committed, chain); + + } + + + + // TObject interface methods + + bool lock(TransItem& item, Transaction& txn) override { + + assert(!is_internode(item)); + + auto key = item.key(); + + return MvSplitAccessAll::run_lock(key.cell_num(), txn, item, this, key.internal_elem_ptr()); + + } + + + + bool check(TransItem& item, Transaction& txn) override { + + if (is_internode(item)) { + + node_type *n = get_internode_address(item); + + auto curr_nv = static_cast(n)->full_version_value(); + + auto read_nv = item.template read_value(); + + auto result = (curr_nv == 
read_nv); + + TXP_ACCOUNT(txp_tpcc_check_abort1, txn.special_txp && !result); + + return result; + + } else { + + int cell_id = item.key().cell_num(); + + return MvSplitAccessAll::run_check(cell_id, txn, item, this); + + } + + } + + + + void install(TransItem& item, Transaction& txn) override { + + assert(!is_internode(item)); + + auto key = item.key(); + + MvSplitAccessAll::run_install(key.cell_num(), txn, item, this, has_delete(item) ? _delete_cb2 : nullptr); + + } + + + + void unlock(TransItem& item) override { + + (void)item; + + assert(!is_internode(item)); + + } + + + + void cleanup(TransItem& item, bool committed) override { + + assert(!is_internode(item)); + + auto key = item.key(); + + MvSplitAccessAll::run_cleanup(key.cell_num(), item, committed, this); + + } + + + +//protected: + + template + + class range_scanner { + + public: + + range_scanner(const Str upper, NodeCallback ncb, ValueCallback vcb, int limit) : + + boundary_(upper), boundary_compar_(false), scan_succeeded_(true), limit_(limit), scancount_(0), + + node_callback_(ncb), value_callback_(vcb) {} + + + + template + + void check(const ITER& iter, const KEY& key) { + + int min = std::min(boundary_.length(), key.prefix_length()); + + int cmp = memcmp(boundary_.data(), key.full_string().data(), min); + + if (!Reverse) { + + if (cmp < 0 || (cmp == 0 && boundary_.length() <= key.prefix_length())) + + boundary_compar_ = true; + + else if (cmp == 0) { + + uint64_t last_ikey = iter.node()->ikey0_[iter.permutation()[iter.permutation().size() - 1]]; + + uint64_t slice = string_slice::make_comparable(boundary_.data() + key.prefix_length(), + + std::min(boundary_.length() - key.prefix_length(), 8)); + + boundary_compar_ = (slice <= last_ikey); + + } + + } else { + + if (cmp >= 0) + + boundary_compar_ = true; + + } + + } + + + + template + + void visit_leaf(const ITER& iter, const Masstree::key& key, threadinfo&) { + + if (!node_callback_(iter.node(), iter.full_version_value())) { + + scan_succeeded_ = 
false; + + } + + if (this->boundary_) { + + check(iter, key); + + } + + } + + + + bool visit_value(const Masstree::key& key, internal_elem *e, threadinfo&) { + + if (this->boundary_compar_) { + + if ((Reverse && (boundary_ >= key.full_string())) || + + (!Reverse && (boundary_ <= key.full_string()))) + + return false; + + } + + bool visited = false; + + bool count = true; + + if (!value_callback_(key.full_string(), e, visited, count)) { + + scan_succeeded_ = false; + + if (count) {++scancount_;} + + return false; + + } else { + + if (!visited) + + scan_succeeded_ = false; + + if (count) {++scancount_;} + + if (limit_ > 0 && scancount_ >= limit_) { + + return false; + + } + + return visited; + + } + + } + + + + Str boundary_; + + bool boundary_compar_; + + bool scan_succeeded_; + + int limit_; + + int scancount_; + + + + NodeCallback node_callback_; + + ValueCallback value_callback_; + + }; + + + +//private: + + MTrie_table_type MTrie_table; + + table_type table_; + + uint64_t key_gen_; + + + + //static bool + + //access_all(std::array&, std::array&, internal_elem*) { + + // always_assert(false, "Not implemented."); + + // return true; + + //} + + + + static TransactionTid::type txn_read_tid() { + + return Sto::read_tid(); + + } + + + + static bool has_insert(const TransItem& item) { + + return (item.flags() & insert_bit) != 0; + + } + + static bool has_delete(const TransItem& item) { + + return (item.flags() & delete_bit) != 0; + + } + + static bool has_row_update(const TransItem& item) { + + return (item.flags() & row_update_bit) != 0; + + } + + static bool has_row_cell(const TransItem& item) { + + return (item.flags() & row_cell_bit) != 0; + + } + + template + + static bool is_phantom(const MvHistory* h, const TransItem& item) { + + return (h->status_is(DELETED) && !has_insert(item)); + + } + + + + bool register_internode_version(node_type *node, nodeversion_value_type nodeversion) { + + TransProxy item = Sto::item(this, get_internode_key(node)); + + return 
item.add_read(nodeversion); + + } + + + + bool register_internode_version(internal_elem * e) { + + TransProxy item = Sto::item(this, item_key_t::row_item_key(e)); + + return item.add_read(e->version()); + + } + + + + + + bool update_internode_version(node_type *node, + + nodeversion_value_type prev_nv, nodeversion_value_type new_nv) { + + TransProxy item = Sto::item(this, get_internode_key(node)); + + if (!item.has_read()) { + + return true; + + } + + if (prev_nv == item.template read_value()) { + + item.update_read(prev_nv, new_nv); + + return true; + + } + + return false; + + } + + + + static void _delete_cb2(void* history_ptr) { + + using history_type = typename internal_elem::object0_type::history_type; + + auto hp = reinterpret_cast(history_ptr); + + auto obj = hp->object(); + + if (obj->find_latest(false) == hp) { + + auto el = internal_elem::from_chain(obj); + + auto table = reinterpret_cast*>(el->table); + + cursor_type lp(table->table_, el->key); + + if (lp.find_locked(*table->ti) && lp.value() == el) { + + hp->status_poisoned(); + + if (obj->find_latest(true) == hp) { + + lp.finish(-1, *table->ti); + + Transaction::rcu_call(gc_internal_elem, el); + + } else { + + hp->status_unpoisoned(); + + lp.finish(0, *table->ti); + + } + + } else { + + lp.finish(0, *table->ti); + + } + + } + + } + + + + static void gc_internal_elem(void* el_ptr) { + + auto el = reinterpret_cast(el_ptr); + + delete el; + + } + + + + static uintptr_t get_internode_key(node_type* node) { + + return reinterpret_cast(node) | internode_bit; + + } + + static bool is_internode(TransItem& item) { + + return (item.key() & internode_bit) != 0; + + } + + static node_type *get_internode_address(TransItem& item) { + + assert(is_internode(item)); + + + + return reinterpret_cast(item.key() & ~internode_bit); + + } + +}; + + + +template + +__thread typename MTrie_mvcc_ordered_index::table_params::threadinfo_type* MTrie_mvcc_ordered_index::ti; + + + +} // namespace bench \ No newline at end of file 
diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..48e341a0 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,3 @@ +{ + "lockfileVersion": 1 +} diff --git a/run/MassTrie-beta/MassTrie.hh b/run/MassTrie-beta/MassTrie.hh new file mode 100644 index 00000000..53cfd776 --- /dev/null +++ b/run/MassTrie-beta/MassTrie.hh @@ -0,0 +1,318 @@ +#include + +#include + +#include + +#include + +#include + +#include + +#include "wormhole/lib.h" + +#include "wormhole/kv.h" + +#include "wormhole/wh.h" + +#define NUM_THREADS 64 + +#define MAX_SIZE 64 + +using namespace std; + +//~~~~~~~~~CLASS MASSTRIE~~~~~~~~~~~~~~ + +class MassTrie +{ + +public: + // constructor + + MassTrie() + { + + // creating wh wormhole mapping key to internal_elem (as uintptr_t) + + wh = wh_create(); + + ref = wh_ref(this->wh); + + iter = wh_iter_create(this->ref); + + this->kbuf_out = (void *)malloc(sizeof(char) * MAX_SIZE); + + this->vbuf_out = (void *)malloc(sizeof(char) * MAX_SIZE); + + r = false; + } + + // destructor + + ~MassTrie() + { + + wh_iter_destroy(this->iter); + + wh_unref(this->ref); + + wh_clean(this->wh); + + wh_destroy(this->wh); + + free(kbuf_out); + + free(vbuf_out); + } + + //~~~~~~~~~MASSTRIE FUNCTIONS~~~~~~~~~~~~~~ + + // put function - putting a uintptr_t which is the internal_elem + + bool put(const void *key, int klen, const void *value, int vlen) + { + + return (wh_put(this->ref, key, klen, value, vlen)); + } + + // get function + + void *get(struct wormref *const ref, const void *key, int klen) + { + + // variables + + // bool r; + + u32 vlen_out = 0; + + // get action performed + + r = wh_get(ref, key, klen, vbuf_out, sizeof(vbuf_out), &vlen_out); + + return r ? 
vbuf_out : nullptr; + } + + // delete function + + bool del(const void *key, int klen) + { + + return (wh_del(this->ref, key, klen)); + } + + // probe function - returns true if key exists, false otherwise + + bool probe(const void *key, int klen) + { + + r = (wh_probe(this->ref, key, klen)); + + return r; + } + + // finds the closest pointer currently in the MassTrie + + // to a pointer passed as a parameter + + void *find_closest(const void *key) + { + + // variables + + u32 klen_out = 0; + + u32 vlen_out = 0; + + // bool r; + + int min = INT_MAX; + + int curr; + + void *res = NULL; + + // search loop + + wh_iter_seek(this->iter, NULL, 0); // seek to the head + + // printf("wh_iter_seek closest pointer to key\"\"\n"); + + while (wh_iter_valid(this->iter)) + { + + r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) + { + + // calculate disatnce + + curr = abs((long)(reinterpret_cast(kbuf_out)) - (long)(reinterpret_cast(key))); + + if (curr < min) + { + + // perform malloc + + if (!res) + + res = (void *)malloc(sizeof(char) * MAX_SIZE); + + // error handling + + if (res == NULL) + { + + printf("Error! memory not allocated."); + + exit(1); + } + + min = curr; + + // cout<<"curr = "<iter); + + memset(kbuf_out, 0, sizeof(kbuf_out)); + + memset(vbuf_out, 0, sizeof(vbuf_out)); + } + + return (res != NULL) ? 
res : nullptr; + } + + // deletes all from MassTrie + + void delete_all() + { + + // variables + + u32 klen_out = 0; + + u32 vlen_out = 0; + + // bool + + // search loop + + wh_iter_seek(this->iter, NULL, 0); // seek to the head + + // printf("wh_iter_seek closest pointer to key\"\"\n"); + + while (wh_iter_valid(this->iter)) + { + + r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) + { + + // delete key + + this->del(kbuf_out, sizeof(kbuf_out)); + } + + else + { + + printf("ERROR!\n"); + } + + wh_iter_skip1(this->iter); + + memset(kbuf_out, 0, sizeof(kbuf_out)); + + memset(vbuf_out, 0, sizeof(vbuf_out)); + } + } + + // data members + + struct wormhole *wh; + + struct wormref *ref; + + struct wormhole_iter *iter; + + void *kbuf_out; + + void *vbuf_out; + + bool r; + +}; // class MassTrie + +/** + +//override the << operation + + + +ostream& operator<<(ostream &os, MassTrie* m){ + + + +u32 klen_out = 0; + + char kbuf_out[MAX_SIZE] = {}; + + u32 vlen_out = 0; + + char vbuf_out[MAX_SIZE] = {}; + + bool r; + + + + wh_iter_seek(m->iter, NULL, 0); // seek to the head + + printf("wh_iter_seek \"\"\n"); + + while (wh_iter_valid(m->iter)) { + + r = wh_iter_peek(m->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) { + + os << "wh_iter_peek: key = "<(kbuf_out)<<" , klen = "<< klen_out<<" , "<< + + " value= "<(vbuf_out) << ", vlen= "<< vlen_out<iter); + + + + memset(kbuf_out,0,sizeof(kbuf_out)); + + memset(vbuf_out,0,sizeof(vbuf_out)); + + } + + return os; + +} + + + +**/ diff --git a/run/MassTrie-beta/wormhole/LICENSE b/run/MassTrie-beta/wormhole/LICENSE new file mode 100644 index 00000000..f288702d --- /dev/null +++ b/run/MassTrie-beta/wormhole/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/run/MassTrie-beta/wormhole/Makefile b/run/MassTrie-beta/wormhole/Makefile new file mode 100644 index 00000000..f00e6b59 --- /dev/null +++ b/run/MassTrie-beta/wormhole/Makefile @@ -0,0 +1,45 @@ +# Makefile +# rules (always with .out) +# SRC-X.out += abc # extra source: abc.c +# MOD-X.out += abc # extra module: abc.c abc.h +# ASM-X.out += abc # extra assembly: abc.S +# DEP-X.out += abc # extra dependency: abc +# FLG-X.out += -finline # extra flags +# LIB-X.out += abc # extra -labc options + +# X.out : xyz.h xyz.c # for extra dependences that are to be compiled/linked. + +# X => X.out +TARGETS += easydemo concbench stresstest +# X => X.c only +SOURCES += +# X => X.S only +ASSMBLY += +# X => X.c X.h +MODULES += lib kv wh +# X => X.h +HEADERS += ctypes + +FLG += +LIB += m + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),FreeBSD) +LIB += execinfo +endif + +# when $ make FORKER_PAPI=y +ifeq ($(strip $(FORKER_PAPI)),y) +LIB += papi +FLG += -DFORKER_PAPI +endif + +bin : libwh.so +libwh.so : Makefile Makefile.common lib.c lib.h kv.c kv.h wh.c wh.h wh.strip + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) -shared -fPIC) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ lib.c kv.c wh.c $(ALLLIB) + strip --strip-all --discard-all @wh.strip $@ + + +include Makefile.common diff --git a/run/MassTrie-beta/wormhole/Makefile.common b/run/MassTrie-beta/wormhole/Makefile.common new file mode 100644 index 00000000..ecd761e7 --- /dev/null +++ b/run/MassTrie-beta/wormhole/Makefile.common @@ -0,0 +1,216 @@ +#usage: include Makefile.common at the end of your Makefile + +# no builtin rules/vars (CC, CXX, etc. 
are still defined but will be empty) +MAKEFLAGS += -r -R + +HDR = $(addsuffix .h,$(MODULES) $(HEADERS)) +SRC = $(addsuffix .c,$(MODULES) $(SOURCES)) +ASM = $(addsuffix .S,$(ASSMBLY)) +OBJ = $(addsuffix .o,$(MODULES) $(SOURCES) $(ASSEMBLY)) +DEP = Makefile.common Makefile $(HDR) $(EXTERNDEP) $(EXTERNSRC) +BIN = $(addsuffix .out,$(TARGETS)) +DIS = $(addsuffix .dis,$(TARGETS)) + +# clang: +# EXTRA="-Rpass=loop-vectorize" # IDs loops that were successfully V-ed +# EXTRA="-Rpass-missed=loop-vectorize" # IDs loops that failed V +# EXTRA="-Rpass-analysis=loop-vectorize" # IDs the statements that caused V to fail +# EXTRA="-Rpass=\ *" # remarks for all passes +# other passes: https://llvm.org/docs/Passes.html + +O ?= rg + +# predefined OPT: make O={rg,r,0g,3g,p,0s,3s,cov,mc,hc,wn,stk} +ifeq ($O,rg) # make O=rg +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector +else ifeq ($O,r) # make O=r (for release) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector +else ifeq ($O,ns) # make O=ns (no signal handlers) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector -DNOSIGNAL +else ifeq ($O,0g) # make O=0g +OPT ?= -g3 -O0 -fno-inline +else ifeq ($O,2g) # make O=2g +OPT ?= -g3 -O2 +else ifeq ($O,3g) # make O=3g +OPT ?= -g3 -O3 -flto -fno-inline +else ifeq ($O,p) # make O=p (profiling: rg+noinline) +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector -fno-inline +else ifeq ($O,0s) # make O=0s (address sanitizer) +OPT ?= -g3 -O0 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,3s) # make O=3s (address sanitizer) +OPT ?= -g3 -O3 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,t) # make O=0t (thread sanitizer) +OPT ?= -g3 -O1 -fno-inline -fsanitize=thread -fno-stack-protector +else ifeq ($O,cov) # make O=cov (for gcov) +OPT ?= -g3 -DNDEBUG -O0 --coverage +CCC = gcc +else ifeq ($O,mc) # make O=mc (for valgrind memcheck) +OPT ?= -g3 -O1 -fno-inline -DHEAPCHECKING +ARCH 
?= broadwell +else ifeq ($O,hc) # make O=hc (for gperftools heapcheck) +OPT ?= -g3 -O1 -fno-inline +LIB += tcmalloc +else ifeq ($O,wn) # more warning +OPT ?= -g3 -O3 -Wvla -Wformat=2 -Wconversion -Wstrict-prototypes -Wmissing-prototypes +else ifeq ($O,stk) # check stack usage with gcc +OPT ?= -g3 -O3 -DNDEBUG -fstack-usage +CCC = gcc +endif + +# malloc: g:glibc, t:tcmalloc, j:jemalloc +M ?= g + +ifeq ($M,t) + LIB += tcmalloc + FLG += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free +else ifeq ($M,j) + LIB += jemalloc +endif + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CHECK_S := -D__linux__ + LIB += rt +else ifeq ($(UNAME_S),FreeBSD) + CHECK_S := -D__FreeBSD__ + FLG += -I/usr/local/include -L/usr/local/lib + LIB += rt + LIB += execinfo + TPUT := /usr/local/bin/tput +else ifeq ($(UNAME_S),Darwin) + CHECK_S := -D__APPLE__ -D__MACH__ + # do nothing +else + $(error "Supported Platforms: Linux, FreeBSD, Darwin") +endif +TPUT ?= tput + +CCC ?= clang +CSTD = -std=gnu18 +XCC ?= clang++ +XSTD = -std=gnu++17 + +UNAME_M := $(shell uname -m) +ifeq ($(UNAME_M),aarch64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),arm64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),x86_64) + CHECK_M := -D__x86_64__ + ARCH ?= native +else ifeq ($(UNAME_M),amd64) # freebsd + CHECK_M := -D__x86_64__ + ARCH ?= native +else + $(error "Supported Platforms: aarch64, x86_64") +endif + +TUNE ?= native + +NBI += memcpy memmove memcmp + +# minimal requirement on x86_64: -march=nehalem +# minimal requirement on aarch64: -march=armv8-a+crc +FLG += -march=$(ARCH) -mtune=$(TUNE) +FLG += -pthread -Wall -Wextra -Wshadow #-Weverything +FLG += $(addprefix -fno-builtin-,$(NBI)) +FLG += $(OPT) + +ifneq ($(shell $(CCC) --version 2>/dev/null | grep clang),) +FLG += -ferror-limit=3 +CCCTYPE := clang +else ifneq ($(shell $(CCC) --version 
2>/dev/null | grep gcc),) +FLG += -fmax-errors=3 +FLG += -Wno-unknown-pragmas +CCCTYPE := gcc +else + $(error "Supported Compilers: clang, gcc") +endif + +ifeq ($(CCCTYPE),clang) + CCINST = /usr/lib/clang/$(shell $(CCC) --version 2>/dev/null | awk '/^clang/ { print $$3 }') + CCINC = $(CCINST)/include +else ifeq ($(CCCTYPE),gcc) + CCINST = /usr/lib/gcc/$(shell $(CCC) -dumpmachine)/$(shell $(CCC) -dumpversion) + CCINC = $(CCINST)/include $(CCINST)/include-fixed +endif +CCINC = /usr/include /usr/local/include + +ifneq ($(shell find $(CCINC) -name backtrace-supported.h 2>/dev/null),) + LIB += backtrace + FLG += -DBACKTRACE +endif + +ifneq ($(shell find $(CCINC) -name liburing.h 2>/dev/null),) + LIB += uring + FLG += -DLIBURING +endif + + +uniq = $(if $1,$(firstword $1) $(call uniq,$(filter-out $(firstword $1),$1))) +magentatxt := $(shell $(TPUT) setaf 5) +greentxt := $(shell $(TPUT) setaf 2) +bluetxt := $(shell $(TPUT) setaf 4) +normaltxt := $(shell $(TPUT) sgr0) + +.PHONY : bin dis def clean cleanx check tags + +bin : $(BIN) +dis : $(DIS) bin +.DEFAULT_GOAL = bin +.SECONDEXPANSION: + +ifeq ($(J),o) +# DANGER. Don't use unless it works! 
+# build from .o files but target-specific flags are missing in %.o : %.x +%.out : %.o $(OBJ) $$(addsuffix .o,$$(SRC-$$@) $$(MOD-$$@) $$(ASM-$$@)) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ $^ $(ALLLIB) +# +else # default: all-in-one command +%.out : %.c $(SRC) $(ASM) $(DEP) $$(DEP-$$@) $$(addsuffix .c,$$(SRC-$$@) $$(MOD-$$@)) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) $$(addsuffix .S,$$(ASM-$$@)) + $(eval ALLSRC := $(SRC) $(addsuffix .c,$(SRC-$@) $(MOD-$@)) $(ASM) $(addsuffix .S,$(ASM-$@))) + $(eval UNIQSRC := $(call uniq,$(ALLSRC))) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + @printf '$(bluetxt)$@$(magentatxt) <= $(greentxt)$< $(UNIQSRC)$(normaltxt)\n' + $(CCC) $(ALLFLG) -o $@ $< $(UNIQSRC) $(ALLLIB) +# +endif + + +%.dis : %.out + objdump -SlwtC $< 1>$@ 2>/dev/null + +%.o : %.cc $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(XCC) $(XSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.S $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.s : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) -S -o $@ -c $< + +def : + $(CCC) $(FLG) -dM -E - = "h"; the iter will be placed on "hello" + r = wh_iter_valid(iter); // r == true; You should always check if iter is valid after a seek() and skip() + r = wh_iter_peek(iter, buf, 6, &len_out, NULL, 0, NULL); // only need the key: will get "hello" and 5 + r = wh_iter_peek(iter, NULL, 0, NULL, buf, 6, &len_out); // only need the value: will get "world!" 
and 6 + // (you can also get both key and value using one call with two buffers) + wh_iter_skip1(iter); // skip the current key; equivalent to wh_iter_skip(iter, 1); + r = wh_iter_valid(iter); // r == false; already passed the end of the dataset + wh_iter_park(iter); // an iter may hold locks; It's a good manner to "park" the iter before sleep. + sleep(10); // not interacting with the wormhole instance. + wh_iter_seek(iter, NULL, 0); // need to do another seek to reactivate the iter + r = wh_iter_valid(iter); // r == true; on the zero-sized key now + wh_iter_destroy(iter); // now we're done with the iter + wh_del(ref, "hello", 5); // delete a key + wh_del(ref, NULL, NULL); // delete the zero-sized key + wh_unref(ref); // the current thread is no longer interested in accessing the index + wh_destroy(wh); // fully destroy the index; all references should have been released before calling this +} +``` + +## Integer keys + +Wormhole supports binary keys, which means you don't need to print integers into text when using Wormhole to index integer keys. +Here are some quick examples for using Wormhole as an integer-key index. A little-endian CPU is assumed. + +```C +{ + // 32-bit unsigned integer keys + u32 key = __builtin_bswap32(1000); // reverse byte order of key 1000 + wh_put(ref, &key, 4, NULL, 0); + key = __builtin_bswap32(2000); // reverse byte order of key 2000 +    wh_put(ref, &key, 4, NULL, 0); + struct wormhole_iter * iter = wh_iter_create(ref); + key = __builtin_bswap32(999); + wh_iter_seek(iter, &key, 4); // seek 999 + u32 key_out, len_out; + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 1000 in key_out in reversed byte order + wh_iter_skip1(iter); + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 2000 in key_out in reversed byte order +} +``` + +# Advanced APIs + +If the simple and thread-safe `wh_*` interface already meets your performance requirements, You don't need to read the following sections. 
+Using the `wormhole_*` and `whunsafe_*` APIs can maximize the efficiency of your code with a roughly 5%-10% speedup. +However, inefficient use of these APIs, such as repeatedly calling malloc() to prepare the key buffer, can easily hurt the performance. + +## `struct kv` and `struct kref` + +There are a handful of helper functions (`kv_*` and `kref_*` functions) at the beginning of wh.h. +It's worth noting that the *key's hash* (`hash` of `struct kv` and `hash32` of `struct kref`) +must be up-to-date before passed to wormhole. +The `kv_refill*` helper functions internally update the hash after filling the kv contents. +In a more general case, `kv_update_hash` directly updates a `struct kv`'s hash. +Similarly, `kref_refill_hash32()` calculates the 32-bit hash for `struct kref`. +Performing the hash calculation at the client side can achieve the best efficiency on the server (the index operations). + +## The Wormhole API + +`concbench.c` and `stresstest.c` are examples of how to use a Wormhole index. +There are three sets of Wormhole API: `whsafe`, `wormhole`, and `whunsafe`. +* `whsafe`: The *worry-free* thread-safe API. If you use Wormhole in a concurrent environment and want minimal complexity in your code, you should use `whsafe`. +* `wormhole`: The standard thread-safe API. It offers better efficiency than `whsafe` but requires some extra effort for blocking prevention. +* `whunsafe`: the thread-unsafe API. It offers the best speed and efficiency but does not perform internal concurrency control. +External synchronization should be employed when accessing `whunsafe` in a concurrent environment. + +The functions of each API can be found near the end of `wh.c` (search `kvmap_api_whsafe`, `kvmap_api_wormhole`, and `kvmap_api_whunsafe`). +Note that each API contains a mix of `whsafe_*`, `wormhole_*`, and `whunsafe_*` functions. + +### The `whsafe` API +The `whsafe` API functions are listed in the `kvmap_api_whsafe` structure in `wh.c`. 
The API consists of a mix of `wormhole_*` and `whsafe_*` functions. + +The index operations (GET, SET, UPDATE, DEL, PROBE, INPLACE, MERGE, and SCAN (`wormhole_iter_*` functions)) are all *thread safe*. +A thread needs to hold a reference of the index (_wormref_) to perform safe index operations. + +An example of using point-query operations using the `whsafe` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = whsafe_ref(wh); + for (...) { + whsafe_put(ref, ...); + whsafe_get(ref, ...); + whsafe_del(ref, ...); + ... // other safe operations + } + ... // other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = whsafe_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + whsafe_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + whsafe_iter_park(iter); // Release resources to avoid blocking other threads + // it's now safe to do something such as sleep() or waitpid() + // ... start using the iterator again + whsafe_iter_seek(iter, key2); + // ... other iter operations + whsafe_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### The `wormhole` API +Similar to `whsafe`, `wormhole` is also thread safe. It's often faster than `whsafe` but requires extra caution when using it. + +An example of using point-query operations using the `wormhole` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = wormhole_ref(wh); + for (...) { + wormhole_put(ref, ...); + wormhole_get(ref, ...); + wormhole_del(ref, ...); + ... // other safe operations + } + ... 
// other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = wormhole_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + wormhole_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + wormhole_iter_park(iter); // Release resources to avoid blocking other threads + while (condition not met) { // See below for explanation + wormhole_refresh_qstate(ref); + } + // ... start using the iterator again + wormhole_iter_seek(iter, key2); + // ... other iter operations + wormhole_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### Avoid blocking writers when using the `wormhole` API +Wormhole internally uses QSBR RCU to synchronize readers/writers so every holder of a reference (`ref`) +needs to actively perform index operations. +An ref-holder, if not actively performing index operations, may block a writer thread that is performing split/merge operations. +(because of not periodically announcing its quiescent state). +If a ref-holder is about to become inactive from Wormhole's perspective (doing something else or just sleeping), +it is recommended that the holder temporarily releases the `ref` before entering the inactive status (such as calling `sleep(10)`), +and reactivate the `ref` before performing the next index operation. 
+ +```C +{ + // assume we already have an active ref + wormhole_park(ref); // this will avoid blocking any other threads + sleep(10); + wormhole_resume(ref); // this will reactivate the ref + // continue to perform index operations +} +``` + +A common scenario of dead-locking is acquiring locks with an active wormhole reference, +The following example could cause deadlock between two threads. + +```C +// Thread A has an active ref and try to lock() +{ + struct wormref * ref = wormhole_ref(wh); + lock(just_a_lock); // << block here forever +} + +// Thread B already acquired the lock and wants to insert a key to wh +{ + lock(just_a_lock); + wormhole_put(ref, kv); << block here forever +} +``` + +To avoid this scenario, thread A should either call `wormhole_park(ref)` before acquiring the lock, or keep updating the qstate of the ref: +```C +// Solution A.1: use wormhole_park() +{ + struct wormref * ref = wormhole_ref(wh); + wormhole_park(ref); + lock(just_a_lock); + wormhole_resume(ref); // can use ref afterward +} + +// Solution A.2: use try_lock and wormhole_refresh_qstate() +{ + struct wormref * ref = wormhole_ref(wh); + while (!try_lock(just_a_lock)) { + wormhole_refresh_qstate(ref); + } + // continue to use ref +} +``` + +The above issues with QSBR are specific to the `wormhole` API. `whsafe` does not have these issues. + +### The `whunsafe` API +A set of *thread-unsafe* functions are also provided. See the functions with _prefix_ `whunsafe`. +The thread-unsafe functions don't use the reference (_wormref_). +Simply feed them with the pointer to the wormhole index: + +```C +{ + wh = whunsafe_create(NULL); + for (...) { + whunsafe_put(wh, ...); + whunsafe_get(wh, ...); + whunsafe_del(wh, ...); + ... // other unsafe operations + } + ... // other unsafe operations + wormhole_destroy(wh); +} +``` + +### In-place update with user-defined function +`wormhole_inp` executes a user-defined function on an existing key-value item. 
+If the key does not exist, a NULL pointer will be passed to the user-defined function. +A simple example would be incrementing a counter stored in a key-value pair. + +```C +{ + // user-defined in-place update function + void myadd1(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + (*pvalue)++; + } + } + + // create the counter + u64 zero = 0; + struct kv * tmp = kv_create("counter", 7, &zero, 8); // malloc-ed + wormhole_put(ref, tmp); + + // perform +1 on the stored value + struct kref kref = kv_ref(tmp); // create a kref of tmp + wormhole_inp(ref, &kref, myadd1, NULL); +} +``` + +Note that the user-defined function should ONLY change the value's content, and nothing else. +Otherwise, the index can be corrupted. +A similar mechanism is also provided for iterators (`wormhole_iter_inp`). + +The inplace function can also be used to retrieve key-value data. For example: + +```C +{ + void inplace_getu64(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + *(u64 *)priv = *pvalue; + } else { + *(u64 *)priv = 0; + } + } + ... + struct kref kref = ... + u64 val; + wormhole_inp(ref, &kref, inplace_getu64, &val); +} +``` + +### `merge`: atomic Read-Modify-Write +The `wormhole_merge` and `whsafe_merge` functions perform atomic Read-Modify-Write (RMW) operations. +In a RMW operation, if the search key is found, the KV pair will be passed to a user-defined callback function `uf` (short for user function). +Otherwise, a NULL pointer is passed to `uf`. +`uf` could update the KV in-place if it does not require any memory reallocation. +In such a case, `uf` should return the KV's pointer back and the merge function will do nothing else. +If `uf` want to replace the KV with something new, it should return a pointer that is different than the original KV pointer. +The `uf` should not make memory allocation by itself. 
+Instead, the `merge` function will copy the returned KV and replace the existing KV with the newly created one. +`uf` should not return NULL unless the key was not found. + +### Iterator +The `wormhole_iter_{seek,peek,skip,next,inp}` functions provide range-search functionalities. +If the search key does not exist, the `seek` operation will put the cursor on the item that is greater than the search-key. +`next` will return the item under the current cursor and move the cursor forward. +`peek` is similar but does not move the cursor. For example, with keys `{1,3,5}`, `seek(2); r = next()` will see `r == 3`. + +Currently Wormhole does not provide `seek_for_less_equal()` and `prev()` for backward scanning. This feature will be added in the future. + +# Memory management + +By default, Wormhole manages all the key-value data internally and only copies to or from a user-supplied +buffer (a `struct kv` object). +This draws a clear boundary in the memory space between the index structure and its users. +After a call to any of the index operations, the caller can immediately free +the buffer holding the key-reference or the key-value data. +This also allows users to use stack-allocated variables to interact with Wormhole. + +The memory manager of the internal key-value objects can be customized when creating a new Wormhole (see `wormhole_create`). +The customization will _only_ affect the internal `struct kv` objects. +Actually, the memory manager can be configured to directly use the caller's `struct kv` object and store it in Wormhole. +This `struct kvmap_mm` structure shows an example: + +```C +{ + const struct kvmap_mm kvmap_mm_ualloc { + .in = kvmap_mm_in_noop, // in wormhole_put(), store caller's kv in wh + .out = kvmap_mm_out_dup, // but still make a copy in wormhole_get() + .free = kvmap_mm_free_free, // call free() for delete/update + }; + ... + struct wormhole * wh = wormhole_create(&kvmap_mm_ualloc); + struct wormref * ref = wormhole_ref(wh); + ... 
+ struct kv * newkv = malloc(size); + ... + wormhole_put(ref, newkv); + // Don't free newkv! it's now managed by wh +} +``` + +Each of the in/out/free functions can be freely customized. +A few `kvmap_mm_*` functions are already provided for common scenarios. +`kvmap_mm_ndf` is identical to the `kvmap_mm_ualloc` structure in the above example. + +## Hugepages +Wormhole uses hugepages when available. To reserve some hugepages in Linux (10000 * 2MB): + + # echo 10000 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +# Tuning + +A few macros in `wh.c` can be tuned. + +* `WH_SLABLEAF_SIZE` controls the slab size for leaf node allocation. +The default is `((1lu << 21))` (2MB slabs). If 1GB hugepages are available, `WH_SLABLEAF_SIZE` can be set to `((1lu << 30))` to utilize 1GB hugepages. +Using 1GB hugepages can improve search performance on a large dataset. + +* `WH_KPN` controls "Keys Per (leaf-)Node". The default value is 128. +Compared to the default, `WH_KPN=256` can offer 5-10%+ higher point query and update speed. +However, range queries prefer a smaller node size such as 64. + + +* `QSBR_STATES_NR` and `QSBR_SHARDS_NR` control the capacity (number of active references) of the QSBR RCU. +The product of the two values is the capacity. For efficiency, `QSBR_STATES_NR` can be set to 23, 39, and 55, and `QSBR_SHARDS_NR` must be 2^n, n<=6. +The defaults are 23 and 32, respectively. The QSBR registry can run out of space if there are a few hundred of threads, which is not a problem in practice. + +# Limitations + +## Key Patterns +A **split** operation will fail when **129** (`WH_KPN + 1`) keys share a common prefix of 65535+ bytes. +In Wormhole, the maximum _anchor-key_ length is 65535 (2^16) bytes, which is shorter than the maximum key-length (2^32). + +## Memory Allocation +Insertions/updates can fail and return false when a memory allocation fails. +On memory-allocation failure, the hash-table expansion function will block and wait for available memory. 
+ +# Performance +Some benchmarking results with some real-world datasets: See [this](https://github.com/wuxb45/wormhole/issues/5) page for more information. + +![Concurrent GET](https://user-images.githubusercontent.com/564235/112712778-704d7200-8e9f-11eb-9f4d-795de46772d1.png) diff --git a/run/MassTrie-beta/wormhole/README.txt b/run/MassTrie-beta/wormhole/README.txt new file mode 100644 index 00000000..e70108ef --- /dev/null +++ b/run/MassTrie-beta/wormhole/README.txt @@ -0,0 +1,31 @@ +To set up the project: + +If you're not already in the folder 'wormhole', perform: + +1. cd wormhole + +Once you're there, set the variable LD_LIBRARY_PATH to the +current working directory using: + +2. setenv LD_LIBRARY_PATH `pwd` + +You can check (optionally) that this operation was executed properly using: + +3. echo $LD_LIBRARY_PATH + + +Then, do: + +4. cd sto + +5. ./bootstrap.sh + +6. ./configure + +To run the test file do: + +7. make unit-test_MTrie + +Then run it using: + +8. ./unit-test_MTrie diff --git a/run/MassTrie-beta/wormhole/concbench.c b/run/MassTrie-beta/wormhole/concbench.c new file mode 100644 index 00000000..f18abde9 --- /dev/null +++ b/run/MassTrie-beta/wormhole/concbench.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018-2019 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "lib.h" +#include "kv.h" +#include "wh.h" + +atomic_uint_least64_t __seqno = 0; +u64 __nth = 0; +struct kv ** __samples = NULL; +u64 __nkeys = 0; +atomic_uint_least64_t __tot = 0; +u64 __endtime = 0; + + static void * +kv_load_worker(struct wormhole * const wh) +{ + srandom_u64(time_nsec() * time_nsec()); + struct wormref * const ref = wormhole_ref(wh); + const u64 seq = atomic_fetch_add(&__seqno, 1); + const u64 n0 = __nkeys / __nth * seq; + const u64 nz = (seq == (__nth - 1)) ? 
__nkeys : (__nkeys / __nth * (seq + 1)); + printf("load worker %lu %lu\n", n0, nz); + for (u64 i = n0; i < nz; i++) + wormhole_put(ref, __samples[i]); + wormhole_unref(ref); + return NULL; +} + + static void * +kv_probe_worker(struct wormhole * const wh) +{ + struct wormref * const ref = wormhole_ref(wh); + struct kv * next = __samples[random_u64() % __nkeys]; + u64 rnext = random_u64() % __nkeys; + u64 count = 0; + u64 succ = 0; +#define BATCH ((10000)) + do { + for (u64 i = 0; i < BATCH; i++) { + // reading kv samples leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = __samples[rnext]; + __builtin_prefetch(next, 0, 0); + __builtin_prefetch(((u8 *)next) + 64, 0, 0); + rnext = random_u64() % __nkeys; + __builtin_prefetch(&(__samples[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const struct kref kref = kv_kref(key); + if (wormhole_probe(ref, &kref)) + succ++; + } + count += BATCH; + } while (time_nsec() < __endtime); + if (count != succ) + printf("count %lu success %lu\n", count, succ); + (void)atomic_fetch_add(&__tot, count); + wormhole_unref(ref); + return NULL; +} + + int +main(int argc, char ** argv) +{ + if (argc < 3) { + printf("usage: <#keys> <#threads>\n"); + printf(" Get words.txt: wget https://github.com/dwyl/english-words/raw/master/words.txt\n"); + printf(" Example: %s words.txt 1000000 4\n", argv[0]); + printf(" Better to use only one numa node with numactl -N 0\n"); + printf(" Better to run X thread on X cores\n"); + return 0; + } + + char ** const words = malloc(sizeof(char *) * 1000000); // or `wc -l words.txt` + u64 nr_words = 0; + char * buf = malloc(8192); + size_t bufsize = 8192; + FILE * const fwords = fopen(argv[1], "r"); + if (fwords == NULL) { + printf("open words file failed\n"); + return 0; + } + + // read all words to words + while (getline(&buf, &bufsize, fwords) > 0) { + buf[strlen(buf)-1] = 
'\0'; + words[nr_words] = strdup(buf); + nr_words++; + } + fclose(fwords); + + // generate keys + const u64 nkeys = strtoull(argv[2], NULL, 10); + struct kv ** const samples = malloc(sizeof(struct kv *) * nkeys); + char * ss[6]; + for (u64 i = 0; i < nkeys; i++) { + for (u64 j = 0; j < 6; j++) + ss[j] = words[random() % nr_words]; + sprintf(buf, "%s %s %s %s %s %s!", ss[0], ss[1], ss[2], ss[3], ss[4], ss[5]); + samples[i] = kv_create_str(buf, NULL, 0); + } + // free words & buf + for (u64 i = 0; i < nr_words; i++) + free(words[i]); + free(words); + free(buf); + + // load (4) + __samples = samples; + __nkeys = nkeys; + struct wormhole * const wh = wormhole_create(NULL); + __nth = 4; + const u64 dtl = thread_fork_join(4, (void *)kv_load_worker, false, (void *)wh); + printf("load x4 %.2lf mops\n", ((double)nkeys) * 1e3 / ((double)dtl)); + + const u64 nth = strtoull(argv[3], NULL, 10); + printf("probe with %lu threads. each round takes 3 seconds\n", nth); + for (u64 i = 0; i < 3; i++) { + __tot = 0; + __endtime = time_nsec() + 3e9; // 3 sec + const u64 dt = thread_fork_join(nth, (void *)kv_probe_worker, false, (void *)wh); + const double mops = ((double)__tot) * 1e3 / ((double)dt); + printf("probe x%lu %.2lf mops\n", nth, mops); + sleep(1); + } + + // final clean up for valgrind + for (u64 i = 0; i < nkeys; i++) + free(samples[i]); + free(samples); + wormhole_destroy(wh); + return 0; +} diff --git a/run/MassTrie-beta/wormhole/concbench.out b/run/MassTrie-beta/wormhole/concbench.out new file mode 100644 index 00000000..ee87ca31 Binary files /dev/null and b/run/MassTrie-beta/wormhole/concbench.out differ diff --git a/run/MassTrie-beta/wormhole/ctypes.h b/run/MassTrie-beta/wormhole/ctypes.h new file mode 100644 index 00000000..314ca5dc --- /dev/null +++ b/run/MassTrie-beta/wormhole/ctypes.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +// C types only; C++ source code don't use this + +#include +#include + +/* C11 atomic types */ +typedef atomic_bool abool; + +typedef atomic_uchar au8; +typedef atomic_ushort au16; +typedef atomic_uint au32; +typedef atomic_ulong au64; +static_assert(sizeof(au8) == 1, "sizeof(au8)"); +static_assert(sizeof(au16) == 2, "sizeof(au16)"); +static_assert(sizeof(au32) == 4, "sizeof(au32)"); +static_assert(sizeof(au64) == 8, "sizeof(au64)"); + +typedef atomic_char as8; +typedef atomic_short as16; +typedef atomic_int as32; +typedef atomic_long as64; +static_assert(sizeof(as8) == 1, "sizeof(as8)"); +static_assert(sizeof(as16) == 2, "sizeof(as16)"); +static_assert(sizeof(as32) == 4, "sizeof(as32)"); +static_assert(sizeof(as64) == 8, "sizeof(as64)"); + +// shorten long names +#define MO_RELAXED memory_order_relaxed +#define MO_CONSUME memory_order_consume +#define MO_ACQUIRE memory_order_acquire +#define MO_RELEASE memory_order_release +#define MO_ACQ_REL memory_order_acq_rel +#define MO_SEQ_CST memory_order_seq_cst diff --git a/run/MassTrie-beta/wormhole/easydemo.c b/run/MassTrie-beta/wormhole/easydemo.c new file mode 100644 index 00000000..f095a6ac --- /dev/null +++ b/run/MassTrie-beta/wormhole/easydemo.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
 */
// easydemo.c — minimal single-threaded walkthrough of the wh_* convenience
// API: create/ref a wormhole, point ops (put/del/probe/get), park/resume,
// and forward iteration.
#define _GNU_SOURCE
#include
// NOTE(review): the target of the #include above was lost in this paste
// (angle brackets stripped); presumably <stdio.h> for printf — confirm
// against the upstream wormhole easydemo.c.

#include "lib.h"
#include "kv.h"
#include "wh.h"

  int
main(int argc, char ** argv)
{
  (void)argc;
  (void)argv;
  // create the index, then take a reference; all wh_* calls below
  // operate on the reference, not on the wormhole handle itself
  struct wormhole * const wh = wh_create();
  struct wormref * const ref = wh_ref(wh);

  bool r;

  // point operations; each wh_* call reports success as a bool,
  // printed here as 'T'/'F'
  r = wh_put(ref, "wormhole", 8, "easy", 4);
  printf("wh_put wormhole easy %c\n", r?'T':'F');

  r = wh_put(ref, "time_travel", 11, "impossible", 10);
  printf("wh_put time_travel impossible %c\n", r?'T':'F');

  r = wh_del(ref, "time_travel", 11);
  printf("wh_del time_travel %c\n", r?'T':'F');

  r = wh_probe(ref, "time_travel", 11);
  printf("wh_probe time_travel %c\n", r?'T':'F');

  // caller-provided out-buffers (8 bytes each); lengths come back via
  // the *len_out pointers
  u32 klen_out = 0;
  char kbuf_out[8] = {};
  u32 vlen_out = 0;
  char vbuf_out[8] = {};
  r = wh_get(ref, "wormhole", 8, vbuf_out, 8, &vlen_out);
  printf("wh_get wormhole %c %u %.*s\n", r?'T':'F', vlen_out, vlen_out, vbuf_out);

  // in a concurrent environment, the kvmap_api_wormhole need park&resume when a thread is about to go idle
  // don't need park&resume if you're using the default kvmap_api_whsafe in whwh.c!
  wh_park(ref);
  usleep(10);
  wh_resume(ref);

  // prepare a few keys for range ops
  wh_put(ref, "00", 2, "0_value", 7);
  wh_put(ref, "11", 2, "1_value", 7);
  wh_put(ref, "22", 2, "2_value", 7);

  struct wormhole_iter * const iter = wh_iter_create(ref);

  // full scan: seek to the head, then walk forward with skip1 until the
  // iterator reports invalid
  wh_iter_seek(iter, NULL, 0); // seek to the head
  printf("wh_iter_seek \"\"\n");
  while (wh_iter_valid(iter)) {
    r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, vbuf_out, 8, &vlen_out);
    if (r) {
      printf("wh_iter_peek klen=%u key=%.*s vlen=%u value=%.*s\n",
          klen_out, klen_out, kbuf_out, vlen_out, vlen_out, vbuf_out);
    } else {
      printf("ERROR!\n");
    }
    wh_iter_skip1(iter);
  }

  // call iter_park if you will go idle but want to use the iter later
  // don't need to call iter_park if you're actively using iter
  wh_iter_park(iter);
  usleep(10);

  // re-seek using "0" as the seek key; passing NULL/0 for the value buffer
  // asks peek to copy only the key
  wh_iter_seek(iter, "0", 1);
  printf("wh_iter_seek \"0\"\n");
  // this time we don't want to copy the value
  r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, NULL, 0, NULL);
  if (r){
    printf("wh_iter_peek klen=%u key=%.*s\n", klen_out, klen_out, kbuf_out);
  } else {
    printf("ERROR: iter_peek failed\n");
  }

  wh_iter_destroy(iter);
  // there must be no active iter when calling unref()
  wh_unref(ref);

  // unsafe operations: should have released all references
  wh_clean(wh); // just for demonstration
  wh_destroy(wh); // destroy also calls clean internally
  return 0;
}
diff --git a/run/MassTrie-beta/wormhole/easydemo.out b/run/MassTrie-beta/wormhole/easydemo.out
new file mode 100644
index 00000000..32521210
Binary files /dev/null and b/run/MassTrie-beta/wormhole/easydemo.out differ
diff --git a/run/MassTrie-beta/wormhole/kv.c b/run/MassTrie-beta/wormhole/kv.c
new file mode 100644
index 00000000..a1720e88
--- /dev/null
+++ b/run/MassTrie-beta/wormhole/kv.c
@@ -0,0 +1,1131 @@
/*
 * Copyright (c) 2016--2021 Wu, Xingbo
 *
 * All rights reserved. No warranty, explicit or implicit, provided.
 */
// kv.c (part 1) — struct kv construction, hashing, duplication, and
// key-matching helpers. struct kv stores klen/vlen, a 64-bit hash, and the
// key bytes immediately followed by the value bytes in kv->kv[].
#define _GNU_SOURCE

// headers {{{
#include // static_assert
#include
// NOTE(review): both #include targets above were lost in this paste (angle
// brackets stripped); the first is commented "static_assert" so presumably
// <assert.h> — confirm against the upstream wormhole kv.c.
#include "lib.h"
#include "ctypes.h"
#include "kv.h"
// }}} headers

// crc32c {{{
// crc32c of an arbitrary byte range, seeded with KV_CRC32C_SEED
  inline u32
kv_crc32c(const void * const ptr, u32 len)
{
  return crc32c_inc((const u8 *)ptr, len, KV_CRC32C_SEED);
}

// widen a 32-bit crc to 64 bits: high word is the bitwise complement of lo
  inline u64
kv_crc32c_extend(const u32 lo)
{
  const u64 hi = (u64)(~lo);
  return (hi << 32) | ((u64)lo);
}
// }}} crc32c

// kv {{{

// size {{{
// total bytes of a kv object: header + key bytes + value bytes
  inline size_t
kv_size(const struct kv * const kv)
{
  return sizeof(*kv) + kv->klen + kv->vlen;
}

// kv_size rounded up to a power-of-two alignment (align must be 2^k)
  inline size_t
kv_size_align(const struct kv * const kv, const u64 align)
{
  debug_assert(align && ((align & (align - 1)) == 0));
  return (sizeof(*kv) + kv->klen + kv->vlen + (align - 1)) & (~(align - 1));
}

// bytes of a key-only object (value excluded)
  inline size_t
key_size(const struct kv *const key)
{
  return sizeof(*key) + key->klen;
}

// key_size rounded up to a power-of-two alignment (align must be 2^k)
  inline size_t
key_size_align(const struct kv *const key, const u64 align)
{
  debug_assert(align && ((align & (align - 1)) == 0));
  return (sizeof(*key) + key->klen + (align - 1)) & (~(align - 1));
}
// }}} size

// construct {{{
// recompute kv->hash from the current key bytes (crc32c extended to 64 bits)
  inline void
kv_update_hash(struct kv * const kv)
{
  const u32 lo = kv_crc32c((const void *)kv->kv, kv->klen);
  kv->hash = kv_crc32c_extend(lo);
}

// overwrite the value portion (placed right after the key) and vlen;
// the key and hash are left untouched
  inline void
kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen)
{
  debug_assert((vlen == 0) || value);
  memcpy(&(kv->kv[kv->klen]), value, vlen);
  kv->vlen = vlen;
}

// fill key+value into a caller-allocated kv and refresh the hash;
// kv must already have room for klen+vlen bytes
  inline void
kv_refill(struct kv * const kv, const void * const key, const u32 klen,
    const void * const value, const u32 vlen)
{
  debug_assert(kv);
  kv->klen = klen;
  memcpy(&(kv->kv[0]), key, klen);
  kv_refill_value(kv, value, vlen);
  kv_update_hash(kv);
}

// convenience: NUL-terminated string key
  inline void
kv_refill_str(struct kv * const kv, const char * const key,
    const void * const value, const u32 vlen)
{
  kv_refill(kv, key, (u32)strlen(key), value, vlen);
}

// convenience: NUL-terminated string key and string value
  inline void
kv_refill_str_str(struct kv * const kv, const char * const key,
    const char * const value)
{
  kv_refill(kv, key, (u32)strlen(key), value, (u32)strlen(value));
}

// the u64 key is filled in big-endian byte order for correct ordering
  inline void
kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen)
{
  kv->klen = sizeof(u64);
  *(u64 *)(kv->kv) = __builtin_bswap64(key); // bswap on little endian
  kv_refill_value(kv, value, vlen);
  kv_update_hash(kv);
}

// key is the 8-char hex rendering of a u32
  inline void
kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen)
{
  kv->klen = 8;
  strhex_32(kv->kv, hex);
  kv_refill_value(kv, value, vlen);
  kv_update_hash(kv);
}

// key is the 16-char hex rendering of a u64
  inline void
kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen)
{
  kv->klen = 16;
  strhex_64(kv->kv, hex);
  kv_refill_value(kv, value, vlen);
  kv_update_hash(kv);
}

// hex64 key padded with '!' up to klen when klen > 16; otherwise klen = 16
  inline void
kv_refill_hex64_klen(struct kv * const kv, const u64 hex,
    const u32 klen, const void * const value, const u32 vlen)
{
  strhex_64(kv->kv, hex);
  if (klen > 16) {
    kv->klen = klen;
    memset(kv->kv + 16, '!', klen - 16);
  } else {
    kv->klen = 16;
  }
  kv_refill_value(kv, value, vlen);
  kv_update_hash(kv);
}

// build a key-only kv from a kref; reuses kref's precomputed hash32
// (memmove: kref->ptr may alias kv's own buffer)
  inline void
kv_refill_kref(struct kv * const kv, const struct kref * const kref)
{
  kv->klen = kref->len;
  kv->vlen = 0;
  kv->hash = kv_crc32c_extend(kref->hash32);
  memmove(kv->kv, kref->ptr, kref->len);
}

// build a kv from a kref plus a value; key copy uses memmove (may alias),
// value copy uses memcpy (must not)
  inline void
kv_refill_kref_v(struct kv * const kv, const struct kref * const kref,
    const void * const value, const u32 vlen)
{
  kv->klen = kref->len;
  kv->vlen = vlen;
  kv->hash = kv_crc32c_extend(kref->hash32);
  memmove(kv->kv, kref->ptr, kref->len);
  memcpy(kv->kv + kv->klen, value, vlen);
}

// borrow a non-owning kref view of a kv's key (no copy)
  inline struct kref
kv_kref(const struct kv * const key)
{
  return (struct kref){.ptr = key->kv, .len = key->klen, .hash32 = key->hashlo};
}

// heap-allocate and fill a kv; returns NULL on malloc failure
  inline struct kv *
kv_create(const void * const key, const u32 klen, const void * const value, const u32 vlen)
{
  struct kv * const kv = malloc(sizeof(*kv) + klen + vlen);
  if (kv)
    kv_refill(kv, key, klen, value, vlen);
  return kv;
}

  inline struct kv *
kv_create_str(const char * const key, const void * const value, const u32 vlen)
{
  return kv_create(key, (u32)strlen(key), value, vlen);
}

  inline struct kv *
kv_create_str_str(const char * const key, const char * const value)
{
  return kv_create(key, (u32)strlen(key), value, (u32)strlen(value));
}

  inline struct kv *
kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen)
{
  return kv_create(kref->ptr, kref->len, value, vlen);
}

// the shared empty-key singleton returned by kv_null()
static struct kv __kv_null = {};

// hash the empty key once at program startup
__attribute__((constructor))
  static void
kv_null_init(void)
{
  kv_update_hash(&__kv_null);
}

// a static kv with klen == 0 and a valid hash
  inline const struct kv *
kv_null(void)
{
  return &__kv_null;
}
// }}} construct

// dup {{{
// heap-copy a whole kv (key + value); NULL in, NULL out
  inline struct kv *
kv_dup(const struct kv * const kv)
{
  if (kv == NULL)
    return NULL;

  const size_t sz = kv_size(kv);
  struct kv * const new = malloc(sz);
  if (new)
    memcpy(new, kv, sz);
  return new;
}

// heap-copy only the key portion; the copy's vlen is forced to 0
  inline struct kv *
kv_dup_key(const struct kv * const kv)
{
  if (kv == NULL)
    return NULL;

  const size_t sz = key_size(kv);
  struct kv * const new = malloc(sz);
  if (new) {
    memcpy(new, kv, sz);
    new->vlen = 0;
  }
  return new;
}

// copy into caller buffer 'to' when provided, else malloc a fresh copy
  inline struct kv *
kv_dup2(const struct kv * const from, struct kv * const to)
{
  if (from == NULL)
    return NULL;
  const size_t sz = kv_size(from);
  struct kv * const new = to ? to : malloc(sz);
  if (new)
    memcpy(new, from, sz);
  return new;
}

// key-only variant of kv_dup2; the copy's vlen is forced to 0
  inline struct kv *
kv_dup2_key(const struct kv * const from, struct kv * const to)
{
  if (from == NULL)
    return NULL;
  const size_t sz = key_size(from);
  struct kv * const new = to ? to : malloc(sz);
  if (new) {
    memcpy(new, from, sz);
    new->vlen = 0;
  }
  return new;
}

// copy only the first plen bytes of the key (plen <= from->klen) and
// rehash, since the truncated key has a different hash
  inline struct kv *
kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen)
{
  if (from == NULL)
    return NULL;
  debug_assert(plen <= from->klen);
  const size_t sz = key_size(from) - from->klen + plen;
  struct kv * const new = to ? to : malloc(sz);
  if (new) {
    new->klen = plen;
    memcpy(new->kv, from->kv, plen);
    new->vlen = 0;
    kv_update_hash(new);
  }
  return new;
}
// }}} dup

// compare {{{
// three-way compare of key lengths; tie-breaker after memcmp of the prefix
  static inline int
klen_compare(const u32 len1, const u32 len2)
{
  if (len1 < len2)
    return -1;
  else if (len1 > len2)
    return 1;
  else
    return 0;
}

// compare whether the two keys are identical
// optimistic: do not check hash
  inline bool
kv_match(const struct kv * const key1, const struct kv * const key2)
{
  //cpu_prefetch0(((u8 *)key2) + 64);
  //return (key1->hash == key2->hash)
  //  && (key1->klen == key2->klen)
  //  && (!memcmp(key1->kv, key2->kv, key1->klen));
  return (key1->klen == key2->klen) && (!memcmp(key1->kv, key2->kv, key1->klen));
}

// compare whether the two keys are identical
// check hash first
// pessimistic: return false quickly if their hashes mismatch
  inline bool
kv_match_hash(const struct kv * const key1, const struct kv * const key2)
{
  return (key1->hash == key2->hash)
    && (key1->klen == key2->klen)
    && (!memcmp(key1->kv, key2->kv, key1->klen));
}

// full equality: klen+vlen (as one u64) and every byte of key and value
  inline bool
kv_match_full(const struct kv * const kv1, const struct kv * const kv2)
{
  return (kv1->kvlen == kv2->kvlen)
    && (!memcmp(kv1, kv2, sizeof(*kv1) + kv1->klen + kv1->vlen));
}

// match a kv's key against a varint-encoded kv128 (klen, vlen, key, value)
  bool
kv_match_kv128(const struct kv * const sk, const u8 * const kv128)
{
  debug_assert(sk);
  debug_assert(kv128);

  u32 klen128 = 0;
  u32 vlen128 = 0;
  const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(kv128, &klen128), &vlen128);
  (void)vlen128;
  return (sk->klen == klen128) && (!memcmp(sk->kv, pdata, klen128));
}
+ inline int +kv_compare(const struct kv * const kv1, const struct kv * const kv2) +{ + const u32 len = kv1->klen < kv2->klen ? kv1->klen : kv2->klen; + const int cmp = memcmp(kv1->kv, kv2->kv, (size_t)len); + return cmp ? cmp : klen_compare(kv1->klen, kv2->klen); +} + +// for qsort and bsearch + static int +kv_compare_ptrs(const void * const p1, const void * const p2) +{ + const struct kv * const * const pp1 = (typeof(pp1))p1; + const struct kv * const * const pp2 = (typeof(pp2))p2; + return kv_compare(*pp1, *pp2); +} + + int +kv_k128_compare(const struct kv * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + inline void +kv_qsort(struct kv ** const kvs, const size_t nr) +{ + qsort(kvs, nr, sizeof(kvs[0]), kv_compare_ptrs); +} + +// return the length of longest common prefix of the two keys + inline u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2) +{ + const u32 max = (key1->klen < key2->klen) ? key1->klen : key2->klen; + return memlcp(key1->kv, key2->kv, max); +} + +// return the length of longest common prefix of the two keys with a known lcp0 + inline u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0) +{ + const u32 max = (key1->klen < key2->klen) ? 
key1->klen : key2->klen; + debug_assert(max >= lcp0); + return lcp0 + memlcp(key1->kv+lcp0, key2->kv+lcp0, max-lcp0); +} +// }}} + +// psort {{{ + static inline void +kv_psort_exchange(struct kv ** const kvs, const u64 i, const u64 j) +{ + if (i != j) { + struct kv * const tmp = kvs[i]; + kvs[i] = kvs[j]; + kvs[j] = tmp; + } +} + + static u64 +kv_psort_partition(struct kv ** const kvs, const u64 lo, const u64 hi) +{ + if (lo >= hi) + return lo; + + const u64 p = (lo+hi) >> 1; + kv_psort_exchange(kvs, lo, p); + u64 i = lo; + u64 j = hi + 1; + do { + while (kv_compare(kvs[++i], kvs[lo]) < 0 && i < hi); + while (kv_compare(kvs[--j], kvs[lo]) > 0); + if (i >= j) + break; + kv_psort_exchange(kvs, i, j); + } while (true); + kv_psort_exchange(kvs, lo, j); + return j; +} + + static void +kv_psort_rec(struct kv ** const kvs, const u64 lo, const u64 hi, const u64 tlo, const u64 thi) +{ + if (lo >= hi) + return; + const u64 c = kv_psort_partition(kvs, lo, hi); + + if (c > tlo) // go left + kv_psort_rec(kvs, lo, c-1, tlo, thi); + + if (c < thi) // go right + kv_psort_rec(kvs, c+1, hi, tlo, thi); +} + + inline void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi) +{ + debug_assert(tlo <= thi); + debug_assert(thi < nr); + kv_psort_rec(kvs, 0, nr-1, tlo, thi); +} +// }}} psort + +// ptr {{{ + inline void * +kv_vptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[kv->klen])); +} + + inline void * +kv_kptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[0])); +} + + inline const void * +kv_vptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[kv->klen])); +} + + inline const void * +kv_kptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[0])); +} +// }}} ptr + +// print {{{ +// cmd "KV" K and V can be 's': string, 'x': hex, 'd': dec, or else for not printing. 
+// n for newline after kv + void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out) +{ + debug_assert(cmd); + const u32 klen = kv->klen; + fprintf(out, "#%016lx k[%3u]", kv->hash, klen); + + switch(cmd[0]) { + case 's': fprintf(out, " %.*s", klen, kv->kv); break; + case 'x': str_print_hex(out, kv->kv, klen); break; + case 'd': str_print_dec(out, kv->kv, klen); break; + default: break; + } + + const u32 vlen = kv->vlen; + switch (cmd[1]) { + case 's': fprintf(out, " v[%4u] %.*s", vlen, vlen, kv->kv+klen); break; + case 'x': fprintf(out, " v[%4u]", vlen); str_print_hex(out, kv->kv+klen, vlen); break; + case 'd': fprintf(out, " v[%4u]", vlen); str_print_dec(out, kv->kv+klen, vlen); break; + default: break; + } + if (strchr(cmd, 'n')) + fprintf(out, "\n"); +} +// }}} print + +// mm {{{ + struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv; +} + +// copy-out + struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out) +{ + (void)out; + return kv; +} + + void +kvmap_mm_free_noop(struct kv * const kv, void * const priv) +{ + (void)kv; + (void)priv; +} + +// copy-in + struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv_dup(kv); +} + +// copy-out + struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out) +{ + return kv_dup2(kv, out); +} + + void +kvmap_mm_free_free(struct kv * const kv, void * const priv) +{ + (void)priv; + free(kv); +} + +const struct kvmap_mm kvmap_mm_dup = { + .in = kvmap_mm_in_dup, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +const struct kvmap_mm kvmap_mm_ndf = { + .in = kvmap_mm_in_noop, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +// }}} mm + +// kref {{{ + inline void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = 0; +} + + inline void 
+kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = kv_crc32c(ptr, len); +} + + inline void +kref_update_hash32(struct kref * const kref) +{ + kref->hash32 = kv_crc32c(kref->ptr, kref->len); +} + + inline void +kref_ref_kv(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv->hashlo; +} + + inline void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv_crc32c(kv->kv, kv->klen); +} + + inline bool +kref_match(const struct kref * const k1, const struct kref * const k2) +{ + return (k1->len == k2->len) && (!memcmp(k1->ptr, k2->ptr, k1->len)); +} + +// match a kref and a key + inline bool +kref_kv_match(const struct kref * const kref, const struct kv * const k) +{ + return (kref->len == k->klen) && (!memcmp(kref->ptr, k->kv, kref->len)); +} + + inline int +kref_compare(const struct kref * const kref1, const struct kref * const kref2) +{ + const u32 len = kref1->len < kref2->len ? kref1->len : kref2->len; + const int cmp = memcmp(kref1->ptr, kref2->ptr, (size_t)len); + return cmp ? cmp : klen_compare(kref1->len, kref2->len); +} + +// compare a kref and a key + inline int +kref_kv_compare(const struct kref * const kref, const struct kv * const k) +{ + debug_assert(kref); + debug_assert(k); + const u32 len = kref->len < k->klen ? kref->len : k->klen; + const int cmp = memcmp(kref->ptr, k->kv, (size_t)len); + return cmp ? cmp : klen_compare(kref->len, k->klen); +} + + inline u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2) +{ + const u32 max = (k1->len < k2->len) ? k1->len : k2->len; + return memlcp(k1->ptr, k2->ptr, max); +} + + inline u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv) +{ + const u32 max = (kref->len < kv->klen) ? 
kref->len : kv->klen; + return memlcp(kref->ptr, kv->kv, max); +} + +// klen, key, ... + inline int +kref_k128_compare(const struct kref * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +// klen, vlen, key, ... + inline int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +static struct kref __kref_null = {.hash32 = KV_CRC32C_SEED}; + + inline const struct kref * +kref_null(void) +{ + return &__kref_null; +} +// }}} kref + +// kvref {{{ + inline void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv) +{ + ref->kptr = kv->kv; + ref->vptr = kv->kv + kv->klen; + ref->hdr = *kv; +} + + struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen + ref->hdr.vlen; + struct kv * const new = to ? to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + memcpy(new->kv + new->klen, ref->vptr, new->vlen); + return new; +} + + struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen; + struct kv * const new = to ? 
to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + return new; +} + + int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv) +{ + const u32 len = ref->hdr.klen < kv->klen ? ref->hdr.klen : kv->klen; + const int cmp = memcmp(ref->kptr, kv->kv, (size_t)len); + return cmp ? cmp : klen_compare(ref->hdr.klen, kv->klen); +} +// }}} kvref + +// kv128 {{{ +// estimate the encoded size + inline size_t +kv128_estimate_kv(const struct kv * const kv) +{ + return vi128_estimate_u32(kv->klen) + vi128_estimate_u32(kv->vlen) + kv->klen + kv->vlen; +} + +// create a kv128 from kv + u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize) +{ + u8 * const ptr = out ? out : malloc(kv128_estimate_kv(kv)); + if (!ptr) + return NULL; + + u8 * const pdata = vi128_encode_u32(vi128_encode_u32(ptr, kv->klen), kv->vlen); + memcpy(pdata, kv->kv, kv->klen + kv->vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + kv->klen + kv->vlen; + return ptr; // return the head of the encoded kv128 +} + +// dup kv128 to a kv + struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + struct kv * const ret = out ? 
out : malloc(sizeof(struct kv) + klen + vlen); + if (ret) + kv_refill(ret, pdata, klen, pdata + klen, vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + klen + vlen; + return ret; // return the kv +} + + inline size_t +kv128_size(const u8 * const ptr) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + return ((size_t)(pdata - ptr)) + klen + vlen; +} +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// registry {{{ +// increase MAX if need more +#define KVMAP_API_MAX ((32)) +static struct kvmap_api_reg kvmap_api_regs[KVMAP_API_MAX]; +static u64 kvmap_api_regs_nr = 0; + + void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api) +{ + if (kvmap_api_regs_nr < KVMAP_API_MAX) { + kvmap_api_regs[kvmap_api_regs_nr].nargs = nargs; + kvmap_api_regs[kvmap_api_regs_nr].name = name; + kvmap_api_regs[kvmap_api_regs_nr].args_msg = args_msg; + kvmap_api_regs[kvmap_api_regs_nr].create = create; + kvmap_api_regs[kvmap_api_regs_nr].api = api; + kvmap_api_regs_nr++; + } else { + fprintf(stderr, "%s failed to register [%s]\n", __func__, name); + } +} + void +kvmap_api_helper_message(void) +{ + fprintf(stderr, "%s Usage: api ...\n", __func__); + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + fprintf(stderr, "%s example: api %s %s\n", __func__, + kvmap_api_regs[i].name, kvmap_api_regs[i].args_msg); + } +} + + int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out) +{ + // "api" "name" "arg1", ... 
+ if (argc < 2 || strcmp(argv[0], "api") != 0) + return -1; + + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + const struct kvmap_api_reg * const reg = &kvmap_api_regs[i]; + if (0 != strcmp(argv[1], reg->name)) + continue; + + if ((argc - 2) < reg->nargs) + return -1; + + void * const map = reg->create(argv[1], mm, argv + 2); // skip "api" "name" + if (map) { + *api_out = reg->api; + *map_out = map; + return 2 + reg->nargs; + } else { + return -1; + } + } + + // no match + return -1; +} +// }}} registry + +// misc {{{ + void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv) +{ + // steal the kv pointer out so we don't need a dangerous get_key_interanl() + if (priv) + *(struct kv **)priv = kv; +} + + inline void * +kvmap_ref(const struct kvmap_api * const api, void * const map) +{ + return api->ref ? api->ref(map) : map; +} + +// return the original map pointer; usually unused by caller + inline void * +kvmap_unref(const struct kvmap_api * const api, void * const ref) +{ + return api->unref ? 
api->unref(ref) : ref; +} +// }}} misc + +// kvmap_kv_op {{{ + inline struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out) +{ + const struct kref kref = kv_kref(key); + return api->get(ref, &kref, out); +} + + inline bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv) +{ + return api->put(ref, kv); +} + + inline bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->del(ref, &kref); +} + + inline bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpw(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_merge_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->merge(ref, &kref, uf, priv); +} + + inline u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end) +{ + const struct kref kref0 = kv_kref(start); + if (end) { + const struct kref krefz = kv_kref(end); + return api->delr(ref, &kref0, &krefz); + } else { + return api->delr(ref, &kref0, NULL); + } +} + + inline void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * 
const key) +{ + const struct kref kref = kv_kref(key); + api->iter_seek(iter, &kref); +} +// }}} kvmap_kv_op + +// kvmap_raw_op {{{ + inline struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->get(ref, &kref, out); +} + + inline bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->probe(ref, &kref); +} + + inline bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->del(ref, &kref); +} + + inline bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpw(ref, &kref, uf, priv); +} + + inline void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? 
kv_crc32c(ptr, len) : 0}; + api->iter_seek(iter, &kref); +} +// }}}} kvmap_raw_op + +// dump {{{ + u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd) +{ + void * const ref = kvmap_ref(api, map); + void * const iter = api->iter_create(ref); + api->iter_seek(iter, kref_null()); + u64 i = 0; + while (api->iter_valid(iter)) { + struct kvref kvref; + api->iter_kvref(iter, &kvref); + dprintf(fd, "%010lu [%3u] %.*s [%u]\n", i, kvref.hdr.klen, kvref.hdr.klen, kvref.kptr, kvref.hdr.vlen); + i++; + api->iter_skip1(iter); + } + api->iter_destroy(iter); + kvmap_unref(api, ref); + return i; +} +// }}} dump + +// kv64 {{{ +struct kv64 { // internal only + struct kv kv; + u64 key_be; // must be in big endian + u64 value; +}; + + inline bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out) +{ + struct kv64 keybuf, kvout; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + struct kv * const ret = api->get(ref, &kref, &kvout.kv); + if (ret) { + *out = kvout.value; + return true; + } else { + return false; + } +} + + inline bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value) +{ + struct kv64 kv; + kv.key_be = __builtin_bswap64(key); + kv.value = value; + kv.kv.klen = sizeof(key); + kv.kv.vlen = sizeof(value); + if (api->hashkey) + kv_update_hash(&kv.kv); + + return api->put(ref, &kv.kv); +} + + inline bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); 
+ kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->del(ref, &kref); +} + + inline void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + api->iter_seek(iter, &kref); +} + + inline bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const value_out) +{ + struct kv64 kvout; + struct kv * const ret = api->iter_peek(iter, &kvout.kv); + if (key_out) + *key_out = __builtin_bswap64(kvout.key_be); // to LE + if (value_out) + *value_out = kvout.value; + return ret != NULL; +} +// }}} kv64 + +// }}} kvmap + +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/kv.h b/run/MassTrie-beta/wormhole/kv.h new file mode 100644 index 00000000..1e251e58 --- /dev/null +++ b/run/MassTrie-beta/wormhole/kv.h @@ -0,0 +1,554 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// crc32c {{{ +#define KV_CRC32C_SEED ((0xDEADBEEFu)) + + extern u32 +kv_crc32c(const void * const ptr, u32 len); + + extern u64 +kv_crc32c_extend(const u32 crc32c); +// }}} crc32c + +// kv {{{ + +// struct {{{ +/* + * Some internal union names can be ignored: + * struct kv { + * u32 klen; + * u32 vlen; + * u64 hash; + * u8 kv[]; + * }; + */ +struct kv { + union { // the first u64 + u64 kvlen; + struct { + u32 klen; + union { u32 vlen; u32 refcnt; }; + }; + }; + union { + u64 hash; // hashvalue of the key + u64 priv; // can hide a value here if hash is not used + void * privptr; + struct { u32 hashlo; u32 hashhi; }; // little endian + struct { u32 privlo; u32 privhi; }; + }; + u8 kv[0]; // len(kv) == klen + vlen +} __attribute__((packed)); + +struct kref { + u32 len; + union { u32 hash32; u32 priv; }; + const u8 * ptr; +} __attribute__((packed)); + +struct kvref { + const u8 * kptr; // read-only + const u8 * vptr; // read-only + struct kv hdr; // hdr.kv[] is invalid +}; +// }}} struct + +// kv {{{ +typedef int (*kv_kv_cmp_func)(const struct kv *, const struct kv *); + + extern size_t +kv_size(const struct kv * const kv); + + extern size_t +kv_size_align(const struct kv * const kv, const u64 align); + + extern size_t +key_size(const struct kv * const key); + + extern size_t +key_size_align(const struct kv * const key, const u64 align); + + extern void +kv_update_hash(struct kv * const kv); + + extern void +kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen); + + extern void +kv_refill(struct kv * const kv, const void * const key, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_str(struct kv * const kv, const char * const key, + const void * const value, const u32 vlen); + + extern void +kv_refill_str_str(struct kv * const kv, const char * const key, + const char * const value); + +// the u64 key is filled in big-endian byte order + 
extern void +kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen); + + extern void +kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64_klen(struct kv * const kv, const u64 hex, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_kref(struct kv * const kv, const struct kref * const kref); + + extern void +kv_refill_kref_v(struct kv * const kv, const struct kref * const kref, + const void * const value, const u32 vlen); + + extern struct kref +kv_kref(const struct kv * const key); + + extern struct kv * +kv_create(const void * const key, const u32 klen, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str(const char * const key, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str_str(const char * const key, const char * const value); + + extern struct kv * +kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen); + +// a static kv with klen == 0 + extern const struct kv * +kv_null(void); + + extern struct kv * +kv_dup(const struct kv * const kv); + + extern struct kv * +kv_dup_key(const struct kv * const kv); + + extern struct kv * +kv_dup2(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen); + + extern bool +kv_match(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_hash(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_full(const struct kv * const kv1, const struct kv * const kv2); + + extern bool +kv_match_kv128(const struct kv * const sk, 
const u8 * const kv128); + + extern int +kv_compare(const struct kv * const kv1, const struct kv * const kv2); + + extern int +kv_k128_compare(const struct kv * const sk, const u8 * const k128); + + extern int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128); + + extern void +kv_qsort(struct kv ** const kvs, const size_t nr); + + extern u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2); + + extern u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0); + + extern void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi); + + extern void * +kv_vptr(struct kv * const kv); + + extern void * +kv_kptr(struct kv * const kv); + + extern const void * +kv_vptr_c(const struct kv * const kv); + + extern const void * +kv_kptr_c(const struct kv * const kv); + + extern void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out); +// }}} kv + +// mm {{{ +typedef struct kv * (* kvmap_mm_in_func)(struct kv * kv, void * priv); +typedef struct kv * (* kvmap_mm_out_func)(struct kv * kv, struct kv * out); +typedef void (* kvmap_mm_free_func)(struct kv * kv, void * priv); + +// manage internal kv data of kvmap +struct kvmap_mm { + // to create a private copy of "kv" + // see put() functions + kvmap_mm_in_func in; + // to duplicate a private copy of "kv" to "out" + // see get() and iter_peek() functions + kvmap_mm_out_func out; + // to free a kv + // see del() and put() functions + kvmap_mm_free_func free; + void * priv; +}; + + extern struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out); + + extern void +kvmap_mm_free_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out); + + extern void 
+kvmap_mm_free_free(struct kv * const kv, void * const priv); + +// the default mm +extern const struct kvmap_mm kvmap_mm_dup; // in:Dup, out:Dup, free:Free +extern const struct kvmap_mm kvmap_mm_ndf; // in:Noop, out:Dup, free:Free +// }}} mm + +// ref {{{ +typedef int (*kref_kv_cmp_func)(const struct kref *, const struct kv *); + +// ptr and len only + extern void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len); + +// this calculates hash32 + extern void +kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len); + + extern void +kref_update_hash32(struct kref * const kref); + + extern void +kref_ref_kv(struct kref * const kref, const struct kv * const kv); + + extern void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv); + + extern bool +kref_match(const struct kref * const k1, const struct kref * const k2); + + extern bool +kref_kv_match(const struct kref * const kref, const struct kv * const k); + + extern int +kref_compare(const struct kref * const kref1, const struct kref * const kref2); + + extern int +kref_kv_compare(const struct kref * const kref, const struct kv * const k); + + extern u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2); + + extern u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv); + + extern int +kref_k128_compare(const struct kref * const sk, const u8 * const k128); + + extern int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128); + + extern const struct kref * +kref_null(void); + + extern void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv); + + extern struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to); + + extern struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to); + + extern int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv); +// }}} ref + +// kv128 {{{ + extern size_t +kv128_estimate_kv(const 
struct kv * const kv); + + extern u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize); + + extern struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize); + + extern size_t +kv128_size(const u8 * const ptr); +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// kvmap_api {{{ +typedef void (* kv_inp_func)(struct kv * const curr, void * const priv); + +// the merge function should: +// 1: return NULL if the origin kv is not changed at all +// 2: return kv0 if updates has been applied in-place +// 3: return a different kv if the original kv must be replaced +// In an in-memory kvmap, 2==1 and no further action is needed +// In a persistent kv store with a memtable, 2 will need an insertion if kv0 is not from the memtable +typedef struct kv * (* kv_merge_func)(struct kv * const kv0, void * const priv); + +struct kvmap_api { + // feature bits + bool hashkey; // true: caller needs to provide correct hash in kv/kref + bool ordered; // true: has iter_seek + bool threadsafe; // true: support thread_safe access + bool readonly; // true: no put() and del() + bool irefsafe; // true: iter's kref/kvref can be safely accessed after iter_seek/iter_skip/iter_park + bool unique; // provide unique keys, especially for iterators + bool refpark; // ref has park() and resume() + bool async; // XXX for testing KVell + + // put (aka put/upsert): return true on success; false on error + // mm.in() controls how things move into the kvmap; the default mm make a copy with malloc() + // mm.free() controls how old kv get disposed when replaced + bool (* put) (void * const ref, struct kv * const kv); + // get: search and return a kv if found, or NULL if not + // with the default mm: malloc() if out == NULL; otherwise, use out as buffer + // with custom kvmap_mm: mm.out() controls buffer; use with caution + // caller should use the returned ptr even if out is provided + struct kv * (* get) (void * const ref, const struct 
kref * const key, struct kv * const out); + // probe: return true on found, false on not found + bool (* probe) (void * const ref, const struct kref * const key); + // del: return true on something deleted, false on not found + // mm.free() controls how old kv get disposed when replaced + bool (* del) (void * const ref, const struct kref * const key); + // inp: inplace operation if key exists; otherwise return false; uf() is always executed even with NULL key + // inpr/inpw acquires r/w locks respectively. + // Note that in inpw() you can only change the value. + bool (* inpr) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + bool (* inpw) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + // merge: put+callback on old/new keys; another name: read-modify-write + // return true if successfull; return false on error + bool (* merge) (void * const ref, const struct kref * const key, kv_merge_func uf, void * const priv); + // delete-range: delete all keys from start (inclusive) to end (exclusive) + u64 (* delr) (void * const ref, const struct kref * const start, const struct kref * const end); + // make everything persist; for persistent maps only + void (* sync) (void * const ref); + + // general guidelines for thread-safe iters: + // - it is assumed that the key under the cursor is locked/freezed/immutable + // - once created one must call iter_seek to make it valid + // - the ownership of ref is given to the iter so ref should not be used until iter_destroy + // - creating and use more than one iter based on a ref can cause deadlocks + void * (* iter_create) (void * const ref); + // move the cursor to the first key >= search-key; + void (* iter_seek) (void * const iter, const struct kref * const key); + // check if the cursor points to a valid key + bool (* iter_valid) (void * const iter); + // return the current key; copy to out if (out != NULL) + // mm.out() controls copy-out + struct kv * 
(* iter_peek) (void * const iter, struct kv * const out); + // similar to peek but does not copy; return false if iter is invalid + bool (* iter_kref) (void * const iter, struct kref * const kref); + // similar to iter_kref but also provide the value + bool (* iter_kvref) (void * const iter, struct kvref * const kvref); + // iter_retain makes kref or kvref of the current iter remain valid until released + // the returned opaque pointer should be provided when releasing the hold + u64 (* iter_retain) (void * const iter); + void (* iter_release) (void * const iter, const u64 opaque); + // skip one element + void (* iter_skip1) (void * const iter); + // skip nr elements + void (* iter_skip) (void * const iter, const u32 nr); + // iter_next == iter_peek + iter_skip1 + struct kv * (* iter_next) (void * const iter, struct kv * const out); + // perform inplace opeation if the current key is valid; return false if no current key + // the uf() is always executed even with NULL key + bool (* iter_inp) (void * const iter, kv_inp_func uf, void * const priv); + // invalidate the iter to release any resources or locks + // afterward, must call seek() again before accessing data + void (* iter_park) (void * const iter); + // destroy iter + void (* iter_destroy) (void * const iter); + + // misc: + // create refs for maps if required; always use use kvmap_ref() and kvmap_unref() + // if there are ref/unref functions, ref-ptr should be used as map for all kv operations + void * (* ref) (void * map); + // return the original map + void * (* unref) (void * ref); + // pause access without unref; must call resume later before access index again + void (* park) (void * ref); + // resume access of ref; must be paired with a park() + void (* resume) (void * ref); + + // UNSAFE functions: + // empty the map + void (* clean) (void * map); + // erase everything + void (* destroy) (void * map); + // for debugging + void (* fprint) (void * map, FILE * const out); +}; + +// registry +struct 
kvmap_api_reg { + int nargs; // number of arguments after name + const char * name; + const char * args_msg; // see ...helper_message + // multiple apis may share one create function + // arguments: name (e.g., "rdb"), mm (usually NULL), the remaining args + void * (*create)(const char *, const struct kvmap_mm *, char **); + const struct kvmap_api * api; +}; + +// call this function to register a kvmap_api + extern void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api); + + extern void +kvmap_api_helper_message(void); + + extern int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out); +// }}} kvmap_api + +// helpers {{{ + extern void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv); + + extern void * +kvmap_ref(const struct kvmap_api * const api, void * const map); + + extern void * +kvmap_unref(const struct kvmap_api * const api, void * const ref); + + extern struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out); + + extern bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv); + + extern bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + 
const struct kv * const key, kv_merge_func uf, void * const priv); + + extern u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end); + + extern void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * const key); + + extern struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out); + + extern bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr); + + extern u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd); + + extern bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out); + + extern bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value); + + extern bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key); + + extern bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const 
value_out); +// }}} helpers + +// }}} kvmap + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/lib.c b/run/MassTrie-beta/wormhole/lib.c new file mode 100644 index 00000000..06d45f6d --- /dev/null +++ b/run/MassTrie-beta/wormhole/lib.c @@ -0,0 +1,3026 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include "lib.h" +#include "ctypes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // va_start + +#if defined(__linux__) +#include +#include // malloc_usable_size +#elif defined(__APPLE__) && defined(__MACH__) +#include +#include +#elif defined(__FreeBSD__) +#include +#include +#endif // OS + +#if defined(__FreeBSD__) +#include +#endif +// }}} headers + +// math {{{ + inline u64 +mhash64(const u64 v) +{ + return v * 11400714819323198485lu; +} + + inline u32 +mhash32(const u32 v) +{ + return v * 2654435761u; +} + +// From Daniel Lemire's blog (2013, lemire.me) + u64 +gcd64(u64 a, u64 b) +{ + if (a == 0) + return b; + if (b == 0) + return a; + + const u32 shift = (u32)__builtin_ctzl(a | b); + a >>= __builtin_ctzl(a); + do { + b >>= __builtin_ctzl(b); + if (a > b) { + const u64 t = b; + b = a; + a = t; + } + b = b - a; + } while (b); + return a << shift; +} +// }}} math + +// random {{{ +// Lehmer's generator is 2x faster than xorshift +/** + * D. H. Lehmer, Mathematical methods in large-scale computing units. + * Proceedings of a Second Symposium on Large Scale Digital Calculating + * Machinery; + * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146. + * + * P L'Ecuyer, Tables of linear congruential generators of different sizes and + * good lattice structure. Mathematics of Computation of the American + * Mathematical + * Society 68.225 (1999): 249-260. 
+ */ +struct lehmer_u64 { + union { + u128 v128; + u64 v64[2]; + }; +}; + +static __thread struct lehmer_u64 rseed_u128 = {.v64 = {4294967291, 1549556881}}; + + static inline u64 +lehmer_u64_next(struct lehmer_u64 * const s) +{ + const u64 r = s->v64[1]; + s->v128 *= 0xda942042e4dd58b5lu; + return r; +} + + static inline void +lehmer_u64_seed(struct lehmer_u64 * const s, const u64 seed) +{ + s->v128 = (((u128)(~seed)) << 64) | (seed | 1); + (void)lehmer_u64_next(s); +} + + inline u64 +random_u64(void) +{ + return lehmer_u64_next(&rseed_u128); +} + + inline void +srandom_u64(const u64 seed) +{ + lehmer_u64_seed(&rseed_u128, seed); +} + + inline double +random_double(void) +{ + // random between [0.0 - 1.0] + const u64 r = random_u64(); + return ((double)r) * (1.0 / ((double)(~0lu))); +} +// }}} random + +// timing {{{ + inline u64 +time_nsec(void) +{ + struct timespec ts; + // MONO_RAW is 5x to 10x slower than MONO + clock_gettime(CLOCK_MONOTONIC, &ts); + return ((u64)ts.tv_sec) * 1000000000lu + ((u64)ts.tv_nsec); +} + + inline double +time_sec(void) +{ + const u64 nsec = time_nsec(); + return ((double)nsec) * 1.0e-9; +} + + inline u64 +time_diff_nsec(const u64 last) +{ + return time_nsec() - last; +} + + inline double +time_diff_sec(const double last) +{ + return time_sec() - last; +} + +// need char str[64] + void +time_stamp(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F %T %z", &nowtm); +} + + void +time_stamp2(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F-%H-%M-%S%z", &nowtm); +} +// }}} timing + +// cpucache {{{ + inline void +cpu_pause(void) +{ +#if defined(__x86_64__) + _mm_pause(); +#elif defined(__aarch64__) + // nop +#endif +} + + inline void +cpu_mfence(void) +{ + atomic_thread_fence(MO_SEQ_CST); +} + +// compiler fence + inline void +cpu_cfence(void) +{ + 
atomic_thread_fence(MO_ACQ_REL); +} + + inline void +cpu_prefetch0(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 0); +} + + inline void +cpu_prefetch1(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 1); +} + + inline void +cpu_prefetch2(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 2); +} + + inline void +cpu_prefetch3(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 3); +} + + inline void +cpu_prefetchw(const void * const ptr) +{ + __builtin_prefetch(ptr, 1, 0); +} +// }}} cpucache + +// crc32c {{{ + inline u32 +crc32c_u8(const u32 crc, const u8 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u8(crc, v); +#elif defined(__aarch64__) + return __crc32cb(crc, v); +#endif +} + + inline u32 +crc32c_u16(const u32 crc, const u16 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u16(crc, v); +#elif defined(__aarch64__) + return __crc32ch(crc, v); +#endif +} + + inline u32 +crc32c_u32(const u32 crc, const u32 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u32(crc, v); +#elif defined(__aarch64__) + return __crc32cw(crc, v); +#endif +} + + inline u32 +crc32c_u64(const u32 crc, const u64 v) +{ +#if defined(__x86_64__) + return (u32)_mm_crc32_u64(crc, v); +#elif defined(__aarch64__) + return (u32)__crc32cd(crc, v); +#endif +} + + inline u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc) +{ + if (nr == 1) + return crc32c_u8(crc, buf[0]); + + crc = crc32c_u16(crc, *(u16 *)buf); + return (nr == 2) ? crc : crc32c_u8(crc, buf[2]); +} + + inline u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc) +{ + //debug_assert((nr & 3) == 0); + const u32 nr8 = nr >> 3; +#pragma nounroll + for (u32 i = 0; i < nr8; i++) + crc = crc32c_u64(crc, ((u64*)buf)[i]); + + if (nr & 4u) + crc = crc32c_u32(crc, ((u32*)buf)[nr8<<1]); + return crc; +} + + u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc) +{ + crc = crc32c_inc_x4(buf, nr, crc); + const u32 nr123 = nr & 3u; + return nr123 ? 
crc32c_inc_123(buf + nr - nr123, nr123, crc) : crc; +} +// }}} crc32c + +// debug {{{ + void +debug_break(void) +{ + usleep(100); +} + +static u64 * debug_watch_u64 = NULL; + + static void +watch_u64_handler(const int sig) +{ + (void)sig; + const u64 v = debug_watch_u64 ? (*debug_watch_u64) : 0; + fprintf(stderr, "[USR1] %lu (0x%lx)\n", v, v); +} + + void +watch_u64_usr1(u64 * const ptr) +{ + debug_watch_u64 = ptr; + struct sigaction sa = {}; + sa.sa_handler = watch_u64_handler; + sigemptyset(&(sa.sa_mask)); + sa.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for SIGUSR1\n"); + } else { + fprintf(stderr, "to watch> kill -s SIGUSR1 %d\n", getpid()); + } +} + +static void * debug_bt_state = NULL; +#if defined(BACKTRACE) && defined(__linux__) +// TODO: get exec path on MacOS and FreeBSD + +#include +static char debug_filepath[1024] = {}; + + static void +debug_bt_error_cb(void * const data, const char * const msg, const int errnum) +{ + (void)data; + if (msg) + dprintf(2, "libbacktrace: %s %s\n", msg, strerror(errnum)); +} + + static int +debug_bt_print_cb(void * const data, const uintptr_t pc, + const char * const file, const int lineno, const char * const func) +{ + u32 * const plevel = (typeof(plevel))data; + if (file || func || lineno) { + dprintf(2, "[%u]0x%012lx " TERMCLR(35) "%s" TERMCLR(31) ":" TERMCLR(34) "%d" TERMCLR(0)" %s\n", + *plevel, pc, file ? file : "???", lineno, func ? 
func : "???"); + } else if (pc) { + dprintf(2, "[%u]0x%012lx ??\n", *plevel, pc); + } + (*plevel)++; + return 0; +} + +__attribute__((constructor)) + static void +debug_backtrace_init(void) +{ + const ssize_t len = readlink("/proc/self/exe", debug_filepath, 1023); + // disable backtrace + if (len < 0 || len >= 1023) + return; + + debug_filepath[len] = '\0'; + debug_bt_state = backtrace_create_state(debug_filepath, 1, debug_bt_error_cb, NULL); +} +#endif // BACKTRACE + + static void +debug_wait_gdb(void * const bt_state) +{ + if (bt_state) { +#if defined(BACKTRACE) + dprintf(2, "Backtrace :\n"); + u32 level = 0; + backtrace_full(debug_bt_state, 1, debug_bt_print_cb, debug_bt_error_cb, &level); +#endif // BACKTRACE + } else { // fallback to execinfo if no backtrace or initialization failed + void *array[64]; + const int size = backtrace(array, 64); + dprintf(2, "Backtrace (%d):\n", size - 1); + backtrace_symbols_fd(array + 1, size - 1, 2); + } + + abool v = true; + char timestamp[32]; + time_stamp(timestamp, 32); + char threadname[32] = {}; + thread_get_name(pthread_self(), threadname, 32); + strcat(threadname, "(!!)"); + thread_set_name(pthread_self(), threadname); + char hostname[32]; + gethostname(hostname, 32); + + const char * const pattern = "[Waiting GDB] %1$s %2$s @ %3$s\n" + " Attach me: " TERMCLR(31) "sudo -Hi gdb -p %4$d" TERMCLR(0) "\n"; + char buf[256]; + sprintf(buf, pattern, timestamp, threadname, hostname, getpid()); + write(2, buf, strlen(buf)); + + // to continue: gdb> set var v = 0 + // to kill from shell: $ kill %pid; kill -CONT %pid + + // uncomment this line to surrender the shell on error + // kill(getpid(), SIGSTOP); // stop burning cpu, once + + static au32 nr_waiting = 0; + const u32 seq = atomic_fetch_add_explicit(&nr_waiting, 1, MO_RELAXED); + if (seq == 0) { + sprintf(buf, "/run/user/%u/.debug_wait_gdb_pid", getuid()); + const int pidfd = open(buf, O_CREAT|O_TRUNC|O_WRONLY, 00644); + if (pidfd >= 0) { + dprintf(pidfd, "%u", getpid()); + 
close(pidfd); + } + } + +#pragma nounroll + while (atomic_load_explicit(&v, MO_CONSUME)) + sleep(1); +} + +#ifndef NDEBUG + void +debug_assert(const bool v) +{ + if (!v) + debug_wait_gdb(debug_bt_state); +} +#endif + +__attribute__((noreturn)) + void +debug_die(void) +{ + debug_wait_gdb(debug_bt_state); + exit(0); +} + +__attribute__((noreturn)) + void +debug_die_perror(void) +{ + perror(NULL); + debug_die(); +} + +#if !defined(NOSIGNAL) +// signal handler for wait_gdb on fatal errors + static void +wait_gdb_handler(const int sig, siginfo_t * const info, void * const context) +{ + (void)info; + (void)context; + char buf[64] = "[SIGNAL] "; + strcat(buf, strsignal(sig)); + write(2, buf, strlen(buf)); + debug_wait_gdb(NULL); +} + +// setup hooks for catching fatal errors +__attribute__((constructor)) + static void +debug_init(void) +{ + void * stack = pages_alloc_4kb(16); + //fprintf(stderr, "altstack %p\n", stack); + stack_t ss = {.ss_sp = stack, .ss_flags = 0, .ss_size = PGSZ*16}; + if (sigaltstack(&ss, NULL)) + fprintf(stderr, "sigaltstack failed\n"); + + struct sigaction sa = {.sa_sigaction = wait_gdb_handler, .sa_flags = SA_SIGINFO | SA_ONSTACK}; + sigemptyset(&(sa.sa_mask)); + const int fatals[] = {SIGSEGV, SIGFPE, SIGILL, SIGBUS, 0}; + for (int i = 0; fatals[i]; i++) { + if (sigaction(fatals[i], &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for %s\n", strsignal(fatals[i])); + fflush(stderr); + } + } +} + +__attribute__((destructor)) + static void +debug_exit(void) +{ + // to get rid of valgrind warnings + stack_t ss = {.ss_flags = SS_DISABLE}; + stack_t oss = {}; + sigaltstack(&ss, &oss); + if (oss.ss_sp) + pages_unmap(oss.ss_sp, PGSZ * 16); +} +#endif // !defined(NOSIGNAL) + + void +debug_dump_maps(FILE * const out) +{ + FILE * const in = fopen("/proc/self/smaps", "r"); + char * line0 = yalloc(1024); + size_t size0 = 1024; + while (!feof(in)) { + const ssize_t r1 = getline(&line0, &size0, in); + if (r1 < 0) break; + fprintf(out, "%s", 
line0); + } + free(line0); + fflush(out); + fclose(in); +} + +static pid_t perf_pid = 0; + +#if defined(__linux__) +__attribute__((constructor)) + static void +debug_perf_init(void) +{ + const pid_t ppid = getppid(); + char tmp[256] = {}; + sprintf(tmp, "/proc/%d/cmdline", ppid); + FILE * const fc = fopen(tmp, "r"); + const size_t nr = fread(tmp, 1, sizeof(tmp) - 1, fc); + fclose(fc); + // look for "perf record" + if (nr < 12) + return; + tmp[nr] = '\0'; + for (u64 i = 0; i < nr; i++) + if (tmp[i] == 0) + tmp[i] = ' '; + + char * const perf = strstr(tmp, "perf record"); + if (perf) { + fprintf(stderr, "%s: perf detected\n", __func__); + perf_pid = ppid; + } +} +#endif // __linux__ + + bool +debug_perf_switch(void) +{ + if (perf_pid > 0) { + kill(perf_pid, SIGUSR2); + return true; + } else { + return false; + } +} +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + bool +alloc_fail(void) +{ +#define ALLOCFAIL_RECP ((64lu)) +#define ALLOCFAIL_MAGIC ((ALLOCFAIL_RECP / 3lu)) + return ((random_u64() % ALLOCFAIL_RECP) == ALLOCFAIL_MAGIC); +} + +#ifdef MALLOCFAIL +extern void * __libc_malloc(size_t size); + void * +malloc(size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_malloc(size); +} + +extern void * __libc_calloc(size_t nmemb, size_t size); + void * +calloc(size_t nmemb, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_calloc(nmemb, size); +} + +extern void *__libc_realloc(void *ptr, size_t size); + + void * +realloc(void *ptr, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_realloc(ptr, size); +} +#endif // MALLOC_FAIL +#endif // ALLOC_FAIL + + void * +xalloc(const size_t align, const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, align, size) == 0) ? 
p : NULL; +} + +// alloc cache-line aligned address + void * +yalloc(const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, 64, size) == 0) ? p : NULL; +} + + void ** +malloc_2d(const size_t nr, const size_t size) +{ + const size_t size1 = nr * sizeof(void *); + const size_t size2 = nr * size; + void ** const mem = malloc(size1 + size2); + u8 * const mem2 = ((u8 *)mem) + size1; + for (size_t i = 0; i < nr; i++) + mem[i] = mem2 + (i * size); + return mem; +} + + inline void ** +calloc_2d(const size_t nr, const size_t size) +{ + void ** const ret = malloc_2d(nr, size); + memset(ret[0], 0, nr * size); + return ret; +} + + inline void +pages_unmap(void * const ptr, const size_t size) +{ +#ifndef HEAPCHECKING + munmap(ptr, size); +#else + (void)size; + free(ptr); +#endif +} + + void +pages_lock(void * const ptr, const size_t size) +{ + static bool use_mlock = true; + if (use_mlock) { + const int ret = mlock(ptr, size); + if (ret != 0) { + use_mlock = false; + fprintf(stderr, "%s: mlock disabled\n", __func__); + } + } +} + +#ifndef HEAPCHECKING + static void * +pages_do_alloc(const size_t size, const int flags) +{ + // vi /etc/security/limits.conf + // * - memlock unlimited + void * const p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (p == MAP_FAILED) + return NULL; + + pages_lock(p, size); + return p; +} + +#if defined(__linux__) && defined(MAP_HUGETLB) + +#if defined(MAP_HUGE_SHIFT) +#define PAGES_FLAGS_1G ((MAP_HUGETLB | (30 << MAP_HUGE_SHIFT))) +#define PAGES_FLAGS_2M ((MAP_HUGETLB | (21 << MAP_HUGE_SHIFT))) +#else // MAP_HUGE_SHIFT +#define PAGES_FLAGS_1G ((MAP_HUGETLB)) +#define PAGES_FLAGS_2M ((MAP_HUGETLB)) +#endif // MAP_HUGE_SHIFT + +#else +#define PAGES_FLAGS_1G ((0)) +#define PAGES_FLAGS_2M ((0)) +#endif // __linux__ + +#endif // HEAPCHECKING + + inline void * +pages_alloc_1gb(const size_t nr_1gb) +{ + const u64 sz = nr_1gb << 30; +#ifndef HEAPCHECKING + return 
pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_1G); +#else + void * const p = xalloc(1lu << 21, sz); // Warning: valgrind fails with 30 + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_2mb(const size_t nr_2mb) +{ + const u64 sz = nr_2mb << 21; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_2M); +#else + void * const p = xalloc(1lu << 21, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_4kb(const size_t nr_4kb) +{ + const size_t sz = nr_4kb << 12; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS); +#else + void * const p = xalloc(1lu << 12, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + // 1gb huge page: at least 0.25GB + if (try_1gb) { + if (size >= (1lu << 28)) { + const size_t nr_1gb = bits_round_up(size, 30) >> 30; + void * const p1 = pages_alloc_1gb(nr_1gb); + if (p1) { + *size_out = nr_1gb << 30; + return p1; + } + } + } + + // 2mb huge page: at least 0.5MB + if (size >= (1lu << 19)) { + const size_t nr_2mb = bits_round_up(size, 21) >> 21; + void * const p2 = pages_alloc_2mb(nr_2mb); + if (p2) { + *size_out = nr_2mb << 21; + return p2; + } + } + + const size_t nr_4kb = bits_round_up(size, 12) >> 12; + void * const p3 = pages_alloc_4kb(nr_4kb); + if (p3) + *size_out = nr_4kb << 12; + return p3; +} +// }}} mm + +// process/thread {{{ +static u32 process_ncpu; +#if defined(__FreeBSD__) +typedef cpuset_t cpu_set_t; +#elif defined(__APPLE__) && defined(__MACH__) +typedef u64 cpu_set_t; +#define CPU_SETSIZE ((64)) +#define CPU_COUNT(__cpu_ptr__) (__builtin_popcountl(*__cpu_ptr__)) +#define CPU_ISSET(__cpu_idx__, __cpu_ptr__) (((*__cpu_ptr__) >> __cpu_idx__) & 1lu) +#define CPU_ZERO(__cpu_ptr__) ((*__cpu_ptr__) = 0) +#define CPU_SET(__cpu_idx__, 
__cpu_ptr__) ((*__cpu_ptr__) |= (1lu << __cpu_idx__)) +#define CPU_CLR(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) &= ~(1lu << __cpu_idx__)) +#define pthread_attr_setaffinity_np(...) ((void)0) +#endif + +__attribute__((constructor)) + static void +process_init(void) +{ + // Linux's default is 1024 cpus + process_ncpu = (u32)sysconf(_SC_NPROCESSORS_CONF); + if (process_ncpu > CPU_SETSIZE) { + fprintf(stderr, "%s: can use only %zu cores\n", + __func__, (size_t)CPU_SETSIZE); + process_ncpu = CPU_SETSIZE; + } + thread_set_name(pthread_self(), "main"); +} + + static inline int +thread_getaffinity_set(cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_getaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + *cpuset = (1lu << process_ncpu) - 1; + return (int)process_ncpu; // TODO +#endif // OS +} + + static inline int +thread_setaffinity_set(const cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_setaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + (void)cpuset; // TODO + return 0; +#endif // OS +} + + void +thread_get_name(const pthread_t pt, char * const name, const size_t len) +{ +#if defined(__linux__) + pthread_getname_np(pt, name, len); +#elif defined(__FreeBSD__) + pthread_get_name_np(pt, name, len); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)len; + strcpy(name, "unknown"); // TODO +#endif // OS +} + + void +thread_set_name(const pthread_t pt, const char * const name) +{ +#if defined(__linux__) + pthread_setname_np(pt, name); +#elif defined(__FreeBSD__) + pthread_set_name_np(pt, name); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)name; // TODO +#endif // OS +} + +// kB + long 
+process_get_rss(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + return rs.ru_maxrss; +} + + u32 +process_affinity_count(void) +{ + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return process_ncpu; + + const u32 nr = (u32)CPU_COUNT(&set); + return nr ? nr : process_ncpu; +} + + u32 +process_getaffinity_list(const u32 max, u32 * const cores) +{ + memset(cores, 0, max * sizeof(cores[0])); + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return 0; + + const u32 nr_affinity = (u32)CPU_COUNT(&set); + const u32 nr = nr_affinity < max ? nr_affinity : max; + u32 j = 0; + for (u32 i = 0; i < process_ncpu; i++) { + if (CPU_ISSET(i, &set)) + cores[j++] = i; + + if (j >= nr) + break; + } + return j; +} + + void +thread_setaffinity_list(const u32 nr, const u32 * const list) +{ + cpu_set_t set; + CPU_ZERO(&set); + for (u32 i = 0; i < nr; i++) + if (list[i] < process_ncpu) + CPU_SET(list[i], &set); + thread_setaffinity_set(&set); +} + + void +thread_pin(const u32 cpu) +{ + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(cpu % process_ncpu, &set); + thread_setaffinity_set(&set); +} + + u64 +process_cpu_time_usec(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + const u64 usr = (((u64)rs.ru_utime.tv_sec) * 1000000lu) + ((u64)rs.ru_utime.tv_usec); + const u64 sys = (((u64)rs.ru_stime.tv_sec) * 1000000lu) + ((u64)rs.ru_stime.tv_usec); + return usr + sys; +} + +struct fork_join_info { + u32 total; + u32 ncores; + u32 * cores; + void *(*func)(void *); + bool args; + union { + void * arg1; + void ** argn; + }; + union { + struct { au32 ferr, jerr; }; + au64 xerr; + }; +}; + +// DON'T CHANGE! 
+#define FORK_JOIN_RANK_BITS ((16)) // 16 +#define FORK_JOIN_MAX ((1u << FORK_JOIN_RANK_BITS)) + +/* + * fj(6): T0 + * / \ + * T0 T4 + * / \ / + * T0 T2 T4 + * / \ / \ / \ + * t0 t1 t2 t3 t4 t5 + */ + +// recursive tree fork-join + static void * +thread_do_fork_join_worker(void * const ptr) +{ + struct entry13 fjp = {.ptr = ptr}; + // GCC: Without explicitly casting from fjp.fji (a 45-bit u64 value), + // the high bits will get truncated, which is always CORRECT in gcc. + // Don't use gcc. + struct fork_join_info * const fji = u64_to_ptr(fjp.e3); + const u32 rank = (u32)fjp.e1; + + const u32 nchild = (u32)__builtin_ctz(rank ? rank : bits_p2_up_u32(fji->total)); + debug_assert(nchild <= FORK_JOIN_RANK_BITS); + pthread_t tids[FORK_JOIN_RANK_BITS]; + if (nchild) { + cpu_set_t set; + CPU_ZERO(&set); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Joinable by default + // fork top-down + for (u32 i = nchild - 1; i < nchild; i--) { + const u32 cr = rank + (1u << i); // child's rank + if (cr >= fji->total) + continue; // should not break + const u32 core = fji->cores[(cr < fji->ncores) ? cr : (cr % fji->ncores)]; + CPU_SET(core, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + fjp.e1 = (u16)cr; + const int r = pthread_create(&tids[i], &attr, thread_do_fork_join_worker, fjp.ptr); + CPU_CLR(core, &set); + if (unlikely(r)) { // fork failed + memset(&tids[0], 0, sizeof(tids[0]) * (i+1)); + u32 nmiss = (1u << (i + 1)) - 1; + if ((rank + nmiss) >= fji->total) + nmiss = fji->total - 1 - rank; + (void)atomic_fetch_add_explicit(&fji->ferr, nmiss, MO_RELAXED); + break; + } + } + pthread_attr_destroy(&attr); + } + + char thname0[16]; + char thname1[16]; + thread_get_name(pthread_self(), thname0, 16); + snprintf(thname1, 16, "%.8s_%u", thname0, rank); + thread_set_name(pthread_self(), thname1); + + void * const ret = fji->func(fji->args ? 
fji->argn[rank] : fji->arg1); + + thread_set_name(pthread_self(), thname0); + // join bottom-up + for (u32 i = 0; i < nchild; i++) { + const u32 cr = rank + (1u << i); // child rank + if (cr >= fji->total) + break; // safe to break + if (tids[i]) { + const int r = pthread_join(tids[i], NULL); + if (unlikely(r)) { // error + //fprintf(stderr, "pthread_join %u..%u = %d: %s\n", rank, cr, r, strerror(r)); + (void)atomic_fetch_add_explicit(&fji->jerr, 1, MO_RELAXED); + } + } + } + return ret; +} + + u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx) +{ + if (unlikely(nr > FORK_JOIN_MAX)) { + fprintf(stderr, "%s reduce nr to %u\n", __func__, FORK_JOIN_MAX); + nr = FORK_JOIN_MAX; + } + + u32 cores[CPU_SETSIZE]; + u32 ncores = process_getaffinity_list(process_ncpu, cores); + if (unlikely(ncores == 0)) { // force to use all cores + ncores = process_ncpu; + for (u32 i = 0; i < process_ncpu; i++) + cores[i] = i; + } + if (unlikely(nr == 0)) + nr = ncores; + + // the compiler does not know fji can change since we cast &fji into fjp + struct fork_join_info fji = {.total = nr, .cores = cores, .ncores = ncores, + .func = func, .args = args, .arg1 = argx}; + const struct entry13 fjp = entry13(0, (u64)(&fji)); + + // save current affinity + cpu_set_t set0; + thread_getaffinity_set(&set0); + + // master thread shares thread0's core + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(fji.cores[0], &set); + thread_setaffinity_set(&set); + + const u64 t0 = time_nsec(); + (void)thread_do_fork_join_worker(fjp.ptr); + const u64 dt = time_diff_nsec(t0); + + // restore original affinity + thread_setaffinity_set(&set0); + + // check and report errors (unlikely) + if (atomic_load_explicit(&fji.xerr, MO_CONSUME)) + fprintf(stderr, "%s errors: fork %u join %u\n", __func__, fji.ferr, fji.jerr); + return dt; +} + + int +thread_create_at(const u32 cpu, pthread_t * const thread, + void *(*start_routine) (void *), void * const arg) +{ + const u32 cpu_id = (cpu < 
process_ncpu) ? cpu : (cpu % process_ncpu); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu_id, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + const int r = pthread_create(thread, &attr, start_routine, arg); + pthread_attr_destroy(&attr); + return r; +} +// }}} process/thread + +// locking {{{ + +// spinlock {{{ +#if defined(__linux__) +#define SPINLOCK_PTHREAD +#endif // __linux__ + +#if defined(SPINLOCK_PTHREAD) +static_assert(sizeof(pthread_spinlock_t) <= sizeof(spinlock), "spinlock size"); +#else // SPINLOCK_PTHREAD +static_assert(sizeof(au32) <= sizeof(spinlock), "spinlock size"); +#endif // SPINLOCK_PTHREAD + + void +spinlock_init(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_init(p, PTHREAD_PROCESS_PRIVATE); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} + + inline void +spinlock_lock(spinlock * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!spinlock_trylock(lock)) + corr_yield(); +#else // CORR +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_lock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; +#pragma nounroll + do { + if (atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0) + return; +#pragma nounroll + do { + cpu_pause(); + } while (atomic_load_explicit(p, MO_CONSUME)); + } while (true); +#endif // SPINLOCK_PTHREAD +#endif // CORR +} + + inline bool +spinlock_trylock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + return !pthread_spin_trylock(p); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + return atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0; +#endif // SPINLOCK_PTHREAD 
+} + + inline void +spinlock_unlock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_unlock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} +// }}} spinlock + +// pthread mutex {{{ +static_assert(sizeof(pthread_mutex_t) <= sizeof(mutex), "mutexlock size"); + inline void +mutex_init(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_init(p, NULL); +} + + inline void +mutex_lock(mutex * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!mutex_trylock(lock)) + corr_yield(); +#else + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_lock(p); // return value ignored +#endif +} + + inline bool +mutex_trylock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + return !pthread_mutex_trylock(p); // return value ignored +} + + inline void +mutex_unlock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_unlock(p); // return value ignored +} + + inline void +mutex_deinit(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_destroy(p); +} +// }}} pthread mutex + +// rwdep {{{ +// poor man's lockdep for rwlock +// per-thread lock list +// it calls debug_die() when local double-(un)locking is detected +// cyclic dependencies can be manually identified by looking at the two lists below in gdb +#ifdef RWDEP +#define RWDEP_NR ((16)) +__thread const rwlock * rwdep_readers[RWDEP_NR] = {}; +__thread const rwlock * rwdep_writers[RWDEP_NR] = {}; + + static void +rwdep_check(const rwlock * const lock) +{ + debug_assert(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) + debug_die(); + if (rwdep_writers[i] == lock) + debug_die(); + } +} +#endif // RWDEP + + static void +rwdep_lock_read(const rwlock * const lock) +{ +#ifdef RWDEP + 
rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == NULL) { + rwdep_readers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_read(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) { + rwdep_readers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_lock_write(const rwlock * const lock) +{ +#ifdef RWDEP + rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == NULL) { + rwdep_writers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_write(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == lock) { + rwdep_writers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} +// }}} rwlockdep + +// rwlock {{{ +typedef au32 lock_t; +typedef u32 lock_v; +static_assert(sizeof(lock_t) == sizeof(lock_v), "lock size"); +static_assert(sizeof(lock_t) <= sizeof(rwlock), "lock size"); + +#define RWLOCK_WSHIFT ((sizeof(lock_t) * 8 - 1)) +#define RWLOCK_WBIT ((((lock_v)1) << RWLOCK_WSHIFT)) + + inline void +rwlock_init(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + atomic_store_explicit(pvar, 0, MO_RELEASE); +} + + inline bool +rwlock_trylock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } else { + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; + } +} + + inline bool +rwlock_trylock_read_lp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) { + cpu_pause(); + return false; + } + return rwlock_trylock_read(lock); +} + +// actually nr + 1 + inline bool 
+rwlock_trylock_read_nr(rwlock * const lock, u16 nr) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + +#pragma nounroll + do { // someone already locked; wait for a little while + cpu_pause(); + if ((atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + } while (nr--); + + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; +} + + inline void +rwlock_lock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_read(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT); + } while (true); +} + + inline void +rwlock_unlock_read(rwlock * const lock) +{ + rwdep_unlock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, 1, MO_RELEASE); +} + + inline bool +rwlock_trylock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if ((v0 == 0) && atomic_compare_exchange_weak_explicit(pvar, &v0, RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + return true; + } else { + return false; + } +} + +// actually nr + 1 + inline bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME)); + } while (true); +} + + inline bool 
+rwlock_trylock_write_hp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if (v0 >> RWLOCK_WSHIFT) + return false; + + if (atomic_compare_exchange_weak_explicit(pvar, &v0, v0|RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + // WBIT successfully marked; must wait for readers to leave + if (v0) { // saw active readers +#pragma nounroll + while (atomic_load_explicit(pvar, MO_CONSUME) != RWLOCK_WBIT) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } + } + return true; + } else { + return false; + } +} + + inline bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write_hp(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write_hp(rwlock * const lock) +{ +#pragma nounroll + while (!rwlock_trylock_write_hp(lock)) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } +} + + inline void +rwlock_unlock_write(rwlock * const lock) +{ + rwdep_unlock_write(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, RWLOCK_WBIT, MO_RELEASE); +} + + inline void +rwlock_write_to_read(rwlock * const lock) +{ + rwdep_unlock_write(lock); + rwdep_lock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + // +R -W + atomic_fetch_add_explicit(pvar, ((lock_v)1) - RWLOCK_WBIT, MO_ACQ_REL); +} + +#undef RWLOCK_WSHIFT +#undef RWLOCK_WBIT +// }}} rwlock + +// }}} locking + +// coroutine {{{ + +// asm {{{ +#if defined(__x86_64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((6)) + +// for switch/exit: pass a return value to the target +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error 
Supported platforms: Linux/FreeBSD/Apple +#endif // OS + "push %rbp; push %rbx; push %r12;" + "push %r13; push %r14; push %r15;" + "mov %rsp, (%rdi);" + "mov %rsi, %rsp;" + "pop %r15; pop %r14; pop %r13;" + "pop %r12; pop %rbx; pop %rbp;" + "mov %rdx, %rax;" + "retq;" + ); + +#elif defined(__aarch64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((20)) +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "sub x8, sp, 160;" + "str x8, [x0];" + "stp x30, x19, [x8]; ldp x30, x19, [x1];" + "stp x20, x21, [x8, 16]; ldp x20, x21, [x1, 16];" + "stp x22, x23, [x8, 32]; ldp x22, x23, [x1, 32];" + "stp x24, x25, [x8, 48]; ldp x24, x25, [x1, 48];" + "stp x26, x27, [x8, 64]; ldp x26, x27, [x1, 64];" + "stp x28, x29, [x8, 80]; ldp x28, x29, [x1, 80];" + "stp d8, d9, [x8, 96]; ldp d8, d9, [x1, 96];" + "stp d10, d11, [x8, 112]; ldp d10, d11, [x1, 112];" + "stp d12, d13, [x8, 128]; ldp d12, d13, [x1, 128];" + "stp d14, d15, [x8, 144]; ldp d14, d15, [x1, 144];" + "add sp, x1, 160;" + "mov x0, x2;" + "br x30;" + ); + +extern void co_entry_aarch64(void); +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_entry_aarch64;" + ".type co_entry_aarch64, @function;" + "co_entry_aarch64:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_entry_aarch64;" + "_co_entry_aarch64:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "ldr x8, [sp, 0];" + "blr x8;" + "ldr x8, [sp, 8];" + "blr x8;" + "ldr x8, [sp, 16];" + "blr x8;" + ); +#else +#error supported CPUs: x86_64 or AArch64 +#endif // co_switch_stack x86_64 and aarch64 +// }}} asm + +// co {{{ +struct co { + u64 rsp; + void * priv; + u64 * host; // set host to NULL to exit + size_t stksz; 
+}; + +// not atomic: no concurrent access +// volatile: avoid caching of co_curr +static __thread struct co * volatile co_curr = NULL; // NULL in host + +// the stack sits under the struct co + static void +co_init(struct co * const co, void * func, void * priv, u64 * const host, + const u64 stksz, void * func_exit) +{ + debug_assert((stksz & 0x3f) == 0); // a multiple of 64 bytes + u64 * rsp = ((u64 *)co) - 4; + rsp[0] = (u64)func; + rsp[1] = (u64)func_exit; + rsp[2] = (u64)debug_die; + rsp[3] = 0; + + rsp -= CO_CONTEXT_SIZE; + +#if defined(__aarch64__) + rsp[0] = (u64)co_entry_aarch64; +#endif + + co->rsp = (u64)rsp; + co->priv = priv; + co->host = host; + co->stksz = stksz; +} + + static void +co_exit0(void) +{ + co_exit(0); +} + + struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct co); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct co * const co = (typeof(co))(mem + stksz); + co_init(co, func, priv, host, stksz, co_exit0); + return co; +} + + inline void +co_reuse(struct co * const co, void * func, void * priv, u64 * const host) +{ + co_init(co, func, priv, host, co->stksz, co_exit0); +} + + inline struct co * +co_fork(void * func, void * priv) +{ + return co_curr ? co_create(co_curr->stksz, func, priv, co_curr->host) : NULL; +} + + inline void * +co_priv(void) +{ + return co_curr ? co_curr->priv : NULL; +} + +// the host calls this to enter a coroutine. 
+ inline u64 +co_enter(struct co * const to, const u64 retval) +{ + debug_assert(co_curr == NULL); // must entry from the host + debug_assert(to && to->host); + u64 * const save = to->host; + co_curr = to; + const u64 ret = co_switch_stack(save, to->rsp, retval); + co_curr = NULL; + return ret; +} + +// switch from a coroutine to another coroutine +// co_curr must be valid +// the target will resume and receive the retval + inline u64 +co_switch_to(struct co * const to, const u64 retval) +{ + debug_assert(co_curr); + debug_assert(co_curr != to); + debug_assert(to && to->host); + struct co * const save = co_curr; + co_curr = to; + return co_switch_stack(&(save->rsp), to->rsp, retval); +} + +// switch from a coroutine to the host routine +// co_yield is now a c++ keyword... + inline u64 +co_back(const u64 retval) +{ + debug_assert(co_curr); + struct co * const save = co_curr; + co_curr = NULL; + return co_switch_stack(&(save->rsp), *(save->host), retval); +} + +#ifdef CO_STACK_CHECK + static void +co_stack_check(const u8 * const mem, const u64 stksz) +{ + const u64 * const mem64 = (typeof(mem64))mem; + const u64 size64 = stksz / sizeof(u64); + for (u64 i = 0; i < size64; i++) { + if (mem64[i] != 0x5c5c5c5c5c5c5c5clu) { + fprintf(stderr, "%s co stack usage: %lu/%lu\n", __func__, stksz - (i * sizeof(u64)), stksz); + break; + } + } +} +#endif // CO_STACK_CHECK + +// return to host and set host to NULL +__attribute__((noreturn)) + void +co_exit(const u64 retval) +{ + debug_assert(co_curr); +#ifdef CO_STACK_CHECK + const u64 stksz = co_curr->stksz; + u8 * const mem = ((u8 *)co_curr) - stksz; + co_stack_check(mem, stksz); +#endif // CO_STACK_CHECK + const u64 hostrsp = *(co_curr->host); + co_curr->host = NULL; + struct co * const save = co_curr; + co_curr = NULL; + (void)co_switch_stack(&(save->rsp), hostrsp, retval); + // return to co_enter + debug_die(); +} + +// host is set to NULL on exit + inline bool +co_valid(struct co * const co) +{ + return co->host != NULL; +} + 
+// return NULL on host + inline struct co * +co_self(void) +{ + return co_curr; +} + + inline void +co_destroy(struct co * const co) +{ + u8 * const mem = ((u8 *)co) - co->stksz; + free(mem); +} +// }}} co + +// corr {{{ +struct corr { + struct co co; + struct corr * next; + struct corr * prev; +}; + +// initial and link guest to the run-queue + struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, host, stksz, corr_exit); + co->next = co; + co->prev = co; + return co; +} + + struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, prev->co.host, stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; + return co; +} + + inline void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host) +{ + co_init(&(co->co), func, priv, host, co->co.stksz, corr_exit); + co->next = co; + co->prev = co; +} + + inline void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev) +{ + co_init(&(co->co), func, priv, prev->co.host, co->co.stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; +} + + inline void +corr_enter(struct corr * const co) +{ + 
(void)co_enter(&(co->co), 0); +} + + inline void +corr_yield(void) +{ + struct corr * const curr = (typeof(curr))co_curr; + if (curr && (curr->next != curr)) + (void)co_switch_to(&(curr->next->co), 0); +} + +__attribute__((noreturn)) + inline void +corr_exit(void) +{ + debug_assert(co_curr); +#ifdef CO_STACK_CHECK + const u64 stksz = co_curr->stksz; + const u8 * const mem = ((u8 *)(co_curr)) - stksz; + co_stack_check(mem, stksz); +#endif // CO_STACK_CHECK + + struct corr * const curr = (typeof(curr))co_curr; + if (curr->next != curr) { // have more corr + struct corr * const next = curr->next; + struct corr * const prev = curr->prev; + next->prev = prev; + prev->next = next; + curr->next = NULL; + curr->prev = NULL; + curr->co.host = NULL; // invalidate + (void)co_switch_to(&(next->co), 0); + } else { // the last corr + co_exit0(); + } + debug_die(); +} + + inline void +corr_destroy(struct corr * const co) +{ + co_destroy(&(co->co)); +} +// }}} corr + +// }}} co + +// bits {{{ + inline u32 +bits_reverse_u32(const u32 v) +{ + const u32 v2 = __builtin_bswap32(v); + const u32 v3 = ((v2 & 0xf0f0f0f0u) >> 4) | ((v2 & 0x0f0f0f0fu) << 4); + const u32 v4 = ((v3 & 0xccccccccu) >> 2) | ((v3 & 0x33333333u) << 2); + const u32 v5 = ((v4 & 0xaaaaaaaau) >> 1) | ((v4 & 0x55555555u) << 1); + return v5; +} + + inline u64 +bits_reverse_u64(const u64 v) +{ + const u64 v2 = __builtin_bswap64(v); + const u64 v3 = ((v2 & 0xf0f0f0f0f0f0f0f0lu) >> 4) | ((v2 & 0x0f0f0f0f0f0f0f0flu) << 4); + const u64 v4 = ((v3 & 0xcccccccccccccccclu) >> 2) | ((v3 & 0x3333333333333333lu) << 2); + const u64 v5 = ((v4 & 0xaaaaaaaaaaaaaaaalu) >> 1) | ((v4 & 0x5555555555555555lu) << 1); + return v5; +} + + inline u64 +bits_rotl_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v << sh) | (v >> (64 - sh)); +} + + inline u64 +bits_rotr_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v >> sh) | (v << (64 - sh)); +} + + inline u32 +bits_rotl_u32(const u32 v, const u8 n) +{ + 
const u8 sh = n & 0x1f; + return (v << sh) | (v >> (32 - sh)); +} + + inline u32 +bits_rotr_u32(const u32 v, const u8 n) +{ + const u8 sh = n & 0x1f; + return (v >> sh) | (v << (32 - sh)); +} + + inline u64 +bits_p2_up_u64(const u64 v) +{ + // clz(0) is undefined + return (v > 1) ? (1lu << (64 - __builtin_clzl(v - 1lu))) : v; +} + + inline u32 +bits_p2_up_u32(const u32 v) +{ + // clz(0) is undefined + return (v > 1) ? (1u << (32 - __builtin_clz(v - 1u))) : v; +} + + inline u64 +bits_p2_down_u64(const u64 v) +{ + return v ? (1lu << (63 - __builtin_clzl(v))) : v; +} + + inline u32 +bits_p2_down_u32(const u32 v) +{ + return v ? (1u << (31 - __builtin_clz(v))) : v; +} + + inline u64 +bits_round_up(const u64 v, const u8 power) +{ + return (v + (1lu << power) - 1lu) >> power << power; +} + + inline u64 +bits_round_up_a(const u64 v, const u64 a) +{ + return (v + a - 1) / a * a; +} + + inline u64 +bits_round_down(const u64 v, const u8 power) +{ + return v >> power << power; +} + + inline u64 +bits_round_down_a(const u64 v, const u64 a) +{ + return v / a * a; +} +// }}} bits + +// vi128 {{{ +#if defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH __attribute__ ((fallthrough)) +#else +#define FALLTHROUGH ((void)0) +#endif /* __GNUC__ >= 7 */ + + inline u32 +vi128_estimate_u32(const u32 v) +{ + static const u8 t[] = {5,5,5,5, + 4,4,4,4,4,4,4, 3,3,3,3,3,3,3, + 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clz(v)] : 2; + // 0 -> [0x80 0x00] the first byte is non-zero + + // nz bit range -> enc length offset in t[] + // 0 -> 2 special case + // 1 to 7 -> 1 31 to 25 + // 8 to 14 -> 2 24 to 18 + // 15 to 21 -> 3 17 to 11 + // 22 to 28 -> 4 10 to 4 + // 29 to 32 -> 5 3 to 0 +} + + u8 * +vi128_encode_u32(u8 * dst, u32 v) +{ + switch (vi128_estimate_u32(v)) { + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + + const u8 * +vi128_decode_u32(const u8 * src, u32 * const out) +{ + debug_assert(*src); + u32 r = 0; + for (u32 shift = 0; shift < 32; shift += 7) { + const u8 byte = *(src++); + r |= (((u32)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + + inline u32 +vi128_estimate_u64(const u64 v) +{ + static const u8 t[] = {10, + 9,9,9,9,9,9,9, 8,8,8,8,8,8,8, 7,7,7,7,7,7,7, + 6,6,6,6,6,6,6, 5,5,5,5,5,5,5, 4,4,4,4,4,4,4, + 3,3,3,3,3,3,3, 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clzl(v)] : 2; +} + +// return ptr after the generated bytes + u8 * +vi128_encode_u64(u8 * dst, u64 v) +{ + switch (vi128_estimate_u64(v)) { + case 10: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 9: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 8: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 7: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 6: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + +// return ptr after the consumed bytes + const u8 * +vi128_decode_u64(const u8 * src, u64 * const out) +{ + u64 r = 0; + for (u32 shift = 0; shift < 64; shift += 7) { + const u8 byte = *(src++); + r |= (((u64)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + +#undef FALLTHROUGH +// }}} vi128 + +// misc {{{ + inline struct entry13 +entry13(const u16 e1, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + return (struct entry13){.v64 = (e3 << 16) | e1}; +} + + inline void +entry13_update_e3(struct entry13 * const e, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + *e = entry13(e->e1, e3); +} + + inline void * +u64_to_ptr(const u64 v) +{ + return (void *)v; +} + + inline u64 +ptr_to_u64(const void * const ptr) +{ + return (u64)ptr; +} + +// portable malloc_usable_size + inline size_t +m_usable_size(void * const ptr) +{ +#if defined(__linux__) || defined(__FreeBSD__) + const size_t sz = malloc_usable_size(ptr); +#elif defined(__APPLE__) && defined(__MACH__) + const size_t sz = malloc_size(ptr); +#endif // OS + +#ifndef HEAPCHECKING + // valgrind and asan may return unaligned 
usable size + debug_assert((sz & 0x7lu) == 0); +#endif // HEAPCHECKING + + return sz; +} + + inline size_t +fdsize(const int fd) +{ + struct stat st; + st.st_size = 0; + if (fstat(fd, &st) != 0) + return 0; + + if (S_ISBLK(st.st_mode)) { +#if defined(__linux__) + ioctl(fd, BLKGETSIZE64, &st.st_size); +#elif defined(__APPLE__) && defined(__MACH__) + u64 blksz = 0; + u64 nblks = 0; + ioctl(fd, DKIOCGETBLOCKSIZE, &blksz); + ioctl(fd, DKIOCGETBLOCKCOUNT, &nblks); + st.st_size = (ssize_t)(blksz * nblks); +#elif defined(__FreeBSD__) + ioctl(fd, DIOCGMEDIASIZE, &st.st_size); +#endif // OS + } + + return (size_t)st.st_size; +} + + u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max) +{ + const u32 max64 = max & (~7u); + u32 clen = 0; + while (clen < max64) { + const u64 v1 = *(const u64 *)(p1+clen); + const u64 v2 = *(const u64 *)(p2+clen); + const u64 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctzl(x) >> 3); + + clen += sizeof(u64); + } + + if ((clen + sizeof(u32)) <= max) { + const u32 v1 = *(const u32 *)(p1+clen); + const u32 v2 = *(const u32 *)(p2+clen); + const u32 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctz(x) >> 3); + + clen += sizeof(u32); + } + + while ((clen < max) && (p1[clen] == p2[clen])) + clen++; + return clen; +} + +static double logger_t0 = 0.0; + +__attribute__((constructor)) + static void +logger_init(void) +{ + logger_t0 = time_sec(); +} + +__attribute__ ((format (printf, 2, 3))) + void +logger_printf(const int fd, const char * const fmt, ...) 
+{ + char buf[4096]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + dprintf(fd, "%010.3lf %08x %s", time_diff_sec(logger_t0), crc32c_u64(0x12345678, (u64)pthread_self()), buf); +} +// }}} misc + +// astk {{{ +// atomic stack +struct acell { struct acell * next; }; + +// extract ptr from m value + static inline struct acell * +astk_ptr(const u64 m) +{ + return (struct acell *)(m >> 16); +} + +// calculate the new magic + static inline u64 +astk_m1(const u64 m0, struct acell * const ptr) +{ + return ((m0 + 1) & 0xfffflu) | (((u64)ptr) << 16); +} + +// calculate the new magic + static inline u64 +astk_m1_unsafe(struct acell * const ptr) +{ + return ((u64)ptr) << 16; +} + + static bool +astk_try_push(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1(m0, first); + return atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_RELEASE, MO_RELAXED); +} + + static void +astk_push_safe(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + while (!astk_try_push(pmagic, first, last)); +} + + static void +astk_push_unsafe(au64 * const pmagic, struct acell * const first, + struct acell * const last) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1_unsafe(first); + atomic_store_explicit(pmagic, m1, MO_RELAXED); +} + +//// can fail for two reasons: (1) NULL: no available object; (2) ~0lu: contention +// static void * +//astk_try_pop(au64 * const pmagic) +//{ +// u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); +// struct acell * const ret = astk_ptr(m0); +// if (ret == NULL) +// return NULL; +// +// const u64 m1 = astk_m1(m0, ret->next); +// if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) +// return ret; +// else +// return (void *)(~0lu); +//} + + static void * 
+astk_pop_safe(au64 * const pmagic) +{ + do { + u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + struct acell * const ret = astk_ptr(m0); + if (ret == NULL) + return NULL; + + const u64 m1 = astk_m1(m0, ret->next); + if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) + return ret; + } while (true); +} + + static void * +astk_pop_unsafe(au64 * const pmagic) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + struct acell * const ret = astk_ptr(m0); + if (ret == NULL) + return NULL; + + const u64 m1 = astk_m1_unsafe(ret->next); + atomic_store_explicit(pmagic, m1, MO_RELAXED); + return (void *)ret; +} + + static void * +astk_peek_unsafe(au64 * const pmagic) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + return astk_ptr(m0); +} +// }}} astk + +// slab {{{ +#define SLAB_OBJ0_OFFSET ((64)) +struct slab { + au64 magic; // hi 48: ptr, lo 16: seq + u64 padding1[7]; + + // 2nd line + struct acell * head_active; // list of blocks in use or in magic + struct acell * head_backup; // list of unused full blocks + u64 nr_ready; // UNSAFE only! 
number of objects under magic + u64 padding2[5]; + + // 3rd line const + u64 obj_size; // const: aligned size of each object + u64 blk_size; // const: size of each memory block + u64 objs_per_slab; // const: number of objects in a slab + u64 obj0_offset; // const: offset of the first object in a block + u64 padding3[4]; + + // 4th line + union { + mutex lock; + u64 padding4[8]; + }; +}; +static_assert(sizeof(struct slab) == 256, "sizeof(struct slab) != 256"); + + static void +slab_add(struct slab * const slab, struct acell * const blk, const bool is_safe) +{ + // insert into head_active + blk->next = slab->head_active; + slab->head_active = blk; + + u8 * const base = ((u8 *)blk) + slab->obj0_offset; + struct acell * iter = (typeof(iter))base; // [0] + for (u64 i = 1; i < slab->objs_per_slab; i++) { + struct acell * const next = (typeof(next))(base + (i * slab->obj_size)); + iter->next = next; + iter = next; + } + + // base points to the first block; iter points to the last block + if (is_safe) { // other threads can poll magic + astk_push_safe(&slab->magic, (struct acell *)base, iter); + } else { // unsafe + astk_push_unsafe(&slab->magic, (struct acell *)base, iter); + slab->nr_ready += slab->objs_per_slab; + } +} + +// critical section; call with lock + static bool +slab_expand(struct slab * const slab, const bool is_safe) +{ + struct acell * const old = slab->head_backup; + if (old) { // pop old from backup and add + slab->head_backup = old->next; + slab_add(slab, old, is_safe); + } else { // more core + size_t blk_size; + struct acell * const new = pages_alloc_best(slab->blk_size, true, &blk_size); + (void)blk_size; + if (new == NULL) + return false; + + slab_add(slab, new, is_safe); + } + return true; +} + +// return 0 on failure; otherwise, obj0_offset + static u64 +slab_check_sizes(const u64 obj_size, const u64 blk_size) +{ + // obj must be non-zero and 8-byte aligned + // blk must be at least of page size and power of 2 + if ((!obj_size) || (obj_size % 8lu) 
|| (blk_size < 4096lu) || (blk_size & (blk_size - 1))) + return 0; + + // each slab should have at least one object + const u64 obj0_offset = (obj_size & (obj_size - 1)) ? SLAB_OBJ0_OFFSET : obj_size; + if (obj0_offset >= blk_size || (blk_size - obj0_offset) < obj_size) + return 0; + + return obj0_offset; +} + + static void +slab_init_internal(struct slab * const slab, const u64 obj_size, const u64 blk_size, const u64 obj0_offset) +{ + memset(slab, 0, sizeof(*slab)); + slab->obj_size = obj_size; + slab->blk_size = blk_size; + slab->objs_per_slab = (blk_size - obj0_offset) / obj_size; + debug_assert(slab->objs_per_slab); // >= 1 + slab->obj0_offset = obj0_offset; + mutex_init(&(slab->lock)); +} + + struct slab * +slab_create(const u64 obj_size, const u64 blk_size) +{ + const u64 obj0_offset = slab_check_sizes(obj_size, blk_size); + if (!obj0_offset) + return NULL; + + struct slab * const slab = yalloc(sizeof(*slab)); + if (slab == NULL) + return NULL; + + slab_init_internal(slab, obj_size, blk_size, obj0_offset); + return slab; +} + +// unsafe + bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr) +{ + while (slab->nr_ready < nr) + if (!slab_expand(slab, false)) + return false; + return true; +} + + void * +slab_alloc_unsafe(struct slab * const slab) +{ + void * ret = astk_pop_unsafe(&slab->magic); + if (ret == NULL) { + if (!slab_expand(slab, false)) + return NULL; + ret = astk_pop_unsafe(&slab->magic); + } + debug_assert(ret); + slab->nr_ready--; + return ret; +} + + void * +slab_alloc_safe(struct slab * const slab) +{ + void * ret = astk_pop_safe(&slab->magic); + if (ret) + return ret; + + mutex_lock(&slab->lock); + do { + ret = astk_pop_safe(&slab->magic); // may already have new objs + if (ret) + break; + if (!slab_expand(slab, true)) + break; + } while (true); + mutex_unlock(&slab->lock); + return ret; +} + + void +slab_free_unsafe(struct slab * const slab, void * const ptr) +{ + debug_assert(ptr); + astk_push_unsafe(&slab->magic, ptr, ptr); + 
slab->nr_ready++; +} + + void +slab_free_safe(struct slab * const slab, void * const ptr) +{ + astk_push_safe(&slab->magic, ptr, ptr); +} + +// UNSAFE + void +slab_free_all(struct slab * const slab) +{ + slab->magic = 0; + slab->nr_ready = 0; // backup does not count + + if (slab->head_active) { + struct acell * iter = slab->head_active; + while (iter->next) + iter = iter->next; + // now iter points to the last blk + iter->next = slab->head_backup; // active..backup + slab->head_backup = slab->head_active; // backup gets all + slab->head_active = NULL; // empty active + } +} + +// unsafe + u64 +slab_get_nalloc(struct slab * const slab) +{ + struct acell * iter = slab->head_active; + u64 n = 0; + while (iter) { + n++; + iter = iter->next; + } + n *= slab->objs_per_slab; + + iter = astk_peek_unsafe(&slab->magic); + while (iter) { + n--; + iter = iter->next; + } + return n; +} + + static void +slab_deinit(struct slab * const slab) +{ + debug_assert(slab); + struct acell * iter = slab->head_active; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } + iter = slab->head_backup; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } +} + + void +slab_destroy(struct slab * const slab) +{ + slab_deinit(slab); + free(slab); +} +// }}} slab + +// string {{{ +static union { u16 v16; u8 v8[2]; } strdec_table[100]; + +__attribute__((constructor)) + static void +strdec_init(void) +{ + for (u8 i = 0; i < 100; i++) { + const u8 hi = (typeof(hi))('0' + (i / 10)); + const u8 lo = (typeof(lo))('0' + (i % 10)); + strdec_table[i].v8[0] = hi; + strdec_table[i].v8[1] = lo; + } +} + +// output 10 bytes + void +strdec_32(void * const out, const u32 v) +{ + u32 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 4; i <= 4; i--) { // x5 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100u; + } +} + +// output 20 bytes + void +strdec_64(void * const out, const u64 
v) +{ + u64 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 9; i <= 9; i--) { // x10 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100; + } +} + +static const u8 strhex_table_16[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + +#if defined(__x86_64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + + const m128 tmp = _mm_set_epi64x((s64)(v>>4), (s64)v); // mm want s64 + const m128 hilo = _mm_and_si128(tmp, _mm_set1_epi8(0xf)); + const m128 bin = _mm_shuffle_epi8(hilo, _mm_load_si128((void *)mask1)); + const m128 str = _mm_shuffle_epi8(_mm_load_si128((const void *)strhex_table_16), bin); + return str; +} +#elif defined(__aarch64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + u64 v2[2] = {v, v>>4}; + const m128 tmp = vld1q_u8((u8 *)v2); + const m128 hilo = vandq_u8(tmp, vdupq_n_u8(0xf)); + const m128 bin = vqtbl1q_u8(hilo, vld1q_u8(mask1)); + const m128 str = vqtbl1q_u8(vld1q_u8(strhex_table_16), bin); + return str; +} +#else +static u16 strhex_table_256[256]; + +__attribute__((constructor)) + static void +strhex_init(void) +{ + for (u64 i = 0; i < 256; i++) + strhex_table_256[i] = (((u16)strhex_table_16[i & 0xf]) << 8) | (strhex_table_16[i>>4]); +} +#endif // __x86_64__ + +// output 8 bytes + void +strhex_32(void * const out, u32 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper((u64)v); + _mm_storel_epi64(out, _mm_srli_si128(str, 8)); +#elif defined(__aarch64__) + const m128 str = strhex_helper((u64)v); + vst1q_lane_u64(out, vreinterpretq_u64_u8(str), 1); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 4; i++) { + ptr[3-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// output 16 bytes // buffer must be aligned by 16B + void +strhex_64(void * const out, u64 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper(v); 
+ _mm_storeu_si128(out, str); +#elif defined(__aarch64__) + const m128 str = strhex_helper(v); + vst1q_u8(out, str); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 8; i++) { + ptr[7-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// string to u64 + inline u64 +a2u64(const void * const str) +{ + return strtoull(str, NULL, 10); +} + + inline u32 +a2u32(const void * const str) +{ + return (u32)strtoull(str, NULL, 10); +} + + inline s64 +a2s64(const void * const str) +{ + return strtoll(str, NULL, 10); +} + + inline s32 +a2s32(const void * const str) +{ + return (s32)strtoll(str, NULL, 10); +} + + void +str_print_hex(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 3; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + buf[i*3] = ' '; + buf[i*3+1] = strhex_table_16[ptr[i]>>4]; + buf[i*3+2] = strhex_table_16[ptr[i] & 0xf]; + } + fwrite(buf, strsz, 1, out); + free(buf); +} + + void +str_print_dec(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 4; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + const u8 v = ptr[i]; + buf[i*4] = ' '; + const u8 v1 = v / 100u; + const u8 v23 = v % 100u; + buf[i*4+1] = (u8)'0' + v1; + buf[i*4+2] = (u8)'0' + (v23 / 10u); + buf[i*4+3] = (u8)'0' + (v23 % 10u); + } + fwrite(buf, strsz, 1, out); + free(buf); +} + +// returns a NULL-terminated list of string tokens. +// After use you only need to free the returned pointer (char **). 
+ char ** +strtoks(const char * const str, const char * const delim) +{ + if (str == NULL) + return NULL; + size_t nptr_alloc = 32; + char ** tokens = malloc(sizeof(tokens[0]) * nptr_alloc); + if (tokens == NULL) + return NULL; + const size_t bufsize = strlen(str) + 1; + char * const buf = malloc(bufsize); + if (buf == NULL) + goto fail_buf; + + memcpy(buf, str, bufsize); + char * saveptr = NULL; + char * tok = strtok_r(buf, delim, &saveptr); + size_t ntoks = 0; + while (tok) { + if (ntoks >= nptr_alloc) { + nptr_alloc += 32; + char ** const r = realloc(tokens, sizeof(tokens[0]) * nptr_alloc); + if (r == NULL) + goto fail_realloc; + + tokens = r; + } + tokens[ntoks] = tok; + ntoks++; + tok = strtok_r(NULL, delim, &saveptr); + } + tokens[ntoks] = NULL; + const size_t nptr = ntoks + 1; // append a NULL + const size_t rsize = (sizeof(tokens[0]) * nptr) + bufsize; + char ** const r = realloc(tokens, rsize); + if (r == NULL) + goto fail_realloc; + + tokens = r; + char * const dest = (char *)(&(tokens[nptr])); + memcpy(dest, buf, bufsize); + for (u64 i = 0; i < ntoks; i++) + tokens[i] += (dest - buf); + + free(buf); + return tokens; + +fail_realloc: + free(buf); +fail_buf: + free(tokens); + return NULL; +} + + u32 +strtoks_count(const char * const * const toks) +{ + if (!toks) + return 0; + u32 n = 0; + while (toks[n++]); + return n; +} +// }}} string + +// qsbr {{{ +#define QSBR_STATES_NR ((23)) // shard capacity; valid values are 3*8-1 == 23; 5*8-1 == 39; 7*8-1 == 55 +#define QSBR_SHARD_BITS ((5)) // 2^n shards +#define QSBR_SHARD_NR (((1u) << QSBR_SHARD_BITS)) +#define QSBR_SHARD_MASK ((QSBR_SHARD_NR - 1)) + +struct qsbr_ref_real { +#ifdef QSBR_DEBUG + pthread_t ptid; // 8 + u32 status; // 4 + u32 nbt; // 4 (number of backtrace frames) +#define QSBR_DEBUG_BTNR ((14)) + void * backtrace[QSBR_DEBUG_BTNR]; +#endif + au64 qstate; // user updates it + au64 * pptr; // internal only + struct qsbr_ref_real * park; +}; + +static_assert(sizeof(struct qsbr_ref) == sizeof(struct 
qsbr_ref_real), "sizeof qsbr_ref"); + +// Quiescent-State-Based Reclamation RCU +struct qsbr { + struct qsbr_ref_real target; + u64 padding0[5]; + struct qshard { + au64 bitmap; + au64 ptrs[QSBR_STATES_NR]; + } shards[QSBR_SHARD_NR]; +}; + + struct qsbr * +qsbr_create(void) +{ + struct qsbr * const q = yalloc(sizeof(*q)); + memset(q, 0, sizeof(*q)); + return q; +} + + static inline struct qshard * +qsbr_shard(struct qsbr * const q, void * const ptr) +{ + const u32 sid = crc32c_u64(0, (u64)ptr) & QSBR_SHARD_MASK; + debug_assert(sid < QSBR_SHARD_NR); + return &(q->shards[sid]); +} + + static inline void +qsbr_write_qstate(struct qsbr_ref_real * const ref, const u64 v) +{ + atomic_store_explicit(&ref->qstate, v, MO_RELAXED); +} + + bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + qsbr_write_qstate(ref, 0); + + do { + u64 bits = atomic_load_explicit(&shard->bitmap, MO_CONSUME); + const u32 pos = (u32)__builtin_ctzl(~bits); + if (unlikely(pos >= QSBR_STATES_NR)) + return false; + + const u64 bits1 = bits | (1lu << pos); + if (atomic_compare_exchange_weak_explicit(&shard->bitmap, &bits, bits1, MO_ACQUIRE, MO_RELAXED)) { + atomic_store_explicit(&shard->ptrs[pos], (u64)ref, MO_RELAXED); + //shard->ptrs[pos] = ref; + + ref->pptr = &(shard->ptrs[pos]); + ref->park = &q->target; +#ifdef QSBR_DEBUG + ref->ptid = (u64)pthread_self(); + ref->tid = 0; + ref->status = 1; + ref->nbt = backtrace(ref->backtrace, QSBR_DEBUG_BTNR); +#endif + return true; + } + } while (true); +} + + void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + const u32 pos = (u32)(ref->pptr - shard->ptrs); + debug_assert(pos < QSBR_STATES_NR); + debug_assert(shard->bitmap & (1lu << pos)); + + atomic_store_explicit(&shard->ptrs[pos], 
(u64)(&q->target), MO_RELAXED); + //shard->ptrs[pos] = &q->target; + (void)atomic_fetch_and_explicit(&shard->bitmap, ~(1lu << pos), MO_RELEASE); +#ifdef QSBR_DEBUG + ref->tid = 0; + ref->ptid = 0; + ref->status = 0xffff; // unregistered + ref->nbt = 0; +#endif + ref->pptr = NULL; + // wait for qsbr_wait to leave if it's working on the shard + while (atomic_load_explicit(&shard->bitmap, MO_CONSUME) >> 63) + cpu_pause(); +} + + inline void +qsbr_update(struct qsbr_ref * const qref, const u64 v) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + debug_assert((*ref->pptr) == (u64)ref); // must be unparked + // rcu update does not require release or acquire order + qsbr_write_qstate(ref, v); +} + + inline void +qsbr_park(struct qsbr_ref * const qref) +{ + cpu_cfence(); + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref->park, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xfff; // parked +#endif +} + + inline void +qsbr_resume(struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xf; // resumed +#endif + cpu_cfence(); +} + +// waiters needs external synchronization + void +qsbr_wait(struct qsbr * const q, const u64 target) +{ + cpu_cfence(); + qsbr_write_qstate(&q->target, target); + u64 cbits = 0; // check-bits; each bit corresponds to a shard + u64 bms[QSBR_SHARD_NR]; // copy of all bitmap + // take an unsafe snapshot of active users + for (u32 i = 0; i < QSBR_SHARD_NR; i++) { + bms[i] = atomic_load_explicit(&q->shards[i].bitmap, MO_CONSUME); + if (bms[i]) + cbits |= (1lu << i); // set to 1 if [i] has ptrs + } + + while (cbits) { + for (u64 ctmp = cbits; ctmp; ctmp &= (ctmp - 1)) { + // shard id + const u32 i = (u32)__builtin_ctzl(ctmp); + struct qshard * const shard = &(q->shards[i]); + const u64 bits1 = atomic_fetch_or_explicit(&(shard->bitmap), 1lu << 63, MO_ACQUIRE); + for (u64 
bits = bms[i]; bits; bits &= (bits - 1)) { + const u64 bit = bits & -bits; // extract lowest bit + if ((bits1 & bit) == 0) { + bms[i] &= ~bit; + } else { + au64 * pptr = &(shard->ptrs[__builtin_ctzl(bit)]); + struct qsbr_ref_real * const ptr = (typeof(ptr))atomic_load_explicit(pptr, MO_RELAXED); + if (atomic_load_explicit(&(ptr->qstate), MO_CONSUME) == target) + bms[i] &= ~bit; + } + } + (void)atomic_fetch_and_explicit(&(shard->bitmap), ~(1lu << 63), MO_RELEASE); + if (bms[i] == 0) + cbits &= ~(1lu << i); + } +#if defined(CORR) + corr_yield(); +#endif + } + debug_assert(cbits == 0); + cpu_cfence(); +} + + void +qsbr_destroy(struct qsbr * const q) +{ + if (q) + free(q); +} +#undef QSBR_STATES_NR +#undef QSBR_BITMAP_NR +// }}} qsbr + +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/lib.h b/run/MassTrie-beta/wormhole/lib.h new file mode 100644 index 00000000..40a2f40d --- /dev/null +++ b/run/MassTrie-beta/wormhole/lib.h @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +// includes {{{ +// C headers +#include <assert.h> +#include <errno.h> +#include <inttypes.h> +#include <math.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +// POSIX headers +#include <fcntl.h> +#include <pthread.h> +#include <unistd.h> + +// Linux headers +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/types.h> + +// SIMD +#if defined(__x86_64__) +#include <x86intrin.h> +#elif defined(__aarch64__) +#include <arm_acle.h> +#include <arm_neon.h> +#endif +// }}} includes + +#ifdef __cplusplus +extern "C" { +#endif + +// types {{{ +#ifndef typeof +#define typeof __typeof__ +#endif +#ifndef asm +#define asm __asm__ +#endif +typedef char s8; +typedef short s16; +typedef int s32; +typedef long s64; +typedef __int128_t s128; +static_assert(sizeof(s8) == 1, "sizeof(s8)"); +static_assert(sizeof(s16) == 2, "sizeof(s16)"); +static_assert(sizeof(s32) == 4, "sizeof(s32)"); +static_assert(sizeof(s64) == 8, "sizeof(s64)"); +static_assert(sizeof(s128) == 16, "sizeof(s128)"); + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long u64; +typedef __uint128_t u128; +static_assert(sizeof(u8) == 1, "sizeof(u8)"); +static_assert(sizeof(u16) == 2, "sizeof(u16)"); +static_assert(sizeof(u32) == 4, "sizeof(u32)"); +static_assert(sizeof(u64) == 8, "sizeof(u64)"); +static_assert(sizeof(u128) == 16, "sizeof(u128)"); + +#if defined(__x86_64__) +typedef __m128i m128; +#if defined(__AVX2__) +typedef __m256i m256; +#endif // __AVX2__ +#if defined(__AVX512F__) +typedef __m512i m512; +#endif // __AVX512F__ +#elif defined(__aarch64__) +typedef uint8x16_t m128; +#else +#error Need x86_64 or AArch64.
+#endif +// }}} types + +// defs {{{ +#define likely(____x____) __builtin_expect(____x____, 1) +#define unlikely(____x____) __builtin_expect(____x____, 0) + +// ansi colors +// 3X:fg; 4X:bg; 9X:light fg; 10X:light bg; +// X can be one of the following colors: +// 0:black; 1:red; 2:green; 3:yellow; +// 4:blue; 5:magenta; 6:cyan; 7:white; +#define TERMCLR(____code____) "\x1b[" #____code____ "m" +// }}} defs + +// const {{{ +#define PGBITS ((12)) +#define PGSZ ((1lu << PGBITS)) +// }}} const + +// math {{{ + extern u64 +mhash64(const u64 v); + + extern u32 +mhash32(const u32 v); + + extern u64 +gcd64(u64 a, u64 b); +// }}} math + +// random {{{ + extern u64 +random_u64(void); + + extern void +srandom_u64(const u64 seed); + + extern double +random_double(void); +// }}} random + +// timing {{{ + extern u64 +time_nsec(void); + + extern double +time_sec(void); + + extern u64 +time_diff_nsec(const u64 last); + + extern double +time_diff_sec(const double last); + + extern void +time_stamp(char * str, const size_t size); + + extern void +time_stamp2(char * str, const size_t size); +// }}} timing + +// cpucache {{{ + extern void +cpu_pause(void); + + extern void +cpu_mfence(void); + + extern void +cpu_cfence(void); + + extern void +cpu_prefetch0(const void * const ptr); + + extern void +cpu_prefetch1(const void * const ptr); + + extern void +cpu_prefetch2(const void * const ptr); + + extern void +cpu_prefetch3(const void * const ptr); + + extern void +cpu_prefetchw(const void * const ptr); +// }}} cpucache + +// crc32c {{{ + extern u32 +crc32c_u8(const u32 crc, const u8 v); + + extern u32 +crc32c_u16(const u32 crc, const u16 v); + + extern u32 +crc32c_u32(const u32 crc, const u32 v); + + extern u32 +crc32c_u64(const u32 crc, const u64 v); + +// 1 <= nr <= 3 + extern u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc); + +// nr % 4 == 0 + extern u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc); + + extern u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc); +// }}} crc32c + 
+// debug {{{ + extern void +debug_break(void); + + extern void +debug_backtrace(void); + + extern void +watch_u64_usr1(u64 * const ptr); + +#ifndef NDEBUG + extern void +debug_assert(const bool v); +#else +#define debug_assert(expr) ((void)0) +#endif + +__attribute__((noreturn)) + extern void +debug_die(void); + +__attribute__((noreturn)) + extern void +debug_die_perror(void); + + extern void +debug_dump_maps(FILE * const out); + + extern bool +debug_perf_switch(void); +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + extern bool +alloc_fail(void); +#endif + + extern void * +xalloc(const size_t align, const size_t size); + + extern void * +yalloc(const size_t size); + + extern void ** +malloc_2d(const size_t nr, const size_t size); + + extern void ** +calloc_2d(const size_t nr, const size_t size); + + extern void +pages_unmap(void * const ptr, const size_t size); + + extern void +pages_lock(void * const ptr, const size_t size); + +/* hugepages */ +// force posix allocators: -DVALGRIND_MEMCHECK + extern void * +pages_alloc_4kb(const size_t nr_4kb); + + extern void * +pages_alloc_2mb(const size_t nr_2mb); + + extern void * +pages_alloc_1gb(const size_t nr_1gb); + + extern void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out); +// }}} mm + +// process/thread {{{ + extern void +thread_get_name(const pthread_t pt, char * const name, const size_t len); + + extern void +thread_set_name(const pthread_t pt, const char * const name); + + extern long +process_get_rss(void); + + extern u32 +process_affinity_count(void); + + extern u32 +process_getaffinity_list(const u32 max, u32 * const cores); + + extern void +thread_setaffinity_list(const u32 nr, const u32 * const list); + + extern void +thread_pin(const u32 cpu); + + extern u64 +process_cpu_time_usec(void); + +// if args == true, argx is void ** +// if args == false, argx is void * + extern u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx); + + extern int 
+thread_create_at(const u32 cpu, pthread_t * const thread, void *(*start_routine) (void *), void * const arg); +// }}} process/thread + +// locking {{{ +typedef union { + u32 opaque; +} spinlock; + + extern void +spinlock_init(spinlock * const lock); + + extern void +spinlock_lock(spinlock * const lock); + + extern bool +spinlock_trylock(spinlock * const lock); + + extern void +spinlock_unlock(spinlock * const lock); + +typedef union { + u32 opaque; +} rwlock; + + extern void +rwlock_init(rwlock * const lock); + + extern bool +rwlock_trylock_read(rwlock * const lock); + +// low-priority reader-lock; use with trylock_write_hp + extern bool +rwlock_trylock_read_lp(rwlock * const lock); + + extern bool +rwlock_trylock_read_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_read(rwlock * const lock); + + extern void +rwlock_unlock_read(rwlock * const lock); + + extern bool +rwlock_trylock_write(rwlock * const lock); + + extern bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write(rwlock * const lock); + +// writer has higher priority; new readers are blocked + extern bool +rwlock_trylock_write_hp(rwlock * const lock); + + extern bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write_hp(rwlock * const lock); + + extern void +rwlock_unlock_write(rwlock * const lock); + + extern void +rwlock_write_to_read(rwlock * const lock); + +typedef union { + u64 opqaue[8]; +} mutex; + + extern void +mutex_init(mutex * const lock); + + extern void +mutex_lock(mutex * const lock); + + extern bool +mutex_trylock(mutex * const lock); + + extern void +mutex_unlock(mutex * const lock); + + extern void +mutex_deinit(mutex * const lock); +// }}} locking + +// coroutine {{{ +extern u64 co_switch_stack(u64 * const saversp, const u64 newrsp, const u64 retval); + +struct co; + + extern struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern void 
+co_reuse(struct co * const co, void * func, void * priv, u64 * const host); + + extern struct co * +co_fork(void * func, void * priv); + + extern void * +co_priv(void); + + extern u64 +co_enter(struct co * const to, const u64 retval); + + extern u64 +co_switch_to(struct co * const to, const u64 retval); + + extern u64 +co_back(const u64 retval); + + extern void +co_exit(const u64 retval); + + extern bool +co_valid(struct co * const co); + + extern struct co * +co_self(void); + + extern void +co_destroy(struct co * const co); + +struct corr; + + extern struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev); + + extern void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host); + + extern void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev); + + extern void +corr_enter(struct corr * const co); + + extern void +corr_yield(void); + + extern void +corr_exit(void); + + extern void +corr_destroy(struct corr * const co); +// }}} coroutine + +// bits {{{ + extern u32 +bits_reverse_u32(const u32 v); + + extern u64 +bits_reverse_u64(const u64 v); + + extern u64 +bits_rotl_u64(const u64 v, const u8 n); + + extern u64 +bits_rotr_u64(const u64 v, const u8 n); + + extern u32 +bits_rotl_u32(const u32 v, const u8 n); + + extern u32 +bits_rotr_u32(const u32 v, const u8 n); + + extern u64 +bits_p2_up_u64(const u64 v); + + extern u32 +bits_p2_up_u32(const u32 v); + + extern u64 +bits_p2_down_u64(const u64 v); + + extern u32 +bits_p2_down_u32(const u32 v); + + extern u64 +bits_round_up(const u64 v, const u8 power); + + extern u64 +bits_round_up_a(const u64 v, const u64 a); + + extern u64 +bits_round_down(const u64 v, const u8 power); + + extern u64 +bits_round_down_a(const u64 v, const u64 a); +// }}} bits + +// vi128 {{{ + extern u32 +vi128_estimate_u32(const u32 v); + + 
extern u8 * +vi128_encode_u32(u8 * dst, u32 v); + + extern const u8 * +vi128_decode_u32(const u8 * src, u32 * const out); + + extern u32 +vi128_estimate_u64(const u64 v); + + extern u8 * +vi128_encode_u64(u8 * dst, u64 v); + + extern const u8 * +vi128_decode_u64(const u8 * src, u64 * const out); +// }}} vi128 + +// misc {{{ +// TODO: only works on little endian? +struct entry13 { // what a beautiful name + union { + u16 e1; + struct { // easy for debugging + u64 e1_64:16; + u64 e3:48; + }; + u64 v64; + void * ptr; + }; +}; + +static_assert(sizeof(struct entry13) == 8, "sizeof(entry13) != 8"); + +// directly access read .e1 and .e3 +// directly write .e1 +// use entry13_update() to update the entire entry + + extern struct entry13 +entry13(const u16 e1, const u64 e3); + + extern void +entry13_update_e3(struct entry13 * const e, const u64 e3); + + extern void * +u64_to_ptr(const u64 v); + + extern u64 +ptr_to_u64(const void * const ptr); + + extern size_t +m_usable_size(void * const ptr); + + extern size_t +fdsize(const int fd); + + extern u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max); + +__attribute__ ((format (printf, 2, 3))) + extern void +logger_printf(const int fd, const char * const fmt, ...); +// }}} misc + +// slab {{{ +struct slab; + + extern struct slab * +slab_create(const u64 obj_size, const u64 blk_size); + + extern bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr); + + extern void * +slab_alloc_unsafe(struct slab * const slab); + + extern void * +slab_alloc_safe(struct slab * const slab); + + extern void +slab_free_unsafe(struct slab * const slab, void * const ptr); + + extern void +slab_free_safe(struct slab * const slab, void * const ptr); + + extern void +slab_free_all(struct slab * const slab); + + extern u64 +slab_get_nalloc(struct slab * const slab); + + extern void +slab_destroy(struct slab * const slab); +// }}} slab + +// string {{{ +// XXX strdec_ and strhex_ functions does not append the trailing '\0' to 
the output string +// size of out should be >= 10 + extern void +strdec_32(void * const out, const u32 v); + +// size of out should be >= 20 + extern void +strdec_64(void * const out, const u64 v); + +// size of out should be >= 8 + extern void +strhex_32(void * const out, const u32 v); + +// size of out should be >= 16 + extern void +strhex_64(void * const out, const u64 v); + + extern u64 +a2u64(const void * const str); + + extern u32 +a2u32(const void * const str); + + extern s64 +a2s64(const void * const str); + + extern s32 +a2s32(const void * const str); + + extern void +str_print_hex(FILE * const out, const void * const data, const u32 len); + + extern void +str_print_dec(FILE * const out, const void * const data, const u32 len); + +// user should free returned ptr (and nothing else) after use + extern char ** +strtoks(const char * const str, const char * const delim); + + extern u32 +strtoks_count(const char * const * const toks); +// }}} string + +// qsbr {{{ +// QSBR vs EBR (Quiescent-State vs Epoch Based Reclaimation) +// QSBR: readers just use qsbr_update -> qsbr_update -> ... repeatedly +// EBR: readers use qsbr_update -> qsbr_park -> qsbr_resume -> qsbr_update -> ... 
+// The advantage of EBR is qsbr_park can happen much earlier than the next qsbr_update +// The disadvantage is the extra cost, a pair of park/resume is used in every iteration +struct qsbr; +struct qsbr_ref { +#ifdef QSBR_DEBUG + u64 debug[16]; +#endif + u64 opaque[3]; +}; + + extern struct qsbr * +qsbr_create(void); + +// every READER accessing the shared data must first register itself with the qsbr + extern bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref); + + extern void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref); + +// For READER: mark the beginning of critical section; like rcu_read_lock() + extern void +qsbr_update(struct qsbr_ref * const qref, const u64 v); + +// temporarily stop access the shared data to avoid blocking writers +// READER can use qsbr_park (like rcu_read_unlock()) in conjunction with qsbr_update +// qsbr_park is roughly equivalent to qsbr_unregister, but faster + extern void +qsbr_park(struct qsbr_ref * const qref); + +// undo the effect of qsbr_park; must use it between qsbr_park and qsbr_update +// qsbr_resume is roughly equivalent to qsbr_register, but faster + extern void +qsbr_resume(struct qsbr_ref * const qref); + +// WRITER: wait until all the readers have announced v=target with qsbr_update + extern void +qsbr_wait(struct qsbr * const q, const u64 target); + + extern void +qsbr_destroy(struct qsbr * const q); +// }}} qsbr + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/libwh.so b/run/MassTrie-beta/wormhole/libwh.so new file mode 100644 index 00000000..2ecd7e7e Binary files /dev/null and b/run/MassTrie-beta/wormhole/libwh.so differ diff --git a/run/MassTrie-beta/wormhole/stresstest.c b/run/MassTrie-beta/wormhole/stresstest.c new file mode 100644 index 00000000..93fb6f05 --- /dev/null +++ b/run/MassTrie-beta/wormhole/stresstest.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016-2020 Wu, Xingbo + * + * All rights reserved. 
No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +#include "lib.h" +#include "kv.h" +#include "wh.h" +#include "ctypes.h" + +struct stress_info { + u64 nkeys; + u32 nloader; + u32 nunldr; + u32 nth; + u32 cpt; + bool has_iter; + + au64 seqno; + struct kv ** keys; + + const struct kvmap_api * api; + void * map; + au64 tot; + au64 wfail; + u64 endtime; +}; + + static void * +stress_load_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + void * const ref = kvmap_ref(si->api, si->map); + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nloader * seq; + const u64 nz = (seq == (si->nloader - 1)) ? si->nkeys : (si->nkeys / si->nloader * (seq + 1)); + //printf("load worker %lu %lu\n", n0, nz-1); + + char * buf = malloc(128); + debug_assert(buf); + u64 * buf64 = (typeof(buf64))buf; + for (u64 i = n0; i < nz; i++) { + const u32 klen = (u32)(random_u64() & 0x3flu) + 8; + const u32 klen8 = (klen + 7) >> 3; + /* + buf64[0] = bswap_64(i); // little endian + for (u64 j = 1; j < klen8; j++) + buf64[j] = random_u64(); + */ + const u64 rkey = random_u64(); + for (u32 j = 0; j < klen8; j++) + buf64[j] = (rkey >> j) & 0x0101010101010101lu; + + si->keys[i] = kv_create(buf, klen, buf, 8); + if (si->keys[i] == NULL) + exit(0); + kvmap_kv_put(si->api, ref, si->keys[i]); + } + free(buf); + kvmap_unref(si->api, ref); + return NULL; +} + + static void * +stress_unload_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nunldr * seq; + const u64 nz = (seq == (si->nunldr - 1)) ? 
si->nkeys : (si->nkeys / si->nunldr * (seq + 1)); + + void * const ref = kvmap_ref(si->api, si->map); + for (u64 i = n0; i < nz; i++) { + kvmap_kv_del(si->api, ref, si->keys[i]); + free(si->keys[i]); + } + kvmap_unref(si->api, ref); + return NULL; +} + + static void +stress_inp_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + } +} + + static struct kv * +stress_merge_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + return kv0; + } else { + u64 * ptr = kv_vptr((struct kv *)priv); + *ptr = 0; + return priv; + } +} + + static void +stress_func(struct stress_info * const si) +{ + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + const struct kvmap_api * const api = si->api; + void * ref = kvmap_ref(api, si->map); + struct kv * next = si->keys[random_u64() % si->nkeys]; + u64 rnext = random_u64() % si->nkeys; + struct kv * const tmp = malloc(128); + struct kref tmpkref; + struct kvref tmpkvref; + debug_assert(tmp); + void * iter = NULL; + if (api->iter_park) { + iter = api->iter_create(ref); + api->iter_park(iter); + } + u64 wfail1 = 0; + u64 nops = 0; +#define BATCHSIZE ((4096)) + do { + for (u64 i = 0; i < BATCHSIZE; i++) { + // reading kv keys leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = si->keys[rnext]; + cpu_prefetch0(next); + cpu_prefetch0(((u8 *)next) + 64); + rnext = random_u64() % si->nkeys; + cpu_prefetch0(&(si->keys[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const u64 r = random_u64() % 16; + switch (r) { + case 0: + kvmap_kv_probe(api, ref, key); + break; + case 1: + kvmap_kv_get(api, ref, key, tmp); + break; + case 2: + if (si->has_iter) { + if (api->iter_park == NULL) + iter = api->iter_create(ref); + debug_assert(iter); + 
kvmap_kv_iter_seek(api, iter, key); + api->iter_next(iter, tmp); + api->iter_peek(iter, tmp); + api->iter_skip(iter, 2); + // this is unsafe; only reader's lock is acquired + if (api->iter_inp) + api->iter_inp(iter, stress_inp_plus1, NULL); + // kref + if (api->iter_kref) + api->iter_kref(iter, &tmpkref); + // kvref + if (api->iter_kvref) + api->iter_kvref(iter, &tmpkvref); + // done + if (api->iter_park) + api->iter_park(iter); + else + api->iter_destroy(iter); + } + break; + case 3: + if (api->refpark) { + api->park(ref); + api->resume(ref); + } + break; + case 4: + if (api->iter_park) + api->iter_destroy(iter); + (void)kvmap_unref(api, ref); + ref = kvmap_ref(api, si->map); + if (api->iter_park) + iter = api->iter_create(ref); + break; + case 5: + if (api->merge) { + kv_dup2_key(key, tmp); + tmp->vlen = 8; + kvmap_kv_merge(api, ref, key, stress_merge_plus1, tmp); + } + break; + case 6: + if ((random_u64() & 0x7fffu) == 0x22 && api->delr) + (void)kvmap_kv_delr(api, ref, si->keys[rnext], (rnext + 10) < si->nkeys ? 
si->keys[rnext + 10] : NULL); + else + kvmap_kv_probe(api, ref, key); + break; + case 7: case 8: case 9: + (void)kvmap_kv_del(api, ref, key); + break; + case 10: case 11: + if (api->inpw) + kvmap_kv_inpw(api, ref, key, stress_inp_plus1, NULL); + break; + case 12: case 13: case 14: case 15: + if (!kvmap_kv_put(api, ref, key)) + wfail1++; + break; + default: + break; + } + } + nops += BATCHSIZE; + } while (time_nsec() < si->endtime); + si->wfail += wfail1; + if (api->iter_park) + api->iter_destroy(iter); + kvmap_unref(api, ref); + free(tmp); + si->tot += nops; +} + + static void +stress_co_worker(void) +{ + struct stress_info * const si = (typeof(si))co_priv(); + debug_assert(si); + stress_func(si); +} + + static void * +stress_thread_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + if (si->cpt) { + u64 hostrsp = 0; + struct corr * crs[32]; + do { // to work smoothly with ALLOCFAIL + crs[0] = corr_create(16*PGSZ, stress_co_worker, si, &hostrsp); + } while (crs[0] == NULL); + for (u32 j = 1; j < si->cpt; j++) { + do { // to work smoothly with ALLOCFAIL + crs[j] = corr_link(16*PGSZ, stress_co_worker, si, crs[j-1]); + } while (crs[j] == NULL); + } + + corr_enter(crs[0]); + for (u32 j = 0; j < si->cpt; j++) + corr_destroy(crs[j]); + } else { + stress_func(si); + } + return NULL; +} + + int +main(int argc, char ** argv) +{ + struct stress_info si = {.nkeys = 10000, .nloader = 1, .nunldr = 1, .nth = 1, .cpt = 0}; + argc--; + argv++; + int n = -1; + if ((n = kvmap_api_helper(argc, argv, NULL, &si.api, &si.map)) < 0) { + fprintf(stderr, "usage: api ... 
[<#keys>=10000 [<#load-threads>=1 [<#unload-threads>=1 [<#threads>=1 [<#co-per-thread>=0 (disabled) [=1 [=1]]]]]]]\n"); + kvmap_api_helper_message(); + exit(0); + } + argc -= n; + argv += n; + + const bool has_point = si.api->get && si.api->probe && si.api->del && si.api->put; + if (!has_point) { + fprintf(stderr, "api not supported\n"); + exit(0); + } + if (!si.api->inpw) + fprintf(stderr, "api->inpw function not found: ignored\n"); + if (!si.api->merge) + fprintf(stderr, "api->merge function not found: ignored\n"); + if (!si.api->delr) + fprintf(stderr, "api->delr function not found: ignored\n"); + + si.has_iter = si.api->iter_create && si.api->iter_seek && si.api->iter_peek && + si.api->iter_skip && si.api->iter_next && si.api->iter_destroy; + if (!si.has_iter) + fprintf(stderr, "iter functions not complete: ignored\n"); + + // generate keys + if (argc >= 1) + si.nkeys = a2u64(argv[0]); + si.keys = malloc(sizeof(struct kv *) * si.nkeys); + debug_assert(si.keys); + if (argc >= 2) + si.nloader = a2u32(argv[1]); + if (argc >= 3) + si.nunldr = a2u32(argv[2]); + if (argc >= 4) + si.nth = a2u32(argv[3]); + if (argc >= 5) + si.cpt = a2u32(argv[4]); + if (si.cpt > 32) + si.cpt = 32; +#if !defined(CORR) + if (si.cpt > 1) + fprintf(stderr, TERMCLR(35) "CORR not enabled. Compile with -DCORR to enable it.\n" TERMCLR(0)); +#endif // CORR + const u64 nr = (argc >= 6) ? a2u64(argv[5]) : 1; // default 1 + const u64 ne = (argc >= 7) ? 
a2u64(argv[6]) : 1; // default 1 + printf("stresstest: nkeys %lu ldr %u uldr %u th %u cpt %u r %lu e %lu\n", + si.nkeys, si.nloader, si.nunldr, si.nth, si.cpt, nr, ne); + + for (u64 e = 0; e < ne; e++) { + si.seqno = 0; + const u64 dtl = thread_fork_join(si.nloader, (void *)stress_load_worker, false, &si); + printf("load th %u mops %.2lf\n", si.nloader, ((double)si.nkeys) * 1e3 / ((double)dtl)); + if (si.api->fprint) + si.api->fprint(si.map, stdout); + + debug_perf_switch(); + for (u64 r = 0; r < nr; r++) { + si.tot = 0; + si.wfail = 0; + si.endtime = time_nsec() + 2000000000lu; + const u64 dt = thread_fork_join(si.nth, (void *)stress_thread_worker, false, &si); + const double mops = ((double)si.tot) * 1e3 / ((double)dt); + char ts[64]; + time_stamp(ts, 64); + const long rss = process_get_rss(); + printf("%s e %lu r %lu th %u cpt %u tot %lu mops %.2lf rss %ldkB wfail %lu\n", + ts, e, r, si.nth, si.cpt, si.tot, mops, rss, si.wfail); + debug_perf_switch(); + } + si.seqno = 0; + if (si.nunldr == 0) { // use clean + const u64 t0 = time_nsec(); + si.api->clean(si.map); + const u64 dtu = time_diff_nsec(t0); + for (u64 i = 0; i < si.nkeys; i++) + free(si.keys[i]); + printf("clean mops %.2lf\n", ((double)si.nkeys) *1e3 / ((double)dtu)); + } else { + const u64 dtu = thread_fork_join(si.nunldr, (void *)stress_unload_worker, false, &si); + printf("unload th %u mops %.2lf\n", si.nunldr, ((double)si.nkeys) *1e3 / ((double)dtu)); + } + } + + free(si.keys); + si.api->destroy(si.map); + return 0; +} diff --git a/run/MassTrie-beta/wormhole/stresstest.out b/run/MassTrie-beta/wormhole/stresstest.out new file mode 100644 index 00000000..874d359c Binary files /dev/null and b/run/MassTrie-beta/wormhole/stresstest.out differ diff --git a/run/MassTrie-beta/wormhole/wh.c b/run/MassTrie-beta/wormhole/wh.c new file mode 100644 index 00000000..1d31e231 --- /dev/null +++ b/run/MassTrie-beta/wormhole/wh.c @@ -0,0 +1,3876 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights 
reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include // static_assert +#include "lib.h" +#include "ctypes.h" +#include "kv.h" +#include "wh.h" +// }}} headers + +// def {{{ +#define WH_HMAPINIT_SIZE ((1u << 12)) // 10: 16KB/64KB 12: 64KB/256KB 14: 256KB/1MB +#define WH_SLABMETA_SIZE ((1lu << 21)) // 2MB + +#ifndef HEAPCHECKING +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB is ok +#else +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB for valgrind +#endif + +#define WH_KPN ((128u)) // keys per node; power of 2 +#define WH_HDIV (((1u << 16)) / WH_KPN) +#define WH_MID ((WH_KPN >> 1)) // ideal cut point for split, the closer the better +#define WH_BKT_NR ((8)) +#define WH_KPN2 ((WH_KPN + WH_KPN)) + +#define WH_KPN_MRG (((WH_KPN + WH_MID) >> 1 )) // 3/4 + +// FO is fixed at 256. Don't change it +#define WH_FO ((256u)) // index fan-out +// number of bits in a bitmap +#define WH_BMNR ((WH_FO >> 6)) // number of u64 +// }}} def + +// struct {{{ +struct wormmeta { + struct entry13 k13; // kref+klen + struct entry13 l13; // lmost+bitmin+bitmax + struct entry13 r13; // rmost+hash32_lo + struct entry13 p13; // lpath+hash32_hi + u64 bitmap[0]; // 4 if bitmin != bitmax +}; +static_assert(sizeof(struct wormmeta) == 32, "sizeof(wormmeta) != 32"); + +struct wormkv64 { u64 key; void * ptr; }; // u64 keys (whu64) + +struct wormleaf { + // first line + rwlock leaflock; + spinlock sortlock; // to protect the seemingly "read-only" iter_seek + au64 lv; // version (dont use the first u64) + struct wormleaf * prev; // prev leaf + struct wormleaf * next; // next leaf + struct kv * anchor; + + u32 nr_sorted; + u32 nr_keys; + u64 reserved[2]; + + struct entry13 hs[WH_KPN]; // sorted by hashes + u8 ss[WH_KPN]; // sorted by keys +}; + +struct wormslot { u16 t[WH_BKT_NR]; }; +static_assert(sizeof(struct wormslot) == 16, "sizeof(wormslot) != 16"); + +struct wormmbkt { struct wormmeta * e[WH_BKT_NR]; }; +static_assert(sizeof(struct wormmbkt) == 
64, "sizeof(wormmbkt) != 64"); + +struct wormhmap { + au64 hv; + struct wormslot * wmap; + struct wormmbkt * pmap; + u32 mask; + u32 maxplen; + u64 msize; + + struct slab * slab1; + struct slab * slab2; + struct kv * pbuf; +}; +static_assert(sizeof(struct wormhmap) == 64, "sizeof(wormhmap) != 64"); + +struct wormhole { + // 1 line + union { + au64 hmap_ptr; // safe + struct wormhmap * hmap; // unsafe + }; + u64 padding0[6]; + struct wormleaf * leaf0; // usually not used + // 1 line + struct kvmap_mm mm; + struct qsbr * qsbr; + struct slab * slab_leaf; + struct kv * pbuf; + u32 leaftype; + u32 padding1; + // 2 lines + struct wormhmap hmap2[2]; + // fifth line + rwlock metalock; + u32 padding2[15]; +}; + +struct wormhole_iter { + struct wormref * ref; // safe-iter only + struct wormhole * map; + struct wormleaf * leaf; + u32 is; +}; + +struct wormref { + struct wormhole * map; + struct qsbr_ref qref; +}; +// }}} struct + +// helpers {{{ + +// meta {{{ + static inline struct kv * +wormmeta_keyref_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->k13.e3); +} + + static inline u16 +wormmeta_klen_load(const struct wormmeta * const meta) +{ + return meta->k13.e1; +} + + static inline struct wormleaf * +wormmeta_lmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->l13.e3 & (~0x3flu)); +} + + static inline u32 +wormmeta_bitmin_load(const struct wormmeta * const meta) +{ + return (u32)(meta->l13.v64 & 0x1fflu); +} + + static inline u32 +wormmeta_bitmax_load(const struct wormmeta * const meta) +{ + return (u32)((meta->l13.v64 >> 9) & 0x1fflu); +} + + static inline u32 +wormmeta_hash32_load(const struct wormmeta * const meta) +{ + return ((u32)meta->r13.e1) | (((u32)meta->p13.e1) << 16); +} + + static inline struct wormleaf * +wormmeta_rmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->r13.e3); +} + + static inline struct wormleaf * +wormmeta_lpath_load(const struct wormmeta * const meta) +{ + return 
u64_to_ptr(meta->p13.e3); +} + +// internal + static inline void +wormmeta_lpath_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->p13, ptr_to_u64(leaf)); +} + +// also updates leaf_klen_eq and + static inline void +wormmeta_lmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + const u64 minmax = meta->l13.v64 & 0x3fffflu; + meta->l13.v64 = (((u64)leaf) << 16) | minmax; + + const bool leaf_klen_eq = leaf->anchor->klen == wormmeta_klen_load(meta); + wormmeta_lpath_store(meta, leaf_klen_eq ? leaf : leaf->prev); +} + + static inline void +wormmeta_bitmin_store(struct wormmeta * const meta, const u32 bitmin) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x1fflu)) | bitmin; +} + + static inline void +wormmeta_bitmax_store(struct wormmeta * const meta, const u32 bitmax) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x3fe00lu)) | (bitmax << 9); +} + + static inline void +wormmeta_rmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->r13, ptr_to_u64(leaf)); +} + +// for wormmeta_alloc + static void +wormmeta_init(struct wormmeta * const meta, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + keyref->refcnt++; // shared + + const u32 plen = keyref->klen; + debug_assert(plen <= UINT16_MAX); + meta->k13 = entry13((u16)plen, ptr_to_u64(keyref)); + meta->l13.v64 = (ptr_to_u64(lrmost) << 16) | (bit << 9) | bit; + + const u32 hash32 = keyref->hashlo; + meta->r13 = entry13((u16)hash32, ptr_to_u64(lrmost)); + + const bool leaf_klen_eq = alen == plen; + meta->p13 = entry13((u16)(hash32 >> 16), ptr_to_u64(leaf_klen_eq ? 
lrmost : lrmost->prev)); +} +// }}} meta + +// meta-bitmap {{{ + static inline bool +wormmeta_bm_test(const struct wormmeta * const meta, const u32 id) +{ + debug_assert(id < WH_FO); + const u32 bitmin = wormmeta_bitmin_load(meta); + const u32 bitmax = wormmeta_bitmax_load(meta); + if (bitmin == bitmax) { // half node + return bitmin == id; + } else { // full node + return (bool)((meta->bitmap[id >> 6u] >> (id & 0x3fu)) & 1lu); + } +} + +// meta must be a full node + static void +wormmeta_bm_set(struct wormmeta * const meta, const u32 id) +{ + // need to replace meta + u64 * const ptr = &(meta->bitmap[id >> 6u]); + const u64 bit = 1lu << (id & 0x3fu); + if ((*ptr) & bit) + return; + + (*ptr) |= bit; + + // min + if (id < wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, id); + + // max + const u32 oldmax = wormmeta_bitmax_load(meta); + if (oldmax == WH_FO || id > oldmax) + wormmeta_bitmax_store(meta, id); +} + +// find the lowest bit > id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_gt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ~((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + + while (++ix < WH_BMNR) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + } + + return WH_FO; +} + +// find the highest bit that is lower than the id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_lt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + + while (ix--) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + } + + return WH_FO; +} + +// meta must be a full node + static inline void +wormmeta_bm_clear(struct wormmeta * const meta, const u32 id) +{ + debug_assert(wormmeta_bitmin_load(meta) < 
wormmeta_bitmax_load(meta)); + meta->bitmap[id >> 6u] &= (~(1lu << (id & 0x3fu))); + + // min + if (id == wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, wormmeta_bm_gt(meta, id)); + + // max + if (id == wormmeta_bitmax_load(meta)) + wormmeta_bitmax_store(meta, wormmeta_bm_lt(meta, id)); +} +// }}} meta-bitmap + +// key/prefix {{{ + static inline u16 +wormhole_pkey(const u32 hash32) +{ + const u16 pkey0 = ((u16)hash32) ^ ((u16)(hash32 >> 16)); + return pkey0 ? pkey0 : 1; +} + + static inline u32 +wormhole_bswap(const u32 hashlo) +{ + return __builtin_bswap32(hashlo); +} + + static inline bool +wormhole_key_meta_match(const struct kv * const key, const struct wormmeta * const meta) +{ + return (key->klen == wormmeta_klen_load(meta)) + && (!memcmp(key->kv, wormmeta_keyref_load(meta)->kv, key->klen)); +} + +// called by get_kref_slot + static inline bool +wormhole_kref_meta_match(const struct kref * const kref, + const struct wormmeta * const meta) +{ + return (kref->len == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, wormmeta_keyref_load(meta)->kv, kref->len)); +} + +// called from meta_down ... 
get_kref1_slot +// will access rmost, prefetching is effective here + static inline bool +wormhole_kref1_meta_match(const struct kref * const kref, + const struct wormmeta * const meta, const u8 cid) +{ + const u8 * const keybuf = wormmeta_keyref_load(meta)->kv; + const u32 plen = kref->len; + return ((plen + 1) == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, keybuf, plen)) + && (keybuf[plen] == cid); +} + +// warning: be careful with buffer overflow + static inline void +wormhole_prefix(struct kv * const pfx, const u32 klen) +{ + pfx->klen = klen; + kv_update_hash(pfx); +} + +// for split + static inline void +wormhole_prefix_inc1(struct kv * const pfx) +{ + pfx->hashlo = crc32c_u8(pfx->hashlo, pfx->kv[pfx->klen]); + pfx->klen++; +} + +// meta_lcp only + static inline void +wormhole_kref_inc(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} + +// meta_lcp only + static inline void +wormhole_kref_inc_123(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc_123(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} +// }}} key/prefix + +// alloc {{{ + static inline struct kv * +wormhole_alloc_akey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_akey(struct kv * const akey) +{ + free(akey); +} + + static inline struct kv * +wormhole_alloc_mkey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_mkey(struct kv * const mkey) +{ + free(mkey); +} + + static struct wormleaf * +wormleaf_alloc(struct wormhole * const map, struct wormleaf * const prev, + struct wormleaf * const next, struct kv * const anchor) +{ + struct wormleaf * const leaf = 
slab_alloc_safe(map->slab_leaf); + if (leaf == NULL) + return NULL; + + rwlock_init(&(leaf->leaflock)); + spinlock_init(&(leaf->sortlock)); + + // keep the old version; new version will be assigned by split functions + //leaf->lv = 0; + + leaf->prev = prev; + leaf->next = next; + leaf->anchor = anchor; + + leaf->nr_keys = 0; + leaf->nr_sorted = 0; + + // hs requires zero init. + memset(leaf->hs, 0, sizeof(leaf->hs[0]) * WH_KPN); + return leaf; +} + + static void +wormleaf_free(struct slab * const slab, struct wormleaf * const leaf) +{ + debug_assert(leaf->leaflock.opaque == 0); + wormhole_free_akey(leaf->anchor); + slab_free_safe(slab, leaf); +} + + static struct wormmeta * +wormmeta_alloc(struct wormhmap * const hmap, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + debug_assert(alen <= UINT16_MAX); + debug_assert(lrmost && keyref); + + struct wormmeta * const meta = slab_alloc_unsafe(hmap->slab1); + if (meta == NULL) + return NULL; + + wormmeta_init(meta, lrmost, keyref, alen, bit); + return meta; +} + + static inline bool +wormhole_slab_reserve(struct wormhole * const map, const u32 nr) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return false; +#endif + for (u32 i = 0; i < 2; i++) { + if (!(map->hmap2[i].slab1 && map->hmap2[i].slab2)) + continue; + if (!slab_reserve_unsafe(map->hmap2[i].slab1, nr)) + return false; + if (!slab_reserve_unsafe(map->hmap2[i].slab2, nr)) + return false; + } + return true; +} + + static void +wormmeta_keyref_release(struct wormmeta * const meta) +{ + struct kv * const keyref = wormmeta_keyref_load(meta); + debug_assert(keyref->refcnt); + keyref->refcnt--; + if (keyref->refcnt == 0) + wormhole_free_mkey(keyref); +} + + static void +wormmeta_free(struct wormhmap * const hmap, struct wormmeta * const meta) +{ + wormmeta_keyref_release(meta); + slab_free_unsafe(hmap->slab1, meta); +} +// }}} alloc + +// lock {{{ + static void +wormleaf_lock_write(struct wormleaf * const leaf, struct wormref * 
const ref) +{ + if (!rwlock_trylock_write(&(leaf->leaflock))) { + wormhole_park(ref); + rwlock_lock_write(&(leaf->leaflock)); + wormhole_resume(ref); + } +} + + static void +wormleaf_lock_read(struct wormleaf * const leaf, struct wormref * const ref) +{ + if (!rwlock_trylock_read(&(leaf->leaflock))) { + wormhole_park(ref); + rwlock_lock_read(&(leaf->leaflock)); + wormhole_resume(ref); + } +} + + static void +wormleaf_unlock_write(struct wormleaf * const leaf) +{ + rwlock_unlock_write(&(leaf->leaflock)); +} + + static void +wormleaf_unlock_read(struct wormleaf * const leaf) +{ + rwlock_unlock_read(&(leaf->leaflock)); +} + + static void +wormhmap_lock(struct wormhole * const map, struct wormref * const ref) +{ + if (!rwlock_trylock_write(&(map->metalock))) { + wormhole_park(ref); + rwlock_lock_write(&(map->metalock)); + wormhole_resume(ref); + } +} + + static inline void +wormhmap_unlock(struct wormhole * const map) +{ + rwlock_unlock_write(&(map->metalock)); +} +// }}} lock + +// hmap-version {{{ + static inline struct wormhmap * +wormhmap_switch(struct wormhole * const map, struct wormhmap * const hmap) +{ + return (hmap == map->hmap2) ? 
(hmap + 1) : (hmap - 1); +} + + static inline struct wormhmap * +wormhmap_load(struct wormhole * const map) +{ + return (struct wormhmap *)atomic_load_explicit(&(map->hmap_ptr), MO_ACQUIRE); +} + + static inline void +wormhmap_store(struct wormhole * const map, struct wormhmap * const hmap) +{ + atomic_store_explicit(&(map->hmap_ptr), (u64)hmap, MO_RELEASE); +} + + static inline u64 +wormhmap_version_load(const struct wormhmap * const hmap) +{ + // no concurrent access + return atomic_load_explicit(&(hmap->hv), MO_ACQUIRE); +} + + static inline void +wormhmap_version_store(struct wormhmap * const hmap, const u64 v) +{ + atomic_store_explicit(&(hmap->hv), v, MO_RELEASE); +} + + static inline u64 +wormleaf_version_load(struct wormleaf * const leaf) +{ + return atomic_load_explicit(&(leaf->lv), MO_CONSUME); +} + + static inline void +wormleaf_version_store(struct wormleaf * const leaf, const u64 v) +{ + atomic_store_explicit(&(leaf->lv), v, MO_RELEASE); +} +// }}} hmap-version + +// co {{{ + static inline void +wormhmap_prefetch_pmap(const struct wormhmap * const hmap, const u32 idx) +{ +#if defined(CORR) + (void)hmap; + (void)idx; +#else + cpu_prefetch0(&(hmap->pmap[idx])); +#endif +} + + static inline struct wormmeta * +wormhmap_get_meta(const struct wormhmap * const hmap, const u32 mid, const u32 i) +{ + struct wormmeta * const meta = hmap->pmap[mid].e[i]; +#if defined(CORR) + cpu_prefetch0(meta); + corr_yield(); +#endif + return meta; +} + + static inline void +wormleaf_prefetch(struct wormleaf * const leaf, const u32 hashlo) +{ + const u32 i = wormhole_pkey(hashlo) / WH_HDIV; +#if defined(CORR) + cpu_prefetch0(leaf); + cpu_prefetch0(&(leaf->hs[i-4])); + cpu_prefetch0(&(leaf->hs[i+4])); + corr_yield(); +#else + cpu_prefetch0(&(leaf->hs[i])); +#endif +} + + static inline bool +wormhole_kref_kv_match(const struct kref * const key, const struct kv * const curr) +{ +#if defined(CORR) + const u8 * const ptr = (typeof(ptr))curr; + cpu_prefetch0(ptr); + cpu_prefetch0(ptr 
+ 64); + if (key->len > 56) { + cpu_prefetch0(ptr + 128); + cpu_prefetch0(ptr + 192); + } + corr_yield(); +#endif + return kref_kv_match(key, curr); +} + + static inline void +wormhole_qsbr_update_pause(struct wormref * const ref, const u64 v) +{ + qsbr_update(&ref->qref, v); +#if defined(CORR) + corr_yield(); +#endif +} +// }}} co + +// }}} helpers + +// hmap {{{ +// hmap is the MetaTrieHT of Wormhole + static bool +wormhmap_init(struct wormhmap * const hmap, struct kv * const pbuf) +{ + const u64 wsize = sizeof(hmap->wmap[0]) * WH_HMAPINIT_SIZE; + const u64 psize = sizeof(hmap->pmap[0]) * WH_HMAPINIT_SIZE; + u64 msize = wsize + psize; + u8 * const mem = pages_alloc_best(msize, true, &msize); + if (mem == NULL) + return false; + + hmap->pmap = (typeof(hmap->pmap))mem; + hmap->wmap = (typeof(hmap->wmap))(mem + psize); + hmap->msize = msize; + hmap->mask = WH_HMAPINIT_SIZE - 1; + wormhmap_version_store(hmap, 0); + hmap->maxplen = 0; + hmap->pbuf = pbuf; + return true; +} + + static inline void +wormhmap_deinit(struct wormhmap * const hmap) +{ + if (hmap->pmap) { + pages_unmap(hmap->pmap, hmap->msize); + hmap->pmap = NULL; + hmap->wmap = NULL; + } +} + + static inline m128 +wormhmap_zero(void) +{ +#if defined(__x86_64__) + return _mm_setzero_si128(); +#elif defined(__aarch64__) + return vdupq_n_u8(0); +#endif +} + + static inline m128 +wormhmap_m128_pkey(const u16 pkey) +{ +#if defined(__x86_64__) + return _mm_set1_epi16((short)pkey); +#elif defined(__aarch64__) + return vreinterpretq_u8_u16(vdupq_n_u16(pkey)); +#endif +} + + static inline u32 +wormhmap_match_mask(const struct wormslot * const s, const m128 skey) +{ +#if defined(__x86_64__) + const m128 sv = _mm_load_si128((const void *)s); + return (u32)_mm_movemask_epi8(_mm_cmpeq_epi16(skey, sv)); +#elif defined(__aarch64__) + const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s + const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000 + static const 
uint16x8_t mbits = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000}; + return (u32)vaddvq_u16(vandq_u16(cmp, mbits)); +#endif +} + + static inline bool +wormhmap_match_any(const struct wormslot * const s, const m128 skey) +{ +#if defined(__x86_64__) + return wormhmap_match_mask(s, skey) != 0; +#elif defined(__aarch64__) + const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s + const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000 + return vaddvq_u32(vreinterpretq_u32_u16(cmp)) != 0; +#endif +} + +// meta_lcp only + static inline bool +wormhmap_peek(const struct wormhmap * const hmap, const u32 hash32) +{ + const m128 sk = wormhmap_m128_pkey(wormhole_pkey(hash32)); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + return wormhmap_match_any(&(hmap->wmap[midx]), sk) + || wormhmap_match_any(&(hmap->wmap[midy]), sk); +} + + static inline struct wormmeta * +wormhmap_get_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kv * const key) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + if (likely(wormhole_key_meta_match(key, meta))) + return meta; + mask ^= (3u << i2); + } + return NULL; +} + + static struct wormmeta * +wormhmap_get(const struct wormhmap * const hmap, const struct kv * const key) +{ + const u32 hash32 = key->hashlo; + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_slot(hmap, midx, skey, key); + if (r) + return r; + return wormhmap_get_slot(hmap, midy, skey, key); +} + +// for meta_lcp only + static inline struct wormmeta * 
+wormhmap_get_kref_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kref * const kref) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + if (likely(wormhole_kref_meta_match(kref, meta))) + return meta; + + mask ^= (3u << i2); + } + return NULL; +} + +// for meta_lcp only + static inline struct wormmeta * +wormhmap_get_kref(const struct wormhmap * const hmap, const struct kref * const kref) +{ + const u32 hash32 = kref->hash32; + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_kref_slot(hmap, midx, skey, kref); + if (r) + return r; + return wormhmap_get_kref_slot(hmap, midy, skey, kref); +} + +// for meta_down only + static inline struct wormmeta * +wormhmap_get_kref1_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kref * const kref, const u8 cid) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + //cpu_prefetch0(wormmeta_rmost_load(meta)); // will access + if (likely(wormhole_kref1_meta_match(kref, meta, cid))) + return meta; + + mask ^= (3u << i2); + } + return NULL; +} + +// for meta_down only + static inline struct wormmeta * +wormhmap_get_kref1(const struct wormhmap * const hmap, + const struct kref * const kref, const u8 cid) +{ + const u32 hash32 = crc32c_u8(kref->hash32, cid); + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = 
wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_kref1_slot(hmap, midx, skey, kref, cid); + if (r) + return r; + return wormhmap_get_kref1_slot(hmap, midy, skey, kref, cid); +} + + static inline u32 +wormhmap_slot_count(const struct wormslot * const slot) +{ + const u32 mask = wormhmap_match_mask(slot, wormhmap_zero()); + return mask ? ((u32)__builtin_ctz(mask) >> 1) : 8; +} + + static inline void +wormhmap_squeeze(const struct wormhmap * const hmap) +{ + struct wormslot * const wmap = hmap->wmap; + struct wormmbkt * const pmap = hmap->pmap; + const u32 mask = hmap->mask; + const u64 nrs64 = ((u64)(hmap->mask)) + 1; // must use u64; u32 can overflow + for (u64 si64 = 0; si64 < nrs64; si64++) { // # of buckets + const u32 si = (u32)si64; + u32 ci = wormhmap_slot_count(&(wmap[si])); + for (u32 ei = ci - 1; ei < WH_BKT_NR; ei--) { + struct wormmeta * const meta = pmap[si].e[ei]; + const u32 sj = wormmeta_hash32_load(meta) & mask; // first hash + if (sj == si) + continue; + + // move + const u32 ej = wormhmap_slot_count(&(wmap[sj])); + if (ej < WH_BKT_NR) { // has space at home location + wmap[sj].t[ej] = wmap[si].t[ei]; + pmap[sj].e[ej] = pmap[si].e[ei]; + const u32 ni = ci - 1; + if (ei < ni) { + wmap[si].t[ei] = wmap[si].t[ni]; + pmap[si].e[ei] = pmap[si].e[ni]; + } + wmap[si].t[ni] = 0; + pmap[si].e[ni] = NULL; + ci--; + } + } + } +} + + static void +wormhmap_expand(struct wormhmap * const hmap) +{ + // sync expand + const u32 mask0 = hmap->mask; + if (mask0 == UINT32_MAX) + debug_die(); + const u32 nr0 = mask0 + 1; + const u32 mask1 = mask0 + nr0; + const u64 nr1 = ((u64)nr0) << 1; // must use u64; u32 can overflow + const u64 wsize = nr1 * sizeof(hmap->wmap[0]); + const u64 psize = nr1 * sizeof(hmap->pmap[0]); + u64 msize = wsize + psize; + u8 * mem = pages_alloc_best(msize, true, &msize); + if (mem == NULL) { + // We are at a very deep call stack from wormhole_put(). 
+ // Gracefully handling the failure requires lots of changes. + // Currently we simply wait for available memory + // TODO: gracefully return with insertion failure + char ts[64]; + time_stamp(ts, 64); + fprintf(stderr, "%s %s sleep-wait for memory allocation %lukB\n", + __func__, ts, msize >> 10); + do { + sleep(1); + mem = pages_alloc_best(msize, true, &msize); + } while (mem == NULL); + time_stamp(ts, 64); + fprintf(stderr, "%s %s memory allocation done\n", __func__, ts); + } + + struct wormhmap hmap1 = *hmap; + hmap1.pmap = (typeof(hmap1.pmap))mem; + hmap1.wmap = (typeof(hmap1.wmap))(mem + psize); + hmap1.msize = msize; + hmap1.mask = mask1; + + const struct wormslot * const wmap0 = hmap->wmap; + const struct wormmbkt * const pmap0 = hmap->pmap; + + for (u32 s = 0; s < nr0; s++) { + const struct wormmbkt * const bkt = &pmap0[s]; + for (u32 i = 0; (i < WH_BKT_NR) && bkt->e[i]; i++) { + const struct wormmeta * const meta = bkt->e[i]; + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 idx0 = hash32 & mask0; + const u32 idx1 = ((idx0 == s) ? 
hash32 : wormhole_bswap(hash32)) & mask1; + + const u32 n = wormhmap_slot_count(&(hmap1.wmap[idx1])); + debug_assert(n < 8); + hmap1.wmap[idx1].t[n] = wmap0[s].t[i]; + hmap1.pmap[idx1].e[n] = bkt->e[i]; + } + } + pages_unmap(hmap->pmap, hmap->msize); + hmap->pmap = hmap1.pmap; + hmap->wmap = hmap1.wmap; + hmap->msize = hmap1.msize; + hmap->mask = hmap1.mask; + wormhmap_squeeze(hmap); +} + + static bool +wormhmap_cuckoo(struct wormhmap * const hmap, const u32 mid0, + struct wormmeta * const e0, const u16 s0, const u32 depth) +{ + const u32 ii = wormhmap_slot_count(&(hmap->wmap[mid0])); + if (ii < WH_BKT_NR) { + hmap->wmap[mid0].t[ii] = s0; + hmap->pmap[mid0].e[ii] = e0; + return true; + } else if (depth == 0) { + return false; + } + + // depth > 0 + struct wormmbkt * const bkt = &(hmap->pmap[mid0]); + u16 * const sv = &(hmap->wmap[mid0].t[0]); + for (u32 i = 0; i < WH_BKT_NR; i++) { + const struct wormmeta * const meta = bkt->e[i]; + debug_assert(meta); + const u32 hash32 = wormmeta_hash32_load(meta); + + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const u32 midt = (midx != mid0) ? midx : midy; + if (midt != mid0) { // possible + // no penalty if moving someone back to its 1st hash location + const u32 depth1 = (midt == midx) ? 
depth : (depth - 1); + if (wormhmap_cuckoo(hmap, midt, bkt->e[i], sv[i], depth1)) { + bkt->e[i] = e0; + sv[i] = s0; + return true; + } + } + } + return false; +} + + static void +wormhmap_set(struct wormhmap * const hmap, struct wormmeta * const meta) +{ + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const u16 pkey = wormhole_pkey(hash32); + // insert with cuckoo + if (likely(wormhmap_cuckoo(hmap, midx, meta, pkey, 1))) + return; + if (wormhmap_cuckoo(hmap, midy, meta, pkey, 1)) + return; + if (wormhmap_cuckoo(hmap, midx, meta, pkey, 2)) + return; + + // expand + wormhmap_expand(hmap); + + wormhmap_set(hmap, meta); +} + + static bool +wormhmap_del_slot(struct wormhmap * const hmap, const u32 mid, + const struct wormmeta * const meta, const m128 skey) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + const struct wormmeta * const meta1 = hmap->pmap[mid].e[i2>>1]; + if (likely(meta == meta1)) { + const u32 i = i2 >> 1; + const u32 j = wormhmap_slot_count(&(hmap->wmap[mid])) - 1; + hmap->wmap[mid].t[i] = hmap->wmap[mid].t[j]; + hmap->pmap[mid].e[i] = hmap->pmap[mid].e[j]; + hmap->wmap[mid].t[j] = 0; + hmap->pmap[mid].e[j] = NULL; + return true; + } + mask -= (3u << i2); + } + return false; +} + + static bool +wormhmap_del(struct wormhmap * const hmap, const struct wormmeta * const meta) +{ + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + return wormhmap_del_slot(hmap, midx, meta, skey) + || wormhmap_del_slot(hmap, midy, meta, skey); +} + + static bool +wormhmap_replace_slot(struct wormhmap * const hmap, const u32 mid, + const struct wormmeta * const old, const 
m128 skey, struct wormmeta * const new) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta ** const pslot = &hmap->pmap[mid].e[i2>>1]; + if (likely(old == *pslot)) { + *pslot = new; + return true; + } + mask -= (3u << i2); + } + return false; +} + + static bool +wormhmap_replace(struct wormhmap * const hmap, const struct wormmeta * const old, struct wormmeta * const new) +{ + const u32 hash32 = wormmeta_hash32_load(old); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + return wormhmap_replace_slot(hmap, midx, old, skey, new) + || wormhmap_replace_slot(hmap, midy, old, skey, new); +} +// }}} hmap + +// create {{{ +// it's unsafe + static bool +wormhole_create_leaf0(struct wormhole * const map) +{ + const bool sr = wormhole_slab_reserve(map, 1); + if (unlikely(!sr)) + return false; + + // create leaf of empty key + struct kv * const anchor = wormhole_alloc_akey(0); + if (anchor == NULL) + return false; + kv_dup2(kv_null(), anchor); + + struct wormleaf * const leaf0 = wormleaf_alloc(map, NULL, NULL, anchor); + if (leaf0 == NULL) { + wormhole_free_akey(anchor); + return false; + } + + struct kv * const mkey = wormhole_alloc_mkey(0); + if (mkey == NULL) { + wormleaf_free(map->slab_leaf, leaf0); + return false; + } + + wormhole_prefix(mkey, 0); + mkey->refcnt = 0; + // create meta of empty key + for (u32 i = 0; i < 2; i++) { + if (map->hmap2[i].slab1) { + struct wormmeta * const m0 = wormmeta_alloc(&map->hmap2[i], leaf0, mkey, 0, WH_FO); + debug_assert(m0); // already reserved enough + wormhmap_set(&(map->hmap2[i]), m0); + } + } + + map->leaf0 = leaf0; + return true; +} + + static struct wormhole * +wormhole_create_internal(const struct kvmap_mm * const mm, const u32 nh) +{ + struct wormhole * const map = yalloc(sizeof(*map)); + if (map == NULL) + return NULL; + 
memset(map, 0, sizeof(*map)); + // mm + map->mm = mm ? (*mm) : kvmap_mm_dup; + + // pbuf for meta-merge + map->pbuf = yalloc(1lu << 16); // 64kB + if (map->pbuf == NULL) + goto fail; + + // hmap + for (u32 i = 0; i < nh; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (!wormhmap_init(hmap, map->pbuf)) + goto fail; + + hmap->slab1 = slab_create(sizeof(struct wormmeta), WH_SLABMETA_SIZE); + if (hmap->slab1 == NULL) + goto fail; + + hmap->slab2 = slab_create(sizeof(struct wormmeta) + (sizeof(u64) * WH_BMNR), WH_SLABMETA_SIZE); + if (hmap->slab2 == NULL) + goto fail; + } + + // leaf slab + map->slab_leaf = slab_create(sizeof(struct wormleaf), WH_SLABLEAF_SIZE); + if (map->slab_leaf == NULL) + goto fail; + + // qsbr + map->qsbr = qsbr_create(); + if (map->qsbr == NULL) + goto fail; + + // leaf0 + if (!wormhole_create_leaf0(map)) + goto fail; + + rwlock_init(&(map->metalock)); + wormhmap_store(map, &map->hmap2[0]); + return map; + +fail: + if (map->qsbr) + qsbr_destroy(map->qsbr); + + if (map->slab_leaf) + slab_destroy(map->slab_leaf); + + for (u32 i = 0; i < nh; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (hmap->slab1) + slab_destroy(hmap->slab1); + if (hmap->slab2) + slab_destroy(hmap->slab2); + wormhmap_deinit(hmap); + } + + if (map->pbuf) + free(map->pbuf); + + free(map); + return NULL; +} + + struct wormhole * +wormhole_create(const struct kvmap_mm * const mm) +{ + return wormhole_create_internal(mm, 2); +} + + struct wormhole * +whunsafe_create(const struct kvmap_mm * const mm) +{ + return wormhole_create_internal(mm, 1); +} +// }}} create + +// jump {{{ + +// lcp {{{ +// search in the hash table for the Longest Prefix Match of the search key +// The corresponding wormmeta node is returned and the LPM is recorded in kref + static struct wormmeta * +wormhole_meta_lcp(const struct wormhmap * const hmap, struct kref * const kref, const u32 klen) +{ + // invariant: lo <= lcp < (lo + gd) + // ending condition: gd == 1 + u32 gd = 
(hmap->maxplen < klen ? hmap->maxplen : klen) + 1u; + u32 lo = 0; + u32 loh = KV_CRC32C_SEED; + +#define META_LCP_GAP_1 ((7u)) + while (META_LCP_GAP_1 < gd) { + const u32 inc = gd >> 3 << 2; // x4 + const u32 hash32 = crc32c_inc_x4(kref->ptr + lo, inc, loh); + if (wormhmap_peek(hmap, hash32)) { + loh = hash32; + lo += inc; + gd -= inc; + } else { + gd = inc; + } + } + + while (1 < gd) { + const u32 inc = gd >> 1; + const u32 hash32 = crc32c_inc_123(kref->ptr + lo, inc, loh); + if (wormhmap_peek(hmap, hash32)) { + loh = hash32; + lo += inc; + gd -= inc; + } else { + gd = inc; + } + } +#undef META_LCP_GAP_1 + + kref->hash32 = loh; + kref->len = lo; + struct wormmeta * ret = wormhmap_get_kref(hmap, kref); + if (likely(ret != NULL)) + return ret; + + gd = lo; + lo = 0; + loh = KV_CRC32C_SEED; + +#define META_LCP_GAP_2 ((5u)) + while (META_LCP_GAP_2 < gd) { + const u32 inc = (gd * 3) >> 2; + wormhole_kref_inc(kref, lo, loh, inc); + struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref); + if (tmp) { + loh = kref->hash32; + lo += inc; + gd -= inc; + ret = tmp; + if (wormmeta_bm_test(tmp, kref->ptr[lo])) { + loh = crc32c_u8(loh, kref->ptr[lo]); + lo++; + gd--; + ret = NULL; + } else { + gd = 1; + break; + } + } else { + gd = inc; + } + } + + while (1 < gd) { + const u32 inc = (gd * 3) >> 2; + wormhole_kref_inc_123(kref, lo, loh, inc); + struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref); + if (tmp) { + loh = kref->hash32; + lo += inc; + gd -= inc; + ret = tmp; + if (wormmeta_bm_test(tmp, kref->ptr[lo])) { + loh = crc32c_u8(loh, kref->ptr[lo]); + lo++; + gd--; + ret = NULL; + } else { + break; + } + } else { + gd = inc; + } + } +#undef META_LCP_GAP_2 + + if (kref->len != lo) { + kref->hash32 = loh; + kref->len = lo; + } + if (ret == NULL) + ret = wormhmap_get_kref(hmap, kref); + debug_assert(ret); + return ret; +} +// }}} lcp + +// down {{{ + static struct wormleaf * +wormhole_meta_down(const struct wormhmap * const hmap, const struct kref * const lcp, + 
const struct wormmeta * const meta, const u32 klen) +{ + if (likely(lcp->len < klen)) { // partial match + const u32 id0 = lcp->ptr[lcp->len]; + if (wormmeta_bitmin_load(meta) > id0) { // no left, don't care about right. + return wormmeta_lpath_load(meta); + } else if (wormmeta_bitmax_load(meta) < id0) { // has left sibling but no right sibling + return wormmeta_rmost_load(meta); + } else { // has both (expensive) + return wormmeta_rmost_load(wormhmap_get_kref1(hmap, lcp, (u8)wormmeta_bm_lt(meta, id0))); + } + } else { // lcp->len == klen + return wormmeta_lpath_load(meta); + } +} +// }}} down + +// jump-rw {{{ + static struct wormleaf * +wormhole_jump_leaf(const struct wormhmap * const hmap, const struct kref * const key) +{ + struct kref kref = {.ptr = key->ptr}; + debug_assert(kv_crc32c(key->ptr, key->len) == key->hash32); + + const struct wormmeta * const meta = wormhole_meta_lcp(hmap, &kref, key->len); + return wormhole_meta_down(hmap, &kref, meta, key->len); +} + + static struct wormleaf * +wormhole_jump_leaf_read(struct wormref * const ref, const struct kref * const key) +{ + struct wormhole * const map = ref->map; +#pragma nounroll + do { + const struct wormhmap * const hmap = wormhmap_load(map); + const u64 v = wormhmap_version_load(hmap); + qsbr_update(&ref->qref, v); + struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key); + wormleaf_prefetch(leaf, key->hash32); +#pragma nounroll + do { + if (rwlock_trylock_read_nr(&(leaf->leaflock), 64)) { + if (wormleaf_version_load(leaf) <= v) + return leaf; + wormleaf_unlock_read(leaf); + break; + } + // v1 is loaded before lv; if lv <= v, can update v1 without redo jump + const u64 v1 = wormhmap_version_load(wormhmap_load(map)); + if (wormleaf_version_load(leaf) > v) + break; + wormhole_qsbr_update_pause(ref, v1); + } while (true); + } while (true); +} + + static struct wormleaf * +wormhole_jump_leaf_write(struct wormref * const ref, const struct kref * const key) +{ + struct wormhole * const map = ref->map; 
+#pragma nounroll + do { + const struct wormhmap * const hmap = wormhmap_load(map); + const u64 v = wormhmap_version_load(hmap); + qsbr_update(&ref->qref, v); + struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key); + wormleaf_prefetch(leaf, key->hash32); +#pragma nounroll + do { + if (rwlock_trylock_write_nr(&(leaf->leaflock), 64)) { + if (wormleaf_version_load(leaf) <= v) + return leaf; + wormleaf_unlock_write(leaf); + break; + } + // v1 is loaded before lv; if lv <= v, can update v1 without redo jump + const u64 v1 = wormhmap_version_load(wormhmap_load(map)); + if (wormleaf_version_load(leaf) > v) + break; + wormhole_qsbr_update_pause(ref, v1); + } while (true); + } while (true); +} +// }}} jump-rw + +// }}} jump + +// leaf-read {{{ + static inline struct kv * +wormleaf_kv_at_ih(const struct wormleaf * const leaf, const u32 ih) +{ + return u64_to_ptr(leaf->hs[ih].e3); +} + + static inline struct kv * +wormleaf_kv_at_is(const struct wormleaf * const leaf, const u32 is) +{ + return u64_to_ptr(leaf->hs[leaf->ss[is]].e3); +} + + static inline void +wormleaf_prefetch_ss(const struct wormleaf * const leaf) +{ + for (u32 i = 0; i < WH_KPN; i+=64) + cpu_prefetch0(&leaf->ss[i]); +} + +// leaf must have been sorted +// return the key at [i] as if k1 has been inserted into leaf; i <= leaf->nr_sorted + static const struct kv * +wormleaf_kv_at_is1(const struct wormleaf * const leaf, const u32 i, const u32 is1, const struct kv * const k1) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + debug_assert(is1 <= leaf->nr_sorted); + if (i < is1) + return wormleaf_kv_at_is(leaf, i); + else if (i > is1) + return wormleaf_kv_at_is(leaf, i-1); + else // i == is1 + return k1; +} + + + +// fast point-lookup +// returns WH_KPN if not found + static u32 +wormleaf_match_hs(const struct wormleaf * const leaf, const struct kref * const key) +{ + const u16 pkey = wormhole_pkey(key->hash32); + const u32 i0 = pkey / WH_HDIV; + const struct entry13 * const hs = leaf->hs; + + if 
(hs[i0].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i0].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i0; + } + if (hs[i0].e1 == 0) + return WH_KPN; + + // search left + u32 i = i0 - 1; + while (i < WH_KPN) { + if (hs[i].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i; + } else if (hs[i].e1 < pkey) { + break; + } + i--; + } + + // search right + i = i0 + 1; + while (i < WH_KPN) { + if (hs[i].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i; + } else if ((hs[i].e1 > pkey) || (hs[i].e1 == 0)) { + break; + } + i++; + } + + + // not found + return WH_KPN; +} + +// search for an existing entry in hs + static u32 +wormleaf_search_ih(const struct wormleaf * const leaf, const struct entry13 e) +{ + const u16 pkey = e.e1; + const u32 i0 = pkey / WH_HDIV; + const struct entry13 * const hs = leaf->hs; + const struct entry13 e0 = hs[i0]; + + if (e0.v64 == e.v64) + return i0; + + if (e0.e1 == 0) + return WH_KPN; + + // search left + u32 i = i0 - 1; + while (i < WH_KPN) { + const struct entry13 ei = hs[i]; + if (ei.v64 == e.v64) { + return i; + } else if (ei.e1 < pkey) { + break; + } + i--; + } + + // search right + i = i0 + 1; + while (i < WH_KPN) { + const struct entry13 ei = hs[i]; + if (ei.v64 == e.v64) { + return i; + } else if ((ei.e1 > pkey) || (ei.e1 == 0)) { + break; + } + i++; + } + + // not found + return WH_KPN; +} + +// search for an existing entry in ss + static u32 +wormleaf_search_is(const struct wormleaf * const leaf, const u8 ih) +{ +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 i1 = _mm256_set1_epi8((char)ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const u32 mask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(sv, i1)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } 
+#else // SSE4.2 + const m128 i1 = _mm_set1_epi8((char)ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const u32 mask = (u32)_mm_movemask_epi8(_mm_cmpeq_epi8(sv, i1)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } +#endif // __AVX2__ +#elif defined(__aarch64__) + static const m128 vtbl = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; + static const uint16x8_t mbits = {0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; + const m128 i1 = vdupq_n_u8(ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 cmp = vceqq_u8(vld1q_u8(leaf->ss+i), i1); // cmpeq => 0xff or 0x00 + const m128 cmp1 = vqtbl1q_u8(cmp, vtbl); // reorder + const u32 mask = (u32)vaddvq_u16(vandq_u8(vreinterpretq_u16_u8(cmp1), mbits)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } +#endif // __x86_64__ + debug_die(); +} + +// assumes there in no duplicated keys +// search the first key that is >= the given key +// return 0 .. 
nr_sorted + static u32 +wormleaf_search_ss(const struct wormleaf * const leaf, const struct kref * const key) +{ + u32 lo = 0; + u32 hi = leaf->nr_sorted; + while ((lo + 2) < hi) { + const u32 i = (lo + hi) >> 1; + const struct kv * const curr = wormleaf_kv_at_is(leaf, i); + cpu_prefetch0(curr); + cpu_prefetch0(leaf->hs + leaf->ss[(lo + i) >> 1]); + cpu_prefetch0(leaf->hs + leaf->ss[(i + 1 + hi) >> 1]); + const int cmp = kref_kv_compare(key, curr); + debug_assert(cmp != 0); + if (cmp < 0) + hi = i; + else + lo = i + 1; + } + + while (lo < hi) { + const u32 i = (lo + hi) >> 1; + const struct kv * const curr = wormleaf_kv_at_is(leaf, i); + const int cmp = kref_kv_compare(key, curr); + debug_assert(cmp != 0); + if (cmp < 0) + hi = i; + else + lo = i + 1; + } + return lo; +} + + static u32 +wormleaf_seek(const struct wormleaf * const leaf, const struct kref * const key) +{ + debug_assert(leaf->nr_sorted == leaf->nr_keys); + wormleaf_prefetch_ss(leaf); // effective for both hit and miss + const u32 ih = wormleaf_match_hs(leaf, key); + if (ih < WH_KPN) { // hit + return wormleaf_search_is(leaf, (u8)ih); + } else { // miss, binary search for gt + return wormleaf_search_ss(leaf, key); + } +} + +// same to search_sorted but the target is very likely beyond the end + static u32 +wormleaf_seek_end(const struct wormleaf * const leaf, const struct kref * const key) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + if (leaf->nr_sorted) { + const int cmp = kref_kv_compare(key, wormleaf_kv_at_is(leaf, leaf->nr_sorted-1)); + if (cmp > 0) + return leaf->nr_sorted; + else if (cmp == 0) + return leaf->nr_sorted - 1; + else + return wormleaf_seek(leaf, key); + } else { + return 0; + } +} +// }}} leaf-read + +// leaf-write {{{ + static void +wormleaf_sort_m2(struct wormleaf * const leaf, const u32 n1, const u32 n2) +{ + if (n1 == 0 || n2 == 0) + return; // no need to sort + + u8 * const ss = leaf->ss; + u8 et[WH_KPN/2]; // min(n1,n2) < KPN/2 + if (n1 <= n2) { // merge left + 
memcpy(et, &(ss[0]), sizeof(ss[0]) * n1); + u8 * eo = ss; + u8 * e1 = et; // size == n1 + u8 * e2 = &(ss[n1]); // size == n2 + const u8 * const z1 = e1 + n1; + const u8 * const z2 = e2 + n2; + while ((e1 < z1) && (e2 < z2)) { + const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2)); + if (cmp < 0) + *(eo++) = *(e1++); + else if (cmp > 0) + *(eo++) = *(e2++); + else + debug_die(); + + if (eo == e2) + break; // finish early + } + if (eo < e2) + memcpy(eo, e1, sizeof(*eo) * (size_t)(e2 - eo)); + } else { + memcpy(et, &(ss[n1]), sizeof(ss[0]) * n2); + u8 * eo = &(ss[n1 + n2 - 1]); // merge backwards + u8 * e1 = &(ss[n1 - 1]); // size == n1 + u8 * e2 = &(et[n2 - 1]); // size == n2 + const u8 * const z1 = e1 - n1; + const u8 * const z2 = e2 - n2; + while ((e1 > z1) && (e2 > z2)) { + const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2)); + if (cmp < 0) + *(eo--) = *(e2--); + else if (cmp > 0) + *(eo--) = *(e1--); + else + debug_die(); + + if (eo == e1) + break; + } + if (eo > e1) + memcpy(e1 + 1, et, sizeof(*eo) * (size_t)(eo - e1)); + } +} + +#if defined(__linux__) + static int +wormleaf_ss_cmp(const void * const p1, const void * const p2, void * priv) +{ + const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1); + const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2); + return kv_compare(k1, k2); +} +#else // (FreeBSD and APPLE only) + static int +wormleaf_ss_cmp(void * priv, const void * const p1, const void * const p2) +{ + const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1); + const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2); + return kv_compare(k1, k2); +} +#endif // __linux__ + + static inline void +wormleaf_sort_range(struct wormleaf * const leaf, const u32 i0, const u32 nr) +{ +#if defined(__linux__) + qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), wormleaf_ss_cmp, leaf); +#else // (FreeBSD and APPLE only) + 
qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), leaf, wormleaf_ss_cmp); +#endif // __linux__ +} + +// make sure all keys are sorted in a leaf node + static void +wormleaf_sync_sorted(struct wormleaf * const leaf) +{ + const u32 s = leaf->nr_sorted; + const u32 n = leaf->nr_keys; + if (s == n) + return; + + wormleaf_sort_range(leaf, s, n - s); + // merge-sort inplace + wormleaf_sort_m2(leaf, s, n - s); + leaf->nr_sorted = n; +} + +// shift a sequence of entries on hs and update the corresponding ss values + static void +wormleaf_shift_inc(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from+1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_add_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_add_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = 
vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vaddq_u8(sv, add1)); + } +#endif // __x86_64__ +} + + static void +wormleaf_shift_dec(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from-1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_sub_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += 16) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_sub_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vsubq_u8(sv, add1)); + } +#endif // __x86_64__ +} + +// insert hs and also shift ss + static u32 +wormleaf_insert_hs(struct wormleaf * const leaf, const struct entry13 e) +{ + struct entry13 * const hs = leaf->hs; + const u16 pkey = e.e1; + const u32 i0 = 
pkey / WH_HDIV; + if (hs[i0].e1 == 0) { // insert + hs[i0] = e; + return i0; + } + + // find left-most insertion point + u32 i = i0; + while (i && hs[i-1].e1 && (hs[i-1].e1 >= pkey)) + i--; + while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 < pkey)) // stop at >= or empty + i++; + const u32 il = --i; // i in [0, KPN] + + // find left empty slot + if (i > (i0 - 1)) + i = i0 - 1; + while ((i < WH_KPN) && hs[i].e1) + i--; + const u32 el = i; // el < i0 or el is invalid (>= KPN) + + // find right-most insertion point. + i = il + 1; + while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 == pkey)) + i++; + const u32 ir = i; // ir >= il, in [0, KPN] + + // find right empty slot + if (i < (i0 + 1)) + i = i0 + 1; + while ((i < WH_KPN) && hs[i].e1) + i++; + const u32 er = i; // er > i0 or el is invalid (>= KPN) + + // el <= il < ir <= er (if < WH_KPN) + const u32 dl = (el < WH_KPN) ? (il - el) : WH_KPN; + const u32 dr = (er < WH_KPN) ? (er - ir) : WH_KPN; + if (dl <= dr) { // push left + debug_assert(dl < WH_KPN); + if (dl) + wormleaf_shift_dec(leaf, el, el+1, dl); + hs[il] = e; + return il; + } else { + debug_assert(dr < WH_KPN); + if (dr) + wormleaf_shift_inc(leaf, ir+1, ir, dr); + hs[ir] = e; + return ir; + } +} + + static void +wormleaf_insert_e13(struct wormleaf * const leaf, const struct entry13 e) +{ + // insert to hs and fix all existing is + const u32 ih = wormleaf_insert_hs(leaf, e); + debug_assert(ih < WH_KPN); + // append the new is + leaf->ss[leaf->nr_keys] = (u8)ih; + // fix nr + leaf->nr_keys++; +} + + static void +wormleaf_insert(struct wormleaf * const leaf, const struct kv * const new) +{ + debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen))); + debug_assert(leaf->nr_keys < WH_KPN); + + // insert + const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + const u32 nr0 = leaf->nr_keys; + wormleaf_insert_e13(leaf, e); + + // optimize for seq insertion + if (nr0 == leaf->nr_sorted) { + if (nr0) { + const struct kv * const kvn 
= wormleaf_kv_at_is(leaf, nr0 - 1); + if (kv_compare(new, kvn) > 0) + leaf->nr_sorted = nr0 + 1; + } else { + leaf->nr_sorted = 1; + } + } +} + + static void +wormleaf_pull_ih(struct wormleaf * const leaf, const u32 ih) +{ + struct entry13 * const hs = leaf->hs; + // try left + u32 i = ih - 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) > i)) + i--; + + if ((++i) < ih) { + wormleaf_shift_inc(leaf, i+1, i, ih - i); + leaf->hs[i].v64 = 0; + return; + } + + // try right + i = ih + 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) < i)) + i++; + + if ((--i) > ih) { + wormleaf_shift_dec(leaf, ih, ih+1, i - ih); + hs[i].v64 = 0; + } + // hs[ih] may still be 0 +} + +// internal only + static struct kv * +wormleaf_remove(struct wormleaf * const leaf, const u32 ih, const u32 is) +{ + // ss + leaf->ss[is] = leaf->ss[leaf->nr_keys - 1]; + if (leaf->nr_sorted > is) + leaf->nr_sorted = is; + + // ret + struct kv * const victim = wormleaf_kv_at_ih(leaf, ih); + // hs + leaf->hs[ih].v64 = 0; + leaf->nr_keys--; + // use magnet + wormleaf_pull_ih(leaf, ih); + return victim; +} + +// remove key from leaf but do not call free + static struct kv * +wormleaf_remove_ih(struct wormleaf * const leaf, const u32 ih) +{ + // remove from ss + const u32 is = wormleaf_search_is(leaf, (u8)ih); + debug_assert(is < leaf->nr_keys); + return wormleaf_remove(leaf, ih, is); +} + + static struct kv * +wormleaf_remove_is(struct wormleaf * const leaf, const u32 is) +{ + return wormleaf_remove(leaf, leaf->ss[is], is); +} + +// for delr (delete-range) + static void +wormleaf_delete_range(struct wormhole * const map, struct wormleaf * const leaf, + const u32 i0, const u32 end) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + for (u32 i = end; i > i0; i--) { + const u32 ir = i - 1; + struct kv * const victim = wormleaf_remove_is(leaf, ir); + map->mm.free(victim, map->mm.priv); + } +} + +// return the old kv; the caller should free the old kv + static struct kv * 
+wormleaf_update(struct wormleaf * const leaf, const u32 ih, const struct kv * const new) +{ + debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen))); + // search entry in ss (is) + struct kv * const old = wormleaf_kv_at_ih(leaf, ih); + debug_assert(old); + + entry13_update_e3(&leaf->hs[ih], (u64)new); + return old; +} +// }}} leaf-write + +// leaf-split {{{ +// It only works correctly in cut_search +// quickly tell if a cut between k1 and k2 can achieve a specific anchor-key length + static bool +wormhole_split_cut_alen_check(const u32 alen, const struct kv * const k1, const struct kv * const k2) +{ + debug_assert(k2->klen >= alen); + return (k1->klen < alen) || (k1->kv[alen - 1] != k2->kv[alen - 1]); +} + +// return the number of keys that should go to leaf1 +// assert(r > 0 && r <= nr_keys) +// (1) r < is1, anchor key is ss[r-1]:ss[r] +// (2) r == is1: anchor key is ss[r-1]:new +// (3) r == is1+1: anchor key is new:ss[r-1] (ss[r-1] is the ss[r] on the logically sorted array) +// (4) r > is1+1: anchor key is ss[r-2]:ss[r-1] (ss[r-2] is the [r-1] on the logically sorted array) +// edge cases: +// (case 2) is1 == nr_keys: r = nr_keys; ss[r-1]:new +// (case 3) is1 == 0, r == 1; new:ss[0] +// return 1..WH_KPN + static u32 +wormhole_split_cut_search1(struct wormleaf * const leaf, u32 l, u32 h, const u32 is1, const struct kv * const new) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + debug_assert(leaf->nr_keys); + debug_assert(l < h && h <= leaf->nr_sorted); + + const struct kv * const kl0 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const kh0 = wormleaf_kv_at_is1(leaf, h, is1, new); + const u32 alen = kv_key_lcp(kl0, kh0) + 1; + if (unlikely(alen > UINT16_MAX)) + return WH_KPN2; + + const u32 target = leaf->next ? 
WH_MID : WH_KPN_MRG; + while ((l + 1) < h) { + const u32 m = (l + h + 1) >> 1; + if (m <= target) { // try right + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, m, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, h, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + l = m; + else + h = m; + } else { // try left + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, m, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + h = m; + else + l = m; + } + } + return h; +} + + static void +wormhole_split_leaf_move1(struct wormleaf * const leaf1, struct wormleaf * const leaf2, + const u32 cut, const u32 is1, const struct kv * const new) +{ + const u32 nr_keys = leaf1->nr_keys; + const struct entry13 e1 = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + struct entry13 es[WH_KPN]; + + if (cut <= is1) { // e1 goes to leaf2 + // leaf2 + for (u32 i = cut; i < is1; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + wormleaf_insert_e13(leaf2, e1); + + for (u32 i = is1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + } else { // e1 goes to leaf1 + // leaf2 + for (u32 i = cut - 1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < is1; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + es[is1] = e1; + + for (u32 i = is1 + 1; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i - 1]]; + } + + leaf2->nr_sorted = leaf2->nr_keys; + + memset(leaf1->hs, 0, sizeof(leaf1->hs[0]) * WH_KPN); + leaf1->nr_keys = 0; + for (u32 i = 0; i < cut; i++) + wormleaf_insert_e13(leaf1, es[i]); + leaf1->nr_sorted = cut; + debug_assert((leaf1->nr_sorted + leaf2->nr_sorted) == (nr_keys + 1)); +} + +// create an anchor for leaf-split + static struct kv * +wormhole_split_alloc_anchor(const struct kv * const key1, const 
struct kv * const key2) +{ + const u32 alen = kv_key_lcp(key1, key2) + 1; + debug_assert(alen <= key2->klen); + + struct kv * const anchor = wormhole_alloc_akey(alen); + if (anchor) + kv_refill(anchor, key2->kv, alen, NULL, 0); + return anchor; +} + +// leaf1 is locked +// split leaf1 into leaf1+leaf2; insert new into leaf1 or leaf2, return leaf2 + static struct wormleaf * +wormhole_split_leaf(struct wormhole * const map, struct wormleaf * const leaf1, struct kv * const new) +{ + wormleaf_sync_sorted(leaf1); + struct kref kref_new; + kref_ref_kv(&kref_new, new); + const u32 is1 = wormleaf_search_ss(leaf1, &kref_new); // new should be inserted at [is1] + const u32 cut = wormhole_split_cut_search1(leaf1, 0, leaf1->nr_keys, is1, new); + if (unlikely(cut == WH_KPN2)) + return NULL; + + // anchor of leaf2 + debug_assert(cut && (cut <= leaf1->nr_keys)); + const struct kv * const key1 = wormleaf_kv_at_is1(leaf1, cut - 1, is1, new); + const struct kv * const key2 = wormleaf_kv_at_is1(leaf1, cut, is1, new); + struct kv * const anchor2 = wormhole_split_alloc_anchor(key1, key2); + if (unlikely(anchor2 == NULL)) // anchor alloc failed + return NULL; + + // create leaf2 with anchor2 + struct wormleaf * const leaf2 = wormleaf_alloc(map, leaf1, leaf1->next, anchor2); + if (unlikely(leaf2 == NULL)) { + wormhole_free_akey(anchor2); + return NULL; + } + + // split_hmap will unlock the leaf nodes; must move now + wormhole_split_leaf_move1(leaf1, leaf2, cut, is1, new); + // leaf1 and leaf2 should be sorted after split + debug_assert(leaf1->nr_keys == leaf1->nr_sorted); + debug_assert(leaf2->nr_keys == leaf2->nr_sorted); + + return leaf2; +} +// }}} leaf-split + +// leaf-merge {{{ +// MERGE is the only operation that deletes a leaf node (leaf2). +// It ALWAYS merges the right node into the left node even if the left is empty. +// This requires both of their writer locks to be acquired. +// This allows iterators to safely probe the next node (but not backwards). 
+// In other words, if either the reader or the writer lock of node X has been acquired:
+// X->next (the pointer) cannot be changed by any other thread.
+// X->next cannot be deleted.
+// But the content in X->next can still be changed.
+// Move every entry of leaf2 (the right node) into leaf1 (the left node).
+// Caller holds both writer locks; the combined count must fit in one node (asserted).
+  static bool
+wormleaf_merge(struct wormleaf * const leaf1, struct wormleaf * const leaf2)
+{
+  debug_assert((leaf1->nr_keys + leaf2->nr_keys) <= WH_KPN);
+  // remember whether leaf1 was fully sorted before appending leaf2's entries
+  const bool leaf1_sorted = leaf1->nr_keys == leaf1->nr_sorted;
+
+  // insert leaf2's entries in its sorted (ss) order
+  for (u32 i = 0; i < leaf2->nr_keys; i++)
+    wormleaf_insert_e13(leaf1, leaf2->hs[leaf2->ss[i]]);
+  // leaf2 is the right neighbor, so its sorted run can extend leaf1's sorted
+  // prefix, but only if leaf1 was fully sorted to begin with
+  if (leaf1_sorted)
+    leaf1->nr_sorted += leaf2->nr_sorted;
+  // always true in the current implementation; callers still check the result
+  // (see wormleaf_split_undo / wormhole_meta_leaf_merge)
+  return true;
+}
+
+// for undoing insertion under split_meta failure; leaf2 is still local
+// remove the new key; merge keys in leaf2 into leaf1; free leaf2
+  static void
+wormleaf_split_undo(struct wormhole * const map, struct wormleaf * const leaf1,
+    struct wormleaf * const leaf2, struct kv * const new)
+{
+  if (new) {
+    // the new key landed in either leaf1 or leaf2 during the split; locate it
+    // by its hash entry and remove it before merging the two nodes back
+    const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new));
+    const u32 im1 = wormleaf_search_ih(leaf1, e);
+    if (im1 < WH_KPN) {
+      (void)wormleaf_remove_ih(leaf1, im1);
+    } else { // not found in leaf1; search leaf2
+      const u32 im2 = wormleaf_search_ih(leaf2, e);
+      debug_assert(im2 < WH_KPN);
+      (void)wormleaf_remove_ih(leaf2, im2);
+    }
+  }
+  // this merge must succeed
+  if (!wormleaf_merge(leaf1, leaf2))
+    debug_die();
+  // Keep this to avoid triggering false alarm in wormleaf_free
+  leaf2->leaflock.opaque = 0;
+  wormleaf_free(map->slab_leaf, leaf2);
+}
+// }}} leaf-merge
+
+// get/probe {{{
+// Point lookup: jump to the target leaf under its reader lock, match the key
+// by hash, and copy the value out through the mm.out callback (continues below).
+  struct kv *
+wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out)
+{
+  struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key);
+  const u32 i = wormleaf_match_hs(leaf, key);
+  struct kv * const tmp = (i < WH_KPN) ?
+ref->map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL;
+  wormleaf_unlock_read(leaf);
+  return tmp;
+}
+
+// QSBR-safe wrapper: leave the parked state, perform the get, park again.
+  struct kv *
+whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out)
+{
+  wormhole_resume(ref);
+  struct kv * const ret = wormhole_get(ref, key, out);
+  wormhole_park(ref);
+  return ret;
+}
+
+// Single-threaded get: no reader lock is taken.
+  struct kv *
+whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out)
+{
+  struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key);
+  const u32 i = wormleaf_match_hs(leaf, key);
+  return (i < WH_KPN) ? map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL;
+}
+
+// Existence check: same jump+match as get, but no value is copied out.
+  bool
+wormhole_probe(struct wormref * const ref, const struct kref * const key)
+{
+  struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key);
+  const u32 i = wormleaf_match_hs(leaf, key);
+  wormleaf_unlock_read(leaf);
+  return i < WH_KPN;
+}
+
+// QSBR-safe wrapper around wormhole_probe (resume/park).
+  bool
+whsafe_probe(struct wormref * const ref, const struct kref * const key)
+{
+  wormhole_resume(ref);
+  const bool r = wormhole_probe(ref, key);
+  wormhole_park(ref);
+  return r;
+}
+
+// Single-threaded probe: no locking.
+  bool
+whunsafe_probe(struct wormhole * const map, const struct kref * const key)
+{
+  struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key);
+  return wormleaf_match_hs(leaf, key) < WH_KPN;
+}
+// }}} get/probe
+
+// meta-split {{{
+// duplicate from meta1; only has one bit but will soon add a new bit
+// Copy the compact (slab1) node into a full-size (slab2) node carrying a real
+// bitmap, set the single bit recorded by bitmin, and swap the two nodes in the
+// hash map; the old node is then released (continues below).
+  static struct wormmeta *
+wormmeta_expand(struct wormhmap * const hmap, struct wormmeta * const meta1)
+{
+  struct wormmeta * const meta2 = slab_alloc_unsafe(hmap->slab2);
+  if (meta2 == NULL)
+    return NULL;
+
+  memcpy(meta2, meta1, sizeof(*meta1));
+  for (u32 i = 0; i < WH_BMNR; i++)
+    meta2->bitmap[i] = 0;
+  const u32 bitmin = wormmeta_bitmin_load(meta1);
+  // a compact node holds exactly one bit, so bitmin == bitmax
+  debug_assert(bitmin == wormmeta_bitmax_load(meta1));
+  debug_assert(bitmin < WH_FO);
+  // set the only bit
+  meta2->bitmap[bitmin >> 6u] |= (1lu << (bitmin & 0x3fu));
+
+  wormhmap_replace(hmap, meta1, meta2);
+
slab_free_unsafe(hmap->slab1, meta1); + return meta2; +} + + static struct wormmeta * +wormmeta_bm_set_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id) +{ + debug_assert(id < WH_FO); + const u32 bitmin = wormmeta_bitmin_load(meta); + const u32 bitmax = wormmeta_bitmax_load(meta); + if (bitmin < bitmax) { // already in full size + wormmeta_bm_set(meta, id); + return meta; + } else if (id == bitmin) { // do nothing + return meta; + } else if (bitmin == WH_FO) { // add the first bit + wormmeta_bitmin_store(meta, id); + wormmeta_bitmax_store(meta, id); + return meta; + } else { // need to expand + struct wormmeta * const meta2 = wormmeta_expand(hmap, meta); + wormmeta_bm_set(meta2, id); + return meta2; + } +} + +// return true if a new node is created + static void +wormmeta_split_touch(struct wormhmap * const hmap, struct kv * const mkey, + struct wormleaf * const leaf, const u32 alen) +{ + struct wormmeta * meta = wormhmap_get(hmap, mkey); + if (meta) { + if (mkey->klen < alen) + meta = wormmeta_bm_set_helper(hmap, meta, mkey->kv[mkey->klen]); + if (wormmeta_lmost_load(meta) == leaf->next) + wormmeta_lmost_store(meta, leaf); + else if (wormmeta_rmost_load(meta) == leaf->prev) + wormmeta_rmost_store(meta, leaf); + } else { // create new node + const u32 bit = (mkey->klen < alen) ? 
mkey->kv[mkey->klen] : WH_FO; + meta = wormmeta_alloc(hmap, leaf, mkey, alen, bit); + debug_assert(meta); + wormhmap_set(hmap, meta); + } +} + + static void +wormmeta_lpath_update(struct wormhmap * const hmap, const struct kv * const a1, const struct kv * const a2, + struct wormleaf * const lpath) +{ + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(a2, pbuf); + + // only need to update a2's own branch + u32 i = kv_key_lcp(a1, a2) + 1; + debug_assert(i <= pbuf->klen); + wormhole_prefix(pbuf, i); + while (i < a2->klen) { + debug_assert(i <= hmap->maxplen); + struct wormmeta * const meta = wormhmap_get(hmap, pbuf); + debug_assert(meta); + wormmeta_lpath_store(meta, lpath); + + i++; + wormhole_prefix_inc1(pbuf); + } +} + +// for leaf1, a leaf2 is already linked at its right side. +// this function updates the meta-map by moving leaf1 and hooking leaf2 at correct positions + static void +wormmeta_split(struct wormhmap * const hmap, struct wormleaf * const leaf, + struct kv * const mkey) +{ + // left branches + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + u32 i = next ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + + // save klen + const u32 mklen = mkey->klen; + wormhole_prefix(mkey, i); + do { + wormmeta_split_touch(hmap, mkey, leaf, alen); + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(mkey); + } while (true); + + // adjust maxplen; i is the plen of the last _touch() + if (i > hmap->maxplen) + hmap->maxplen = i; + debug_assert(i <= UINT16_MAX); + + // restore klen + mkey->klen = mklen; + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, leaf); +} + +// all locks will be released before returning + static bool +wormhole_split_meta(struct wormref * const ref, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + struct wormhole * const map = ref->map; + // metalock + wormhmap_lock(map, ref); + + // check slab reserve + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + + // link + struct wormleaf * const leaf1 = leaf2->prev; + leaf1->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + // update versions + const u64 v1 = wormhmap_version_load(hmap0) + 1; + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_split(hmap1, leaf2, mkey); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + wormleaf_unlock_write(leaf1); + wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_split(hmap0, leaf2, mkey); + + wormhmap_unlock(map); + + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + +// all locks (metalock + leaflocks) will be released before 
returning +// leaf1->lock (write) is already taken + static bool +wormhole_split_insert(struct wormref * const ref, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(ref->map, leaf1, new); + if (unlikely(leaf2 == NULL)) { + wormleaf_unlock_write(leaf1); + return false; + } + + rwlock_lock_write(&(leaf2->leaflock)); + const bool rsm = wormhole_split_meta(ref, leaf2); + if (unlikely(!rsm)) { + // undo insertion & merge; free leaf2 + wormleaf_split_undo(ref->map, leaf1, leaf2, new); + wormleaf_unlock_write(leaf1); + } + return rsm; +} + + static bool +whunsafe_split_meta(struct wormhole * const map, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + // link + leaf2->prev->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_split(&(map->hmap2[i]), leaf2, mkey); + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + + static bool +whunsafe_split_insert(struct wormhole * const map, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(map, leaf1, new); + if (unlikely(leaf2 == NULL)) + return false; + + const bool rsm = whunsafe_split_meta(map, leaf2); + if (unlikely(!rsm)) // undo insertion, merge, free leaf2 + wormleaf_split_undo(map, leaf1, leaf2, new); + + return rsm; +} +// }}} meta-split + +// meta-merge {{{ +// now it only contains one bit + static struct wormmeta * +wormmeta_shrink(struct wormhmap * const hmap, struct wormmeta * const meta2) +{ + debug_assert(wormmeta_bitmin_load(meta2) == wormmeta_bitmax_load(meta2)); + struct wormmeta * 
const meta1 = slab_alloc_unsafe(hmap->slab1); + if (meta1 == NULL) + return NULL; + + memcpy(meta1, meta2, sizeof(*meta1)); + + wormhmap_replace(hmap, meta2, meta1); + slab_free_unsafe(hmap->slab2, meta2); + return meta1; +} + + static void +wormmeta_bm_clear_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id) +{ + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) { + debug_assert(wormmeta_bitmin_load(meta) < WH_FO); + wormmeta_bitmin_store(meta, WH_FO); + wormmeta_bitmax_store(meta, WH_FO); + } else { // has more than 1 bit + wormmeta_bm_clear(meta, id); + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) + wormmeta_shrink(hmap, meta); + } +} + +// all locks held + static void +wormmeta_merge(struct wormhmap * const hmap, struct wormleaf * const leaf) +{ + // leaf->next is the new next after merge, which can be NULL + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(leaf->anchor, pbuf); + u32 i = (prev && next) ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + wormhole_prefix(pbuf, i); + struct wormmeta * parent = NULL; + do { + debug_assert(i <= hmap->maxplen); + struct wormmeta * meta = wormhmap_get(hmap, pbuf); + if (wormmeta_lmost_load(meta) == wormmeta_rmost_load(meta)) { // delete single-child + debug_assert(wormmeta_lmost_load(meta) == leaf); + const u32 bitmin = wormmeta_bitmin_load(meta); + wormhmap_del(hmap, meta); + wormmeta_free(hmap, meta); + if (parent) { + wormmeta_bm_clear_helper(hmap, parent, pbuf->kv[i-1]); + parent = NULL; + } + if (bitmin == WH_FO) // no child + break; + } else { // adjust lmost rmost + if (wormmeta_lmost_load(meta) == leaf) + wormmeta_lmost_store(meta, next); + else if (wormmeta_rmost_load(meta) == leaf) + wormmeta_rmost_store(meta, prev); + parent = meta; + } + + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(pbuf); + } while (true); + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, prev); +} + +// all locks (metalock + two leaflock) will be released before returning +// merge leaf2 to leaf1, removing all metadata to leaf2 and leaf2 itself + static void +wormhole_meta_merge(struct wormref * const ref, struct wormleaf * const leaf1, + struct wormleaf * const leaf2, const bool unlock_leaf1) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + struct wormhole * const map = ref->map; + + wormhmap_lock(map, ref); + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + const u64 v1 = wormhmap_version_load(hmap0) + 1; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_merge(hmap1, leaf2); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + if (unlock_leaf1) + wormleaf_unlock_write(leaf1); + 
wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_merge(hmap0, leaf2); + // leaf2 is now safe to be removed + wormleaf_free(map->slab_leaf, leaf2); + wormhmap_unlock(map); +} + +// caller must acquire leaf->wlock and next->wlock +// all locks will be released when this function returns + static bool +wormhole_meta_leaf_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * const next = leaf->next; + debug_assert(next); + + // double check + if ((leaf->nr_keys + next->nr_keys) <= WH_KPN) { + if (wormleaf_merge(leaf, next)) { + wormhole_meta_merge(ref, leaf, next, true); + return true; + } + } + // merge failed but it's fine + wormleaf_unlock_write(leaf); + wormleaf_unlock_write(next); + return false; +} + + static void +whunsafe_meta_leaf_merge(struct wormhole * const map, struct wormleaf * const leaf1, + struct wormleaf * const leaf2) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + if (!wormleaf_merge(leaf1, leaf2)) + return; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_merge(&(map->hmap2[i]), leaf2); + wormleaf_free(map->slab_leaf, leaf2); +} +// }}} meta-merge + +// put {{{ + bool +wormhole_put(struct wormref * const ref, struct kv * const kv) +{ + // we always allocate a new item on SET + // future optimizations may perform in-place update + struct wormhole * const map = ref->map; + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { + struct kv * const old = wormleaf_update(leaf, im, new); + wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just 
insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_put(struct wormref * const ref, struct kv * const kv) +{ + wormhole_resume(ref); + const bool r = wormhole_put(ref, kv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_put(struct wormhole * const map, struct kv * const kv) +{ + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { // overwrite + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormhole * const map = ref->map; + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) { // no replacement + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + struct kv * const old = wormleaf_update(leaf, im, new); + 
wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) { // nothing to be inserted + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_merge(ref, kref, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) // nothing to be inserted + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) // mm error + return false; + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + 
const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} +// }}} put + +// inplace {{{ + bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_read(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_read(leaf); + return false; + } +} + + bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_write(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_write(leaf); + return false; + } +} + + bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpr(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpw(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // overwrite + uf(wormleaf_kv_at_ih(leaf, im), priv); + return true; + } else { + uf(NULL, priv); + return false; + } +} +// }}} put + +// del {{{ + static void +wormhole_del_try_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * 
const next = leaf->next;
+  // opportunistic merge with the right neighbor: attempted when this leaf is
+  // empty or the combined size is below the merge threshold WH_KPN_MRG
+  if (next && ((leaf->nr_keys == 0) || ((leaf->nr_keys + next->nr_keys) < WH_KPN_MRG))) {
+    // try merge, it may fail if size becomes larger after locking
+    wormleaf_lock_write(next, ref);
+    (void)wormhole_meta_leaf_merge(ref, leaf);
+    // locks are already released; immediately return
+  } else {
+    wormleaf_unlock_write(leaf);
+  }
+}
+
+// Delete one key: remove its hash-slot entry under the leaf's writer lock,
+// opportunistically merge with the right neighbor, then free the kv.
+// Returns true iff the key was found and removed.
+  bool
+wormhole_del(struct wormref * const ref, const struct kref * const key)
+{
+  struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key);
+  const u32 im = wormleaf_match_hs(leaf, key);
+  if (im < WH_KPN) { // found
+    struct kv * const kv = wormleaf_remove_ih(leaf, im);
+    wormhole_del_try_merge(ref, leaf);
+    debug_assert(kv);
+    // free after releasing locks
+    struct wormhole * const map = ref->map;
+    map->mm.free(kv, map->mm.priv);
+    return true;
+  } else {
+    wormleaf_unlock_write(leaf);
+    return false;
+  }
+}
+
+// QSBR-safe wrapper around wormhole_del (resume/park).
+  bool
+whsafe_del(struct wormref * const ref, const struct kref * const key)
+{
+  wormhole_resume(ref);
+  const bool r = wormhole_del(ref, key);
+  wormhole_park(ref);
+  return r;
+}
+
+// Single-threaded post-delete merge. WH_KPN is used as a sentinel size for a
+// missing neighbor so the combined-size test on that side can never pass.
+  static void
+whunsafe_del_try_merge(struct wormhole * const map, struct wormleaf * const leaf)
+{
+  const u32 n0 = leaf->prev ? leaf->prev->nr_keys : WH_KPN;
+  const u32 n1 = leaf->nr_keys;
+  const u32 n2 = leaf->next ?
leaf->next->nr_keys : WH_KPN; + + if ((leaf->prev && (n1 == 0)) || ((n0 + n1) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf->prev, leaf); + } else if ((leaf->next && (n1 == 0)) || ((n1 + n2) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf, leaf->next); + } +} + + bool +whunsafe_del(struct wormhole * const map, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // found + struct kv * const kv = wormleaf_remove_ih(leaf, im); + debug_assert(kv); + + whunsafe_del_try_merge(map, leaf); + map->mm.free(kv, map->mm.priv); + return true; + } + return false; +} + + u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + struct wormleaf * const leafa = wormhole_jump_leaf_write(ref, start); + wormleaf_sync_sorted(leafa); + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) { // do nothing if end < start + wormleaf_unlock_write(leafa); + return 0; + } + u64 ndel = iaz - ia; + struct wormhole * const map = ref->map; + wormleaf_delete_range(map, leafa, ia, iaz); + if (leafa->nr_keys > ia) { // end hit; done + wormhole_del_try_merge(ref, leafa); + return ndel; + } + + while (leafa->next) { + struct wormleaf * const leafx = leafa->next; + wormleaf_lock_write(leafx, ref); + // two leaf nodes locked + wormleaf_sync_sorted(leafx); + const u32 iz = end ? 
wormleaf_seek_end(leafx, end) : leafx->nr_keys; + ndel += iz; + wormleaf_delete_range(map, leafx, 0, iz); + if (leafx->nr_keys == 0) { // removed all + // must hold leaf1's lock for the next iteration + wormhole_meta_merge(ref, leafa, leafx, false); + } else { // partially removed; done + (void)wormhole_meta_leaf_merge(ref, leafa); + return ndel; + } + } + wormleaf_unlock_write(leafa); + return ndel; +} + + u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + wormhole_resume(ref); + const u64 ret = wormhole_delr(ref, start, end); + wormhole_park(ref); + return ret; +} + + u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end) +{ + // first leaf + struct wormhmap * const hmap = map->hmap; + struct wormleaf * const leafa = wormhole_jump_leaf(hmap, start); + wormleaf_sync_sorted(leafa); + // last leaf + struct wormleaf * const leafz = end ? wormhole_jump_leaf(hmap, end) : NULL; + + // select start/end on leafa + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? 
wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) + return 0; + + wormleaf_delete_range(map, leafa, ia, iaz); + u64 ndel = iaz - ia; + + if (leafa == leafz) { // one node only + whunsafe_del_try_merge(map, leafa); + return ndel; + } + + // 0 or more nodes between leafa and leafz + while (leafa->next != leafz) { + struct wormleaf * const leafx = leafa->next; + ndel += leafx->nr_keys; + for (u32 i = 0; i < leafx->nr_keys; i++) + map->mm.free(wormleaf_kv_at_is(leafx, i), map->mm.priv); + leafx->nr_keys = 0; + leafx->nr_sorted = 0; + whunsafe_meta_leaf_merge(map, leafa, leafx); + } + // delete the smaller keys in leafz + if (leafz) { + wormleaf_sync_sorted(leafz); + const u32 iz = wormleaf_seek_end(leafz, end); + wormleaf_delete_range(map, leafz, 0, iz); + ndel += iz; + whunsafe_del_try_merge(map, leafa); + } + return ndel; +} +// }}} del + +// iter {{{ +// safe iter: safe sort with read-lock acquired +// unsafe iter: allow concurrent seek/skip + static void +wormhole_iter_leaf_sync_sorted(struct wormleaf * const leaf) +{ + if (unlikely(leaf->nr_keys != leaf->nr_sorted)) { + spinlock_lock(&(leaf->sortlock)); + wormleaf_sync_sorted(leaf); + spinlock_unlock(&(leaf->sortlock)); + } +} + + struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = ref; + iter->map = ref->map; + iter->leaf = NULL; + iter->is = 0; + return iter; +} + + static void +wormhole_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) { + struct wormref * const ref = iter->ref; + wormleaf_lock_read(next, ref); + wormleaf_unlock_read(iter->leaf); + + wormhole_iter_leaf_sync_sorted(next); + } else { + wormleaf_unlock_read(iter->leaf); + } + iter->leaf = next; + iter->is = 0; + if 
(!wormhole_iter_valid(iter))
+      return;
+  }
+}
+
+// Position the iterator at the first key >= key. Releases the previously held
+// leaf reader lock (if any), jumps to the target leaf under its reader lock,
+// sorts it if needed, then fixes up past-the-end positions.
+  void
+wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key)
+{
+  debug_assert(key);
+  if (iter->leaf)
+    wormleaf_unlock_read(iter->leaf);
+
+  struct wormleaf * const leaf = wormhole_jump_leaf_read(iter->ref, key);
+  wormhole_iter_leaf_sync_sorted(leaf);
+
+  iter->leaf = leaf;
+  iter->is = wormleaf_seek(leaf, key);
+  wormhole_iter_fix(iter);
+}
+
+// QSBR-safe seek: resume the ref first; pair with whsafe_iter_park/destroy.
+  void
+whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key)
+{
+  wormhole_resume(iter->ref);
+  wormhole_iter_seek(iter, key);
+}
+
+// An iterator is valid while it holds a leaf; NULL means end-of-index or parked.
+  bool
+wormhole_iter_valid(struct wormhole_iter * const iter)
+{
+  return iter->leaf != NULL;
+}
+
+// Return a pointer to the current kv inside the leaf (no copy), or NULL when
+// the iterator is invalid. The pointer is borrowed from the leaf's storage.
+  static struct kv *
+wormhole_iter_current(struct wormhole_iter * const iter)
+{
+  if (wormhole_iter_valid(iter)) {
+    debug_assert(iter->is < iter->leaf->nr_sorted);
+    struct kv * const kv = wormleaf_kv_at_is(iter->leaf, iter->is);
+    return kv;
+  }
+  return NULL;
+}
+
+// Copy the current kv out through mm.out without advancing; NULL when invalid.
+  struct kv *
+wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out)
+{
+  struct kv * const kv = wormhole_iter_current(iter);
+  if (kv) {
+    struct kv * const ret = iter->map->mm.out(kv, out);
+    return ret;
+  }
+  return NULL;
+}
+
+// Borrow a key reference to the current kv (no copy); false when invalid.
+  bool
+wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref)
+{
+  struct kv * const kv = wormhole_iter_current(iter);
+  if (kv) {
+    kref_ref_kv(kref, kv);
+    return true;
+  }
+  return false;
+}
+
+// Borrow a key-value reference to the current kv (no copy); false when invalid.
+  bool
+wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref)
+{
+  struct kv * const kv = wormhole_iter_current(iter);
+  if (kv) {
+    kvref_ref_kv(kvref, kv);
+    return true;
+  }
+  return false;
+}
+
+// Advance by one position; iter_fix walks to the next leaf when needed.
+  void
+wormhole_iter_skip1(struct wormhole_iter * const iter)
+{
+  if (wormhole_iter_valid(iter)) {
+    iter->is++;
+    wormhole_iter_fix(iter);
+  }
+}
+
+// Advance by up to nr positions, leaf by leaf (continues below).
+  void
+wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr)
+{
+  u32 todo = nr;
+  while (todo && wormhole_iter_valid(iter)) {
+    const u32 cap = iter->leaf->nr_sorted -
iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + wormhole_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + wormhole_iter_skip1(iter); + return ret; +} + + bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + struct kv * const kv = wormhole_iter_current(iter); + uf(kv, priv); // call uf even if (kv == NULL) + return kv != NULL; +} + + void +wormhole_iter_park(struct wormhole_iter * const iter) +{ + if (iter->leaf) { + wormleaf_unlock_read(iter->leaf); + iter->leaf = NULL; + } +} + + void +whsafe_iter_park(struct wormhole_iter * const iter) +{ + wormhole_iter_park(iter); + wormhole_park(iter->ref); +} + + void +wormhole_iter_destroy(struct wormhole_iter * const iter) +{ + if (iter->leaf) + wormleaf_unlock_read(iter->leaf); + free(iter); +} + + void +whsafe_iter_destroy(struct wormhole_iter * const iter) +{ + wormhole_park(iter->ref); + wormhole_iter_destroy(iter); +} +// }}} iter + +// unsafe iter {{{ + struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = NULL; + iter->map = map; + iter->leaf = NULL; + iter->is = 0; + whunsafe_iter_seek(iter, kref_null()); + return iter; +} + + static void +whunsafe_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) + wormhole_iter_leaf_sync_sorted(next); + iter->leaf = next; + iter->is = 0; + if (!wormhole_iter_valid(iter)) + return; + } +} + + void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(iter->map->hmap, 
key); + wormhole_iter_leaf_sync_sorted(leaf); + + iter->leaf = leaf; + iter->is = wormleaf_seek(leaf, key); + whunsafe_iter_fix(iter); +} + + void +whunsafe_iter_skip1(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + iter->is++; + whunsafe_iter_fix(iter); + } +} + + void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + u32 todo = nr; + while (todo && wormhole_iter_valid(iter)) { + const u32 cap = iter->leaf->nr_sorted - iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + whunsafe_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + whunsafe_iter_skip1(iter); + return ret; +} + + void +whunsafe_iter_destroy(struct wormhole_iter * const iter) +{ + free(iter); +} +// }}} unsafe iter + +// misc {{{ + struct wormref * +wormhole_ref(struct wormhole * const map) +{ + struct wormref * const ref = malloc(sizeof(*ref)); + if (ref == NULL) + return NULL; + ref->map = map; + if (qsbr_register(map->qsbr, &(ref->qref)) == false) { + free(ref); + return NULL; + } + return ref; +} + + struct wormref * +whsafe_ref(struct wormhole * const map) +{ + struct wormref * const ref = wormhole_ref(map); + if (ref) + wormhole_park(ref); + return ref; +} + + struct wormhole * +wormhole_unref(struct wormref * const ref) +{ + struct wormhole * const map = ref->map; + qsbr_unregister(map->qsbr, &(ref->qref)); + free(ref); + return map; +} + + inline void +wormhole_park(struct wormref * const ref) +{ + qsbr_park(&(ref->qref)); +} + + inline void +wormhole_resume(struct wormref * const ref) +{ + qsbr_resume(&(ref->qref)); +} + + inline void +wormhole_refresh_qstate(struct wormref * const ref) +{ + qsbr_update(&(ref->qref), wormhmap_version_load(wormhmap_load(ref->map))); +} + + static void +wormhole_clean_hmap(struct wormhole * const map) +{ + for (u32 x = 0; x < 2; x++) { + if 
(map->hmap2[x].pmap == NULL) + continue; + struct wormhmap * const hmap = &(map->hmap2[x]); + const u64 nr_slots = ((u64)(hmap->mask)) + 1; + struct wormmbkt * const pmap = hmap->pmap; + for (u64 s = 0; s < nr_slots; s++) { + struct wormmbkt * const slot = &(pmap[s]); + for (u32 i = 0; i < WH_BKT_NR; i++) + if (slot->e[i]) + wormmeta_keyref_release(slot->e[i]); + } + + slab_free_all(hmap->slab1); + slab_free_all(hmap->slab2); + memset(hmap->pmap, 0, hmap->msize); + hmap->maxplen = 0; + } +} + + static void +wormhole_free_leaf_keys(struct wormhole * const map, struct wormleaf * const leaf) +{ + const u32 nr = leaf->nr_keys; + for (u32 i = 0; i < nr; i++) { + void * const curr = wormleaf_kv_at_is(leaf, i); + debug_assert(curr); + map->mm.free(curr, map->mm.priv); + } + wormhole_free_akey(leaf->anchor); +} + + static void +wormhole_clean_helper(struct wormhole * const map) +{ + wormhole_clean_hmap(map); + for (struct wormleaf * leaf = map->leaf0; leaf; leaf = leaf->next) + wormhole_free_leaf_keys(map, leaf); + slab_free_all(map->slab_leaf); + map->leaf0 = NULL; +} + +// unsafe + void +wormhole_clean(struct wormhole * const map) +{ + wormhole_clean_helper(map); + wormhole_create_leaf0(map); +} + + void +wormhole_destroy(struct wormhole * const map) +{ + wormhole_clean_helper(map); + for (u32 i = 0; i < 2; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (hmap->slab1) + slab_destroy(hmap->slab1); + if (hmap->slab2) + slab_destroy(hmap->slab2); + wormhmap_deinit(hmap); + } + qsbr_destroy(map->qsbr); + slab_destroy(map->slab_leaf); + free(map->pbuf); + free(map); +} + + void +wormhole_fprint(struct wormhole * const map, FILE * const out) +{ + const u64 nr_slab_ul = slab_get_nalloc(map->slab_leaf); + const u64 nr_slab_um11 = slab_get_nalloc(map->hmap2[0].slab1); + const u64 nr_slab_um12 = slab_get_nalloc(map->hmap2[0].slab2); + const u64 nr_slab_um21 = map->hmap2[1].slab1 ? 
slab_get_nalloc(map->hmap2[1].slab1) : 0; + const u64 nr_slab_um22 = map->hmap2[1].slab2 ? slab_get_nalloc(map->hmap2[1].slab2) : 0; + fprintf(out, "%s L-SLAB %lu M-SLAB [0] %lu+%lu [1] %lu+%lu\n", + __func__, nr_slab_ul, nr_slab_um11, nr_slab_um12, nr_slab_um21, nr_slab_um22); +} +// }}} misc + +// api {{{ +const struct kvmap_api kvmap_api_wormhole = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .refpark = true, + .put = (void *)wormhole_put, + .get = (void *)wormhole_get, + .probe = (void *)wormhole_probe, + .del = (void *)wormhole_del, + .inpr = (void *)wormhole_inpr, + .inpw = (void *)wormhole_inpw, + .merge = (void *)wormhole_merge, + .delr = (void *)wormhole_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)wormhole_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)wormhole_iter_park, + .iter_destroy = (void *)wormhole_iter_destroy, + .ref = (void *)wormhole_ref, + .unref = (void *)wormhole_unref, + .park = (void *)wormhole_park, + .resume = (void *)wormhole_resume, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whsafe = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .put = (void *)whsafe_put, + .get = (void *)whsafe_get, + .probe = (void *)whsafe_probe, + .del = (void *)whsafe_del, + .inpr = (void *)whsafe_inpr, + .inpw = (void *)whsafe_inpw, + .merge = (void *)whsafe_merge, + .delr = (void *)whsafe_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)whsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, 
+ .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)whsafe_iter_park, + .iter_destroy = (void *)whsafe_iter_destroy, + .ref = (void *)whsafe_ref, + .unref = (void *)wormhole_unref, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whunsafe = { + .hashkey = true, + .ordered = true, + .unique = true, + .put = (void *)whunsafe_put, + .get = (void *)whunsafe_get, + .probe = (void *)whunsafe_probe, + .del = (void *)whunsafe_del, + .inpr = (void *)whunsafe_inp, + .inpw = (void *)whunsafe_inp, + .merge = (void *)whunsafe_merge, + .delr = (void *)whunsafe_delr, + .iter_create = (void *)whunsafe_iter_create, + .iter_seek = (void *)whunsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)whunsafe_iter_skip1, + .iter_skip = (void *)whunsafe_iter_skip, + .iter_next = (void *)whunsafe_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_destroy = (void *)whunsafe_iter_destroy, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + + static void * +wormhole_kvmap_api_create(const char * const name, const struct kvmap_mm * const mm, char ** args) +{ + (void)args; + if ((!strcmp(name, "wormhole")) || (!strcmp(name, "whsafe"))) { + return wormhole_create(mm); + } else if (!strcmp(name, "whunsafe")) { + return whunsafe_create(mm); + } else { + return NULL; + } +} + +__attribute__((constructor)) + static void +wormhole_kvmap_api_init(void) +{ + kvmap_api_register(0, "wormhole", "", 
wormhole_kvmap_api_create, &kvmap_api_wormhole); + kvmap_api_register(0, "whsafe", "", wormhole_kvmap_api_create, &kvmap_api_whsafe); + kvmap_api_register(0, "whunsafe", "", wormhole_kvmap_api_create, &kvmap_api_whunsafe); +} +// }}} api + +// wh {{{ +// Users often don't enjoy dealing with struct kv/kref and just want to use plain buffers. +// No problem! +// This example library shows you how to use Wormhole efficiently in the most intuitive way. + +// Use the worry-free api +static const struct kvmap_api * const wh_api = &kvmap_api_whsafe; + +// You can change the wh_api to kvmap_api_wormhole with a one-line replacement +// The standard Wormhole api can give you ~5% boost; read README for thread-safety tips +//static const struct kvmap_api * const wh_api = &kvmap_api_wormhole; + + struct wormhole * +wh_create(void) +{ + // kvmap_mm_ndf (kv.h) will let the caller allocate the kv when inserting + // This can avoid a memcpy if the caller does not have the data in a struct kv + return wormhole_create(&kvmap_mm_ndf); +} + + struct wormref * +wh_ref(struct wormhole * const wh) +{ + return wh_api->ref(wh); +} + + void +wh_unref(struct wormref * const ref) +{ + (void)wh_api->unref(ref); +} + + void +wh_park(struct wormref * const ref) +{ + if (wh_api->park) + wh_api->park(ref); +} + + void +wh_resume(struct wormref * const ref) +{ + if (wh_api->resume) + wh_api->resume(ref); +} + + void +wh_clean(struct wormhole * const map) +{ + wh_api->clean(map); +} + + void +wh_destroy(struct wormhole * const map) +{ + wh_api->destroy(map); +} + +// Do set/put with explicit kv buffers + bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen) +{ + struct kv * const newkv = kv_create(kbuf, klen, vbuf, vlen); + if (newkv == NULL) + return false; + // must use with kvmap_mm_ndf (see below) + // the newkv will be saved in the Wormhole and freed by Wormhole when upon deletion + return wh_api->put(ref, newkv); +} + +// 
delete a key + bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->del(ref, &kref); +} + +// test if the key exist in Wormhole + bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->probe(ref, &kref); +} + +// for wh_get() +struct wh_inp_info { void * vbuf_out; u32 * vlen_out; u32 vbuf_size; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecesary memory copying + static void +wh_inp_copy_value(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_inp_info * const info = (typeof(info))priv; + // copy the value data out + const u32 copy_size = info->vbuf_size < curr->vlen ? info->vbuf_size : curr->vlen; + memcpy(info->vbuf_out, kv_vptr_c(curr), copy_size); + // copy the vlen out + *info->vlen_out = curr->vlen; + } +} + +// returns a boolean value indicating whether the key is found. 
+// the value's data will be written to *vlen_out and vbuf_out if the key is found +// if vbuf_size < vlen, then only the first vbuf_size bytes is copied to the buffer +// a small vbuf_size can be used to reduce memcpy cost when only the first a few bytes are needed + bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + struct wh_inp_info info = {vbuf_out, vlen_out, vbuf_size}; + // use the inplace read function to get the value if it exists + return wh_api->inpr(ref, &kref, wh_inp_copy_value, &info); +} + + bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpr(ref, &kref, uf, priv); +} + +// inplace update KV's value with a user-defined hook function +// the update should only modify the data in the value; It should not change the value size + bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpw(ref, &kref, uf, priv); +} + +// merge existing KV with updates with a user-defined hook function + bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->merge(ref, &kref, uf, priv); +} + +// remove a range of KVs from start (inclusive) to end (exclusive); [start, end) + u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end) +{ + struct kref kref_start, kref_end; + kref_ref_hash32(&kref_start, kbuf_start, klen_start); + kref_ref_hash32(&kref_end, kbuf_end, klen_end); + return 
wh_api->delr(ref, &kref_start, &kref_end); +} + + struct wormhole_iter * +wh_iter_create(struct wormref * const ref) +{ + return wh_api->iter_create(ref); +} + + void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + wh_api->iter_seek(iter, &kref); +} + + bool +wh_iter_valid(struct wormhole_iter * const iter) +{ + return wh_api->iter_valid(iter); +} + +// for wh_iter_peek() +// the out ptrs must be provided in pairs; use a pair of NULLs to ignore the key or value +struct wh_iter_inp_info { void * kbuf_out; void * vbuf_out; u32 kbuf_size; u32 vbuf_size; u32 * klen_out; u32 * vlen_out; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecesary memory copying + static void +inp_copy_kv_cb(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_iter_inp_info * const info = (typeof(info))priv; + + // copy the key + if (info->kbuf_out) { // it assumes klen_out is also not NULL + // copy the key data out + const u32 clen = curr->klen < info->kbuf_size ? curr->klen : info->kbuf_size; + memcpy(info->kbuf_out, kv_kptr_c(curr), clen); + // copy the klen out + *info->klen_out = curr->klen; + } + + // copy the value + if (info->vbuf_out) { // it assumes vlen_out is also not NULL + // copy the value data out + const u32 clen = curr->vlen < info->vbuf_size ? 
curr->vlen : info->vbuf_size; + memcpy(info->vbuf_out, kv_vptr_c(curr), clen); + // copy the vlen out + *info->vlen_out = curr->vlen; + } + } +} + +// seek is similar to get + bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct wh_iter_inp_info info = {kbuf_out, vbuf_out, kbuf_size, vbuf_size, klen_out, vlen_out}; + return wh_api->iter_inp(iter, inp_copy_kv_cb, &info); +} + + void +wh_iter_skip1(struct wormhole_iter * const iter) +{ + wh_api->iter_skip1(iter); +} + + void +wh_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + wh_api->iter_skip(iter, nr); +} + + bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + return wh_api->iter_inp(iter, uf, priv); +} + + void +wh_iter_park(struct wormhole_iter * const iter) +{ + wh_api->iter_park(iter); +} + + void +wh_iter_destroy(struct wormhole_iter * const iter) +{ + wh_api->iter_destroy(iter); +} +// }}} wh + +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/wh.h b/run/MassTrie-beta/wormhole/wh.h new file mode 100644 index 00000000..bd17b38d --- /dev/null +++ b/run/MassTrie-beta/wormhole/wh.h @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +struct wormhole; +struct wormref; + +// wormhole {{{ +// the wh created by wormhole_create() can work with all of safe/unsafe operations. + extern struct wormhole * +wormhole_create(const struct kvmap_mm * const mm); + +// the wh created by whunsafe_create() can only work with the unsafe operations. 
+ extern struct wormhole * +whunsafe_create(const struct kvmap_mm * const mm); + + extern struct kv * +wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +wormhole_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +wormhole_put(struct wormref * const ref, struct kv * const kv); + + extern bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref); + + extern void +wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern bool +wormhole_iter_valid(struct wormhole_iter * const iter); + + extern struct kv * +wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref); + + extern bool +wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref); + + extern void +wormhole_iter_skip1(struct wormhole_iter * const iter); + + extern void +wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wormhole_iter_park(struct wormhole_iter * const iter); + + extern void 
+wormhole_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +wormhole_ref(struct wormhole * const map); + + extern struct wormhole * +wormhole_unref(struct wormref * const ref); + + extern void +wormhole_park(struct wormref * const ref); + + extern void +wormhole_resume(struct wormref * const ref); + + extern void +wormhole_refresh_qstate(struct wormref * const ref); + +// clean with more threads + extern void +wormhole_clean_th(struct wormhole * const map, const u32 nr_threads); + + extern void +wormhole_clean(struct wormhole * const map); + + extern void +wormhole_destroy(struct wormhole * const map); + +// safe API (no need to refresh qstate) + + extern struct kv * +whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +whsafe_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +whsafe_put(struct wormref * const ref, struct kv * const kv); + + extern bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + +// use wormhole_iter_create + extern void +whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern struct kv * +whsafe_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + +// use wormhole_iter_valid +// use wormhole_iter_peek +// use wormhole_iter_kref +// use wormhole_iter_kvref +// use wormhole_iter_skip1 +// use wormhole_iter_skip +// use wormhole_iter_next +// use 
wormhole_iter_inp + + extern void +whsafe_iter_park(struct wormhole_iter * const iter); + + extern void +whsafe_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +whsafe_ref(struct wormhole * const map); + +// use wormhole_unref + +// unsafe API + + extern struct kv * +whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out); + + extern bool +whunsafe_probe(struct wormhole * const map, const struct kref * const key); + + extern bool +whunsafe_put(struct wormhole * const map, struct kv * const kv); + + extern bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whunsafe_del(struct wormhole * const map, const struct kref * const key); + + extern u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map); + + extern void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + +// unsafe iter_valid: use wormhole_iter_valid +// unsafe iter_peek: use wormhole_iter_peek +// unsafe iter_kref: use wormhole_iter_kref + + extern void +whunsafe_iter_skip1(struct wormhole_iter * const iter); + + extern void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out); + +// unsafe iter_inp: use wormhole_iter_inp + + extern void +whunsafe_iter_destroy(struct wormhole_iter * const iter); + + extern void +wormhole_fprint(struct wormhole * const map, FILE * const out); + +extern const struct kvmap_api kvmap_api_wormhole; +extern const struct kvmap_api kvmap_api_whsafe; +extern const struct kvmap_api kvmap_api_whunsafe; +// }}} 
wormhole + +// wh {{{ + extern struct wormhole * +wh_create(void); + + extern struct wormref * +wh_ref(struct wormhole * const wh); + + extern void +wh_unref(struct wormref * const ref); + + extern void +wh_park(struct wormref * const ref); + + extern void +wh_resume(struct wormref * const ref); + + extern void +wh_clean(struct wormhole * const map); + + extern void +wh_destroy(struct wormhole * const map); + + extern bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen); + + extern bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv); + + extern u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end); + + extern struct wormhole_iter * +wh_iter_create(struct wormref * const ref); + + extern void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen); + + extern bool +wh_iter_valid(struct wormhole_iter * const iter); + + extern bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern void +wh_iter_skip1(struct wormhole_iter * const iter); + + extern void 
+wh_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wh_iter_park(struct wormhole_iter * const iter); + + extern void +wh_iter_destroy(struct wormhole_iter * const iter); +// }}} wh + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/wh.py b/run/MassTrie-beta/wormhole/wh.py new file mode 100644 index 00000000..e744cec8 --- /dev/null +++ b/run/MassTrie-beta/wormhole/wh.py @@ -0,0 +1,192 @@ +#!/usr/bin/python3 + +# +# Copyright (c) 2016--2021 Wu, Xingbo +# +# All rights reserved. No warranty, explicit or implicit, provided. +# + +import msgpack +from ctypes import * # CDLL and c_xxx types + +# libwh {{{ +# Change this path when necessary +libwh = CDLL("./libwh.so") + +# create +libwh.wh_create.argtypes = [] +libwh.wh_create.restype = c_void_p + +# close (no return value) +libwh.wh_destroy.argtypes = [c_void_p] + +# ref +libwh.wh_ref.argtypes = [c_void_p] +libwh.wh_ref.restype = c_void_p + +# unref +libwh.wh_unref.argtypes = [c_void_p] + +# put +libwh.wh_put.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint] +libwh.wh_put.restype = c_bool + +# get +libwh.wh_get.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint, c_void_p] +libwh.wh_get.restype = c_bool + +# probe +libwh.wh_probe.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_probe.restype = c_bool + +# del +libwh.wh_del.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_del.restype = c_bool + +# iter_create +libwh.wh_iter_create.argtypes = [c_void_p] +libwh.wh_iter_create.restype = c_void_p + +# iter_seek +libwh.wh_iter_seek.argtypes = [c_void_p, c_char_p, c_uint] + +# iter_valid +libwh.wh_iter_valid.argtypes = [c_void_p] +libwh.wh_iter_valid.restype = c_bool + +# iter_skip1 +libwh.wh_iter_skip1.argtypes = [c_void_p] + +# iter_skip +libwh.wh_iter_skip.argtypes = [c_void_p, c_uint] + +# iter_peek +libwh.wh_iter_peek.argtypes = [c_void_p, 
c_char_p, c_uint, c_void_p, c_char_p, c_uint, c_void_p] +libwh.wh_iter_peek.restype = c_bool + +# iter_park +libwh.wh_iter_park.argtypes = [c_void_p] + +# iter_destroy +libwh.wh_iter_destroy.argtypes = [c_void_p] +# }}} libwh + +# class {{{ +class Wh: + def __init__(self, maxklen=256, maxvlen=8192): + self.whptr = libwh.wh_create() + self.kbufsz = maxklen + self.vbufsz = maxvlen + + # user must call explicitly + def destroy(self): + libwh.wh_destroy(self.whptr) + + def ref(self): + return WhRef(self.whptr, self.kbufsz, self.vbufsz) + +class WhRef: + def __init__(self, whptr, kbufsz, vbufsz): + self.refptr = libwh.wh_ref(whptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.vbuf = create_string_buffer(self.vbufsz) + + # user must call explicitly + def unref(self): + libwh.wh_unref(self.refptr) + + def iter(self): + return WhIter(self.refptr, self.kbufsz, self.vbufsz) + + # key: python string; value: any (hierarchical) python object + def put(self, key, value): + binkey = key.encode() + binvalue = msgpack.packb(value) + return libwh.wh_put(self.refptr, binkey, c_uint(len(binkey)), binvalue, c_uint(len(binvalue))) + + # return the value as a python object + def get(self, key): + binkey = key.encode() + vlen = c_uint() + ret = libwh.wh_get(self.refptr, binkey, len(binkey), self.vbuf, self.vbufsz, byref(vlen)) + if ret and vlen.value <= self.vbufsz: + return msgpack.unpackb(self.vbuf.value) + else: + return None + + def delete(self, key): + binkey = key.encode() + return libwh.wh_del(self.refptr, binkey, c_uint(len(binkey))) + + def probe(self, key): + binkey = key.encode() + return libwh.wh_probe(self.refptr, binkey, c_uint(len(binkey))) + +class WhIter: + def __init__(self, refptr, kbufsz, vbufsz): + self.iptr = libwh.wh_iter_create(refptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.kbuf = create_string_buffer(kbufsz) + self.vbuf = create_string_buffer(vbufsz) + + # user must call explicitly + def destroy(self): + libwh.wh_iter_destroy(self.iptr) + + 
def seek(self, key): + if key is None: + libwh.wh_iter_seek(self.iptr, None, c_uint(0)) + else: + binkey = key.encode() + libwh.wh_iter_seek(self.iptr, binkey, c_uint(len(binkey))) + + def valid(self): + return libwh.wh_iter_valid(self.iptr) + + def skip1(self): + libwh.wh_iter_skip1(self.iptr) + + def skip(self, nr): + libwh.wh_iter_skip(self.iptr, c_uint(nr)) + + # return (key, value) pair or None + def peek(self): + klen = c_uint() + vlen = c_uint() + ret = libwh.wh_iter_peek(self.iptr, self.kbuf, self.kbufsz, byref(klen), self.vbuf, self.vbufsz, byref(vlen)) + if ret and klen.value <= self.kbufsz and vlen.value <= self.vbufsz: + self.kbuf[klen.value] = b'\x00' + return (self.kbuf.value.decode(), klen.value, msgpack.unpackb(self.vbuf.value), vlen.value) + else: + return None + +# }}} class + +# examples +wh1 = Wh(32, 1024) +ref1 = wh1.ref() # take a ref for kv operations + +ref1.put("Hello", "pywh") +ref1.put("key1", "value1") +ref1.put("key2", "value2") +ref1.put("key3", {"xxx":"valuex", "yyy":"valuey"}) +ref1.delete("key2") + +rget = ref1.get("Hello") +print(rget) + +# don't use ref when iterating +iter1 = ref1.iter() +iter1.seek(None) +while iter1.valid(): + r = iter1.peek() + print(r) + iter1.skip1() + +iter1.destroy() # must destroy all iters before unref +ref1.unref() # must unref all refs before close() +wh1.destroy() + +# vim:fdm=marker diff --git a/run/MassTrie-beta/wormhole/wh.strip b/run/MassTrie-beta/wormhole/wh.strip new file mode 100644 index 00000000..e7b3971f --- /dev/null +++ b/run/MassTrie-beta/wormhole/wh.strip @@ -0,0 +1,161 @@ +-K key_size +-K key_size_align +-K kref_compare +-K kref_kv_compare +-K kref_kv_match +-K kref_lcp +-K kref_match +-K kref_null +-K kref_ref_hash32 +-K kref_ref_kv +-K kref_ref_kv_hash32 +-K kref_ref_raw +-K kref_update_hash32 +-K kv_compare +-K kv_compare_ptrs +-K kv_crc32c +-K kv_crc32c_extend +-K kv_create +-K kv_create_kref +-K kv_create_str +-K kv_create_str_str +-K kv_dup +-K kv_dup2 +-K kv_dup2_key +-K 
kv_dup2_key_prefix +-K kv_dup_key +-K kv_key_lcp +-K kv_kptr +-K kv_kptr_c +-K kv_kref +-K kvmap_api_whsafe +-K kvmap_api_whunsafe +-K kvmap_api_wormhole +-K kvmap_dump_keys +-K kvmap_inp_steal_kv +-K kvmap_kv_del +-K kvmap_kv_delr +-K kvmap_kv_get +-K kvmap_kv_inpr +-K kvmap_kv_inpw +-K kvmap_kv_iter_seek +-K kvmap_kv_merge +-K kvmap_kv_probe +-K kvmap_kv_put +-K kvmap_mm_dup +-K kvmap_mm_free_free +-K kvmap_mm_free_noop +-K kvmap_mm_in_dup +-K kvmap_mm_in_noop +-K kvmap_mm_ndf +-K kvmap_mm_out_dup +-K kvmap_mm_out_noop +-K kvmap_raw_del +-K kvmap_raw_get +-K kvmap_raw_inpr +-K kvmap_raw_inpw +-K kvmap_raw_iter_seek +-K kvmap_raw_probe +-K kvmap_ref +-K kvmap_unref +-K kv_match +-K kv_match_full +-K kv_null +-K kv_print +-K kv_qsort +-K kvref_dup2_key +-K kvref_dup2_kv +-K kv_refill +-K kv_refill_hex32 +-K kv_refill_hex64 +-K kv_refill_hex64_klen +-K kv_refill_kref +-K kv_refill_kref_v +-K kv_refill_str +-K kv_refill_str_str +-K kv_refill_u64 +-K kv_refill_value +-K kvref_kv_compare +-K kvref_ref_kv +-K kv_size +-K kv_size_align +-K kv_update_hash +-K kv_vptr +-K kv_vptr_c +-K wh_clean +-K wh_create +-K wh_del +-K wh_delr +-K wh_destroy +-K wh_get +-K wh_inpr +-K wh_inpw +-K wh_iter_create +-K wh_iter_destroy +-K wh_iter_inp +-K wh_iter_park +-K wh_iter_peek +-K wh_iter_seek +-K wh_iter_skip +-K wh_iter_valid +-K wh_merge +-K wh_park +-K wh_probe +-K wh_ref +-K wh_resume +-K whsafe_del +-K whsafe_delr +-K whsafe_get +-K whsafe_inpr +-K whsafe_inpw +-K whsafe_iter_destroy +-K whsafe_iter_park +-K whsafe_iter_seek +-K whsafe_merge +-K whsafe_probe +-K whsafe_ref +-K whsafe_put +-K wh_put +-K wh_unref +-K whunsafe_create +-K whunsafe_del +-K whunsafe_delr +-K whunsafe_get +-K whunsafe_inp +-K whunsafe_iter_create +-K whunsafe_iter_destroy +-K whunsafe_iter_next +-K whunsafe_iter_seek +-K whunsafe_iter_skip +-K whunsafe_merge +-K whunsafe_probe +-K whunsafe_put +-K wormhole_clean +-K wormhole_create +-K wormhole_del +-K wormhole_delr +-K wormhole_destroy +-K 
wormhole_fprint +-K wormhole_get +-K wormhole_inpr +-K wormhole_inpw +-K wormhole_iter_create +-K wormhole_iter_destroy +-K wormhole_iter_inp +-K wormhole_iter_kref +-K wormhole_iter_kvref +-K wormhole_iter_next +-K wormhole_iter_park +-K wormhole_iter_peek +-K wormhole_iter_seek +-K wormhole_iter_skip +-K wormhole_iter_valid +-K wormhole_kvmap_api_create +-K wormhole_merge +-K wormhole_park +-K wormhole_probe +-K wormhole_ref +-K wormhole_refresh_qstate +-K wormhole_resume +-K wormhole_put +-K wormhole_unref diff --git a/test/MassTrie-beta/MassTrie.hh b/test/MassTrie-beta/MassTrie.hh new file mode 100644 index 00000000..53cfd776 --- /dev/null +++ b/test/MassTrie-beta/MassTrie.hh @@ -0,0 +1,318 @@ +#include + +#include + +#include + +#include + +#include + +#include + +#include "wormhole/lib.h" + +#include "wormhole/kv.h" + +#include "wormhole/wh.h" + +#define NUM_THREADS 64 + +#define MAX_SIZE 64 + +using namespace std; + +//~~~~~~~~~CLASS MASSTRIE~~~~~~~~~~~~~~ + +class MassTrie +{ + +public: + // constructor + + MassTrie() + { + + // creating wh wormhole mapping key to internal_elem (as uintptr_t) + + wh = wh_create(); + + ref = wh_ref(this->wh); + + iter = wh_iter_create(this->ref); + + this->kbuf_out = (void *)malloc(sizeof(char) * MAX_SIZE); + + this->vbuf_out = (void *)malloc(sizeof(char) * MAX_SIZE); + + r = false; + } + + // destructor + + ~MassTrie() + { + + wh_iter_destroy(this->iter); + + wh_unref(this->ref); + + wh_clean(this->wh); + + wh_destroy(this->wh); + + free(kbuf_out); + + free(vbuf_out); + } + + //~~~~~~~~~MASSTRIE FUNCTIONS~~~~~~~~~~~~~~ + + // put function - putting a uintptr_t which is the internal_elem + + bool put(const void *key, int klen, const void *value, int vlen) + { + + return (wh_put(this->ref, key, klen, value, vlen)); + } + + // get function + + void *get(struct wormref *const ref, const void *key, int klen) + { + + // variables + + // bool r; + + u32 vlen_out = 0; + + // get action performed + + r = wh_get(ref, key, klen, 
vbuf_out, sizeof(vbuf_out), &vlen_out); + + return r ? vbuf_out : nullptr; + } + + // delete function + + bool del(const void *key, int klen) + { + + return (wh_del(this->ref, key, klen)); + } + + // probe function - returns true if key exists, false otherwise + + bool probe(const void *key, int klen) + { + + r = (wh_probe(this->ref, key, klen)); + + return r; + } + + // finds the closest pointer currently in the MassTrie + + // to a pointer passed as a parameter + + void *find_closest(const void *key) + { + + // variables + + u32 klen_out = 0; + + u32 vlen_out = 0; + + // bool r; + + int min = INT_MAX; + + int curr; + + void *res = NULL; + + // search loop + + wh_iter_seek(this->iter, NULL, 0); // seek to the head + + // printf("wh_iter_seek closest pointer to key\"\"\n"); + + while (wh_iter_valid(this->iter)) + { + + r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) + { + + // calculate disatnce + + curr = abs((long)(reinterpret_cast(kbuf_out)) - (long)(reinterpret_cast(key))); + + if (curr < min) + { + + // perform malloc + + if (!res) + + res = (void *)malloc(sizeof(char) * MAX_SIZE); + + // error handling + + if (res == NULL) + { + + printf("Error! memory not allocated."); + + exit(1); + } + + min = curr; + + // cout<<"curr = "<iter); + + memset(kbuf_out, 0, sizeof(kbuf_out)); + + memset(vbuf_out, 0, sizeof(vbuf_out)); + } + + return (res != NULL) ? 
res : nullptr; + } + + // deletes all from MassTrie + + void delete_all() + { + + // variables + + u32 klen_out = 0; + + u32 vlen_out = 0; + + // bool + + // search loop + + wh_iter_seek(this->iter, NULL, 0); // seek to the head + + // printf("wh_iter_seek closest pointer to key\"\"\n"); + + while (wh_iter_valid(this->iter)) + { + + r = wh_iter_peek(this->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) + { + + // delete key + + this->del(kbuf_out, sizeof(kbuf_out)); + } + + else + { + + printf("ERROR!\n"); + } + + wh_iter_skip1(this->iter); + + memset(kbuf_out, 0, sizeof(kbuf_out)); + + memset(vbuf_out, 0, sizeof(vbuf_out)); + } + } + + // data members + + struct wormhole *wh; + + struct wormref *ref; + + struct wormhole_iter *iter; + + void *kbuf_out; + + void *vbuf_out; + + bool r; + +}; // class MassTrie + +/** + +//override the << operation + + + +ostream& operator<<(ostream &os, MassTrie* m){ + + + +u32 klen_out = 0; + + char kbuf_out[MAX_SIZE] = {}; + + u32 vlen_out = 0; + + char vbuf_out[MAX_SIZE] = {}; + + bool r; + + + + wh_iter_seek(m->iter, NULL, 0); // seek to the head + + printf("wh_iter_seek \"\"\n"); + + while (wh_iter_valid(m->iter)) { + + r = wh_iter_peek(m->iter, kbuf_out, MAX_SIZE, &klen_out, vbuf_out, MAX_SIZE, &vlen_out); + + if (r) { + + os << "wh_iter_peek: key = "<(kbuf_out)<<" , klen = "<< klen_out<<" , "<< + + " value= "<(vbuf_out) << ", vlen= "<< vlen_out<iter); + + + + memset(kbuf_out,0,sizeof(kbuf_out)); + + memset(vbuf_out,0,sizeof(vbuf_out)); + + } + + return os; + +} + + + +**/ diff --git a/test/MassTrie-beta/wormhole/LICENSE b/test/MassTrie-beta/wormhole/LICENSE new file mode 100644 index 00000000..f288702d --- /dev/null +++ b/test/MassTrie-beta/wormhole/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/test/MassTrie-beta/wormhole/Makefile b/test/MassTrie-beta/wormhole/Makefile new file mode 100644 index 00000000..f00e6b59 --- /dev/null +++ b/test/MassTrie-beta/wormhole/Makefile @@ -0,0 +1,45 @@ +# Makefile +# rules (always with .out) +# SRC-X.out += abc # extra source: abc.c +# MOD-X.out += abc # extra module: abc.c abc.h +# ASM-X.out += abc # extra assembly: abc.S +# DEP-X.out += abc # extra dependency: abc +# FLG-X.out += -finline # extra flags +# LIB-X.out += abc # extra -labc options + +# X.out : xyz.h xyz.c # for extra dependences that are to be compiled/linked. + +# X => X.out +TARGETS += easydemo concbench stresstest +# X => X.c only +SOURCES += +# X => X.S only +ASSMBLY += +# X => X.c X.h +MODULES += lib kv wh +# X => X.h +HEADERS += ctypes + +FLG += +LIB += m + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),FreeBSD) +LIB += execinfo +endif + +# when $ make FORKER_PAPI=y +ifeq ($(strip $(FORKER_PAPI)),y) +LIB += papi +FLG += -DFORKER_PAPI +endif + +bin : libwh.so +libwh.so : Makefile Makefile.common lib.c lib.h kv.c kv.h wh.c wh.h wh.strip + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) -shared -fPIC) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ lib.c kv.c wh.c $(ALLLIB) + strip --strip-all --discard-all @wh.strip $@ + + +include Makefile.common diff --git a/test/MassTrie-beta/wormhole/Makefile.common b/test/MassTrie-beta/wormhole/Makefile.common new file mode 100644 index 00000000..ecd761e7 --- /dev/null +++ b/test/MassTrie-beta/wormhole/Makefile.common @@ -0,0 +1,216 @@ +#usage: include Makefile.common at the end of your Makefile + +# no builtin rules/vars (CC, CXX, etc. 
are still defined but will be empty) +MAKEFLAGS += -r -R + +HDR = $(addsuffix .h,$(MODULES) $(HEADERS)) +SRC = $(addsuffix .c,$(MODULES) $(SOURCES)) +ASM = $(addsuffix .S,$(ASSMBLY)) +OBJ = $(addsuffix .o,$(MODULES) $(SOURCES) $(ASSEMBLY)) +DEP = Makefile.common Makefile $(HDR) $(EXTERNDEP) $(EXTERNSRC) +BIN = $(addsuffix .out,$(TARGETS)) +DIS = $(addsuffix .dis,$(TARGETS)) + +# clang: +# EXTRA="-Rpass=loop-vectorize" # IDs loops that were successfully V-ed +# EXTRA="-Rpass-missed=loop-vectorize" # IDs loops that failed V +# EXTRA="-Rpass-analysis=loop-vectorize" # IDs the statements that caused V to fail +# EXTRA="-Rpass=\ *" # remarks for all passes +# other passes: https://llvm.org/docs/Passes.html + +O ?= rg + +# predefined OPT: make O={rg,r,0g,3g,p,0s,3s,cov,mc,hc,wn,stk} +ifeq ($O,rg) # make O=rg +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector +else ifeq ($O,r) # make O=r (for release) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector +else ifeq ($O,ns) # make O=ns (no signal handlers) +OPT ?= -DNDEBUG -O3 -flto -fno-stack-protector -DNOSIGNAL +else ifeq ($O,0g) # make O=0g +OPT ?= -g3 -O0 -fno-inline +else ifeq ($O,2g) # make O=2g +OPT ?= -g3 -O2 +else ifeq ($O,3g) # make O=3g +OPT ?= -g3 -O3 -flto -fno-inline +else ifeq ($O,p) # make O=p (profiling: rg+noinline) +OPT ?= -DNDEBUG -g3 -O3 -flto -fno-stack-protector -fno-inline +else ifeq ($O,0s) # make O=0s (address sanitizer) +OPT ?= -g3 -O0 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,3s) # make O=3s (address sanitizer) +OPT ?= -g3 -O3 -fno-inline -fsanitize=address -fno-omit-frame-pointer -fno-optimize-sibling-calls -DHEAPCHECKING +else ifeq ($O,t) # make O=0t (thread sanitizer) +OPT ?= -g3 -O1 -fno-inline -fsanitize=thread -fno-stack-protector +else ifeq ($O,cov) # make O=cov (for gcov) +OPT ?= -g3 -DNDEBUG -O0 --coverage +CCC = gcc +else ifeq ($O,mc) # make O=mc (for valgrind memcheck) +OPT ?= -g3 -O1 -fno-inline -DHEAPCHECKING +ARCH 
?= broadwell +else ifeq ($O,hc) # make O=hc (for gperftools heapcheck) +OPT ?= -g3 -O1 -fno-inline +LIB += tcmalloc +else ifeq ($O,wn) # more warning +OPT ?= -g3 -O3 -Wvla -Wformat=2 -Wconversion -Wstrict-prototypes -Wmissing-prototypes +else ifeq ($O,stk) # check stack usage with gcc +OPT ?= -g3 -O3 -DNDEBUG -fstack-usage +CCC = gcc +endif + +# malloc: g:glibc, t:tcmalloc, j:jemalloc +M ?= g + +ifeq ($M,t) + LIB += tcmalloc + FLG += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free +else ifeq ($M,j) + LIB += jemalloc +endif + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CHECK_S := -D__linux__ + LIB += rt +else ifeq ($(UNAME_S),FreeBSD) + CHECK_S := -D__FreeBSD__ + FLG += -I/usr/local/include -L/usr/local/lib + LIB += rt + LIB += execinfo + TPUT := /usr/local/bin/tput +else ifeq ($(UNAME_S),Darwin) + CHECK_S := -D__APPLE__ -D__MACH__ + # do nothing +else + $(error "Supported Platforms: Linux, FreeBSD, Darwin") +endif +TPUT ?= tput + +CCC ?= clang +CSTD = -std=gnu18 +XCC ?= clang++ +XSTD = -std=gnu++17 + +UNAME_M := $(shell uname -m) +ifeq ($(UNAME_M),aarch64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),arm64) # "native" does not work for clang@aarch64 + CHECK_M := -D__aarch64__ + ARCH ?= armv8-a+crc +else ifeq ($(UNAME_M),x86_64) + CHECK_M := -D__x86_64__ + ARCH ?= native +else ifeq ($(UNAME_M),amd64) # freebsd + CHECK_M := -D__x86_64__ + ARCH ?= native +else + $(error "Supported Platforms: aarch64, x86_64") +endif + +TUNE ?= native + +NBI += memcpy memmove memcmp + +# minimal requirement on x86_64: -march=nehalem +# minimal requirement on aarch64: -march=armv8-a+crc +FLG += -march=$(ARCH) -mtune=$(TUNE) +FLG += -pthread -Wall -Wextra -Wshadow #-Weverything +FLG += $(addprefix -fno-builtin-,$(NBI)) +FLG += $(OPT) + +ifneq ($(shell $(CCC) --version 2>/dev/null | grep clang),) +FLG += -ferror-limit=3 +CCCTYPE := clang +else ifneq ($(shell $(CCC) --version 
2>/dev/null | grep gcc),) +FLG += -fmax-errors=3 +FLG += -Wno-unknown-pragmas +CCCTYPE := gcc +else + $(error "Supported Compilers: clang, gcc") +endif + +ifeq ($(CCCTYPE),clang) + CCINST = /usr/lib/clang/$(shell $(CCC) --version 2>/dev/null | awk '/^clang/ { print $$3 }') + CCINC = $(CCINST)/include +else ifeq ($(CCCTYPE),gcc) + CCINST = /usr/lib/gcc/$(shell $(CCC) -dumpmachine)/$(shell $(CCC) -dumpversion) + CCINC = $(CCINST)/include $(CCINST)/include-fixed +endif +CCINC = /usr/include /usr/local/include + +ifneq ($(shell find $(CCINC) -name backtrace-supported.h 2>/dev/null),) + LIB += backtrace + FLG += -DBACKTRACE +endif + +ifneq ($(shell find $(CCINC) -name liburing.h 2>/dev/null),) + LIB += uring + FLG += -DLIBURING +endif + + +uniq = $(if $1,$(firstword $1) $(call uniq,$(filter-out $(firstword $1),$1))) +magentatxt := $(shell $(TPUT) setaf 5) +greentxt := $(shell $(TPUT) setaf 2) +bluetxt := $(shell $(TPUT) setaf 4) +normaltxt := $(shell $(TPUT) sgr0) + +.PHONY : bin dis def clean cleanx check tags + +bin : $(BIN) +dis : $(DIS) bin +.DEFAULT_GOAL = bin +.SECONDEXPANSION: + +ifeq ($(J),o) +# DANGER. Don't use unless it works! 
+# build from .o files but target-specific flags are missing in %.o : %.x +%.out : %.o $(OBJ) $$(addsuffix .o,$$(SRC-$$@) $$(MOD-$$@) $$(ASM-$$@)) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + $(CCC) $(ALLFLG) -o $@ $^ $(ALLLIB) +# +else # default: all-in-one command +%.out : %.c $(SRC) $(ASM) $(DEP) $$(DEP-$$@) $$(addsuffix .c,$$(SRC-$$@) $$(MOD-$$@)) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) $$(addsuffix .S,$$(ASM-$$@)) + $(eval ALLSRC := $(SRC) $(addsuffix .c,$(SRC-$@) $(MOD-$@)) $(ASM) $(addsuffix .S,$(ASM-$@))) + $(eval UNIQSRC := $(call uniq,$(ALLSRC))) + $(eval ALLFLG := $(CSTD) $(EXTRA) $(FLG) $(FLG-$@) -rdynamic) + $(eval ALLLIB := $(addprefix -l,$(LIB) $(LIB-$@))) + @printf '$(bluetxt)$@$(magentatxt) <= $(greentxt)$< $(UNIQSRC)$(normaltxt)\n' + $(CCC) $(ALLFLG) -o $@ $< $(UNIQSRC) $(ALLLIB) +# +endif + + +%.dis : %.out + objdump -SlwtC $< 1>$@ 2>/dev/null + +%.o : %.cc $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(XCC) $(XSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.o : %.S $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$@) -o $@ -c $< + +%.s : %.c $(DEP) $$(DEP-$$@) $$(addsuffix .h,$$(HDR-$$@) $$(MOD-$$@)) + $(CCC) $(CSTD) $(EXTRA) $(FLG) $(FLG-$*) $(FLG-$*.o) -S -o $@ -c $< + +def : + $(CCC) $(FLG) -dM -E - = "h"; the iter will be placed on "hello" + r = wh_iter_valid(iter); // r == true; You should always check if iter is valid after a seek() and skip() + r = wh_iter_peek(iter, buf, 6, &len_out, NULL, 0, NULL); // only need the key: will get "hello" and 5 + r = wh_iter_peek(iter, NULL, 0, NULL, buf, 6, &len_out); // only need the value: will get "world!" 
and 6 + // (you can also get both key and value using one call with two buffers) + wh_iter_skip1(iter); // skip the current key; equivalent to wh_iter_skip(iter, 1); + r = wh_iter_valid(iter); // r == false; already passed the end of the dataset + wh_iter_park(iter); // an iter may hold locks; It's a good manner to "park" the iter before sleep. + sleep(10); // not interacting with the wormhole instance. + wh_iter_seek(iter, NULL, 0); // need to do another seek to reactivate the iter + r = wh_iter_valid(iter); // r == true; on the zero-sized key now + wh_iter_destroy(iter); // now we're done with the iter + wh_del(ref, "hello", 5); // delete a key + wh_del(ref, NULL, NULL); // delete the zero-sized key + wh_unref(ref); // the current thread is no longer interested in accessing the index + wh_destroy(wh); // fully destroy the index; all references should have been released before calling this +} +``` + +## Integer keys + +Wormhole supports binary keys, which means you don't need to print integers into text when using Wormhole to index integer keys. +Here are some quick examples for using Wormhole as an integer-key index. A little-endian CPU is assumed. + +```C +{ + // 32-bit unsigned integer keys + u32 key = __builtin_bswap32(1000); // reverse byte order of key 1000 + wh_put(ref, &key, 4, NULL, 0); + key = __builtin_bswap32(2000); // reverse byte order of key 2000 +    wh_put(ref, &key, 4, NULL, 0); + struct wormhole_iter * iter = wh_iter_create(ref); + key = __builtin_bswap32(999); + wh_iter_seek(iter, &key, 4); // seek 999 + u32 key_out, len_out; + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 1000 in key_out in reversed byte order + wh_iter_skip1(iter); + r = wh_iter_peek(iter, &key_out, 4, &len_out, NULL, 0, NULL); // see 2000 in key_out in reversed byte order +} +``` + +# Advanced APIs + +If the simple and thread-safe `wh_*` interface already meets your performance requirements, You don't need to read the following sections. 
+Using the `wormhole_*` and `whunsafe_*` APIs can maximize the efficiency of your code with a roughly 5%-10% speedup. +However, inefficient use of these APIs, such as repeatedly calling malloc() to prepare the key buffer, can easily hurt the performance. + +## `struct kv` and `struct kref` + +There are a handful of helper functions (`kv_*` and `kref_*` functions) at the beginning of wh.h. +It's worth noting that the *key's hash* (`hash` of `struct kv` and `hash32` of `struct kref`) +must be up-to-date before passed to wormhole. +The `kv_refill*` helper functions internally update the hash after filling the kv contents. +In a more general case, `kv_update_hash` directly updates a `struct kv`'s hash. +Similarly, `kref_refill_hash32()` calculates the 32-bit hash for `struct kref`. +Performing the hash calculation at the client side can achieve the best efficiency on the server (the index operations). + +## The Wormhole API + +`concbench.c` and `stresstest.c` are examples of how to use a Wormhole index. +There are three sets of Wormhole API: `whsafe`, `wormhole`, and `whunsafe`. +* `whsafe`: The *worry-free* thread-safe API. If you use Wormhole in a concurrent environment and want minimal complexity in your code, you should use `whsafe`. +* `wormhole`: The standard thread-safe API. It offers better efficiency than `whsafe` but requires some extra effort for blocking prevention. +* `whunsafe`: the thread-unsafe API. It offers the best speed and efficiency but does not perform internal concurrency control. +External synchronization should be employed when accessing `whunsafe` in a concurrent environment. + +The functions of each API can be found near the end of `wh.c` (search `kvmap_api_whsafe`, `kvmap_api_wormhole`, and `kvmap_api_whunsafe`). +Note that each API contains a mix of `whsafe_*`, `wormhole_*`, and `whunsafe_*` functions. + +### The `whsafe` API +The `whsafe` API functions are listed in the `kvmap_api_whsafe` structure in `wh.c`. 
The API consists of a mix of `wormhole_*` and `whsafe_*` functions. + +The index operations (GET, SET, UPDATE, DEL, PROBE, INPLACE, MERGE, and SCAN (`wormhole_iter_*` functions)) are all *thread safe*. +A thread needs to hold a reference of the index (_wormref_) to perform safe index operations. + +An example of using point-query operations using the `whsafe` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = whsafe_ref(wh); + for (...) { + whsafe_put(ref, ...); + whsafe_get(ref, ...); + whsafe_del(ref, ...); + ... // other safe operations + } + ... // other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = whsafe_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + whsafe_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + whsafe_iter_park(iter); // Release resources to avoid blocking other threads + // it's now safe to do something such as sleep() or waitpid() + // ... start using the iterator again + whsafe_iter_seek(iter, key2); + // ... other iter operations + whsafe_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### The `wormhole` API +Similar to `whsafe`, `wormhole` is also thread safe. It's often faster than `whsafe` but requires extra caution when using it. + +An example of using point-query operations using the `wormhole` API. + +```C +{ + wh = wormhole_create(NULL); // use NULL here unless you want to change the allocator. + ref = wormhole_ref(wh); + for (...) { + wormhole_put(ref, ...); + wormhole_get(ref, ...); + wormhole_del(ref, ...); + ... // other safe operations + } + ... 
// other safe operations + wormhole_unref(ref); + wormhole_destroy(wh); +} +``` + +An example of range-query operations: + +```C +{ + ref = wormhole_ref(wh); + // ... assume we already have a valid ref + iter = wormhole_iter_create(ref); + for (...) { + wormhole_iter_seek(iter, key); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 1); + wormhole_iter_peek(iter, buf); + wormhole_iter_skip(iter, 3); + wormhole_iter_inp(iter, uf, priv); + // other iter operations + } + // An active iterator is likely holding a lock. + wormhole_iter_park(iter); // Release resources to avoid blocking other threads + while (condition not met) { // See below for explanation + wormhole_refresh_qstate(ref); + } + // ... start using the iterator again + wormhole_iter_seek(iter, key2); + // ... other iter operations + wormhole_iter_destroy(iter); + // ... do something + // must destroy iterators before unref() + wormhole_unref(ref); +} +``` + +### Avoid blocking writers when using the `wormhole` API +Wormhole internally uses QSBR RCU to synchronize readers/writers so every holder of a reference (`ref`) +needs to actively perform index operations. +An ref-holder, if not actively performing index operations, may block a writer thread that is performing split/merge operations. +(because of not periodically announcing its quiescent state). +If a ref-holder is about to become inactive from Wormhole's perspective (doing something else or just sleeping), +it is recommended that the holder temporarily releases the `ref` before entering the inactive status (such as calling `sleep(10)`), +and reactivate the `ref` before performing the next index operation. 
+ +```C +{ + // assume we already have an active ref + wormhole_park(ref); // this will avoid blocking any other threads + sleep(10); + wormhole_resume(ref); // this will reactivate the ref + // continue to perform index operations +} +``` + +A common scenario of dead-locking is acquiring locks with an active wormhole reference, +The following example could cause deadlock between two threads. + +```C +// Thread A has an active ref and try to lock() +{ + struct wormref * ref = wormhole_ref(wh); + lock(just_a_lock); // << block here forever +} + +// Thread B already acquired the lock and wants to insert a key to wh +{ + lock(just_a_lock); + wormhole_put(ref, kv); << block here forever +} +``` + +To avoid this scenario, thread A should either call `wormhole_park(ref)` before acquiring the lock, or keep updating the qstate of the ref: +```C +// Solution A.1: use wormhole_park() +{ + struct wormref * ref = wormhole_ref(wh); + wormhole_park(ref); + lock(just_a_lock); + wormhole_resume(ref); // can use ref afterward +} + +// Solution A.2: use try_lock and wormhole_refresh_qstate() +{ + struct wormref * ref = wormhole_ref(wh); + while (!try_lock(just_a_lock)) { + wormhole_refresh_qstate(ref); + } + // continue to use ref +} +``` + +The above issues with QSBR are specific to the `wormhole` API. `whsafe` does not have these issues. + +### The `whunsafe` API +A set of *thread-unsafe* functions are also provided. See the functions with _prefix_ `whunsafe`. +The thread-unsafe functions don't use the reference (_wormref_). +Simply feed them with the pointer to the wormhole index: + +```C +{ + wh = whunsafe_create(NULL); + for (...) { + whunsafe_put(wh, ...); + whunsafe_get(wh, ...); + whunsafe_del(wh, ...); + ... // other unsafe operations + } + ... // other unsafe operations + wormhole_destroy(wh); +} +``` + +### In-place update with user-defined function +`wormhole_inp` executes a user-defined function on an existing key-value item. 
+If the key does not exist, a NULL pointer will be passed to the user-defined function. +A simple example would be incrementing a counter stored in a key-value pair. + +```C +{ + // user-defined in-place update function + void myadd1(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + (*pvalue)++; + } + } + + // create the counter + u64 zero = 0; + struct kv * tmp = kv_create("counter", 7, &zero, 8); // malloc-ed + wormhole_put(ref, tmp); + + // perform +1 on the stored value + struct kref kref = kv_ref(tmp); // create a kref of tmp + wormhole_inp(ref, &kref, myadd1, NULL); +} +``` + +Note that the user-defined function should ONLY change the value's content, and nothing else. +Otherwise, the index can be corrupted. +A similar mechanism is also provided for iterators (`wormhole_iter_inp`). + +The inplace function can also be used to retrieve key-value data. For example: + +```C +{ + void inplace_getu64(struct kv * kv, void * priv) { + if (kv != NULL) { + assert(kv->vlen >= sizeof(u64)); + u64 * pvalue = kv_vptr(kv); + *(u64 *)priv = *pvalue; + } else { + *(u64 *)priv = 0; + } + } + ... + struct kref kref = ... + u64 val; + wormhole_inp(ref, &kref, inplace_getu64, &val); +} +``` + +### `merge`: atomic Read-Modify-Write +The `wormhole_merge` and `whsafe_merge` functions perform atomic Read-Modify-Write (RMW) operations. +In a RMW operation, if the search key is found, the KV pair will be passed to a user-defined callback function `uf` (short for user function). +Otherwise, a NULL pointer is passed to `uf`. +`uf` could update the KV in-place if it does not require any memory reallocation. +In such a case, `uf` should return the KV's pointer back and the merge function will do nothing else. +If `uf` want to replace the KV with something new, it should return a pointer that is different than the original KV pointer. +The `uf` should not make memory allocation by itself. 
+Instead, the `merge` function will copy the returned KV and replace the existing KV with the newly created one. +`uf` should not return NULL unless the key was not found. + +### Iterator +The `wormhole_iter_{seek,peek,skip,next,inp}` functions provide range-search functionalities. +If the search key does not exist, the `seek` operation will put the cursor on the item that is greater than the search-key. +`next` will return the item under the current cursor and move the cursor forward. +`peek` is similar but does not move the cursor. For example, with keys `{1,3,5}`, `seek(2); r = next()` will see `r == 3`. + +Currently Wormhole does not provide `seek_for_less_equal()` and `prev()` for backward scanning. This feature will be added in the future. + +# Memory management + +By default, Wormhole manages all the key-value data internally and only copies to or from a user-supplied +buffer (a `struct kv` object). +This draws a clear boundary in the memory space between the index structure and its users. +After a call to any of the index operations, the caller can immediately free +the buffer holding the key-reference or the key-value data. +This also allows users to use stack-allocated variables to interact with Wormhole. + +The memory manager of the internal key-value objects can be customized when creating a new Wormhole (see `wormhole_create`). +The customization will _only_ affect the internal `struct kv` objects. +Actually, the memory manager can be configured to directly use the caller's `struct kv` object and store it in Wormhole. +This `struct kvmap_mm` structure shows an example: + +```C +{ + const struct kvmap_mm kvmap_mm_ualloc { + .in = kvmap_mm_in_noop, // in wormhole_put(), store caller's kv in wh + .out = kvmap_mm_out_dup, // but still make a copy in wormhole_get() + .free = kvmap_mm_free_free, // call free() for delete/update + }; + ... + struct wormhole * wh = wormhole_create(&kvmap_mm_ualloc); + struct wormref * ref = wormhole_ref(wh); + ... 
+ struct kv * newkv = malloc(size); + ... + wormhole_put(ref, newkv); + // Don't free newkv! it's now managed by wh +} +``` + +Each of the in/out/free functions can be freely customized. +A few `kvmap_mm_*` functions are already provided for common scenarios. +`kvmap_mm_ndf` is identical to the `kvmap_mm_ualloc` structure in the above example. + +## Hugepages +Wormhole uses hugepages when available. To reserve some hugepages in Linux (10000 * 2MB): + + # echo 10000 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + +# Tuning + +A few macros in `wh.c` can be tuned. + +* `WH_SLABLEAF_SIZE` controls the slab size for leaf node allocation. +The default is `((1lu << 21))` (2MB slabs). If 1GB hugepages are available, `WH_SLABLEAF_SIZE` can be set to `((1lu << 30))` to utilize 1GB hugepages. +Using 1GB hugepages can improve search performance on a large dataset. + +* `WH_KPN` controls "Keys Per (leaf-)Node". The default value is 128. +Compared to the default, `WH_KPN=256` can offer 5-10%+ higher point query and update speed. +However, range queries prefer a smaller node size such as 64. + + +* `QSBR_STATES_NR` and `QSBR_SHARDS_NR` control the capacity (number of active references) of the QSBR RCU. +The product of the two values is the capacity. For efficiency, `QSBR_STATES_NR` can be set to 23, 39, or 55, and `QSBR_SHARDS_NR` must be 2^n, n<=6. +The defaults are 23 and 32, respectively. The QSBR registry can run out of space if there are a few hundred threads, which is not a problem in practice. + +# Limitations + +## Key Patterns +A **split** operation will fail when **129** (`WH_KPN + 1`) keys share a common prefix of 65535+ bytes. +In Wormhole, the maximum _anchor-key_ length is 65535 (2^16) bytes, which is shorter than the maximum key-length (2^32). + +## Memory Allocation +Insertions/updates can fail and return false when a memory allocation fails. +On memory-allocation failure, the hash-table expansion function will block and wait for available memory. 
+ +# Performance +Some benchmarking results with some real-world datasets: See [this](https://github.com/wuxb45/wormhole/issues/5) page for more information. + +![Concurrent GET](https://user-images.githubusercontent.com/564235/112712778-704d7200-8e9f-11eb-9f4d-795de46772d1.png) diff --git a/test/MassTrie-beta/wormhole/README.txt b/test/MassTrie-beta/wormhole/README.txt new file mode 100644 index 00000000..e70108ef --- /dev/null +++ b/test/MassTrie-beta/wormhole/README.txt @@ -0,0 +1,31 @@ +To set up the project: + +If you're not already in the folder 'wormhole', perform: + +1. cd wormhole + +Once you're there, set the variable LD_LIBRARY_PATH to the +current working directory using: + +2. setenv LD_LIBRARY_PATH `pwd` + +You can check (optionally) that this operation was executed properly using: + +3. echo $LD_LIBRARY_PATH + + +Then, do: + +4. cd sto + +5. ./bootstrap.sh + +6. ./configure + +To run the test file do: + +7. make unit-test_MTrie + +Then run it using: + +8. ./unit-test_MTrie diff --git a/test/MassTrie-beta/wormhole/concbench.c b/test/MassTrie-beta/wormhole/concbench.c new file mode 100644 index 00000000..f18abde9 --- /dev/null +++ b/test/MassTrie-beta/wormhole/concbench.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018-2019 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include "lib.h" +#include "kv.h" +#include "wh.h" + +atomic_uint_least64_t __seqno = 0; +u64 __nth = 0; +struct kv ** __samples = NULL; +u64 __nkeys = 0; +atomic_uint_least64_t __tot = 0; +u64 __endtime = 0; + + static void * +kv_load_worker(struct wormhole * const wh) +{ + srandom_u64(time_nsec() * time_nsec()); + struct wormref * const ref = wormhole_ref(wh); + const u64 seq = atomic_fetch_add(&__seqno, 1); + const u64 n0 = __nkeys / __nth * seq; + const u64 nz = (seq == (__nth - 1)) ? 
__nkeys : (__nkeys / __nth * (seq + 1)); + printf("load worker %lu %lu\n", n0, nz); + for (u64 i = n0; i < nz; i++) + wormhole_put(ref, __samples[i]); + wormhole_unref(ref); + return NULL; +} + + static void * +kv_probe_worker(struct wormhole * const wh) +{ + struct wormref * const ref = wormhole_ref(wh); + struct kv * next = __samples[random_u64() % __nkeys]; + u64 rnext = random_u64() % __nkeys; + u64 count = 0; + u64 succ = 0; +#define BATCH ((10000)) + do { + for (u64 i = 0; i < BATCH; i++) { + // reading kv samples leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = __samples[rnext]; + __builtin_prefetch(next, 0, 0); + __builtin_prefetch(((u8 *)next) + 64, 0, 0); + rnext = random_u64() % __nkeys; + __builtin_prefetch(&(__samples[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const struct kref kref = kv_kref(key); + if (wormhole_probe(ref, &kref)) + succ++; + } + count += BATCH; + } while (time_nsec() < __endtime); + if (count != succ) + printf("count %lu success %lu\n", count, succ); + (void)atomic_fetch_add(&__tot, count); + wormhole_unref(ref); + return NULL; +} + + int +main(int argc, char ** argv) +{ + if (argc < 3) { + printf("usage: <#keys> <#threads>\n"); + printf(" Get words.txt: wget https://github.com/dwyl/english-words/raw/master/words.txt\n"); + printf(" Example: %s words.txt 1000000 4\n", argv[0]); + printf(" Better to use only one numa node with numactl -N 0\n"); + printf(" Better to run X thread on X cores\n"); + return 0; + } + + char ** const words = malloc(sizeof(char *) * 1000000); // or `wc -l words.txt` + u64 nr_words = 0; + char * buf = malloc(8192); + size_t bufsize = 8192; + FILE * const fwords = fopen(argv[1], "r"); + if (fwords == NULL) { + printf("open words file failed\n"); + return 0; + } + + // read all words to words + while (getline(&buf, &bufsize, fwords) > 0) { + buf[strlen(buf)-1] = 
'\0'; + words[nr_words] = strdup(buf); + nr_words++; + } + fclose(fwords); + + // generate keys + const u64 nkeys = strtoull(argv[2], NULL, 10); + struct kv ** const samples = malloc(sizeof(struct kv *) * nkeys); + char * ss[6]; + for (u64 i = 0; i < nkeys; i++) { + for (u64 j = 0; j < 6; j++) + ss[j] = words[random() % nr_words]; + sprintf(buf, "%s %s %s %s %s %s!", ss[0], ss[1], ss[2], ss[3], ss[4], ss[5]); + samples[i] = kv_create_str(buf, NULL, 0); + } + // free words & buf + for (u64 i = 0; i < nr_words; i++) + free(words[i]); + free(words); + free(buf); + + // load (4) + __samples = samples; + __nkeys = nkeys; + struct wormhole * const wh = wormhole_create(NULL); + __nth = 4; + const u64 dtl = thread_fork_join(4, (void *)kv_load_worker, false, (void *)wh); + printf("load x4 %.2lf mops\n", ((double)nkeys) * 1e3 / ((double)dtl)); + + const u64 nth = strtoull(argv[3], NULL, 10); + printf("probe with %lu threads. each round takes 3 seconds\n", nth); + for (u64 i = 0; i < 3; i++) { + __tot = 0; + __endtime = time_nsec() + 3e9; // 3 sec + const u64 dt = thread_fork_join(nth, (void *)kv_probe_worker, false, (void *)wh); + const double mops = ((double)__tot) * 1e3 / ((double)dt); + printf("probe x%lu %.2lf mops\n", nth, mops); + sleep(1); + } + + // final clean up for valgrind + for (u64 i = 0; i < nkeys; i++) + free(samples[i]); + free(samples); + wormhole_destroy(wh); + return 0; +} diff --git a/test/MassTrie-beta/wormhole/concbench.out b/test/MassTrie-beta/wormhole/concbench.out new file mode 100644 index 00000000..ee87ca31 Binary files /dev/null and b/test/MassTrie-beta/wormhole/concbench.out differ diff --git a/test/MassTrie-beta/wormhole/ctypes.h b/test/MassTrie-beta/wormhole/ctypes.h new file mode 100644 index 00000000..314ca5dc --- /dev/null +++ b/test/MassTrie-beta/wormhole/ctypes.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +// C types only; C++ source code don't use this + +#include +#include + +/* C11 atomic types */ +typedef atomic_bool abool; + +typedef atomic_uchar au8; +typedef atomic_ushort au16; +typedef atomic_uint au32; +typedef atomic_ulong au64; +static_assert(sizeof(au8) == 1, "sizeof(au8)"); +static_assert(sizeof(au16) == 2, "sizeof(au16)"); +static_assert(sizeof(au32) == 4, "sizeof(au32)"); +static_assert(sizeof(au64) == 8, "sizeof(au64)"); + +typedef atomic_char as8; +typedef atomic_short as16; +typedef atomic_int as32; +typedef atomic_long as64; +static_assert(sizeof(as8) == 1, "sizeof(as8)"); +static_assert(sizeof(as16) == 2, "sizeof(as16)"); +static_assert(sizeof(as32) == 4, "sizeof(as32)"); +static_assert(sizeof(as64) == 8, "sizeof(as64)"); + +// shorten long names +#define MO_RELAXED memory_order_relaxed +#define MO_CONSUME memory_order_consume +#define MO_ACQUIRE memory_order_acquire +#define MO_RELEASE memory_order_release +#define MO_ACQ_REL memory_order_acq_rel +#define MO_SEQ_CST memory_order_seq_cst diff --git a/test/MassTrie-beta/wormhole/easydemo.c b/test/MassTrie-beta/wormhole/easydemo.c new file mode 100644 index 00000000..f095a6ac --- /dev/null +++ b/test/MassTrie-beta/wormhole/easydemo.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#define _GNU_SOURCE +#include + +#include "lib.h" +#include "kv.h" +#include "wh.h" + + int +main(int argc, char ** argv) +{ + (void)argc; + (void)argv; + struct wormhole * const wh = wh_create(); + struct wormref * const ref = wh_ref(wh); + + bool r; + + r = wh_put(ref, "wormhole", 8, "easy", 4); + printf("wh_put wormhole easy %c\n", r?'T':'F'); + + r = wh_put(ref, "time_travel", 11, "impossible", 10); + printf("wh_put time_travel impossible %c\n", r?'T':'F'); + + r = wh_del(ref, "time_travel", 11); + printf("wh_del time_travel %c\n", r?'T':'F'); + + r = wh_probe(ref, "time_travel", 11); + printf("wh_probe time_travel %c\n", r?'T':'F'); + + u32 klen_out = 0; + char kbuf_out[8] = {}; + u32 vlen_out = 0; + char vbuf_out[8] = {}; + r = wh_get(ref, "wormhole", 8, vbuf_out, 8, &vlen_out); + printf("wh_get wormhole %c %u %.*s\n", r?'T':'F', vlen_out, vlen_out, vbuf_out); + + // in a concurrent environment, the kvmap_api_wormhole need park&resume when a thread is about to go idle + // don't need park&resume if you're using the default kvmap_api_whsafe in whwh.c! 
+ wh_park(ref); + usleep(10); + wh_resume(ref); + + // prepare a few keys for range ops + wh_put(ref, "00", 2, "0_value", 7); + wh_put(ref, "11", 2, "1_value", 7); + wh_put(ref, "22", 2, "2_value", 7); + + struct wormhole_iter * const iter = wh_iter_create(ref); + + wh_iter_seek(iter, NULL, 0); // seek to the head + printf("wh_iter_seek \"\"\n"); + while (wh_iter_valid(iter)) { + r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, vbuf_out, 8, &vlen_out); + if (r) { + printf("wh_iter_peek klen=%u key=%.*s vlen=%u value=%.*s\n", + klen_out, klen_out, kbuf_out, vlen_out, vlen_out, vbuf_out); + } else { + printf("ERROR!\n"); + } + wh_iter_skip1(iter); + } + + // call iter_park if you will go idle but want to use the iter later + // don't need to call iter_park if you're actively using iter + wh_iter_park(iter); + usleep(10); + + wh_iter_seek(iter, "0", 1); + printf("wh_iter_seek \"0\"\n"); + // this time we don't want to copy the value + r = wh_iter_peek(iter, kbuf_out, 8, &klen_out, NULL, 0, NULL); + if (r){ + printf("wh_iter_peek klen=%u key=%.*s\n", klen_out, klen_out, kbuf_out); + } else { + printf("ERROR: iter_peek failed\n"); + } + + wh_iter_destroy(iter); + // there must be no active iter when calling unref() + wh_unref(ref); + + // unsafe operations: should have released all references + wh_clean(wh); // just for demonstration + wh_destroy(wh); // destroy also calls clean interally + return 0; +} diff --git a/test/MassTrie-beta/wormhole/easydemo.out b/test/MassTrie-beta/wormhole/easydemo.out new file mode 100644 index 00000000..32521210 Binary files /dev/null and b/test/MassTrie-beta/wormhole/easydemo.out differ diff --git a/test/MassTrie-beta/wormhole/kv.c b/test/MassTrie-beta/wormhole/kv.c new file mode 100644 index 00000000..a1720e88 --- /dev/null +++ b/test/MassTrie-beta/wormhole/kv.c @@ -0,0 +1,1131 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#define _GNU_SOURCE + +// headers {{{ +#include // static_assert +#include +#include "lib.h" +#include "ctypes.h" +#include "kv.h" +// }}} headers + +// crc32c {{{ + inline u32 +kv_crc32c(const void * const ptr, u32 len) +{ + return crc32c_inc((const u8 *)ptr, len, KV_CRC32C_SEED); +} + + inline u64 +kv_crc32c_extend(const u32 lo) +{ + const u64 hi = (u64)(~lo); + return (hi << 32) | ((u64)lo); +} +// }}} crc32c + +// kv {{{ + +// size {{{ + inline size_t +kv_size(const struct kv * const kv) +{ + return sizeof(*kv) + kv->klen + kv->vlen; +} + + inline size_t +kv_size_align(const struct kv * const kv, const u64 align) +{ + debug_assert(align && ((align & (align - 1)) == 0)); + return (sizeof(*kv) + kv->klen + kv->vlen + (align - 1)) & (~(align - 1)); +} + + inline size_t +key_size(const struct kv *const key) +{ + return sizeof(*key) + key->klen; +} + + inline size_t +key_size_align(const struct kv *const key, const u64 align) +{ + debug_assert(align && ((align & (align - 1)) == 0)); + return (sizeof(*key) + key->klen + (align - 1)) & (~(align - 1)); +} +// }}} size + +// construct {{{ + inline void +kv_update_hash(struct kv * const kv) +{ + const u32 lo = kv_crc32c((const void *)kv->kv, kv->klen); + kv->hash = kv_crc32c_extend(lo); +} + + inline void +kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen) +{ + debug_assert((vlen == 0) || value); + memcpy(&(kv->kv[kv->klen]), value, vlen); + kv->vlen = vlen; +} + + inline void +kv_refill(struct kv * const kv, const void * const key, const u32 klen, + const void * const value, const u32 vlen) +{ + debug_assert(kv); + kv->klen = klen; + memcpy(&(kv->kv[0]), key, klen); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_str(struct kv * const kv, const char * const key, + const void * const value, const u32 vlen) +{ + kv_refill(kv, key, (u32)strlen(key), value, vlen); +} + + inline void +kv_refill_str_str(struct kv * const kv, const char * const key, + 
const char * const value) +{ + kv_refill(kv, key, (u32)strlen(key), value, (u32)strlen(value)); +} + +// the u64 key is filled in big-endian byte order for correct ordering + inline void +kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen) +{ + kv->klen = sizeof(u64); + *(u64 *)(kv->kv) = __builtin_bswap64(key); // bswap on little endian + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen) +{ + kv->klen = 8; + strhex_32(kv->kv, hex); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen) +{ + kv->klen = 16; + strhex_64(kv->kv, hex); + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_hex64_klen(struct kv * const kv, const u64 hex, + const u32 klen, const void * const value, const u32 vlen) +{ + strhex_64(kv->kv, hex); + if (klen > 16) { + kv->klen = klen; + memset(kv->kv + 16, '!', klen - 16); + } else { + kv->klen = 16; + } + kv_refill_value(kv, value, vlen); + kv_update_hash(kv); +} + + inline void +kv_refill_kref(struct kv * const kv, const struct kref * const kref) +{ + kv->klen = kref->len; + kv->vlen = 0; + kv->hash = kv_crc32c_extend(kref->hash32); + memmove(kv->kv, kref->ptr, kref->len); +} + + inline void +kv_refill_kref_v(struct kv * const kv, const struct kref * const kref, + const void * const value, const u32 vlen) +{ + kv->klen = kref->len; + kv->vlen = vlen; + kv->hash = kv_crc32c_extend(kref->hash32); + memmove(kv->kv, kref->ptr, kref->len); + memcpy(kv->kv + kv->klen, value, vlen); +} + + inline struct kref +kv_kref(const struct kv * const key) +{ + return (struct kref){.ptr = key->kv, .len = key->klen, .hash32 = key->hashlo}; +} + + inline struct kv * +kv_create(const void * const key, const u32 klen, const void * const value, const u32 
vlen) +{ + struct kv * const kv = malloc(sizeof(*kv) + klen + vlen); + if (kv) + kv_refill(kv, key, klen, value, vlen); + return kv; +} + + inline struct kv * +kv_create_str(const char * const key, const void * const value, const u32 vlen) +{ + return kv_create(key, (u32)strlen(key), value, vlen); +} + + inline struct kv * +kv_create_str_str(const char * const key, const char * const value) +{ + return kv_create(key, (u32)strlen(key), value, (u32)strlen(value)); +} + + inline struct kv * +kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen) +{ + return kv_create(kref->ptr, kref->len, value, vlen); +} + +static struct kv __kv_null = {}; + +__attribute__((constructor)) + static void +kv_null_init(void) +{ + kv_update_hash(&__kv_null); +} + + inline const struct kv * +kv_null(void) +{ + return &__kv_null; +} +// }}} construct + +// dup {{{ + inline struct kv * +kv_dup(const struct kv * const kv) +{ + if (kv == NULL) + return NULL; + + const size_t sz = kv_size(kv); + struct kv * const new = malloc(sz); + if (new) + memcpy(new, kv, sz); + return new; +} + + inline struct kv * +kv_dup_key(const struct kv * const kv) +{ + if (kv == NULL) + return NULL; + + const size_t sz = key_size(kv); + struct kv * const new = malloc(sz); + if (new) { + memcpy(new, kv, sz); + new->vlen = 0; + } + return new; +} + + inline struct kv * +kv_dup2(const struct kv * const from, struct kv * const to) +{ + if (from == NULL) + return NULL; + const size_t sz = kv_size(from); + struct kv * const new = to ? to : malloc(sz); + if (new) + memcpy(new, from, sz); + return new; +} + + inline struct kv * +kv_dup2_key(const struct kv * const from, struct kv * const to) +{ + if (from == NULL) + return NULL; + const size_t sz = key_size(from); + struct kv * const new = to ? 
to : malloc(sz); + if (new) { + memcpy(new, from, sz); + new->vlen = 0; + } + return new; +} + + inline struct kv * +kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen) +{ + if (from == NULL) + return NULL; + debug_assert(plen <= from->klen); + const size_t sz = key_size(from) - from->klen + plen; + struct kv * const new = to ? to : malloc(sz); + if (new) { + new->klen = plen; + memcpy(new->kv, from->kv, plen); + new->vlen = 0; + kv_update_hash(new); + } + return new; +} +// }}} dup + +// compare {{{ + static inline int +klen_compare(const u32 len1, const u32 len2) +{ + if (len1 < len2) + return -1; + else if (len1 > len2) + return 1; + else + return 0; +} + +// compare whether the two keys are identical +// optimistic: do not check hash + inline bool +kv_match(const struct kv * const key1, const struct kv * const key2) +{ + //cpu_prefetch0(((u8 *)key2) + 64); + //return (key1->hash == key2->hash) + // && (key1->klen == key2->klen) + // && (!memcmp(key1->kv, key2->kv, key1->klen)); + return (key1->klen == key2->klen) && (!memcmp(key1->kv, key2->kv, key1->klen)); +} + +// compare whether the two keys are identical +// check hash first +// pessimistic: return false quickly if their hashes mismatch + inline bool +kv_match_hash(const struct kv * const key1, const struct kv * const key2) +{ + return (key1->hash == key2->hash) + && (key1->klen == key2->klen) + && (!memcmp(key1->kv, key2->kv, key1->klen)); +} + + inline bool +kv_match_full(const struct kv * const kv1, const struct kv * const kv2) +{ + return (kv1->kvlen == kv2->kvlen) + && (!memcmp(kv1, kv2, sizeof(*kv1) + kv1->klen + kv1->vlen)); +} + + bool +kv_match_kv128(const struct kv * const sk, const u8 * const kv128) +{ + debug_assert(sk); + debug_assert(kv128); + + u32 klen128 = 0; + u32 vlen128 = 0; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(kv128, &klen128), &vlen128); + (void)vlen128; + return (sk->klen == klen128) && (!memcmp(sk->kv, pdata, klen128)); +} + 
+ inline int +kv_compare(const struct kv * const kv1, const struct kv * const kv2) +{ + const u32 len = kv1->klen < kv2->klen ? kv1->klen : kv2->klen; + const int cmp = memcmp(kv1->kv, kv2->kv, (size_t)len); + return cmp ? cmp : klen_compare(kv1->klen, kv2->klen); +} + +// for qsort and bsearch + static int +kv_compare_ptrs(const void * const p1, const void * const p2) +{ + const struct kv * const * const pp1 = (typeof(pp1))p1; + const struct kv * const * const pp2 = (typeof(pp2))p2; + return kv_compare(*pp1, *pp2); +} + + int +kv_k128_compare(const struct kv * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->klen; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->kv, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + + inline void +kv_qsort(struct kv ** const kvs, const size_t nr) +{ + qsort(kvs, nr, sizeof(kvs[0]), kv_compare_ptrs); +} + +// return the length of longest common prefix of the two keys + inline u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2) +{ + const u32 max = (key1->klen < key2->klen) ? key1->klen : key2->klen; + return memlcp(key1->kv, key2->kv, max); +} + +// return the length of longest common prefix of the two keys with a known lcp0 + inline u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0) +{ + const u32 max = (key1->klen < key2->klen) ? 
key1->klen : key2->klen; + debug_assert(max >= lcp0); + return lcp0 + memlcp(key1->kv+lcp0, key2->kv+lcp0, max-lcp0); +} +// }}} + +// psort {{{ + static inline void +kv_psort_exchange(struct kv ** const kvs, const u64 i, const u64 j) +{ + if (i != j) { + struct kv * const tmp = kvs[i]; + kvs[i] = kvs[j]; + kvs[j] = tmp; + } +} + + static u64 +kv_psort_partition(struct kv ** const kvs, const u64 lo, const u64 hi) +{ + if (lo >= hi) + return lo; + + const u64 p = (lo+hi) >> 1; + kv_psort_exchange(kvs, lo, p); + u64 i = lo; + u64 j = hi + 1; + do { + while (kv_compare(kvs[++i], kvs[lo]) < 0 && i < hi); + while (kv_compare(kvs[--j], kvs[lo]) > 0); + if (i >= j) + break; + kv_psort_exchange(kvs, i, j); + } while (true); + kv_psort_exchange(kvs, lo, j); + return j; +} + + static void +kv_psort_rec(struct kv ** const kvs, const u64 lo, const u64 hi, const u64 tlo, const u64 thi) +{ + if (lo >= hi) + return; + const u64 c = kv_psort_partition(kvs, lo, hi); + + if (c > tlo) // go left + kv_psort_rec(kvs, lo, c-1, tlo, thi); + + if (c < thi) // go right + kv_psort_rec(kvs, c+1, hi, tlo, thi); +} + + inline void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi) +{ + debug_assert(tlo <= thi); + debug_assert(thi < nr); + kv_psort_rec(kvs, 0, nr-1, tlo, thi); +} +// }}} psort + +// ptr {{{ + inline void * +kv_vptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[kv->klen])); +} + + inline void * +kv_kptr(struct kv * const kv) +{ + return (void *)(&(kv->kv[0])); +} + + inline const void * +kv_vptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[kv->klen])); +} + + inline const void * +kv_kptr_c(const struct kv * const kv) +{ + return (const void *)(&(kv->kv[0])); +} +// }}} ptr + +// print {{{ +// cmd "KV" K and V can be 's': string, 'x': hex, 'd': dec, or else for not printing. 
+// n for newline after kv + void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out) +{ + debug_assert(cmd); + const u32 klen = kv->klen; + fprintf(out, "#%016lx k[%3u]", kv->hash, klen); + + switch(cmd[0]) { + case 's': fprintf(out, " %.*s", klen, kv->kv); break; + case 'x': str_print_hex(out, kv->kv, klen); break; + case 'd': str_print_dec(out, kv->kv, klen); break; + default: break; + } + + const u32 vlen = kv->vlen; + switch (cmd[1]) { + case 's': fprintf(out, " v[%4u] %.*s", vlen, vlen, kv->kv+klen); break; + case 'x': fprintf(out, " v[%4u]", vlen); str_print_hex(out, kv->kv+klen, vlen); break; + case 'd': fprintf(out, " v[%4u]", vlen); str_print_dec(out, kv->kv+klen, vlen); break; + default: break; + } + if (strchr(cmd, 'n')) + fprintf(out, "\n"); +} +// }}} print + +// mm {{{ + struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv; +} + +// copy-out + struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out) +{ + (void)out; + return kv; +} + + void +kvmap_mm_free_noop(struct kv * const kv, void * const priv) +{ + (void)kv; + (void)priv; +} + +// copy-in + struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv) +{ + (void)priv; + return kv_dup(kv); +} + +// copy-out + struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out) +{ + return kv_dup2(kv, out); +} + + void +kvmap_mm_free_free(struct kv * const kv, void * const priv) +{ + (void)priv; + free(kv); +} + +const struct kvmap_mm kvmap_mm_dup = { + .in = kvmap_mm_in_dup, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +const struct kvmap_mm kvmap_mm_ndf = { + .in = kvmap_mm_in_noop, + .out = kvmap_mm_out_dup, + .free = kvmap_mm_free_free, + .priv = NULL, +}; + +// }}} mm + +// kref {{{ + inline void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = 0; +} + + inline void 
+kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len) +{ + kref->ptr = ptr; + kref->len = len; + kref->hash32 = kv_crc32c(ptr, len); +} + + inline void +kref_update_hash32(struct kref * const kref) +{ + kref->hash32 = kv_crc32c(kref->ptr, kref->len); +} + + inline void +kref_ref_kv(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv->hashlo; +} + + inline void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv) +{ + kref->ptr = kv->kv; + kref->len = kv->klen; + kref->hash32 = kv_crc32c(kv->kv, kv->klen); +} + + inline bool +kref_match(const struct kref * const k1, const struct kref * const k2) +{ + return (k1->len == k2->len) && (!memcmp(k1->ptr, k2->ptr, k1->len)); +} + +// match a kref and a key + inline bool +kref_kv_match(const struct kref * const kref, const struct kv * const k) +{ + return (kref->len == k->klen) && (!memcmp(kref->ptr, k->kv, kref->len)); +} + + inline int +kref_compare(const struct kref * const kref1, const struct kref * const kref2) +{ + const u32 len = kref1->len < kref2->len ? kref1->len : kref2->len; + const int cmp = memcmp(kref1->ptr, kref2->ptr, (size_t)len); + return cmp ? cmp : klen_compare(kref1->len, kref2->len); +} + +// compare a kref and a key + inline int +kref_kv_compare(const struct kref * const kref, const struct kv * const k) +{ + debug_assert(kref); + debug_assert(k); + const u32 len = kref->len < k->klen ? kref->len : k->klen; + const int cmp = memcmp(kref->ptr, k->kv, (size_t)len); + return cmp ? cmp : klen_compare(kref->len, k->klen); +} + + inline u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2) +{ + const u32 max = (k1->len < k2->len) ? k1->len : k2->len; + return memlcp(k1->ptr, k2->ptr, max); +} + + inline u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv) +{ + const u32 max = (kref->len < kv->klen) ? 
kref->len : kv->klen; + return memlcp(kref->ptr, kv->kv, max); +} + +// klen, key, ... + inline int +kref_k128_compare(const struct kref * const sk, const u8 * const k128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(k128, &klen2); + debug_assert(ptr2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +// klen, vlen, key, ... + inline int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128) +{ + debug_assert(sk); + const u32 klen1 = sk->len; + u32 klen2 = 0; + u32 vlen2 = 0; + const u8 * const ptr2 = vi128_decode_u32(vi128_decode_u32(kv128, &klen2), &vlen2); + const u32 len = (klen1 < klen2) ? klen1 : klen2; + const int cmp = memcmp(sk->ptr, ptr2, len); + return cmp ? cmp : klen_compare(klen1, klen2); +} + +static struct kref __kref_null = {.hash32 = KV_CRC32C_SEED}; + + inline const struct kref * +kref_null(void) +{ + return &__kref_null; +} +// }}} kref + +// kvref {{{ + inline void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv) +{ + ref->kptr = kv->kv; + ref->vptr = kv->kv + kv->klen; + ref->hdr = *kv; +} + + struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen + ref->hdr.vlen; + struct kv * const new = to ? to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + memcpy(new->kv + new->klen, ref->vptr, new->vlen); + return new; +} + + struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to) +{ + if (ref == NULL) + return NULL; + const size_t sz = sizeof(*to) + ref->hdr.klen; + struct kv * const new = to ? 
to : malloc(sz); + if (new == NULL) + return NULL; + + *new = ref->hdr; + memcpy(new->kv, ref->kptr, new->klen); + return new; +} + + int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv) +{ + const u32 len = ref->hdr.klen < kv->klen ? ref->hdr.klen : kv->klen; + const int cmp = memcmp(ref->kptr, kv->kv, (size_t)len); + return cmp ? cmp : klen_compare(ref->hdr.klen, kv->klen); +} +// }}} kvref + +// kv128 {{{ +// estimate the encoded size + inline size_t +kv128_estimate_kv(const struct kv * const kv) +{ + return vi128_estimate_u32(kv->klen) + vi128_estimate_u32(kv->vlen) + kv->klen + kv->vlen; +} + +// create a kv128 from kv + u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize) +{ + u8 * const ptr = out ? out : malloc(kv128_estimate_kv(kv)); + if (!ptr) + return NULL; + + u8 * const pdata = vi128_encode_u32(vi128_encode_u32(ptr, kv->klen), kv->vlen); + memcpy(pdata, kv->kv, kv->klen + kv->vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + kv->klen + kv->vlen; + return ptr; // return the head of the encoded kv128 +} + +// dup kv128 to a kv + struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + struct kv * const ret = out ? 
out : malloc(sizeof(struct kv) + klen + vlen); + if (ret) + kv_refill(ret, pdata, klen, pdata + klen, vlen); + + if (pesize) + *pesize = (size_t)(pdata - ptr) + klen + vlen; + return ret; // return the kv +} + + inline size_t +kv128_size(const u8 * const ptr) +{ + u32 klen, vlen; + const u8 * const pdata = vi128_decode_u32(vi128_decode_u32(ptr, &klen), &vlen); + return ((size_t)(pdata - ptr)) + klen + vlen; +} +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// registry {{{ +// increase MAX if need more +#define KVMAP_API_MAX ((32)) +static struct kvmap_api_reg kvmap_api_regs[KVMAP_API_MAX]; +static u64 kvmap_api_regs_nr = 0; + + void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api) +{ + if (kvmap_api_regs_nr < KVMAP_API_MAX) { + kvmap_api_regs[kvmap_api_regs_nr].nargs = nargs; + kvmap_api_regs[kvmap_api_regs_nr].name = name; + kvmap_api_regs[kvmap_api_regs_nr].args_msg = args_msg; + kvmap_api_regs[kvmap_api_regs_nr].create = create; + kvmap_api_regs[kvmap_api_regs_nr].api = api; + kvmap_api_regs_nr++; + } else { + fprintf(stderr, "%s failed to register [%s]\n", __func__, name); + } +} + void +kvmap_api_helper_message(void) +{ + fprintf(stderr, "%s Usage: api ...\n", __func__); + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + fprintf(stderr, "%s example: api %s %s\n", __func__, + kvmap_api_regs[i].name, kvmap_api_regs[i].args_msg); + } +} + + int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out) +{ + // "api" "name" "arg1", ... 
+ if (argc < 2 || strcmp(argv[0], "api") != 0) + return -1; + + for (u64 i = 0; i < kvmap_api_regs_nr; i++) { + const struct kvmap_api_reg * const reg = &kvmap_api_regs[i]; + if (0 != strcmp(argv[1], reg->name)) + continue; + + if ((argc - 2) < reg->nargs) + return -1; + + void * const map = reg->create(argv[1], mm, argv + 2); // skip "api" "name" + if (map) { + *api_out = reg->api; + *map_out = map; + return 2 + reg->nargs; + } else { + return -1; + } + } + + // no match + return -1; +} +// }}} registry + +// misc {{{ + void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv) +{ + // steal the kv pointer out so we don't need a dangerous get_key_interanl() + if (priv) + *(struct kv **)priv = kv; +} + + inline void * +kvmap_ref(const struct kvmap_api * const api, void * const map) +{ + return api->ref ? api->ref(map) : map; +} + +// return the original map pointer; usually unused by caller + inline void * +kvmap_unref(const struct kvmap_api * const api, void * const ref) +{ + return api->unref ? 
api->unref(ref) : ref; +} +// }}} misc + +// kvmap_kv_op {{{ + inline struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out) +{ + const struct kref kref = kv_kref(key); + return api->get(ref, &kref, out); +} + + inline bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv) +{ + return api->put(ref, kv); +} + + inline bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key) +{ + const struct kref kref = kv_kref(key); + return api->del(ref, &kref); +} + + inline bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->inpw(ref, &kref, uf, priv); +} + + inline bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_merge_func uf, void * const priv) +{ + const struct kref kref = kv_kref(key); + return api->merge(ref, &kref, uf, priv); +} + + inline u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end) +{ + const struct kref kref0 = kv_kref(start); + if (end) { + const struct kref krefz = kv_kref(end); + return api->delr(ref, &kref0, &krefz); + } else { + return api->delr(ref, &kref0, NULL); + } +} + + inline void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * 
const key) +{ + const struct kref kref = kv_kref(key); + api->iter_seek(iter, &kref); +} +// }}} kvmap_kv_op + +// kvmap_raw_op {{{ + inline struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->get(ref, &kref, out); +} + + inline bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->probe(ref, &kref); +} + + inline bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->del(ref, &kref); +} + + inline bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpr(ref, &kref, uf, priv); +} + + inline bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? kv_crc32c(ptr, len) : 0}; + return api->inpw(ref, &kref, uf, priv); +} + + inline void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr) +{ + const struct kref kref = {.ptr = ptr, .len = len, + .hash32 = api->hashkey ? 
kv_crc32c(ptr, len) : 0}; + api->iter_seek(iter, &kref); +} +// }}}} kvmap_raw_op + +// dump {{{ + u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd) +{ + void * const ref = kvmap_ref(api, map); + void * const iter = api->iter_create(ref); + api->iter_seek(iter, kref_null()); + u64 i = 0; + while (api->iter_valid(iter)) { + struct kvref kvref; + api->iter_kvref(iter, &kvref); + dprintf(fd, "%010lu [%3u] %.*s [%u]\n", i, kvref.hdr.klen, kvref.hdr.klen, kvref.kptr, kvref.hdr.vlen); + i++; + api->iter_skip1(iter); + } + api->iter_destroy(iter); + kvmap_unref(api, ref); + return i; +} +// }}} dump + +// kv64 {{{ +struct kv64 { // internal only + struct kv kv; + u64 key_be; // must be in big endian + u64 value; +}; + + inline bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out) +{ + struct kv64 keybuf, kvout; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + struct kv * const ret = api->get(ref, &kref, &kvout.kv); + if (ret) { + *out = kvout.value; + return true; + } else { + return false; + } +} + + inline bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->probe(ref, &kref); +} + + inline bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value) +{ + struct kv64 kv; + kv.key_be = __builtin_bswap64(key); + kv.value = value; + kv.kv.klen = sizeof(key); + kv.kv.vlen = sizeof(value); + if (api->hashkey) + kv_update_hash(&kv.kv); + + return api->put(ref, &kv.kv); +} + + inline bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); 
+ kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + return api->del(ref, &kref); +} + + inline void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key) +{ + struct kv64 keybuf; + struct kref kref; + keybuf.key_be = __builtin_bswap64(key); + kref_ref_hash32(&kref, keybuf.kv.kv, sizeof(keybuf.key_be)); + api->iter_seek(iter, &kref); +} + + inline bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const value_out) +{ + struct kv64 kvout; + struct kv * const ret = api->iter_peek(iter, &kvout.kv); + if (key_out) + *key_out = __builtin_bswap64(kvout.key_be); // to LE + if (value_out) + *value_out = kvout.value; + return ret != NULL; +} +// }}} kv64 + +// }}} kvmap + +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/kv.h b/test/MassTrie-beta/wormhole/kv.h new file mode 100644 index 00000000..1e251e58 --- /dev/null +++ b/test/MassTrie-beta/wormhole/kv.h @@ -0,0 +1,554 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// crc32c {{{ +#define KV_CRC32C_SEED ((0xDEADBEEFu)) + + extern u32 +kv_crc32c(const void * const ptr, u32 len); + + extern u64 +kv_crc32c_extend(const u32 crc32c); +// }}} crc32c + +// kv {{{ + +// struct {{{ +/* + * Some internal union names can be ignored: + * struct kv { + * u32 klen; + * u32 vlen; + * u64 hash; + * u8 kv[]; + * }; + */ +struct kv { + union { // the first u64 + u64 kvlen; + struct { + u32 klen; + union { u32 vlen; u32 refcnt; }; + }; + }; + union { + u64 hash; // hashvalue of the key + u64 priv; // can hide a value here if hash is not used + void * privptr; + struct { u32 hashlo; u32 hashhi; }; // little endian + struct { u32 privlo; u32 privhi; }; + }; + u8 kv[0]; // len(kv) == klen + vlen +} __attribute__((packed)); + +struct kref { + u32 len; + union { u32 hash32; u32 priv; }; + const u8 * ptr; +} __attribute__((packed)); + +struct kvref { + const u8 * kptr; // read-only + const u8 * vptr; // read-only + struct kv hdr; // hdr.kv[] is invalid +}; +// }}} struct + +// kv {{{ +typedef int (*kv_kv_cmp_func)(const struct kv *, const struct kv *); + + extern size_t +kv_size(const struct kv * const kv); + + extern size_t +kv_size_align(const struct kv * const kv, const u64 align); + + extern size_t +key_size(const struct kv * const key); + + extern size_t +key_size_align(const struct kv * const key, const u64 align); + + extern void +kv_update_hash(struct kv * const kv); + + extern void +kv_refill_value(struct kv * const kv, const void * const value, const u32 vlen); + + extern void +kv_refill(struct kv * const kv, const void * const key, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_str(struct kv * const kv, const char * const key, + const void * const value, const u32 vlen); + + extern void +kv_refill_str_str(struct kv * const kv, const char * const key, + const char * const value); + +// the u64 key is filled in big-endian byte order + 
extern void +kv_refill_u64(struct kv * const kv, const u64 key, const void * const value, const u32 vlen); + + extern void +kv_refill_hex32(struct kv * const kv, const u32 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64(struct kv * const kv, const u64 hex, const void * const value, const u32 vlen); + + extern void +kv_refill_hex64_klen(struct kv * const kv, const u64 hex, const u32 klen, + const void * const value, const u32 vlen); + + extern void +kv_refill_kref(struct kv * const kv, const struct kref * const kref); + + extern void +kv_refill_kref_v(struct kv * const kv, const struct kref * const kref, + const void * const value, const u32 vlen); + + extern struct kref +kv_kref(const struct kv * const key); + + extern struct kv * +kv_create(const void * const key, const u32 klen, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str(const char * const key, const void * const value, const u32 vlen); + + extern struct kv * +kv_create_str_str(const char * const key, const char * const value); + + extern struct kv * +kv_create_kref(const struct kref * const kref, const void * const value, const u32 vlen); + +// a static kv with klen == 0 + extern const struct kv * +kv_null(void); + + extern struct kv * +kv_dup(const struct kv * const kv); + + extern struct kv * +kv_dup_key(const struct kv * const kv); + + extern struct kv * +kv_dup2(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key(const struct kv * const from, struct kv * const to); + + extern struct kv * +kv_dup2_key_prefix(const struct kv * const from, struct kv * const to, const u32 plen); + + extern bool +kv_match(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_hash(const struct kv * const key1, const struct kv * const key2); + + extern bool +kv_match_full(const struct kv * const kv1, const struct kv * const kv2); + + extern bool +kv_match_kv128(const struct kv * const sk, 
const u8 * const kv128); + + extern int +kv_compare(const struct kv * const kv1, const struct kv * const kv2); + + extern int +kv_k128_compare(const struct kv * const sk, const u8 * const k128); + + extern int +kv_kv128_compare(const struct kv * const sk, const u8 * const kv128); + + extern void +kv_qsort(struct kv ** const kvs, const size_t nr); + + extern u32 +kv_key_lcp(const struct kv * const key1, const struct kv * const key2); + + extern u32 +kv_key_lcp_skip(const struct kv * const key1, const struct kv * const key2, const u32 lcp0); + + extern void +kv_psort(struct kv ** const kvs, const u64 nr, const u64 tlo, const u64 thi); + + extern void * +kv_vptr(struct kv * const kv); + + extern void * +kv_kptr(struct kv * const kv); + + extern const void * +kv_vptr_c(const struct kv * const kv); + + extern const void * +kv_kptr_c(const struct kv * const kv); + + extern void +kv_print(const struct kv * const kv, const char * const cmd, FILE * const out); +// }}} kv + +// mm {{{ +typedef struct kv * (* kvmap_mm_in_func)(struct kv * kv, void * priv); +typedef struct kv * (* kvmap_mm_out_func)(struct kv * kv, struct kv * out); +typedef void (* kvmap_mm_free_func)(struct kv * kv, void * priv); + +// manage internal kv data of kvmap +struct kvmap_mm { + // to create a private copy of "kv" + // see put() functions + kvmap_mm_in_func in; + // to duplicate a private copy of "kv" to "out" + // see get() and iter_peek() functions + kvmap_mm_out_func out; + // to free a kv + // see del() and put() functions + kvmap_mm_free_func free; + void * priv; +}; + + extern struct kv * +kvmap_mm_in_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_noop(struct kv * const kv, struct kv * const out); + + extern void +kvmap_mm_free_noop(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_in_dup(struct kv * const kv, void * const priv); + + extern struct kv * +kvmap_mm_out_dup(struct kv * const kv, struct kv * const out); + + extern void 
+kvmap_mm_free_free(struct kv * const kv, void * const priv); + +// the default mm +extern const struct kvmap_mm kvmap_mm_dup; // in:Dup, out:Dup, free:Free +extern const struct kvmap_mm kvmap_mm_ndf; // in:Noop, out:Dup, free:Free +// }}} mm + +// ref {{{ +typedef int (*kref_kv_cmp_func)(const struct kref *, const struct kv *); + +// ptr and len only + extern void +kref_ref_raw(struct kref * const kref, const u8 * const ptr, const u32 len); + +// this calculates hash32 + extern void +kref_ref_hash32(struct kref * const kref, const u8 * const ptr, const u32 len); + + extern void +kref_update_hash32(struct kref * const kref); + + extern void +kref_ref_kv(struct kref * const kref, const struct kv * const kv); + + extern void +kref_ref_kv_hash32(struct kref * const kref, const struct kv * const kv); + + extern bool +kref_match(const struct kref * const k1, const struct kref * const k2); + + extern bool +kref_kv_match(const struct kref * const kref, const struct kv * const k); + + extern int +kref_compare(const struct kref * const kref1, const struct kref * const kref2); + + extern int +kref_kv_compare(const struct kref * const kref, const struct kv * const k); + + extern u32 +kref_lcp(const struct kref * const k1, const struct kref * const k2); + + extern u32 +kref_kv_lcp(const struct kref * const kref, const struct kv * const kv); + + extern int +kref_k128_compare(const struct kref * const sk, const u8 * const k128); + + extern int +kref_kv128_compare(const struct kref * const sk, const u8 * const kv128); + + extern const struct kref * +kref_null(void); + + extern void +kvref_ref_kv(struct kvref * const ref, struct kv * const kv); + + extern struct kv * +kvref_dup2_kv(struct kvref * const ref, struct kv * const to); + + extern struct kv * +kvref_dup2_key(struct kvref * const ref, struct kv * const to); + + extern int +kvref_kv_compare(const struct kvref * const ref, const struct kv * const kv); +// }}} ref + +// kv128 {{{ + extern size_t +kv128_estimate_kv(const 
struct kv * const kv); + + extern u8 * +kv128_encode_kv(const struct kv * const kv, u8 * const out, size_t * const pesize); + + extern struct kv * +kv128_decode_kv(const u8 * const ptr, struct kv * const out, size_t * const pesize); + + extern size_t +kv128_size(const u8 * const ptr); +// }}} kv128 + +// }}} kv + +// kvmap {{{ + +// kvmap_api {{{ +typedef void (* kv_inp_func)(struct kv * const curr, void * const priv); + +// the merge function should: +// 1: return NULL if the origin kv is not changed at all +// 2: return kv0 if updates has been applied in-place +// 3: return a different kv if the original kv must be replaced +// In an in-memory kvmap, 2==1 and no further action is needed +// In a persistent kv store with a memtable, 2 will need an insertion if kv0 is not from the memtable +typedef struct kv * (* kv_merge_func)(struct kv * const kv0, void * const priv); + +struct kvmap_api { + // feature bits + bool hashkey; // true: caller needs to provide correct hash in kv/kref + bool ordered; // true: has iter_seek + bool threadsafe; // true: support thread_safe access + bool readonly; // true: no put() and del() + bool irefsafe; // true: iter's kref/kvref can be safely accessed after iter_seek/iter_skip/iter_park + bool unique; // provide unique keys, especially for iterators + bool refpark; // ref has park() and resume() + bool async; // XXX for testing KVell + + // put (aka put/upsert): return true on success; false on error + // mm.in() controls how things move into the kvmap; the default mm make a copy with malloc() + // mm.free() controls how old kv get disposed when replaced + bool (* put) (void * const ref, struct kv * const kv); + // get: search and return a kv if found, or NULL if not + // with the default mm: malloc() if out == NULL; otherwise, use out as buffer + // with custom kvmap_mm: mm.out() controls buffer; use with caution + // caller should use the returned ptr even if out is provided + struct kv * (* get) (void * const ref, const struct 
kref * const key, struct kv * const out); + // probe: return true on found, false on not found + bool (* probe) (void * const ref, const struct kref * const key); + // del: return true on something deleted, false on not found + // mm.free() controls how old kv get disposed when replaced + bool (* del) (void * const ref, const struct kref * const key); + // inp: inplace operation if key exists; otherwise return false; uf() is always executed even with NULL key + // inpr/inpw acquires r/w locks respectively. + // Note that in inpw() you can only change the value. + bool (* inpr) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + bool (* inpw) (void * const ref, const struct kref * const key, kv_inp_func uf, void * const priv); + // merge: put+callback on old/new keys; another name: read-modify-write + // return true if successfull; return false on error + bool (* merge) (void * const ref, const struct kref * const key, kv_merge_func uf, void * const priv); + // delete-range: delete all keys from start (inclusive) to end (exclusive) + u64 (* delr) (void * const ref, const struct kref * const start, const struct kref * const end); + // make everything persist; for persistent maps only + void (* sync) (void * const ref); + + // general guidelines for thread-safe iters: + // - it is assumed that the key under the cursor is locked/freezed/immutable + // - once created one must call iter_seek to make it valid + // - the ownership of ref is given to the iter so ref should not be used until iter_destroy + // - creating and use more than one iter based on a ref can cause deadlocks + void * (* iter_create) (void * const ref); + // move the cursor to the first key >= search-key; + void (* iter_seek) (void * const iter, const struct kref * const key); + // check if the cursor points to a valid key + bool (* iter_valid) (void * const iter); + // return the current key; copy to out if (out != NULL) + // mm.out() controls copy-out + struct kv * 
(* iter_peek) (void * const iter, struct kv * const out); + // similar to peek but does not copy; return false if iter is invalid + bool (* iter_kref) (void * const iter, struct kref * const kref); + // similar to iter_kref but also provide the value + bool (* iter_kvref) (void * const iter, struct kvref * const kvref); + // iter_retain makes kref or kvref of the current iter remain valid until released + // the returned opaque pointer should be provided when releasing the hold + u64 (* iter_retain) (void * const iter); + void (* iter_release) (void * const iter, const u64 opaque); + // skip one element + void (* iter_skip1) (void * const iter); + // skip nr elements + void (* iter_skip) (void * const iter, const u32 nr); + // iter_next == iter_peek + iter_skip1 + struct kv * (* iter_next) (void * const iter, struct kv * const out); + // perform inplace opeation if the current key is valid; return false if no current key + // the uf() is always executed even with NULL key + bool (* iter_inp) (void * const iter, kv_inp_func uf, void * const priv); + // invalidate the iter to release any resources or locks + // afterward, must call seek() again before accessing data + void (* iter_park) (void * const iter); + // destroy iter + void (* iter_destroy) (void * const iter); + + // misc: + // create refs for maps if required; always use use kvmap_ref() and kvmap_unref() + // if there are ref/unref functions, ref-ptr should be used as map for all kv operations + void * (* ref) (void * map); + // return the original map + void * (* unref) (void * ref); + // pause access without unref; must call resume later before access index again + void (* park) (void * ref); + // resume access of ref; must be paired with a park() + void (* resume) (void * ref); + + // UNSAFE functions: + // empty the map + void (* clean) (void * map); + // erase everything + void (* destroy) (void * map); + // for debugging + void (* fprint) (void * map, FILE * const out); +}; + +// registry +struct 
kvmap_api_reg { + int nargs; // number of arguments after name + const char * name; + const char * args_msg; // see ...helper_message + // multiple apis may share one create function + // arguments: name (e.g., "rdb"), mm (usually NULL), the remaining args + void * (*create)(const char *, const struct kvmap_mm *, char **); + const struct kvmap_api * api; +}; + +// call this function to register a kvmap_api + extern void +kvmap_api_register(const int nargs, const char * const name, const char * const args_msg, + void * (*create)(const char *, const struct kvmap_mm *, char **), const struct kvmap_api * const api); + + extern void +kvmap_api_helper_message(void); + + extern int +kvmap_api_helper(int argc, char ** const argv, const struct kvmap_mm * const mm, + const struct kvmap_api ** const api_out, void ** const map_out); +// }}} kvmap_api + +// helpers {{{ + extern void +kvmap_inp_steal_kv(struct kv * const kv, void * const priv); + + extern void * +kvmap_ref(const struct kvmap_api * const api, void * const map); + + extern void * +kvmap_unref(const struct kvmap_api * const api, void * const ref); + + extern struct kv * +kvmap_kv_get(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, struct kv * const out); + + extern bool +kvmap_kv_probe(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_put(const struct kvmap_api * const api, void * const ref, + struct kv * const kv); + + extern bool +kvmap_kv_del(const struct kvmap_api * const api, void * const ref, + const struct kv * const key); + + extern bool +kvmap_kv_inpr(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_inpw(const struct kvmap_api * const api, void * const ref, + const struct kv * const key, kv_inp_func uf, void * const priv); + + extern bool +kvmap_kv_merge(const struct kvmap_api * const api, void * const ref, + 
const struct kv * const key, kv_merge_func uf, void * const priv); + + extern u64 +kvmap_kv_delr(const struct kvmap_api * const api, void * const ref, + const struct kv * const start, const struct kv * const end); + + extern void +kvmap_kv_iter_seek(const struct kvmap_api * const api, void * const iter, + const struct kv * const key); + + extern struct kv * +kvmap_raw_get(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, struct kv * const out); + + extern bool +kvmap_raw_probe(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_del(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr); + + extern bool +kvmap_raw_inpr(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern bool +kvmap_raw_inpw(const struct kvmap_api * const api, void * const ref, + const u32 len, const u8 * const ptr, kv_inp_func uf, void * const priv); + + extern void +kvmap_raw_iter_seek(const struct kvmap_api * const api, void * const iter, + const u32 len, const u8 * const ptr); + + extern u64 +kvmap_dump_keys(const struct kvmap_api * const api, void * const map, const int fd); + + extern bool +kvmap_kv64_get(const struct kvmap_api * const api, void * const ref, + const u64 key, u64 * const out); + + extern bool +kvmap_kv64_probe(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern bool +kvmap_kv64_put(const struct kvmap_api * const api, void * const ref, + const u64 key, const u64 value); + + extern bool +kvmap_kv64_del(const struct kvmap_api * const api, void * const ref, + const u64 key); + + extern void +kvmap_kv64_iter_seek(const struct kvmap_api * const api, void * const iter, + const u64 key); + + extern bool +kvmap_kv64_iter_peek(const struct kvmap_api * const api, void * const iter, + u64 * const key_out, u64 * const 
value_out); +// }}} helpers + +// }}} kvmap + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/lib.c b/test/MassTrie-beta/wormhole/lib.c new file mode 100644 index 00000000..06d45f6d --- /dev/null +++ b/test/MassTrie-beta/wormhole/lib.c @@ -0,0 +1,3026 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include "lib.h" +#include "ctypes.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // va_start + +#if defined(__linux__) +#include +#include // malloc_usable_size +#elif defined(__APPLE__) && defined(__MACH__) +#include +#include +#elif defined(__FreeBSD__) +#include +#include +#endif // OS + +#if defined(__FreeBSD__) +#include +#endif +// }}} headers + +// math {{{ + inline u64 +mhash64(const u64 v) +{ + return v * 11400714819323198485lu; +} + + inline u32 +mhash32(const u32 v) +{ + return v * 2654435761u; +} + +// From Daniel Lemire's blog (2013, lemire.me) + u64 +gcd64(u64 a, u64 b) +{ + if (a == 0) + return b; + if (b == 0) + return a; + + const u32 shift = (u32)__builtin_ctzl(a | b); + a >>= __builtin_ctzl(a); + do { + b >>= __builtin_ctzl(b); + if (a > b) { + const u64 t = b; + b = a; + a = t; + } + b = b - a; + } while (b); + return a << shift; +} +// }}} math + +// random {{{ +// Lehmer's generator is 2x faster than xorshift +/** + * D. H. Lehmer, Mathematical methods in large-scale computing units. + * Proceedings of a Second Symposium on Large Scale Digital Calculating + * Machinery; + * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146. + * + * P L'Ecuyer, Tables of linear congruential generators of different sizes and + * good lattice structure. Mathematics of Computation of the American + * Mathematical + * Society 68.225 (1999): 249-260. 
+ */ +struct lehmer_u64 { + union { + u128 v128; + u64 v64[2]; + }; +}; + +static __thread struct lehmer_u64 rseed_u128 = {.v64 = {4294967291, 1549556881}}; + + static inline u64 +lehmer_u64_next(struct lehmer_u64 * const s) +{ + const u64 r = s->v64[1]; + s->v128 *= 0xda942042e4dd58b5lu; + return r; +} + + static inline void +lehmer_u64_seed(struct lehmer_u64 * const s, const u64 seed) +{ + s->v128 = (((u128)(~seed)) << 64) | (seed | 1); + (void)lehmer_u64_next(s); +} + + inline u64 +random_u64(void) +{ + return lehmer_u64_next(&rseed_u128); +} + + inline void +srandom_u64(const u64 seed) +{ + lehmer_u64_seed(&rseed_u128, seed); +} + + inline double +random_double(void) +{ + // random between [0.0 - 1.0] + const u64 r = random_u64(); + return ((double)r) * (1.0 / ((double)(~0lu))); +} +// }}} random + +// timing {{{ + inline u64 +time_nsec(void) +{ + struct timespec ts; + // MONO_RAW is 5x to 10x slower than MONO + clock_gettime(CLOCK_MONOTONIC, &ts); + return ((u64)ts.tv_sec) * 1000000000lu + ((u64)ts.tv_nsec); +} + + inline double +time_sec(void) +{ + const u64 nsec = time_nsec(); + return ((double)nsec) * 1.0e-9; +} + + inline u64 +time_diff_nsec(const u64 last) +{ + return time_nsec() - last; +} + + inline double +time_diff_sec(const double last) +{ + return time_sec() - last; +} + +// need char str[64] + void +time_stamp(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F %T %z", &nowtm); +} + + void +time_stamp2(char * str, const size_t size) +{ + time_t now; + struct tm nowtm; + time(&now); + localtime_r(&now, &nowtm); + strftime(str, size, "%F-%H-%M-%S%z", &nowtm); +} +// }}} timing + +// cpucache {{{ + inline void +cpu_pause(void) +{ +#if defined(__x86_64__) + _mm_pause(); +#elif defined(__aarch64__) + // nop +#endif +} + + inline void +cpu_mfence(void) +{ + atomic_thread_fence(MO_SEQ_CST); +} + +// compiler fence + inline void +cpu_cfence(void) +{ + 
atomic_thread_fence(MO_ACQ_REL); +} + + inline void +cpu_prefetch0(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 0); +} + + inline void +cpu_prefetch1(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 1); +} + + inline void +cpu_prefetch2(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 2); +} + + inline void +cpu_prefetch3(const void * const ptr) +{ + __builtin_prefetch(ptr, 0, 3); +} + + inline void +cpu_prefetchw(const void * const ptr) +{ + __builtin_prefetch(ptr, 1, 0); +} +// }}} cpucache + +// crc32c {{{ + inline u32 +crc32c_u8(const u32 crc, const u8 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u8(crc, v); +#elif defined(__aarch64__) + return __crc32cb(crc, v); +#endif +} + + inline u32 +crc32c_u16(const u32 crc, const u16 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u16(crc, v); +#elif defined(__aarch64__) + return __crc32ch(crc, v); +#endif +} + + inline u32 +crc32c_u32(const u32 crc, const u32 v) +{ +#if defined(__x86_64__) + return _mm_crc32_u32(crc, v); +#elif defined(__aarch64__) + return __crc32cw(crc, v); +#endif +} + + inline u32 +crc32c_u64(const u32 crc, const u64 v) +{ +#if defined(__x86_64__) + return (u32)_mm_crc32_u64(crc, v); +#elif defined(__aarch64__) + return (u32)__crc32cd(crc, v); +#endif +} + + inline u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc) +{ + if (nr == 1) + return crc32c_u8(crc, buf[0]); + + crc = crc32c_u16(crc, *(u16 *)buf); + return (nr == 2) ? crc : crc32c_u8(crc, buf[2]); +} + + inline u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc) +{ + //debug_assert((nr & 3) == 0); + const u32 nr8 = nr >> 3; +#pragma nounroll + for (u32 i = 0; i < nr8; i++) + crc = crc32c_u64(crc, ((u64*)buf)[i]); + + if (nr & 4u) + crc = crc32c_u32(crc, ((u32*)buf)[nr8<<1]); + return crc; +} + + u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc) +{ + crc = crc32c_inc_x4(buf, nr, crc); + const u32 nr123 = nr & 3u; + return nr123 ? 
crc32c_inc_123(buf + nr - nr123, nr123, crc) : crc; +} +// }}} crc32c + +// debug {{{ + void +debug_break(void) +{ + usleep(100); +} + +static u64 * debug_watch_u64 = NULL; + + static void +watch_u64_handler(const int sig) +{ + (void)sig; + const u64 v = debug_watch_u64 ? (*debug_watch_u64) : 0; + fprintf(stderr, "[USR1] %lu (0x%lx)\n", v, v); +} + + void +watch_u64_usr1(u64 * const ptr) +{ + debug_watch_u64 = ptr; + struct sigaction sa = {}; + sa.sa_handler = watch_u64_handler; + sigemptyset(&(sa.sa_mask)); + sa.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for SIGUSR1\n"); + } else { + fprintf(stderr, "to watch> kill -s SIGUSR1 %d\n", getpid()); + } +} + +static void * debug_bt_state = NULL; +#if defined(BACKTRACE) && defined(__linux__) +// TODO: get exec path on MacOS and FreeBSD + +#include +static char debug_filepath[1024] = {}; + + static void +debug_bt_error_cb(void * const data, const char * const msg, const int errnum) +{ + (void)data; + if (msg) + dprintf(2, "libbacktrace: %s %s\n", msg, strerror(errnum)); +} + + static int +debug_bt_print_cb(void * const data, const uintptr_t pc, + const char * const file, const int lineno, const char * const func) +{ + u32 * const plevel = (typeof(plevel))data; + if (file || func || lineno) { + dprintf(2, "[%u]0x%012lx " TERMCLR(35) "%s" TERMCLR(31) ":" TERMCLR(34) "%d" TERMCLR(0)" %s\n", + *plevel, pc, file ? file : "???", lineno, func ? 
func : "???"); + } else if (pc) { + dprintf(2, "[%u]0x%012lx ??\n", *plevel, pc); + } + (*plevel)++; + return 0; +} + +__attribute__((constructor)) + static void +debug_backtrace_init(void) +{ + const ssize_t len = readlink("/proc/self/exe", debug_filepath, 1023); + // disable backtrace + if (len < 0 || len >= 1023) + return; + + debug_filepath[len] = '\0'; + debug_bt_state = backtrace_create_state(debug_filepath, 1, debug_bt_error_cb, NULL); +} +#endif // BACKTRACE + + static void +debug_wait_gdb(void * const bt_state) +{ + if (bt_state) { +#if defined(BACKTRACE) + dprintf(2, "Backtrace :\n"); + u32 level = 0; + backtrace_full(debug_bt_state, 1, debug_bt_print_cb, debug_bt_error_cb, &level); +#endif // BACKTRACE + } else { // fallback to execinfo if no backtrace or initialization failed + void *array[64]; + const int size = backtrace(array, 64); + dprintf(2, "Backtrace (%d):\n", size - 1); + backtrace_symbols_fd(array + 1, size - 1, 2); + } + + abool v = true; + char timestamp[32]; + time_stamp(timestamp, 32); + char threadname[32] = {}; + thread_get_name(pthread_self(), threadname, 32); + strcat(threadname, "(!!)"); + thread_set_name(pthread_self(), threadname); + char hostname[32]; + gethostname(hostname, 32); + + const char * const pattern = "[Waiting GDB] %1$s %2$s @ %3$s\n" + " Attach me: " TERMCLR(31) "sudo -Hi gdb -p %4$d" TERMCLR(0) "\n"; + char buf[256]; + sprintf(buf, pattern, timestamp, threadname, hostname, getpid()); + write(2, buf, strlen(buf)); + + // to continue: gdb> set var v = 0 + // to kill from shell: $ kill %pid; kill -CONT %pid + + // uncomment this line to surrender the shell on error + // kill(getpid(), SIGSTOP); // stop burning cpu, once + + static au32 nr_waiting = 0; + const u32 seq = atomic_fetch_add_explicit(&nr_waiting, 1, MO_RELAXED); + if (seq == 0) { + sprintf(buf, "/run/user/%u/.debug_wait_gdb_pid", getuid()); + const int pidfd = open(buf, O_CREAT|O_TRUNC|O_WRONLY, 00644); + if (pidfd >= 0) { + dprintf(pidfd, "%u", getpid()); + 
close(pidfd); + } + } + +#pragma nounroll + while (atomic_load_explicit(&v, MO_CONSUME)) + sleep(1); +} + +#ifndef NDEBUG + void +debug_assert(const bool v) +{ + if (!v) + debug_wait_gdb(debug_bt_state); +} +#endif + +__attribute__((noreturn)) + void +debug_die(void) +{ + debug_wait_gdb(debug_bt_state); + exit(0); +} + +__attribute__((noreturn)) + void +debug_die_perror(void) +{ + perror(NULL); + debug_die(); +} + +#if !defined(NOSIGNAL) +// signal handler for wait_gdb on fatal errors + static void +wait_gdb_handler(const int sig, siginfo_t * const info, void * const context) +{ + (void)info; + (void)context; + char buf[64] = "[SIGNAL] "; + strcat(buf, strsignal(sig)); + write(2, buf, strlen(buf)); + debug_wait_gdb(NULL); +} + +// setup hooks for catching fatal errors +__attribute__((constructor)) + static void +debug_init(void) +{ + void * stack = pages_alloc_4kb(16); + //fprintf(stderr, "altstack %p\n", stack); + stack_t ss = {.ss_sp = stack, .ss_flags = 0, .ss_size = PGSZ*16}; + if (sigaltstack(&ss, NULL)) + fprintf(stderr, "sigaltstack failed\n"); + + struct sigaction sa = {.sa_sigaction = wait_gdb_handler, .sa_flags = SA_SIGINFO | SA_ONSTACK}; + sigemptyset(&(sa.sa_mask)); + const int fatals[] = {SIGSEGV, SIGFPE, SIGILL, SIGBUS, 0}; + for (int i = 0; fatals[i]; i++) { + if (sigaction(fatals[i], &sa, NULL) == -1) { + fprintf(stderr, "Failed to set signal handler for %s\n", strsignal(fatals[i])); + fflush(stderr); + } + } +} + +__attribute__((destructor)) + static void +debug_exit(void) +{ + // to get rid of valgrind warnings + stack_t ss = {.ss_flags = SS_DISABLE}; + stack_t oss = {}; + sigaltstack(&ss, &oss); + if (oss.ss_sp) + pages_unmap(oss.ss_sp, PGSZ * 16); +} +#endif // !defined(NOSIGNAL) + + void +debug_dump_maps(FILE * const out) +{ + FILE * const in = fopen("/proc/self/smaps", "r"); + char * line0 = yalloc(1024); + size_t size0 = 1024; + while (!feof(in)) { + const ssize_t r1 = getline(&line0, &size0, in); + if (r1 < 0) break; + fprintf(out, "%s", 
line0); + } + free(line0); + fflush(out); + fclose(in); +} + +static pid_t perf_pid = 0; + +#if defined(__linux__) +__attribute__((constructor)) + static void +debug_perf_init(void) +{ + const pid_t ppid = getppid(); + char tmp[256] = {}; + sprintf(tmp, "/proc/%d/cmdline", ppid); + FILE * const fc = fopen(tmp, "r"); + const size_t nr = fread(tmp, 1, sizeof(tmp) - 1, fc); + fclose(fc); + // look for "perf record" + if (nr < 12) + return; + tmp[nr] = '\0'; + for (u64 i = 0; i < nr; i++) + if (tmp[i] == 0) + tmp[i] = ' '; + + char * const perf = strstr(tmp, "perf record"); + if (perf) { + fprintf(stderr, "%s: perf detected\n", __func__); + perf_pid = ppid; + } +} +#endif // __linux__ + + bool +debug_perf_switch(void) +{ + if (perf_pid > 0) { + kill(perf_pid, SIGUSR2); + return true; + } else { + return false; + } +} +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + bool +alloc_fail(void) +{ +#define ALLOCFAIL_RECP ((64lu)) +#define ALLOCFAIL_MAGIC ((ALLOCFAIL_RECP / 3lu)) + return ((random_u64() % ALLOCFAIL_RECP) == ALLOCFAIL_MAGIC); +} + +#ifdef MALLOCFAIL +extern void * __libc_malloc(size_t size); + void * +malloc(size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_malloc(size); +} + +extern void * __libc_calloc(size_t nmemb, size_t size); + void * +calloc(size_t nmemb, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_calloc(nmemb, size); +} + +extern void *__libc_realloc(void *ptr, size_t size); + + void * +realloc(void *ptr, size_t size) +{ + if (alloc_fail()) + return NULL; + return __libc_realloc(ptr, size); +} +#endif // MALLOC_FAIL +#endif // ALLOC_FAIL + + void * +xalloc(const size_t align, const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, align, size) == 0) ? 
p : NULL; +} + +// alloc cache-line aligned address + void * +yalloc(const size_t size) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + void * p; + return (posix_memalign(&p, 64, size) == 0) ? p : NULL; +} + + void ** +malloc_2d(const size_t nr, const size_t size) +{ + const size_t size1 = nr * sizeof(void *); + const size_t size2 = nr * size; + void ** const mem = malloc(size1 + size2); + u8 * const mem2 = ((u8 *)mem) + size1; + for (size_t i = 0; i < nr; i++) + mem[i] = mem2 + (i * size); + return mem; +} + + inline void ** +calloc_2d(const size_t nr, const size_t size) +{ + void ** const ret = malloc_2d(nr, size); + memset(ret[0], 0, nr * size); + return ret; +} + + inline void +pages_unmap(void * const ptr, const size_t size) +{ +#ifndef HEAPCHECKING + munmap(ptr, size); +#else + (void)size; + free(ptr); +#endif +} + + void +pages_lock(void * const ptr, const size_t size) +{ + static bool use_mlock = true; + if (use_mlock) { + const int ret = mlock(ptr, size); + if (ret != 0) { + use_mlock = false; + fprintf(stderr, "%s: mlock disabled\n", __func__); + } + } +} + +#ifndef HEAPCHECKING + static void * +pages_do_alloc(const size_t size, const int flags) +{ + // vi /etc/security/limits.conf + // * - memlock unlimited + void * const p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (p == MAP_FAILED) + return NULL; + + pages_lock(p, size); + return p; +} + +#if defined(__linux__) && defined(MAP_HUGETLB) + +#if defined(MAP_HUGE_SHIFT) +#define PAGES_FLAGS_1G ((MAP_HUGETLB | (30 << MAP_HUGE_SHIFT))) +#define PAGES_FLAGS_2M ((MAP_HUGETLB | (21 << MAP_HUGE_SHIFT))) +#else // MAP_HUGE_SHIFT +#define PAGES_FLAGS_1G ((MAP_HUGETLB)) +#define PAGES_FLAGS_2M ((MAP_HUGETLB)) +#endif // MAP_HUGE_SHIFT + +#else +#define PAGES_FLAGS_1G ((0)) +#define PAGES_FLAGS_2M ((0)) +#endif // __linux__ + +#endif // HEAPCHECKING + + inline void * +pages_alloc_1gb(const size_t nr_1gb) +{ + const u64 sz = nr_1gb << 30; +#ifndef HEAPCHECKING + return 
pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_1G); +#else + void * const p = xalloc(1lu << 21, sz); // Warning: valgrind fails with 30 + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_2mb(const size_t nr_2mb) +{ + const u64 sz = nr_2mb << 21; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS | PAGES_FLAGS_2M); +#else + void * const p = xalloc(1lu << 21, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + inline void * +pages_alloc_4kb(const size_t nr_4kb) +{ + const size_t sz = nr_4kb << 12; +#ifndef HEAPCHECKING + return pages_do_alloc(sz, MAP_PRIVATE | MAP_ANONYMOUS); +#else + void * const p = xalloc(1lu << 12, sz); + if (p) + memset(p, 0, sz); + return p; +#endif +} + + void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + // 1gb huge page: at least 0.25GB + if (try_1gb) { + if (size >= (1lu << 28)) { + const size_t nr_1gb = bits_round_up(size, 30) >> 30; + void * const p1 = pages_alloc_1gb(nr_1gb); + if (p1) { + *size_out = nr_1gb << 30; + return p1; + } + } + } + + // 2mb huge page: at least 0.5MB + if (size >= (1lu << 19)) { + const size_t nr_2mb = bits_round_up(size, 21) >> 21; + void * const p2 = pages_alloc_2mb(nr_2mb); + if (p2) { + *size_out = nr_2mb << 21; + return p2; + } + } + + const size_t nr_4kb = bits_round_up(size, 12) >> 12; + void * const p3 = pages_alloc_4kb(nr_4kb); + if (p3) + *size_out = nr_4kb << 12; + return p3; +} +// }}} mm + +// process/thread {{{ +static u32 process_ncpu; +#if defined(__FreeBSD__) +typedef cpuset_t cpu_set_t; +#elif defined(__APPLE__) && defined(__MACH__) +typedef u64 cpu_set_t; +#define CPU_SETSIZE ((64)) +#define CPU_COUNT(__cpu_ptr__) (__builtin_popcountl(*__cpu_ptr__)) +#define CPU_ISSET(__cpu_idx__, __cpu_ptr__) (((*__cpu_ptr__) >> __cpu_idx__) & 1lu) +#define CPU_ZERO(__cpu_ptr__) ((*__cpu_ptr__) = 0) +#define CPU_SET(__cpu_idx__, 
__cpu_ptr__) ((*__cpu_ptr__) |= (1lu << __cpu_idx__)) +#define CPU_CLR(__cpu_idx__, __cpu_ptr__) ((*__cpu_ptr__) &= ~(1lu << __cpu_idx__)) +#define pthread_attr_setaffinity_np(...) ((void)0) +#endif + +__attribute__((constructor)) + static void +process_init(void) +{ + // Linux's default is 1024 cpus + process_ncpu = (u32)sysconf(_SC_NPROCESSORS_CONF); + if (process_ncpu > CPU_SETSIZE) { + fprintf(stderr, "%s: can use only %zu cores\n", + __func__, (size_t)CPU_SETSIZE); + process_ncpu = CPU_SETSIZE; + } + thread_set_name(pthread_self(), "main"); +} + + static inline int +thread_getaffinity_set(cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_getaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + *cpuset = (1lu << process_ncpu) - 1; + return (int)process_ncpu; // TODO +#endif // OS +} + + static inline int +thread_setaffinity_set(const cpu_set_t * const cpuset) +{ +#if defined(__linux__) + return sched_setaffinity(0, sizeof(*cpuset), cpuset); +#elif defined(__FreeBSD__) + return cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(*cpuset), cpuset); +#elif defined(__APPLE__) && defined(__MACH__) + (void)cpuset; // TODO + return 0; +#endif // OS +} + + void +thread_get_name(const pthread_t pt, char * const name, const size_t len) +{ +#if defined(__linux__) + pthread_getname_np(pt, name, len); +#elif defined(__FreeBSD__) + pthread_get_name_np(pt, name, len); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)len; + strcpy(name, "unknown"); // TODO +#endif // OS +} + + void +thread_set_name(const pthread_t pt, const char * const name) +{ +#if defined(__linux__) + pthread_setname_np(pt, name); +#elif defined(__FreeBSD__) + pthread_set_name_np(pt, name); +#elif defined(__APPLE__) && defined(__MACH__) + (void)pt; + (void)name; // TODO +#endif // OS +} + +// kB + long 
+process_get_rss(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + return rs.ru_maxrss; +} + + u32 +process_affinity_count(void) +{ + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return process_ncpu; + + const u32 nr = (u32)CPU_COUNT(&set); + return nr ? nr : process_ncpu; +} + + u32 +process_getaffinity_list(const u32 max, u32 * const cores) +{ + memset(cores, 0, max * sizeof(cores[0])); + cpu_set_t set; + if (thread_getaffinity_set(&set) != 0) + return 0; + + const u32 nr_affinity = (u32)CPU_COUNT(&set); + const u32 nr = nr_affinity < max ? nr_affinity : max; + u32 j = 0; + for (u32 i = 0; i < process_ncpu; i++) { + if (CPU_ISSET(i, &set)) + cores[j++] = i; + + if (j >= nr) + break; + } + return j; +} + + void +thread_setaffinity_list(const u32 nr, const u32 * const list) +{ + cpu_set_t set; + CPU_ZERO(&set); + for (u32 i = 0; i < nr; i++) + if (list[i] < process_ncpu) + CPU_SET(list[i], &set); + thread_setaffinity_set(&set); +} + + void +thread_pin(const u32 cpu) +{ + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(cpu % process_ncpu, &set); + thread_setaffinity_set(&set); +} + + u64 +process_cpu_time_usec(void) +{ + struct rusage rs; + getrusage(RUSAGE_SELF, &rs); + const u64 usr = (((u64)rs.ru_utime.tv_sec) * 1000000lu) + ((u64)rs.ru_utime.tv_usec); + const u64 sys = (((u64)rs.ru_stime.tv_sec) * 1000000lu) + ((u64)rs.ru_stime.tv_usec); + return usr + sys; +} + +struct fork_join_info { + u32 total; + u32 ncores; + u32 * cores; + void *(*func)(void *); + bool args; + union { + void * arg1; + void ** argn; + }; + union { + struct { au32 ferr, jerr; }; + au64 xerr; + }; +}; + +// DON'T CHANGE! 
+#define FORK_JOIN_RANK_BITS ((16)) // 16 +#define FORK_JOIN_MAX ((1u << FORK_JOIN_RANK_BITS)) + +/* + * fj(6): T0 + * / \ + * T0 T4 + * / \ / + * T0 T2 T4 + * / \ / \ / \ + * t0 t1 t2 t3 t4 t5 + */ + +// recursive tree fork-join + static void * +thread_do_fork_join_worker(void * const ptr) +{ + struct entry13 fjp = {.ptr = ptr}; + // GCC: Without explicitly casting from fjp.fji (a 45-bit u64 value), + // the high bits will get truncated, which is always CORRECT in gcc. + // Don't use gcc. + struct fork_join_info * const fji = u64_to_ptr(fjp.e3); + const u32 rank = (u32)fjp.e1; + + const u32 nchild = (u32)__builtin_ctz(rank ? rank : bits_p2_up_u32(fji->total)); + debug_assert(nchild <= FORK_JOIN_RANK_BITS); + pthread_t tids[FORK_JOIN_RANK_BITS]; + if (nchild) { + cpu_set_t set; + CPU_ZERO(&set); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); // Joinable by default + // fork top-down + for (u32 i = nchild - 1; i < nchild; i--) { + const u32 cr = rank + (1u << i); // child's rank + if (cr >= fji->total) + continue; // should not break + const u32 core = fji->cores[(cr < fji->ncores) ? cr : (cr % fji->ncores)]; + CPU_SET(core, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + fjp.e1 = (u16)cr; + const int r = pthread_create(&tids[i], &attr, thread_do_fork_join_worker, fjp.ptr); + CPU_CLR(core, &set); + if (unlikely(r)) { // fork failed + memset(&tids[0], 0, sizeof(tids[0]) * (i+1)); + u32 nmiss = (1u << (i + 1)) - 1; + if ((rank + nmiss) >= fji->total) + nmiss = fji->total - 1 - rank; + (void)atomic_fetch_add_explicit(&fji->ferr, nmiss, MO_RELAXED); + break; + } + } + pthread_attr_destroy(&attr); + } + + char thname0[16]; + char thname1[16]; + thread_get_name(pthread_self(), thname0, 16); + snprintf(thname1, 16, "%.8s_%u", thname0, rank); + thread_set_name(pthread_self(), thname1); + + void * const ret = fji->func(fji->args ? 
fji->argn[rank] : fji->arg1); + + thread_set_name(pthread_self(), thname0); + // join bottom-up + for (u32 i = 0; i < nchild; i++) { + const u32 cr = rank + (1u << i); // child rank + if (cr >= fji->total) + break; // safe to break + if (tids[i]) { + const int r = pthread_join(tids[i], NULL); + if (unlikely(r)) { // error + //fprintf(stderr, "pthread_join %u..%u = %d: %s\n", rank, cr, r, strerror(r)); + (void)atomic_fetch_add_explicit(&fji->jerr, 1, MO_RELAXED); + } + } + } + return ret; +} + + u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx) +{ + if (unlikely(nr > FORK_JOIN_MAX)) { + fprintf(stderr, "%s reduce nr to %u\n", __func__, FORK_JOIN_MAX); + nr = FORK_JOIN_MAX; + } + + u32 cores[CPU_SETSIZE]; + u32 ncores = process_getaffinity_list(process_ncpu, cores); + if (unlikely(ncores == 0)) { // force to use all cores + ncores = process_ncpu; + for (u32 i = 0; i < process_ncpu; i++) + cores[i] = i; + } + if (unlikely(nr == 0)) + nr = ncores; + + // the compiler does not know fji can change since we cast &fji into fjp + struct fork_join_info fji = {.total = nr, .cores = cores, .ncores = ncores, + .func = func, .args = args, .arg1 = argx}; + const struct entry13 fjp = entry13(0, (u64)(&fji)); + + // save current affinity + cpu_set_t set0; + thread_getaffinity_set(&set0); + + // master thread shares thread0's core + cpu_set_t set; + CPU_ZERO(&set); + CPU_SET(fji.cores[0], &set); + thread_setaffinity_set(&set); + + const u64 t0 = time_nsec(); + (void)thread_do_fork_join_worker(fjp.ptr); + const u64 dt = time_diff_nsec(t0); + + // restore original affinity + thread_setaffinity_set(&set0); + + // check and report errors (unlikely) + if (atomic_load_explicit(&fji.xerr, MO_CONSUME)) + fprintf(stderr, "%s errors: fork %u join %u\n", __func__, fji.ferr, fji.jerr); + return dt; +} + + int +thread_create_at(const u32 cpu, pthread_t * const thread, + void *(*start_routine) (void *), void * const arg) +{ + const u32 cpu_id = (cpu < 
process_ncpu) ? cpu : (cpu % process_ncpu); + pthread_attr_t attr; + pthread_attr_init(&attr); + //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + cpu_set_t set; + + CPU_ZERO(&set); + CPU_SET(cpu_id, &set); + pthread_attr_setaffinity_np(&attr, sizeof(set), &set); + const int r = pthread_create(thread, &attr, start_routine, arg); + pthread_attr_destroy(&attr); + return r; +} +// }}} process/thread + +// locking {{{ + +// spinlock {{{ +#if defined(__linux__) +#define SPINLOCK_PTHREAD +#endif // __linux__ + +#if defined(SPINLOCK_PTHREAD) +static_assert(sizeof(pthread_spinlock_t) <= sizeof(spinlock), "spinlock size"); +#else // SPINLOCK_PTHREAD +static_assert(sizeof(au32) <= sizeof(spinlock), "spinlock size"); +#endif // SPINLOCK_PTHREAD + + void +spinlock_init(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_init(p, PTHREAD_PROCESS_PRIVATE); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} + + inline void +spinlock_lock(spinlock * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!spinlock_trylock(lock)) + corr_yield(); +#else // CORR +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_lock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; +#pragma nounroll + do { + if (atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0) + return; +#pragma nounroll + do { + cpu_pause(); + } while (atomic_load_explicit(p, MO_CONSUME)); + } while (true); +#endif // SPINLOCK_PTHREAD +#endif // CORR +} + + inline bool +spinlock_trylock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + return !pthread_spin_trylock(p); +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + return atomic_fetch_sub_explicit(p, 1, MO_ACQUIRE) == 0; +#endif // SPINLOCK_PTHREAD 
+} + + inline void +spinlock_unlock(spinlock * const lock) +{ +#if defined(SPINLOCK_PTHREAD) + pthread_spinlock_t * const p = (typeof(p))lock; + pthread_spin_unlock(p); // return value ignored +#else // SPINLOCK_PTHREAD + au32 * const p = (typeof(p))lock; + atomic_store_explicit(p, 0, MO_RELEASE); +#endif // SPINLOCK_PTHREAD +} +// }}} spinlock + +// pthread mutex {{{ +static_assert(sizeof(pthread_mutex_t) <= sizeof(mutex), "mutexlock size"); + inline void +mutex_init(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_init(p, NULL); +} + + inline void +mutex_lock(mutex * const lock) +{ +#if defined(CORR) +#pragma nounroll + while (!mutex_trylock(lock)) + corr_yield(); +#else + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_lock(p); // return value ignored +#endif +} + + inline bool +mutex_trylock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + return !pthread_mutex_trylock(p); // return value ignored +} + + inline void +mutex_unlock(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_unlock(p); // return value ignored +} + + inline void +mutex_deinit(mutex * const lock) +{ + pthread_mutex_t * const p = (typeof(p))lock; + pthread_mutex_destroy(p); +} +// }}} pthread mutex + +// rwdep {{{ +// poor man's lockdep for rwlock +// per-thread lock list +// it calls debug_die() when local double-(un)locking is detected +// cyclic dependencies can be manually identified by looking at the two lists below in gdb +#ifdef RWDEP +#define RWDEP_NR ((16)) +__thread const rwlock * rwdep_readers[RWDEP_NR] = {}; +__thread const rwlock * rwdep_writers[RWDEP_NR] = {}; + + static void +rwdep_check(const rwlock * const lock) +{ + debug_assert(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) + debug_die(); + if (rwdep_writers[i] == lock) + debug_die(); + } +} +#endif // RWDEP + + static void +rwdep_lock_read(const rwlock * const lock) +{ +#ifdef RWDEP + 
rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == NULL) { + rwdep_readers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_read(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_readers[i] == lock) { + rwdep_readers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_lock_write(const rwlock * const lock) +{ +#ifdef RWDEP + rwdep_check(lock); + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == NULL) { + rwdep_writers[i] = lock; + return; + } + } +#else + (void)lock; +#endif // RWDEP +} + + static void +rwdep_unlock_write(const rwlock * const lock) +{ +#ifdef RWDEP + for (u64 i = 0; i < RWDEP_NR; i++) { + if (rwdep_writers[i] == lock) { + rwdep_writers[i] = NULL; + return; + } + } + debug_die(); +#else + (void)lock; +#endif // RWDEP +} +// }}} rwlockdep + +// rwlock {{{ +typedef au32 lock_t; +typedef u32 lock_v; +static_assert(sizeof(lock_t) == sizeof(lock_v), "lock size"); +static_assert(sizeof(lock_t) <= sizeof(rwlock), "lock size"); + +#define RWLOCK_WSHIFT ((sizeof(lock_t) * 8 - 1)) +#define RWLOCK_WBIT ((((lock_v)1) << RWLOCK_WSHIFT)) + + inline void +rwlock_init(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + atomic_store_explicit(pvar, 0, MO_RELEASE); +} + + inline bool +rwlock_trylock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } else { + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; + } +} + + inline bool +rwlock_trylock_read_lp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + if (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) { + cpu_pause(); + return false; + } + return rwlock_trylock_read(lock); +} + +// actually nr + 1 + inline bool 
+rwlock_trylock_read_nr(rwlock * const lock, u16 nr) +{ + lock_t * const pvar = (typeof(pvar))lock; + if ((atomic_fetch_add_explicit(pvar, 1, MO_ACQUIRE) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + +#pragma nounroll + do { // someone already locked; wait for a little while + cpu_pause(); + if ((atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT) == 0) { + rwdep_lock_read(lock); + return true; + } + } while (nr--); + + atomic_fetch_sub_explicit(pvar, 1, MO_RELAXED); + return false; +} + + inline void +rwlock_lock_read(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_read(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME) >> RWLOCK_WSHIFT); + } while (true); +} + + inline void +rwlock_unlock_read(rwlock * const lock) +{ + rwdep_unlock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, 1, MO_RELEASE); +} + + inline bool +rwlock_trylock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if ((v0 == 0) && atomic_compare_exchange_weak_explicit(pvar, &v0, RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + return true; + } else { + return false; + } +} + +// actually nr + 1 + inline bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; +#pragma nounroll + do { + if (rwlock_trylock_write(lock)) + return; +#pragma nounroll + do { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } while (atomic_load_explicit(pvar, MO_CONSUME)); + } while (true); +} + + inline bool 
+rwlock_trylock_write_hp(rwlock * const lock) +{ + lock_t * const pvar = (typeof(pvar))lock; + lock_v v0 = atomic_load_explicit(pvar, MO_CONSUME); + if (v0 >> RWLOCK_WSHIFT) + return false; + + if (atomic_compare_exchange_weak_explicit(pvar, &v0, v0|RWLOCK_WBIT, MO_ACQUIRE, MO_RELAXED)) { + rwdep_lock_write(lock); + // WBIT successfully marked; must wait for readers to leave + if (v0) { // saw active readers +#pragma nounroll + while (atomic_load_explicit(pvar, MO_CONSUME) != RWLOCK_WBIT) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } + } + return true; + } else { + return false; + } +} + + inline bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr) +{ +#pragma nounroll + do { + if (rwlock_trylock_write_hp(lock)) + return true; + cpu_pause(); + } while (nr--); + return false; +} + + inline void +rwlock_lock_write_hp(rwlock * const lock) +{ +#pragma nounroll + while (!rwlock_trylock_write_hp(lock)) { +#if defined(CORR) + corr_yield(); +#else + cpu_pause(); +#endif + } +} + + inline void +rwlock_unlock_write(rwlock * const lock) +{ + rwdep_unlock_write(lock); + lock_t * const pvar = (typeof(pvar))lock; + atomic_fetch_sub_explicit(pvar, RWLOCK_WBIT, MO_RELEASE); +} + + inline void +rwlock_write_to_read(rwlock * const lock) +{ + rwdep_unlock_write(lock); + rwdep_lock_read(lock); + lock_t * const pvar = (typeof(pvar))lock; + // +R -W + atomic_fetch_add_explicit(pvar, ((lock_v)1) - RWLOCK_WBIT, MO_ACQ_REL); +} + +#undef RWLOCK_WSHIFT +#undef RWLOCK_WBIT +// }}} rwlock + +// }}} locking + +// coroutine {{{ + +// asm {{{ +#if defined(__x86_64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((6)) + +// for switch/exit: pass a return value to the target +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error 
Supported platforms: Linux/FreeBSD/Apple +#endif // OS + "push %rbp; push %rbx; push %r12;" + "push %r13; push %r14; push %r15;" + "mov %rsp, (%rdi);" + "mov %rsi, %rsp;" + "pop %r15; pop %r14; pop %r13;" + "pop %r12; pop %rbx; pop %rbp;" + "mov %rdx, %rax;" + "retq;" + ); + +#elif defined(__aarch64__) +// number pushes in co_switch_stack +#define CO_CONTEXT_SIZE ((20)) +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_switch_stack;" + ".type co_switch_stack, @function;" + "co_switch_stack:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_switch_stack;" + "_co_switch_stack:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "sub x8, sp, 160;" + "str x8, [x0];" + "stp x30, x19, [x8]; ldp x30, x19, [x1];" + "stp x20, x21, [x8, 16]; ldp x20, x21, [x1, 16];" + "stp x22, x23, [x8, 32]; ldp x22, x23, [x1, 32];" + "stp x24, x25, [x8, 48]; ldp x24, x25, [x1, 48];" + "stp x26, x27, [x8, 64]; ldp x26, x27, [x1, 64];" + "stp x28, x29, [x8, 80]; ldp x28, x29, [x1, 80];" + "stp d8, d9, [x8, 96]; ldp d8, d9, [x1, 96];" + "stp d10, d11, [x8, 112]; ldp d10, d11, [x1, 112];" + "stp d12, d13, [x8, 128]; ldp d12, d13, [x1, 128];" + "stp d14, d15, [x8, 144]; ldp d14, d15, [x1, 144];" + "add sp, x1, 160;" + "mov x0, x2;" + "br x30;" + ); + +extern void co_entry_aarch64(void); +asm ( + ".align 16;" +#if defined(__linux__) || defined(__FreeBSD__) + ".global co_entry_aarch64;" + ".type co_entry_aarch64, @function;" + "co_entry_aarch64:" +#elif defined(__APPLE__) && defined(__MACH__) + ".global _co_entry_aarch64;" + "_co_entry_aarch64:" +#else +#error supported platforms: Linux/FreeBSD/Apple +#endif // OS + "ldr x8, [sp, 0];" + "blr x8;" + "ldr x8, [sp, 8];" + "blr x8;" + "ldr x8, [sp, 16];" + "blr x8;" + ); +#else +#error supported CPUs: x86_64 or AArch64 +#endif // co_switch_stack x86_64 and aarch64 +// }}} asm + +// co {{{ +struct co { + u64 rsp; + void * priv; + u64 * host; // set host to NULL to exit + size_t stksz; 
+}; + +// not atomic: no concurrent access +// volatile: avoid caching of co_curr +static __thread struct co * volatile co_curr = NULL; // NULL in host + +// the stack sits under the struct co + static void +co_init(struct co * const co, void * func, void * priv, u64 * const host, + const u64 stksz, void * func_exit) +{ + debug_assert((stksz & 0x3f) == 0); // a multiple of 64 bytes + u64 * rsp = ((u64 *)co) - 4; + rsp[0] = (u64)func; + rsp[1] = (u64)func_exit; + rsp[2] = (u64)debug_die; + rsp[3] = 0; + + rsp -= CO_CONTEXT_SIZE; + +#if defined(__aarch64__) + rsp[0] = (u64)co_entry_aarch64; +#endif + + co->rsp = (u64)rsp; + co->priv = priv; + co->host = host; + co->stksz = stksz; +} + + static void +co_exit0(void) +{ + co_exit(0); +} + + struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct co); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct co * const co = (typeof(co))(mem + stksz); + co_init(co, func, priv, host, stksz, co_exit0); + return co; +} + + inline void +co_reuse(struct co * const co, void * func, void * priv, u64 * const host) +{ + co_init(co, func, priv, host, co->stksz, co_exit0); +} + + inline struct co * +co_fork(void * func, void * priv) +{ + return co_curr ? co_create(co_curr->stksz, func, priv, co_curr->host) : NULL; +} + + inline void * +co_priv(void) +{ + return co_curr ? co_curr->priv : NULL; +} + +// the host calls this to enter a coroutine. 
  inline u64
co_enter(struct co * const to, const u64 retval)
{
  // The host thread enters coroutine `to`; retval is delivered to the coroutine.
  // Returns the value the coroutine later passes to co_back()/co_exit().
  debug_assert(co_curr == NULL); // must enter from the host, never from a coroutine
  debug_assert(to && to->host);
  u64 * const save = to->host; // the host rsp is saved at *to->host by co_switch_stack
  co_curr = to;
  const u64 ret = co_switch_stack(save, to->rsp, retval);
  co_curr = NULL; // back on the host stack
  return ret;
}

// switch from a coroutine to another coroutine
// co_curr must be valid
// the target will resume and receive the retval
  inline u64
co_switch_to(struct co * const to, const u64 retval)
{
  debug_assert(co_curr);
  debug_assert(co_curr != to);
  debug_assert(to && to->host);
  struct co * const save = co_curr;
  co_curr = to;
  // save our rsp into save->rsp, resume `to` at its saved rsp
  return co_switch_stack(&(save->rsp), to->rsp, retval);
}

// switch from a coroutine to the host routine
// co_yield is now a c++ keyword...
  inline u64
co_back(const u64 retval)
{
  debug_assert(co_curr);
  struct co * const save = co_curr;
  co_curr = NULL;
  // *save->host holds the host's saved rsp (written by co_enter/co_switch_stack)
  return co_switch_stack(&(save->rsp), *(save->host), retval);
}

#ifdef CO_STACK_CHECK
// Scan the 0x5c fill pattern from the bottom of the stack to estimate and
// report peak stack usage; stops at the first overwritten word.
  static void
co_stack_check(const u8 * const mem, const u64 stksz)
{
  const u64 * const mem64 = (typeof(mem64))mem;
  const u64 size64 = stksz / sizeof(u64);
  for (u64 i = 0; i < size64; i++) {
    if (mem64[i] != 0x5c5c5c5c5c5c5c5clu) {
      fprintf(stderr, "%s co stack usage: %lu/%lu\n", __func__, stksz - (i * sizeof(u64)), stksz);
      break;
    }
  }
}
#endif // CO_STACK_CHECK

// return to host and set host to NULL
// Terminates the current coroutine: marks it invalid (host = NULL) and switches
// back to the host for the last time; control must never return here.
__attribute__((noreturn))
  void
co_exit(const u64 retval)
{
  debug_assert(co_curr);
#ifdef CO_STACK_CHECK
  const u64 stksz = co_curr->stksz;
  u8 * const mem = ((u8 *)co_curr) - stksz; // stack sits directly below struct co
  co_stack_check(mem, stksz);
#endif // CO_STACK_CHECK
  const u64 hostrsp = *(co_curr->host); // read the host rsp before invalidating
  co_curr->host = NULL; // co_valid() now reports false
  struct co * const save = co_curr;
  co_curr = NULL;
  (void)co_switch_stack(&(save->rsp), hostrsp, retval);
  // return to co_enter
  debug_die(); // unreachable: nobody may re-enter an exited coroutine
}

// host is set to NULL on exit
  inline bool
co_valid(struct co * const co)
{
  return co->host != NULL;
}
+// return NULL on host + inline struct co * +co_self(void) +{ + return co_curr; +} + + inline void +co_destroy(struct co * const co) +{ + u8 * const mem = ((u8 *)co) - co->stksz; + free(mem); +} +// }}} co + +// corr {{{ +struct corr { + struct co co; + struct corr * next; + struct corr * prev; +}; + +// initial and link guest to the run-queue + struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, host, stksz, corr_exit); + co->next = co; + co->prev = co; + return co; +} + + struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev) +{ + const u64 stksz = bits_round_up(stacksize, 6); + const size_t alloc_size = stksz + sizeof(struct corr); + u8 * const mem = yalloc(alloc_size); + if (mem == NULL) + return NULL; + +#ifdef CO_STACK_CHECK + memset(mem, 0x5c, stksz); +#endif // CO_STACK_CHECK + + struct corr * const co = (typeof(co))(mem + stksz); + co_init(&(co->co), func, priv, prev->co.host, stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; + return co; +} + + inline void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host) +{ + co_init(&(co->co), func, priv, host, co->co.stksz, corr_exit); + co->next = co; + co->prev = co; +} + + inline void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev) +{ + co_init(&(co->co), func, priv, prev->co.host, co->co.stksz, corr_exit); + co->next = prev->next; + co->prev = prev; + co->prev->next = co; + co->next->prev = co; +} + + inline void +corr_enter(struct corr * const co) +{ + 
(void)co_enter(&(co->co), 0); +} + + inline void +corr_yield(void) +{ + struct corr * const curr = (typeof(curr))co_curr; + if (curr && (curr->next != curr)) + (void)co_switch_to(&(curr->next->co), 0); +} + +__attribute__((noreturn)) + inline void +corr_exit(void) +{ + debug_assert(co_curr); +#ifdef CO_STACK_CHECK + const u64 stksz = co_curr->stksz; + const u8 * const mem = ((u8 *)(co_curr)) - stksz; + co_stack_check(mem, stksz); +#endif // CO_STACK_CHECK + + struct corr * const curr = (typeof(curr))co_curr; + if (curr->next != curr) { // have more corr + struct corr * const next = curr->next; + struct corr * const prev = curr->prev; + next->prev = prev; + prev->next = next; + curr->next = NULL; + curr->prev = NULL; + curr->co.host = NULL; // invalidate + (void)co_switch_to(&(next->co), 0); + } else { // the last corr + co_exit0(); + } + debug_die(); +} + + inline void +corr_destroy(struct corr * const co) +{ + co_destroy(&(co->co)); +} +// }}} corr + +// }}} co + +// bits {{{ + inline u32 +bits_reverse_u32(const u32 v) +{ + const u32 v2 = __builtin_bswap32(v); + const u32 v3 = ((v2 & 0xf0f0f0f0u) >> 4) | ((v2 & 0x0f0f0f0fu) << 4); + const u32 v4 = ((v3 & 0xccccccccu) >> 2) | ((v3 & 0x33333333u) << 2); + const u32 v5 = ((v4 & 0xaaaaaaaau) >> 1) | ((v4 & 0x55555555u) << 1); + return v5; +} + + inline u64 +bits_reverse_u64(const u64 v) +{ + const u64 v2 = __builtin_bswap64(v); + const u64 v3 = ((v2 & 0xf0f0f0f0f0f0f0f0lu) >> 4) | ((v2 & 0x0f0f0f0f0f0f0f0flu) << 4); + const u64 v4 = ((v3 & 0xcccccccccccccccclu) >> 2) | ((v3 & 0x3333333333333333lu) << 2); + const u64 v5 = ((v4 & 0xaaaaaaaaaaaaaaaalu) >> 1) | ((v4 & 0x5555555555555555lu) << 1); + return v5; +} + + inline u64 +bits_rotl_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v << sh) | (v >> (64 - sh)); +} + + inline u64 +bits_rotr_u64(const u64 v, const u8 n) +{ + const u8 sh = n & 0x3f; + return (v >> sh) | (v << (64 - sh)); +} + + inline u32 +bits_rotl_u32(const u32 v, const u8 n) +{ + 
const u8 sh = n & 0x1f; + return (v << sh) | (v >> (32 - sh)); +} + + inline u32 +bits_rotr_u32(const u32 v, const u8 n) +{ + const u8 sh = n & 0x1f; + return (v >> sh) | (v << (32 - sh)); +} + + inline u64 +bits_p2_up_u64(const u64 v) +{ + // clz(0) is undefined + return (v > 1) ? (1lu << (64 - __builtin_clzl(v - 1lu))) : v; +} + + inline u32 +bits_p2_up_u32(const u32 v) +{ + // clz(0) is undefined + return (v > 1) ? (1u << (32 - __builtin_clz(v - 1u))) : v; +} + + inline u64 +bits_p2_down_u64(const u64 v) +{ + return v ? (1lu << (63 - __builtin_clzl(v))) : v; +} + + inline u32 +bits_p2_down_u32(const u32 v) +{ + return v ? (1u << (31 - __builtin_clz(v))) : v; +} + + inline u64 +bits_round_up(const u64 v, const u8 power) +{ + return (v + (1lu << power) - 1lu) >> power << power; +} + + inline u64 +bits_round_up_a(const u64 v, const u64 a) +{ + return (v + a - 1) / a * a; +} + + inline u64 +bits_round_down(const u64 v, const u8 power) +{ + return v >> power << power; +} + + inline u64 +bits_round_down_a(const u64 v, const u64 a) +{ + return v / a * a; +} +// }}} bits + +// vi128 {{{ +#if defined(__GNUC__) && __GNUC__ >= 7 +#define FALLTHROUGH __attribute__ ((fallthrough)) +#else +#define FALLTHROUGH ((void)0) +#endif /* __GNUC__ >= 7 */ + + inline u32 +vi128_estimate_u32(const u32 v) +{ + static const u8 t[] = {5,5,5,5, + 4,4,4,4,4,4,4, 3,3,3,3,3,3,3, + 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clz(v)] : 2; + // 0 -> [0x80 0x00] the first byte is non-zero + + // nz bit range -> enc length offset in t[] + // 0 -> 2 special case + // 1 to 7 -> 1 31 to 25 + // 8 to 14 -> 2 24 to 18 + // 15 to 21 -> 3 17 to 11 + // 22 to 28 -> 4 10 to 4 + // 29 to 32 -> 5 3 to 0 +} + + u8 * +vi128_encode_u32(u8 * dst, u32 v) +{ + switch (vi128_estimate_u32(v)) { + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + + const u8 * +vi128_decode_u32(const u8 * src, u32 * const out) +{ + debug_assert(*src); + u32 r = 0; + for (u32 shift = 0; shift < 32; shift += 7) { + const u8 byte = *(src++); + r |= (((u32)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + + inline u32 +vi128_estimate_u64(const u64 v) +{ + static const u8 t[] = {10, + 9,9,9,9,9,9,9, 8,8,8,8,8,8,8, 7,7,7,7,7,7,7, + 6,6,6,6,6,6,6, 5,5,5,5,5,5,5, 4,4,4,4,4,4,4, + 3,3,3,3,3,3,3, 2,2,2,2,2,2,2, 1,1,1,1,1,1,1}; + return v ? 
t[__builtin_clzl(v)] : 2; +} + +// return ptr after the generated bytes + u8 * +vi128_encode_u64(u8 * dst, u64 v) +{ + switch (vi128_estimate_u64(v)) { + case 10: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 9: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 8: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 7: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 6: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 5: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 4: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 3: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 2: + *(dst++) = (u8)(v | 0x80); v >>= 7; FALLTHROUGH; + case 1: + *(dst++) = (u8)v; + break; + default: + debug_die(); + break; + } + return dst; +} + +// return ptr after the consumed bytes + const u8 * +vi128_decode_u64(const u8 * src, u64 * const out) +{ + u64 r = 0; + for (u32 shift = 0; shift < 64; shift += 7) { + const u8 byte = *(src++); + r |= (((u64)(byte & 0x7f)) << shift); + if ((byte & 0x80) == 0) { // No more bytes to consume + *out = r; + return src; + } + } + *out = 0; + return NULL; // invalid +} + +#undef FALLTHROUGH +// }}} vi128 + +// misc {{{ + inline struct entry13 +entry13(const u16 e1, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + return (struct entry13){.v64 = (e3 << 16) | e1}; +} + + inline void +entry13_update_e3(struct entry13 * const e, const u64 e3) +{ + debug_assert((e3 >> 48) == 0); + *e = entry13(e->e1, e3); +} + + inline void * +u64_to_ptr(const u64 v) +{ + return (void *)v; +} + + inline u64 +ptr_to_u64(const void * const ptr) +{ + return (u64)ptr; +} + +// portable malloc_usable_size + inline size_t +m_usable_size(void * const ptr) +{ +#if defined(__linux__) || defined(__FreeBSD__) + const size_t sz = malloc_usable_size(ptr); +#elif defined(__APPLE__) && defined(__MACH__) + const size_t sz = malloc_size(ptr); +#endif // OS + +#ifndef HEAPCHECKING + // valgrind and asan may return unaligned 
usable size + debug_assert((sz & 0x7lu) == 0); +#endif // HEAPCHECKING + + return sz; +} + + inline size_t +fdsize(const int fd) +{ + struct stat st; + st.st_size = 0; + if (fstat(fd, &st) != 0) + return 0; + + if (S_ISBLK(st.st_mode)) { +#if defined(__linux__) + ioctl(fd, BLKGETSIZE64, &st.st_size); +#elif defined(__APPLE__) && defined(__MACH__) + u64 blksz = 0; + u64 nblks = 0; + ioctl(fd, DKIOCGETBLOCKSIZE, &blksz); + ioctl(fd, DKIOCGETBLOCKCOUNT, &nblks); + st.st_size = (ssize_t)(blksz * nblks); +#elif defined(__FreeBSD__) + ioctl(fd, DIOCGMEDIASIZE, &st.st_size); +#endif // OS + } + + return (size_t)st.st_size; +} + + u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max) +{ + const u32 max64 = max & (~7u); + u32 clen = 0; + while (clen < max64) { + const u64 v1 = *(const u64 *)(p1+clen); + const u64 v2 = *(const u64 *)(p2+clen); + const u64 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctzl(x) >> 3); + + clen += sizeof(u64); + } + + if ((clen + sizeof(u32)) <= max) { + const u32 v1 = *(const u32 *)(p1+clen); + const u32 v2 = *(const u32 *)(p2+clen); + const u32 x = v1 ^ v2; + if (x) + return clen + (u32)(__builtin_ctz(x) >> 3); + + clen += sizeof(u32); + } + + while ((clen < max) && (p1[clen] == p2[clen])) + clen++; + return clen; +} + +static double logger_t0 = 0.0; + +__attribute__((constructor)) + static void +logger_init(void) +{ + logger_t0 = time_sec(); +} + +__attribute__ ((format (printf, 2, 3))) + void +logger_printf(const int fd, const char * const fmt, ...) 
+{ + char buf[4096]; + va_list ap; + va_start(ap, fmt); + vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + dprintf(fd, "%010.3lf %08x %s", time_diff_sec(logger_t0), crc32c_u64(0x12345678, (u64)pthread_self()), buf); +} +// }}} misc + +// astk {{{ +// atomic stack +struct acell { struct acell * next; }; + +// extract ptr from m value + static inline struct acell * +astk_ptr(const u64 m) +{ + return (struct acell *)(m >> 16); +} + +// calculate the new magic + static inline u64 +astk_m1(const u64 m0, struct acell * const ptr) +{ + return ((m0 + 1) & 0xfffflu) | (((u64)ptr) << 16); +} + +// calculate the new magic + static inline u64 +astk_m1_unsafe(struct acell * const ptr) +{ + return ((u64)ptr) << 16; +} + + static bool +astk_try_push(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1(m0, first); + return atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_RELEASE, MO_RELAXED); +} + + static void +astk_push_safe(au64 * const pmagic, struct acell * const first, struct acell * const last) +{ + while (!astk_try_push(pmagic, first, last)); +} + + static void +astk_push_unsafe(au64 * const pmagic, struct acell * const first, + struct acell * const last) +{ + const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); + last->next = astk_ptr(m0); + const u64 m1 = astk_m1_unsafe(first); + atomic_store_explicit(pmagic, m1, MO_RELAXED); +} + +//// can fail for two reasons: (1) NULL: no available object; (2) ~0lu: contention +// static void * +//astk_try_pop(au64 * const pmagic) +//{ +// u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME); +// struct acell * const ret = astk_ptr(m0); +// if (ret == NULL) +// return NULL; +// +// const u64 m1 = astk_m1(m0, ret->next); +// if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED)) +// return ret; +// else +// return (void *)(~0lu); +//} + + static void * 
astk_pop_safe(au64 * const pmagic)
{
  // Lock-free pop: retry the CAS until it succeeds or the stack is empty.
  // The 16-bit sequence number in the magic (bumped on every push, see
  // astk_m1) protects the CAS against ABA.
  do {
    u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
    struct acell * const ret = astk_ptr(m0);
    if (ret == NULL)
      return NULL; // empty stack

    const u64 m1 = astk_m1(m0, ret->next);
    if (atomic_compare_exchange_weak_explicit(pmagic, &m0, m1, MO_ACQUIRE, MO_RELAXED))
      return ret;
  } while (true);
}

// Single-threaded pop: plain load/store, no CAS and no sequence bump.
  static void *
astk_pop_unsafe(au64 * const pmagic)
{
  const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  struct acell * const ret = astk_ptr(m0);
  if (ret == NULL)
    return NULL;

  const u64 m1 = astk_m1_unsafe(ret->next);
  atomic_store_explicit(pmagic, m1, MO_RELAXED);
  return (void *)ret;
}

// Return the current top of the stack without removing it (single-threaded).
  static void *
astk_peek_unsafe(au64 * const pmagic)
{
  const u64 m0 = atomic_load_explicit(pmagic, MO_CONSUME);
  return astk_ptr(m0);
}
// }}} astk

// slab {{{
// Slab allocator: fixed-size objects carved out of large blocks, with a
// lock-free atomic stack (magic) of free objects. Cache-line layout is
// enforced by the static_assert below the struct.
#define SLAB_OBJ0_OFFSET ((64))
struct slab {
  au64 magic; // hi 48: ptr, lo 16: seq
  u64 padding1[7];

  // 2nd line
  struct acell * head_active; // list of blocks in use or in magic
  struct acell * head_backup; // list of unused full blocks
  u64 nr_ready; // UNSAFE only!
number of objects under magic + u64 padding2[5]; + + // 3rd line const + u64 obj_size; // const: aligned size of each object + u64 blk_size; // const: size of each memory block + u64 objs_per_slab; // const: number of objects in a slab + u64 obj0_offset; // const: offset of the first object in a block + u64 padding3[4]; + + // 4th line + union { + mutex lock; + u64 padding4[8]; + }; +}; +static_assert(sizeof(struct slab) == 256, "sizeof(struct slab) != 256"); + + static void +slab_add(struct slab * const slab, struct acell * const blk, const bool is_safe) +{ + // insert into head_active + blk->next = slab->head_active; + slab->head_active = blk; + + u8 * const base = ((u8 *)blk) + slab->obj0_offset; + struct acell * iter = (typeof(iter))base; // [0] + for (u64 i = 1; i < slab->objs_per_slab; i++) { + struct acell * const next = (typeof(next))(base + (i * slab->obj_size)); + iter->next = next; + iter = next; + } + + // base points to the first block; iter points to the last block + if (is_safe) { // other threads can poll magic + astk_push_safe(&slab->magic, (struct acell *)base, iter); + } else { // unsafe + astk_push_unsafe(&slab->magic, (struct acell *)base, iter); + slab->nr_ready += slab->objs_per_slab; + } +} + +// critical section; call with lock + static bool +slab_expand(struct slab * const slab, const bool is_safe) +{ + struct acell * const old = slab->head_backup; + if (old) { // pop old from backup and add + slab->head_backup = old->next; + slab_add(slab, old, is_safe); + } else { // more core + size_t blk_size; + struct acell * const new = pages_alloc_best(slab->blk_size, true, &blk_size); + (void)blk_size; + if (new == NULL) + return false; + + slab_add(slab, new, is_safe); + } + return true; +} + +// return 0 on failure; otherwise, obj0_offset + static u64 +slab_check_sizes(const u64 obj_size, const u64 blk_size) +{ + // obj must be non-zero and 8-byte aligned + // blk must be at least of page size and power of 2 + if ((!obj_size) || (obj_size % 8lu) 
|| (blk_size < 4096lu) || (blk_size & (blk_size - 1))) + return 0; + + // each slab should have at least one object + const u64 obj0_offset = (obj_size & (obj_size - 1)) ? SLAB_OBJ0_OFFSET : obj_size; + if (obj0_offset >= blk_size || (blk_size - obj0_offset) < obj_size) + return 0; + + return obj0_offset; +} + + static void +slab_init_internal(struct slab * const slab, const u64 obj_size, const u64 blk_size, const u64 obj0_offset) +{ + memset(slab, 0, sizeof(*slab)); + slab->obj_size = obj_size; + slab->blk_size = blk_size; + slab->objs_per_slab = (blk_size - obj0_offset) / obj_size; + debug_assert(slab->objs_per_slab); // >= 1 + slab->obj0_offset = obj0_offset; + mutex_init(&(slab->lock)); +} + + struct slab * +slab_create(const u64 obj_size, const u64 blk_size) +{ + const u64 obj0_offset = slab_check_sizes(obj_size, blk_size); + if (!obj0_offset) + return NULL; + + struct slab * const slab = yalloc(sizeof(*slab)); + if (slab == NULL) + return NULL; + + slab_init_internal(slab, obj_size, blk_size, obj0_offset); + return slab; +} + +// unsafe + bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr) +{ + while (slab->nr_ready < nr) + if (!slab_expand(slab, false)) + return false; + return true; +} + + void * +slab_alloc_unsafe(struct slab * const slab) +{ + void * ret = astk_pop_unsafe(&slab->magic); + if (ret == NULL) { + if (!slab_expand(slab, false)) + return NULL; + ret = astk_pop_unsafe(&slab->magic); + } + debug_assert(ret); + slab->nr_ready--; + return ret; +} + + void * +slab_alloc_safe(struct slab * const slab) +{ + void * ret = astk_pop_safe(&slab->magic); + if (ret) + return ret; + + mutex_lock(&slab->lock); + do { + ret = astk_pop_safe(&slab->magic); // may already have new objs + if (ret) + break; + if (!slab_expand(slab, true)) + break; + } while (true); + mutex_unlock(&slab->lock); + return ret; +} + + void +slab_free_unsafe(struct slab * const slab, void * const ptr) +{ + debug_assert(ptr); + astk_push_unsafe(&slab->magic, ptr, ptr); + 
slab->nr_ready++; +} + + void +slab_free_safe(struct slab * const slab, void * const ptr) +{ + astk_push_safe(&slab->magic, ptr, ptr); +} + +// UNSAFE + void +slab_free_all(struct slab * const slab) +{ + slab->magic = 0; + slab->nr_ready = 0; // backup does not count + + if (slab->head_active) { + struct acell * iter = slab->head_active; + while (iter->next) + iter = iter->next; + // now iter points to the last blk + iter->next = slab->head_backup; // active..backup + slab->head_backup = slab->head_active; // backup gets all + slab->head_active = NULL; // empty active + } +} + +// unsafe + u64 +slab_get_nalloc(struct slab * const slab) +{ + struct acell * iter = slab->head_active; + u64 n = 0; + while (iter) { + n++; + iter = iter->next; + } + n *= slab->objs_per_slab; + + iter = astk_peek_unsafe(&slab->magic); + while (iter) { + n--; + iter = iter->next; + } + return n; +} + + static void +slab_deinit(struct slab * const slab) +{ + debug_assert(slab); + struct acell * iter = slab->head_active; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } + iter = slab->head_backup; + while (iter) { + struct acell * const next = iter->next; + pages_unmap(iter, slab->blk_size); + iter = next; + } +} + + void +slab_destroy(struct slab * const slab) +{ + slab_deinit(slab); + free(slab); +} +// }}} slab + +// string {{{ +static union { u16 v16; u8 v8[2]; } strdec_table[100]; + +__attribute__((constructor)) + static void +strdec_init(void) +{ + for (u8 i = 0; i < 100; i++) { + const u8 hi = (typeof(hi))('0' + (i / 10)); + const u8 lo = (typeof(lo))('0' + (i % 10)); + strdec_table[i].v8[0] = hi; + strdec_table[i].v8[1] = lo; + } +} + +// output 10 bytes + void +strdec_32(void * const out, const u32 v) +{ + u32 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 4; i <= 4; i--) { // x5 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100u; + } +} + +// output 20 bytes + void +strdec_64(void * const out, const u64 
v) +{ + u64 vv = v; + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 9; i <= 9; i--) { // x10 + ptr[i] = strdec_table[vv % 100].v16; + vv /= 100; + } +} + +static const u8 strhex_table_16[16] = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + +#if defined(__x86_64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + + const m128 tmp = _mm_set_epi64x((s64)(v>>4), (s64)v); // mm want s64 + const m128 hilo = _mm_and_si128(tmp, _mm_set1_epi8(0xf)); + const m128 bin = _mm_shuffle_epi8(hilo, _mm_load_si128((void *)mask1)); + const m128 str = _mm_shuffle_epi8(_mm_load_si128((const void *)strhex_table_16), bin); + return str; +} +#elif defined(__aarch64__) + static inline m128 +strhex_helper(const u64 v) +{ + static const u8 mask1[16] = {15,7,14,6,13,5,12,4,11,3,10,2,9,1,8,0}; + u64 v2[2] = {v, v>>4}; + const m128 tmp = vld1q_u8((u8 *)v2); + const m128 hilo = vandq_u8(tmp, vdupq_n_u8(0xf)); + const m128 bin = vqtbl1q_u8(hilo, vld1q_u8(mask1)); + const m128 str = vqtbl1q_u8(vld1q_u8(strhex_table_16), bin); + return str; +} +#else +static u16 strhex_table_256[256]; + +__attribute__((constructor)) + static void +strhex_init(void) +{ + for (u64 i = 0; i < 256; i++) + strhex_table_256[i] = (((u16)strhex_table_16[i & 0xf]) << 8) | (strhex_table_16[i>>4]); +} +#endif // __x86_64__ + +// output 8 bytes + void +strhex_32(void * const out, u32 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper((u64)v); + _mm_storel_epi64(out, _mm_srli_si128(str, 8)); +#elif defined(__aarch64__) + const m128 str = strhex_helper((u64)v); + vst1q_lane_u64(out, vreinterpretq_u64_u8(str), 1); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 4; i++) { + ptr[3-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// output 16 bytes // buffer must be aligned by 16B + void +strhex_64(void * const out, u64 v) +{ +#if defined(__x86_64__) + const m128 str = strhex_helper(v); 
+ _mm_storeu_si128(out, str); +#elif defined(__aarch64__) + const m128 str = strhex_helper(v); + vst1q_u8(out, str); +#else + u16 * const ptr = (typeof(ptr))out; + for (u64 i = 0; i < 8; i++) { + ptr[7-i] = strhex_table_256[v & 0xff]; + v >>= 8; + } +#endif +} + +// string to u64 + inline u64 +a2u64(const void * const str) +{ + return strtoull(str, NULL, 10); +} + + inline u32 +a2u32(const void * const str) +{ + return (u32)strtoull(str, NULL, 10); +} + + inline s64 +a2s64(const void * const str) +{ + return strtoll(str, NULL, 10); +} + + inline s32 +a2s32(const void * const str) +{ + return (s32)strtoll(str, NULL, 10); +} + + void +str_print_hex(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 3; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + buf[i*3] = ' '; + buf[i*3+1] = strhex_table_16[ptr[i]>>4]; + buf[i*3+2] = strhex_table_16[ptr[i] & 0xf]; + } + fwrite(buf, strsz, 1, out); + free(buf); +} + + void +str_print_dec(FILE * const out, const void * const data, const u32 len) +{ + const u8 * const ptr = data; + const u32 strsz = len * 4; + u8 * const buf = malloc(strsz); + for (u32 i = 0; i < len; i++) { + const u8 v = ptr[i]; + buf[i*4] = ' '; + const u8 v1 = v / 100u; + const u8 v23 = v % 100u; + buf[i*4+1] = (u8)'0' + v1; + buf[i*4+2] = (u8)'0' + (v23 / 10u); + buf[i*4+3] = (u8)'0' + (v23 % 10u); + } + fwrite(buf, strsz, 1, out); + free(buf); +} + +// returns a NULL-terminated list of string tokens. +// After use you only need to free the returned pointer (char **). 
+ char ** +strtoks(const char * const str, const char * const delim) +{ + if (str == NULL) + return NULL; + size_t nptr_alloc = 32; + char ** tokens = malloc(sizeof(tokens[0]) * nptr_alloc); + if (tokens == NULL) + return NULL; + const size_t bufsize = strlen(str) + 1; + char * const buf = malloc(bufsize); + if (buf == NULL) + goto fail_buf; + + memcpy(buf, str, bufsize); + char * saveptr = NULL; + char * tok = strtok_r(buf, delim, &saveptr); + size_t ntoks = 0; + while (tok) { + if (ntoks >= nptr_alloc) { + nptr_alloc += 32; + char ** const r = realloc(tokens, sizeof(tokens[0]) * nptr_alloc); + if (r == NULL) + goto fail_realloc; + + tokens = r; + } + tokens[ntoks] = tok; + ntoks++; + tok = strtok_r(NULL, delim, &saveptr); + } + tokens[ntoks] = NULL; + const size_t nptr = ntoks + 1; // append a NULL + const size_t rsize = (sizeof(tokens[0]) * nptr) + bufsize; + char ** const r = realloc(tokens, rsize); + if (r == NULL) + goto fail_realloc; + + tokens = r; + char * const dest = (char *)(&(tokens[nptr])); + memcpy(dest, buf, bufsize); + for (u64 i = 0; i < ntoks; i++) + tokens[i] += (dest - buf); + + free(buf); + return tokens; + +fail_realloc: + free(buf); +fail_buf: + free(tokens); + return NULL; +} + + u32 +strtoks_count(const char * const * const toks) +{ + if (!toks) + return 0; + u32 n = 0; + while (toks[n++]); + return n; +} +// }}} string + +// qsbr {{{ +#define QSBR_STATES_NR ((23)) // shard capacity; valid values are 3*8-1 == 23; 5*8-1 == 39; 7*8-1 == 55 +#define QSBR_SHARD_BITS ((5)) // 2^n shards +#define QSBR_SHARD_NR (((1u) << QSBR_SHARD_BITS)) +#define QSBR_SHARD_MASK ((QSBR_SHARD_NR - 1)) + +struct qsbr_ref_real { +#ifdef QSBR_DEBUG + pthread_t ptid; // 8 + u32 status; // 4 + u32 nbt; // 4 (number of backtrace frames) +#define QSBR_DEBUG_BTNR ((14)) + void * backtrace[QSBR_DEBUG_BTNR]; +#endif + au64 qstate; // user updates it + au64 * pptr; // internal only + struct qsbr_ref_real * park; +}; + +static_assert(sizeof(struct qsbr_ref) == sizeof(struct 
qsbr_ref_real), "sizeof qsbr_ref"); + +// Quiescent-State-Based Reclamation RCU +struct qsbr { + struct qsbr_ref_real target; + u64 padding0[5]; + struct qshard { + au64 bitmap; + au64 ptrs[QSBR_STATES_NR]; + } shards[QSBR_SHARD_NR]; +}; + + struct qsbr * +qsbr_create(void) +{ + struct qsbr * const q = yalloc(sizeof(*q)); + memset(q, 0, sizeof(*q)); + return q; +} + + static inline struct qshard * +qsbr_shard(struct qsbr * const q, void * const ptr) +{ + const u32 sid = crc32c_u64(0, (u64)ptr) & QSBR_SHARD_MASK; + debug_assert(sid < QSBR_SHARD_NR); + return &(q->shards[sid]); +} + + static inline void +qsbr_write_qstate(struct qsbr_ref_real * const ref, const u64 v) +{ + atomic_store_explicit(&ref->qstate, v, MO_RELAXED); +} + + bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + qsbr_write_qstate(ref, 0); + + do { + u64 bits = atomic_load_explicit(&shard->bitmap, MO_CONSUME); + const u32 pos = (u32)__builtin_ctzl(~bits); + if (unlikely(pos >= QSBR_STATES_NR)) + return false; + + const u64 bits1 = bits | (1lu << pos); + if (atomic_compare_exchange_weak_explicit(&shard->bitmap, &bits, bits1, MO_ACQUIRE, MO_RELAXED)) { + atomic_store_explicit(&shard->ptrs[pos], (u64)ref, MO_RELAXED); + //shard->ptrs[pos] = ref; + + ref->pptr = &(shard->ptrs[pos]); + ref->park = &q->target; +#ifdef QSBR_DEBUG + ref->ptid = (u64)pthread_self(); + ref->tid = 0; + ref->status = 1; + ref->nbt = backtrace(ref->backtrace, QSBR_DEBUG_BTNR); +#endif + return true; + } + } while (true); +} + + void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + struct qshard * const shard = qsbr_shard(q, ref); + const u32 pos = (u32)(ref->pptr - shard->ptrs); + debug_assert(pos < QSBR_STATES_NR); + debug_assert(shard->bitmap & (1lu << pos)); + + atomic_store_explicit(&shard->ptrs[pos], 
(u64)(&q->target), MO_RELAXED); + //shard->ptrs[pos] = &q->target; + (void)atomic_fetch_and_explicit(&shard->bitmap, ~(1lu << pos), MO_RELEASE); +#ifdef QSBR_DEBUG + ref->tid = 0; + ref->ptid = 0; + ref->status = 0xffff; // unregistered + ref->nbt = 0; +#endif + ref->pptr = NULL; + // wait for qsbr_wait to leave if it's working on the shard + while (atomic_load_explicit(&shard->bitmap, MO_CONSUME) >> 63) + cpu_pause(); +} + + inline void +qsbr_update(struct qsbr_ref * const qref, const u64 v) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + debug_assert((*ref->pptr) == (u64)ref); // must be unparked + // rcu update does not require release or acquire order + qsbr_write_qstate(ref, v); +} + + inline void +qsbr_park(struct qsbr_ref * const qref) +{ + cpu_cfence(); + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref->park, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xfff; // parked +#endif +} + + inline void +qsbr_resume(struct qsbr_ref * const qref) +{ + struct qsbr_ref_real * const ref = (typeof(ref))qref; + atomic_store_explicit(ref->pptr, (u64)ref, MO_RELAXED); +#ifdef QSBR_DEBUG + ref->status = 0xf; // resumed +#endif + cpu_cfence(); +} + +// waiters needs external synchronization + void +qsbr_wait(struct qsbr * const q, const u64 target) +{ + cpu_cfence(); + qsbr_write_qstate(&q->target, target); + u64 cbits = 0; // check-bits; each bit corresponds to a shard + u64 bms[QSBR_SHARD_NR]; // copy of all bitmap + // take an unsafe snapshot of active users + for (u32 i = 0; i < QSBR_SHARD_NR; i++) { + bms[i] = atomic_load_explicit(&q->shards[i].bitmap, MO_CONSUME); + if (bms[i]) + cbits |= (1lu << i); // set to 1 if [i] has ptrs + } + + while (cbits) { + for (u64 ctmp = cbits; ctmp; ctmp &= (ctmp - 1)) { + // shard id + const u32 i = (u32)__builtin_ctzl(ctmp); + struct qshard * const shard = &(q->shards[i]); + const u64 bits1 = atomic_fetch_or_explicit(&(shard->bitmap), 1lu << 63, MO_ACQUIRE); + for (u64 
bits = bms[i]; bits; bits &= (bits - 1)) { + const u64 bit = bits & -bits; // extract lowest bit + if ((bits1 & bit) == 0) { + bms[i] &= ~bit; + } else { + au64 * pptr = &(shard->ptrs[__builtin_ctzl(bit)]); + struct qsbr_ref_real * const ptr = (typeof(ptr))atomic_load_explicit(pptr, MO_RELAXED); + if (atomic_load_explicit(&(ptr->qstate), MO_CONSUME) == target) + bms[i] &= ~bit; + } + } + (void)atomic_fetch_and_explicit(&(shard->bitmap), ~(1lu << 63), MO_RELEASE); + if (bms[i] == 0) + cbits &= ~(1lu << i); + } +#if defined(CORR) + corr_yield(); +#endif + } + debug_assert(cbits == 0); + cpu_cfence(); +} + + void +qsbr_destroy(struct qsbr * const q) +{ + if (q) + free(q); +} +#undef QSBR_STATES_NR +#undef QSBR_BITMAP_NR +// }}} qsbr + +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/lib.h b/test/MassTrie-beta/wormhole/lib.h new file mode 100644 index 00000000..40a2f40d --- /dev/null +++ b/test/MassTrie-beta/wormhole/lib.h @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. 
+ */ +#pragma once + +// includes {{{ +// C headers +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// POSIX headers +#include +#include +#include + +// Linux headers +#include +#include +#include +#include + +// SIMD +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#include +#endif +// }}} includes + +#ifdef __cplusplus +extern "C" { +#endif + +// types {{{ +#ifndef typeof +#define typeof __typeof__ +#endif +#ifndef asm +#define asm __asm__ +#endif +typedef char s8; +typedef short s16; +typedef int s32; +typedef long s64; +typedef __int128_t s128; +static_assert(sizeof(s8) == 1, "sizeof(s8)"); +static_assert(sizeof(s16) == 2, "sizeof(s16)"); +static_assert(sizeof(s32) == 4, "sizeof(s32)"); +static_assert(sizeof(s64) == 8, "sizeof(s64)"); +static_assert(sizeof(s128) == 16, "sizeof(s128)"); + +typedef unsigned char u8; +typedef unsigned short u16; +typedef unsigned int u32; +typedef unsigned long u64; +typedef __uint128_t u128; +static_assert(sizeof(u8) == 1, "sizeof(u8)"); +static_assert(sizeof(u16) == 2, "sizeof(u16)"); +static_assert(sizeof(u32) == 4, "sizeof(u32)"); +static_assert(sizeof(u64) == 8, "sizeof(u64)"); +static_assert(sizeof(u128) == 16, "sizeof(u128)"); + +#if defined(__x86_64__) +typedef __m128i m128; +#if defined(__AVX2__) +typedef __m256i m256; +#endif // __AVX2__ +#if defined(__AVX512F__) +typedef __m512i m512; +#endif // __AVX512F__ +#elif defined(__aarch64__) +typedef uint8x16_t m128; +#else +#error Need x86_64 or AArch64. 
+#endif +// }}} types + +// defs {{{ +#define likely(____x____) __builtin_expect(____x____, 1) +#define unlikely(____x____) __builtin_expect(____x____, 0) + +// ansi colors +// 3X:fg; 4X:bg; 9X:light fg; 10X:light bg; +// X can be one of the following colors: +// 0:black; 1:red; 2:green; 3:yellow; +// 4:blue; 5:magenta; 6:cyan; 7:white; +#define TERMCLR(____code____) "\x1b[" #____code____ "m" +// }}} defs + +// const {{{ +#define PGBITS ((12)) +#define PGSZ ((1lu << PGBITS)) +// }}} const + +// math {{{ + extern u64 +mhash64(const u64 v); + + extern u32 +mhash32(const u32 v); + + extern u64 +gcd64(u64 a, u64 b); +// }}} math + +// random {{{ + extern u64 +random_u64(void); + + extern void +srandom_u64(const u64 seed); + + extern double +random_double(void); +// }}} random + +// timing {{{ + extern u64 +time_nsec(void); + + extern double +time_sec(void); + + extern u64 +time_diff_nsec(const u64 last); + + extern double +time_diff_sec(const double last); + + extern void +time_stamp(char * str, const size_t size); + + extern void +time_stamp2(char * str, const size_t size); +// }}} timing + +// cpucache {{{ + extern void +cpu_pause(void); + + extern void +cpu_mfence(void); + + extern void +cpu_cfence(void); + + extern void +cpu_prefetch0(const void * const ptr); + + extern void +cpu_prefetch1(const void * const ptr); + + extern void +cpu_prefetch2(const void * const ptr); + + extern void +cpu_prefetch3(const void * const ptr); + + extern void +cpu_prefetchw(const void * const ptr); +// }}} cpucache + +// crc32c {{{ + extern u32 +crc32c_u8(const u32 crc, const u8 v); + + extern u32 +crc32c_u16(const u32 crc, const u16 v); + + extern u32 +crc32c_u32(const u32 crc, const u32 v); + + extern u32 +crc32c_u64(const u32 crc, const u64 v); + +// 1 <= nr <= 3 + extern u32 +crc32c_inc_123(const u8 * buf, u32 nr, u32 crc); + +// nr % 4 == 0 + extern u32 +crc32c_inc_x4(const u8 * buf, u32 nr, u32 crc); + + extern u32 +crc32c_inc(const u8 * buf, u32 nr, u32 crc); +// }}} crc32c + 
+// debug {{{ + extern void +debug_break(void); + + extern void +debug_backtrace(void); + + extern void +watch_u64_usr1(u64 * const ptr); + +#ifndef NDEBUG + extern void +debug_assert(const bool v); +#else +#define debug_assert(expr) ((void)0) +#endif + +__attribute__((noreturn)) + extern void +debug_die(void); + +__attribute__((noreturn)) + extern void +debug_die_perror(void); + + extern void +debug_dump_maps(FILE * const out); + + extern bool +debug_perf_switch(void); +// }}} debug + +// mm {{{ +#ifdef ALLOCFAIL + extern bool +alloc_fail(void); +#endif + + extern void * +xalloc(const size_t align, const size_t size); + + extern void * +yalloc(const size_t size); + + extern void ** +malloc_2d(const size_t nr, const size_t size); + + extern void ** +calloc_2d(const size_t nr, const size_t size); + + extern void +pages_unmap(void * const ptr, const size_t size); + + extern void +pages_lock(void * const ptr, const size_t size); + +/* hugepages */ +// force posix allocators: -DVALGRIND_MEMCHECK + extern void * +pages_alloc_4kb(const size_t nr_4kb); + + extern void * +pages_alloc_2mb(const size_t nr_2mb); + + extern void * +pages_alloc_1gb(const size_t nr_1gb); + + extern void * +pages_alloc_best(const size_t size, const bool try_1gb, u64 * const size_out); +// }}} mm + +// process/thread {{{ + extern void +thread_get_name(const pthread_t pt, char * const name, const size_t len); + + extern void +thread_set_name(const pthread_t pt, const char * const name); + + extern long +process_get_rss(void); + + extern u32 +process_affinity_count(void); + + extern u32 +process_getaffinity_list(const u32 max, u32 * const cores); + + extern void +thread_setaffinity_list(const u32 nr, const u32 * const list); + + extern void +thread_pin(const u32 cpu); + + extern u64 +process_cpu_time_usec(void); + +// if args == true, argx is void ** +// if args == false, argx is void * + extern u64 +thread_fork_join(u32 nr, void *(*func) (void *), const bool args, void * const argx); + + extern int 
+thread_create_at(const u32 cpu, pthread_t * const thread, void *(*start_routine) (void *), void * const arg); +// }}} process/thread + +// locking {{{ +typedef union { + u32 opaque; +} spinlock; + + extern void +spinlock_init(spinlock * const lock); + + extern void +spinlock_lock(spinlock * const lock); + + extern bool +spinlock_trylock(spinlock * const lock); + + extern void +spinlock_unlock(spinlock * const lock); + +typedef union { + u32 opaque; +} rwlock; + + extern void +rwlock_init(rwlock * const lock); + + extern bool +rwlock_trylock_read(rwlock * const lock); + +// low-priority reader-lock; use with trylock_write_hp + extern bool +rwlock_trylock_read_lp(rwlock * const lock); + + extern bool +rwlock_trylock_read_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_read(rwlock * const lock); + + extern void +rwlock_unlock_read(rwlock * const lock); + + extern bool +rwlock_trylock_write(rwlock * const lock); + + extern bool +rwlock_trylock_write_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write(rwlock * const lock); + +// writer has higher priority; new readers are blocked + extern bool +rwlock_trylock_write_hp(rwlock * const lock); + + extern bool +rwlock_trylock_write_hp_nr(rwlock * const lock, u16 nr); + + extern void +rwlock_lock_write_hp(rwlock * const lock); + + extern void +rwlock_unlock_write(rwlock * const lock); + + extern void +rwlock_write_to_read(rwlock * const lock); + +typedef union { + u64 opqaue[8]; +} mutex; + + extern void +mutex_init(mutex * const lock); + + extern void +mutex_lock(mutex * const lock); + + extern bool +mutex_trylock(mutex * const lock); + + extern void +mutex_unlock(mutex * const lock); + + extern void +mutex_deinit(mutex * const lock); +// }}} locking + +// coroutine {{{ +extern u64 co_switch_stack(u64 * const saversp, const u64 newrsp, const u64 retval); + +struct co; + + extern struct co * +co_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern void 
+co_reuse(struct co * const co, void * func, void * priv, u64 * const host); + + extern struct co * +co_fork(void * func, void * priv); + + extern void * +co_priv(void); + + extern u64 +co_enter(struct co * const to, const u64 retval); + + extern u64 +co_switch_to(struct co * const to, const u64 retval); + + extern u64 +co_back(const u64 retval); + + extern void +co_exit(const u64 retval); + + extern bool +co_valid(struct co * const co); + + extern struct co * +co_self(void); + + extern void +co_destroy(struct co * const co); + +struct corr; + + extern struct corr * +corr_create(const u64 stacksize, void * func, void * priv, u64 * const host); + + extern struct corr * +corr_link(const u64 stacksize, void * func, void * priv, struct corr * const prev); + + extern void +corr_reuse(struct corr * const co, void * func, void * priv, u64 * const host); + + extern void +corr_relink(struct corr * const co, void * func, void * priv, struct corr * const prev); + + extern void +corr_enter(struct corr * const co); + + extern void +corr_yield(void); + + extern void +corr_exit(void); + + extern void +corr_destroy(struct corr * const co); +// }}} coroutine + +// bits {{{ + extern u32 +bits_reverse_u32(const u32 v); + + extern u64 +bits_reverse_u64(const u64 v); + + extern u64 +bits_rotl_u64(const u64 v, const u8 n); + + extern u64 +bits_rotr_u64(const u64 v, const u8 n); + + extern u32 +bits_rotl_u32(const u32 v, const u8 n); + + extern u32 +bits_rotr_u32(const u32 v, const u8 n); + + extern u64 +bits_p2_up_u64(const u64 v); + + extern u32 +bits_p2_up_u32(const u32 v); + + extern u64 +bits_p2_down_u64(const u64 v); + + extern u32 +bits_p2_down_u32(const u32 v); + + extern u64 +bits_round_up(const u64 v, const u8 power); + + extern u64 +bits_round_up_a(const u64 v, const u64 a); + + extern u64 +bits_round_down(const u64 v, const u8 power); + + extern u64 +bits_round_down_a(const u64 v, const u64 a); +// }}} bits + +// vi128 {{{ + extern u32 +vi128_estimate_u32(const u32 v); + + 
extern u8 * +vi128_encode_u32(u8 * dst, u32 v); + + extern const u8 * +vi128_decode_u32(const u8 * src, u32 * const out); + + extern u32 +vi128_estimate_u64(const u64 v); + + extern u8 * +vi128_encode_u64(u8 * dst, u64 v); + + extern const u8 * +vi128_decode_u64(const u8 * src, u64 * const out); +// }}} vi128 + +// misc {{{ +// TODO: only works on little endian? +struct entry13 { // what a beautiful name + union { + u16 e1; + struct { // easy for debugging + u64 e1_64:16; + u64 e3:48; + }; + u64 v64; + void * ptr; + }; +}; + +static_assert(sizeof(struct entry13) == 8, "sizeof(entry13) != 8"); + +// directly access read .e1 and .e3 +// directly write .e1 +// use entry13_update() to update the entire entry + + extern struct entry13 +entry13(const u16 e1, const u64 e3); + + extern void +entry13_update_e3(struct entry13 * const e, const u64 e3); + + extern void * +u64_to_ptr(const u64 v); + + extern u64 +ptr_to_u64(const void * const ptr); + + extern size_t +m_usable_size(void * const ptr); + + extern size_t +fdsize(const int fd); + + extern u32 +memlcp(const u8 * const p1, const u8 * const p2, const u32 max); + +__attribute__ ((format (printf, 2, 3))) + extern void +logger_printf(const int fd, const char * const fmt, ...); +// }}} misc + +// slab {{{ +struct slab; + + extern struct slab * +slab_create(const u64 obj_size, const u64 blk_size); + + extern bool +slab_reserve_unsafe(struct slab * const slab, const u64 nr); + + extern void * +slab_alloc_unsafe(struct slab * const slab); + + extern void * +slab_alloc_safe(struct slab * const slab); + + extern void +slab_free_unsafe(struct slab * const slab, void * const ptr); + + extern void +slab_free_safe(struct slab * const slab, void * const ptr); + + extern void +slab_free_all(struct slab * const slab); + + extern u64 +slab_get_nalloc(struct slab * const slab); + + extern void +slab_destroy(struct slab * const slab); +// }}} slab + +// string {{{ +// XXX strdec_ and strhex_ functions does not append the trailing '\0' to 
the output string +// size of out should be >= 10 + extern void +strdec_32(void * const out, const u32 v); + +// size of out should be >= 20 + extern void +strdec_64(void * const out, const u64 v); + +// size of out should be >= 8 + extern void +strhex_32(void * const out, const u32 v); + +// size of out should be >= 16 + extern void +strhex_64(void * const out, const u64 v); + + extern u64 +a2u64(const void * const str); + + extern u32 +a2u32(const void * const str); + + extern s64 +a2s64(const void * const str); + + extern s32 +a2s32(const void * const str); + + extern void +str_print_hex(FILE * const out, const void * const data, const u32 len); + + extern void +str_print_dec(FILE * const out, const void * const data, const u32 len); + +// user should free returned ptr (and nothing else) after use + extern char ** +strtoks(const char * const str, const char * const delim); + + extern u32 +strtoks_count(const char * const * const toks); +// }}} string + +// qsbr {{{ +// QSBR vs EBR (Quiescent-State vs Epoch Based Reclaimation) +// QSBR: readers just use qsbr_update -> qsbr_update -> ... repeatedly +// EBR: readers use qsbr_update -> qsbr_park -> qsbr_resume -> qsbr_update -> ... 
+// The advantage of EBR is qsbr_park can happen much earlier than the next qsbr_update +// The disadvantage is the extra cost, a pair of park/resume is used in every iteration +struct qsbr; +struct qsbr_ref { +#ifdef QSBR_DEBUG + u64 debug[16]; +#endif + u64 opaque[3]; +}; + + extern struct qsbr * +qsbr_create(void); + +// every READER accessing the shared data must first register itself with the qsbr + extern bool +qsbr_register(struct qsbr * const q, struct qsbr_ref * const qref); + + extern void +qsbr_unregister(struct qsbr * const q, struct qsbr_ref * const qref); + +// For READER: mark the beginning of critical section; like rcu_read_lock() + extern void +qsbr_update(struct qsbr_ref * const qref, const u64 v); + +// temporarily stop access the shared data to avoid blocking writers +// READER can use qsbr_park (like rcu_read_unlock()) in conjunction with qsbr_update +// qsbr_park is roughly equivalent to qsbr_unregister, but faster + extern void +qsbr_park(struct qsbr_ref * const qref); + +// undo the effect of qsbr_park; must use it between qsbr_park and qsbr_update +// qsbr_resume is roughly equivalent to qsbr_register, but faster + extern void +qsbr_resume(struct qsbr_ref * const qref); + +// WRITER: wait until all the readers have announced v=target with qsbr_update + extern void +qsbr_wait(struct qsbr * const q, const u64 target); + + extern void +qsbr_destroy(struct qsbr * const q); +// }}} qsbr + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/libwh.so b/test/MassTrie-beta/wormhole/libwh.so new file mode 100644 index 00000000..2ecd7e7e Binary files /dev/null and b/test/MassTrie-beta/wormhole/libwh.so differ diff --git a/test/MassTrie-beta/wormhole/stresstest.c b/test/MassTrie-beta/wormhole/stresstest.c new file mode 100644 index 00000000..93fb6f05 --- /dev/null +++ b/test/MassTrie-beta/wormhole/stresstest.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016-2020 Wu, Xingbo + * + * All rights reserved. 
No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +#include "lib.h" +#include "kv.h" +#include "wh.h" +#include "ctypes.h" + +struct stress_info { + u64 nkeys; + u32 nloader; + u32 nunldr; + u32 nth; + u32 cpt; + bool has_iter; + + au64 seqno; + struct kv ** keys; + + const struct kvmap_api * api; + void * map; + au64 tot; + au64 wfail; + u64 endtime; +}; + + static void * +stress_load_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + void * const ref = kvmap_ref(si->api, si->map); + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nloader * seq; + const u64 nz = (seq == (si->nloader - 1)) ? si->nkeys : (si->nkeys / si->nloader * (seq + 1)); + //printf("load worker %lu %lu\n", n0, nz-1); + + char * buf = malloc(128); + debug_assert(buf); + u64 * buf64 = (typeof(buf64))buf; + for (u64 i = n0; i < nz; i++) { + const u32 klen = (u32)(random_u64() & 0x3flu) + 8; + const u32 klen8 = (klen + 7) >> 3; + /* + buf64[0] = bswap_64(i); // little endian + for (u64 j = 1; j < klen8; j++) + buf64[j] = random_u64(); + */ + const u64 rkey = random_u64(); + for (u32 j = 0; j < klen8; j++) + buf64[j] = (rkey >> j) & 0x0101010101010101lu; + + si->keys[i] = kv_create(buf, klen, buf, 8); + if (si->keys[i] == NULL) + exit(0); + kvmap_kv_put(si->api, ref, si->keys[i]); + } + free(buf); + kvmap_unref(si->api, ref); + return NULL; +} + + static void * +stress_unload_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + const u64 seq = atomic_fetch_add(&si->seqno, 1); + const u64 n0 = si->nkeys / si->nunldr * seq; + const u64 nz = (seq == (si->nunldr - 1)) ? 
si->nkeys : (si->nkeys / si->nunldr * (seq + 1)); + + void * const ref = kvmap_ref(si->api, si->map); + for (u64 i = n0; i < nz; i++) { + kvmap_kv_del(si->api, ref, si->keys[i]); + free(si->keys[i]); + } + kvmap_unref(si->api, ref); + return NULL; +} + + static void +stress_inp_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + } +} + + static struct kv * +stress_merge_plus1(struct kv * const kv0, void * const priv) +{ + (void)priv; + if (kv0) { // can be NULL + u64 * ptr = kv_vptr(kv0); + ++(*ptr); + return kv0; + } else { + u64 * ptr = kv_vptr((struct kv *)priv); + *ptr = 0; + return priv; + } +} + + static void +stress_func(struct stress_info * const si) +{ + srandom_u64(time_nsec() * time_nsec() / time_nsec()); + const struct kvmap_api * const api = si->api; + void * ref = kvmap_ref(api, si->map); + struct kv * next = si->keys[random_u64() % si->nkeys]; + u64 rnext = random_u64() % si->nkeys; + struct kv * const tmp = malloc(128); + struct kref tmpkref; + struct kvref tmpkvref; + debug_assert(tmp); + void * iter = NULL; + if (api->iter_park) { + iter = api->iter_create(ref); + api->iter_park(iter); + } + u64 wfail1 = 0; + u64 nops = 0; +#define BATCHSIZE ((4096)) + do { + for (u64 i = 0; i < BATCHSIZE; i++) { + // reading kv keys leads to unnecessary cache misses + // use prefetch to minimize overhead on workload generation + struct kv * const key = next; + next = si->keys[rnext]; + cpu_prefetch0(next); + cpu_prefetch0(((u8 *)next) + 64); + rnext = random_u64() % si->nkeys; + cpu_prefetch0(&(si->keys[rnext])); + + // do probe + // customize your benchmark: do a mix of wh operations with switch-cases + const u64 r = random_u64() % 16; + switch (r) { + case 0: + kvmap_kv_probe(api, ref, key); + break; + case 1: + kvmap_kv_get(api, ref, key, tmp); + break; + case 2: + if (si->has_iter) { + if (api->iter_park == NULL) + iter = api->iter_create(ref); + debug_assert(iter); + 
kvmap_kv_iter_seek(api, iter, key); + api->iter_next(iter, tmp); + api->iter_peek(iter, tmp); + api->iter_skip(iter, 2); + // this is unsafe; only reader's lock is acquired + if (api->iter_inp) + api->iter_inp(iter, stress_inp_plus1, NULL); + // kref + if (api->iter_kref) + api->iter_kref(iter, &tmpkref); + // kvref + if (api->iter_kvref) + api->iter_kvref(iter, &tmpkvref); + // done + if (api->iter_park) + api->iter_park(iter); + else + api->iter_destroy(iter); + } + break; + case 3: + if (api->refpark) { + api->park(ref); + api->resume(ref); + } + break; + case 4: + if (api->iter_park) + api->iter_destroy(iter); + (void)kvmap_unref(api, ref); + ref = kvmap_ref(api, si->map); + if (api->iter_park) + iter = api->iter_create(ref); + break; + case 5: + if (api->merge) { + kv_dup2_key(key, tmp); + tmp->vlen = 8; + kvmap_kv_merge(api, ref, key, stress_merge_plus1, tmp); + } + break; + case 6: + if ((random_u64() & 0x7fffu) == 0x22 && api->delr) + (void)kvmap_kv_delr(api, ref, si->keys[rnext], (rnext + 10) < si->nkeys ? 
si->keys[rnext + 10] : NULL); + else + kvmap_kv_probe(api, ref, key); + break; + case 7: case 8: case 9: + (void)kvmap_kv_del(api, ref, key); + break; + case 10: case 11: + if (api->inpw) + kvmap_kv_inpw(api, ref, key, stress_inp_plus1, NULL); + break; + case 12: case 13: case 14: case 15: + if (!kvmap_kv_put(api, ref, key)) + wfail1++; + break; + default: + break; + } + } + nops += BATCHSIZE; + } while (time_nsec() < si->endtime); + si->wfail += wfail1; + if (api->iter_park) + api->iter_destroy(iter); + kvmap_unref(api, ref); + free(tmp); + si->tot += nops; +} + + static void +stress_co_worker(void) +{ + struct stress_info * const si = (typeof(si))co_priv(); + debug_assert(si); + stress_func(si); +} + + static void * +stress_thread_worker(void * ptr) +{ + struct stress_info * const si = (typeof(si))ptr; + if (si->cpt) { + u64 hostrsp = 0; + struct corr * crs[32]; + do { // to work smoothly with ALLOCFAIL + crs[0] = corr_create(16*PGSZ, stress_co_worker, si, &hostrsp); + } while (crs[0] == NULL); + for (u32 j = 1; j < si->cpt; j++) { + do { // to work smoothly with ALLOCFAIL + crs[j] = corr_link(16*PGSZ, stress_co_worker, si, crs[j-1]); + } while (crs[j] == NULL); + } + + corr_enter(crs[0]); + for (u32 j = 0; j < si->cpt; j++) + corr_destroy(crs[j]); + } else { + stress_func(si); + } + return NULL; +} + + int +main(int argc, char ** argv) +{ + struct stress_info si = {.nkeys = 10000, .nloader = 1, .nunldr = 1, .nth = 1, .cpt = 0}; + argc--; + argv++; + int n = -1; + if ((n = kvmap_api_helper(argc, argv, NULL, &si.api, &si.map)) < 0) { + fprintf(stderr, "usage: api ... 
[<#keys>=10000 [<#load-threads>=1 [<#unload-threads>=1 [<#threads>=1 [<#co-per-thread>=0 (disabled) [=1 [=1]]]]]]]\n"); + kvmap_api_helper_message(); + exit(0); + } + argc -= n; + argv += n; + + const bool has_point = si.api->get && si.api->probe && si.api->del && si.api->put; + if (!has_point) { + fprintf(stderr, "api not supported\n"); + exit(0); + } + if (!si.api->inpw) + fprintf(stderr, "api->inpw function not found: ignored\n"); + if (!si.api->merge) + fprintf(stderr, "api->merge function not found: ignored\n"); + if (!si.api->delr) + fprintf(stderr, "api->delr function not found: ignored\n"); + + si.has_iter = si.api->iter_create && si.api->iter_seek && si.api->iter_peek && + si.api->iter_skip && si.api->iter_next && si.api->iter_destroy; + if (!si.has_iter) + fprintf(stderr, "iter functions not complete: ignored\n"); + + // generate keys + if (argc >= 1) + si.nkeys = a2u64(argv[0]); + si.keys = malloc(sizeof(struct kv *) * si.nkeys); + debug_assert(si.keys); + if (argc >= 2) + si.nloader = a2u32(argv[1]); + if (argc >= 3) + si.nunldr = a2u32(argv[2]); + if (argc >= 4) + si.nth = a2u32(argv[3]); + if (argc >= 5) + si.cpt = a2u32(argv[4]); + if (si.cpt > 32) + si.cpt = 32; +#if !defined(CORR) + if (si.cpt > 1) + fprintf(stderr, TERMCLR(35) "CORR not enabled. Compile with -DCORR to enable it.\n" TERMCLR(0)); +#endif // CORR + const u64 nr = (argc >= 6) ? a2u64(argv[5]) : 1; // default 1 + const u64 ne = (argc >= 7) ? 
a2u64(argv[6]) : 1; // default 1 + printf("stresstest: nkeys %lu ldr %u uldr %u th %u cpt %u r %lu e %lu\n", + si.nkeys, si.nloader, si.nunldr, si.nth, si.cpt, nr, ne); + + for (u64 e = 0; e < ne; e++) { + si.seqno = 0; + const u64 dtl = thread_fork_join(si.nloader, (void *)stress_load_worker, false, &si); + printf("load th %u mops %.2lf\n", si.nloader, ((double)si.nkeys) * 1e3 / ((double)dtl)); + if (si.api->fprint) + si.api->fprint(si.map, stdout); + + debug_perf_switch(); + for (u64 r = 0; r < nr; r++) { + si.tot = 0; + si.wfail = 0; + si.endtime = time_nsec() + 2000000000lu; + const u64 dt = thread_fork_join(si.nth, (void *)stress_thread_worker, false, &si); + const double mops = ((double)si.tot) * 1e3 / ((double)dt); + char ts[64]; + time_stamp(ts, 64); + const long rss = process_get_rss(); + printf("%s e %lu r %lu th %u cpt %u tot %lu mops %.2lf rss %ldkB wfail %lu\n", + ts, e, r, si.nth, si.cpt, si.tot, mops, rss, si.wfail); + debug_perf_switch(); + } + si.seqno = 0; + if (si.nunldr == 0) { // use clean + const u64 t0 = time_nsec(); + si.api->clean(si.map); + const u64 dtu = time_diff_nsec(t0); + for (u64 i = 0; i < si.nkeys; i++) + free(si.keys[i]); + printf("clean mops %.2lf\n", ((double)si.nkeys) *1e3 / ((double)dtu)); + } else { + const u64 dtu = thread_fork_join(si.nunldr, (void *)stress_unload_worker, false, &si); + printf("unload th %u mops %.2lf\n", si.nunldr, ((double)si.nkeys) *1e3 / ((double)dtu)); + } + } + + free(si.keys); + si.api->destroy(si.map); + return 0; +} diff --git a/test/MassTrie-beta/wormhole/stresstest.out b/test/MassTrie-beta/wormhole/stresstest.out new file mode 100644 index 00000000..874d359c Binary files /dev/null and b/test/MassTrie-beta/wormhole/stresstest.out differ diff --git a/test/MassTrie-beta/wormhole/wh.c b/test/MassTrie-beta/wormhole/wh.c new file mode 100644 index 00000000..1d31e231 --- /dev/null +++ b/test/MassTrie-beta/wormhole/wh.c @@ -0,0 +1,3876 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights 
reserved. No warranty, explicit or implicit, provided. + */ +#define _GNU_SOURCE + +// headers {{{ +#include // static_assert +#include "lib.h" +#include "ctypes.h" +#include "kv.h" +#include "wh.h" +// }}} headers + +// def {{{ +#define WH_HMAPINIT_SIZE ((1u << 12)) // 10: 16KB/64KB 12: 64KB/256KB 14: 256KB/1MB +#define WH_SLABMETA_SIZE ((1lu << 21)) // 2MB + +#ifndef HEAPCHECKING +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB is ok +#else +#define WH_SLABLEAF_SIZE ((1lu << 21)) // 2MB for valgrind +#endif + +#define WH_KPN ((128u)) // keys per node; power of 2 +#define WH_HDIV (((1u << 16)) / WH_KPN) +#define WH_MID ((WH_KPN >> 1)) // ideal cut point for split, the closer the better +#define WH_BKT_NR ((8)) +#define WH_KPN2 ((WH_KPN + WH_KPN)) + +#define WH_KPN_MRG (((WH_KPN + WH_MID) >> 1 )) // 3/4 + +// FO is fixed at 256. Don't change it +#define WH_FO ((256u)) // index fan-out +// number of bits in a bitmap +#define WH_BMNR ((WH_FO >> 6)) // number of u64 +// }}} def + +// struct {{{ +struct wormmeta { + struct entry13 k13; // kref+klen + struct entry13 l13; // lmost+bitmin+bitmax + struct entry13 r13; // rmost+hash32_lo + struct entry13 p13; // lpath+hash32_hi + u64 bitmap[0]; // 4 if bitmin != bitmax +}; +static_assert(sizeof(struct wormmeta) == 32, "sizeof(wormmeta) != 32"); + +struct wormkv64 { u64 key; void * ptr; }; // u64 keys (whu64) + +struct wormleaf { + // first line + rwlock leaflock; + spinlock sortlock; // to protect the seemingly "read-only" iter_seek + au64 lv; // version (dont use the first u64) + struct wormleaf * prev; // prev leaf + struct wormleaf * next; // next leaf + struct kv * anchor; + + u32 nr_sorted; + u32 nr_keys; + u64 reserved[2]; + + struct entry13 hs[WH_KPN]; // sorted by hashes + u8 ss[WH_KPN]; // sorted by keys +}; + +struct wormslot { u16 t[WH_BKT_NR]; }; +static_assert(sizeof(struct wormslot) == 16, "sizeof(wormslot) != 16"); + +struct wormmbkt { struct wormmeta * e[WH_BKT_NR]; }; +static_assert(sizeof(struct wormmbkt) == 
64, "sizeof(wormmbkt) != 64"); + +struct wormhmap { + au64 hv; + struct wormslot * wmap; + struct wormmbkt * pmap; + u32 mask; + u32 maxplen; + u64 msize; + + struct slab * slab1; + struct slab * slab2; + struct kv * pbuf; +}; +static_assert(sizeof(struct wormhmap) == 64, "sizeof(wormhmap) != 64"); + +struct wormhole { + // 1 line + union { + au64 hmap_ptr; // safe + struct wormhmap * hmap; // unsafe + }; + u64 padding0[6]; + struct wormleaf * leaf0; // usually not used + // 1 line + struct kvmap_mm mm; + struct qsbr * qsbr; + struct slab * slab_leaf; + struct kv * pbuf; + u32 leaftype; + u32 padding1; + // 2 lines + struct wormhmap hmap2[2]; + // fifth line + rwlock metalock; + u32 padding2[15]; +}; + +struct wormhole_iter { + struct wormref * ref; // safe-iter only + struct wormhole * map; + struct wormleaf * leaf; + u32 is; +}; + +struct wormref { + struct wormhole * map; + struct qsbr_ref qref; +}; +// }}} struct + +// helpers {{{ + +// meta {{{ + static inline struct kv * +wormmeta_keyref_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->k13.e3); +} + + static inline u16 +wormmeta_klen_load(const struct wormmeta * const meta) +{ + return meta->k13.e1; +} + + static inline struct wormleaf * +wormmeta_lmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->l13.e3 & (~0x3flu)); +} + + static inline u32 +wormmeta_bitmin_load(const struct wormmeta * const meta) +{ + return (u32)(meta->l13.v64 & 0x1fflu); +} + + static inline u32 +wormmeta_bitmax_load(const struct wormmeta * const meta) +{ + return (u32)((meta->l13.v64 >> 9) & 0x1fflu); +} + + static inline u32 +wormmeta_hash32_load(const struct wormmeta * const meta) +{ + return ((u32)meta->r13.e1) | (((u32)meta->p13.e1) << 16); +} + + static inline struct wormleaf * +wormmeta_rmost_load(const struct wormmeta * const meta) +{ + return u64_to_ptr(meta->r13.e3); +} + + static inline struct wormleaf * +wormmeta_lpath_load(const struct wormmeta * const meta) +{ + return 
u64_to_ptr(meta->p13.e3); +} + +// internal + static inline void +wormmeta_lpath_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->p13, ptr_to_u64(leaf)); +} + +// also updates leaf_klen_eq and + static inline void +wormmeta_lmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + const u64 minmax = meta->l13.v64 & 0x3fffflu; + meta->l13.v64 = (((u64)leaf) << 16) | minmax; + + const bool leaf_klen_eq = leaf->anchor->klen == wormmeta_klen_load(meta); + wormmeta_lpath_store(meta, leaf_klen_eq ? leaf : leaf->prev); +} + + static inline void +wormmeta_bitmin_store(struct wormmeta * const meta, const u32 bitmin) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x1fflu)) | bitmin; +} + + static inline void +wormmeta_bitmax_store(struct wormmeta * const meta, const u32 bitmax) +{ + meta->l13.v64 = (meta->l13.v64 & (~0x3fe00lu)) | (bitmax << 9); +} + + static inline void +wormmeta_rmost_store(struct wormmeta * const meta, struct wormleaf * const leaf) +{ + entry13_update_e3(&meta->r13, ptr_to_u64(leaf)); +} + +// for wormmeta_alloc + static void +wormmeta_init(struct wormmeta * const meta, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + keyref->refcnt++; // shared + + const u32 plen = keyref->klen; + debug_assert(plen <= UINT16_MAX); + meta->k13 = entry13((u16)plen, ptr_to_u64(keyref)); + meta->l13.v64 = (ptr_to_u64(lrmost) << 16) | (bit << 9) | bit; + + const u32 hash32 = keyref->hashlo; + meta->r13 = entry13((u16)hash32, ptr_to_u64(lrmost)); + + const bool leaf_klen_eq = alen == plen; + meta->p13 = entry13((u16)(hash32 >> 16), ptr_to_u64(leaf_klen_eq ? 
lrmost : lrmost->prev)); +} +// }}} meta + +// meta-bitmap {{{ + static inline bool +wormmeta_bm_test(const struct wormmeta * const meta, const u32 id) +{ + debug_assert(id < WH_FO); + const u32 bitmin = wormmeta_bitmin_load(meta); + const u32 bitmax = wormmeta_bitmax_load(meta); + if (bitmin == bitmax) { // half node + return bitmin == id; + } else { // full node + return (bool)((meta->bitmap[id >> 6u] >> (id & 0x3fu)) & 1lu); + } +} + +// meta must be a full node + static void +wormmeta_bm_set(struct wormmeta * const meta, const u32 id) +{ + // need to replace meta + u64 * const ptr = &(meta->bitmap[id >> 6u]); + const u64 bit = 1lu << (id & 0x3fu); + if ((*ptr) & bit) + return; + + (*ptr) |= bit; + + // min + if (id < wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, id); + + // max + const u32 oldmax = wormmeta_bitmax_load(meta); + if (oldmax == WH_FO || id > oldmax) + wormmeta_bitmax_store(meta, id); +} + +// find the lowest bit > id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_gt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ~((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + + while (++ix < WH_BMNR) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + (u32)__builtin_ctzl(bits); + } + + return WH_FO; +} + +// find the highest bit that is lower than the id0 +// return WH_FO if not found + static inline u32 +wormmeta_bm_lt(const struct wormmeta * const meta, const u32 id0) +{ + u32 ix = id0 >> 6; + u64 bits = meta->bitmap[ix] & ((1lu << (id0 & 0x3fu)) - 1lu); + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + + while (ix--) { + bits = meta->bitmap[ix]; + if (bits) + return (ix << 6) + 63u - (u32)__builtin_clzl(bits); + } + + return WH_FO; +} + +// meta must be a full node + static inline void +wormmeta_bm_clear(struct wormmeta * const meta, const u32 id) +{ + debug_assert(wormmeta_bitmin_load(meta) < 
wormmeta_bitmax_load(meta)); + meta->bitmap[id >> 6u] &= (~(1lu << (id & 0x3fu))); + + // min + if (id == wormmeta_bitmin_load(meta)) + wormmeta_bitmin_store(meta, wormmeta_bm_gt(meta, id)); + + // max + if (id == wormmeta_bitmax_load(meta)) + wormmeta_bitmax_store(meta, wormmeta_bm_lt(meta, id)); +} +// }}} meta-bitmap + +// key/prefix {{{ + static inline u16 +wormhole_pkey(const u32 hash32) +{ + const u16 pkey0 = ((u16)hash32) ^ ((u16)(hash32 >> 16)); + return pkey0 ? pkey0 : 1; +} + + static inline u32 +wormhole_bswap(const u32 hashlo) +{ + return __builtin_bswap32(hashlo); +} + + static inline bool +wormhole_key_meta_match(const struct kv * const key, const struct wormmeta * const meta) +{ + return (key->klen == wormmeta_klen_load(meta)) + && (!memcmp(key->kv, wormmeta_keyref_load(meta)->kv, key->klen)); +} + +// called by get_kref_slot + static inline bool +wormhole_kref_meta_match(const struct kref * const kref, + const struct wormmeta * const meta) +{ + return (kref->len == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, wormmeta_keyref_load(meta)->kv, kref->len)); +} + +// called from meta_down ... 
get_kref1_slot +// will access rmost, prefetching is effective here + static inline bool +wormhole_kref1_meta_match(const struct kref * const kref, + const struct wormmeta * const meta, const u8 cid) +{ + const u8 * const keybuf = wormmeta_keyref_load(meta)->kv; + const u32 plen = kref->len; + return ((plen + 1) == wormmeta_klen_load(meta)) + && (!memcmp(kref->ptr, keybuf, plen)) + && (keybuf[plen] == cid); +} + +// warning: be careful with buffer overflow + static inline void +wormhole_prefix(struct kv * const pfx, const u32 klen) +{ + pfx->klen = klen; + kv_update_hash(pfx); +} + +// for split + static inline void +wormhole_prefix_inc1(struct kv * const pfx) +{ + pfx->hashlo = crc32c_u8(pfx->hashlo, pfx->kv[pfx->klen]); + pfx->klen++; +} + +// meta_lcp only + static inline void +wormhole_kref_inc(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} + +// meta_lcp only + static inline void +wormhole_kref_inc_123(struct kref * const kref, const u32 len0, + const u32 crc, const u32 inc) +{ + kref->hash32 = crc32c_inc_123(kref->ptr + len0, inc, crc); + kref->len = len0 + inc; +} +// }}} key/prefix + +// alloc {{{ + static inline struct kv * +wormhole_alloc_akey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_akey(struct kv * const akey) +{ + free(akey); +} + + static inline struct kv * +wormhole_alloc_mkey(const size_t klen) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return NULL; +#endif + return malloc(sizeof(struct kv) + klen); +} + + static inline void +wormhole_free_mkey(struct kv * const mkey) +{ + free(mkey); +} + + static struct wormleaf * +wormleaf_alloc(struct wormhole * const map, struct wormleaf * const prev, + struct wormleaf * const next, struct kv * const anchor) +{ + struct wormleaf * const leaf = 
slab_alloc_safe(map->slab_leaf); + if (leaf == NULL) + return NULL; + + rwlock_init(&(leaf->leaflock)); + spinlock_init(&(leaf->sortlock)); + + // keep the old version; new version will be assigned by split functions + //leaf->lv = 0; + + leaf->prev = prev; + leaf->next = next; + leaf->anchor = anchor; + + leaf->nr_keys = 0; + leaf->nr_sorted = 0; + + // hs requires zero init. + memset(leaf->hs, 0, sizeof(leaf->hs[0]) * WH_KPN); + return leaf; +} + + static void +wormleaf_free(struct slab * const slab, struct wormleaf * const leaf) +{ + debug_assert(leaf->leaflock.opaque == 0); + wormhole_free_akey(leaf->anchor); + slab_free_safe(slab, leaf); +} + + static struct wormmeta * +wormmeta_alloc(struct wormhmap * const hmap, struct wormleaf * const lrmost, + struct kv * const keyref, const u32 alen, const u32 bit) +{ + debug_assert(alen <= UINT16_MAX); + debug_assert(lrmost && keyref); + + struct wormmeta * const meta = slab_alloc_unsafe(hmap->slab1); + if (meta == NULL) + return NULL; + + wormmeta_init(meta, lrmost, keyref, alen, bit); + return meta; +} + + static inline bool +wormhole_slab_reserve(struct wormhole * const map, const u32 nr) +{ +#ifdef ALLOCFAIL + if (alloc_fail()) + return false; +#endif + for (u32 i = 0; i < 2; i++) { + if (!(map->hmap2[i].slab1 && map->hmap2[i].slab2)) + continue; + if (!slab_reserve_unsafe(map->hmap2[i].slab1, nr)) + return false; + if (!slab_reserve_unsafe(map->hmap2[i].slab2, nr)) + return false; + } + return true; +} + + static void +wormmeta_keyref_release(struct wormmeta * const meta) +{ + struct kv * const keyref = wormmeta_keyref_load(meta); + debug_assert(keyref->refcnt); + keyref->refcnt--; + if (keyref->refcnt == 0) + wormhole_free_mkey(keyref); +} + + static void +wormmeta_free(struct wormhmap * const hmap, struct wormmeta * const meta) +{ + wormmeta_keyref_release(meta); + slab_free_unsafe(hmap->slab1, meta); +} +// }}} alloc + +// lock {{{ + static void +wormleaf_lock_write(struct wormleaf * const leaf, struct wormref * 
const ref) +{ + if (!rwlock_trylock_write(&(leaf->leaflock))) { + wormhole_park(ref); + rwlock_lock_write(&(leaf->leaflock)); + wormhole_resume(ref); + } +} + + static void +wormleaf_lock_read(struct wormleaf * const leaf, struct wormref * const ref) +{ + if (!rwlock_trylock_read(&(leaf->leaflock))) { + wormhole_park(ref); + rwlock_lock_read(&(leaf->leaflock)); + wormhole_resume(ref); + } +} + + static void +wormleaf_unlock_write(struct wormleaf * const leaf) +{ + rwlock_unlock_write(&(leaf->leaflock)); +} + + static void +wormleaf_unlock_read(struct wormleaf * const leaf) +{ + rwlock_unlock_read(&(leaf->leaflock)); +} + + static void +wormhmap_lock(struct wormhole * const map, struct wormref * const ref) +{ + if (!rwlock_trylock_write(&(map->metalock))) { + wormhole_park(ref); + rwlock_lock_write(&(map->metalock)); + wormhole_resume(ref); + } +} + + static inline void +wormhmap_unlock(struct wormhole * const map) +{ + rwlock_unlock_write(&(map->metalock)); +} +// }}} lock + +// hmap-version {{{ + static inline struct wormhmap * +wormhmap_switch(struct wormhole * const map, struct wormhmap * const hmap) +{ + return (hmap == map->hmap2) ? 
(hmap + 1) : (hmap - 1); +} + + static inline struct wormhmap * +wormhmap_load(struct wormhole * const map) +{ + return (struct wormhmap *)atomic_load_explicit(&(map->hmap_ptr), MO_ACQUIRE); +} + + static inline void +wormhmap_store(struct wormhole * const map, struct wormhmap * const hmap) +{ + atomic_store_explicit(&(map->hmap_ptr), (u64)hmap, MO_RELEASE); +} + + static inline u64 +wormhmap_version_load(const struct wormhmap * const hmap) +{ + // no concurrent access + return atomic_load_explicit(&(hmap->hv), MO_ACQUIRE); +} + + static inline void +wormhmap_version_store(struct wormhmap * const hmap, const u64 v) +{ + atomic_store_explicit(&(hmap->hv), v, MO_RELEASE); +} + + static inline u64 +wormleaf_version_load(struct wormleaf * const leaf) +{ + return atomic_load_explicit(&(leaf->lv), MO_CONSUME); +} + + static inline void +wormleaf_version_store(struct wormleaf * const leaf, const u64 v) +{ + atomic_store_explicit(&(leaf->lv), v, MO_RELEASE); +} +// }}} hmap-version + +// co {{{ + static inline void +wormhmap_prefetch_pmap(const struct wormhmap * const hmap, const u32 idx) +{ +#if defined(CORR) + (void)hmap; + (void)idx; +#else + cpu_prefetch0(&(hmap->pmap[idx])); +#endif +} + + static inline struct wormmeta * +wormhmap_get_meta(const struct wormhmap * const hmap, const u32 mid, const u32 i) +{ + struct wormmeta * const meta = hmap->pmap[mid].e[i]; +#if defined(CORR) + cpu_prefetch0(meta); + corr_yield(); +#endif + return meta; +} + + static inline void +wormleaf_prefetch(struct wormleaf * const leaf, const u32 hashlo) +{ + const u32 i = wormhole_pkey(hashlo) / WH_HDIV; +#if defined(CORR) + cpu_prefetch0(leaf); + cpu_prefetch0(&(leaf->hs[i-4])); + cpu_prefetch0(&(leaf->hs[i+4])); + corr_yield(); +#else + cpu_prefetch0(&(leaf->hs[i])); +#endif +} + + static inline bool +wormhole_kref_kv_match(const struct kref * const key, const struct kv * const curr) +{ +#if defined(CORR) + const u8 * const ptr = (typeof(ptr))curr; + cpu_prefetch0(ptr); + cpu_prefetch0(ptr 
+ 64); + if (key->len > 56) { + cpu_prefetch0(ptr + 128); + cpu_prefetch0(ptr + 192); + } + corr_yield(); +#endif + return kref_kv_match(key, curr); +} + + static inline void +wormhole_qsbr_update_pause(struct wormref * const ref, const u64 v) +{ + qsbr_update(&ref->qref, v); +#if defined(CORR) + corr_yield(); +#endif +} +// }}} co + +// }}} helpers + +// hmap {{{ +// hmap is the MetaTrieHT of Wormhole + static bool +wormhmap_init(struct wormhmap * const hmap, struct kv * const pbuf) +{ + const u64 wsize = sizeof(hmap->wmap[0]) * WH_HMAPINIT_SIZE; + const u64 psize = sizeof(hmap->pmap[0]) * WH_HMAPINIT_SIZE; + u64 msize = wsize + psize; + u8 * const mem = pages_alloc_best(msize, true, &msize); + if (mem == NULL) + return false; + + hmap->pmap = (typeof(hmap->pmap))mem; + hmap->wmap = (typeof(hmap->wmap))(mem + psize); + hmap->msize = msize; + hmap->mask = WH_HMAPINIT_SIZE - 1; + wormhmap_version_store(hmap, 0); + hmap->maxplen = 0; + hmap->pbuf = pbuf; + return true; +} + + static inline void +wormhmap_deinit(struct wormhmap * const hmap) +{ + if (hmap->pmap) { + pages_unmap(hmap->pmap, hmap->msize); + hmap->pmap = NULL; + hmap->wmap = NULL; + } +} + + static inline m128 +wormhmap_zero(void) +{ +#if defined(__x86_64__) + return _mm_setzero_si128(); +#elif defined(__aarch64__) + return vdupq_n_u8(0); +#endif +} + + static inline m128 +wormhmap_m128_pkey(const u16 pkey) +{ +#if defined(__x86_64__) + return _mm_set1_epi16((short)pkey); +#elif defined(__aarch64__) + return vreinterpretq_u8_u16(vdupq_n_u16(pkey)); +#endif +} + + static inline u32 +wormhmap_match_mask(const struct wormslot * const s, const m128 skey) +{ +#if defined(__x86_64__) + const m128 sv = _mm_load_si128((const void *)s); + return (u32)_mm_movemask_epi8(_mm_cmpeq_epi16(skey, sv)); +#elif defined(__aarch64__) + const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s + const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000 + static const 
uint16x8_t mbits = {0x3, 0xc, 0x30, 0xc0, 0x300, 0xc00, 0x3000, 0xc000}; + return (u32)vaddvq_u16(vandq_u16(cmp, mbits)); +#endif +} + + static inline bool +wormhmap_match_any(const struct wormslot * const s, const m128 skey) +{ +#if defined(__x86_64__) + return wormhmap_match_mask(s, skey) != 0; +#elif defined(__aarch64__) + const uint16x8_t sv = vld1q_u16((const u16 *)s); // load 16 bytes at s + const uint16x8_t cmp = vceqq_u16(vreinterpretq_u16_u8(skey), sv); // cmpeq => 0xffff or 0x0000 + return vaddvq_u32(vreinterpretq_u32_u16(cmp)) != 0; +#endif +} + +// meta_lcp only + static inline bool +wormhmap_peek(const struct wormhmap * const hmap, const u32 hash32) +{ + const m128 sk = wormhmap_m128_pkey(wormhole_pkey(hash32)); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + return wormhmap_match_any(&(hmap->wmap[midx]), sk) + || wormhmap_match_any(&(hmap->wmap[midy]), sk); +} + + static inline struct wormmeta * +wormhmap_get_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kv * const key) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + if (likely(wormhole_key_meta_match(key, meta))) + return meta; + mask ^= (3u << i2); + } + return NULL; +} + + static struct wormmeta * +wormhmap_get(const struct wormhmap * const hmap, const struct kv * const key) +{ + const u32 hash32 = key->hashlo; + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_slot(hmap, midx, skey, key); + if (r) + return r; + return wormhmap_get_slot(hmap, midy, skey, key); +} + +// for meta_lcp only + static inline struct wormmeta * 
+wormhmap_get_kref_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kref * const kref) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + if (likely(wormhole_kref_meta_match(kref, meta))) + return meta; + + mask ^= (3u << i2); + } + return NULL; +} + +// for meta_lcp only + static inline struct wormmeta * +wormhmap_get_kref(const struct wormhmap * const hmap, const struct kref * const kref) +{ + const u32 hash32 = kref->hash32; + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_kref_slot(hmap, midx, skey, kref); + if (r) + return r; + return wormhmap_get_kref_slot(hmap, midy, skey, kref); +} + +// for meta_down only + static inline struct wormmeta * +wormhmap_get_kref1_slot(const struct wormhmap * const hmap, const u32 mid, + const m128 skey, const struct kref * const kref, const u8 cid) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta * const meta = wormhmap_get_meta(hmap, mid, i2>>1); + //cpu_prefetch0(wormmeta_rmost_load(meta)); // will access + if (likely(wormhole_kref1_meta_match(kref, meta, cid))) + return meta; + + mask ^= (3u << i2); + } + return NULL; +} + +// for meta_down only + static inline struct wormmeta * +wormhmap_get_kref1(const struct wormhmap * const hmap, + const struct kref * const kref, const u8 cid) +{ + const u32 hash32 = crc32c_u8(kref->hash32, cid); + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const m128 skey = 
wormhmap_m128_pkey(wormhole_pkey(hash32)); + + struct wormmeta * const r = wormhmap_get_kref1_slot(hmap, midx, skey, kref, cid); + if (r) + return r; + return wormhmap_get_kref1_slot(hmap, midy, skey, kref, cid); +} + + static inline u32 +wormhmap_slot_count(const struct wormslot * const slot) +{ + const u32 mask = wormhmap_match_mask(slot, wormhmap_zero()); + return mask ? ((u32)__builtin_ctz(mask) >> 1) : 8; +} + + static inline void +wormhmap_squeeze(const struct wormhmap * const hmap) +{ + struct wormslot * const wmap = hmap->wmap; + struct wormmbkt * const pmap = hmap->pmap; + const u32 mask = hmap->mask; + const u64 nrs64 = ((u64)(hmap->mask)) + 1; // must use u64; u32 can overflow + for (u64 si64 = 0; si64 < nrs64; si64++) { // # of buckets + const u32 si = (u32)si64; + u32 ci = wormhmap_slot_count(&(wmap[si])); + for (u32 ei = ci - 1; ei < WH_BKT_NR; ei--) { + struct wormmeta * const meta = pmap[si].e[ei]; + const u32 sj = wormmeta_hash32_load(meta) & mask; // first hash + if (sj == si) + continue; + + // move + const u32 ej = wormhmap_slot_count(&(wmap[sj])); + if (ej < WH_BKT_NR) { // has space at home location + wmap[sj].t[ej] = wmap[si].t[ei]; + pmap[sj].e[ej] = pmap[si].e[ei]; + const u32 ni = ci - 1; + if (ei < ni) { + wmap[si].t[ei] = wmap[si].t[ni]; + pmap[si].e[ei] = pmap[si].e[ni]; + } + wmap[si].t[ni] = 0; + pmap[si].e[ni] = NULL; + ci--; + } + } + } +} + + static void +wormhmap_expand(struct wormhmap * const hmap) +{ + // sync expand + const u32 mask0 = hmap->mask; + if (mask0 == UINT32_MAX) + debug_die(); + const u32 nr0 = mask0 + 1; + const u32 mask1 = mask0 + nr0; + const u64 nr1 = ((u64)nr0) << 1; // must use u64; u32 can overflow + const u64 wsize = nr1 * sizeof(hmap->wmap[0]); + const u64 psize = nr1 * sizeof(hmap->pmap[0]); + u64 msize = wsize + psize; + u8 * mem = pages_alloc_best(msize, true, &msize); + if (mem == NULL) { + // We are at a very deep call stack from wormhole_put(). 
+ // Gracefully handling the failure requires lots of changes. + // Currently we simply wait for available memory + // TODO: gracefully return with insertion failure + char ts[64]; + time_stamp(ts, 64); + fprintf(stderr, "%s %s sleep-wait for memory allocation %lukB\n", + __func__, ts, msize >> 10); + do { + sleep(1); + mem = pages_alloc_best(msize, true, &msize); + } while (mem == NULL); + time_stamp(ts, 64); + fprintf(stderr, "%s %s memory allocation done\n", __func__, ts); + } + + struct wormhmap hmap1 = *hmap; + hmap1.pmap = (typeof(hmap1.pmap))mem; + hmap1.wmap = (typeof(hmap1.wmap))(mem + psize); + hmap1.msize = msize; + hmap1.mask = mask1; + + const struct wormslot * const wmap0 = hmap->wmap; + const struct wormmbkt * const pmap0 = hmap->pmap; + + for (u32 s = 0; s < nr0; s++) { + const struct wormmbkt * const bkt = &pmap0[s]; + for (u32 i = 0; (i < WH_BKT_NR) && bkt->e[i]; i++) { + const struct wormmeta * const meta = bkt->e[i]; + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 idx0 = hash32 & mask0; + const u32 idx1 = ((idx0 == s) ? 
hash32 : wormhole_bswap(hash32)) & mask1; + + const u32 n = wormhmap_slot_count(&(hmap1.wmap[idx1])); + debug_assert(n < 8); + hmap1.wmap[idx1].t[n] = wmap0[s].t[i]; + hmap1.pmap[idx1].e[n] = bkt->e[i]; + } + } + pages_unmap(hmap->pmap, hmap->msize); + hmap->pmap = hmap1.pmap; + hmap->wmap = hmap1.wmap; + hmap->msize = hmap1.msize; + hmap->mask = hmap1.mask; + wormhmap_squeeze(hmap); +} + + static bool +wormhmap_cuckoo(struct wormhmap * const hmap, const u32 mid0, + struct wormmeta * const e0, const u16 s0, const u32 depth) +{ + const u32 ii = wormhmap_slot_count(&(hmap->wmap[mid0])); + if (ii < WH_BKT_NR) { + hmap->wmap[mid0].t[ii] = s0; + hmap->pmap[mid0].e[ii] = e0; + return true; + } else if (depth == 0) { + return false; + } + + // depth > 0 + struct wormmbkt * const bkt = &(hmap->pmap[mid0]); + u16 * const sv = &(hmap->wmap[mid0].t[0]); + for (u32 i = 0; i < WH_BKT_NR; i++) { + const struct wormmeta * const meta = bkt->e[i]; + debug_assert(meta); + const u32 hash32 = wormmeta_hash32_load(meta); + + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const u32 midt = (midx != mid0) ? midx : midy; + if (midt != mid0) { // possible + // no penalty if moving someone back to its 1st hash location + const u32 depth1 = (midt == midx) ? 
depth : (depth - 1); + if (wormhmap_cuckoo(hmap, midt, bkt->e[i], sv[i], depth1)) { + bkt->e[i] = e0; + sv[i] = s0; + return true; + } + } + } + return false; +} + + static void +wormhmap_set(struct wormhmap * const hmap, struct wormmeta * const meta) +{ + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 midx = hash32 & hmap->mask; + wormhmap_prefetch_pmap(hmap, midx); + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + wormhmap_prefetch_pmap(hmap, midy); + const u16 pkey = wormhole_pkey(hash32); + // insert with cuckoo + if (likely(wormhmap_cuckoo(hmap, midx, meta, pkey, 1))) + return; + if (wormhmap_cuckoo(hmap, midy, meta, pkey, 1)) + return; + if (wormhmap_cuckoo(hmap, midx, meta, pkey, 2)) + return; + + // expand + wormhmap_expand(hmap); + + wormhmap_set(hmap, meta); +} + + static bool +wormhmap_del_slot(struct wormhmap * const hmap, const u32 mid, + const struct wormmeta * const meta, const m128 skey) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + const struct wormmeta * const meta1 = hmap->pmap[mid].e[i2>>1]; + if (likely(meta == meta1)) { + const u32 i = i2 >> 1; + const u32 j = wormhmap_slot_count(&(hmap->wmap[mid])) - 1; + hmap->wmap[mid].t[i] = hmap->wmap[mid].t[j]; + hmap->pmap[mid].e[i] = hmap->pmap[mid].e[j]; + hmap->wmap[mid].t[j] = 0; + hmap->pmap[mid].e[j] = NULL; + return true; + } + mask -= (3u << i2); + } + return false; +} + + static bool +wormhmap_del(struct wormhmap * const hmap, const struct wormmeta * const meta) +{ + const u32 hash32 = wormmeta_hash32_load(meta); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + return wormhmap_del_slot(hmap, midx, meta, skey) + || wormhmap_del_slot(hmap, midy, meta, skey); +} + + static bool +wormhmap_replace_slot(struct wormhmap * const hmap, const u32 mid, + const struct wormmeta * const old, const 
m128 skey, struct wormmeta * const new) +{ + u32 mask = wormhmap_match_mask(&(hmap->wmap[mid]), skey); + while (mask) { + const u32 i2 = (u32)__builtin_ctz(mask); + struct wormmeta ** const pslot = &hmap->pmap[mid].e[i2>>1]; + if (likely(old == *pslot)) { + *pslot = new; + return true; + } + mask -= (3u << i2); + } + return false; +} + + static bool +wormhmap_replace(struct wormhmap * const hmap, const struct wormmeta * const old, struct wormmeta * const new) +{ + const u32 hash32 = wormmeta_hash32_load(old); + const u32 midx = hash32 & hmap->mask; + const u32 midy = wormhole_bswap(hash32) & hmap->mask; + const m128 skey = wormhmap_m128_pkey(wormhole_pkey(hash32)); + return wormhmap_replace_slot(hmap, midx, old, skey, new) + || wormhmap_replace_slot(hmap, midy, old, skey, new); +} +// }}} hmap + +// create {{{ +// it's unsafe + static bool +wormhole_create_leaf0(struct wormhole * const map) +{ + const bool sr = wormhole_slab_reserve(map, 1); + if (unlikely(!sr)) + return false; + + // create leaf of empty key + struct kv * const anchor = wormhole_alloc_akey(0); + if (anchor == NULL) + return false; + kv_dup2(kv_null(), anchor); + + struct wormleaf * const leaf0 = wormleaf_alloc(map, NULL, NULL, anchor); + if (leaf0 == NULL) { + wormhole_free_akey(anchor); + return false; + } + + struct kv * const mkey = wormhole_alloc_mkey(0); + if (mkey == NULL) { + wormleaf_free(map->slab_leaf, leaf0); + return false; + } + + wormhole_prefix(mkey, 0); + mkey->refcnt = 0; + // create meta of empty key + for (u32 i = 0; i < 2; i++) { + if (map->hmap2[i].slab1) { + struct wormmeta * const m0 = wormmeta_alloc(&map->hmap2[i], leaf0, mkey, 0, WH_FO); + debug_assert(m0); // already reserved enough + wormhmap_set(&(map->hmap2[i]), m0); + } + } + + map->leaf0 = leaf0; + return true; +} + + static struct wormhole * +wormhole_create_internal(const struct kvmap_mm * const mm, const u32 nh) +{ + struct wormhole * const map = yalloc(sizeof(*map)); + if (map == NULL) + return NULL; + 
memset(map, 0, sizeof(*map)); + // mm + map->mm = mm ? (*mm) : kvmap_mm_dup; + + // pbuf for meta-merge + map->pbuf = yalloc(1lu << 16); // 64kB + if (map->pbuf == NULL) + goto fail; + + // hmap + for (u32 i = 0; i < nh; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (!wormhmap_init(hmap, map->pbuf)) + goto fail; + + hmap->slab1 = slab_create(sizeof(struct wormmeta), WH_SLABMETA_SIZE); + if (hmap->slab1 == NULL) + goto fail; + + hmap->slab2 = slab_create(sizeof(struct wormmeta) + (sizeof(u64) * WH_BMNR), WH_SLABMETA_SIZE); + if (hmap->slab2 == NULL) + goto fail; + } + + // leaf slab + map->slab_leaf = slab_create(sizeof(struct wormleaf), WH_SLABLEAF_SIZE); + if (map->slab_leaf == NULL) + goto fail; + + // qsbr + map->qsbr = qsbr_create(); + if (map->qsbr == NULL) + goto fail; + + // leaf0 + if (!wormhole_create_leaf0(map)) + goto fail; + + rwlock_init(&(map->metalock)); + wormhmap_store(map, &map->hmap2[0]); + return map; + +fail: + if (map->qsbr) + qsbr_destroy(map->qsbr); + + if (map->slab_leaf) + slab_destroy(map->slab_leaf); + + for (u32 i = 0; i < nh; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (hmap->slab1) + slab_destroy(hmap->slab1); + if (hmap->slab2) + slab_destroy(hmap->slab2); + wormhmap_deinit(hmap); + } + + if (map->pbuf) + free(map->pbuf); + + free(map); + return NULL; +} + + struct wormhole * +wormhole_create(const struct kvmap_mm * const mm) +{ + return wormhole_create_internal(mm, 2); +} + + struct wormhole * +whunsafe_create(const struct kvmap_mm * const mm) +{ + return wormhole_create_internal(mm, 1); +} +// }}} create + +// jump {{{ + +// lcp {{{ +// search in the hash table for the Longest Prefix Match of the search key +// The corresponding wormmeta node is returned and the LPM is recorded in kref + static struct wormmeta * +wormhole_meta_lcp(const struct wormhmap * const hmap, struct kref * const kref, const u32 klen) +{ + // invariant: lo <= lcp < (lo + gd) + // ending condition: gd == 1 + u32 gd = 
(hmap->maxplen < klen ? hmap->maxplen : klen) + 1u; + u32 lo = 0; + u32 loh = KV_CRC32C_SEED; + +#define META_LCP_GAP_1 ((7u)) + while (META_LCP_GAP_1 < gd) { + const u32 inc = gd >> 3 << 2; // x4 + const u32 hash32 = crc32c_inc_x4(kref->ptr + lo, inc, loh); + if (wormhmap_peek(hmap, hash32)) { + loh = hash32; + lo += inc; + gd -= inc; + } else { + gd = inc; + } + } + + while (1 < gd) { + const u32 inc = gd >> 1; + const u32 hash32 = crc32c_inc_123(kref->ptr + lo, inc, loh); + if (wormhmap_peek(hmap, hash32)) { + loh = hash32; + lo += inc; + gd -= inc; + } else { + gd = inc; + } + } +#undef META_LCP_GAP_1 + + kref->hash32 = loh; + kref->len = lo; + struct wormmeta * ret = wormhmap_get_kref(hmap, kref); + if (likely(ret != NULL)) + return ret; + + gd = lo; + lo = 0; + loh = KV_CRC32C_SEED; + +#define META_LCP_GAP_2 ((5u)) + while (META_LCP_GAP_2 < gd) { + const u32 inc = (gd * 3) >> 2; + wormhole_kref_inc(kref, lo, loh, inc); + struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref); + if (tmp) { + loh = kref->hash32; + lo += inc; + gd -= inc; + ret = tmp; + if (wormmeta_bm_test(tmp, kref->ptr[lo])) { + loh = crc32c_u8(loh, kref->ptr[lo]); + lo++; + gd--; + ret = NULL; + } else { + gd = 1; + break; + } + } else { + gd = inc; + } + } + + while (1 < gd) { + const u32 inc = (gd * 3) >> 2; + wormhole_kref_inc_123(kref, lo, loh, inc); + struct wormmeta * const tmp = wormhmap_get_kref(hmap, kref); + if (tmp) { + loh = kref->hash32; + lo += inc; + gd -= inc; + ret = tmp; + if (wormmeta_bm_test(tmp, kref->ptr[lo])) { + loh = crc32c_u8(loh, kref->ptr[lo]); + lo++; + gd--; + ret = NULL; + } else { + break; + } + } else { + gd = inc; + } + } +#undef META_LCP_GAP_2 + + if (kref->len != lo) { + kref->hash32 = loh; + kref->len = lo; + } + if (ret == NULL) + ret = wormhmap_get_kref(hmap, kref); + debug_assert(ret); + return ret; +} +// }}} lcp + +// down {{{ + static struct wormleaf * +wormhole_meta_down(const struct wormhmap * const hmap, const struct kref * const lcp, + 
const struct wormmeta * const meta, const u32 klen) +{ + if (likely(lcp->len < klen)) { // partial match + const u32 id0 = lcp->ptr[lcp->len]; + if (wormmeta_bitmin_load(meta) > id0) { // no left, don't care about right. + return wormmeta_lpath_load(meta); + } else if (wormmeta_bitmax_load(meta) < id0) { // has left sibling but no right sibling + return wormmeta_rmost_load(meta); + } else { // has both (expensive) + return wormmeta_rmost_load(wormhmap_get_kref1(hmap, lcp, (u8)wormmeta_bm_lt(meta, id0))); + } + } else { // lcp->len == klen + return wormmeta_lpath_load(meta); + } +} +// }}} down + +// jump-rw {{{ + static struct wormleaf * +wormhole_jump_leaf(const struct wormhmap * const hmap, const struct kref * const key) +{ + struct kref kref = {.ptr = key->ptr}; + debug_assert(kv_crc32c(key->ptr, key->len) == key->hash32); + + const struct wormmeta * const meta = wormhole_meta_lcp(hmap, &kref, key->len); + return wormhole_meta_down(hmap, &kref, meta, key->len); +} + + static struct wormleaf * +wormhole_jump_leaf_read(struct wormref * const ref, const struct kref * const key) +{ + struct wormhole * const map = ref->map; +#pragma nounroll + do { + const struct wormhmap * const hmap = wormhmap_load(map); + const u64 v = wormhmap_version_load(hmap); + qsbr_update(&ref->qref, v); + struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key); + wormleaf_prefetch(leaf, key->hash32); +#pragma nounroll + do { + if (rwlock_trylock_read_nr(&(leaf->leaflock), 64)) { + if (wormleaf_version_load(leaf) <= v) + return leaf; + wormleaf_unlock_read(leaf); + break; + } + // v1 is loaded before lv; if lv <= v, can update v1 without redo jump + const u64 v1 = wormhmap_version_load(wormhmap_load(map)); + if (wormleaf_version_load(leaf) > v) + break; + wormhole_qsbr_update_pause(ref, v1); + } while (true); + } while (true); +} + + static struct wormleaf * +wormhole_jump_leaf_write(struct wormref * const ref, const struct kref * const key) +{ + struct wormhole * const map = ref->map; 
+#pragma nounroll + do { + const struct wormhmap * const hmap = wormhmap_load(map); + const u64 v = wormhmap_version_load(hmap); + qsbr_update(&ref->qref, v); + struct wormleaf * const leaf = wormhole_jump_leaf(hmap, key); + wormleaf_prefetch(leaf, key->hash32); +#pragma nounroll + do { + if (rwlock_trylock_write_nr(&(leaf->leaflock), 64)) { + if (wormleaf_version_load(leaf) <= v) + return leaf; + wormleaf_unlock_write(leaf); + break; + } + // v1 is loaded before lv; if lv <= v, can update v1 without redo jump + const u64 v1 = wormhmap_version_load(wormhmap_load(map)); + if (wormleaf_version_load(leaf) > v) + break; + wormhole_qsbr_update_pause(ref, v1); + } while (true); + } while (true); +} +// }}} jump-rw + +// }}} jump + +// leaf-read {{{ + static inline struct kv * +wormleaf_kv_at_ih(const struct wormleaf * const leaf, const u32 ih) +{ + return u64_to_ptr(leaf->hs[ih].e3); +} + + static inline struct kv * +wormleaf_kv_at_is(const struct wormleaf * const leaf, const u32 is) +{ + return u64_to_ptr(leaf->hs[leaf->ss[is]].e3); +} + + static inline void +wormleaf_prefetch_ss(const struct wormleaf * const leaf) +{ + for (u32 i = 0; i < WH_KPN; i+=64) + cpu_prefetch0(&leaf->ss[i]); +} + +// leaf must have been sorted +// return the key at [i] as if k1 has been inserted into leaf; i <= leaf->nr_sorted + static const struct kv * +wormleaf_kv_at_is1(const struct wormleaf * const leaf, const u32 i, const u32 is1, const struct kv * const k1) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + debug_assert(is1 <= leaf->nr_sorted); + if (i < is1) + return wormleaf_kv_at_is(leaf, i); + else if (i > is1) + return wormleaf_kv_at_is(leaf, i-1); + else // i == is1 + return k1; +} + + + +// fast point-lookup +// returns WH_KPN if not found + static u32 +wormleaf_match_hs(const struct wormleaf * const leaf, const struct kref * const key) +{ + const u16 pkey = wormhole_pkey(key->hash32); + const u32 i0 = pkey / WH_HDIV; + const struct entry13 * const hs = leaf->hs; + + if 
(hs[i0].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i0].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i0; + } + if (hs[i0].e1 == 0) + return WH_KPN; + + // search left + u32 i = i0 - 1; + while (i < WH_KPN) { + if (hs[i].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i; + } else if (hs[i].e1 < pkey) { + break; + } + i--; + } + + // search right + i = i0 + 1; + while (i < WH_KPN) { + if (hs[i].e1 == pkey) { + struct kv * const curr = u64_to_ptr(hs[i].e3); + if (likely(wormhole_kref_kv_match(key, curr))) + return i; + } else if ((hs[i].e1 > pkey) || (hs[i].e1 == 0)) { + break; + } + i++; + } + + + // not found + return WH_KPN; +} + +// search for an existing entry in hs + static u32 +wormleaf_search_ih(const struct wormleaf * const leaf, const struct entry13 e) +{ + const u16 pkey = e.e1; + const u32 i0 = pkey / WH_HDIV; + const struct entry13 * const hs = leaf->hs; + const struct entry13 e0 = hs[i0]; + + if (e0.v64 == e.v64) + return i0; + + if (e0.e1 == 0) + return WH_KPN; + + // search left + u32 i = i0 - 1; + while (i < WH_KPN) { + const struct entry13 ei = hs[i]; + if (ei.v64 == e.v64) { + return i; + } else if (ei.e1 < pkey) { + break; + } + i--; + } + + // search right + i = i0 + 1; + while (i < WH_KPN) { + const struct entry13 ei = hs[i]; + if (ei.v64 == e.v64) { + return i; + } else if ((ei.e1 > pkey) || (ei.e1 == 0)) { + break; + } + i++; + } + + // not found + return WH_KPN; +} + +// search for an existing entry in ss + static u32 +wormleaf_search_is(const struct wormleaf * const leaf, const u8 ih) +{ +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 i1 = _mm256_set1_epi8((char)ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const u32 mask = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(sv, i1)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } 
+#else // SSE4.2 + const m128 i1 = _mm_set1_epi8((char)ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const u32 mask = (u32)_mm_movemask_epi8(_mm_cmpeq_epi8(sv, i1)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } +#endif // __AVX2__ +#elif defined(__aarch64__) + static const m128 vtbl = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; + static const uint16x8_t mbits = {0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; + const m128 i1 = vdupq_n_u8(ih); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 cmp = vceqq_u8(vld1q_u8(leaf->ss+i), i1); // cmpeq => 0xff or 0x00 + const m128 cmp1 = vqtbl1q_u8(cmp, vtbl); // reorder + const u32 mask = (u32)vaddvq_u16(vandq_u8(vreinterpretq_u16_u8(cmp1), mbits)); + if (mask) + return i + (u32)__builtin_ctz(mask); + } +#endif // __x86_64__ + debug_die(); +} + +// assumes there are no duplicated keys +// search the first key that is >= the given key +// return 0 ..
nr_sorted + static u32 +wormleaf_search_ss(const struct wormleaf * const leaf, const struct kref * const key) +{ + u32 lo = 0; + u32 hi = leaf->nr_sorted; + while ((lo + 2) < hi) { + const u32 i = (lo + hi) >> 1; + const struct kv * const curr = wormleaf_kv_at_is(leaf, i); + cpu_prefetch0(curr); + cpu_prefetch0(leaf->hs + leaf->ss[(lo + i) >> 1]); + cpu_prefetch0(leaf->hs + leaf->ss[(i + 1 + hi) >> 1]); + const int cmp = kref_kv_compare(key, curr); + debug_assert(cmp != 0); + if (cmp < 0) + hi = i; + else + lo = i + 1; + } + + while (lo < hi) { + const u32 i = (lo + hi) >> 1; + const struct kv * const curr = wormleaf_kv_at_is(leaf, i); + const int cmp = kref_kv_compare(key, curr); + debug_assert(cmp != 0); + if (cmp < 0) + hi = i; + else + lo = i + 1; + } + return lo; +} + + static u32 +wormleaf_seek(const struct wormleaf * const leaf, const struct kref * const key) +{ + debug_assert(leaf->nr_sorted == leaf->nr_keys); + wormleaf_prefetch_ss(leaf); // effective for both hit and miss + const u32 ih = wormleaf_match_hs(leaf, key); + if (ih < WH_KPN) { // hit + return wormleaf_search_is(leaf, (u8)ih); + } else { // miss, binary search for gt + return wormleaf_search_ss(leaf, key); + } +} + +// same as search_sorted but the target is very likely beyond the end + static u32 +wormleaf_seek_end(const struct wormleaf * const leaf, const struct kref * const key) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + if (leaf->nr_sorted) { + const int cmp = kref_kv_compare(key, wormleaf_kv_at_is(leaf, leaf->nr_sorted-1)); + if (cmp > 0) + return leaf->nr_sorted; + else if (cmp == 0) + return leaf->nr_sorted - 1; + else + return wormleaf_seek(leaf, key); + } else { + return 0; + } +} +// }}} leaf-read + +// leaf-write {{{ + static void +wormleaf_sort_m2(struct wormleaf * const leaf, const u32 n1, const u32 n2) +{ + if (n1 == 0 || n2 == 0) + return; // no need to sort + + u8 * const ss = leaf->ss; + u8 et[WH_KPN/2]; // min(n1,n2) < KPN/2 + if (n1 <= n2) { // merge left + 
memcpy(et, &(ss[0]), sizeof(ss[0]) * n1); + u8 * eo = ss; + u8 * e1 = et; // size == n1 + u8 * e2 = &(ss[n1]); // size == n2 + const u8 * const z1 = e1 + n1; + const u8 * const z2 = e2 + n2; + while ((e1 < z1) && (e2 < z2)) { + const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2)); + if (cmp < 0) + *(eo++) = *(e1++); + else if (cmp > 0) + *(eo++) = *(e2++); + else + debug_die(); + + if (eo == e2) + break; // finish early + } + if (eo < e2) + memcpy(eo, e1, sizeof(*eo) * (size_t)(e2 - eo)); + } else { + memcpy(et, &(ss[n1]), sizeof(ss[0]) * n2); + u8 * eo = &(ss[n1 + n2 - 1]); // merge backwards + u8 * e1 = &(ss[n1 - 1]); // size == n1 + u8 * e2 = &(et[n2 - 1]); // size == n2 + const u8 * const z1 = e1 - n1; + const u8 * const z2 = e2 - n2; + while ((e1 > z1) && (e2 > z2)) { + const int cmp = kv_compare(wormleaf_kv_at_ih(leaf, *e1), wormleaf_kv_at_ih(leaf, *e2)); + if (cmp < 0) + *(eo--) = *(e2--); + else if (cmp > 0) + *(eo--) = *(e1--); + else + debug_die(); + + if (eo == e1) + break; + } + if (eo > e1) + memcpy(e1 + 1, et, sizeof(*eo) * (size_t)(eo - e1)); + } +} + +#if defined(__linux__) + static int +wormleaf_ss_cmp(const void * const p1, const void * const p2, void * priv) +{ + const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1); + const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2); + return kv_compare(k1, k2); +} +#else // (FreeBSD and APPLE only) + static int +wormleaf_ss_cmp(void * priv, const void * const p1, const void * const p2) +{ + const struct kv * const k1 = wormleaf_kv_at_ih(priv, *(const u8 *)p1); + const struct kv * const k2 = wormleaf_kv_at_ih(priv, *(const u8 *)p2); + return kv_compare(k1, k2); +} +#endif // __linux__ + + static inline void +wormleaf_sort_range(struct wormleaf * const leaf, const u32 i0, const u32 nr) +{ +#if defined(__linux__) + qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), wormleaf_ss_cmp, leaf); +#else // (FreeBSD and APPLE only) + 
qsort_r(&(leaf->ss[i0]), nr, sizeof(leaf->ss[0]), leaf, wormleaf_ss_cmp); +#endif // __linux__ +} + +// make sure all keys are sorted in a leaf node + static void +wormleaf_sync_sorted(struct wormleaf * const leaf) +{ + const u32 s = leaf->nr_sorted; + const u32 n = leaf->nr_keys; + if (s == n) + return; + + wormleaf_sort_range(leaf, s, n - s); + // merge-sort inplace + wormleaf_sort_m2(leaf, s, n - s); + leaf->nr_sorted = n; +} + +// shift a sequence of entries on hs and update the corresponding ss values + static void +wormleaf_shift_inc(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from+1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_add_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_add_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = 
vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vaddq_u8(sv, add1)); + } +#endif // __x86_64__ +} + + static void +wormleaf_shift_dec(struct wormleaf * const leaf, const u32 to, const u32 from, const u32 nr) +{ + debug_assert(to == (from-1)); + struct entry13 * const hs = leaf->hs; + memmove(&(hs[to]), &(hs[from]), sizeof(hs[0]) * nr); + +#if defined(__x86_64__) + // TODO: avx512 +#if defined(__AVX2__) + const m256 ones = _mm256_set1_epi8(1); + const m256 addx = _mm256_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m256 cmpx = _mm256_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m256)) { + const m256 sv = _mm256_load_si256((m256 *)(leaf->ss+i)); + const m256 add1 = _mm256_and_si256(_mm256_cmpgt_epi8(_mm256_add_epi8(sv, addx), cmpx), ones); + _mm256_store_si256((m256 *)(leaf->ss+i), _mm256_sub_epi8(sv, add1)); + } +#else // SSE4.2 + const m128 ones = _mm_set1_epi8(1); + const m128 addx = _mm_set1_epi8((char)(u8)(INT8_MAX + 1 - from - nr)); + const m128 cmpx = _mm_set1_epi8((char)(u8)(INT8_MAX - nr)); + for (u32 i = 0; i < leaf->nr_keys; i += 16) { + const m128 sv = _mm_load_si128((m128 *)(leaf->ss+i)); + const m128 add1 = _mm_and_si128(_mm_cmpgt_epi8(_mm_add_epi8(sv, addx), cmpx), ones); + _mm_store_si128((m128 *)(leaf->ss+i), _mm_sub_epi8(sv, add1)); + } +#endif // __AVX2__ +#elif defined(__aarch64__) // __x86_64__ + // aarch64 + const m128 subx = vdupq_n_u8((u8)from); + const m128 cmpx = vdupq_n_u8((u8)nr); + for (u32 i = 0; i < leaf->nr_keys; i += sizeof(m128)) { + const m128 sv = vld1q_u8(leaf->ss+i); + const m128 add1 = vshrq_n_u8(vcltq_u8(vsubq_u8(sv, subx), cmpx), 7); + vst1q_u8(leaf->ss+i, vsubq_u8(sv, add1)); + } +#endif // __x86_64__ +} + +// insert hs and also shift ss + static u32 +wormleaf_insert_hs(struct wormleaf * const leaf, const struct entry13 e) +{ + struct entry13 * const hs = leaf->hs; + const u16 pkey = e.e1; + const u32 i0 = 
pkey / WH_HDIV; + if (hs[i0].e1 == 0) { // insert + hs[i0] = e; + return i0; + } + + // find left-most insertion point + u32 i = i0; + while (i && hs[i-1].e1 && (hs[i-1].e1 >= pkey)) + i--; + while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 < pkey)) // stop at >= or empty + i++; + const u32 il = --i; // i in [0, KPN] + + // find left empty slot + if (i > (i0 - 1)) + i = i0 - 1; + while ((i < WH_KPN) && hs[i].e1) + i--; + const u32 el = i; // el < i0 or el is invalid (>= KPN) + + // find right-most insertion point. + i = il + 1; + while ((i < WH_KPN) && hs[i].e1 && (hs[i].e1 == pkey)) + i++; + const u32 ir = i; // ir >= il, in [0, KPN] + + // find right empty slot + if (i < (i0 + 1)) + i = i0 + 1; + while ((i < WH_KPN) && hs[i].e1) + i++; + const u32 er = i; // er > i0 or el is invalid (>= KPN) + + // el <= il < ir <= er (if < WH_KPN) + const u32 dl = (el < WH_KPN) ? (il - el) : WH_KPN; + const u32 dr = (er < WH_KPN) ? (er - ir) : WH_KPN; + if (dl <= dr) { // push left + debug_assert(dl < WH_KPN); + if (dl) + wormleaf_shift_dec(leaf, el, el+1, dl); + hs[il] = e; + return il; + } else { + debug_assert(dr < WH_KPN); + if (dr) + wormleaf_shift_inc(leaf, ir+1, ir, dr); + hs[ir] = e; + return ir; + } +} + + static void +wormleaf_insert_e13(struct wormleaf * const leaf, const struct entry13 e) +{ + // insert to hs and fix all existing is + const u32 ih = wormleaf_insert_hs(leaf, e); + debug_assert(ih < WH_KPN); + // append the new is + leaf->ss[leaf->nr_keys] = (u8)ih; + // fix nr + leaf->nr_keys++; +} + + static void +wormleaf_insert(struct wormleaf * const leaf, const struct kv * const new) +{ + debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen))); + debug_assert(leaf->nr_keys < WH_KPN); + + // insert + const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + const u32 nr0 = leaf->nr_keys; + wormleaf_insert_e13(leaf, e); + + // optimize for seq insertion + if (nr0 == leaf->nr_sorted) { + if (nr0) { + const struct kv * const kvn 
= wormleaf_kv_at_is(leaf, nr0 - 1); + if (kv_compare(new, kvn) > 0) + leaf->nr_sorted = nr0 + 1; + } else { + leaf->nr_sorted = 1; + } + } +} + + static void +wormleaf_pull_ih(struct wormleaf * const leaf, const u32 ih) +{ + struct entry13 * const hs = leaf->hs; + // try left + u32 i = ih - 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) > i)) + i--; + + if ((++i) < ih) { + wormleaf_shift_inc(leaf, i+1, i, ih - i); + leaf->hs[i].v64 = 0; + return; + } + + // try right + i = ih + 1; + while ((i < WH_KPN) && hs[i].e1 && ((hs[i].e1 / WH_HDIV) < i)) + i++; + + if ((--i) > ih) { + wormleaf_shift_dec(leaf, ih, ih+1, i - ih); + hs[i].v64 = 0; + } + // hs[ih] may still be 0 +} + +// internal only + static struct kv * +wormleaf_remove(struct wormleaf * const leaf, const u32 ih, const u32 is) +{ + // ss + leaf->ss[is] = leaf->ss[leaf->nr_keys - 1]; + if (leaf->nr_sorted > is) + leaf->nr_sorted = is; + + // ret + struct kv * const victim = wormleaf_kv_at_ih(leaf, ih); + // hs + leaf->hs[ih].v64 = 0; + leaf->nr_keys--; + // use magnet + wormleaf_pull_ih(leaf, ih); + return victim; +} + +// remove key from leaf but do not call free + static struct kv * +wormleaf_remove_ih(struct wormleaf * const leaf, const u32 ih) +{ + // remove from ss + const u32 is = wormleaf_search_is(leaf, (u8)ih); + debug_assert(is < leaf->nr_keys); + return wormleaf_remove(leaf, ih, is); +} + + static struct kv * +wormleaf_remove_is(struct wormleaf * const leaf, const u32 is) +{ + return wormleaf_remove(leaf, leaf->ss[is], is); +} + +// for delr (delete-range) + static void +wormleaf_delete_range(struct wormhole * const map, struct wormleaf * const leaf, + const u32 i0, const u32 end) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + for (u32 i = end; i > i0; i--) { + const u32 ir = i - 1; + struct kv * const victim = wormleaf_remove_is(leaf, ir); + map->mm.free(victim, map->mm.priv); + } +} + +// return the old kv; the caller should free the old kv + static struct kv * 
+wormleaf_update(struct wormleaf * const leaf, const u32 ih, const struct kv * const new) +{ + debug_assert(new->hash == kv_crc32c_extend(kv_crc32c(new->kv, new->klen))); + // search entry in ss (is) + struct kv * const old = wormleaf_kv_at_ih(leaf, ih); + debug_assert(old); + + entry13_update_e3(&leaf->hs[ih], (u64)new); + return old; +} +// }}} leaf-write + +// leaf-split {{{ +// It only works correctly in cut_search +// quickly tell if a cut between k1 and k2 can achieve a specific anchor-key length + static bool +wormhole_split_cut_alen_check(const u32 alen, const struct kv * const k1, const struct kv * const k2) +{ + debug_assert(k2->klen >= alen); + return (k1->klen < alen) || (k1->kv[alen - 1] != k2->kv[alen - 1]); +} + +// return the number of keys that should go to leaf1 +// assert(r > 0 && r <= nr_keys) +// (1) r < is1, anchor key is ss[r-1]:ss[r] +// (2) r == is1: anchor key is ss[r-1]:new +// (3) r == is1+1: anchor key is new:ss[r-1] (ss[r-1] is the ss[r] on the logically sorted array) +// (4) r > is1+1: anchor key is ss[r-2]:ss[r-1] (ss[r-2] is the [r-1] on the logically sorted array) +// edge cases: +// (case 2) is1 == nr_keys: r = nr_keys; ss[r-1]:new +// (case 3) is1 == 0, r == 1; new:ss[0] +// return 1..WH_KPN + static u32 +wormhole_split_cut_search1(struct wormleaf * const leaf, u32 l, u32 h, const u32 is1, const struct kv * const new) +{ + debug_assert(leaf->nr_keys == leaf->nr_sorted); + debug_assert(leaf->nr_keys); + debug_assert(l < h && h <= leaf->nr_sorted); + + const struct kv * const kl0 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const kh0 = wormleaf_kv_at_is1(leaf, h, is1, new); + const u32 alen = kv_key_lcp(kl0, kh0) + 1; + if (unlikely(alen > UINT16_MAX)) + return WH_KPN2; + + const u32 target = leaf->next ? 
WH_MID : WH_KPN_MRG; + while ((l + 1) < h) { + const u32 m = (l + h + 1) >> 1; + if (m <= target) { // try right + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, m, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, h, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + l = m; + else + h = m; + } else { // try left + const struct kv * const k1 = wormleaf_kv_at_is1(leaf, l, is1, new); + const struct kv * const k2 = wormleaf_kv_at_is1(leaf, m, is1, new); + if (wormhole_split_cut_alen_check(alen, k1, k2)) + h = m; + else + l = m; + } + } + return h; +} + + static void +wormhole_split_leaf_move1(struct wormleaf * const leaf1, struct wormleaf * const leaf2, + const u32 cut, const u32 is1, const struct kv * const new) +{ + const u32 nr_keys = leaf1->nr_keys; + const struct entry13 e1 = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + struct entry13 es[WH_KPN]; + + if (cut <= is1) { // e1 goes to leaf2 + // leaf2 + for (u32 i = cut; i < is1; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + wormleaf_insert_e13(leaf2, e1); + + for (u32 i = is1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + } else { // e1 goes to leaf1 + // leaf2 + for (u32 i = cut - 1; i < nr_keys; i++) + wormleaf_insert_e13(leaf2, leaf1->hs[leaf1->ss[i]]); + + // leaf1 + for (u32 i = 0; i < is1; i++) + es[i] = leaf1->hs[leaf1->ss[i]]; + + es[is1] = e1; + + for (u32 i = is1 + 1; i < cut; i++) + es[i] = leaf1->hs[leaf1->ss[i - 1]]; + } + + leaf2->nr_sorted = leaf2->nr_keys; + + memset(leaf1->hs, 0, sizeof(leaf1->hs[0]) * WH_KPN); + leaf1->nr_keys = 0; + for (u32 i = 0; i < cut; i++) + wormleaf_insert_e13(leaf1, es[i]); + leaf1->nr_sorted = cut; + debug_assert((leaf1->nr_sorted + leaf2->nr_sorted) == (nr_keys + 1)); +} + +// create an anchor for leaf-split + static struct kv * +wormhole_split_alloc_anchor(const struct kv * const key1, const 
struct kv * const key2) +{ + const u32 alen = kv_key_lcp(key1, key2) + 1; + debug_assert(alen <= key2->klen); + + struct kv * const anchor = wormhole_alloc_akey(alen); + if (anchor) + kv_refill(anchor, key2->kv, alen, NULL, 0); + return anchor; +} + +// leaf1 is locked +// split leaf1 into leaf1+leaf2; insert new into leaf1 or leaf2, return leaf2 + static struct wormleaf * +wormhole_split_leaf(struct wormhole * const map, struct wormleaf * const leaf1, struct kv * const new) +{ + wormleaf_sync_sorted(leaf1); + struct kref kref_new; + kref_ref_kv(&kref_new, new); + const u32 is1 = wormleaf_search_ss(leaf1, &kref_new); // new should be inserted at [is1] + const u32 cut = wormhole_split_cut_search1(leaf1, 0, leaf1->nr_keys, is1, new); + if (unlikely(cut == WH_KPN2)) + return NULL; + + // anchor of leaf2 + debug_assert(cut && (cut <= leaf1->nr_keys)); + const struct kv * const key1 = wormleaf_kv_at_is1(leaf1, cut - 1, is1, new); + const struct kv * const key2 = wormleaf_kv_at_is1(leaf1, cut, is1, new); + struct kv * const anchor2 = wormhole_split_alloc_anchor(key1, key2); + if (unlikely(anchor2 == NULL)) // anchor alloc failed + return NULL; + + // create leaf2 with anchor2 + struct wormleaf * const leaf2 = wormleaf_alloc(map, leaf1, leaf1->next, anchor2); + if (unlikely(leaf2 == NULL)) { + wormhole_free_akey(anchor2); + return NULL; + } + + // split_hmap will unlock the leaf nodes; must move now + wormhole_split_leaf_move1(leaf1, leaf2, cut, is1, new); + // leaf1 and leaf2 should be sorted after split + debug_assert(leaf1->nr_keys == leaf1->nr_sorted); + debug_assert(leaf2->nr_keys == leaf2->nr_sorted); + + return leaf2; +} +// }}} leaf-split + +// leaf-merge {{{ +// MERGE is the only operation that deletes a leaf node (leaf2). +// It ALWAYS merges the right node into the left node even if the left is empty. +// This requires both of their writer locks to be acquired. +// This allows iterators to safely probe the next node (but not backwards). 
+// In other words, if either the reader or the writer lock of node X has been acquired: +// X->next (the pointer) cannot be changed by any other thread. +// X->next cannot be deleted. +// But the content in X->next can still be changed. + static bool +wormleaf_merge(struct wormleaf * const leaf1, struct wormleaf * const leaf2) +{ + debug_assert((leaf1->nr_keys + leaf2->nr_keys) <= WH_KPN); + const bool leaf1_sorted = leaf1->nr_keys == leaf1->nr_sorted; + + for (u32 i = 0; i < leaf2->nr_keys; i++) + wormleaf_insert_e13(leaf1, leaf2->hs[leaf2->ss[i]]); + if (leaf1_sorted) + leaf1->nr_sorted += leaf2->nr_sorted; + return true; +} + +// for undoing insertion under split_meta failure; leaf2 is still local +// remove the new key; merge keys in leaf2 into leaf1; free leaf2 + static void +wormleaf_split_undo(struct wormhole * const map, struct wormleaf * const leaf1, + struct wormleaf * const leaf2, struct kv * const new) +{ + if (new) { + const struct entry13 e = entry13(wormhole_pkey(new->hashlo), ptr_to_u64(new)); + const u32 im1 = wormleaf_search_ih(leaf1, e); + if (im1 < WH_KPN) { + (void)wormleaf_remove_ih(leaf1, im1); + } else { // not found in leaf1; search leaf2 + const u32 im2 = wormleaf_search_ih(leaf2, e); + debug_assert(im2 < WH_KPN); + (void)wormleaf_remove_ih(leaf2, im2); + } + } + // this merge must succeed + if (!wormleaf_merge(leaf1, leaf2)) + debug_die(); + // Keep this to avoid triggering false alarm in wormleaf_free + leaf2->leaflock.opaque = 0; + wormleaf_free(map->slab_leaf, leaf2); +} +// }}} leaf-merge + +// get/probe {{{ + struct kv * +wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 i = wormleaf_match_hs(leaf, key); + struct kv * const tmp = (i < WH_KPN) ? 
ref->map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL; + wormleaf_unlock_read(leaf); + return tmp; +} + + struct kv * +whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out) +{ + wormhole_resume(ref); + struct kv * const ret = wormhole_get(ref, key, out); + wormhole_park(ref); + return ret; +} + + struct kv * +whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 i = wormleaf_match_hs(leaf, key); + return (i < WH_KPN) ? map->mm.out(wormleaf_kv_at_ih(leaf, i), out) : NULL; +} + + bool +wormhole_probe(struct wormref * const ref, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 i = wormleaf_match_hs(leaf, key); + wormleaf_unlock_read(leaf); + return i < WH_KPN; +} + + bool +whsafe_probe(struct wormref * const ref, const struct kref * const key) +{ + wormhole_resume(ref); + const bool r = wormhole_probe(ref, key); + wormhole_park(ref); + return r; +} + + bool +whunsafe_probe(struct wormhole * const map, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + return wormleaf_match_hs(leaf, key) < WH_KPN; +} +// }}} get/probe + +// meta-split {{{ +// duplicate from meta1; only has one bit but will soon add a new bit + static struct wormmeta * +wormmeta_expand(struct wormhmap * const hmap, struct wormmeta * const meta1) +{ + struct wormmeta * const meta2 = slab_alloc_unsafe(hmap->slab2); + if (meta2 == NULL) + return NULL; + + memcpy(meta2, meta1, sizeof(*meta1)); + for (u32 i = 0; i < WH_BMNR; i++) + meta2->bitmap[i] = 0; + const u32 bitmin = wormmeta_bitmin_load(meta1); + debug_assert(bitmin == wormmeta_bitmax_load(meta1)); + debug_assert(bitmin < WH_FO); + // set the only bit + meta2->bitmap[bitmin >> 6u] |= (1lu << (bitmin & 0x3fu)); + + wormhmap_replace(hmap, meta1, meta2); + 
slab_free_unsafe(hmap->slab1, meta1); + return meta2; +} + + static struct wormmeta * +wormmeta_bm_set_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id) +{ + debug_assert(id < WH_FO); + const u32 bitmin = wormmeta_bitmin_load(meta); + const u32 bitmax = wormmeta_bitmax_load(meta); + if (bitmin < bitmax) { // already in full size + wormmeta_bm_set(meta, id); + return meta; + } else if (id == bitmin) { // do nothing + return meta; + } else if (bitmin == WH_FO) { // add the first bit + wormmeta_bitmin_store(meta, id); + wormmeta_bitmax_store(meta, id); + return meta; + } else { // need to expand + struct wormmeta * const meta2 = wormmeta_expand(hmap, meta); + wormmeta_bm_set(meta2, id); + return meta2; + } +} + +// return true if a new node is created + static void +wormmeta_split_touch(struct wormhmap * const hmap, struct kv * const mkey, + struct wormleaf * const leaf, const u32 alen) +{ + struct wormmeta * meta = wormhmap_get(hmap, mkey); + if (meta) { + if (mkey->klen < alen) + meta = wormmeta_bm_set_helper(hmap, meta, mkey->kv[mkey->klen]); + if (wormmeta_lmost_load(meta) == leaf->next) + wormmeta_lmost_store(meta, leaf); + else if (wormmeta_rmost_load(meta) == leaf->prev) + wormmeta_rmost_store(meta, leaf); + } else { // create new node + const u32 bit = (mkey->klen < alen) ? 
mkey->kv[mkey->klen] : WH_FO; + meta = wormmeta_alloc(hmap, leaf, mkey, alen, bit); + debug_assert(meta); + wormhmap_set(hmap, meta); + } +} + + static void +wormmeta_lpath_update(struct wormhmap * const hmap, const struct kv * const a1, const struct kv * const a2, + struct wormleaf * const lpath) +{ + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(a2, pbuf); + + // only need to update a2's own branch + u32 i = kv_key_lcp(a1, a2) + 1; + debug_assert(i <= pbuf->klen); + wormhole_prefix(pbuf, i); + while (i < a2->klen) { + debug_assert(i <= hmap->maxplen); + struct wormmeta * const meta = wormhmap_get(hmap, pbuf); + debug_assert(meta); + wormmeta_lpath_store(meta, lpath); + + i++; + wormhole_prefix_inc1(pbuf); + } +} + +// for leaf1, a leaf2 is already linked at its right side. +// this function updates the meta-map by moving leaf1 and hooking leaf2 at correct positions + static void +wormmeta_split(struct wormhmap * const hmap, struct wormleaf * const leaf, + struct kv * const mkey) +{ + // left branches + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + u32 i = next ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + + // save klen + const u32 mklen = mkey->klen; + wormhole_prefix(mkey, i); + do { + wormmeta_split_touch(hmap, mkey, leaf, alen); + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(mkey); + } while (true); + + // adjust maxplen; i is the plen of the last _touch() + if (i > hmap->maxplen) + hmap->maxplen = i; + debug_assert(i <= UINT16_MAX); + + // restore klen + mkey->klen = mklen; + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, leaf); +} + +// all locks will be released before returning + static bool +wormhole_split_meta(struct wormref * const ref, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + struct wormhole * const map = ref->map; + // metalock + wormhmap_lock(map, ref); + + // check slab reserve + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + + // link + struct wormleaf * const leaf1 = leaf2->prev; + leaf1->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + // update versions + const u64 v1 = wormhmap_version_load(hmap0) + 1; + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_split(hmap1, leaf2, mkey); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + wormleaf_unlock_write(leaf1); + wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_split(hmap0, leaf2, mkey); + + wormhmap_unlock(map); + + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + +// all locks (metalock + leaflocks) will be released before 
returning +// leaf1->lock (write) is already taken + static bool +wormhole_split_insert(struct wormref * const ref, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(ref->map, leaf1, new); + if (unlikely(leaf2 == NULL)) { + wormleaf_unlock_write(leaf1); + return false; + } + + rwlock_lock_write(&(leaf2->leaflock)); + const bool rsm = wormhole_split_meta(ref, leaf2); + if (unlikely(!rsm)) { + // undo insertion & merge; free leaf2 + wormleaf_split_undo(ref->map, leaf1, leaf2, new); + wormleaf_unlock_write(leaf1); + } + return rsm; +} + + static bool +whunsafe_split_meta(struct wormhole * const map, struct wormleaf * const leaf2) +{ + struct kv * const mkey = wormhole_alloc_mkey(leaf2->anchor->klen); + if (unlikely(mkey == NULL)) + return false; + kv_dup2_key(leaf2->anchor, mkey); + + const bool sr = wormhole_slab_reserve(map, mkey->klen); + if (unlikely(!sr)) { + wormhmap_unlock(map); + wormhole_free_mkey(mkey); + return false; + } + + // link + leaf2->prev->next = leaf2; + if (leaf2->next) + leaf2->next->prev = leaf2; + + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_split(&(map->hmap2[i]), leaf2, mkey); + if (mkey->refcnt == 0) // this is possible + wormhole_free_mkey(mkey); + return true; +} + + static bool +whunsafe_split_insert(struct wormhole * const map, struct wormleaf * const leaf1, + struct kv * const new) +{ + struct wormleaf * const leaf2 = wormhole_split_leaf(map, leaf1, new); + if (unlikely(leaf2 == NULL)) + return false; + + const bool rsm = whunsafe_split_meta(map, leaf2); + if (unlikely(!rsm)) // undo insertion, merge, free leaf2 + wormleaf_split_undo(map, leaf1, leaf2, new); + + return rsm; +} +// }}} meta-split + +// meta-merge {{{ +// now it only contains one bit + static struct wormmeta * +wormmeta_shrink(struct wormhmap * const hmap, struct wormmeta * const meta2) +{ + debug_assert(wormmeta_bitmin_load(meta2) == wormmeta_bitmax_load(meta2)); + struct wormmeta * 
const meta1 = slab_alloc_unsafe(hmap->slab1); + if (meta1 == NULL) + return NULL; + + memcpy(meta1, meta2, sizeof(*meta1)); + + wormhmap_replace(hmap, meta2, meta1); + slab_free_unsafe(hmap->slab2, meta2); + return meta1; +} + + static void +wormmeta_bm_clear_helper(struct wormhmap * const hmap, struct wormmeta * const meta, const u32 id) +{ + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) { + debug_assert(wormmeta_bitmin_load(meta) < WH_FO); + wormmeta_bitmin_store(meta, WH_FO); + wormmeta_bitmax_store(meta, WH_FO); + } else { // has more than 1 bit + wormmeta_bm_clear(meta, id); + if (wormmeta_bitmin_load(meta) == wormmeta_bitmax_load(meta)) + wormmeta_shrink(hmap, meta); + } +} + +// all locks held + static void +wormmeta_merge(struct wormhmap * const hmap, struct wormleaf * const leaf) +{ + // leaf->next is the new next after merge, which can be NULL + struct wormleaf * const prev = leaf->prev; + struct wormleaf * const next = leaf->next; + struct kv * const pbuf = hmap->pbuf; + kv_dup2_key(leaf->anchor, pbuf); + u32 i = (prev && next) ? 
kv_key_lcp(prev->anchor, next->anchor) : 0; + const u32 alen = leaf->anchor->klen; + wormhole_prefix(pbuf, i); + struct wormmeta * parent = NULL; + do { + debug_assert(i <= hmap->maxplen); + struct wormmeta * meta = wormhmap_get(hmap, pbuf); + if (wormmeta_lmost_load(meta) == wormmeta_rmost_load(meta)) { // delete single-child + debug_assert(wormmeta_lmost_load(meta) == leaf); + const u32 bitmin = wormmeta_bitmin_load(meta); + wormhmap_del(hmap, meta); + wormmeta_free(hmap, meta); + if (parent) { + wormmeta_bm_clear_helper(hmap, parent, pbuf->kv[i-1]); + parent = NULL; + } + if (bitmin == WH_FO) // no child + break; + } else { // adjust lmost rmost + if (wormmeta_lmost_load(meta) == leaf) + wormmeta_lmost_store(meta, next); + else if (wormmeta_rmost_load(meta) == leaf) + wormmeta_rmost_store(meta, prev); + parent = meta; + } + + if (i >= alen) + break; + i++; + wormhole_prefix_inc1(pbuf); + } while (true); + + if (next) + wormmeta_lpath_update(hmap, leaf->anchor, next->anchor, prev); +} + +// all locks (metalock + two leaflock) will be released before returning +// merge leaf2 to leaf1, removing all metadata to leaf2 and leaf2 itself + static void +wormhole_meta_merge(struct wormref * const ref, struct wormleaf * const leaf1, + struct wormleaf * const leaf2, const bool unlock_leaf1) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + struct wormhole * const map = ref->map; + + wormhmap_lock(map, ref); + + struct wormhmap * const hmap0 = wormhmap_load(map); + struct wormhmap * const hmap1 = wormhmap_switch(map, hmap0); + const u64 v1 = wormhmap_version_load(hmap0) + 1; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + + wormleaf_version_store(leaf1, v1); + wormleaf_version_store(leaf2, v1); + wormhmap_version_store(hmap1, v1); + + wormmeta_merge(hmap1, leaf2); + + qsbr_update(&ref->qref, v1); + + // switch hmap + wormhmap_store(map, hmap1); + + if (unlock_leaf1) + wormleaf_unlock_write(leaf1); + 
wormleaf_unlock_write(leaf2); + + qsbr_wait(map->qsbr, v1); + + wormmeta_merge(hmap0, leaf2); + // leaf2 is now safe to be removed + wormleaf_free(map->slab_leaf, leaf2); + wormhmap_unlock(map); +} + +// caller must acquire leaf->wlock and next->wlock +// all locks will be released when this function returns + static bool +wormhole_meta_leaf_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * const next = leaf->next; + debug_assert(next); + + // double check + if ((leaf->nr_keys + next->nr_keys) <= WH_KPN) { + if (wormleaf_merge(leaf, next)) { + wormhole_meta_merge(ref, leaf, next, true); + return true; + } + } + // merge failed but it's fine + wormleaf_unlock_write(leaf); + wormleaf_unlock_write(next); + return false; +} + + static void +whunsafe_meta_leaf_merge(struct wormhole * const map, struct wormleaf * const leaf1, + struct wormleaf * const leaf2) +{ + debug_assert(leaf1->next == leaf2); + debug_assert(leaf2->prev == leaf1); + if (!wormleaf_merge(leaf1, leaf2)) + return; + + leaf1->next = leaf2->next; + if (leaf2->next) + leaf2->next->prev = leaf1; + for (u32 i = 0; i < 2; i++) + if (map->hmap2[i].pmap) + wormmeta_merge(&(map->hmap2[i]), leaf2); + wormleaf_free(map->slab_leaf, leaf2); +} +// }}} meta-merge + +// put {{{ + bool +wormhole_put(struct wormref * const ref, struct kv * const kv) +{ + // we always allocate a new item on SET + // future optimizations may perform in-place update + struct wormhole * const map = ref->map; + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { + struct kv * const old = wormleaf_update(leaf, im, new); + wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just 
insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_put(struct wormref * const ref, struct kv * const kv) +{ + wormhole_resume(ref); + const bool r = wormhole_put(ref, kv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_put(struct wormhole * const map, struct kv * const kv) +{ + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + const struct kref kref = kv_kref(new); + + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, &kref); + // update + const u32 im = wormleaf_match_hs(leaf, &kref); + if (im < WH_KPN) { // overwrite + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormhole * const map = ref->map; + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) { // no replacement + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + struct kv * const old = wormleaf_update(leaf, im, new); + 
wormleaf_unlock_write(leaf); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) { // nothing to be inserted + wormleaf_unlock_write(leaf); + return true; + } + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) { // mm error + wormleaf_unlock_write(leaf); + return false; + } + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + wormleaf_unlock_write(leaf); + return true; + } + + // split_insert changes hmap + // all locks should be released in wormhole_split_insert() + const bool rsi = wormhole_split_insert(ref, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} + + bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_merge(ref, kref, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, kref); + // update + const u32 im = wormleaf_match_hs(leaf, kref); + if (im < WH_KPN) { // update + struct kv * const kv0 = wormleaf_kv_at_ih(leaf, im); + struct kv * const kv = uf(kv0, priv); + if ((kv == kv0) || (kv == NULL)) + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) + return false; + + struct kv * const old = wormleaf_update(leaf, im, new); + map->mm.free(old, map->mm.priv); + return true; + } + + struct kv * const kv = uf(NULL, priv); + if (kv == NULL) // nothing to be inserted + return true; + + struct kv * const new = map->mm.in(kv, map->mm.priv); + if (unlikely(new == NULL)) // mm error + return false; + + // insert + if (likely(leaf->nr_keys < WH_KPN)) { // just insert + wormleaf_insert(leaf, new); + return true; + } + + // split_insert changes hmap + 
const bool rsi = whunsafe_split_insert(map, leaf, new); + if (!rsi) + map->mm.free(new, map->mm.priv); + return rsi; +} +// }}} put + +// inplace {{{ + bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_read(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_read(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_read(leaf); + return false; + } +} + + bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { + uf(wormleaf_kv_at_ih(leaf, im), priv); + wormleaf_unlock_write(leaf); + return true; + } else { + uf(NULL, priv); + wormleaf_unlock_write(leaf); + return false; + } +} + + bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpr(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + wormhole_resume(ref); + const bool r = wormhole_inpw(ref, key, uf, priv); + wormhole_park(ref); + return r; +} + + bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // overwrite + uf(wormleaf_kv_at_ih(leaf, im), priv); + return true; + } else { + uf(NULL, priv); + return false; + } +} +// }}} put + +// del {{{ + static void +wormhole_del_try_merge(struct wormref * const ref, struct wormleaf * const leaf) +{ + struct wormleaf * 
const next = leaf->next; + if (next && ((leaf->nr_keys == 0) || ((leaf->nr_keys + next->nr_keys) < WH_KPN_MRG))) { + // try merge, it may fail if size becomes larger after locking + wormleaf_lock_write(next, ref); + (void)wormhole_meta_leaf_merge(ref, leaf); + // locks are already released; immediately return + } else { + wormleaf_unlock_write(leaf); + } +} + + bool +wormhole_del(struct wormref * const ref, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf_write(ref, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // found + struct kv * const kv = wormleaf_remove_ih(leaf, im); + wormhole_del_try_merge(ref, leaf); + debug_assert(kv); + // free after releasing locks + struct wormhole * const map = ref->map; + map->mm.free(kv, map->mm.priv); + return true; + } else { + wormleaf_unlock_write(leaf); + return false; + } +} + + bool +whsafe_del(struct wormref * const ref, const struct kref * const key) +{ + wormhole_resume(ref); + const bool r = wormhole_del(ref, key); + wormhole_park(ref); + return r; +} + + static void +whunsafe_del_try_merge(struct wormhole * const map, struct wormleaf * const leaf) +{ + const u32 n0 = leaf->prev ? leaf->prev->nr_keys : WH_KPN; + const u32 n1 = leaf->nr_keys; + const u32 n2 = leaf->next ? 
leaf->next->nr_keys : WH_KPN; + + if ((leaf->prev && (n1 == 0)) || ((n0 + n1) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf->prev, leaf); + } else if ((leaf->next && (n1 == 0)) || ((n1 + n2) < WH_KPN_MRG)) { + whunsafe_meta_leaf_merge(map, leaf, leaf->next); + } +} + + bool +whunsafe_del(struct wormhole * const map, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(map->hmap, key); + const u32 im = wormleaf_match_hs(leaf, key); + if (im < WH_KPN) { // found + struct kv * const kv = wormleaf_remove_ih(leaf, im); + debug_assert(kv); + + whunsafe_del_try_merge(map, leaf); + map->mm.free(kv, map->mm.priv); + return true; + } + return false; +} + + u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + struct wormleaf * const leafa = wormhole_jump_leaf_write(ref, start); + wormleaf_sync_sorted(leafa); + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) { // do nothing if end < start + wormleaf_unlock_write(leafa); + return 0; + } + u64 ndel = iaz - ia; + struct wormhole * const map = ref->map; + wormleaf_delete_range(map, leafa, ia, iaz); + if (leafa->nr_keys > ia) { // end hit; done + wormhole_del_try_merge(ref, leafa); + return ndel; + } + + while (leafa->next) { + struct wormleaf * const leafx = leafa->next; + wormleaf_lock_write(leafx, ref); + // two leaf nodes locked + wormleaf_sync_sorted(leafx); + const u32 iz = end ? 
wormleaf_seek_end(leafx, end) : leafx->nr_keys; + ndel += iz; + wormleaf_delete_range(map, leafx, 0, iz); + if (leafx->nr_keys == 0) { // removed all + // must hold leaf1's lock for the next iteration + wormhole_meta_merge(ref, leafa, leafx, false); + } else { // partially removed; done + (void)wormhole_meta_leaf_merge(ref, leafa); + return ndel; + } + } + wormleaf_unlock_write(leafa); + return ndel; +} + + u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end) +{ + wormhole_resume(ref); + const u64 ret = wormhole_delr(ref, start, end); + wormhole_park(ref); + return ret; +} + + u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end) +{ + // first leaf + struct wormhmap * const hmap = map->hmap; + struct wormleaf * const leafa = wormhole_jump_leaf(hmap, start); + wormleaf_sync_sorted(leafa); + // last leaf + struct wormleaf * const leafz = end ? wormhole_jump_leaf(hmap, end) : NULL; + + // select start/end on leafa + const u32 ia = wormleaf_seek(leafa, start); + const u32 iaz = end ? 
wormleaf_seek_end(leafa, end) : leafa->nr_keys; + if (iaz < ia) + return 0; + + wormleaf_delete_range(map, leafa, ia, iaz); + u64 ndel = iaz - ia; + + if (leafa == leafz) { // one node only + whunsafe_del_try_merge(map, leafa); + return ndel; + } + + // 0 or more nodes between leafa and leafz + while (leafa->next != leafz) { + struct wormleaf * const leafx = leafa->next; + ndel += leafx->nr_keys; + for (u32 i = 0; i < leafx->nr_keys; i++) + map->mm.free(wormleaf_kv_at_is(leafx, i), map->mm.priv); + leafx->nr_keys = 0; + leafx->nr_sorted = 0; + whunsafe_meta_leaf_merge(map, leafa, leafx); + } + // delete the smaller keys in leafz + if (leafz) { + wormleaf_sync_sorted(leafz); + const u32 iz = wormleaf_seek_end(leafz, end); + wormleaf_delete_range(map, leafz, 0, iz); + ndel += iz; + whunsafe_del_try_merge(map, leafa); + } + return ndel; +} +// }}} del + +// iter {{{ +// safe iter: safe sort with read-lock acquired +// unsafe iter: allow concurrent seek/skip + static void +wormhole_iter_leaf_sync_sorted(struct wormleaf * const leaf) +{ + if (unlikely(leaf->nr_keys != leaf->nr_sorted)) { + spinlock_lock(&(leaf->sortlock)); + wormleaf_sync_sorted(leaf); + spinlock_unlock(&(leaf->sortlock)); + } +} + + struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = ref; + iter->map = ref->map; + iter->leaf = NULL; + iter->is = 0; + return iter; +} + + static void +wormhole_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) { + struct wormref * const ref = iter->ref; + wormleaf_lock_read(next, ref); + wormleaf_unlock_read(iter->leaf); + + wormhole_iter_leaf_sync_sorted(next); + } else { + wormleaf_unlock_read(iter->leaf); + } + iter->leaf = next; + iter->is = 0; + if 
(!wormhole_iter_valid(iter)) + return; + } +} + + void +wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + debug_assert(key); + if (iter->leaf) + wormleaf_unlock_read(iter->leaf); + + struct wormleaf * const leaf = wormhole_jump_leaf_read(iter->ref, key); + wormhole_iter_leaf_sync_sorted(leaf); + + iter->leaf = leaf; + iter->is = wormleaf_seek(leaf, key); + wormhole_iter_fix(iter); +} + + void +whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + wormhole_resume(iter->ref); + wormhole_iter_seek(iter, key); +} + + bool +wormhole_iter_valid(struct wormhole_iter * const iter) +{ + return iter->leaf != NULL; +} + + static struct kv * +wormhole_iter_current(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + debug_assert(iter->is < iter->leaf->nr_sorted); + struct kv * const kv = wormleaf_kv_at_is(iter->leaf, iter->is); + return kv; + } + return NULL; +} + + struct kv * +wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + struct kv * const ret = iter->map->mm.out(kv, out); + return ret; + } + return NULL; +} + + bool +wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + kref_ref_kv(kref, kv); + return true; + } + return false; +} + + bool +wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref) +{ + struct kv * const kv = wormhole_iter_current(iter); + if (kv) { + kvref_ref_kv(kvref, kv); + return true; + } + return false; +} + + void +wormhole_iter_skip1(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + iter->is++; + wormhole_iter_fix(iter); + } +} + + void +wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + u32 todo = nr; + while (todo && wormhole_iter_valid(iter)) { + const u32 cap = iter->leaf->nr_sorted - 
iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + wormhole_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + wormhole_iter_skip1(iter); + return ret; +} + + bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + struct kv * const kv = wormhole_iter_current(iter); + uf(kv, priv); // call uf even if (kv == NULL) + return kv != NULL; +} + + void +wormhole_iter_park(struct wormhole_iter * const iter) +{ + if (iter->leaf) { + wormleaf_unlock_read(iter->leaf); + iter->leaf = NULL; + } +} + + void +whsafe_iter_park(struct wormhole_iter * const iter) +{ + wormhole_iter_park(iter); + wormhole_park(iter->ref); +} + + void +wormhole_iter_destroy(struct wormhole_iter * const iter) +{ + if (iter->leaf) + wormleaf_unlock_read(iter->leaf); + free(iter); +} + + void +whsafe_iter_destroy(struct wormhole_iter * const iter) +{ + wormhole_park(iter->ref); + wormhole_iter_destroy(iter); +} +// }}} iter + +// unsafe iter {{{ + struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map) +{ + struct wormhole_iter * const iter = malloc(sizeof(*iter)); + if (iter == NULL) + return NULL; + iter->ref = NULL; + iter->map = map; + iter->leaf = NULL; + iter->is = 0; + whunsafe_iter_seek(iter, kref_null()); + return iter; +} + + static void +whunsafe_iter_fix(struct wormhole_iter * const iter) +{ + if (!wormhole_iter_valid(iter)) + return; + + while (unlikely(iter->is >= iter->leaf->nr_sorted)) { + struct wormleaf * const next = iter->leaf->next; + if (likely(next != NULL)) + wormhole_iter_leaf_sync_sorted(next); + iter->leaf = next; + iter->is = 0; + if (!wormhole_iter_valid(iter)) + return; + } +} + + void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key) +{ + struct wormleaf * const leaf = wormhole_jump_leaf(iter->map->hmap, 
key); + wormhole_iter_leaf_sync_sorted(leaf); + + iter->leaf = leaf; + iter->is = wormleaf_seek(leaf, key); + whunsafe_iter_fix(iter); +} + + void +whunsafe_iter_skip1(struct wormhole_iter * const iter) +{ + if (wormhole_iter_valid(iter)) { + iter->is++; + whunsafe_iter_fix(iter); + } +} + + void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + u32 todo = nr; + while (todo && wormhole_iter_valid(iter)) { + const u32 cap = iter->leaf->nr_sorted - iter->is; + const u32 nskip = (cap < todo) ? cap : todo; + iter->is += nskip; + whunsafe_iter_fix(iter); + todo -= nskip; + } +} + + struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out) +{ + struct kv * const ret = wormhole_iter_peek(iter, out); + whunsafe_iter_skip1(iter); + return ret; +} + + void +whunsafe_iter_destroy(struct wormhole_iter * const iter) +{ + free(iter); +} +// }}} unsafe iter + +// misc {{{ + struct wormref * +wormhole_ref(struct wormhole * const map) +{ + struct wormref * const ref = malloc(sizeof(*ref)); + if (ref == NULL) + return NULL; + ref->map = map; + if (qsbr_register(map->qsbr, &(ref->qref)) == false) { + free(ref); + return NULL; + } + return ref; +} + + struct wormref * +whsafe_ref(struct wormhole * const map) +{ + struct wormref * const ref = wormhole_ref(map); + if (ref) + wormhole_park(ref); + return ref; +} + + struct wormhole * +wormhole_unref(struct wormref * const ref) +{ + struct wormhole * const map = ref->map; + qsbr_unregister(map->qsbr, &(ref->qref)); + free(ref); + return map; +} + + inline void +wormhole_park(struct wormref * const ref) +{ + qsbr_park(&(ref->qref)); +} + + inline void +wormhole_resume(struct wormref * const ref) +{ + qsbr_resume(&(ref->qref)); +} + + inline void +wormhole_refresh_qstate(struct wormref * const ref) +{ + qsbr_update(&(ref->qref), wormhmap_version_load(wormhmap_load(ref->map))); +} + + static void +wormhole_clean_hmap(struct wormhole * const map) +{ + for (u32 x = 0; x < 2; x++) { + if 
(map->hmap2[x].pmap == NULL) + continue; + struct wormhmap * const hmap = &(map->hmap2[x]); + const u64 nr_slots = ((u64)(hmap->mask)) + 1; + struct wormmbkt * const pmap = hmap->pmap; + for (u64 s = 0; s < nr_slots; s++) { + struct wormmbkt * const slot = &(pmap[s]); + for (u32 i = 0; i < WH_BKT_NR; i++) + if (slot->e[i]) + wormmeta_keyref_release(slot->e[i]); + } + + slab_free_all(hmap->slab1); + slab_free_all(hmap->slab2); + memset(hmap->pmap, 0, hmap->msize); + hmap->maxplen = 0; + } +} + + static void +wormhole_free_leaf_keys(struct wormhole * const map, struct wormleaf * const leaf) +{ + const u32 nr = leaf->nr_keys; + for (u32 i = 0; i < nr; i++) { + void * const curr = wormleaf_kv_at_is(leaf, i); + debug_assert(curr); + map->mm.free(curr, map->mm.priv); + } + wormhole_free_akey(leaf->anchor); +} + + static void +wormhole_clean_helper(struct wormhole * const map) +{ + wormhole_clean_hmap(map); + for (struct wormleaf * leaf = map->leaf0; leaf; leaf = leaf->next) + wormhole_free_leaf_keys(map, leaf); + slab_free_all(map->slab_leaf); + map->leaf0 = NULL; +} + +// unsafe + void +wormhole_clean(struct wormhole * const map) +{ + wormhole_clean_helper(map); + wormhole_create_leaf0(map); +} + + void +wormhole_destroy(struct wormhole * const map) +{ + wormhole_clean_helper(map); + for (u32 i = 0; i < 2; i++) { + struct wormhmap * const hmap = &map->hmap2[i]; + if (hmap->slab1) + slab_destroy(hmap->slab1); + if (hmap->slab2) + slab_destroy(hmap->slab2); + wormhmap_deinit(hmap); + } + qsbr_destroy(map->qsbr); + slab_destroy(map->slab_leaf); + free(map->pbuf); + free(map); +} + + void +wormhole_fprint(struct wormhole * const map, FILE * const out) +{ + const u64 nr_slab_ul = slab_get_nalloc(map->slab_leaf); + const u64 nr_slab_um11 = slab_get_nalloc(map->hmap2[0].slab1); + const u64 nr_slab_um12 = slab_get_nalloc(map->hmap2[0].slab2); + const u64 nr_slab_um21 = map->hmap2[1].slab1 ? 
slab_get_nalloc(map->hmap2[1].slab1) : 0; + const u64 nr_slab_um22 = map->hmap2[1].slab2 ? slab_get_nalloc(map->hmap2[1].slab2) : 0; + fprintf(out, "%s L-SLAB %lu M-SLAB [0] %lu+%lu [1] %lu+%lu\n", + __func__, nr_slab_ul, nr_slab_um11, nr_slab_um12, nr_slab_um21, nr_slab_um22); +} +// }}} misc + +// api {{{ +const struct kvmap_api kvmap_api_wormhole = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .refpark = true, + .put = (void *)wormhole_put, + .get = (void *)wormhole_get, + .probe = (void *)wormhole_probe, + .del = (void *)wormhole_del, + .inpr = (void *)wormhole_inpr, + .inpw = (void *)wormhole_inpw, + .merge = (void *)wormhole_merge, + .delr = (void *)wormhole_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)wormhole_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)wormhole_iter_park, + .iter_destroy = (void *)wormhole_iter_destroy, + .ref = (void *)wormhole_ref, + .unref = (void *)wormhole_unref, + .park = (void *)wormhole_park, + .resume = (void *)wormhole_resume, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whsafe = { + .hashkey = true, + .ordered = true, + .threadsafe = true, + .unique = true, + .put = (void *)whsafe_put, + .get = (void *)whsafe_get, + .probe = (void *)whsafe_probe, + .del = (void *)whsafe_del, + .inpr = (void *)whsafe_inpr, + .inpw = (void *)whsafe_inpw, + .merge = (void *)whsafe_merge, + .delr = (void *)whsafe_delr, + .iter_create = (void *)wormhole_iter_create, + .iter_seek = (void *)whsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, 
+ .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)wormhole_iter_skip1, + .iter_skip = (void *)wormhole_iter_skip, + .iter_next = (void *)wormhole_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_park = (void *)whsafe_iter_park, + .iter_destroy = (void *)whsafe_iter_destroy, + .ref = (void *)whsafe_ref, + .unref = (void *)wormhole_unref, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + +const struct kvmap_api kvmap_api_whunsafe = { + .hashkey = true, + .ordered = true, + .unique = true, + .put = (void *)whunsafe_put, + .get = (void *)whunsafe_get, + .probe = (void *)whunsafe_probe, + .del = (void *)whunsafe_del, + .inpr = (void *)whunsafe_inp, + .inpw = (void *)whunsafe_inp, + .merge = (void *)whunsafe_merge, + .delr = (void *)whunsafe_delr, + .iter_create = (void *)whunsafe_iter_create, + .iter_seek = (void *)whunsafe_iter_seek, + .iter_valid = (void *)wormhole_iter_valid, + .iter_peek = (void *)wormhole_iter_peek, + .iter_kref = (void *)wormhole_iter_kref, + .iter_kvref = (void *)wormhole_iter_kvref, + .iter_skip1 = (void *)whunsafe_iter_skip1, + .iter_skip = (void *)whunsafe_iter_skip, + .iter_next = (void *)whunsafe_iter_next, + .iter_inp = (void *)wormhole_iter_inp, + .iter_destroy = (void *)whunsafe_iter_destroy, + .clean = (void *)wormhole_clean, + .destroy = (void *)wormhole_destroy, + .fprint = (void *)wormhole_fprint, +}; + + static void * +wormhole_kvmap_api_create(const char * const name, const struct kvmap_mm * const mm, char ** args) +{ + (void)args; + if ((!strcmp(name, "wormhole")) || (!strcmp(name, "whsafe"))) { + return wormhole_create(mm); + } else if (!strcmp(name, "whunsafe")) { + return whunsafe_create(mm); + } else { + return NULL; + } +} + +__attribute__((constructor)) + static void +wormhole_kvmap_api_init(void) +{ + kvmap_api_register(0, "wormhole", "", 
wormhole_kvmap_api_create, &kvmap_api_wormhole); + kvmap_api_register(0, "whsafe", "", wormhole_kvmap_api_create, &kvmap_api_whsafe); + kvmap_api_register(0, "whunsafe", "", wormhole_kvmap_api_create, &kvmap_api_whunsafe); +} +// }}} api + +// wh {{{ +// Users often don't enjoy dealing with struct kv/kref and just want to use plain buffers. +// No problem! +// This example library shows you how to use Wormhole efficiently in the most intuitive way. + +// Use the worry-free api +static const struct kvmap_api * const wh_api = &kvmap_api_whsafe; + +// You can change the wh_api to kvmap_api_wormhole with a one-line replacement +// The standard Wormhole api can give you ~5% boost; read README for thread-safety tips +//static const struct kvmap_api * const wh_api = &kvmap_api_wormhole; + + struct wormhole * +wh_create(void) +{ + // kvmap_mm_ndf (kv.h) will let the caller allocate the kv when inserting + // This can avoid a memcpy if the caller does not have the data in a struct kv + return wormhole_create(&kvmap_mm_ndf); +} + + struct wormref * +wh_ref(struct wormhole * const wh) +{ + return wh_api->ref(wh); +} + + void +wh_unref(struct wormref * const ref) +{ + (void)wh_api->unref(ref); +} + + void +wh_park(struct wormref * const ref) +{ + if (wh_api->park) + wh_api->park(ref); +} + + void +wh_resume(struct wormref * const ref) +{ + if (wh_api->resume) + wh_api->resume(ref); +} + + void +wh_clean(struct wormhole * const map) +{ + wh_api->clean(map); +} + + void +wh_destroy(struct wormhole * const map) +{ + wh_api->destroy(map); +} + +// Do set/put with explicit kv buffers + bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen) +{ + struct kv * const newkv = kv_create(kbuf, klen, vbuf, vlen); + if (newkv == NULL) + return false; + // must use with kvmap_mm_ndf (see below) + // the newkv will be saved in the Wormhole and freed by Wormhole when upon deletion + return wh_api->put(ref, newkv); +} + +// 
delete a key + bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->del(ref, &kref); +} + +// test if the key exist in Wormhole + bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->probe(ref, &kref); +} + +// for wh_get() +struct wh_inp_info { void * vbuf_out; u32 * vlen_out; u32 vbuf_size; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecesary memory copying + static void +wh_inp_copy_value(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_inp_info * const info = (typeof(info))priv; + // copy the value data out + const u32 copy_size = info->vbuf_size < curr->vlen ? info->vbuf_size : curr->vlen; + memcpy(info->vbuf_out, kv_vptr_c(curr), copy_size); + // copy the vlen out + *info->vlen_out = curr->vlen; + } +} + +// returns a boolean value indicating whether the key is found. 
+// the value's data will be written to *vlen_out and vbuf_out if the key is found +// if vbuf_size < vlen, then only the first vbuf_size bytes is copied to the buffer +// a small vbuf_size can be used to reduce memcpy cost when only the first a few bytes are needed + bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + struct wh_inp_info info = {vbuf_out, vlen_out, vbuf_size}; + // use the inplace read function to get the value if it exists + return wh_api->inpr(ref, &kref, wh_inp_copy_value, &info); +} + + bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpr(ref, &kref, uf, priv); +} + +// inplace update KV's value with a user-defined hook function +// the update should only modify the data in the value; It should not change the value size + bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->inpw(ref, &kref, uf, priv); +} + +// merge existing KV with updates with a user-defined hook function + bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + return wh_api->merge(ref, &kref, uf, priv); +} + +// remove a range of KVs from start (inclusive) to end (exclusive); [start, end) + u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end) +{ + struct kref kref_start, kref_end; + kref_ref_hash32(&kref_start, kbuf_start, klen_start); + kref_ref_hash32(&kref_end, kbuf_end, klen_end); + return 
wh_api->delr(ref, &kref_start, &kref_end); +} + + struct wormhole_iter * +wh_iter_create(struct wormref * const ref) +{ + return wh_api->iter_create(ref); +} + + void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen) +{ + struct kref kref; + kref_ref_hash32(&kref, kbuf, klen); + wh_api->iter_seek(iter, &kref); +} + + bool +wh_iter_valid(struct wormhole_iter * const iter) +{ + return wh_api->iter_valid(iter); +} + +// for wh_iter_peek() +// the out ptrs must be provided in pairs; use a pair of NULLs to ignore the key or value +struct wh_iter_inp_info { void * kbuf_out; void * vbuf_out; u32 kbuf_size; u32 vbuf_size; u32 * klen_out; u32 * vlen_out; }; + +// a kv_inp_func; use this to retrieve the KV's data without unnecesary memory copying + static void +inp_copy_kv_cb(struct kv * const curr, void * const priv) +{ + if (curr) { // found + struct wh_iter_inp_info * const info = (typeof(info))priv; + + // copy the key + if (info->kbuf_out) { // it assumes klen_out is also not NULL + // copy the key data out + const u32 clen = curr->klen < info->kbuf_size ? curr->klen : info->kbuf_size; + memcpy(info->kbuf_out, kv_kptr_c(curr), clen); + // copy the klen out + *info->klen_out = curr->klen; + } + + // copy the value + if (info->vbuf_out) { // it assumes vlen_out is also not NULL + // copy the value data out + const u32 clen = curr->vlen < info->vbuf_size ? 
curr->vlen : info->vbuf_size; + memcpy(info->vbuf_out, kv_vptr_c(curr), clen); + // copy the vlen out + *info->vlen_out = curr->vlen; + } + } +} + +// seek is similar to get + bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out) +{ + struct wh_iter_inp_info info = {kbuf_out, vbuf_out, kbuf_size, vbuf_size, klen_out, vlen_out}; + return wh_api->iter_inp(iter, inp_copy_kv_cb, &info); +} + + void +wh_iter_skip1(struct wormhole_iter * const iter) +{ + wh_api->iter_skip1(iter); +} + + void +wh_iter_skip(struct wormhole_iter * const iter, const u32 nr) +{ + wh_api->iter_skip(iter, nr); +} + + bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv) +{ + return wh_api->iter_inp(iter, uf, priv); +} + + void +wh_iter_park(struct wormhole_iter * const iter) +{ + wh_api->iter_park(iter); +} + + void +wh_iter_destroy(struct wormhole_iter * const iter) +{ + wh_api->iter_destroy(iter); +} +// }}} wh + +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/wh.h b/test/MassTrie-beta/wormhole/wh.h new file mode 100644 index 00000000..bd17b38d --- /dev/null +++ b/test/MassTrie-beta/wormhole/wh.h @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2016--2021 Wu, Xingbo + * + * All rights reserved. No warranty, explicit or implicit, provided. + */ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +struct wormhole; +struct wormref; + +// wormhole {{{ +// the wh created by wormhole_create() can work with all of safe/unsafe operations. + extern struct wormhole * +wormhole_create(const struct kvmap_mm * const mm); + +// the wh created by whunsafe_create() can only work with the unsafe operations. 
+ extern struct wormhole * +whunsafe_create(const struct kvmap_mm * const mm); + + extern struct kv * +wormhole_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +wormhole_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +wormhole_put(struct wormref * const ref, struct kv * const kv); + + extern bool +wormhole_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +wormhole_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +wormhole_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +wormhole_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +wormhole_iter_create(struct wormref * const ref); + + extern void +wormhole_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern bool +wormhole_iter_valid(struct wormhole_iter * const iter); + + extern struct kv * +wormhole_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_kref(struct wormhole_iter * const iter, struct kref * const kref); + + extern bool +wormhole_iter_kvref(struct wormhole_iter * const iter, struct kvref * const kvref); + + extern void +wormhole_iter_skip1(struct wormhole_iter * const iter); + + extern void +wormhole_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +wormhole_iter_next(struct wormhole_iter * const iter, struct kv * const out); + + extern bool +wormhole_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wormhole_iter_park(struct wormhole_iter * const iter); + + extern void 
+wormhole_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +wormhole_ref(struct wormhole * const map); + + extern struct wormhole * +wormhole_unref(struct wormref * const ref); + + extern void +wormhole_park(struct wormref * const ref); + + extern void +wormhole_resume(struct wormref * const ref); + + extern void +wormhole_refresh_qstate(struct wormref * const ref); + +// clean with more threads + extern void +wormhole_clean_th(struct wormhole * const map, const u32 nr_threads); + + extern void +wormhole_clean(struct wormhole * const map); + + extern void +wormhole_destroy(struct wormhole * const map); + +// safe API (no need to refresh qstate) + + extern struct kv * +whsafe_get(struct wormref * const ref, const struct kref * const key, struct kv * const out); + + extern bool +whsafe_probe(struct wormref * const ref, const struct kref * const key); + + extern bool +whsafe_put(struct wormref * const ref, struct kv * const kv); + + extern bool +whsafe_merge(struct wormref * const ref, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whsafe_inpr(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_inpw(struct wormref * const ref, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whsafe_del(struct wormref * const ref, const struct kref * const key); + + extern u64 +whsafe_delr(struct wormref * const ref, const struct kref * const start, + const struct kref * const end); + +// use wormhole_iter_create + extern void +whsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + + extern struct kv * +whsafe_iter_peek(struct wormhole_iter * const iter, struct kv * const out); + +// use wormhole_iter_valid +// use wormhole_iter_peek +// use wormhole_iter_kref +// use wormhole_iter_kvref +// use wormhole_iter_skip1 +// use wormhole_iter_skip +// use wormhole_iter_next +// use 
wormhole_iter_inp + + extern void +whsafe_iter_park(struct wormhole_iter * const iter); + + extern void +whsafe_iter_destroy(struct wormhole_iter * const iter); + + extern struct wormref * +whsafe_ref(struct wormhole * const map); + +// use wormhole_unref + +// unsafe API + + extern struct kv * +whunsafe_get(struct wormhole * const map, const struct kref * const key, struct kv * const out); + + extern bool +whunsafe_probe(struct wormhole * const map, const struct kref * const key); + + extern bool +whunsafe_put(struct wormhole * const map, struct kv * const kv); + + extern bool +whunsafe_merge(struct wormhole * const map, const struct kref * const kref, + kv_merge_func uf, void * const priv); + + extern bool +whunsafe_inp(struct wormhole * const map, const struct kref * const key, + kv_inp_func uf, void * const priv); + + extern bool +whunsafe_del(struct wormhole * const map, const struct kref * const key); + + extern u64 +whunsafe_delr(struct wormhole * const map, const struct kref * const start, + const struct kref * const end); + + extern struct wormhole_iter * +whunsafe_iter_create(struct wormhole * const map); + + extern void +whunsafe_iter_seek(struct wormhole_iter * const iter, const struct kref * const key); + +// unsafe iter_valid: use wormhole_iter_valid +// unsafe iter_peek: use wormhole_iter_peek +// unsafe iter_kref: use wormhole_iter_kref + + extern void +whunsafe_iter_skip1(struct wormhole_iter * const iter); + + extern void +whunsafe_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern struct kv * +whunsafe_iter_next(struct wormhole_iter * const iter, struct kv * const out); + +// unsafe iter_inp: use wormhole_iter_inp + + extern void +whunsafe_iter_destroy(struct wormhole_iter * const iter); + + extern void +wormhole_fprint(struct wormhole * const map, FILE * const out); + +extern const struct kvmap_api kvmap_api_wormhole; +extern const struct kvmap_api kvmap_api_whsafe; +extern const struct kvmap_api kvmap_api_whunsafe; +// }}} 
wormhole + +// wh {{{ + extern struct wormhole * +wh_create(void); + + extern struct wormref * +wh_ref(struct wormhole * const wh); + + extern void +wh_unref(struct wormref * const ref); + + extern void +wh_park(struct wormref * const ref); + + extern void +wh_resume(struct wormref * const ref); + + extern void +wh_clean(struct wormhole * const map); + + extern void +wh_destroy(struct wormhole * const map); + + extern bool +wh_put(struct wormref * const ref, const void * const kbuf, const u32 klen, + const void * const vbuf, const u32 vlen); + + extern bool +wh_del(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_probe(struct wormref * const ref, const void * const kbuf, const u32 klen); + + extern bool +wh_get(struct wormref * const ref, const void * const kbuf, const u32 klen, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern bool +wh_inpr(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_inpw(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_inp_func uf, void * const priv); + + extern bool +wh_merge(struct wormref * const ref, const void * const kbuf, const u32 klen, + kv_merge_func uf, void * const priv); + + extern u64 +wh_delr(struct wormref * const ref, const void * const kbuf_start, const u32 klen_start, + const void * const kbuf_end, const u32 klen_end); + + extern struct wormhole_iter * +wh_iter_create(struct wormref * const ref); + + extern void +wh_iter_seek(struct wormhole_iter * const iter, const void * const kbuf, const u32 klen); + + extern bool +wh_iter_valid(struct wormhole_iter * const iter); + + extern bool +wh_iter_peek(struct wormhole_iter * const iter, + void * const kbuf_out, const u32 kbuf_size, u32 * const klen_out, + void * const vbuf_out, const u32 vbuf_size, u32 * const vlen_out); + + extern void +wh_iter_skip1(struct wormhole_iter * const iter); + + extern void 
+wh_iter_skip(struct wormhole_iter * const iter, const u32 nr); + + extern bool +wh_iter_inp(struct wormhole_iter * const iter, kv_inp_func uf, void * const priv); + + extern void +wh_iter_park(struct wormhole_iter * const iter); + + extern void +wh_iter_destroy(struct wormhole_iter * const iter); +// }}} wh + +#ifdef __cplusplus +} +#endif +// vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/wh.py b/test/MassTrie-beta/wormhole/wh.py new file mode 100644 index 00000000..e744cec8 --- /dev/null +++ b/test/MassTrie-beta/wormhole/wh.py @@ -0,0 +1,192 @@ +#!/usr/bin/python3 + +# +# Copyright (c) 2016--2021 Wu, Xingbo +# +# All rights reserved. No warranty, explicit or implicit, provided. +# + +import msgpack +from ctypes import * # CDLL and c_xxx types + +# libwh {{{ +# Change this path when necessary +libwh = CDLL("./libwh.so") + +# create +libwh.wh_create.argtypes = [] +libwh.wh_create.restype = c_void_p + +# close (no return value) +libwh.wh_destroy.argtypes = [c_void_p] + +# ref +libwh.wh_ref.argtypes = [c_void_p] +libwh.wh_ref.restype = c_void_p + +# unref +libwh.wh_unref.argtypes = [c_void_p] + +# put +libwh.wh_put.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint] +libwh.wh_put.restype = c_bool + +# get +libwh.wh_get.argtypes = [c_void_p, c_char_p, c_uint, c_char_p, c_uint, c_void_p] +libwh.wh_get.restype = c_bool + +# probe +libwh.wh_probe.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_probe.restype = c_bool + +# del +libwh.wh_del.argtypes = [c_void_p, c_char_p, c_uint] +libwh.wh_del.restype = c_bool + +# iter_create +libwh.wh_iter_create.argtypes = [c_void_p] +libwh.wh_iter_create.restype = c_void_p + +# iter_seek +libwh.wh_iter_seek.argtypes = [c_void_p, c_char_p, c_uint] + +# iter_valid +libwh.wh_iter_valid.argtypes = [c_void_p] +libwh.wh_iter_valid.restype = c_bool + +# iter_skip1 +libwh.wh_iter_skip1.argtypes = [c_void_p] + +# iter_skip +libwh.wh_iter_skip.argtypes = [c_void_p, c_uint] + +# iter_peek +libwh.wh_iter_peek.argtypes = 
[c_void_p, c_char_p, c_uint, c_void_p, c_char_p, c_uint, c_void_p] +libwh.wh_iter_peek.restype = c_bool + +# iter_park +libwh.wh_iter_park.argtypes = [c_void_p] + +# iter_destroy +libwh.wh_iter_destroy.argtypes = [c_void_p] +# }}} libwh + +# class {{{ +class Wh: + def __init__(self, maxklen=256, maxvlen=8192): + self.whptr = libwh.wh_create() + self.kbufsz = maxklen + self.vbufsz = maxvlen + + # user must call explicitly + def destroy(self): + libwh.wh_destroy(self.whptr) + + def ref(self): + return WhRef(self.whptr, self.kbufsz, self.vbufsz) + +class WhRef: + def __init__(self, whptr, kbufsz, vbufsz): + self.refptr = libwh.wh_ref(whptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.vbuf = create_string_buffer(self.vbufsz) + + # user must call explicitly + def unref(self): + libwh.wh_unref(self.refptr) + + def iter(self): + return WhIter(self.refptr, self.kbufsz, self.vbufsz) + + # key: python string; value: any (hierarchical) python object + def put(self, key, value): + binkey = key.encode() + binvalue = msgpack.packb(value) + return libwh.wh_put(self.refptr, binkey, c_uint(len(binkey)), binvalue, c_uint(len(binvalue))) + + # return the value as a python object + def get(self, key): + binkey = key.encode() + vlen = c_uint() + ret = libwh.wh_get(self.refptr, binkey, len(binkey), self.vbuf, self.vbufsz, byref(vlen)) + if ret and vlen.value <= self.vbufsz: + return msgpack.unpackb(self.vbuf.value) + else: + return None + + def delete(self, key): + binkey = key.encode() + return libwh.wh_del(self.refptr, binkey, c_uint(len(binkey))) + + def probe(self, key): + binkey = key.encode() + return libwh.wh_probe(self.refptr, binkey, c_uint(len(binkey))) + +class WhIter: + def __init__(self, refptr, kbufsz, vbufsz): + self.iptr = libwh.wh_iter_create(refptr) + self.kbufsz = kbufsz + self.vbufsz = vbufsz + self.kbuf = create_string_buffer(kbufsz) + self.vbuf = create_string_buffer(vbufsz) + + # user must call explicitly + def destroy(self): + 
libwh.wh_iter_destroy(self.iptr) + + def seek(self, key): + if key is None: + libwh.wh_iter_seek(self.iptr, None, c_uint(0)) + else: + binkey = key.encode() + libwh.wh_iter_seek(self.iptr, binkey, c_uint(len(binkey))) + + def valid(self): + return libwh.wh_iter_valid(self.iptr) + + def skip1(self): + libwh.wh_iter_skip1(self.iptr) + + def skip(self, nr): + libwh.wh_iter_skip(self.iptr, c_uint(nr)) + + # return (key, value) pair or None + def peek(self): + klen = c_uint() + vlen = c_uint() + ret = libwh.wh_iter_peek(self.iptr, self.kbuf, self.kbufsz, byref(klen), self.vbuf, self.vbufsz, byref(vlen)) + if ret and klen.value <= self.kbufsz and vlen.value <= self.vbufsz: + self.kbuf[klen.value] = b'\x00' + return (self.kbuf.value.decode(), klen.value, msgpack.unpackb(self.vbuf.value), vlen.value) + else: + return None + +# }}} class + +# examples +wh1 = Wh(32, 1024) +ref1 = wh1.ref() # take a ref for kv operations + +ref1.put("Hello", "pywh") +ref1.put("key1", "value1") +ref1.put("key2", "value2") +ref1.put("key3", {"xxx":"valuex", "yyy":"valuey"}) +ref1.delete("key2") + +rget = ref1.get("Hello") +print(rget) + +# don't use ref when iterating +iter1 = ref1.iter() +iter1.seek(None) +while iter1.valid(): + r = iter1.peek() + print(r) + iter1.skip1() + +iter1.destroy() # must destroy all iters before unref +ref1.unref() # must unref all refs before close() +wh1.destroy() + +# vim:fdm=marker diff --git a/test/MassTrie-beta/wormhole/wh.strip b/test/MassTrie-beta/wormhole/wh.strip new file mode 100644 index 00000000..e7b3971f --- /dev/null +++ b/test/MassTrie-beta/wormhole/wh.strip @@ -0,0 +1,161 @@ +-K key_size +-K key_size_align +-K kref_compare +-K kref_kv_compare +-K kref_kv_match +-K kref_lcp +-K kref_match +-K kref_null +-K kref_ref_hash32 +-K kref_ref_kv +-K kref_ref_kv_hash32 +-K kref_ref_raw +-K kref_update_hash32 +-K kv_compare +-K kv_compare_ptrs +-K kv_crc32c +-K kv_crc32c_extend +-K kv_create +-K kv_create_kref +-K kv_create_str +-K kv_create_str_str +-K kv_dup 
+-K kv_dup2 +-K kv_dup2_key +-K kv_dup2_key_prefix +-K kv_dup_key +-K kv_key_lcp +-K kv_kptr +-K kv_kptr_c +-K kv_kref +-K kvmap_api_whsafe +-K kvmap_api_whunsafe +-K kvmap_api_wormhole +-K kvmap_dump_keys +-K kvmap_inp_steal_kv +-K kvmap_kv_del +-K kvmap_kv_delr +-K kvmap_kv_get +-K kvmap_kv_inpr +-K kvmap_kv_inpw +-K kvmap_kv_iter_seek +-K kvmap_kv_merge +-K kvmap_kv_probe +-K kvmap_kv_put +-K kvmap_mm_dup +-K kvmap_mm_free_free +-K kvmap_mm_free_noop +-K kvmap_mm_in_dup +-K kvmap_mm_in_noop +-K kvmap_mm_ndf +-K kvmap_mm_out_dup +-K kvmap_mm_out_noop +-K kvmap_raw_del +-K kvmap_raw_get +-K kvmap_raw_inpr +-K kvmap_raw_inpw +-K kvmap_raw_iter_seek +-K kvmap_raw_probe +-K kvmap_ref +-K kvmap_unref +-K kv_match +-K kv_match_full +-K kv_null +-K kv_print +-K kv_qsort +-K kvref_dup2_key +-K kvref_dup2_kv +-K kv_refill +-K kv_refill_hex32 +-K kv_refill_hex64 +-K kv_refill_hex64_klen +-K kv_refill_kref +-K kv_refill_kref_v +-K kv_refill_str +-K kv_refill_str_str +-K kv_refill_u64 +-K kv_refill_value +-K kvref_kv_compare +-K kvref_ref_kv +-K kv_size +-K kv_size_align +-K kv_update_hash +-K kv_vptr +-K kv_vptr_c +-K wh_clean +-K wh_create +-K wh_del +-K wh_delr +-K wh_destroy +-K wh_get +-K wh_inpr +-K wh_inpw +-K wh_iter_create +-K wh_iter_destroy +-K wh_iter_inp +-K wh_iter_park +-K wh_iter_peek +-K wh_iter_seek +-K wh_iter_skip +-K wh_iter_valid +-K wh_merge +-K wh_park +-K wh_probe +-K wh_ref +-K wh_resume +-K whsafe_del +-K whsafe_delr +-K whsafe_get +-K whsafe_inpr +-K whsafe_inpw +-K whsafe_iter_destroy +-K whsafe_iter_park +-K whsafe_iter_seek +-K whsafe_merge +-K whsafe_probe +-K whsafe_ref +-K whsafe_put +-K wh_put +-K wh_unref +-K whunsafe_create +-K whunsafe_del +-K whunsafe_delr +-K whunsafe_get +-K whunsafe_inp +-K whunsafe_iter_create +-K whunsafe_iter_destroy +-K whunsafe_iter_next +-K whunsafe_iter_seek +-K whunsafe_iter_skip +-K whunsafe_merge +-K whunsafe_probe +-K whunsafe_put +-K wormhole_clean +-K wormhole_create +-K wormhole_del +-K wormhole_delr 
+-K wormhole_destroy +-K wormhole_fprint +-K wormhole_get +-K wormhole_inpr +-K wormhole_inpw +-K wormhole_iter_create +-K wormhole_iter_destroy +-K wormhole_iter_inp +-K wormhole_iter_kref +-K wormhole_iter_kvref +-K wormhole_iter_next +-K wormhole_iter_park +-K wormhole_iter_peek +-K wormhole_iter_seek +-K wormhole_iter_skip +-K wormhole_iter_valid +-K wormhole_kvmap_api_create +-K wormhole_merge +-K wormhole_park +-K wormhole_probe +-K wormhole_ref +-K wormhole_refresh_qstate +-K wormhole_resume +-K wormhole_put +-K wormhole_unref diff --git a/test/unit-dboindex.cc b/test/unit-dboindex.cc index 86f5cf6b..f5ddd1dc 100644 --- a/test/unit-dboindex.cc +++ b/test/unit-dboindex.cc @@ -1,781 +1,1671 @@ #include "DB_index.hh" + #include "DB_structs.hh" + #include "DB_params.hh" +#include + + + struct coarse_grained_row { + enum class NamedColumn : int { aa = 0, bb, cc }; + + uint64_t aa; + uint64_t bb; + uint64_t cc; + + coarse_grained_row() : aa(), bb(), cc() {} + + coarse_grained_row(uint64_t a, uint64_t b, uint64_t c) + : aa(a), bb(b), cc(c) {} + }; + + struct key_type { + uint64_t id; + + explicit key_type(uint64_t key) : id(bench::bswap(key)) {} + operator lcdf::Str() const { + return lcdf::Str((const char *)this, sizeof(*this)); + } + }; + + // using example_row from VersionSelector.hh + + namespace bench { + + template <> + struct SplitParams { + using split_type_list = std::tuple; + using layout_type = typename SplitMvObjectBuilder::type; + static constexpr size_t num_splits = std::tuple_size::value; + + static constexpr auto split_builder = std::make_tuple( + [](const coarse_grained_row& in) -> coarse_grained_row { + coarse_grained_row out; + out.aa = in.aa; + out.bb = in.bb; + out.cc = in.cc; + return out; + } + ); + + static constexpr auto split_merger = std::make_tuple( + [](coarse_grained_row* out, const coarse_grained_row& in) -> void { + out->aa = in.aa; + out->bb = in.bb; + out->cc = in.cc; + } + ); + + static constexpr auto map = [](int col_n) -> int { 
+ (void)col_n; + return 0; + }; + }; + + template + class RecordAccessor { + public: + const uint64_t& aa() const { + return impl().aa_impl(); + } + + const uint64_t& bb() const { + return impl().bb_impl(); + } + + const uint64_t& cc() const { + return impl().cc_impl(); + } + + void copy_into(coarse_grained_row* dst) const { + return impl().copy_into_impl(dst); + } + + private: + const A& impl() const { + return *static_cast(this); + } + }; + + template <> + class UniRecordAccessor : public RecordAccessor, coarse_grained_row> { + public: + UniRecordAccessor(const coarse_grained_row* const vptr) : vptr_(vptr) {} + + private: + const uint64_t& aa_impl() const { + return vptr_->aa; + } + + const uint64_t& bb_impl() const { + return vptr_->bb; + } + + const uint64_t& cc_impl() const { + return vptr_->cc; + } + + void copy_into_impl(coarse_grained_row* dst) const { + if (vptr_) { + dst->aa = vptr_->aa; + dst->bb = vptr_->bb; + dst->cc = vptr_->cc; + } + } + + const coarse_grained_row* vptr_; + friend RecordAccessor, coarse_grained_row>; + }; + + template <> + class SplitRecordAccessor : public RecordAccessor, coarse_grained_row> { + public: + static constexpr size_t num_splits = SplitParams::num_splits; + + SplitRecordAccessor(const std::array& vptrs) + : vptr_0_(reinterpret_cast(vptrs[0])) {} + + private: + const uint64_t& aa_impl() const { + return vptr_0_->aa; + } + + const uint64_t& bb_impl() const { + return vptr_0_->bb; + } + + const uint64_t& cc_impl() const { + return vptr_0_->cc; + } + + void copy_into_impl(coarse_grained_row* dst) const { + if (vptr_0_) { + dst->aa = vptr_0_->aa; + dst->bb = vptr_0_->bb; + dst->cc = vptr_0_->cc; + } + } + + const coarse_grained_row* vptr_0_; + + friend RecordAccessor, coarse_grained_row>; + }; + + template <> + struct SplitParams { + using split_type_list = std::tuple; + using layout_type = typename SplitMvObjectBuilder::type; + static constexpr size_t num_splits = std::tuple_size::value; + + static constexpr auto 
split_builder = std::make_tuple( + [](const example_row& in) -> example_row { + example_row out; + out.d_ytd = in.d_ytd; + out.d_payment_cnt = in.d_payment_cnt; + out.d_date = in.d_date; + out.d_tax = in.d_tax; + out.d_next_oid = in.d_next_oid; + return out; + } + ); + + static constexpr auto split_merger = std::make_tuple( + [](example_row* out, const example_row& in) -> void { + out->d_ytd = in.d_ytd; + out->d_payment_cnt = in.d_payment_cnt; + out->d_date = in.d_date; + out->d_tax = in.d_tax; + out->d_next_oid = in.d_next_oid; + } + ); + + static constexpr auto map = [](int col_n) -> int { + (void)col_n; + return 0; + }; + }; + + template + class RecordAccessor { + public: + const uint32_t& d_ytd() const { + return impl().d_ytd_impl(); + } + + const uint32_t& d_payment_cnt() const { + return impl().d_payment_cnt_impl(); + } + + const uint32_t& d_date() const { + return impl().d_date_impl(); + } + + const uint32_t& d_tax() const { + return impl().d_tax_impl(); + } + + const uint32_t& d_next_oid() const { + return impl().d_next_oid_impl(); + } + + void copy_into(example_row* dst) const { + return impl().copy_into_impl(dst); + } + + private: + const A& impl() const { + return *static_cast(this); + } + }; + + template <> + class UniRecordAccessor : public RecordAccessor, example_row> { + public: + UniRecordAccessor(const example_row* const vptr) : vptr_(vptr) {} + + private: + const uint32_t& d_ytd_impl() const { + return vptr_->d_ytd; + } + + const uint32_t& d_payment_cnt_impl() const { + return vptr_->d_payment_cnt; + } + + const uint32_t& d_date_impl() const { + return vptr_->d_date; + } + + const uint32_t& d_tax_impl() const { + return vptr_->d_tax; + } + + const uint32_t& d_next_oid_impl() const { + return vptr_->d_next_oid; + } + + void copy_into_impl(example_row* dst) const { + if (vptr_) { + dst->d_ytd = vptr_->d_ytd; + dst->d_payment_cnt = vptr_->d_payment_cnt; + dst->d_date = vptr_->d_date; + dst->d_tax = vptr_->d_tax; + dst->d_next_oid = vptr_->d_next_oid; 
+ } + } + + const example_row* vptr_; + friend RecordAccessor, example_row>; + }; -template <> + + +template <> + class SplitRecordAccessor : public RecordAccessor, example_row> { + public: + static constexpr size_t num_splits = SplitParams::num_splits; + + SplitRecordAccessor(const std::array& vptrs) + : vptr_0_(reinterpret_cast(vptrs[0])) {} + + private: + const uint32_t& d_ytd_impl() const { + return vptr_0_->d_ytd; + } + + const uint32_t& d_payment_cnt_impl() const { + return vptr_0_->d_payment_cnt; + } + + const uint32_t& d_date_impl() const { + return vptr_0_->d_date; + } + + const uint32_t& d_tax_impl() const { + return vptr_0_->d_tax; + } + + const uint32_t& d_next_oid_impl() const { + return vptr_0_->d_next_oid; + } + + void copy_into_impl(example_row* dst) const { + if (vptr_0_) { + dst->d_ytd = vptr_0_->d_ytd; + dst->d_payment_cnt = vptr_0_->d_payment_cnt; + dst->d_date = vptr_0_->d_date; + dst->d_tax = vptr_0_->d_tax; + dst->d_next_oid = vptr_0_->d_next_oid; + } + } + + const example_row* vptr_0_; + + friend RecordAccessor, example_row>; + }; + + }; // namespace bench + + using CoarseIndex = bench::ordered_index; + using FineIndex = bench::ordered_index; + using access_t = bench::access_t; + using RowAccess = bench::RowAccess; + + using MVIndex = bench::mvcc_ordered_index; + + template + void init_cindex(IndexType& ci) { + for (uint64_t i = 1; i <= 10; ++i) + ci.nontrans_put(key_type(i), coarse_grained_row(i, i, i)); + } + + void init_findex(FineIndex& fi) { + example_row row; + row.d_ytd = 3000; + row.d_tax = 10; + row.d_date = 200; + row.d_next_oid = 100; + row.d_payment_cnt = 50; + + for (uint64_t i = 1; i <= 10; ++i) + fi.nontrans_put(key_type(i), row); + } + + void test_coarse_basic() { + typedef CoarseIndex::NamedColumn nc; + CoarseIndex ci; + ci.thread_init(); + + init_cindex(ci); + + { + TestTransaction t(0); + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + (void) row; + assert(success && 
found); + assert(value.aa() == 1); + assert(t.try_commit()); + } + + { + TestTransaction t(0); + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::update}}); + (void) row; + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->aa = 2; + ci.update_row(row, new_row); + assert(t.try_commit()); + } + + { + TestTransaction t1(1); + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + (void) row; + assert(success && found); + assert(value.aa() == 2); + assert(t1.try_commit()); + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_coarse_read_my_split() { + typedef CoarseIndex::NamedColumn nc; + CoarseIndex ci; + ci.thread_init(); + + init_cindex(ci); + + { + TestTransaction t(0); + auto [success, found, row, value] = ci.select_split_row(key_type(20), {{nc::aa, access_t::read}}); + (void) row; + (void) value; + assert(success && !found); + for (int i = 0; i < 10; ++i) { + auto r = Sto::tx_alloc(); + new (r) coarse_grained_row(i, i, i); + ci.insert_row(key_type(10 + i), r); + } + assert(t.try_commit()); + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_coarse_conflict0() { + typedef CoarseIndex::NamedColumn nc; + CoarseIndex ci; + ci.thread_init(); + + init_cindex(ci); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + (void) row; + assert(success && found); + assert(value.aa() == 1); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->aa = 2; + ci.update_row(row, new_row); + assert(t2.try_commit()); + } + + t1.use(); + assert(!t1.try_commit()); + } + + { + TestTransaction 
t1(0); + coarse_grained_row row_value(100, 100, 100); + { + auto [success, found] = ci.insert_row(key_type(100), &row_value); + assert(success && !found); + } + + TestTransaction t2(0); + { + auto [success, found, row, value] = ci.select_split_row(key_type(100), {{nc::aa, access_t::read}}); + (void) row; + (void) value; + assert(!success || !found); + } + + t1.use(); + assert(t1.try_commit()); + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_coarse_conflict1() { + typedef CoarseIndex::NamedColumn nc; + CoarseIndex ci; + ci.thread_init(); + + init_cindex(ci); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + (void) row; + assert(success && found); + assert(value.aa() == 1); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::bb, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->aa = 2; // Will get installed + new_row->bb = 2; + ci.update_row(row, new_row); + assert(t2.try_commit()); + + t1.use(); + assert(value.aa() == 2); + assert(!t1.try_commit()); // expected coarse-grained behavior + } + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_fine_conflict0() { + typedef FineIndex::NamedColumn nc; + FineIndex fi; + fi.thread_init(); + + init_findex(fi); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::read}}); + (void) row; + assert(success && found); + assert(value.d_ytd() == 3000); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->d_ytd += 10; + fi.update_row(row, new_row); + 
assert(t2.try_commit()); + + t1.use(); + assert(value.d_ytd() == 3010); + assert(!t1.try_commit()); + } + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_fine_conflict1() { + typedef FineIndex::NamedColumn nc; + FineIndex fi; + fi.thread_init(); + + init_findex(fi); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::read}}); + (void) row; + assert(success && found); + assert(value.d_ytd() == 3000); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::payment_cnt, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->d_ytd += 10; + new_row->d_payment_cnt += 1; + fi.update_row(row, new_row); + assert(t2.try_commit()); + + t1.use(); + assert(value.d_ytd() == 3000); // unspecified modifications are not installed + assert(value.d_payment_cnt() == 51); + assert(!t1.try_commit()); // not able to commit due to hierarchical versions + } + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_fine_conflict2() { + typedef FineIndex::NamedColumn nc; + FineIndex fi; + fi.thread_init(); + + init_findex(fi); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::tax, access_t::read}}); + (void) row; + assert(success && found); + assert(value.d_tax() == 10); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->d_ytd += 10; + new_row->d_payment_cnt += 1; + fi.update_row(row, new_row); + assert(t2.try_commit()); + + t1.use(); + assert(value.d_ytd() == 3010); + assert(value.d_payment_cnt() == 50); // unspecified modifications are not installed + 
assert(t1.try_commit()); // can commit because of fine-grained versions + } + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_fine_delete0() { + typedef FineIndex::NamedColumn nc; + FineIndex fi; + fi.thread_init(); + + init_findex(fi); + + { + TestTransaction t1(0); + { + auto [success, found] = fi.delete_row(key_type(1)); + assert(success && found); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::tax, access_t::update}}); + (void) row; + assert(success && found); + assert(value.d_tax() == 10); + + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + fi.update_row(row, new_row); + } + + assert(t2.try_commit()); + + t1.use(); + assert(!t1.try_commit()); + } + + { + TestTransaction t1(0); + { + auto [success, found] = fi.delete_row(key_type(2)); + assert(success && found); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = fi.select_split_row(key_type(2), {{nc::tax, access_t::update}}); + (void) row; + assert(success && found); + assert(value.d_tax() == 10); + + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + fi.update_row(row, new_row); + } + + assert(t1.try_commit()); + + t2.use(); + assert(!t2.try_commit()); + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_fine_delete1() { + typedef FineIndex::NamedColumn nc; + FineIndex fi; + fi.thread_init(); + + init_findex(fi); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::tax, access_t::update}}); + (void) row; + assert(success && found); + assert(value.d_tax() == 10); + + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + fi.update_row(row, new_row); + } + + TestTransaction t2(1); + { + auto [success, found] = fi.delete_row(key_type(1)); + assert(success && found); + } + + assert(t2.try_commit()); + + t1.use(); + assert(!t1.try_commit()); 
+ } + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = fi.select_split_row(key_type(2), {{nc::tax, access_t::update}}); + (void) row; + assert(success && found); + assert(value.d_tax() == 10); + + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + fi.update_row(row, new_row); + } + + TestTransaction t2(1); + { + auto [success, found] = fi.delete_row(key_type(2)); + assert(success && found); + } + + assert(t1.try_commit()); + + t2.use(); + assert(!t2.try_commit()); + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + void test_mvcc_snapshot() { + typedef CoarseIndex::NamedColumn nc; + MVIndex mi; + mi.thread_init(); + + init_cindex(mi); + + { + TestTransaction t1(0); + { + auto [success, found, row, value] = mi.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + (void) row; + assert(success && found); + assert(value.aa() == 1); + } + + TestTransaction t2(1); + { + auto [success, found, row, value] = mi.select_split_row(key_type(1), {{nc::aa, access_t::update}}); + assert(success && found); + auto new_row = Sto::tx_alloc(); + value.copy_into(new_row); + new_row->aa = 2; + mi.update_row(row, new_row); + assert(t2.try_commit()); + + t1.use(); + assert(t1.try_commit()); + } + } + + { + TestTransaction t1(0); + { + coarse_grained_row row_value(100, 100, 100); + auto [success, found] = mi.insert_row(key_type(100), &row_value); + assert(success && !found); + } + + TestTransaction t2(0); + { + auto [success, found, row, value] = mi.select_split_row(key_type(100), {{nc::aa, access_t::read}}); + (void) row; + (void) value; + assert(!success || !found); + + t1.use(); + assert(t1.try_commit()); + } + } - printf("pass %s\n", __FUNCTION__); + + + //printf("pass %s\n", __FUNCTION__); + } + + int main() { + +auto start = std::chrono::steady_clock::now(); + + for(int i=0;i<1000;i++) + test_coarse_basic(); + + auto end = std::chrono::steady_clock::now(); + + std::cout<<"The average elapsed time for 
test_coarse_basic with masstree is "<< + + std::chrono::duration_cast(end-start).count()/1000<<" ns"<(end-start).count()/1000<<" ns"<(end-start).count()/1000<<" ns"<(end-start).count()/1000<<" ns"<(end-start).count()/1000<<" ns"<(end-start).count()<<" ns"<(end-start).count()<<" ns"<(end-start).count()/1000<<" ns"<(end-start).count()/1000<<" ns"< + +struct MTrie_coarse_grained_row +{ + + enum class NamedColumn : int + { + aa = 0, + bb, + cc + }; + + uint64_t aa; + + uint64_t bb; + + uint64_t cc; + + MTrie_coarse_grained_row() : aa(), bb(), cc() {} + + MTrie_coarse_grained_row(uint64_t a, uint64_t b, uint64_t c) + + : aa(a), bb(b), cc(c) + { + } +}; + +struct key_type +{ + + uint64_t id; + + explicit key_type(uint64_t key) : id(bench::bswap(key)) {} + + operator lcdf::Str() const + { + + return lcdf::Str((const char *)this, sizeof(*this)); + } +}; + +int j = 0; + +// using example_row from VersionSelector.hh + +namespace bench +{ + + template <> + + struct SplitParams + { + + using split_type_list = std::tuple; + + using layout_type = typename SplitMvObjectBuilder::type; + + static constexpr size_t num_splits = std::tuple_size::value; + + static constexpr auto split_builder = std::make_tuple( + + [](const MTrie_coarse_grained_row &in) -> MTrie_coarse_grained_row + { + MTrie_coarse_grained_row out; + + out.aa = in.aa; + + out.bb = in.bb; + + out.cc = in.cc; + + return out; + } + + ); + + static constexpr auto split_merger = std::make_tuple( + + [](MTrie_coarse_grained_row *out, const MTrie_coarse_grained_row &in) -> void + { + out->aa = in.aa; + + out->bb = in.bb; + + out->cc = in.cc; + } + + ); + + static constexpr auto map = [](int col_n) -> int + { + (void)col_n; + + return 0; + }; + }; + + template + + class RecordAccessor + { + + public: + const uint64_t &aa() const + { + + return impl().aa_impl(); + } + + const uint64_t &bb() const + { + + return impl().bb_impl(); + } + + const uint64_t &cc() const + { + + return impl().cc_impl(); + } + + void 
copy_into(MTrie_coarse_grained_row *dst) const + { + + return impl().copy_into_impl(dst); + } + + private: + const A &impl() const + { + + return *static_cast(this); + } + }; + + template <> + + class UniRecordAccessor : public RecordAccessor, MTrie_coarse_grained_row> + { + + public: + UniRecordAccessor(const MTrie_coarse_grained_row *const vptr) : vptr_(vptr) {} + + private: + const uint64_t &aa_impl() const + { + + return vptr_->aa; + } + + const uint64_t &bb_impl() const + { + + return vptr_->bb; + } + + const uint64_t &cc_impl() const + { + + return vptr_->cc; + } + + void copy_into_impl(MTrie_coarse_grained_row *dst) const + { + + if (vptr_) + { + + dst->aa = vptr_->aa; + + dst->bb = vptr_->bb; + + dst->cc = vptr_->cc; + } + } + + const MTrie_coarse_grained_row *vptr_; + + friend RecordAccessor, MTrie_coarse_grained_row>; + }; + + template <> + + class SplitRecordAccessor : public RecordAccessor, MTrie_coarse_grained_row> + { + + public: + static constexpr size_t num_splits = SplitParams::num_splits; + + SplitRecordAccessor(const std::array &vptrs) + + : vptr_0_(reinterpret_cast(vptrs[0])) + { + } + + private: + const uint64_t &aa_impl() const + { + + return vptr_0_->aa; + } + + const uint64_t &bb_impl() const + { + + return vptr_0_->bb; + } + + const uint64_t &cc_impl() const + { + + return vptr_0_->cc; + } + + void copy_into_impl(MTrie_coarse_grained_row *dst) const + { + + if (vptr_0_) + { + + dst->aa = vptr_0_->aa; + + dst->bb = vptr_0_->bb; + + dst->cc = vptr_0_->cc; + } + } + + const MTrie_coarse_grained_row *vptr_0_; + + friend RecordAccessor, MTrie_coarse_grained_row>; + }; + + template <> + + struct SplitParams + { + + using split_type_list = std::tuple; + + using layout_type = typename SplitMvObjectBuilder::type; + + static constexpr size_t num_splits = std::tuple_size::value; + + static constexpr auto split_builder = std::make_tuple( + + [](const example_row &in) -> example_row + { + example_row out; + + out.d_ytd = in.d_ytd; + + out.d_payment_cnt 
= in.d_payment_cnt; + + out.d_date = in.d_date; + + out.d_tax = in.d_tax; + + out.d_next_oid = in.d_next_oid; + + return out; + } + + ); + + static constexpr auto split_merger = std::make_tuple( + + [](example_row *out, const example_row &in) -> void + { + out->d_ytd = in.d_ytd; + + out->d_payment_cnt = in.d_payment_cnt; + + out->d_date = in.d_date; + + out->d_tax = in.d_tax; + + out->d_next_oid = in.d_next_oid; + } + + ); + + static constexpr auto map = [](int col_n) -> int + { + (void)col_n; + + return 0; + }; + }; + + template + + class RecordAccessor + { + + public: + const uint32_t &d_ytd() const + { + + return impl().d_ytd_impl(); + } + + const uint32_t &d_payment_cnt() const + { + + return impl().d_payment_cnt_impl(); + } + + const uint32_t &d_date() const + { + + return impl().d_date_impl(); + } + + const uint32_t &d_tax() const + { + + return impl().d_tax_impl(); + } + + const uint32_t &d_next_oid() const + { + + return impl().d_next_oid_impl(); + } + + void copy_into(example_row *dst) const + { + + return impl().copy_into_impl(dst); + } + + private: + const A &impl() const + { + + return *static_cast(this); + } + }; + + template <> + + class UniRecordAccessor : public RecordAccessor, example_row> + { + + public: + UniRecordAccessor(const example_row *const vptr) : vptr_(vptr) {} + + private: + const uint32_t &d_ytd_impl() const + { + + return vptr_->d_ytd; + } + + const uint32_t &d_payment_cnt_impl() const + { + + return vptr_->d_payment_cnt; + } + + const uint32_t &d_date_impl() const + { + + return vptr_->d_date; + } + + const uint32_t &d_tax_impl() const + { + + return vptr_->d_tax; + } + + const uint32_t &d_next_oid_impl() const + { + + return vptr_->d_next_oid; + } + + void copy_into_impl(example_row *dst) const + { + + if (vptr_) + { + + dst->d_ytd = vptr_->d_ytd; + + dst->d_payment_cnt = vptr_->d_payment_cnt; + + dst->d_date = vptr_->d_date; + + dst->d_tax = vptr_->d_tax; + + dst->d_next_oid = vptr_->d_next_oid; + } + } + + const example_row 
*vptr_; + + friend RecordAccessor, example_row>; + }; + + template <> + + class SplitRecordAccessor : public RecordAccessor, example_row> + { + + public: + static constexpr size_t num_splits = SplitParams::num_splits; + + SplitRecordAccessor(const std::array &vptrs) + + : vptr_0_(reinterpret_cast(vptrs[0])) + { + } + + private: + const uint32_t &d_ytd_impl() const + { + + return vptr_0_->d_ytd; + } + + const uint32_t &d_payment_cnt_impl() const + { + + return vptr_0_->d_payment_cnt; + } + + const uint32_t &d_date_impl() const + { + + return vptr_0_->d_date; + } + + const uint32_t &d_tax_impl() const + { + + return vptr_0_->d_tax; + } + + const uint32_t &d_next_oid_impl() const + { + + return vptr_0_->d_next_oid; + } + + void copy_into_impl(example_row *dst) const + { + + if (vptr_0_) + { + + dst->d_ytd = vptr_0_->d_ytd; + + dst->d_payment_cnt = vptr_0_->d_payment_cnt; + + dst->d_date = vptr_0_->d_date; + + dst->d_tax = vptr_0_->d_tax; + + dst->d_next_oid = vptr_0_->d_next_oid; + } + } + + const example_row *vptr_0_; + + friend RecordAccessor, example_row>; + }; + +}; // namespace bench + +using CoarseIndex = bench::MTrie_ordered_index; + +using FineIndex = bench::MTrie_ordered_index; + +using access_t = bench::access_t; + +using RowAccess = bench::RowAccess; + +using MVIndex = bench::MTrie_mvcc_ordered_index; + +template + +void init_cindex(IndexType &ci) +{ + + for (uint64_t i = 1; i <= 10; ++i) + + ci.nontrans_put(key_type(i), MTrie_coarse_grained_row(i, i, i)); +} + +void init_findex(FineIndex &fi) +{ + + example_row row; + + row.d_ytd = 3000; + + row.d_tax = 10; + + row.d_date = 200; + + row.d_next_oid = 100; + + row.d_payment_cnt = 50; + + for (uint64_t i = 1; i <= 10; ++i) + + fi.nontrans_put(key_type(i), row); +} + +void test_coarse_basic() +{ + + typedef CoarseIndex::NamedColumn nc; + + CoarseIndex ci; + + ci.thread_init(); + + init_cindex(ci); + + { + + TestTransaction t(0); + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, 
access_t::read}}); + + (void)row; + + assert(success && found); + + // std::cout<<"value().aa() = "<(); + + value.copy_into(new_row); + + new_row->aa = 2; + + ci.update_row(row, new_row); + + assert(t.try_commit()); + } + + { + + TestTransaction t1(1); + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.aa() == 2); + + assert(t1.try_commit()); + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_coarse_read_my_split() +{ + + typedef CoarseIndex::NamedColumn nc; + + CoarseIndex ci; + + ci.thread_init(); + + init_cindex(ci); + + { + + TestTransaction t(0); + + auto [success, found, row, value] = ci.select_split_row(key_type(20), {{nc::aa, access_t::read}}); + + (void)row; + + (void)value; + + // if only one iteration is performed, + // the value shouldn't be found pre-insertion + // if it is run multiple times, the MassTrie acts as a cache + // for quick performances + + if (j == 0) + assert(success && !found); + + for (int i = 0; i < 10; ++i) + { + + auto r = Sto::tx_alloc(); + + new (r) MTrie_coarse_grained_row(i, i, i); + + ci.insert_row(key_type(10 + i), r); + } + + assert(t.try_commit()); + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_coarse_conflict0() +{ + + typedef CoarseIndex::NamedColumn nc; + + CoarseIndex ci; + + ci.thread_init(); + + init_cindex(ci); + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.aa() == 1); + } + + TestTransaction t2(1); + + { + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::update}}); + + assert(success && found); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + new_row->aa = 2; + + ci.update_row(row, new_row); + + assert(t2.try_commit()); + } + + t1.use(); + + 
assert(!t1.try_commit()); + } + + { + + TestTransaction t1(0); + + MTrie_coarse_grained_row row_value(100, 100, 100); + + { + + auto [success, found] = ci.insert_row(key_type(100), &row_value); + // if only one iteration is performed, + // the value shouldn't be found pre-insertion + // if it is run multiple times, the MassTrie acts as a cache + // for quick performances + if (j == 0) + assert(success && !found); + } + + TestTransaction t2(0); + + { + + auto [success, found, row, value] = ci.select_split_row(key_type(100), {{nc::aa, access_t::read}}); + + (void)row; + + (void)value; + + // if only one iteration is performed, + // the value shouldn't be found pre-insertion + // if it is run multiple times, the MassTrie acts as a cache + // for quick performances + if (j == 0) + assert(!success || !found); + } + + t1.use(); + + assert(t1.try_commit()); + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_coarse_conflict1() +{ + + typedef CoarseIndex::NamedColumn nc; + + CoarseIndex ci; + + ci.thread_init(); + + init_cindex(ci); + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::aa, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.aa() == 1); + } + + TestTransaction t2(1); + + { + + auto [success, found, row, value] = ci.select_split_row(key_type(1), {{nc::bb, access_t::update}}); + + assert(success && found); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + new_row->aa = 2; // Will get installed + + new_row->bb = 2; + + ci.update_row(row, new_row); + + assert(t2.try_commit()); + + t1.use(); + + assert(value.aa() == 2); + + assert(!t1.try_commit()); // expected coarse-grained behavior + } + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_fine_conflict0() +{ + + typedef FineIndex::NamedColumn nc; + + FineIndex fi; + + fi.thread_init(); + + init_findex(fi); + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, 
value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.d_ytd() == 3000); + } + + TestTransaction t2(1); + + { + + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::update}}); + + assert(success && found); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + new_row->d_ytd += 10; + + fi.update_row(row, new_row); + + assert(t2.try_commit()); + + t1.use(); + + assert(value.d_ytd() == 3010); + + assert(!t1.try_commit()); + } + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_fine_conflict1() +{ + + typedef FineIndex::NamedColumn nc; + + FineIndex fi; + + fi.thread_init(); + + init_findex(fi); + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.d_ytd() == 3000); + } + + TestTransaction t2(1); + + { + + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::payment_cnt, access_t::update}}); + + assert(success && found); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + new_row->d_ytd += 10; + + new_row->d_payment_cnt += 1; + + fi.update_row(row, new_row); + + assert(t2.try_commit()); + + t1.use(); + + assert(value.d_ytd() == 3000); // unspecified modifications are not installed + + assert(value.d_payment_cnt() == 51); + + assert(!t1.try_commit()); // not able to commit due to hierarchical versions + } + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_fine_conflict2() +{ + + typedef FineIndex::NamedColumn nc; + + FineIndex fi; + + fi.thread_init(); + + init_findex(fi); + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::tax, access_t::read}}); + + (void)row; + + assert(success && found); + + assert(value.d_tax() == 10); + } + + TestTransaction t2(1); + + { + + 
auto [success, found, row, value] = fi.select_split_row(key_type(1), {{nc::ytd, access_t::update}}); + + assert(success && found); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + new_row->d_ytd += 10; + + new_row->d_payment_cnt += 1; + + fi.update_row(row, new_row); + + assert(t2.try_commit()); + + t1.use(); + + assert(value.d_ytd() == 3010); + + assert(value.d_payment_cnt() == 50); // unspecified modifications are not installed + + assert(t1.try_commit()); // can commit because of fine-grained versions + } + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_fine_delete0() +{ + + typedef FineIndex::NamedColumn nc; + + FineIndex fi; + + fi.thread_init(); + + init_findex(fi); + + { + + TestTransaction t1(0); + + { + + auto [success, found] = fi.delete_row(key_type(1)); + + // std::cout<<"success ="<(); + + value.copy_into(new_row); + + fi.update_row(row, new_row); + } + + TestTransaction t2(1); + + { + + auto [success, found] = fi.delete_row(key_type(1)); + + assert(success && found); + } + + assert(t2.try_commit()); + + t1.use(); + + assert(!t1.try_commit()); + } + + { + + TestTransaction t1(0); + + { + + auto [success, found, row, value] = fi.select_split_row(key_type(2), {{nc::tax, access_t::update}}); + + (void)row; + + assert(success && found); + + assert(value.d_tax() == 10); + + auto new_row = Sto::tx_alloc(); + + value.copy_into(new_row); + + fi.update_row(row, new_row); + } + + TestTransaction t2(1); + + { + + auto [success, found] = fi.delete_row(key_type(2)); + + assert(success && found); + } + + assert(t1.try_commit()); + + t2.use(); + + assert(!t2.try_commit()); + } + + // printf("pass %s\n", __FUNCTION__); +} + +void test_get() +{ + + typedef CoarseIndex::NamedColumn nc; + + CoarseIndex ci; + + ci.thread_init(); + + init_cindex(ci); + + { + + TestTransaction t1(0); + + { + + ci.nontrans_get(key_type(1)); + + assert(t1.try_commit()); + } + } + + // printf("pass %s\n", __FUNCTION__); +} + +/****/ + +int main() +{ + + auto 
start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_coarse_basic(); + + auto end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_coarse_basic with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_coarse_read_my_split(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_coarse_read_my_split with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_coarse_conflict0(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_coarse_conflict0 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_coarse_conflict1(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_coarse_conflict1 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_fine_conflict0(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_fine_conflict0 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_fine_conflict1(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_fine_conflict1 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; 
j++) + + test_fine_conflict2(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_fine_conflict2 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_fine_delete0(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_fine_delete0 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + start = std::chrono::steady_clock::now(); + + for (j = 0; j < 1000; j++) + + test_fine_delete1(); + + end = std::chrono::steady_clock::now(); + + std::cout << "The average elapsed time for test_fine_delete1 with MassTrie is " << + + std::chrono::duration_cast(end - start).count() / 1000 << " ns" << std::endl; + + // test_get(); + + printf("All tests pass!\n"); + + std::thread advancer; // empty thread because we have no advancer thread + + Transaction::rcu_release_all(advancer, 2); + + return 0; +}