Compare commits


147 Commits
master ... dev

Author SHA1 Message Date
Shirui Cheng
e4817c8d89 Cleanup cache_bin_info_t* info input args 2023-10-25 10:27:31 -07:00
Qi Wang
3025b021b9 Optimize mutex and bin alignment / locality. 2023-10-23 20:28:26 -07:00
guangli-dai
e2cd27132a Change stack_size assertion back to the more compatible one. 2023-10-23 20:28:26 -07:00
guangli-dai
756d4df2fd Add util.c into vs project file. 2023-10-18 22:11:13 -07:00
Qi Wang
04d1a87b78 Fix a zero-initializer warning on macOS. 2023-10-18 14:12:43 -07:00
guangli-dai
d88fa71bbd Fix nfill = 0 bug when ncached_max is 1 2023-10-18 14:11:46 -07:00
guangli-dai
6fb3b6a8e4 Refactor the tcache initialization
1. Pre-generate all default tcache ncached_max in tcache_boot;
2. Add getters returning default ncached_max and ncached_max_set;
3. Refactor tcache init so that it is always init with a given setting.
2023-10-18 14:11:46 -07:00
guangli-dai
8a22d10b83 Allow setting default ncached_max for each bin through malloc_conf 2023-10-18 14:11:46 -07:00
guangli-dai
867eedfc58 Fix the bug in dalloc promoted allocations.
An allocation that is small enough will be promoted so that it does not
share an extent with others.  However, on dalloc, such allocations
may not be deallocated as promoted ones if nbins < SC_NBINS.  This
commit fixes the bug.
2023-10-17 14:53:23 -07:00
guangli-dai
630f7de952 Add mallctl to set and get ncached_max of each cache_bin.
1. `thread_tcache_ncached_max_read_sizeclass` allows users to get the
    ncached_max of the bin with the input sizeclass, passed in through
    oldp (the size is rounded up to a bin size if it is not an exact bin size).
2. `thread_tcache_ncached_max_write` takes in a char array
    representing the settings for bins in the tcache.
2023-10-17 14:53:23 -07:00
guangli-dai
6b197fdd46 Pre-generate ncached_max for all bins for better tcache_max tuning experience. 2023-10-17 14:53:23 -07:00
Shirui Cheng
36becb1302 metadata usage breakdowns: tracking edata and rtree usages 2023-10-11 11:56:01 -07:00
Qi Wang
005f20aa7f Fix comments about malloc_conf to enable logging. 2023-10-04 11:49:10 -07:00
guangli-dai
7a9e4c9073 Mark jemalloc.h as system header to resolve header conflicts. 2023-10-04 11:41:30 -07:00
Qi Wang
72cfdce718 Allocate tcache stack from base allocator
When using metadata_thp, allocate tcache bin stacks from base0, which means they
will be placed on huge pages along with other metadata, instead of mixed with
other regular allocations.

In order to do so, modified the base allocator to support limited reuse: freed
tcached stacks (from thread termination) will be returned to base0 and made
available for reuse, but no merging will be attempted since they were bump
allocated out of base blocks. These reused base extents are managed using
separately allocated base edata_t -- they are cached in base->edata_avail when
the extent is all allocated.

One tricky part is, stats updating must be skipped for such reused extents
(since they were accounted for already, and there is no purging for base). This
requires tracking the "if is reused" state explicitly and bypassing the stats
updates when allocating from them.
2023-09-18 12:18:32 -07:00
guangli-dai
a442d9b895 Enable per-tcache tcache_max
1. add tcache_max and nhbins into tcache_t so that they are per-tcache,
   with one auto tcache per thread, it's also per-thread;
2. add mallctl for each thread to set its own tcache_max (of its auto tcache);
3. store the maximum number of items in each bin instead of using a global storage;
4. add tests for the modifications above.
5. Rename `nhbins` and `tcache_maxclass` to `global_do_not_change_nhbins` and `global_do_not_change_tcache_maxclass`.
2023-09-06 10:47:14 -07:00
guangli-dai
fbca96c433 Remove unnecessary parameters for cache_bin_postincrement. 2023-09-06 10:47:14 -07:00
Evers Chen
7d9eceaf38 Fix array bounds false warning in gcc 12.3.0
1. error: array subscript 232 is above array bounds of ‘size_t[232]’ in gcc 12.3.0
2. the fix also optimizes the generated code
2023-09-05 14:33:55 -07:00
BtbN
ce8ce99a4a Expose jemalloc_prefix via pkg-config 2023-09-05 14:30:21 -07:00
BtbN
ed7e6fe71a Expose private library dependencies via pkg-config
When linking statically, these need to be included for linking to succeed.
2023-09-05 14:29:33 -07:00
Qi Wang
7d563a8f81 Update safety check message to remove --enable-debug when it's already on. 2023-09-05 14:15:45 -07:00
Qi Wang
b71da25b8a Fix reading CPU id using rdtscp.
As pointed out in #2527, the correct register containing the CPU id should be ecx
instead of edx.
2023-08-28 11:46:39 -07:00
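For context, a minimal sketch of reading the CPU id via `rdtscp` (assuming x86-64 Linux, where the kernel stores the CPU number in the low bits of `IA32_TSC_AUX`; this is illustrative, not jemalloc's exact code):
```c
#include <stdint.h>

/* Illustrative only: rdtscp returns the TSC in edx:eax and IA32_TSC_AUX in
 * ecx; on Linux the low 12 bits of TSC_AUX hold the CPU number. */
static inline uint32_t
cpu_id_via_rdtscp(void) {
	uint32_t eax, edx, ecx;
	__asm__ volatile ("rdtscp" : "=a"(eax), "=d"(edx), "=c"(ecx));
	return ecx & 0xfff;
}
```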
Qi Wang
87c56c8df8 Fix arenas.i.bins.j.mutex link id in manual. 2023-08-28 11:01:13 -07:00
Kevin Svetlitski
da66aa391f Enable a few additional warnings for CI and fix the issues they uncovered
- `-Wmissing-prototypes` and `-Wmissing-variable-declarations` are
  helpful for finding dead code and/or things that should be `static`
  but aren't marked as such.
- `-Wunused-macros` is of similar utility, but for identifying dead macros.
- `-Wunreachable-code` and `-Wunreachable-code-aggressive` do exactly
  what they say: flag unreachable code.
2023-08-11 13:56:23 -07:00
Kevin Svetlitski
d2c9ed3d1e Ensure short read(2)s/write(2)s are properly handled by IO utilities
`read(2)` and `write(2)` may read or write fewer bytes than were
requested. In order to robustly ensure that all of the requested bytes
are read/written, these edge-cases must be handled.
2023-08-11 13:36:24 -07:00
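As a rough illustration of the pattern (a minimal sketch, not jemalloc's actual utility):
```c
#include <errno.h>
#include <unistd.h>

/* Hypothetical helper: keep writing until all bytes are out or a real
 * error occurs, retrying on EINTR and on short writes. */
static int
write_all(int fd, const void *buf, size_t len) {
	size_t done = 0;
	while (done < len) {
		ssize_t n = write(fd, (const char *)buf + done, len - done);
		if (n < 0) {
			if (errno == EINTR) {
				continue;
			}
			return -1;
		}
		done += (size_t)n;
	}
	return 0;
}
```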
guangli-dai
254c4847e8 Print colorful reminder for failed tests. 2023-08-08 15:01:07 -07:00
Kevin Svetlitski
4f50f782fa Use compiler-provided assume builtins when available
There are several benefits to this:
1. It's cleaner and more reliable to use the builtin to
   inform the compiler of assumptions instead of hoping that the
   optimizer understands your intentions.
2. `clang` will warn you if any of your assumptions would produce
   side-effects (which the compiler will discard). [This blog post](https://fastcompression.blogspot.com/2019/01/compiler-checked-contracts.html)
   by Yann Collet highlights that a hazard of using the
   `unreachable()`-based method of signaling assumptions is that it
   can sometimes result in additional instructions being generated (see
   [this Godbolt link](https://godbolt.org/z/lKNMs3) from the blog post
   for an example).
2023-08-08 14:59:36 -07:00
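A minimal sketch of the two approaches (the macro name and structure are illustrative, not jemalloc's actual definitions):
```c
/* Prefer the dedicated builtin where available; otherwise fall back to the
 * unreachable()-based formulation described above. */
#if defined(__clang__)
#  define ASSUME(expr) __builtin_assume(expr)
#elif defined(__GNUC__)
#  define ASSUME(expr) do { if (!(expr)) __builtin_unreachable(); } while (0)
#else
#  define ASSUME(expr) ((void)0)
#endif
```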
Kevin Svetlitski
3aae792b10 Fix infinite purging loop in HPA
As reported in #2449, under certain circumstances it's possible to get
stuck in an infinite loop attempting to purge from the HPA. We now
handle this by validating the HPA settings at the end of
configuration parsing and either normalizing them or aborting depending on
if `abort_conf` is set.
2023-08-08 14:36:19 -07:00
Kevin Svetlitski
424dd61d57 Issue a warning upon directly accessing an arena's bins
An arena's bins should normally be accessed via the `arena_get_bin`
function, which properly takes into account bin-shards. To ensure that
we don't accidentally commit code which incorrectly accesses the bins
directly, we mark the field with `__attribute__((deprecated))` with an
appropriate warning message, and suppress the warning in the few places
where directly accessing the bins is allowed.
2023-08-04 15:47:05 -07:00
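A sketch of the pattern, assuming GCC/Clang attribute syntax (the field and type names below are simplified stand-ins, not the exact jemalloc declaration):
```c
typedef struct bin_s { int placeholder; } bin_t; /* stand-in for jemalloc's bin_t */
#define NBINS_EXAMPLE 4

typedef struct arena_s {
	/* Direct reads of bins now warn at compile time; arena_get_bin()
	 * remains the sanctioned accessor because it handles bin shards. */
	bin_t bins[NBINS_EXAMPLE]
	    __attribute__((deprecated("use arena_get_bin() instead")));
} arena_t;
```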
Kevin Svetlitski
120abd703a Add support for the deprecated attribute
This is useful for enforcing the usage of getter/setter functions to
access fields which are considered private or have unique access constraints.
2023-08-04 15:47:05 -07:00
Kevin Svetlitski
162ff8365d Update the Ubuntu version used by Travis CI
Update from Ubuntu Focal Fossa to Ubuntu Jammy Jellyfish. Staying up to
date is always good, but I'm also hoping that perhaps this newer release
contains fixes so that PowerPC VMs don't randomly hang indefinitely
while booting anymore, stalling our CI pipeline.
2023-08-04 15:32:15 -07:00
Kevin Svetlitski
07a2eab3ed Stop over-reporting memory usage from sampled small allocations
@interwq noticed [while reviewing an earlier PR](https://github.com/jemalloc/jemalloc/pull/2478#discussion_r1256217261)
that I missed modifying this statistics accounting in line with the rest
of the changes from #2459. This is now fixed, such that sampled small
allocations increment the `.nmalloc`/`.ndalloc` of their effective bin
size instead of over-reporting memory usage by attributing all such
allocations to `SC_LARGE_MINCLASS`.
2023-08-03 16:12:22 -07:00
Kevin Svetlitski
ea5b7bea31 Add configuration option controlling DSS support
In many environments, the fallback `sbrk(2)` allocation path is never
used even if the system supports the syscall; if you're at the point
where `mmap(2)` is failing, `sbrk(2)` is unlikely to succeed. Without
changing the default, I've added the ability to disable the usage of DSS
altogether, so that you do not need to pay for the additional code size
and handful of extra runtime branches in such environments.
2023-08-03 11:52:25 -07:00
Qi Wang
6816b23862 Include the unrecognized malloc conf option in the error message.
Previously the option causing trouble would not be printed unless it was in the
key:value pair format.
2023-08-02 10:44:55 -07:00
Kevin Svetlitski
62648c88e5 Ensured sampled allocations are properly deallocated during arena_reset
Sampled allocations were not being demoted before being deallocated
during an `arena_reset` operation.
2023-08-01 11:35:37 -07:00
Kevin Svetlitski
b01d496646 Add an override for the compile-time malloc_conf to jemalloc_internal_overrides.h 2023-07-31 14:53:15 -07:00
Kevin Svetlitski
9ba1e1cb37 Make ctl_arena_clear slightly more efficient
While this function isn't particularly hot (accounting for just 0.27% of
time spent inside the allocator on average across the fleet), looking
at the generated assembly and performance profiles does show we're dispatching
to multiple different `memset`s when we could instead be just tail-calling
`memset` once, reducing code size and marginally improving performance.
2023-07-31 14:44:04 -07:00
Kevin Svetlitski
8ff7e7d6c3 Remove errant #includes in public jemalloc.h header
In an attempt to make all headers self-contained, I inadvertently added
`#include`s which refer to intermediate, generated headers that aren't
included in the final install. Closes #2489.
2023-07-25 16:26:50 -07:00
Kevin Svetlitski
3e82f357bb Fix all optimization-inhibiting integer-to-pointer casts
Following from PR #2481, we replace all integer-to-pointer casts [which
hide pointer provenance information (and thus inhibit
optimizations)](https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html)
with equivalent operations that preserve this information. I have
enabled the corresponding clang-tidy check in our static analysis CI so
that we do not get bitten by this again in the future.
2023-07-24 14:40:42 -07:00
Kevin Svetlitski
4827bb17bd Remove vestigial TCACHE_STATE_* macros 2023-07-24 14:40:42 -07:00
Kevin Svetlitski
1431153695 Define SBRK_INVALID instead of using a magic number 2023-07-24 14:40:42 -07:00
Kevin Svetlitski
7e54dd1ddb Define PROF_TCTX_SENTINEL instead of using magic numbers
This makes the code more readable on its own, and also sets the stage
for more cleanly handling the pointer provenance lints in a following
commit.
2023-07-24 14:40:42 -07:00
Kevin Svetlitski
c49c17f128 Suppress verbose frame address warnings
These warnings are not useful, and make the output of some CI jobs
enormous and difficult to read, so let's suppress them.
2023-07-24 10:44:17 -07:00
Kevin Svetlitski
cdb2c0e02f Implement C23's free_sized and free_aligned_sized
[N2699 - Sized Memory Deallocation](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2699.htm)
introduced two new functions which were incorporated into the C23
standard, `free_sized` and `free_aligned_sized`. Both already have
analogues in Jemalloc; all we are doing here is adding the appropriate
wrappers.
2023-07-20 15:06:41 -07:00
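A rough sketch of what such wrappers might reduce to, assuming the public jemalloc API (the real implementations include the usual edge-case handling):
```c
#include <stddef.h>
#include <jemalloc/jemalloc.h>

/* Sketch: C23 sized deallocation mapped onto jemalloc's existing sdallocx. */
void
free_sized(void *ptr, size_t size) {
	if (ptr != NULL) {
		sdallocx(ptr, size, 0);
	}
}

void
free_aligned_sized(void *ptr, size_t alignment, size_t size) {
	if (ptr != NULL) {
		sdallocx(ptr, size, MALLOCX_ALIGN(alignment));
	}
}
```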
Kevin Svetlitski
41e0b857be Make headers self-contained by fixing #includes
Header files are now self-contained, which makes the relationships
between the files clearer, and crucially allows LSP tools like `clangd`
to function correctly in all of our header files. I have verified that
the headers are self-contained (aside from the various Windows shims) by
compiling them as if they were C files – in a follow-up commit I plan to
add this to CI to ensure we don't regress on this front.
2023-07-14 09:06:32 -07:00
Kevin Svetlitski
856db56f6e Move tsd implementation details into tsd_internals.h
This is a prerequisite to achieving self-contained headers. Previously,
the various tsd implementation headers (`tsd_generic.h`,
`tsd_tls.h`, `tsd_malloc_thread_cleanup.h`, and `tsd_win.h`) relied
implicitly on being included in `tsd.h` after a variety of dependencies
had been defined above them. This commit instead makes these
dependencies explicit by splitting them out into a separate file,
`tsd_internals.h`, which each of the tsd implementation headers includes
directly.
2023-07-14 09:06:32 -07:00
Kevin Svetlitski
36ca0c1b7d Stop concealing pointer provenance in phn_link_get
At least for LLVM, [casting from an integer to a pointer hides provenance information](https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html)
and inhibits optimizations. Here's a [Godbolt link](https://godbolt.org/z/5bYPcKoWT)
showing how this change removes a couple unnecessary branches in
`phn_merge_siblings`, which is a very hot function. Canary profiles show
only minor improvements (since most of the cost of this function is in
cache misses), but there's no reason we shouldn't take it.
2023-07-13 15:12:31 -07:00
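A simplified illustration of the kind of rewrite involved (not the exact phn_link_get code):
```c
#include <stddef.h>
#include <stdint.h>

typedef struct phn_link_s { void *prev, *next; } phn_link_t; /* stand-in */

/* Before: the round trip through uintptr_t conceals which object the result
 * points into, so the optimizer must be conservative. */
static inline phn_link_t *
link_get_via_int(void *phn, size_t offset) {
	return (phn_link_t *)((uintptr_t)phn + offset);
}

/* After: plain char-pointer arithmetic computes the same address while
 * keeping the provenance visible. */
static inline phn_link_t *
link_get_via_ptr(void *phn, size_t offset) {
	return (phn_link_t *)(void *)((char *)phn + offset);
}
```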
Kevin Svetlitski
314c073a38 Print the failed assertion before aborting in test cases
This makes it faster and easier to debug, so that you don't need to fire
up a debugger just to see which assertion triggered in a failing test.
2023-07-13 15:07:17 -07:00
Kevin Svetlitski
65d3b5989b Print test error messages in color when stderr is a terminal
When stderr is a terminal and supports color, print error messages
from tests in red to make them stand out from the surrounding output.
2023-07-13 13:03:23 -07:00
Kevin Svetlitski
1d9e9c2ed6 Fix inconsistent parameter names between definition/declaration pairs
For the sake of consistency, function definitions and their
corresponding declarations should use the same names for parameters.
I've enabled this check in static analysis to prevent this issue from
occurring again in the future.
2023-07-13 12:59:47 -07:00
Kevin Svetlitski
5711dc31d8 Only enable -Wstrict-prototypes in CI to unbreak feature detection
Adding `-Wstrict-prototypes` to the default `CFLAGS` in PR #2473 had the
non-obvious side-effect of breaking configure-time feature detection,
because the [test-program `autoconf` generates for feature
detection](https://www.gnu.org/software/autoconf/manual/autoconf-2.67/html_node/Generating-Sources.html#:~:text=main%20())
defines `main` as:
```c
int main()
```
Which causes all feature checks to fail, since this triggers
`-Wstrict-prototypes` and the feature checks use `-Werror`.

Resolved by only adding `-Wstrict-prototypes` to
`EXTRA_{CFLAGS,CXXFLAGS}` in CI, since these flags are not used during
feature detection and we control which compiler is used.
2023-07-06 18:03:13 -07:00
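For reference, the prototype-correct spelling that would not trip the warning is:
```c
/* An empty parameter list in C means "unspecified parameters"; (void)
 * declares that there are none, satisfying -Wstrict-prototypes. */
int main(void) { return 0; }
```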
Kevin Svetlitski
589c63b424 Make eligible global variables static and/or const
For better or worse, Jemalloc has a significant number of global
variables. Making all eligible global variables `static` and/or `const`
at least makes it slightly easier to reason about them, as these
qualifications communicate to the programmer restrictions on their use
without having to `grep` the whole codebase.
2023-07-06 14:15:12 -07:00
Qi Wang
e249d1a2a1 Remove unreachable code. 2023-07-06 12:06:06 -07:00
Qi Wang
602edd7566 Enabled -Wstrict-prototypes and fixed warnings. 2023-07-06 12:00:02 -07:00
Kevin Svetlitski
ebd7e99f5c Add a test-case for small profiled allocations
Validate that small allocations (i.e. those with `size <= SC_SMALL_MAXCLASS`)
which are sampled for profiling maintain the expected invariants even
though they now take up less space.
2023-07-03 16:19:06 -07:00
Kevin Svetlitski
5a858c64d6 Reduce the memory overhead of sampled small allocations
Previously, small allocations which were sampled as part of heap
profiling were rounded up to `SC_LARGE_MINCLASS`. This additional memory
usage becomes problematic when the page size is increased, as noted in #2358.

Small allocations are now rounded up to the nearest multiple of `PAGE`
instead, reducing the memory overhead by a factor of 4 in the most
extreme cases.
2023-07-03 16:19:06 -07:00
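As a sketch of the rounding involved (assuming the page size is a power of two; the names below are illustrative, not jemalloc's):
```c
#include <stddef.h>

#define EXAMPLE_PAGE ((size_t)4096)

/* Round a sampled small request up to the next page boundary instead of all
 * the way up to SC_LARGE_MINCLASS. */
static inline size_t
page_ceiling(size_t size) {
	return (size + EXAMPLE_PAGE - 1) & ~(EXAMPLE_PAGE - 1);
}
```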
Kevin Svetlitski
e1338703ef Address compiler warnings in the unit tests 2023-07-03 16:06:35 -07:00
Qi Wang
d131331310 Avoid eager purging on the dedicated oversize arena when using bg thds.
We have observed new workload patterns (namely ML training type) that cycle
through oversized allocations frequently, because 1) the dataset might be sparse,
which is faster to go through, and 2) the workload is GPU accelerated.  As a result, the eager
purging from the oversize arena becomes a bottleneck.  To offer an easy
solution, allow normal purging of the oversized extents when background threads
are enabled.
2023-06-27 11:57:41 -07:00
Kevin Svetlitski
46e464a26b Fix downloading LLVM in GitHub Action
It turns out LLVM does not include a build for every platform in the
assets for every release, just some of them. As such, I've pinned us to
the latest release version with a corresponding build.
2023-06-23 14:30:49 -07:00
Kevin Svetlitski
f2e00d2fd3 Remove trailing whitespace
Additionally, added a GitHub Action to ensure no more trailing
whitespace will creep in again in the future.

I'm excluding Markdown files from this check, since trailing whitespace
is significant there, and also excluding `build-aux/install-sh` because
there is significant trailing whitespace on the line that sets
`defaultIFS`.
2023-06-23 11:58:18 -07:00
Kevin Svetlitski
05385191d4 Add GitHub action which runs static analysis
Now that all of the various issues that static analysis uncovered have
been fixed (#2431, #2432, #2433, #2436, #2437, #2446), I've added a
GitHub action which will run static analysis for every PR going forward.
When static analysis detects issues with your code, the GitHub action
provides a link to download its findings in a form tailored for human
consumption.

Take a look at [this demonstration of what it looks like when static
analysis issues are
found](https://github.com/Svetlitski/jemalloc/actions/runs/5010245602)
on my fork for an example (make sure to follow the instructions in the
error message to download and inspect the results).
2023-06-23 11:55:43 -07:00
Kevin Svetlitski
bb0333e745 Fix remaining static analysis warnings
Fix or suppress the remaining warnings generated by static analysis.
This is a necessary step before we can incorporate static analysis into
CI. Where possible, I've preferred to modify the code itself instead of
just disabling the warning with a magic comment, so that if we decide to
use different static analysis tools in the future we will be covered
against them raising similar warnings.
2023-06-23 11:50:29 -07:00
Kevin Svetlitski
210f0d0b2b Fix read of uninitialized data in prof_free
In #2433, I inadvertently introduced a regression which causes the use of
uninitialized data. Namely, the control path I added for the safety
check in `arena_prof_info_get` neglected to set `prof_info->alloc_tctx`
when the check fails, resulting in `prof_info.alloc_tctx` being
uninitialized [when it is read at the end of
`prof_free`](90176f8a87/include/jemalloc/internal/prof_inlines.h (L272)).
2023-06-15 18:30:05 -07:00
Kevin Svetlitski
90176f8a87 Fix segfault in rb *_tree_remove
Static analysis flagged this. It's possible to segfault in the
`*_tree_remove` function generated by `rb_gen`, as `nodep` may
still be `NULL` after the initial for loop. I can confirm from reviewing
the fleetwide coredump data that this was in fact being hit in
production, primarily through `tctx_tree_remove`, and much more rarely
through `gctx_tree_remove`.
2023-06-07 14:48:41 -07:00
Qi Wang
86eb49b478 Fix the arena selection for oversized allocations.
Use the per-arena oversize_threshold, instead of the global setting.
2023-06-06 15:03:13 -07:00
Christos Zoulas
5832ef6589 Use a local variable to set the alignment for this particular allocation
instead of changing mmap_flags, which makes the change permanent. This was
enforcing large alignments for allocations that did not need them, causing
fragmentation. Reported by Andreas Gustafsson.
2023-05-31 14:44:24 -07:00
Kevin Svetlitski
6d4aa33753 Extract the calculation of psset heap assignment for an hpdata into a common function
This is in preparation for upcoming changes I plan to make to this
logic. Extracting it into a common function will make this easier and
less error-prone, and cleans up the existing code regardless.
2023-05-31 11:44:04 -07:00
Arne Welzel
c1d3ad4674 Prune je_malloc_default and do_rallocx in jeprof
Running a simple Ruby or Python program shows je_malloc_default and
do_rallocx() in the resulting SVG / text output. Prune these, too.

    MALLOC_CONF='stats_print:true,lg_prof_sample:8,prof:true,prof_final:true' \
        python3 -c '[x for x in range(10000000)]'

    MALLOC_CONF='stats_print:true,lg_prof_sample:8,prof:true,prof_final:true' \
        ruby -e 'puts (0..1000).map{"0"}.join(" ")'
2023-05-31 11:41:09 -07:00
Arne Welzel
d59e30cbc9 Rename fallback_impl to fallbackNewImpl and prune in jeprof
The existing fallback_impl name seemed a bit generic, and given that
it's static, it is probably okay to rename.

Closes #2451
2023-05-31 11:41:09 -07:00
Qi Wang
d577e9b588 Explicitly cast to unsigned for MALLOCX_ARENA and _TCACHE defines. 2023-05-26 11:52:42 -07:00
Qi Wang
a2259f9fa6 Fix the include path of "jemalloc_internal_overrides.h". 2023-05-25 15:22:02 -07:00
Kevin Svetlitski
9c32689e57 Fix bug where hpa_shard was not being destroyed
It appears that this was a simple mistake where `hpa_shard_disable` was
being called instead of `hpa_shard_destroy`. At present
`hpa_shard_destroy` is not called anywhere at all outside of test-cases,
which further suggests that this is a bug. @davidtgoldblatt noted
however that since HPA is disabled for manual arenas and we don't
support destruction for auto arenas that presently there is no way to
actually trigger this bug. Nonetheless, it should be fixed.
2023-05-18 14:17:38 -07:00
Kevin Svetlitski
4e6f1e9208 Allow overriding LG_PAGE
This is useful for our internal builds where we override the
configuration in the header files generated by autoconf.
2023-05-17 13:55:38 -07:00
Kevin Svetlitski
3e2ba7a651 Remove dead stores detected by static analysis
None of these are harmful, and they are almost certainly optimized
away by the compiler. The motivation for fixing them anyway is that
we'd like to enable static analysis as part of CI, and the first step
towards that is resolving the warnings it produces at present.
2023-05-11 20:27:49 -07:00
Kevin Svetlitski
0288126d9c Fix possible NULL pointer dereference from mallctl("prof.prefix", ...)
Static analysis flagged this issue. Here is a minimal program which
causes a segfault within Jemalloc:
```
#include <jemalloc/jemalloc.h>

const char *malloc_conf = "prof:true";

int main() {
  mallctl("prof.prefix", NULL, NULL, NULL, 0);
}
```

Fixed by checking if `prefix` is `NULL`.
2023-05-11 14:47:50 -07:00
Qi Wang
d4a2b8bab1 Add the prof_sys_thread_name feature in the prof_recent unit test.
This tests the combination of the prof_recent and thread_name features.
Verified that it catches the issue being fixed in this PR.

Also explicitly set thread name in test/unit/prof_recent.  This fixes the name
testing when no default thread name is set (e.g. FreeBSD).
2023-05-11 09:10:57 -07:00
Qi Wang
94ace05832 Fix the prof thread_name reference in prof_recent dump.
As pointed out in #2434, the thread_name in prof_tdata_t was changed in #2407.
This also requires an update for the prof_recent dump, specifically the emitter
expects a "char **" which is fixed in this commit.
2023-05-11 09:10:57 -07:00
Qi Wang
6ea8a7e928 Add config detection for JEMALLOC_HAVE_PTHREAD_SET_NAME_NP
and use it for the background thread name setting.
2023-05-11 09:10:57 -07:00
auxten
5bac384970 If ptr present check if alloc_ctx.edata == NULL 2023-05-10 17:18:22 -07:00
auxten
019cccc293 Make arenas_lookup_ctl triable 2023-05-10 17:18:22 -07:00
Kevin Svetlitski
dc0a184f8d Fix possible NULL pointer dereference in VERIFY_READ
Static analysis flagged this. Fixed by simply checking `oldlenp`
before dereferencing it.
2023-05-09 10:57:09 -07:00
Kevin Svetlitski
12311fe6c3 Fix segfault in extent_try_coalesce_impl
Static analysis flagged this. `extent_record` was passing `NULL` as the
value for `coalesced` to `extent_try_coalesce`, which in turn passes
that argument to `extent_try_coalesce_impl`, where it is written to
without checking if it is `NULL`. I can confirm from reviewing the
fleetwide coredump data that this was in fact being hit in production.
2023-05-09 10:55:44 -07:00
Kevin Svetlitski
70344a2d38 Make eligible functions static
The codebase is already very disciplined about marking functions `static`
wherever possible, but a few appear to have slipped through
the cracks.
2023-05-08 15:00:02 -07:00
Kevin Svetlitski
6841110bd6 Make edata_cmp_summary_comp 30% faster
`edata_cmp_summary_comp` is one of the very hottest functions, taking up
3% of all time spent inside Jemalloc. I noticed that all existing
callsites rely only on the sign of the value returned by this function,
so I came up with this equivalent branchless implementation which
preserves this property. After empirical measurement, I have found that
this implementation is 30% faster, therefore representing a 1% speed-up
to the allocator as a whole.

At @interwq's suggestion, I've applied the same optimization to
`edata_esnead_comp` in case this function becomes hotter in the future.
2023-05-04 09:59:17 -07:00
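The general branchless trick looks like this (a sketch; the real comparator orders edata summaries by several fields):
```c
#include <stdint.h>

/* Returns <0, 0, or >0, computed without branches; callers rely only on the
 * sign of the result, so this is a drop-in replacement for an if/else chain. */
static inline int
cmp_branchless(uint64_t a, uint64_t b) {
	return (int)(a > b) - (int)(a < b);
}
```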
Amaury Séchet
f2b28906e6 Some nits in cache_bin.h 2023-05-01 10:21:17 -07:00
Kevin Svetlitski
fc680128e0 Remove errant assert in arena_extent_alloc_large
This codepath may generate deferred work when the HPA is enabled.
See also [@davidtgoldblatt's relevant comment on the PR which
introduced this](https://github.com/jemalloc/jemalloc/pull/2107#discussion_r699770967)
which prevented a similarly incorrect `assert` from being added elsewhere.
2023-05-01 10:00:30 -07:00
Eric Mueller
521970fb2e Check for equality instead of assigning in asserts in hpa_from_pai.
It appears like a simple typo means we're unconditionally overwriting
some fields in hpa_from_pai when asserts are enabled. From hpa_shard_init,
it looks like these fields have these values anyway, so this shouldn't
cause bugs, but if something is wrong it seems better to have these
asserts in place.

See issue #2412.
2023-04-17 20:57:48 -07:00
guangli-dai
5f64ad60cd Remove locked flag set in malloc_mutex_trylock
As a hint flag for the lock, the locked parameter should be set only
when the lock is acquired or released.
2023-04-06 10:57:04 -07:00
Qi Wang
434a68e221 Disallow decay during reentrancy.
Decay should not be triggered during reentrant calls (may cause lock order
reversal / deadlocks).  Added a delay_trigger flag to the tickers to bypass
decay when reentrancy_level is not zero.
2023-04-05 10:16:37 -07:00
Qi Wang
e62aa478c7 Rearrange the bools in prof_tdata_t to save some bytes.
This lowered sizeof(prof_tdata_t) from 200 to 192, which is a round size
class.  Afterwards, the tdata_t size remains unchanged with the last commit, which
effectively inlined the storage of thread names for free.
2023-04-05 10:03:12 -07:00
Qi Wang
ce0b7ab6c8 Inline the storage for thread name in prof_tdata_t.
The previous approach managed the thread name in a separate buffer, which causes
races because the thread name update (triggered by new samples) can happen at
the same time as prof dumping (which reads the thread names) -- these two
operations are under separate locks to avoid blocking each other.  Implemented
the thread name storage as part of the tdata struct, which resolves the lifetime
issue and also avoids internal alloc / dalloc during prof_sample.
2023-04-05 10:03:12 -07:00
Qi Wang
6cab460a45 Add a multithreaded test for prof_sys_thread_name.
Verified that this catches the issue being fixed in 5fd5583.
2023-04-05 10:03:12 -07:00
Amaury Séchet
5266152d79 Simplify the logic in ph_remove 2023-03-31 14:35:31 -07:00
Amaury Séchet
be6da4f663 Do not maintain root->prev in ph_remove. 2023-03-31 14:34:57 -07:00
Amaury Séchet
543e2d61e6 Simplify the logic in ph_insert
Also fixes what looks like an off by one error in the lazy aux list
merge part of the code that previously never touched the last node in
the aux list.
2023-03-31 14:34:24 -07:00
guangli-dai
31e01a98f1 Fix the rdtscp detection bug and add prefix for the macro. 2023-03-23 11:16:19 -07:00
Qi Wang
8b64be3441 Explicit arena assignment in test_tcache_max.
Otherwise the associated arena could change with percpu arena enabled.
2023-03-22 15:16:43 -07:00
Qi Wang
8e7353a19b Explicit arena assignment in test_thread_idle.
Otherwise the associated arena could change with percpu arena enabled.
2023-03-22 15:16:43 -07:00
Marvin Schmidt
45249cf5a9 Fix exception specification error for hosts using musl libc
It turns out that the previous commit did not suffice since the
JEMALLOC_SYS_NOTHROW definition also causes the same exception specification
errors as JEMALLOC_USE_CXX_THROW did:
```
x86_64-pc-linux-musl-cc -std=gnu11 -Werror=unknown-warning-option -Wall -Wextra -Wshorten-64-to-32 -Wsign-compare -Wundef -Wno-format-zero-length -Wpointer-
arith -Wno-missing-braces -Wno-missing-field-initializers -pipe -g3 -fvisibility=hidden -Wimplicit-fallthrough -O3 -funroll-loops -march=native -O2 -pipe -c -march=native -O2 -pipe -D_GNU_SOURCE -D_REENTRANT -Iinclude -Iinclude -o src/background_thread.o src/background_thread.c
In file included from src/jemalloc_cpp.cpp:9:
In file included from include/jemalloc/internal/jemalloc_preamble.h:27:
include/jemalloc/internal/../jemalloc.h:254:32: error: exception specification in declaration does not match previous declaration
    void JEMALLOC_SYS_NOTHROW   *je_malloc(size_t size)
                                 ^
include/jemalloc/internal/../jemalloc.h:75:21: note: expanded from macro 'je_malloc'
                    ^
/usr/x86_64-pc-linux-musl/include/stdlib.h:40:7: note: previous declaration is here
void *malloc (size_t);
      ^
```

On systems using the musl C library we have to omit the exception specification
on the malloc function family, as is done for macOS, FreeBSD and OpenBSD.
2023-03-16 12:11:40 -07:00
Marvin Schmidt
aba1645f2d configure: Handle *-linux-musl* hosts properly
This is the same as the `*-*-linux*` case with the two exceptions that
we don't set glibc=1 and don't define JEMALLOC_USE_CXX_THROW
2023-03-16 12:11:40 -07:00
Qi Wang
d503d72129 Add the missing descriptions in AC_DEFINE 2023-03-14 16:47:00 -07:00
Qi Wang
71bc1a3d91 Avoid assuming the arena id in test when percpu_arena is used. 2023-03-13 10:50:10 -07:00
Amaury Séchet
f743690739 Remove unused mutex from hpa_central 2023-03-10 11:25:47 -08:00
Chris Seymour
4edea8eb8e switch to https 2023-03-09 11:44:02 -08:00
guangli-dai
09e4b38fb1 Use asm volatile during benchmarks. 2023-02-24 11:17:48 -08:00
Fernando Pelliccioni
e8b28908de [MSVC] support for Visual Studio 2019 and 2022 2023-02-21 13:39:25 -08:00
barracuda156
4422f88d17 Makefile.in: link with g++ when cxx enabled 2023-02-21 13:26:58 -08:00
Qi Wang
c7805f1eb5 Add a header in HPA stats for the nonfull slabs. 2023-02-17 13:31:27 -08:00
Qi Wang
b6125120ac Add an explicit name to the dedicated oversize arena. 2023-02-17 13:31:09 -08:00
Qi Wang
97b313c7d4 More conservative setting for /test/unit/background_thread_enable.
Lower the thread and arena count to avoid resource exhaustion on 32-bit.
2023-02-16 14:42:21 -08:00
Qi Wang
5fd55837bb Fix thread_name updating for heap profiling.
The current thread name reading path updates the name every time, which requires
both alloc and dalloc -- and the temporary NULL value in the middle causes races
where the prof dump read path gets NULLed in the middle.

Minimize the changes in this commit to isolate the bugfix testing; will also
refactor the whole thread name paths later.
2023-02-15 17:49:40 -08:00
Qi Wang
8580c65f81 Implement prof sample hooks "experimental.hooks.prof_sample(_free)".
The added hooks hooks.prof_sample and hooks.prof_sample_free are intended to
allow advanced users to track additional information, to enable new ways of
profiling on top of the jemalloc heap profile and sample features.

The sample hook is invoked after the allocation and backtracing, and forwards
both the allocation and the backtrace to the user hook; the sample_free hook
happens before the actual deallocation, and forwards only the ptr and usz to the
hook.
2022-12-07 16:06:49 -08:00
guangli-dai
a74acb57e8 Fix divide-by-zero error in stress/cpp/microbench
Summary:
Per issue #2356, some CXX compilers may optimize away the
new/delete operation in stress/cpp/microbench.cpp.
Thus, this commit (1) bumps the time interval to 1 if it is 0, and
(2) makes the pointers in the microbench volatile.
2022-12-06 10:46:14 -08:00
Guangli Dai
e8f9f13811 Inline free and sdallocx into operator delete 2022-11-21 11:14:05 -08:00
guangli-dai
06374d2a6a Benchmark operator delete
Added the microbenchmark for operator delete.
Also modified bench.h so that it can be used in C++.
2022-11-21 11:14:05 -08:00
guangli-dai
14ad8205bf Update the ratio display in benchmark
In bench.h, specify the ratio as the time consumption ratio and
modify the display of the ratio.
2022-11-21 11:14:05 -08:00
Qi Wang
481bbfc990 Add a configure option --enable-force-getenv.
Allows the use of getenv() rather than secure_getenv() to read MALLOC_CONF.
This helps in situations where hosts are under full control, and setting
MALLOC_CONF is needed while also setuid.  Disabled by default.
2022-11-04 13:37:14 -07:00
Qi Wang
143e9c4a2f Enable fast thread locals for dealloc-only threads.
Previously if a thread does only allocations, it stays on the slow path /
minimal initialized state forever.  However, dealloc-only is a valid pattern for
dedicated reclamation threads -- this means thread cache is disabled (no batched
flush) for them, which causes high overhead and contention.

Added the condition to fully initialize TSD when a fair amount of dealloc
activities are observed.
2022-10-25 09:54:38 -07:00
Paul Smith
be65438f20 jemalloc_internal_types.h: Use alloca if __STDC_NO_VLA__ is defined
No currently available version of the Visual Studio C compiler supports
variable-length arrays, even if it defines __STDC_VERSION__ >= C99.
As far as I know, Microsoft has no plans to ever support VLAs in MSVC.

The C11 standard requires that the __STDC_NO_VLA__ macro be defined if
the compiler doesn't support VLAs, so fall back to alloca() if so.
2022-10-14 15:48:32 -07:00
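A minimal sketch of the fallback pattern, with an illustrative macro name (not the one jemalloc uses):
```c
#ifdef __STDC_NO_VLA__
#  include <malloc.h>   /* MSVC declares alloca()/_alloca() here. */
/* No VLA support: carve the scratch space off the stack with alloca(). */
#  define SCRATCH_ARRAY(type, name, count) \
	type *name = (type *)alloca((count) * sizeof(type))
#else
/* VLA support available: declare a plain variable-length array. */
#  define SCRATCH_ARRAY(type, name, count) type name[(count)]
#endif
```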
divanorama
1897f185d2 Fix safety_check segfault in double free test 2022-10-03 10:55:10 -07:00
Jordan Rome
b04e7666f2 update PROFILING_INTERNALS.md
Expand the bad example of summing before unbiasing.
2022-10-03 10:48:29 -07:00
David Carlier
4c95c953e2 fix build for non linux/BSD platforms. 2022-10-03 10:42:09 -07:00
divanorama
3de0c24859 Disable builtin malloc in tests
With `--with-jemalloc-prefix=` and without `-fno-builtin` or `-O1`, both clang and gcc may optimize out `malloc` calls
whose result is unused. Comparing the result to NULL also doesn't necessarily count as a use.

This won't be a problem in most client programs, as it only concerns genuinely unused pointers, but in
tests it's important to actually execute the allocations.
`-fno-builtin` should disable this optimization for both gcc and clang, and applying it only to test code hopefully shouldn't be an issue.
Another alternative is to force a "use" of the result, but that'd require more changes and may miss some other optimization-related issues.

This should resolve https://github.com/jemalloc/jemalloc/issues/2091
2022-10-03 10:39:13 -07:00
Lily Wang
c0c9783ec9 Add vcpkg installation instructions 2022-09-19 15:15:28 -07:00
Guangli Dai
c9ac1f4701 Fix a bug in C++ integration test. 2022-09-16 15:04:59 -07:00
Guangli Dai
ba19d2cb78 Add arena-level name.
An arena-level name can help identify manual arenas.
2022-09-16 15:04:59 -07:00
Guangli Dai
a0734fd6ee Making jemalloc max stack depth a runtime option 2022-09-12 13:56:22 -07:00
Abael He
56ddbea270 error: implicit declaration of function 'pthread_create_fptr_init' is invalid in C99
./autogen.sh \
&& ./configure --prefix=/usr/local  --enable-static   --enable-autogen --enable-xmalloc --with-static-libunwind=/usr/local/lib/libunwind.a --enable-lazy-lock --with-jemalloc-prefix='' \
&& make -j16

...
gcc -std=gnu11 -Werror=unknown-warning-option -Wall -Wextra -Wshorten-64-to-32 -Wsign-compare -Wundef -Wno-format-zero-length -Wpointer-arith -Wno-missing-braces -Wno-missing-field-initializers -pipe -g3 -Wimplicit-fallthrough -O3 -funroll-loops -fPIC -DPIC -c -D_REENTRANT -Iinclude -Iinclude -DJEMALLOC_NO_PRIVATE_NAMESPACE -o src/edata_cache.sym.o src/edata_cache.c
src/background_thread.c:768:6: error: implicit declaration of function 'pthread_create_fptr_init' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
            pthread_create_fptr_init()) {
            ^
src/background_thread.c:768:6: note: did you mean 'pthread_create_wrapper_init'?
src/background_thread.c:34:1: note: 'pthread_create_wrapper_init' declared here
pthread_create_wrapper_init(void) {
^
1 error generated.
make: *** [src/background_thread.sym.o] Error 1
make: *** Waiting for unfinished jobs....
2022-09-07 11:56:41 -07:00
Guangli Dai
ce29b4c3d9 Refactor the remote / cross thread cache bin stats reading
Refactored cache_bin.h so that only one function is racy.
2022-09-06 19:41:19 -07:00
Guangli Dai
42daa1ac44 Add double free detection using slab bitmap for debug build
Add a sanity check for the double free issue in the arena, in case the tcache has been flushed.
2022-09-06 12:54:21 -07:00
Ivan Zaitsev
36366f3c4c Add double free detection in thread cache for debug build
Add new runtime option `debug_double_free_max_scan` that specifies the max
number of stack entries to scan in the cache bin when trying to detect the
double free bug (currently debug build only).
2022-08-04 16:58:22 -07:00
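A hypothetical way to enable it in a debug build (the option name comes from the commit above; the value 32 is just an example):
```c
/* Scan up to 32 tcache entries per free when checking for double frees
 * (debug builds only); 32 is an arbitrary example value. */
const char *malloc_conf = "debug_double_free_max_scan:32";
```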
David CARLIER
adc70c0511 update travis 2022-07-19 13:23:08 -07:00
David CARLIER
4e12d21c8d Enabled percpu_arena settings on macOS.
follow-up on #2280
2022-07-19 13:23:08 -07:00
David Carlier
58478412be OpenBSD build fix. still no cpu affinity.
- enabling pthread_get/pthread_set_name_np api.
- disabling per thread cpu affinity handling, unsupported on this platform.
2022-07-19 13:20:11 -07:00
Qi Wang
a1c7d9c046 Add the missing opt.cache_oblivious handling. 2022-07-14 22:41:27 -07:00
Jasmin Parent
41a859ef73 Remove duplicated words in documentation 2022-07-11 15:30:16 -07:00
Azat Khuzhin
cb578bbe01 Fix possible "nmalloc >= ndalloc" assertion
In arena_stats_merge(), nmalloc was read first, and ndalloc after.

However with this order, it is possible for some thread to increment
ndalloc in between, and then nmalloc < ndalloc, and the assertion will fail,
as again found by ClickHouse CI [1] (even after #2234).

  [1]: https://github.com/ClickHouse/ClickHouse/issues/31531

Swap the order to avoid possible assertion.

Cc: @interwq
Follow-up for: #2234
2022-07-11 15:27:51 -07:00
David CARLIER
a9215bf18a CI update FreeBSD version. 2022-06-28 11:48:23 -07:00
Alex Lapenkou
3713932836 Update building for Windows instructions
Explain how to build for Windows in INSTALL.md and remove another readme.txt in
an obscure location.
2022-06-14 14:04:48 -07:00
David Carlier
4fc5c4fbac New configure option '--enable-pageid' for Linux
The option makes jemalloc use prctl with PR_SET_VMA to tag memory mappings with
"jemalloc_pg" or "jemalloc_pg_overcommit". This makes it easy to identify
jemalloc's mappings in /proc/<pid>/maps. PR_SET_VMA is only available in Linux
5.17 and above.
2022-06-09 18:54:08 -07:00
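A minimal sketch of the tagging mechanism (assuming Linux >= 5.17; this is not jemalloc's actual call site, and the mapping name is just the one mentioned above):
```c
#include <stddef.h>
#include <sys/mman.h>
#include <sys/prctl.h>

/* Map anonymous memory and, where supported, tag it so it shows up by name
 * in /proc/<pid>/maps. */
static void *
map_tagged(size_t len) {
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
#ifdef PR_SET_VMA
	if (p != MAP_FAILED) {
		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (unsigned long)p, len,
		    (unsigned long)"jemalloc_pg");
	}
#endif
	return p;
}
```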
Qi Wang
b950934916 Enable retain by default on macOS.
A high number of mappings results in unusually high fork() cost on macOS.  Retain
fixes the issue, at the small cost of extra reserved VM space.
2022-06-09 11:37:44 -07:00
David Carlier
df8f7d10af Implement malloc_getcpu for amd64 and arm64 macOS
This enables per-CPU arena on macOS.
2022-06-08 15:13:55 -07:00
Alex Lapenkou
df7ad8a9b6 Revert "Echo installed files via verbose 'install' command"
This reverts commit f15d8f3b41. "install -v"
turned out to be not portable and not work on NetBSD.
2022-06-07 12:28:45 -07:00
barracuda156
70e3735f3a jemalloc: fix PowerPC definitions in quantum.h 2022-05-26 10:51:10 -07:00
Alex Lapenkou
5b1f2cc5d7 Implement pvalloc replacement
Despite being an obsolete function, pvalloc is still present in GLIBC and should
work correctly when jemalloc replaces the libc allocator.
2022-05-18 17:01:09 -07:00
Qi Wang
cd5aaf308a Improve the failure message upon opt_experimental_infallible_new. 2022-05-17 16:07:40 -07:00
Yuriy Chernyshov
70d4102f48 Fix compiling edata.h with MSVC
At the time, an attempt to compile jemalloc 5.3.0 with MSVC 2019 results in the following error message:

> jemalloc/include/jemalloc/internal/edata.h:660: error C4576: a parenthesized type followed by an initializer list is a non-standard explicit type conversion syntax
2022-05-09 14:51:07 -07:00
256 changed files with 7972 additions and 2331 deletions

View File

@ -29,7 +29,7 @@ task:
UNCOMMON_CONFIG: --with-lg-page=16 --with-malloc-conf=tcache:false
freebsd_instance:
matrix:
image: freebsd-12-3-release-amd64
image: freebsd-13-0-release-amd64
install_script:
- sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf
- pkg upgrade -y

10
.github/workflows/check_formatting.yaml vendored Normal file
View File

@ -0,0 +1,10 @@
name: 'Check Formatting'
on: [pull_request]
jobs:
check-formatting:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v3
- name: Check for trailing whitespace
run: scripts/check_trailing_whitespace.sh

68
.github/workflows/static_analysis.yaml vendored Normal file
View File

@ -0,0 +1,68 @@
name: 'Static Analysis'
on: [pull_request]
jobs:
static-analysis:
runs-on: ubuntu-latest
steps:
# We build libunwind ourselves because sadly the version
# provided by Ubuntu via apt-get is much too old.
- name: Check out libunwind
uses: actions/checkout@v3
with:
repository: libunwind/libunwind
path: libunwind
ref: 'v1.6.2'
github-server-url: 'https://github.com'
- name: Install libunwind
run: |
cd libunwind
autoreconf -i
./configure --prefix=/usr
make -s -j $(nproc) V=0
sudo make -s install V=0
cd ..
rm -rf libunwind
- name: Check out repository
uses: actions/checkout@v3
# We download LLVM directly from the latest stable release
# on GitHub, because this tends to be much newer than the
# version available via apt-get in Ubuntu.
- name: Download LLVM
uses: dsaltares/fetch-gh-release-asset@master
with:
repo: 'llvm/llvm-project'
version: 'tags/llvmorg-16.0.4'
file: 'clang[+]llvm-.*x86_64-linux-gnu.*'
regex: true
target: 'llvm_assets/'
token: ${{ secrets.GITHUB_TOKEN }}
- name: Install prerequisites
id: install_prerequisites
run: |
tar -C llvm_assets -xaf llvm_assets/*.tar* &
sudo apt-get update
sudo apt-get install -y jq bear python3-pip
pip install codechecker
echo "Extracting LLVM from tar" 1>&2
wait
echo "LLVM_BIN_DIR=$(echo llvm_assets/clang*/bin)" >> "$GITHUB_OUTPUT"
- name: Run static analysis
id: run_static_analysis
run: >
PATH="${{ steps.install_prerequisites.outputs.LLVM_BIN_DIR }}:$PATH"
LDFLAGS='-L/usr/lib'
scripts/run_static_analysis.sh static_analysis_results "$GITHUB_OUTPUT"
- name: Upload static analysis results
if: ${{ steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS }} == '1'
uses: actions/upload-artifact@v3
with:
name: static_analysis_results
path: static_analysis_results
- name: Check static analysis results
run: |
if [[ "${{ steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS }}" == '1' ]]
then
echo "::error::Static analysis found issues with your code. Download the 'static_analysis_results' artifact from this workflow and view the 'index.html' file contained within it in a web browser locally for detailed results."
exit 1
fi

4
.gitignore vendored
View File

@ -45,6 +45,10 @@
/src/*.[od]
/src/*.sym
compile_commands.json
/static_analysis_raw_results
/static_analysis_results
/run_tests.out/
/test/test.sh

View File

@ -6,7 +6,7 @@
# Differences are explained here:
# https://docs.travis-ci.com/user/languages/minimal-and-generic/
language: minimal
dist: focal
dist: jammy
jobs:
include:
@ -376,6 +376,9 @@ jobs:
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
# Development build
- os: linux
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"

View File

@ -396,6 +396,102 @@ exclusively):
Use this to search for programs used during configuration and building.
## Building for Windows
There are at least two ways to build jemalloc's libraries for Windows. They
differ in their ease of use and flexibility.
### With MSVC solutions
This is the easy, but less flexible approach. It doesn't let you specify
arguments to the `configure` script.
1. Install Cygwin with at least the following packages:
* autoconf
* autogen
* gawk
* grep
* sed
2. Install Visual Studio 2015 or 2017 with Visual C++
3. Add Cygwin\bin to the PATH environment variable
4. Open "x64 Native Tools Command Prompt for VS 2017"
(note: x86/x64 doesn't matter at this point)
5. Generate header files:
sh -c "CC=cl ./autogen.sh"
6. Now the project can be opened and built in Visual Studio:
msvc\jemalloc_vc2017.sln
### With MSYS
This is a more involved approach that offers the same configuration flexibility
as Linux builds. We use it for our CI workflow to test different jemalloc
configurations on Windows.
1. Install the prerequisites
1. MSYS2
2. Chocolatey
3. Visual Studio if you want to compile with MSVC compiler
2. Run your bash emulation. It could be MSYS2 or Git Bash (this manual was
tested on both)
3. Manually and selectively follow
[before_install.sh](https://github.com/jemalloc/jemalloc/blob/dev/scripts/windows/before_install.sh)
script.
1. Skip the `TRAVIS_OS_NAME` check, `rm -rf C:/tools/msys64` and `choco
uninstall/upgrade` part.
2. If using `msys2` shell, add path to `RefreshEnv.cmd` to `PATH`:
`PATH="$PATH:/c/ProgramData/chocolatey/bin"`
3. Assign `msys_shell_cmd`, `msys2`, `mingw32` and `mingw64` as in the
script.
4. Pick `CROSS_COMPILE_32BIT` , `CC` and `USE_MSVC` values depending on
your needs. For instance, if you'd like to build for x86_64 Windows
with `gcc`, then `CROSS_COMPILE_32BIT="no"`, `CC="gcc"` and
`USE_MSVC=""`. If you'd like to build for x86 Windows with `cl.exe`,
then `CROSS_COMPILE_32BIT="yes"`, `CC="cl.exe"`, `USE_MSVC="x86"`.
For x86_64 builds with `cl.exe`, assign `USE_MSVC="amd64"` and
`CROSS_COMPILE_32BIT="no"`.
5. Replace the path to `vcvarsall.bat` with the path on your system. For
instance, on my Windows PC with Visual Studio 17, the path is
`C:\Program Files (x86)\Microsoft Visual
Studio\2017\BuildTools\VC\Auxiliary\Build\vcvarsall.bat`.
6. Execute the rest of the script. It will install the required
dependencies and assign the variable `build_env`, which is a function
that executes following commands with the correct environment
variables set.
4. Use `$build_env <command>` as you would in a Linux shell:
1. `$build_env autoconf`
2. `$build_env ./configure CC="<desired compiler>" <configuration flags>`
3. `$build_env mingw32-make`
If you're having any issues with the above, ensure the following:
5. When you run `cmd //C RefreshEnv.cmd`, you get an output line starting with
`Refreshing` . If it errors saying `RefreshEnv.cmd` is not found, then you
need to add it to your `PATH` as described above in item 3.2
6. When you run `cmd //C $vcvarsall`, it prints a bunch of environment
variables. Otherwise, check the path to the `vcvarsall.bat` in `$vcvarsall`
script and fix it.
### Building from vcpkg
The jemalloc port in vcpkg is kept up to date by Microsoft team members and
community contributors. The url of vcpkg is: https://github.com/Microsoft/vcpkg
. You can download and install jemalloc using the vcpkg dependency manager:
```shell
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh # ./bootstrap-vcpkg.bat for Windows
./vcpkg integrate install
./vcpkg install jemalloc
```
If the version is out of date, please [create an issue or pull
request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
## Development

View File

@ -155,6 +155,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
$(srcroot)src/thread_event.c \
$(srcroot)src/ticker.c \
$(srcroot)src/tsd.c \
$(srcroot)src/util.c \
$(srcroot)src/witness.c
ifeq ($(enable_zone_allocator), 1)
C_SRCS += $(srcroot)src/zone.c
@ -225,6 +226,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/hook.c \
$(srcroot)test/unit/hpa.c \
$(srcroot)test/unit/hpa_background_thread.c \
$(srcroot)test/unit/hpa_validate_conf.c \
$(srcroot)test/unit/hpdata.c \
$(srcroot)test/unit/huge.c \
$(srcroot)test/unit/inspect.c \
@ -240,6 +242,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/mq.c \
$(srcroot)test/unit/mtx.c \
$(srcroot)test/unit/nstime.c \
$(srcroot)test/unit/ncached_max.c \
$(srcroot)test/unit/oversize_threshold.c \
$(srcroot)test/unit/pa.c \
$(srcroot)test/unit/pack.c \
@ -256,6 +259,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/prof_mdump.c \
$(srcroot)test/unit/prof_recent.c \
$(srcroot)test/unit/prof_reset.c \
$(srcroot)test/unit/prof_small.c \
$(srcroot)test/unit/prof_stats.c \
$(srcroot)test/unit/prof_tctx.c \
$(srcroot)test/unit/prof_thread_name.c \
@ -332,10 +336,15 @@ TESTS_STRESS := $(srcroot)test/stress/batch_alloc.c \
$(srcroot)test/stress/large_microbench.c \
$(srcroot)test/stress/mallctl.c \
$(srcroot)test/stress/microbench.c
ifeq (@enable_cxx@, 1)
TESTS_STRESS_CPP := $(srcroot)test/stress/cpp/microbench.cpp
else
TESTS_STRESS_CPP :=
endif
TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) \
$(TESTS_ANALYZE) $(TESTS_STRESS)
$(TESTS_ANALYZE) $(TESTS_STRESS) $(TESTS_STRESS_CPP)
PRIVATE_NAMESPACE_HDRS := $(objroot)include/jemalloc/internal/private_namespace.h $(objroot)include/jemalloc/internal/private_namespace_jet.h
PRIVATE_NAMESPACE_GEN_HDRS := $(PRIVATE_NAMESPACE_HDRS:%.h=%.gen.h)
@ -362,9 +371,10 @@ TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_INTEGRATION_CPP_OBJS := $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
TESTS_ANALYZE_OBJS := $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_STRESS_CPP_OBJS := $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_ANALYZE_OBJS) \
$(TESTS_STRESS_OBJS)
TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS)
TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS) $(TESTS_STRESS_CPP_OBJS)
.PHONY: all dist build_doc_html build_doc_man build_doc
.PHONY: install_bin install_include install_lib
@ -454,10 +464,13 @@ $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
$(TESTS_INTEGRATION_CPP_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_CPP_TEST
$(TESTS_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
$(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
$(TESTS_STRESS_CPP_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_CPP_TEST
$(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
$(TESTS_CPP_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.cpp
$(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
$(TESTS_CPP_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
$(TESTS_OBJS): CFLAGS += -fno-builtin
$(TESTS_CPP_OBJS): CPPFLAGS += -fno-builtin
ifneq ($(IMPORTLIB),$(SO))
$(CPP_OBJS) $(C_SYM_OBJS) $(C_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
endif
@ -472,7 +485,7 @@ $(TESTS_OBJS) $(TESTS_CPP_OBJS): $(objroot)test/include/test/jemalloc_test.h
endif
$(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_INTEGRATION_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace.h
$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS) $(TESTS_STRESS_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
$(C_SYM_OBJS) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
@mkdir -p $(@D)
@ -513,7 +526,11 @@ endif
$(objroot)lib/$(LIBJEMALLOC).$(SOREV) : $(if $(PIC_CFLAGS),$(C_PIC_OBJS),$(C_OBJS)) $(if $(PIC_CFLAGS),$(CPP_PIC_OBJS),$(CPP_OBJS))
@mkdir -p $(@D)
ifeq (@enable_cxx@, 1)
$(CXX) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
else
$(CC) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
endif
$(objroot)lib/$(LIBJEMALLOC)_pic.$(A) : $(C_PIC_OBJS) $(CPP_PIC_OBJS)
$(objroot)lib/$(LIBJEMALLOC).$(A) : $(C_OBJS) $(CPP_OBJS)
@ -555,18 +572,20 @@ endif
install_bin:
$(INSTALL) -d $(BINDIR)
@for b in $(BINS); do \
$(INSTALL) -v -m 755 $$b $(BINDIR); \
echo "$(INSTALL) -m 755 $$b $(BINDIR)"; \
$(INSTALL) -m 755 $$b $(BINDIR); \
done
install_include:
$(INSTALL) -d $(INCLUDEDIR)/jemalloc
@for h in $(C_HDRS); do \
$(INSTALL) -v -m 644 $$h $(INCLUDEDIR)/jemalloc; \
echo "$(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc"; \
$(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc; \
done
install_lib_shared: $(DSOS)
$(INSTALL) -d $(LIBDIR)
$(INSTALL) -v -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR)
$(INSTALL) -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR)
ifneq ($(SOREV),$(SO))
ln -sf $(LIBJEMALLOC).$(SOREV) $(LIBDIR)/$(LIBJEMALLOC).$(SO)
endif
@ -574,13 +593,15 @@ endif
install_lib_static: $(STATIC_LIBS)
$(INSTALL) -d $(LIBDIR)
@for l in $(STATIC_LIBS); do \
$(INSTALL) -v -m 755 $$l $(LIBDIR); \
echo "$(INSTALL) -m 755 $$l $(LIBDIR)"; \
$(INSTALL) -m 755 $$l $(LIBDIR); \
done
install_lib_pc: $(PC)
$(INSTALL) -d $(LIBDIR)/pkgconfig
@for l in $(PC); do \
$(INSTALL) -v -m 644 $$l $(LIBDIR)/pkgconfig; \
echo "$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig"; \
$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \
done
ifeq ($(enable_shared), 1)
@ -594,13 +615,15 @@ install_lib: install_lib_pc
install_doc_html: build_doc_html
$(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix)
@for d in $(DOCS_HTML); do \
$(INSTALL) -v -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
echo "$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix)"; \
$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
done
install_doc_man: build_doc_man
$(INSTALL) -d $(MANDIR)/man3
@for d in $(DOCS_MAN3); do \
$(INSTALL) -v -m 644 $$d $(MANDIR)/man3; \
echo "$(INSTALL) -m 644 $$d $(MANDIR)/man3"; \
$(INSTALL) -m 644 $$d $(MANDIR)/man3; \
done
install_doc: install_doc_html install_doc_man
@ -656,7 +679,7 @@ endif
tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE))
tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests: tests_unit tests_integration tests_analyze tests_stress
check_unit_dir:
@ -689,6 +712,7 @@ else
endif
stress: tests_stress stress_dir
$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%)
$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%)
check: check_unit check_integration check_integration_decay check_integration_prof
clean:

2
README
View File

@ -17,4 +17,4 @@ jemalloc.
The ChangeLog file contains a brief summary of changes for each release.
URL: http://jemalloc.net/
URL: https://jemalloc.net/

View File

@ -11,9 +11,9 @@ by a few percent, or make favorable trade-offs.
## Notable runtime options for performance tuning
Runtime options can be set via
[malloc_conf](http://jemalloc.net/jemalloc.3.html#tuning).
[malloc_conf](https://jemalloc.net/jemalloc.3.html#tuning).
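For example, a minimal sketch of setting several of the options discussed below at once; the option values here are illustrative only, and the same string can alternatively be supplied at run time through the `MALLOC_CONF` environment variable:

```c
/* Compiled-in defaults for the options described in this file. */
const char *malloc_conf =
    "background_thread:true,metadata_thp:auto,dirty_decay_ms:10000";
```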
* [background_thread](http://jemalloc.net/jemalloc.3.html#background_thread)
* [background_thread](https://jemalloc.net/jemalloc.3.html#background_thread)
Enabling jemalloc background threads generally improves the tail latency for
application threads, since unused memory purging is shifted to the dedicated
@ -23,7 +23,7 @@ Runtime options can be set via
Suggested: `background_thread:true` when jemalloc managed threads can be
allowed.
* [metadata_thp](http://jemalloc.net/jemalloc.3.html#opt.metadata_thp)
* [metadata_thp](https://jemalloc.net/jemalloc.3.html#opt.metadata_thp)
Allowing jemalloc to utilize transparent huge pages for its internal
metadata usually reduces TLB misses significantly, especially for programs
@ -35,8 +35,8 @@ Runtime options can be set via
`metadata_thp:always`, which is expected to improve CPU utilization at a
small memory cost.
* [dirty_decay_ms](http://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms) and
[muzzy_decay_ms](http://jemalloc.net/jemalloc.3.html#opt.muzzy_decay_ms)
* [dirty_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms) and
[muzzy_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.muzzy_decay_ms)
Decay time determines how fast jemalloc returns unused pages back to the
operating system, and therefore provides a fairly straightforward trade-off
@ -46,7 +46,7 @@ Runtime options can be set via
Suggested: tune the values based on the desired trade-offs.
* [narenas](http://jemalloc.net/jemalloc.3.html#opt.narenas)
* [narenas](https://jemalloc.net/jemalloc.3.html#opt.narenas)
By default jemalloc uses multiple arenas to reduce internal lock contention.
However high arena count may also increase overall memory fragmentation,
@ -57,7 +57,7 @@ Runtime options can be set via
Suggested: if low parallelism is expected, try lower arena count while
monitoring CPU and memory usage.
* [percpu_arena](http://jemalloc.net/jemalloc.3.html#opt.percpu_arena)
* [percpu_arena](https://jemalloc.net/jemalloc.3.html#opt.percpu_arena)
Enable dynamic thread to arena association based on running CPU. This has
the potential to improve locality, e.g. when thread to CPU affinity is
@ -100,28 +100,28 @@ aborts immediately on illegal options.
In addition to the runtime options, there are a number of programmatic ways to
improve application performance with jemalloc.
* [Explicit arenas](http://jemalloc.net/jemalloc.3.html#arenas.create)
* [Explicit arenas](https://jemalloc.net/jemalloc.3.html#arenas.create)
Manually created arenas can help performance in various ways, e.g. by
managing locality and contention for specific usages. For example,
applications can explicitly allocate frequently accessed objects from a
dedicated arena with
[mallocx()](http://jemalloc.net/jemalloc.3.html#MALLOCX_ARENA) to improve
[mallocx()](https://jemalloc.net/jemalloc.3.html#MALLOCX_ARENA) to improve
locality. In addition, explicit arenas often benefit from individually
tuned options, e.g. relaxed [decay
time](http://jemalloc.net/jemalloc.3.html#arena.i.dirty_decay_ms) if
time](https://jemalloc.net/jemalloc.3.html#arena.i.dirty_decay_ms) if
frequent reuse is expected.
* [Extent hooks](http://jemalloc.net/jemalloc.3.html#arena.i.extent_hooks)
* [Extent hooks](https://jemalloc.net/jemalloc.3.html#arena.i.extent_hooks)
Extent hooks allow customization for managing underlying memory. One use
case for performance purpose is to utilize huge pages -- for example,
[HHVM](https://github.com/facebook/hhvm/blob/master/hphp/util/alloc.cpp)
[HHVM](https://github.com/facebook/hhvm/blob/master/hphp/util/alloc.cpp)
uses explicit arenas with customized extent hooks to manage 1GB huge pages
for frequently accessed data, which reduces TLB misses significantly.
* [Explicit thread-to-arena
binding](http://jemalloc.net/jemalloc.3.html#thread.arena)
binding](https://jemalloc.net/jemalloc.3.html#thread.arena)
It is common for some threads in an application to have different memory
access / allocation patterns. Threads with heavy workloads often benefit

View File

@ -2955,8 +2955,11 @@ sub RemoveUninterestingFrames {
foreach my $name ('@JEMALLOC_PREFIX@calloc',
'cfree',
'@JEMALLOC_PREFIX@malloc',
'je_malloc_default',
'newImpl',
'void* newImpl',
'fallbackNewImpl',
'void* fallbackNewImpl',
'@JEMALLOC_PREFIX@free',
'@JEMALLOC_PREFIX@memalign',
'@JEMALLOC_PREFIX@posix_memalign',
@ -2966,6 +2969,7 @@ sub RemoveUninterestingFrames {
'@JEMALLOC_PREFIX@realloc',
'@JEMALLOC_PREFIX@mallocx',
'@JEMALLOC_PREFIX@rallocx',
'do_rallocx',
'@JEMALLOC_PREFIX@xallocx',
'@JEMALLOC_PREFIX@dallocx',
'@JEMALLOC_PREFIX@sdallocx',

View File

@ -510,6 +510,23 @@ typedef unsigned __int32 uint32_t;
else
AC_MSG_ERROR([cannot determine number of significant virtual address bits])
fi
AC_CACHE_CHECK([rdtscp support],
[je_cv_rdtscp],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
#include <stdint.h>
]],
[[
unsigned int dx;
asm volatile("rdtscp" : "=d"(dx) ::);
return 0;
]])],
[je_cv_rdtscp=yes],
[je_cv_rdtscp=no],
[je_cv_rdtscp=no]))
if test "x${je_cv_rdtscp}" = "xyes"; then
AC_DEFINE([JEMALLOC_HAVE_RDTSCP], [ ], [ ])
fi
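For context, a hedged sketch of the kind of code this feature test guards: reading the timestamp counter with `rdtscp` only when `JEMALLOC_HAVE_RDTSCP` is defined. The helper name and fallback are hypothetical, not taken from the source.

```c
#include <stdint.h>

/* Hypothetical helper: read the TSC via rdtscp when the configure check
 * above defines JEMALLOC_HAVE_RDTSCP; otherwise fall back to a stub. */
static inline uint64_t
read_tsc(void) {
#ifdef JEMALLOC_HAVE_RDTSCP
	uint32_t lo, hi, aux;
	asm volatile("rdtscp" : "=a"(lo), "=d"(hi), "=c"(aux) ::);
	return ((uint64_t)hi << 32) | lo;
#else
	return 0; /* placeholder fallback for platforms without rdtscp */
#endif
}
```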
fi
;;
*)
@ -529,6 +546,22 @@ typedef unsigned __int32 uint32_t;
;;
esac
AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ])
AC_CACHE_CHECK([asm volatile support],
[je_cv_asm_volatile],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
]],
[[
void* ptr;
asm volatile("" : "+r"(ptr));
return 0;
]])],
[je_cv_asm_volatile=yes],
[je_cv_asm_volatile=no],
[je_cv_asm_volatile=no]))
if test "x${je_cv_asm_volatile}" = "xyes"; then
AC_DEFINE([JEMALLOC_HAVE_ASM_VOLATILE], [ ], [ ])
fi
LD_PRELOAD_VAR="LD_PRELOAD"
so="so"
@ -654,6 +687,9 @@ case "${host}" in
SOREV="${rev}.${so}"
sbrk_deprecated="1"
SYM_PREFIX="_"
if test "${LG_SIZEOF_PTR}" = "3"; then
default_retain="1"
fi
;;
*-*-freebsd*)
JE_APPEND_VS(CPPFLAGS, -D_BSD_SOURCE)
@ -687,6 +723,19 @@ case "${host}" in
fi
zero_realloc_default_free="1"
;;
*-*-linux-musl*)
dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
abi="elf"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ])
AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ])
AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ])
if test "${LG_SIZEOF_PTR}" = "3"; then
default_retain="1"
fi
zero_realloc_default_free="1"
;;
*-*-linux*)
dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
@ -939,6 +988,30 @@ if test "x${je_cv_cold}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_ATTR_COLD], [ ], [ ])
fi
dnl Check for deprecated attribute support.
JE_CFLAGS_SAVE()
JE_CFLAGS_ADD([-Wdeprecated-declarations])
JE_COMPILABLE([deprecated attribute],
[#if !__has_attribute(deprecated)
#error "deprecated attribute not supported"
#endif
struct has_deprecated_field {
int good;
int __attribute__((deprecated("Do not use"))) bad;
};
],
[struct has_deprecated_field instance;
instance.good = 0;
instance.bad = 1;
],
[je_cv_deprecated])
JE_CFLAGS_RESTORE()
if test "x${je_cv_deprecated}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_ATTR_DEPRECATED], [ ], [ ])
JE_CFLAGS_ADD([-Wdeprecated-declarations])
JE_CXXFLAGS_ADD([-Wdeprecated-declarations])
fi
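As a hedged illustration of what this attribute check enables: wrapping the attribute in a warn-on-usage macro, modeled on the `JEMALLOC_WARN_ON_USAGE` annotation that appears later in this diff. The exact definition below is an assumption, not the project's actual macro.

```c
/* Sketch: expand to the deprecated attribute when available, so internal
 * fields can carry a compile-time usage warning; otherwise expand to nothing. */
#ifdef JEMALLOC_HAVE_ATTR_DEPRECATED
#  define JEMALLOC_WARN_ON_USAGE(msg) __attribute__((deprecated(msg)))
#else
#  define JEMALLOC_WARN_ON_USAGE(msg)
#endif

struct example_s {
	int good;
	JEMALLOC_WARN_ON_USAGE("Do not use this field directly.")
	int internal_only;
};
```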
dnl Check for VM_MAKE_TAG for mmap support.
JE_COMPILABLE([vm_make_tag],
[#include <sys/mman.h>
@ -1052,11 +1125,11 @@ AC_SUBST([JEMALLOC_CPREFIX])
AC_ARG_WITH([export],
[AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])],
[if test "x$with_export" = "xno"; then
AC_DEFINE([JEMALLOC_EXPORT],[], [ ])
AC_DEFINE([JEMALLOC_EXPORT], [ ], [ ])
fi]
)
public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
public_syms="aligned_alloc calloc dallocx free free_sized free_aligned_sized mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
dnl Check for additional platform-specific public API functions.
AC_CHECK_FUNC([memalign],
[AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ], [ ])
@ -1064,6 +1137,9 @@ AC_CHECK_FUNC([memalign],
AC_CHECK_FUNC([valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ], [ ])
public_syms="${public_syms} valloc"])
AC_CHECK_FUNC([pvalloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_PVALLOC], [ ], [ ])
public_syms="${public_syms} pvalloc"])
AC_CHECK_FUNC([malloc_size],
[AC_DEFINE([JEMALLOC_HAVE_MALLOC_SIZE], [ ], [ ])
public_syms="${public_syms} malloc_size"])
@ -1077,6 +1153,16 @@ if test "x${JEMALLOC_PREFIX}" = "x" ; then
AC_CHECK_FUNC([__libc_free],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free"])
dnl __libc_free_sized and __libc_free_aligned_sized are here speculatively
dnl under the assumption that glibc will eventually define symbols with these
dnl names. In the event glibc chooses different names for these symbols,
dnl these will need to be amended to match.
AC_CHECK_FUNC([__libc_free_sized],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE_SIZED], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free_sized"])
AC_CHECK_FUNC([__libc_free_aligned_sized],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free_aligned_sized"])
AC_CHECK_FUNC([__libc_malloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_malloc"])
@ -1089,6 +1175,9 @@ if test "x${JEMALLOC_PREFIX}" = "x" ; then
AC_CHECK_FUNC([__libc_valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_VALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_valloc"])
AC_CHECK_FUNC([__libc_pvalloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_PVALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_pvalloc"])
AC_CHECK_FUNC([__posix_memalign],
[AC_DEFINE([JEMALLOC_OVERRIDE___POSIX_MEMALIGN], [ ], [ ])
wrap_syms="${wrap_syms} __posix_memalign"])
@ -1404,6 +1493,18 @@ if test "x$zero_realloc_default_free" = "x1" ; then
AC_DEFINE([JEMALLOC_ZERO_REALLOC_DEFAULT_FREE], [ ], [ ])
fi
dnl Support allocation from DSS by default
AC_ARG_ENABLE([dss],
[AS_HELP_STRING([--disable-dss], [Disable usage of sbrk(2)])],
[if test "x$enable_dss" = "xno" ; then
enable_dss="0"
else
enable_dss="1"
fi
],
[enable_dss="1"]
)
dnl Enable allocation from DSS if supported by the OS.
have_dss="1"
dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support.
@ -1417,7 +1518,7 @@ else
have_dss="0"
fi
if test "x$have_dss" = "x1" ; then
if test "x$have_dss" = "x1" -a "x$enable_dss" = "x1" ; then
AC_DEFINE([JEMALLOC_DSS], [ ], [ ])
fi
@ -1545,6 +1646,22 @@ if test "x$enable_readlinkat" = "x1" ; then
fi
AC_SUBST([enable_readlinkat])
dnl Do not force getenv by default
AC_ARG_ENABLE([force-getenv],
[AS_HELP_STRING([--enable-force-getenv], [Use getenv over secure_getenv])],
[if test "x$enable_force_getenv" = "xno" ; then
enable_force_getenv="0"
else
enable_force_getenv="1"
fi
],
[enable_force_getenv="0"]
)
if test "x$enable_force_getenv" = "x1" ; then
AC_DEFINE([JEMALLOC_FORCE_GETENV], [ ], [ ])
fi
AC_SUBST([force_getenv])
dnl Avoid extra safety checks by default
AC_ARG_ENABLE([opt-safety-checks],
[AS_HELP_STRING([--enable-opt-safety-checks],
@ -1592,7 +1709,7 @@ fi
[enable_uaf_detection="0"]
)
if test "x$enable_uaf_detection" = "x1" ; then
AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ])
AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ], [ ])
fi
AC_SUBST([enable_uaf_detection])
@ -1855,6 +1972,16 @@ dnl Check if we have dlsym support.
if test "x${je_cv_pthread_getname_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GETNAME_NP], [ ], [ ])
fi
dnl Check if pthread_set_name_np is available with the expected API.
JE_COMPILABLE([pthread_set_name_np(3)], [
#include <pthread.h>
#include <pthread_np.h>
], [
pthread_set_name_np(pthread_self(), "set_name_test");
], [je_cv_pthread_set_name_np])
if test "x${je_cv_pthread_set_name_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SET_NAME_NP], [ ], [ ])
fi
dnl Check if pthread_get_name_np is not necessarily present despite
dnl the pthread_set_name_np counterpart
JE_COMPILABLE([pthread_get_name_np(3)], [
@ -1998,6 +2125,15 @@ if test "x$have_sched_setaffinity" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_SCHED_SETAFFINITY], [ ], [ ])
fi
dnl Check if the pthread_setaffinity_np function exists.
AC_CHECK_FUNC([pthread_setaffinity_np],
[have_pthread_setaffinity_np="1"],
[have_pthread_setaffinity_np="0"]
)
if test "x$have_pthread_setaffinity_np" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP], [ ], [ ])
fi
dnl Check if the Solaris/BSD issetugid function exists.
AC_CHECK_FUNC([issetugid],
[have_issetugid="1"],
@ -2041,6 +2177,14 @@ if test "x$have_memcntl" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_MEMCNTL], [ ], [ ])
fi
AC_CHECK_FUNC([prctl],
[have_prctl="1"],
[have_prctl="0"],
)
if test "x$have_prctl" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_PRCTL], [ ], [ ])
fi
dnl Disable lazy locking by default.
AC_ARG_ENABLE([lazy_lock],
[AS_HELP_STRING([--enable-lazy-lock],
@ -2409,6 +2553,22 @@ else
AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
fi
dnl Do not compile with debugging by default.
AC_ARG_ENABLE([pageid],
[AS_HELP_STRING([--enable-pageid],
[Enable named pages])],
[if test "x$enable_pageid" = "xno" ; then
enable_pageid="0"
else
enable_pageid="1"
fi
],
[enable_pageid="0"]
)
if test "x$enable_pageid" = "x1" ; then
AC_DEFINE([JEMALLOC_PAGEID], [ ], [ ])
fi
dnl ============================================================================
dnl Enable background threads if possible.
@ -2665,5 +2825,7 @@ AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
AC_MSG_RESULT([log : ${enable_log}])
AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}])
AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}])
AC_MSG_RESULT([pageid : ${enable_pageid}])
AC_MSG_RESULT([cxx : ${enable_cxx}])
AC_MSG_RESULT([dss : ${enable_dss}])
AC_MSG_RESULT([===============================================================================])

View File

@ -33,6 +33,8 @@
<refname>aligned_alloc</refname>
<refname>realloc</refname>
<refname>free</refname>
<refname>free_sized</refname>
<refname>free_aligned_sized</refname>
<refname>mallocx</refname>
<refname>rallocx</refname>
<refname>xallocx</refname>
@ -89,6 +91,17 @@
<funcdef>void <function>free</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
</funcprototype>
<funcprototype>
<funcdef>void <function>free_sized</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
<paramdef>size_t <parameter>size</parameter></paramdef>
</funcprototype>
<funcprototype>
<funcdef>void <function>free_aligned_sized</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
<paramdef>size_t <parameter>alignment</parameter></paramdef>
<paramdef>size_t <parameter>size</parameter></paramdef>
</funcprototype>
</refsect2>
<refsect2>
<title>Non-standard API</title>
@ -227,6 +240,17 @@
allocated memory referenced by <parameter>ptr</parameter> to be made
available for future allocations. If <parameter>ptr</parameter> is
<constant>NULL</constant>, no action occurs.</para>
<para>The <function>free_sized()</function> function is an extension of
<function>free()</function> with a <parameter>size</parameter> parameter
to allow the caller to pass in the allocation size as an optimization.
</para>
<para>The <function>free_aligned_sized()</function> function accepts a
<parameter>ptr</parameter> which was allocated with a requested
<parameter>size</parameter> and <parameter>alignment</parameter>, causing
the allocated memory referenced by <parameter>ptr</parameter> to be made
available for future allocations.</para>
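A minimal usage sketch of the two functions just described, matching the prototypes above and assuming a build where jemalloc's public header declares them:

```c
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	void *p = malloc(64);
	free_sized(p, 64);              /* size matches the requested size */

	void *q = aligned_alloc(64, 256);
	free_aligned_sized(q, 64, 256); /* alignment and size as requested */
	return 0;
}
```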
</refsect2>
<refsect2>
<title>Non-standard API</title>
@ -451,6 +475,24 @@ for (i = 0; i < nbins; i++) {
depended on, since such behavior is entirely implementation-dependent.
</para>
</refsect2>
<refsect2>
<title>Interactions Between the Standard and Non-standard APIs</title>
<para>Generally speaking it is permissible to pass pointers obtained from
the standard API to the non-standard API and vice versa (e.g. calling
<function>free()</function> with a pointer returned by a call to
<function>mallocx()</function>, calling <function>sdallocx()</function>
with a pointer returned by a call to <function>calloc()</function>).
There are however a few exceptions. The C23 standard forbids calling
<function>free_sized()</function> on a pointer returned by
<function>aligned_alloc()</function>, mandating that either
<function>free_aligned_sized()</function> or <function>free()</function>
be used instead. In keeping with that restriction, using any combination
of the standard and non-standard APIs in an equivalent fashion (i.e.
taking a pointer which was allocated with an explicitly requested
alignment and attempting to free it via an API that accepts a size hint,
without also providing the alignment hint) is likewise forbidden.
</para>
</refsect2>
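To make the permitted and forbidden pairings concrete, a hedged sketch using the public APIs named above:

```c
#include <stdlib.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	/* Mixing the standard and non-standard APIs is generally fine. */
	void *a = mallocx(32, 0);
	free(a);                        /* OK */

	void *b = calloc(1, 32);
	sdallocx(b, 32, 0);             /* OK */

	/* An explicitly aligned allocation must not be freed through a
	 * size-only path that lacks the alignment hint. */
	void *c = aligned_alloc(64, 128);
	/* free_sized(c, 128); */       /* forbidden, per the text above */
	free_aligned_sized(c, 64, 128); /* OK */
	return 0;
}
```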
</refsect1>
<refsect1 id="tuning">
<title>TUNING</title>
@ -1121,9 +1163,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
linkend="arena.i.dirty_decay_ms"><mallctl>arena.&lt;i&gt;.dirty_decay_ms</mallctl></link>
for related dynamic control options. See <link
linkend="opt.muzzy_decay_ms"><mallctl>opt.muzzy_decay_ms</mallctl></link>
for a description of muzzy pages.for a description of muzzy pages. Note
that when the <link
linkend="opt.oversize_threshold"><mallctl>oversize_threshold</mallctl></link>
for a description of muzzy pages. Note that when the <link linkend="opt.oversize_threshold"><mallctl>oversize_threshold</mallctl></link>
feature is enabled, the arenas reserved for oversize requests may have
its own default decay settings.</para></listitem>
</varlistentry>
@ -3267,7 +3307,7 @@ struct extent_hooks_s {
<listitem><para>Current number of nonfull slabs.</para></listitem>
</varlistentry>
<varlistentry id="stats.arenas.i.bins.mutex">
<varlistentry id="stats.arenas.i.bins.j.mutex">
<term>
<mallctl>stats.arenas.&lt;i&gt;.bins.&lt;j&gt;.mutex.{counter}</mallctl>
(<type>counter specific type</type>) <literal>r-</literal>

View File

@ -99,7 +99,25 @@ Using this approach means that there are a few things users need to be aware of.
If one stack appears twice as often as another, this by itself does not imply that it allocates twice as often. Consider the case in which there are only two types of allocating call stacks in a program. Stack A allocates 8 bytes, and occurs a million times in a program. Stack B allocates 8 MB, and occurs just once in a program. If our sampling rate $R$ is about 1MB, we expect stack A to show up about 8 times, and stack B to show up once. Stack A isn't 8 times more frequent than stack B, though; it's a million times more frequent.
### Aggregation must be done after unbiasing samples
Some tools manually parse heap dump output, and aggregate across stacks (or across program runs) to provide wider-scale data analyses. When doing this aggregation, though, it's important to unbias-and-then-sum, rather than sum-and-then-unbias. Reusing our example from the previous section: suppose we collect heap dumps of the program from a million machines. We then have 8 million occurs of stack A (each of 8 bytes), and a million occurrences of stack B (each of 8 MB). If we sum first, we'll attribute 64 MB to stack A, and 8 TB to stack B. Unbiasing changes these numbers by an infinitesimal amount, so that sum-then-unbias dramatically underreports the amount of memory allocated by stack A.
Some tools manually parse heap dump output, and aggregate across stacks (or across program runs) to provide wider-scale data analyses. When doing this aggregation, though, it's important to unbias-and-then-sum, rather than sum-and-then-unbias. Reusing our example from the previous section: suppose we collect heap dumps of the program from 1 million machines. We then have 8 million samples of stack A (8 per machine, each of 8 bytes), and 1 million samples of stack B (1 per machine, each of 8 MB).
If we sum first and then unbias by dividing by the factor $1 - e^{-Z/R}$, we get:
$$Z = 8,000,000 * 8 bytes = 64MB$$
$$64MB / (1 - e^{-64MB/1MB}) \approx 64MB (Stack A)$$
$$Z = 1,000,000 * 8MB = 8TB$$
$$8TB / (1 - e^{-8TB/1MB}) \approx 8TB (Stack B)$$
Clearly the unbiasing changes the totals by only an infinitesimal amount, which dramatically underreports the amount of memory allocated by stack A. If we instead unbias first and then sum:
$$Z = 8 bytes$$
$$8 bytes / (1 - e^{-8 bytes/1MB}) \approx 1MB$$
$$1MB * 8,000,000 = 8TB (Stack A)$$
$$Z = 8MB$$
$$8MB / (1 - e^{-8MB/1MB}) \approx 8MB$$
$$8MB * 1,000,000 = 8TB (Stack B)$$
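The same computation as a small self-contained sketch; the sampling interval `R` and byte counts are the illustrative values from the example above:

```c
#include <math.h>
#include <stdio.h>

/* Unbias a sampled byte count Z given sampling interval R: Z / (1 - e^(-Z/R)). */
static double
unbias(double z, double r) {
	return z / (1.0 - exp(-z / r));
}

int
main(void) {
	const double R = 1024.0 * 1024.0;          /* ~1MB sampling interval */
	/* Unbias each stack's per-sample size first, then scale by its count. */
	double stack_a = unbias(8.0, R) * 8e6;     /* ~8TB */
	double stack_b = unbias(8.0 * R, R) * 1e6; /* ~8TB */
	printf("A: %.3g bytes, B: %.3g bytes\n", stack_a, stack_b);
	return 0;
}
```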
## An avenue for future exploration
While the framework we laid out above is pretty general, as an engineering decision we're only interested in fairly simple approaches (i.e. ones for which the chance of an allocation being sampled depends only on its size). Our job is then: for each size class $Z$, pick a probability $p_Z$ that an allocation of that size will be sampled. We made some handwave-y references to statistical distributions to justify our choices, but there's no reason we need to pick them that way. Any set of non-zero probabilities is a valid choice.
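For reference, the choice implicitly made by the unbiasing factor used earlier (under the Poisson-style sampling model that factor corresponds to) is:

$$p_Z = 1 - e^{-Z/R}$$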

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
#define JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
#include "jemalloc/internal/jemalloc_preamble.h"
/*
* The callback to be executed "periodically", in response to some amount of
* allocator activity.

View File

@ -1,8 +1,11 @@
#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/pages.h"
@ -18,11 +21,10 @@ extern ssize_t opt_dirty_decay_ms;
extern ssize_t opt_muzzy_decay_ms;
extern percpu_arena_mode_t opt_percpu_arena;
extern const char *percpu_arena_mode_names[];
extern const char *const percpu_arena_mode_names[];
extern div_info_t arena_binind_div_info[SC_NBINS];
extern malloc_mutex_t arenas_lock;
extern emap_t arena_emap_global;
extern size_t opt_oversize_threshold;
@ -48,9 +50,9 @@ edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
edata_t *edata);
void arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena,
edata_t *edata, size_t oldsize);
edata_t *edata, size_t oldusize);
void arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena,
edata_t *edata, size_t oldsize);
edata_t *edata, size_t oldusize);
bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
ssize_t decay_ms);
ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
@ -61,14 +63,14 @@ void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
cache_bin_t *cache_bin, cache_bin_info_t *cache_bin_info, szind_t binind,
const unsigned nfill);
cache_bin_t *cache_bin, szind_t binind, const unsigned nfill);
void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
szind_t ind, bool zero);
szind_t ind, bool zero, bool slab);
void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
size_t alignment, bool zero, tcache_t *tcache);
void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
size_t alignment, bool zero, bool slab, tcache_t *tcache);
void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize,
size_t bumped_usize);
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
bool slow_path);
void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
@ -81,13 +83,15 @@ void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize);
void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
size_t size, size_t alignment, bool zero, tcache_t *tcache,
size_t size, size_t alignment, bool zero, bool slab, tcache_t *tcache,
hook_ralloc_args_t *hook_args);
dss_prec_t arena_dss_prec_get(arena_t *arena);
ehooks_t *arena_get_ehooks(arena_t *arena);
extent_hooks_t *arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
extent_hooks_t *extent_hooks);
bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
void arena_name_get(arena_t *arena, char *name);
void arena_name_set(arena_t *arena, const char *name);
ssize_t arena_dirty_decay_ms_default_get(void);
bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
ssize_t arena_muzzy_decay_ms_default_get(void);
@ -98,7 +102,7 @@ unsigned arena_nthreads_get(arena_t *arena, bool internal);
void arena_nthreads_inc(arena_t *arena, bool internal);
void arena_nthreads_dec(arena_t *arena, bool internal);
arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
bool arena_init_huge(void);
bool arena_init_huge(arena_t *a0);
bool arena_is_huge(unsigned arena_ind);
arena_t *arena_choose_huge(tsd_t *tsd);
bin_t *arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_structs.h"
static inline unsigned
arena_ind_get(const arena_t *arena) {
return arena->ind;

View File

@ -1,14 +1,22 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_structs.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/jemalloc_internal_inlines_b.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/large_externs.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/prof_externs.h"
#include "jemalloc/internal/prof_structs.h"
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/tcache_inlines.h"
#include "jemalloc/internal/ticker.h"
static inline arena_t *
@ -28,14 +36,46 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
* 1) is using auto arena selection (i.e. arena == NULL), and 2) the
* thread is not assigned to a manual arena.
*/
if (unlikely(size >= oversize_threshold)) {
arena_t *tsd_arena = tsd_arena_get(tsd);
if (tsd_arena == NULL || arena_is_auto(tsd_arena)) {
return arena_choose_huge(tsd);
}
arena_t *tsd_arena = tsd_arena_get(tsd);
if (tsd_arena == NULL) {
tsd_arena = arena_choose(tsd, NULL);
}
return arena_choose(tsd, NULL);
size_t threshold = atomic_load_zu(
&tsd_arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED);
if (unlikely(size >= threshold) && arena_is_auto(tsd_arena)) {
return arena_choose_huge(tsd);
}
return tsd_arena;
}
JEMALLOC_ALWAYS_INLINE bool
large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) {
if (!config_opt_safety_checks) {
return false;
}
/*
* Eagerly detect double free and sized dealloc bugs for large sizes.
* The cost is low enough (as edata will be accessed anyway) to be
* enabled all the time.
*/
if (unlikely(edata == NULL ||
edata_state_get(edata) != extent_state_active)) {
safety_check_fail("Invalid deallocation detected: "
"pages being freed (%p) not currently active, "
"possibly caused by double free bugs.", ptr);
return true;
}
size_t input_size = sz_index2size(szind);
if (unlikely(input_size != edata_usize_get(edata))) {
safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
/* true_size */ edata_usize_get(edata), input_size);
return true;
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
@ -61,12 +101,18 @@ arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
if (unlikely(!is_slab)) {
/* edata must have been initialized at this point. */
assert(edata != NULL);
if (reset_recent &&
large_dalloc_safety_checks(edata, ptr,
edata_szind_get(edata))) {
prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
return;
}
large_prof_info_get(tsd, edata, prof_info, reset_recent);
} else {
prof_info->alloc_tctx = (prof_tctx_t *)(uintptr_t)1U;
prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
/*
* No need to set other fields in prof_info; they will never be
* accessed if (uintptr_t)alloc_tctx == (uintptr_t)1U.
* accessed if alloc_tctx == PROF_TCTX_SENTINEL.
*/
}
}
@ -131,7 +177,8 @@ arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
*/
ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd);
uint64_t *prng_state = tsd_prng_statep_get(tsd);
if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks))) {
if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks,
tsd_reentrancy_level_get(tsd) > 0))) {
arena_decay(tsdn, arena, false, false);
}
}
@ -143,23 +190,25 @@ arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
JEMALLOC_ALWAYS_INLINE void *
arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
tcache_t *tcache, bool slow_path) {
bool slab, tcache_t *tcache, bool slow_path) {
assert(!tsdn_null(tsdn) || tcache == NULL);
if (likely(tcache != NULL)) {
if (likely(size <= SC_SMALL_MAXCLASS)) {
if (likely(slab)) {
assert(sz_can_use_slab(size));
return tcache_alloc_small(tsdn_tsd(tsdn), arena,
tcache, size, ind, zero, slow_path);
}
if (likely(size <= tcache_maxclass)) {
} else if (likely(
ind < tcache_nbins_get(tcache->tcache_slow) &&
!tcache_bin_disabled(ind, &tcache->bins[ind],
tcache->tcache_slow))) {
return tcache_alloc_large(tsdn_tsd(tsdn), arena,
tcache, size, ind, zero, slow_path);
}
/* (size > tcache_maxclass) case falls through. */
assert(size > tcache_maxclass);
/* (size > tcache_max) case falls through. */
}
return arena_malloc_hard(tsdn, arena, size, ind, zero);
return arena_malloc_hard(tsdn, arena, size, ind, zero, slab);
}
JEMALLOC_ALWAYS_INLINE arena_t *
@ -210,35 +259,6 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
return sz_index2size(full_alloc_ctx.szind);
}
JEMALLOC_ALWAYS_INLINE bool
large_dalloc_safety_checks(edata_t *edata, void *ptr, szind_t szind) {
if (!config_opt_safety_checks) {
return false;
}
/*
* Eagerly detect double free and sized dealloc bugs for large sizes.
* The cost is low enough (as edata will be accessed anyway) to be
* enabled all the time.
*/
if (unlikely(edata == NULL ||
edata_state_get(edata) != extent_state_active)) {
safety_check_fail("Invalid deallocation detected: "
"pages being freed (%p) not currently active, "
"possibly caused by double free bugs.",
(uintptr_t)edata_addr_get(edata));
return true;
}
size_t input_size = sz_index2size(szind);
if (unlikely(input_size != edata_usize_get(edata))) {
safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
/* true_size */ edata_usize_get(edata), input_size);
return true;
}
return false;
}
static inline void
arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
if (config_prof && unlikely(szind < SC_NBINS)) {
@ -280,24 +300,76 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
bool slow_path) {
if (szind < nhbins) {
if (config_prof && unlikely(szind < SC_NBINS)) {
arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
} else {
assert (!tsdn_null(tsdn) && tcache != NULL);
bool is_sample_promoted = config_prof && szind < SC_NBINS;
if (unlikely(is_sample_promoted)) {
arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
} else {
if (szind < tcache_nbins_get(tcache->tcache_slow) &&
!tcache_bin_disabled(szind, &tcache->bins[szind],
tcache->tcache_slow)) {
tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind,
slow_path);
} else {
edata_t *edata = emap_edata_lookup(tsdn,
&arena_emap_global, ptr);
if (large_dalloc_safety_checks(edata, ptr, szind)) {
/* See the comment in isfree. */
return;
}
large_dalloc(tsdn, edata);
}
} else {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
if (large_dalloc_safety_checks(edata, ptr, szind)) {
/* See the comment in isfree. */
return;
}
large_dalloc(tsdn, edata);
}
}
/* Find the region index of a pointer. */
JEMALLOC_ALWAYS_INLINE size_t
arena_slab_regind_impl(div_info_t* div_info, szind_t binind,
edata_t *slab, const void *ptr) {
size_t diff, regind;
/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
(uintptr_t)bin_infos[binind].reg_size == 0);
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
/* Avoid doing division with a variable divisor. */
regind = div_compute(div_info, diff);
assert(regind < bin_infos[binind].nregs);
return regind;
}
/* Checks whether ptr is currently active in the arena. */
JEMALLOC_ALWAYS_INLINE bool
arena_tcache_dalloc_small_safety_check(tsdn_t *tsdn, void *ptr) {
if (!config_debug) {
return false;
}
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
szind_t binind = edata_szind_get(edata);
div_info_t div_info = arena_binind_div_info[binind];
/*
* Calls the internal function arena_slab_regind_impl because the
* safety check does not require a lock.
*/
size_t regind = arena_slab_regind_impl(&div_info, binind, edata, ptr);
slab_data_t *slab_data = edata_slab_data_get(edata);
const bin_info_t *bin_info = &bin_infos[binind];
assert(edata_nfree_get(edata) < bin_info->nregs);
if (unlikely(!bitmap_get(slab_data->bitmap, &bin_info->bitmap_info,
regind))) {
safety_check_fail(
"Invalid deallocation detected: the pointer being freed (%p) not "
"currently active, possibly caused by double free bugs.\n", ptr);
return true;
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
@ -313,7 +385,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
if (caller_alloc_ctx != NULL) {
alloc_ctx = *caller_alloc_ctx;
} else {
util_assume(!tsdn_null(tsdn));
util_assume(tsdn != NULL);
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
&alloc_ctx);
}
@ -328,6 +400,9 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
if (likely(alloc_ctx.slab)) {
/* Small allocation. */
if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
return;
}
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
alloc_ctx.szind, slow_path);
} else {
@ -415,6 +490,9 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
if (likely(alloc_ctx.slab)) {
/* Small allocation. */
if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
return;
}
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
alloc_ctx.szind, slow_path);
} else {
@ -442,7 +520,7 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
}
uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE -
lg_range);
edata->e_addr = (void *)((uintptr_t)edata->e_addr +
edata->e_addr = (void *)((byte_t *)edata->e_addr +
random_offset);
assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment) ==
edata->e_addr);
@ -465,22 +543,7 @@ struct arena_dalloc_bin_locked_info_s {
JEMALLOC_ALWAYS_INLINE size_t
arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
edata_t *slab, const void *ptr) {
size_t diff, regind;
/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
(uintptr_t)bin_infos[binind].reg_size == 0);
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
/* Avoid doing division with a variable divisor. */
regind = div_compute(&info->div_info, diff);
assert(regind < bin_infos[binind].nregs);
size_t regind = arena_slab_regind_impl(&info->div_info, binind, slab, ptr);
return regind;
}
@ -543,7 +606,7 @@ arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
static inline bin_t *
arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) {
bin_t *shard0 = (bin_t *)((uintptr_t)arena + arena_bin_offsets[binind]);
bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]);
return shard0 + binshard;
}

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H
#define JEMALLOC_INTERNAL_ARENA_STATS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/lockedint.h"
#include "jemalloc/internal/mutex.h"
@ -51,6 +52,8 @@ struct arena_stats_s {
* in pa_shard_stats_t.
*/
size_t base; /* Derived. */
size_t metadata_edata; /* Derived. */
size_t metadata_rtree; /* Derived. */
size_t resident; /* Derived. */
size_t metadata_thp; /* Derived. */
size_t mapped; /* Derived. */

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H
#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bin.h"
@ -91,11 +92,19 @@ struct arena_s {
/* Used to determine uptime. Read-only after initialization. */
nstime_t create_time;
/* The name of the arena. */
char name[ARENA_NAME_LEN];
/*
* The arena is allocated alongside its bins; really this is a
* dynamically sized array determined by the binshard settings.
* Enforcing cacheline-alignment to minimize the number of cachelines
* touched on the hot paths.
*/
bin_t bins[0];
JEMALLOC_WARN_ON_USAGE("Do not use this field directly. "
"Use `arena_get_bin` instead.")
JEMALLOC_ALIGNED(CACHELINE)
bin_t all_bins[0];
};
#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
#define JEMALLOC_INTERNAL_ARENA_TYPES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sc.h"
/* Default decay times in milliseconds. */
@ -8,6 +9,8 @@
#define MUZZY_DECAY_MS_DEFAULT (0)
/* Number of event ticks between time checks. */
#define ARENA_DECAY_NTICKS_PER_UPDATE 1000
/* Maximum length of the arena name. */
#define ARENA_NAME_LEN 32
typedef struct arena_decay_s arena_decay_t;
typedef struct arena_s arena_t;

View File

@ -1,3 +1,4 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/util.h"

View File

@ -1,7 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_H
#define JEMALLOC_INTERNAL_ATOMIC_H
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#include "jemalloc/internal/jemalloc_preamble.h"
#define JEMALLOC_U8_ATOMICS
#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
@ -22,6 +22,8 @@
# error "Don't have atomics implemented on this platform."
#endif
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
/*
* This header gives more or less a backport of C11 atomics. The user can write
* JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
#define JEMALLOC_INTERNAL_ATOMIC_C11_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include <stdatomic.h>
#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)

View File

@ -1,8 +1,11 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) {__VA_ARGS__}
typedef enum {
@ -126,4 +129,6 @@ atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \
atomic_enum_to_builtin(mo)); \
}
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
#include "jemalloc/internal/jemalloc_preamble.h"
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) {__VA_ARGS__}
typedef enum {
@ -192,4 +196,6 @@ atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \
return __sync_fetch_and_xor(&a->repr, val); \
}
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
#include "jemalloc/internal/jemalloc_preamble.h"
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) {__VA_ARGS__}
typedef enum {
@ -155,4 +159,6 @@ atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \
&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \
}
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */

View File

@ -1,6 +1,11 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/background_thread_structs.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/mutex.h"
extern bool opt_background_thread;
extern size_t opt_max_background_threads;
extern malloc_mutex_t background_thread_lock;

View File

@ -1,6 +1,11 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_inlines_a.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/background_thread_externs.h"
JEMALLOC_ALWAYS_INLINE bool
background_thread_enabled(void) {
return atomic_load_b(&background_thread_enabled_state, ATOMIC_RELAXED);

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
/* This file really combines "structs" and "types", but only transitionally. */
#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK)

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BASE_H
#define JEMALLOC_INTERNAL_BASE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/ehooks.h"
#include "jemalloc/internal/mutex.h"
@ -23,7 +24,7 @@ typedef enum metadata_thp_mode_e metadata_thp_mode_t;
#define METADATA_THP_DEFAULT metadata_thp_disabled
extern metadata_thp_mode_t opt_metadata_thp;
extern const char *metadata_thp_mode_names[];
extern const char *const metadata_thp_mode_names[];
/* Embedded at the beginning of every block of base-managed virtual memory. */
@ -72,8 +73,13 @@ struct base_s {
/* Heap of extents that track unused trailing space within blocks. */
edata_heap_t avail[SC_NSIZES];
/* Contains reusable base edata (used by tcache_stacks currently). */
edata_avail_t edata_avail;
/* Stats, only maintained if config_stats. */
size_t allocated;
size_t edata_allocated;
size_t rtree_allocated;
size_t resident;
size_t mapped;
/* Number of THP regions touched. */
@ -100,8 +106,12 @@ extent_hooks_t *base_extent_hooks_set(base_t *base,
extent_hooks_t *extent_hooks);
void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
void *base_alloc_rtree(tsdn_t *tsdn, base_t *base, size_t size);
void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
size_t *resident, size_t *mapped, size_t *n_thp);
size_t *edata_allocated, size_t *rtree_allocated, size_t *resident,
size_t *mapped, size_t *n_thp);
void base_prefork(tsdn_t *tsdn, base_t *base);
void base_postfork_parent(tsdn_t *tsdn, base_t *base);
void base_postfork_child(tsdn_t *tsdn, base_t *base);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_H
#define JEMALLOC_INTERNAL_BIN_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bin_stats.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/edata.h"
@ -48,7 +49,7 @@ struct bins_s {
bin_t *bin_shards;
};
void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]);
void bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]);
bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_INFO_H
#define JEMALLOC_INTERNAL_BIN_INFO_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bitmap.h"
/*

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_STATS_H
#define JEMALLOC_INTERNAL_BIN_STATS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex_prof.h"
typedef struct bin_stats_s bin_stats_t;

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
#define JEMALLOC_INTERNAL_BIN_TYPES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sc.h"
#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH)

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/* Sanity check. */
@ -340,7 +341,6 @@ ffs_u32(uint32_t x) {
#else
#error No implementation for 32-bit ffs()
#endif
return ffs_u(x);
}
static inline unsigned
@ -350,7 +350,6 @@ fls_u32(uint32_t x) {
#else
#error No implementation for 32-bit fls()
#endif
return fls_u(x);
}
static inline uint64_t

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BITMAP_H
#define JEMALLOC_INTERNAL_BITMAP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/sc.h"

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H
#define JEMALLOC_INTERNAL_BUF_WRITER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/tsd_types.h"
/*
* Note: when using the buffered writer, cbopaque is passed to write_cb only
* when the buffer is flushed. It would make a difference if cbopaque points

View File

@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
#define JEMALLOC_INTERNAL_CACHE_BIN_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sz.h"
/*
@ -20,16 +23,20 @@
*/
typedef uint16_t cache_bin_sz_t;
#define JUNK_ADDR ((uintptr_t)0x7a7a7a7a7a7a7a7aULL)
/*
* Leave a noticeable mark pattern on the cache bin stack boundaries, in case a
* bug starts leaking those. Make it look like the junk pattern but be distinct
* from it.
*/
static const uintptr_t cache_bin_preceding_junk =
(uintptr_t)0x7a7a7a7a7a7a7a7aULL;
/* Note: a7 vs. 7a above -- this tells you which pointer leaked. */
static const uintptr_t cache_bin_trailing_junk =
(uintptr_t)0xa7a7a7a7a7a7a7a7ULL;
static const uintptr_t cache_bin_preceding_junk = JUNK_ADDR;
/* Note: JUNK_ADDR vs. JUNK_ADDR + 1 -- this tells you which pointer leaked. */
static const uintptr_t cache_bin_trailing_junk = JUNK_ADDR + 1;
/*
* A pointer used to initialize a fake stack_head for disabled small bins
* so that the enabled/disabled assessment does not rely on ncached_max.
*/
extern const uintptr_t disabled_bin;
/*
* That implies the following value, for the maximum number of items in any
@ -122,6 +129,9 @@ struct cache_bin_s {
* array. Immutable after initialization.
*/
uint16_t low_bits_empty;
/* The maximum number of cached items in the bin. */
cache_bin_info_t bin_info;
};
/*
@ -168,10 +178,41 @@ cache_bin_nonfast_aligned(const void *ptr) {
return ((uintptr_t)ptr & san_cache_bin_nonfast_mask) == 0;
}
static inline const void *
cache_bin_disabled_bin_stack(void) {
return &disabled_bin;
}
/*
* If a cache bin was zero initialized (either because it lives in static or
* thread-local storage, or was memset to 0), this function indicates whether or
* not cache_bin_init was called on it.
*/
static inline bool
cache_bin_still_zero_initialized(cache_bin_t *bin) {
return bin->stack_head == NULL;
}
static inline bool
cache_bin_disabled(cache_bin_t *bin) {
bool disabled = (bin->stack_head == cache_bin_disabled_bin_stack());
if (disabled) {
assert((uintptr_t)(*bin->stack_head) == JUNK_ADDR);
}
return disabled;
}
/* Gets ncached_max without asserting that the bin is enabled. */
static inline cache_bin_sz_t
cache_bin_ncached_max_get_unsafe(cache_bin_t *bin) {
return bin->bin_info.ncached_max;
}
/* Returns ncached_max: Upper limit on ncached. */
static inline cache_bin_sz_t
cache_bin_info_ncached_max(cache_bin_info_t *info) {
return info->ncached_max;
cache_bin_ncached_max_get(cache_bin_t *bin) {
assert(!cache_bin_disabled(bin));
return cache_bin_ncached_max_get_unsafe(bin);
}
/*
@ -193,28 +234,19 @@ cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
* Does difference calculations that handle wraparound correctly. Earlier must
* be associated with the position earlier in memory.
*/
static inline uint16_t
cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later, bool racy) {
/*
* When it's racy, bin->low_bits_full can be modified concurrently. It
* can cross the uint16_t max value and become less than
* bin->low_bits_empty at the time of the check.
*/
if (!racy) {
cache_bin_assert_earlier(bin, earlier, later);
}
static inline cache_bin_sz_t
cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
cache_bin_assert_earlier(bin, earlier, later);
return later - earlier;
}
/*
* Number of items currently cached in the bin, without checking ncached_max.
* We require specifying whether or not the request is racy or not (i.e. whether
* or not concurrent modifications are possible).
*/
static inline cache_bin_sz_t
cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
cache_bin_ncached_get_internal(cache_bin_t *bin) {
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty, racy);
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
cache_bin_sz_t n = diff / sizeof(void *);
/*
* We have undefined behavior here; if this function is called from the
@ -225,7 +257,7 @@ cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
* fast paths. This should still be "safe" in the sense of generating
* the correct assembly for the foreseeable future, though.
*/
assert(n == 0 || *(bin->stack_head) != NULL || racy);
assert(n == 0 || *(bin->stack_head) != NULL);
return n;
}
@ -235,10 +267,9 @@ cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
* possible.
*/
static inline cache_bin_sz_t
cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
/* racy */ false);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_ncached_get_local(cache_bin_t *bin) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin);
assert(n <= cache_bin_ncached_max_get(bin));
return n;
}
@ -253,9 +284,8 @@ cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
static inline void **
cache_bin_empty_position_get(cache_bin_t *bin) {
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty,
/* racy */ false);
uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff;
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
byte_t *empty_bits = (byte_t *)bin->stack_head + diff;
void **ret = (void **)empty_bits;
assert(ret >= bin->stack_head);
@ -274,9 +304,9 @@ cache_bin_empty_position_get(cache_bin_t *bin) {
* arena statistics collection.
*/
static inline uint16_t
cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_low_bits_low_bound_get(cache_bin_t *bin) {
return (uint16_t)bin->low_bits_empty -
info->ncached_max * sizeof(void *);
cache_bin_ncached_max_get(bin) * sizeof(void *);
}
/*
@ -285,8 +315,8 @@ cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
* A pointer to the position with the lowest address of the backing array.
*/
static inline void **
cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
cache_bin_low_bound_get(cache_bin_t *bin) {
cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin);
void **ret = cache_bin_empty_position_get(bin) - ncached_max;
assert(ret <= bin->stack_head);
@ -298,8 +328,8 @@ cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
* batch fill a nonempty cache bin.
*/
static inline void
cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
assert(cache_bin_ncached_get_local(bin, info) == 0);
cache_bin_assert_empty(cache_bin_t *bin) {
assert(cache_bin_ncached_get_local(bin) == 0);
assert(cache_bin_empty_position_get(bin) == bin->stack_head);
}
@ -311,15 +341,15 @@ cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
static inline cache_bin_sz_t
cache_bin_low_water_get_internal(cache_bin_t *bin) {
return cache_bin_diff(bin, bin->low_bits_low_water,
bin->low_bits_empty, /* racy */ false) / sizeof(void *);
bin->low_bits_empty) / sizeof(void *);
}
/* Returns the numeric value of low water in [0, ncached]. */
static inline cache_bin_sz_t
cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_low_water_get(cache_bin_t *bin) {
cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin);
assert(low_water <= cache_bin_info_ncached_max(info));
assert(low_water <= cache_bin_ncached_get_local(bin, info));
assert(low_water <= cache_bin_ncached_max_get(bin));
assert(low_water <= cache_bin_ncached_get_local(bin));
cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head,
bin->low_bits_low_water);
@ -333,12 +363,14 @@ cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
*/
static inline void
cache_bin_low_water_set(cache_bin_t *bin) {
assert(!cache_bin_disabled(bin));
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
}
static inline void
cache_bin_low_water_adjust(cache_bin_t *bin) {
if (cache_bin_ncached_get_internal(bin, /* racy */ false)
assert(!cache_bin_disabled(bin));
if (cache_bin_ncached_get_internal(bin)
< cache_bin_low_water_get_internal(bin)) {
cache_bin_low_water_set(bin);
}
@ -410,8 +442,7 @@ cache_bin_alloc(cache_bin_t *bin, bool *success) {
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
/* racy */ false);
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin);
if (n > num) {
n = (cache_bin_sz_t)num;
}
@ -427,6 +458,35 @@ cache_bin_full(cache_bin_t *bin) {
return ((uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full);
}
/*
* Scans the allocated area of the cache_bin for the given pointer up to limit.
* Fires safety_check_fail if the ptr is found and returns true.
*/
JEMALLOC_ALWAYS_INLINE bool
cache_bin_dalloc_safety_checks(cache_bin_t *bin, void *ptr) {
if (!config_debug || opt_debug_double_free_max_scan == 0) {
return false;
}
cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin);
unsigned max_scan = opt_debug_double_free_max_scan < ncached
? opt_debug_double_free_max_scan
: ncached;
void **cur = bin->stack_head;
void **limit = cur + max_scan;
for (; cur < limit; cur++) {
if (*cur == ptr) {
safety_check_fail(
"Invalid deallocation detected: double free of "
"pointer %p\n",
ptr);
return true;
}
}
return false;
}
/*
* Free an object into the given bin. Fails only if the bin is full.
*/
@ -436,6 +496,10 @@ cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
return false;
}
if (unlikely(cache_bin_dalloc_safety_checks(bin, ptr))) {
return true;
}
bin->stack_head--;
*bin->stack_head = ptr;
cache_bin_assert_earlier(bin, bin->low_bits_full,
@ -454,9 +518,8 @@ cache_bin_stash(cache_bin_t *bin, void *ptr) {
/* Stash at the full position, in the [full, head) range. */
uint16_t low_bits_head = (uint16_t)(uintptr_t)bin->stack_head;
/* Wraparound handled as well. */
uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head,
/* racy */ false);
*(void **)((uintptr_t)bin->stack_head - diff) = ptr;
uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head);
*(void **)((byte_t *)bin->stack_head - diff) = ptr;
assert(!cache_bin_full(bin));
bin->low_bits_full += sizeof(void *);
@ -465,28 +528,18 @@ cache_bin_stash(cache_bin_t *bin, void *ptr) {
return true;
}
/*
* Get the number of stashed pointers.
*
* When called from a thread not owning the TLS (i.e. racy = true), it's
* important to keep in mind that 'bin->stack_head' and 'bin->low_bits_full' can
* be modified concurrently and almost none assertions about their values can be
* made.
*/
/* Get the number of stashed pointers. */
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_nstashed_get_internal(cache_bin_t *bin, cache_bin_info_t *info,
bool racy) {
cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin,
info);
cache_bin_nstashed_get_internal(cache_bin_t *bin) {
cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin);
uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin);
cache_bin_sz_t n = cache_bin_diff(bin, low_bits_low_bound,
bin->low_bits_full, racy) / sizeof(void *);
bin->low_bits_full) / sizeof(void *);
assert(n <= ncached_max);
if (!racy) {
if (config_debug && n != 0) {
/* Below are for assertions only. */
void **low_bound = cache_bin_low_bound_get(bin, info);
void **low_bound = cache_bin_low_bound_get(bin);
assert((uint16_t)(uintptr_t)low_bound == low_bits_low_bound);
void *stashed = *(low_bound + n - 1);
@ -495,35 +548,56 @@ cache_bin_nstashed_get_internal(cache_bin_t *bin, cache_bin_info_t *info,
/* Allow arbitrary pointers to be stashed in tests. */
aligned = true;
#endif
assert(n == 0 || (stashed != NULL && aligned));
assert(stashed != NULL && aligned);
}
return n;
}
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_nstashed_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin, info,
/* racy */ false);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_nstashed_get_local(cache_bin_t *bin) {
cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin);
assert(n <= cache_bin_ncached_max_get(bin));
return n;
}
/*
* Obtain a racy view of the number of items currently in the cache bin, in the
* presence of possible concurrent modifications.
*
* Note that this is the only racy function in this header. Any other functions
* are assumed to be non-racy. The "racy" term here means accessed from another
* thread (that is not the owner of the specific cache bin). This only happens
* when gathering stats (read-only). The only change because of the racy
* condition is that assertions based on mutable fields are omitted.
*
* It's important to keep in mind that 'bin->stack_head' and
* 'bin->low_bits_full' can be modified concurrently and almost no assertions
* about their values can be made.
*
* This function should not call other utility functions because the racy
* condition may cause unexpected / undefined behaviors in unverified utility
* functions. Currently, this function calls two utility functions
* cache_bin_ncached_max_get and cache_bin_low_bits_low_bound_get because
* they help access values that will not be concurrently modified.
*/
static inline void
cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, /* racy */ true);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_sz_t *ncached,
cache_bin_sz_t *nstashed) {
/* Racy version of cache_bin_ncached_get_internal. */
cache_bin_sz_t diff = bin->low_bits_empty -
(uint16_t)(uintptr_t)bin->stack_head;
cache_bin_sz_t n = diff / sizeof(void *);
*ncached = n;
n = cache_bin_nstashed_get_internal(bin, info, /* racy */ true);
assert(n <= cache_bin_info_ncached_max(info));
/* Racy version of cache_bin_nstashed_get_internal. */
uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin);
n = (bin->low_bits_full - low_bits_low_bound) / sizeof(void *);
*nstashed = n;
/* Note that cannot assert ncached + nstashed <= ncached_max (racy). */
/*
* Note that cannot assert anything regarding ncached_max because
* it can be configured on the fly and is thus racy.
*/
}
/*
@ -567,9 +641,9 @@ struct cache_bin_ptr_array_s {
* finish_fill call before doing any alloc/dalloc operations on the bin.
*/
static inline void
cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
cache_bin_assert_empty(bin, info);
cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_ptr_array_t *arr,
cache_bin_sz_t nfill) {
cache_bin_assert_empty(bin);
arr->ptr = cache_bin_empty_position_get(bin) - nfill;
}
@ -579,9 +653,9 @@ cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
* case of OOM.
*/
static inline void
cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
cache_bin_assert_empty(bin, info);
cache_bin_finish_fill(cache_bin_t *bin, cache_bin_ptr_array_t *arr,
cache_bin_sz_t nfilled) {
cache_bin_assert_empty(bin);
void **empty_position = cache_bin_empty_position_get(bin);
if (nfilled < arr->n) {
memmove(empty_position - nfilled, empty_position - arr->n,
@ -595,42 +669,41 @@ cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
* everything we give them.
*/
static inline void
cache_bin_init_ptr_array_for_flush(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_init_ptr_array_for_flush(cache_bin_t *bin,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) {
arr->ptr = cache_bin_empty_position_get(bin) - nflush;
assert(cache_bin_ncached_get_local(bin, info) == 0
assert(cache_bin_ncached_get_local(bin) == 0
|| *arr->ptr != NULL);
}
static inline void
cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get_local(bin, info) - nflushed;
cache_bin_finish_flush(cache_bin_t *bin, cache_bin_ptr_array_t *arr,
cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get_local(bin) - nflushed;
memmove(bin->stack_head + nflushed, bin->stack_head,
rem * sizeof(void *));
bin->stack_head = bin->stack_head + nflushed;
bin->stack_head += nflushed;
cache_bin_low_water_adjust(bin);
}
static inline void
cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind,
cache_bin_info_t *info, cache_bin_ptr_array_t *arr,
cache_bin_sz_t nstashed) {
cache_bin_ptr_array_t *arr, cache_bin_sz_t nstashed) {
assert(nstashed > 0);
assert(cache_bin_nstashed_get_local(bin, info) == nstashed);
assert(cache_bin_nstashed_get_local(bin) == nstashed);
void **low_bound = cache_bin_low_bound_get(bin, info);
void **low_bound = cache_bin_low_bound_get(bin);
arr->ptr = low_bound;
assert(*arr->ptr != NULL);
}
static inline void
cache_bin_finish_flush_stashed(cache_bin_t *bin, cache_bin_info_t *info) {
void **low_bound = cache_bin_low_bound_get(bin, info);
cache_bin_finish_flush_stashed(cache_bin_t *bin) {
void **low_bound = cache_bin_low_bound_get(bin);
/* Reset the bin local full position. */
bin->low_bits_full = (uint16_t)(uintptr_t)low_bound;
assert(cache_bin_nstashed_get_local(bin, info) == 0);
assert(cache_bin_nstashed_get_local(bin) == 0);
}
/*
@ -643,8 +716,8 @@ void cache_bin_info_init(cache_bin_info_t *bin_info,
* Given an array of initialized cache_bin_info_ts, determine how big an
* allocation is required to initialize a full set of cache_bin_ts.
*/
void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
size_t *size, size_t *alignment);
void cache_bin_info_compute_alloc(const cache_bin_info_t *infos,
szind_t ninfos, size_t *size, size_t *alignment);
/*
* Actually initialize some cache bins. Callers should allocate the backing
@ -653,18 +726,13 @@ void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
* cache_bin_postincrement. *alloc_cur will then point immediately past the end
* of the allocation.
*/
void cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos,
void cache_bin_preincrement(const cache_bin_info_t *infos, szind_t ninfos,
void *alloc, size_t *cur_offset);
void cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos,
void cache_bin_postincrement(void *alloc, size_t *cur_offset);
void cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info,
void *alloc, size_t *cur_offset);
void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
size_t *cur_offset);
void cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max);
/*
* If a cache bin was zero initialized (either because it lives in static or
* thread-local storage, or was memset to 0), this function indicates whether or
* not cache_bin_init was called on it.
*/
bool cache_bin_still_zero_initialized(cache_bin_t *bin);
bool cache_bin_stack_use_thp(void);
#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_CKH_H
#define JEMALLOC_INTERNAL_CKH_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd.h"
/* Cuckoo hashing implementation. Skip to the end for the interface. */

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_COUNTER_H
#define JEMALLOC_INTERNAL_COUNTER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/lockedint.h"
#include "jemalloc/internal/mutex.h"
typedef struct counter_accum_s {

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_CTL_H
#define JEMALLOC_INTERNAL_CTL_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/background_thread_structs.h"
#include "jemalloc/internal/bin_stats.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex_prof.h"
@ -10,6 +14,7 @@
/* Maximum ctl tree depth. */
#define CTL_MAX_DEPTH 7
#define CTL_MULTI_SETTING_MAX_LEN 1000
typedef struct ctl_node_s {
bool named;
@ -53,6 +58,8 @@ typedef struct ctl_stats_s {
size_t allocated;
size_t active;
size_t metadata;
size_t metadata_edata;
size_t metadata_rtree;
size_t metadata_thp;
size_t resident;
size_t mapped;

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_DECAY_H
#define JEMALLOC_INTERNAL_DECAY_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/smoothstep.h"
#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t)-1)

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_DIV_H
#define JEMALLOC_INTERNAL_DIV_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*

View File

@ -1,9 +1,10 @@
#ifndef JEMALLOC_INTERNAL_ECACHE_H
#define JEMALLOC_INTERNAL_ECACHE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/eset.h"
#include "jemalloc/internal/san.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/san.h"
typedef struct ecache_s ecache_t;
struct ecache_s {

View File

@ -1,12 +1,14 @@
#ifndef JEMALLOC_INTERNAL_EDATA_H
#define JEMALLOC_INTERNAL_EDATA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bin_info.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/hpdata.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/slab_data.h"
@ -375,18 +377,18 @@ edata_ps_get(const edata_t *edata) {
static inline void *
edata_before_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) - PAGE);
return (void *)((byte_t *)edata_base_get(edata) - PAGE);
}
static inline void *
edata_last_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) +
return (void *)((byte_t *)edata_base_get(edata) +
edata_size_get(edata) - PAGE);
}
static inline void *
edata_past_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) +
return (void *)((byte_t *)edata_base_get(edata) +
edata_size_get(edata));
}
@ -619,7 +621,8 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
}
static inline void
edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn,
bool reused) {
edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
edata_addr_set(edata, addr);
edata_bsize_set(edata, bsize);
@ -627,7 +630,8 @@ edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
edata_szind_set(edata, SC_NSIZES);
edata_sn_set(edata, sn);
edata_state_set(edata, extent_state_active);
edata_guarded_set(edata, false);
/* See comments in base_edata_is_reused. */
edata_guarded_set(edata, reused);
edata_zeroed_set(edata, true);
edata_committed_set(edata, true);
/*
@ -656,19 +660,28 @@ edata_ead_comp(const edata_t *a, const edata_t *b) {
static inline edata_cmp_summary_t
edata_cmp_summary_get(const edata_t *edata) {
return (edata_cmp_summary_t){edata_sn_get(edata),
(uintptr_t)edata_addr_get(edata)};
edata_cmp_summary_t result;
result.sn = edata_sn_get(edata);
result.addr = (uintptr_t)edata_addr_get(edata);
return result;
}
static inline int
edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
int ret;
ret = (a.sn > b.sn) - (a.sn < b.sn);
if (ret != 0) {
return ret;
}
ret = (a.addr > b.addr) - (a.addr < b.addr);
return ret;
/*
* Logically, what we're doing here is comparing based on `.sn`, and
* falling back to comparing on `.addr` in the case that `a.sn == b.sn`.
* We accomplish this by multiplying the result of the `.sn` comparison
* by 2, so that so long as it is not 0, it will dominate the `.addr`
* comparison in determining the sign of the returned result value.
* The justification for doing things this way is that this is
* branchless - all of the branches that would be present in a
* straightforward implementation are common cases, and thus the branch
* prediction accuracy is not great. As a result, this implementation
* is measurably faster (by around 30%).
*/
return (2 * ((a.sn > b.sn) - (a.sn < b.sn))) +
((a.addr > b.addr) - (a.addr < b.addr));
}
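As a quick standalone illustration (not part of this changeset), the sketch below enumerates the nine possible sign combinations and confirms that doubling the first three-way comparison makes it dominate whenever it is nonzero, with the address comparison breaking ties:

#include <assert.h>

/* Mirrors the 2 * primary + secondary combination used above. */
static int
combine(int sn_cmp, int addr_cmp) {
	return 2 * sn_cmp + addr_cmp;
}

int
main(void) {
	for (int sn = -1; sn <= 1; sn++) {
		for (int addr = -1; addr <= 1; addr++) {
			int r = combine(sn, addr);
			if (sn != 0) {
				/* A nonzero sn comparison decides the sign. */
				assert((r > 0) == (sn > 0));
				assert((r < 0) == (sn < 0));
			} else {
				/* On an sn tie, the addr comparison decides. */
				assert(r == addr);
			}
		}
	}
	return 0;
}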
static inline int
@ -681,15 +694,11 @@ edata_snad_comp(const edata_t *a, const edata_t *b) {
static inline int
edata_esnead_comp(const edata_t *a, const edata_t *b) {
int ret;
ret = edata_esn_comp(a, b);
if (ret != 0) {
return ret;
}
ret = edata_ead_comp(a, b);
return ret;
/*
* Similar to `edata_cmp_summary_comp`, we've opted for a
* branchless implementation for the sake of performance.
*/
return (2 * edata_esn_comp(a, b)) + edata_ead_comp(a, b);
}
ph_proto(, edata_avail, edata_t)

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H
#define JEMALLOC_INTERNAL_EDATA_CACHE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
/* For tests only. */

View File

@ -1,8 +1,11 @@
#ifndef JEMALLOC_INTERNAL_EHOOKS_H
#define JEMALLOC_INTERNAL_EHOOKS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/extent_mmap.h"
#include "jemalloc/internal/tsd.h"
#include "jemalloc/internal/tsd_types.h"
/*
* This module is the internal interface to the extent hooks (both
@ -53,7 +56,7 @@ bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length);
#ifdef PAGES_CAN_PURGE_FORCED
bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length);
#endif
bool ehooks_default_split_impl();
bool ehooks_default_split_impl(void);
/*
* Merge is the only default extent hook we declare -- see the comment in
* ehooks_merge.

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_EMAP_H
#define JEMALLOC_INTERNAL_EMAP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/rtree.h"

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_EMITTER_H
#define JEMALLOC_INTERNAL_EMITTER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/ql.h"
typedef enum emitter_output_e emitter_output_t;

View File

@ -1,9 +1,10 @@
#ifndef JEMALLOC_INTERNAL_ESET_H
#define JEMALLOC_INTERNAL_ESET_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/mutex.h"
/*

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_EXP_GROW_H
#define JEMALLOC_INTERNAL_EXP_GROW_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sz.h"
typedef struct exp_grow_s exp_grow_t;
struct exp_grow_s {
/*

View File

@ -1,8 +1,10 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_H
#define JEMALLOC_INTERNAL_EXTENT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/ecache.h"
#include "jemalloc/internal/ehooks.h"
#include "jemalloc/internal/pac.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/rtree.h"
@ -44,8 +46,6 @@ void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
bool extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H
#define JEMALLOC_INTERNAL_EXTENT_DSS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/tsd_types.h"
typedef enum {
dss_prec_disabled = 0,
dss_prec_primary = 1,
@ -11,7 +15,7 @@ typedef enum {
#define DSS_PREC_DEFAULT dss_prec_secondary
#define DSS_DEFAULT "secondary"
extern const char *dss_prec_names[];
extern const char *const dss_prec_names[];
extern const char *opt_dss;

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
extern bool opt_retain;
void *extent_alloc_mmap(void *new_addr, size_t size, size_t alignment,

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_FB_H
#define JEMALLOC_INTERNAL_FB_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bit_util.h"
/*
* The flat bitmap module. This has a larger API relative to the bitmap module
* (supporting things like backwards searches, and searching for both set and

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_FXP_H
#define JEMALLOC_INTERNAL_FXP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*
* A simple fixed-point math implementation, supporting only unsigned values
* (with overflow being an error).

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_HASH_H
#define JEMALLOC_INTERNAL_HASH_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_HOOK_H
#define JEMALLOC_INTERNAL_HOOK_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd.h"
/*
@ -55,6 +56,7 @@ enum hook_alloc_e {
hook_alloc_calloc,
hook_alloc_memalign,
hook_alloc_valloc,
hook_alloc_pvalloc,
hook_alloc_mallocx,
/* The reallocating functions have both alloc and dalloc variants */
@ -143,9 +145,9 @@ struct hook_ralloc_args_s {
* Returns an opaque handle to be used when removing the hook. NULL means that
* we couldn't install the hook.
*/
bool hook_boot();
bool hook_boot(void);
void *hook_install(tsdn_t *tsdn, hooks_t *hooks);
void *hook_install(tsdn_t *tsdn, hooks_t *to_install);
/* Uninstalls the hook with the handle previously returned from hook_install. */
void hook_remove(tsdn_t *tsdn, void *opaque);

View File

@ -1,19 +1,19 @@
#ifndef JEMALLOC_INTERNAL_HPA_H
#define JEMALLOC_INTERNAL_HPA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/edata_cache.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/exp_grow.h"
#include "jemalloc/internal/hpa_hooks.h"
#include "jemalloc/internal/hpa_opts.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
typedef struct hpa_central_s hpa_central_t;
struct hpa_central_s {
/*
* The mutex guarding most of the operations on the central data
* structure.
*/
malloc_mutex_t mtx;
/*
* Guards expansion of eden. We separate this from the regular mutex so
* that cheaper operations can still continue while we're doing the OS
@ -148,7 +148,7 @@ struct hpa_shard_s {
* is not necessarily a guarantee that it backs its allocations by hugepages,
* just that it can function properly given the system it's running on.
*/
bool hpa_supported();
bool hpa_supported(void);
bool hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
base_t *base, edata_cache_t *edata_cache, unsigned ind,

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H
#define JEMALLOC_INTERNAL_HPA_HOOKS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/nstime.h"
typedef struct hpa_hooks_s hpa_hooks_t;
struct hpa_hooks_s {
void *(*map)(size_t size);
@ -12,6 +15,6 @@ struct hpa_hooks_s {
uint64_t (*ms_since)(nstime_t *r_time);
};
extern hpa_hooks_t hpa_hooks_default;
extern const hpa_hooks_t hpa_hooks_default;
#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H
#define JEMALLOC_INTERNAL_HPA_OPTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/fxp.h"
/*

View File

@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_HPDATA_H
#define JEMALLOC_INTERNAL_HPDATA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/pages.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/typed_list.h"
@ -343,12 +346,12 @@ hpdata_assert_consistent(hpdata_t *hpdata) {
}
static inline bool
hpdata_empty(hpdata_t *hpdata) {
hpdata_empty(const hpdata_t *hpdata) {
return hpdata->h_nactive == 0;
}
static inline bool
hpdata_full(hpdata_t *hpdata) {
hpdata_full(const hpdata_t *hpdata) {
return hpdata->h_nactive == HUGEPAGE_PAGES;
}
@ -359,7 +362,7 @@ void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age);
* offset within that allocation.
*/
void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz);
void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
void hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz);
/*
* The hpdata_purge_prepare_t allows grabbing the metadata required to purge

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_INSPECT_H
#define JEMALLOC_INTERNAL_INSPECT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd_types.h"
/*
* This module contains the heap introspection capabilities. For now they are
* exposed purely through mallctl APIs in the experimental namespace, but this

View File

@ -32,7 +32,7 @@
# include <sys/uio.h>
# endif
# include <pthread.h>
# if defined(__FreeBSD__) || defined(__DragonFly__)
# if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__OpenBSD__)
# include <pthread_np.h>
# include <sched.h>
# if defined(__FreeBSD__)
@ -105,4 +105,21 @@ isblank(int c) {
# undef small
#endif
/*
* Oftentimes we'd like to perform some kind of arithmetic to obtain
* a pointer from another pointer but with some offset or mask applied.
* Naively you would accomplish this by casting the source pointer to
* `uintptr_t`, performing all of the relevant arithmetic, and then casting
* the result to the desired pointer type. However, this has the unfortunate
* side-effect of concealing pointer provenance, hiding useful information for
* optimization from the compiler (see here for details:
* https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html
* )
* Instead what one should do is cast the source pointer to `char *` and perform
* the equivalent arithmetic (since `char` of course represents one byte). But
* because `char *` has the semantic meaning of "string", we define this typedef
* simply to make it clearer where we are performing such pointer arithmetic.
*/
typedef char byte_t;
#endif /* JEMALLOC_INTERNAL_H */
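To make the provenance point concrete, here is a minimal hedged sketch (the helper names are illustrative, not from the tree) contrasting byte-pointer arithmetic with the uintptr_t round-trip it replaces:

#include <stddef.h>
#include <stdint.h>

typedef char byte_t;

/* Preferred: offset through a byte pointer, keeping provenance visible. */
static inline void *
ptr_offset(void *base, size_t offset) {
	return (void *)((byte_t *)base + offset);
}

/*
 * Discouraged: computes the same address, but the integer round-trip hides
 * the relationship between the result and `base` from the optimizer.
 */
static inline void *
ptr_offset_via_uintptr(void *base, size_t offset) {
	return (void *)((uintptr_t)base + offset);
}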

View File

@ -14,10 +14,13 @@
*/
#undef JEMALLOC_OVERRIDE___LIBC_CALLOC
#undef JEMALLOC_OVERRIDE___LIBC_FREE
#undef JEMALLOC_OVERRIDE___LIBC_FREE_SIZED
#undef JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED
#undef JEMALLOC_OVERRIDE___LIBC_MALLOC
#undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN
#undef JEMALLOC_OVERRIDE___LIBC_REALLOC
#undef JEMALLOC_OVERRIDE___LIBC_VALLOC
#undef JEMALLOC_OVERRIDE___LIBC_PVALLOC
#undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN
/*
@ -88,6 +91,9 @@
/* Defined if pthread_getname_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP
/* Defined if pthread_set_name_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_SET_NAME_NP
/* Defined if pthread_get_name_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP
@ -161,6 +167,12 @@
/* Use gcc intrinsics for profile backtracing if defined. */
#undef JEMALLOC_PROF_GCC
/* JEMALLOC_PAGEID enabled page id */
#undef JEMALLOC_PAGEID
/* JEMALLOC_HAVE_PRCTL checks prctl */
#undef JEMALLOC_HAVE_PRCTL
/*
* JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
* segment (DSS).
@ -259,6 +271,12 @@
*/
#undef JEMALLOC_READLINKAT
/*
* If defined, use getenv() (instead of secure_getenv() or
* alternatives) to access MALLOC_CONF.
*/
#undef JEMALLOC_FORCE_GETENV
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
@ -384,6 +402,9 @@
/* GNU specific sched_setaffinity support */
#undef JEMALLOC_HAVE_SCHED_SETAFFINITY
/* pthread_setaffinity_np support */
#undef JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP
/*
* If defined, all the features necessary for background threads are present.
*/
@ -424,4 +445,15 @@
/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */
#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE
/* If defined, use volatile asm during benchmarks. */
#undef JEMALLOC_HAVE_ASM_VOLATILE
/*
* If defined, support the use of rdtscp to get the time stamp counter
* and the processor ID.
*/
#undef JEMALLOC_HAVE_RDTSCP
#include "jemalloc/internal/jemalloc_internal_overrides.h"
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */

View File

@ -1,11 +1,12 @@
#ifndef JEMALLOC_INTERNAL_EXTERNS_H
#define JEMALLOC_INTERNAL_EXTERNS_H
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/hpa_opts.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/sec_opts.h"
#include "jemalloc/internal/tsd_types.h"
#include "jemalloc/internal/nstime.h"
/* TSD checks this to set thread local slow state accordingly. */
extern bool malloc_slow;
@ -22,8 +23,9 @@ extern sec_opts_t opt_hpa_sec_opts;
extern const char *opt_junk;
extern bool opt_junk_alloc;
extern bool opt_junk_free;
extern void (*junk_free_callback)(void *ptr, size_t size);
extern void (*junk_alloc_callback)(void *ptr, size_t size);
extern void (*JET_MUTABLE junk_free_callback)(void *ptr, size_t size);
extern void (*JET_MUTABLE junk_alloc_callback)(void *ptr, size_t size);
extern void (*JET_MUTABLE invalid_conf_abort)(void);
extern bool opt_utrace;
extern bool opt_xmalloc;
extern bool opt_experimental_infallible_new;
@ -31,9 +33,10 @@ extern bool opt_zero;
extern unsigned opt_narenas;
extern zero_realloc_action_t opt_zero_realloc_action;
extern malloc_init_t malloc_init_state;
extern const char *zero_realloc_mode_names[];
extern const char *const zero_realloc_mode_names[];
extern atomic_zu_t zero_realloc_count;
extern bool opt_cache_oblivious;
extern unsigned opt_debug_double_free_max_scan;
/* Escape free-fastpath when ptr & mask == 0 (for sanitization purpose). */
extern uintptr_t san_cache_bin_nonfast_mask;
@ -69,7 +72,8 @@ size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
void jemalloc_prefork(void);
void jemalloc_postfork_parent(void);
void jemalloc_postfork_child(void);
void je_sdallocx_noflags(void *ptr, size_t size);
void sdallocx_default(void *ptr, size_t size, int flags);
void free_default(void *ptr);
void *malloc_default(size_t size);
#endif /* JEMALLOC_INTERNAL_EXTERNS_H */

View File

@ -1,10 +1,14 @@
#ifndef JEMALLOC_INTERNAL_INLINES_A_H
#define JEMALLOC_INTERNAL_INLINES_A_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/tcache_externs.h"
#include "jemalloc/internal/ticker.h"
JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
@ -14,6 +18,15 @@ malloc_getcpu(void) {
return GetCurrentProcessorNumber();
#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
return (malloc_cpuid_t)sched_getcpu();
#elif defined(JEMALLOC_HAVE_RDTSCP)
unsigned int ecx;
asm volatile("rdtscp" : "=c" (ecx) :: "eax", "edx");
return (malloc_cpuid_t)(ecx & 0xfff);
#elif defined(__aarch64__) && defined(__APPLE__)
/* Other OSes most likely use tpidr_el0 instead. */
uintptr_t c;
asm volatile("mrs %x0, tpidrro_el0" : "=r"(c) :: "memory");
return (malloc_cpuid_t)(c & (1 << 3) - 1);
#else
not_reached();
return -1;

View File

@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_INLINES_B_H
#define JEMALLOC_INTERNAL_INLINES_B_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_inlines_a.h"
#include "jemalloc/internal/extent.h"
#include "jemalloc/internal/jemalloc_internal_inlines_a.h"
static inline void
percpu_arena_update(tsd_t *tsd, unsigned cpu) {
@ -20,6 +23,7 @@ percpu_arena_update(tsd_t *tsd, unsigned cpu) {
tcache_t *tcache = tcache_get(tsd);
if (tcache != NULL) {
tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
assert(tcache_slow->arena != NULL);
tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow,
tcache, newarena);
}

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_INLINES_C_H
#define JEMALLOC_INTERNAL_INLINES_C_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_inlines_b.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/log.h"
@ -8,6 +12,15 @@
#include "jemalloc/internal/thread_event.h"
#include "jemalloc/internal/witness.h"
/*
* These correspond to the macros in jemalloc/jemalloc_macros.h. Broadly, we
* should have one constant here per magic value there. Note however that the
* representations need not be related.
*/
#define TCACHE_IND_NONE ((unsigned)-1)
#define TCACHE_IND_AUTOMATIC ((unsigned)-2)
#define ARENA_IND_AUTOMATIC ((unsigned)-1)
/*
* Translating the names of the 'i' functions:
* Abbreviations used in the first part of the function name (before
@ -41,10 +54,12 @@ isalloc(tsdn_t *tsdn, const void *ptr) {
}
JEMALLOC_ALWAYS_INLINE void *
iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
bool is_internal, arena_t *arena, bool slow_path) {
iallocztm_explicit_slab(tsdn_t *tsdn, size_t size, szind_t ind, bool zero,
bool slab, tcache_t *tcache, bool is_internal, arena_t *arena,
bool slow_path) {
void *ret;
assert(!slab || sz_can_use_slab(size)); /* slab && large is illegal */
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena == NULL || arena_is_auto(arena));
if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
@ -52,13 +67,21 @@ iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
WITNESS_RANK_CORE, 0);
}
ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path);
ret = arena_malloc(tsdn, arena, size, ind, zero, slab, tcache, slow_path);
if (config_stats && is_internal && likely(ret != NULL)) {
arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
}
return ret;
}
JEMALLOC_ALWAYS_INLINE void *
iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
bool is_internal, arena_t *arena, bool slow_path) {
bool slab = sz_can_use_slab(size);
return iallocztm_explicit_slab(tsdn, size, ind, zero, slab, tcache,
is_internal, arena, slow_path);
}
JEMALLOC_ALWAYS_INLINE void *
ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false,
@ -66,10 +89,11 @@ ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
}
JEMALLOC_ALWAYS_INLINE void *
ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, bool is_internal, arena_t *arena) {
ipallocztm_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
bool slab, tcache_t *tcache, bool is_internal, arena_t *arena) {
void *ret;
assert(!slab || sz_can_use_slab(usize)); /* slab && large is illegal */
assert(usize != 0);
assert(usize == sz_sa2u(usize, alignment));
assert(!is_internal || tcache == NULL);
@ -77,7 +101,7 @@ ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
ret = arena_palloc(tsdn, arena, usize, alignment, zero, tcache);
ret = arena_palloc(tsdn, arena, usize, alignment, zero, slab, tcache);
assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
if (config_stats && is_internal && likely(ret != NULL)) {
arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
@ -85,12 +109,26 @@ ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
return ret;
}
JEMALLOC_ALWAYS_INLINE void *
ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, bool is_internal, arena_t *arena) {
return ipallocztm_explicit_slab(tsdn, usize, alignment, zero,
sz_can_use_slab(usize), tcache, is_internal, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, arena_t *arena) {
return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloct_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment,
bool zero, bool slab, tcache_t *tcache, arena_t *arena) {
return ipallocztm_explicit_slab(tsdn, usize, alignment, zero, slab,
tcache, false, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) {
return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero,
@ -135,7 +173,7 @@ isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
JEMALLOC_ALWAYS_INLINE void *
iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
@ -146,7 +184,8 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
return NULL;
}
p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
p = ipalloct_explicit_slab(tsdn, usize, alignment, zero, slab,
tcache, arena);
if (p == NULL) {
return NULL;
}
@ -173,8 +212,9 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
* passed-around anywhere.
*/
JEMALLOC_ALWAYS_INLINE void *
iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args)
iralloct_explicit_slab(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args)
{
assert(ptr != NULL);
assert(size != 0);
@ -188,18 +228,28 @@ iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
* and copy.
*/
return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
zero, tcache, arena, hook_args);
zero, slab, tcache, arena, hook_args);
}
return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
tcache, hook_args);
slab, tcache, hook_args);
}
JEMALLOC_ALWAYS_INLINE void *
iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
size_t usize, bool zero, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args)
{
bool slab = sz_can_use_slab(usize);
return iralloct_explicit_slab(tsdn, ptr, oldsize, size, alignment, zero,
slab, tcache, arena, hook_args);
}
JEMALLOC_ALWAYS_INLINE void *
iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
bool zero, hook_ralloc_args_t *hook_args) {
return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero,
tcache_get(tsd), NULL, hook_args);
size_t usize, bool zero, hook_ralloc_args_t *hook_args) {
return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, usize,
zero, tcache_get(tsd), NULL, hook_args);
}
JEMALLOC_ALWAYS_INLINE bool
@ -314,6 +364,8 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
tcache_t *tcache = tsd_tcachep_get(tsd);
assert(tcache == tcache_get(tsd));
cache_bin_t *bin = &tcache->bins[ind];
/* Suppress spurious warning from static analysis */
assert(bin != NULL);
bool tcache_success;
void *ret;
@ -337,4 +389,217 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
return fallback_alloc(size);
}
JEMALLOC_ALWAYS_INLINE tcache_t *
tcache_get_from_ind(tsd_t *tsd, unsigned tcache_ind, bool slow, bool is_alloc) {
tcache_t *tcache;
if (tcache_ind == TCACHE_IND_AUTOMATIC) {
if (likely(!slow)) {
/* Getting tcache ptr unconditionally. */
tcache = tsd_tcachep_get(tsd);
assert(tcache == tcache_get(tsd));
} else if (is_alloc ||
likely(tsd_reentrancy_level_get(tsd) == 0)) {
tcache = tcache_get(tsd);
} else {
tcache = NULL;
}
} else {
/*
* Should not specify tcache on deallocation path when being
* reentrant.
*/
assert(is_alloc || tsd_reentrancy_level_get(tsd) == 0 ||
tsd_state_nocleanup(tsd));
if (tcache_ind == TCACHE_IND_NONE) {
tcache = NULL;
} else {
tcache = tcaches_get(tsd, tcache_ind);
}
}
return tcache;
}
JEMALLOC_ALWAYS_INLINE bool
maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) {
if (config_opt_size_checks) {
emap_alloc_ctx_t dbg_ctx;
emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr,
&dbg_ctx);
if (alloc_ctx->szind != dbg_ctx.szind) {
safety_check_fail_sized_dealloc(
/* current_dealloc */ true, ptr,
/* true_size */ sz_index2size(dbg_ctx.szind),
/* input_size */ sz_index2size(alloc_ctx->szind));
return true;
}
if (alloc_ctx->slab != dbg_ctx.slab) {
safety_check_fail(
"Internal heap corruption detected: "
"mismatch in slab bit");
return true;
}
}
return false;
}
JEMALLOC_ALWAYS_INLINE bool
prof_sample_aligned(const void *ptr) {
return ((uintptr_t)ptr & PROF_SAMPLE_ALIGNMENT_MASK) == 0;
}
JEMALLOC_ALWAYS_INLINE bool
free_fastpath_nonfast_aligned(void *ptr, bool check_prof) {
/*
* free_fastpath does not handle two uncommon cases: 1) sampled profiled
* objects and 2) sampled junk & stash for use-after-free detection.
* Both have special alignments which are used to escape the fastpath.
*
* prof_sample is page-aligned, which covers the UAF check when both
* are enabled (the assertion below). Avoiding redundant checks since
* this is on the fastpath -- at most one runtime branch from this.
*/
if (config_debug && cache_bin_nonfast_aligned(ptr)) {
assert(prof_sample_aligned(ptr));
}
if (config_prof && check_prof) {
/* When prof is enabled, the prof_sample alignment is enough. */
if (prof_sample_aligned(ptr)) {
return true;
} else {
return false;
}
}
if (config_uaf_detection) {
if (cache_bin_nonfast_aligned(ptr)) {
return true;
} else {
return false;
}
}
return false;
}
/* Returns whether or not the free attempt was successful. */
JEMALLOC_ALWAYS_INLINE
bool free_fastpath(void *ptr, size_t size, bool size_hint) {
tsd_t *tsd = tsd_get(false);
/* The branch gets optimized away unless tsd_get_allocates(). */
if (unlikely(tsd == NULL)) {
return false;
}
/*
* The tsd_fast() / initialized checks are folded into the branch
* testing (deallocated_after >= threshold) later in this function.
* The threshold will be set to 0 when !tsd_fast.
*/
assert(tsd_fast(tsd) ||
*tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
emap_alloc_ctx_t alloc_ctx;
if (!size_hint) {
bool err = emap_alloc_ctx_try_lookup_fast(tsd,
&arena_emap_global, ptr, &alloc_ctx);
/* Note: profiled objects will have alloc_ctx.slab set */
if (unlikely(err || !alloc_ctx.slab ||
free_fastpath_nonfast_aligned(ptr,
/* check_prof */ false))) {
return false;
}
assert(alloc_ctx.szind != SC_NSIZES);
} else {
/*
* Check for both sizes that are too large, and for sampled /
* special aligned objects. The alignment check will also check
* for null ptr.
*/
if (unlikely(size > SC_LOOKUP_MAXCLASS ||
free_fastpath_nonfast_aligned(ptr,
/* check_prof */ true))) {
return false;
}
alloc_ctx.szind = sz_size2index_lookup(size);
/* Max lookup class must be small. */
assert(alloc_ctx.szind < SC_NBINS);
/* This is a dead store, except when opt size checking is on. */
alloc_ctx.slab = true;
}
/*
* Currently the fastpath only handles small sizes. The branch on
* SC_LOOKUP_MAXCLASS makes sure of it. This lets us avoid checking
* tcache szind upper limit (i.e. tcache_max) as well.
*/
assert(alloc_ctx.slab);
uint64_t deallocated, threshold;
te_free_fastpath_ctx(tsd, &deallocated, &threshold);
size_t usize = sz_index2size(alloc_ctx.szind);
uint64_t deallocated_after = deallocated + usize;
/*
* Check for events and tsd non-nominal (fast_threshold will be set to
* 0) in a single branch. Note that this handles the uninitialized case
* as well (TSD init will be triggered on the non-fastpath). Therefore
* anything that depends on a functional TSD (e.g. the alloc_ctx sanity check
* below) needs to be after this branch.
*/
if (unlikely(deallocated_after >= threshold)) {
return false;
}
assert(tsd_fast(tsd));
bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
if (fail) {
/* See the comment in isfree. */
return true;
}
tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
/* slow */ false, /* is_alloc */ false);
cache_bin_t *bin = &tcache->bins[alloc_ctx.szind];
/*
* If junking were enabled, this is where we would do it. It's not
* though, since we ensured above that we're on the fast path. Assert
* that to double-check.
*/
assert(!opt_junk_free);
if (!cache_bin_dalloc_easy(bin, ptr)) {
return false;
}
*tsd_thread_deallocatedp_get(tsd) = deallocated_after;
return true;
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_sdallocx_noflags(void *ptr, size_t size) {
LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: 0", ptr,
size);
if (!free_fastpath(ptr, size, true)) {
sdallocx_default(ptr, size, 0);
}
LOG("core.sdallocx.exit", "");
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_sdallocx_impl(void *ptr, size_t size, int flags) {
if (flags != 0 || !free_fastpath(ptr, size, true)) {
sdallocx_default(ptr, size, flags);
}
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_free_impl(void *ptr) {
if (!free_fastpath(ptr, 0, false)) {
free_default(ptr);
}
}
#endif /* JEMALLOC_INTERNAL_INLINES_C_H */

View File

@ -37,8 +37,10 @@
/* Various function pointers are static and immutable except during testing. */
#ifdef JEMALLOC_JET
# define JET_MUTABLE
# define JET_EXTERN extern
#else
# define JET_MUTABLE const
# define JET_EXTERN static
#endif
#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
@ -50,8 +52,10 @@
# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W))
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
/* #pragma GCC diagnostic first appeared in gcc 4.6. */
#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \
@ -79,6 +83,8 @@
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# endif
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wframe-address")
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
@ -89,6 +95,12 @@
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# endif
# ifdef JEMALLOC_HAVE_ATTR_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# endif
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
JEMALLOC_DIAGNOSTIC_PUSH \
JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
@ -97,11 +109,19 @@
# define JEMALLOC_DIAGNOSTIC_POP
# define JEMALLOC_DIAGNOSTIC_IGNORE(W)
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
#endif
#define JEMALLOC_SUPPRESS_WARN_ON_USAGE(...) \
JEMALLOC_DIAGNOSTIC_PUSH \
JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \
__VA_ARGS__ \
JEMALLOC_DIAGNOSTIC_POP
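A minimal usage sketch (the declaration below is hypothetical): the macro brackets whatever it wraps between a diagnostic push/ignore and the matching pop, so a single reference to a deprecated symbol compiles cleanly without relaxing -Wdeprecated-declarations for the rest of the translation unit.

JEMALLOC_SUPPRESS_WARN_ON_USAGE(
	extern void legacy_entry_point(void); /* hypothetical deprecated decl */
)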
/*
* Disables spurious diagnostics for all headers. Since these headers are not
* included by users directly, it does not affect their diagnostic settings.

View File

@ -0,0 +1,21 @@
#ifndef JEMALLOC_INTERNAL_OVERRIDES_H
#define JEMALLOC_INTERNAL_OVERRIDES_H
/*
* Under normal circumstances this header serves no purpose, as these settings
* can be customized via the corresponding autoconf options at configure-time.
* Overriding in this fashion is useful when the header files generated by
* autoconf are used as input for another build system.
*/
#ifdef JEMALLOC_OVERRIDE_LG_PAGE
#undef LG_PAGE
#define LG_PAGE JEMALLOC_OVERRIDE_LG_PAGE
#endif
#ifdef JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF
#undef JEMALLOC_CONFIG_MALLOC_CONF
#define JEMALLOC_CONFIG_MALLOC_CONF JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF
#endif
#endif /* JEMALLOC_INTERNAL_OVERRIDES_H */
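As a hedged illustration of the intended workflow (the flag value is an example only), a non-autoconf build system would pass the override as a preprocessor define, e.g. -DJEMALLOC_OVERRIDE_LG_PAGE=14, and the rewrite above takes effect. The standalone snippet below mimics that mechanism with a stand-in default:

#include <stdio.h>

#ifndef LG_PAGE
# define LG_PAGE 12	/* stand-in for the autoconf-generated value */
#endif
/* Same rewrite the overrides header performs. */
#ifdef JEMALLOC_OVERRIDE_LG_PAGE
# undef LG_PAGE
# define LG_PAGE JEMALLOC_OVERRIDE_LG_PAGE
#endif

int
main(void) {
	/* Prints 16384 when built with -DJEMALLOC_OVERRIDE_LG_PAGE=14. */
	printf("page size: %zu\n", (size_t)1 << LG_PAGE);
	return 0;
}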

View File

@ -45,12 +45,12 @@ typedef enum malloc_init_e malloc_init_t;
#define MALLOCX_ARENA_SHIFT 20
#define MALLOCX_TCACHE_SHIFT 8
#define MALLOCX_ARENA_MASK \
(((1 << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT)
((unsigned)(((1U << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT))
/* NB: Arena index bias decreases the maximum number of arenas by 1. */
#define MALLOCX_ARENA_LIMIT ((1 << MALLOCX_ARENA_BITS) - 1)
#define MALLOCX_ARENA_LIMIT ((unsigned)((1U << MALLOCX_ARENA_BITS) - 1))
#define MALLOCX_TCACHE_MASK \
(((1 << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT)
#define MALLOCX_TCACHE_MAX ((1 << MALLOCX_TCACHE_BITS) - 3)
((unsigned)(((1U << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT))
#define MALLOCX_TCACHE_MAX ((unsigned)((1U << MALLOCX_TCACHE_BITS) - 3))
#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1)
/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */
#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \
@ -99,7 +99,8 @@ typedef enum malloc_init_e malloc_init_t;
/* Return the nearest aligned address at or below a. */
#define ALIGNMENT_ADDR2BASE(a, alignment) \
((void *)((uintptr_t)(a) & ((~(alignment)) + 1)))
((void *)(((byte_t *)(a)) - (((uintptr_t)(a)) - \
((uintptr_t)(a) & ((~(alignment)) + 1)))))
/* Return the offset between a and the nearest aligned address at or below a. */
#define ALIGNMENT_ADDR2OFFSET(a, alignment) \
@ -109,8 +110,21 @@ typedef enum malloc_init_e malloc_init_t;
#define ALIGNMENT_CEILING(s, alignment) \
(((s) + (alignment - 1)) & ((~(alignment)) + 1))
/*
* Return the nearest aligned address at or above a.
*
* While at first glance this would appear to be merely a more complicated
* way to perform the same computation as `ALIGNMENT_CEILING`,
* this has the important additional property of not concealing pointer
* provenance from the compiler. See the block-comment on the
* definition of `byte_t` for more details.
*/
#define ALIGNMENT_ADDR2CEILING(a, alignment) \
((void *)(((byte_t *)(a)) + (((((uintptr_t)(a)) + \
(alignment - 1)) & ((~(alignment)) + 1)) - ((uintptr_t)(a)))))
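For a quick check of the mask arithmetic shared by these macros (the address and alignment below are arbitrary example values), the sketch rounds an address down and up to a 16-byte boundary:

#include <assert.h>
#include <stdint.h>

int
main(void) {
	uintptr_t a = 0x1003;
	uintptr_t alignment = 16;	/* must be a power of two */
	/* ((~alignment) + 1) is the two's-complement negation, i.e. the mask. */
	uintptr_t base = a & ((~alignment) + 1);
	uintptr_t ceiling = (a + (alignment - 1)) & ((~alignment) + 1);
	assert(base == 0x1000);		/* rounded down */
	assert(ceiling == 0x1010);	/* rounded up */
	/*
	 * ALIGNMENT_ADDR2BASE / ALIGNMENT_ADDR2CEILING layer the byte_t
	 * adjustment on top of this so the result keeps the provenance of
	 * the input pointer.
	 */
	return 0;
}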
/* Declare a variable-length array. */
#if __STDC_VERSION__ < 199901L
#if __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
# ifdef _MSC_VER
# include <malloc.h>
# define alloca _alloca

View File

@ -1,7 +1,7 @@
#ifndef JEMALLOC_PREAMBLE_H
#define JEMALLOC_PREAMBLE_H
#include "jemalloc_internal_defs.h"
#include "jemalloc/internal/jemalloc_internal_defs.h"
#include "jemalloc/internal/jemalloc_internal_decls.h"
#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL)
@ -215,7 +215,7 @@ static const bool config_enable_cxx =
#endif
;
#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
#if defined(_WIN32) || defined(__APPLE__) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
/* Currently percpu_arena depends on sched_getcpu. */
#define JEMALLOC_PERCPU_ARENA
#endif

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H
#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/hook.h"
void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);

View File

@ -1,6 +1,11 @@
#ifndef JEMALLOC_INTERNAL_LOCKEDINT_H
#define JEMALLOC_INTERNAL_LOCKEDINT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/tsd_types.h"
/*
* In those architectures that support 64-bit atomics, we use atomic updates for
* our 64-bit values. Otherwise, we use a plain uint64_t and synchronize

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_LOG_H
#define JEMALLOC_INTERNAL_LOG_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex.h"
@ -26,9 +27,9 @@
* log("extent.a", "log msg for extent.a"); // 5
* log("extent.b", "log msg for extent.b"); // 6
*
* And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and
* And your malloc_conf option is "log:arena.a|extent", then lines 2, 4, 5, and
* 6 will print at runtime. You can enable logging from all log vars by
* writing "log=.".
* writing "log:.".
*
* None of this should be regarded as a stable API for right now. It's intended
* as a debugging interface, to let us keep around some of our printf-debugging
@ -96,8 +97,7 @@ log_impl_varargs(const char *name, ...) {
dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name);
dst_offset += malloc_vsnprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap);
dst_offset += malloc_snprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
malloc_snprintf(buf + dst_offset, JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
va_end(ap);
malloc_write(buf);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MALLOC_IO_H
#define JEMALLOC_INTERNAL_MALLOC_IO_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#ifdef _WIN32
@ -67,7 +68,7 @@ void malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format,
void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);
static inline ssize_t
malloc_write_fd(int fd, const void *buf, size_t count) {
malloc_write_fd_syscall(int fd, const void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
/*
* Use syscall(2) rather than write(2) when possible in order to avoid
@ -89,7 +90,22 @@ malloc_write_fd(int fd, const void *buf, size_t count) {
}
static inline ssize_t
malloc_read_fd(int fd, void *buf, size_t count) {
malloc_write_fd(int fd, const void *buf, size_t count) {
size_t bytes_written = 0;
do {
ssize_t result = malloc_write_fd_syscall(fd,
&((const byte_t *)buf)[bytes_written],
count - bytes_written);
if (result < 0) {
return result;
}
bytes_written += result;
} while (bytes_written < count);
return bytes_written;
}
static inline ssize_t
malloc_read_fd_syscall(int fd, void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
long result = syscall(SYS_read, fd, buf, count);
#else
@ -102,4 +118,20 @@ malloc_read_fd(int fd, void *buf, size_t count) {
return (ssize_t)result;
}
static inline ssize_t
malloc_read_fd(int fd, void *buf, size_t count) {
size_t bytes_read = 0;
do {
ssize_t result = malloc_read_fd_syscall(fd,
&((byte_t *)buf)[bytes_read], count - bytes_read);
if (result < 0) {
return result;
} else if (result == 0) {
break;
}
bytes_read += result;
} while (bytes_read < count);
return bytes_read;
}
#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H
#define JEMALLOC_INTERNAL_MPSC_QUEUE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
/*

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_H
#define JEMALLOC_INTERNAL_MUTEX_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex_prof.h"
#include "jemalloc/internal/tsd.h"
@ -31,6 +32,12 @@ struct malloc_mutex_s {
* unlocking thread).
*/
mutex_prof_data_t prof_data;
/*
* Hint flag to avoid exclusive cache line contention
* during spin waiting. Placed along with prof_data
* since it's always modified even with no contention.
*/
atomic_b_t locked;
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
SRWLOCK lock;
@ -45,11 +52,6 @@ struct malloc_mutex_s {
#else
pthread_mutex_t lock;
#endif
/*
* Hint flag to avoid exclusive cache line contention
* during spin waiting
*/
atomic_b_t locked;
};
/*
* We only touch witness when configured w/ debug. However we
@ -98,21 +100,21 @@ struct malloc_mutex_s {
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
# if (defined(JEMALLOC_DEBUG))
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER, NULL}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
@ -120,11 +122,11 @@ struct malloc_mutex_s {
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
{{{LOCK_PROF_DATA_INITIALIZER, ATOMIC_INIT(false), PTHREAD_MUTEX_INITIALIZER}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
#endif
@ -175,7 +177,6 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
return true;
}
mutex_owner_stats_update(tsdn, mutex);

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_PROF_H
#define JEMALLOC_INTERNAL_MUTEX_PROF_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/tsd_types.h"

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_NSTIME_H
#define JEMALLOC_INTERNAL_NSTIME_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/* Maximum supported number of seconds (~584 years). */
#define NSTIME_SEC_MAX KQU(18446744072)
@ -56,7 +59,7 @@ enum prof_time_res_e {
typedef enum prof_time_res_e prof_time_res_t;
extern prof_time_res_t opt_prof_time_res;
extern const char *prof_time_res_mode_names[];
extern const char *const prof_time_res_mode_names[];
JEMALLOC_ALWAYS_INLINE void
nstime_init_zero(nstime_t *time) {

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PA_H
#define JEMALLOC_INTERNAL_PA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/decay.h"
#include "jemalloc/internal/ecache.h"
@ -131,7 +132,7 @@ pa_shard_ehooks_get(pa_shard_t *shard) {
/* Returns true on error. */
bool pa_central_init(pa_central_t *central, base_t *base, bool hpa,
hpa_hooks_t *hpa_hooks);
const hpa_hooks_t *hpa_hooks);
/* Returns true on error. */
bool pa_shard_init(tsdn_t *tsdn, pa_shard_t *shard, pa_central_t *central,

View File

@ -1,11 +1,15 @@
#ifndef JEMALLOC_INTERNAL_PAC_H
#define JEMALLOC_INTERNAL_PAC_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/decay.h"
#include "jemalloc/internal/ecache.h"
#include "jemalloc/internal/edata_cache.h"
#include "jemalloc/internal/exp_grow.h"
#include "jemalloc/internal/lockedint.h"
#include "jemalloc/internal/pai.h"
#include "san_bump.h"
/*
* Page allocator classic; an implementation of the PAI interface that:
* - Can be used for arenas with custom extent hooks.

View File

@ -1,6 +1,12 @@
#ifndef JEMALLOC_INTERNAL_PAGES_EXTERNS_H
#define JEMALLOC_INTERNAL_PAGES_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
/* Actual operating system page size, detected during bootstrap, <= PAGE. */
extern size_t os_page;
/* Page size. LG_PAGE is determined by the configure script. */
#ifdef PAGE_MASK
# undef PAGE_MASK
@ -9,7 +15,7 @@
#define PAGE_MASK ((size_t)(PAGE - 1))
/* Return the page base address for the page containing address a. */
#define PAGE_ADDR2BASE(a) \
((void *)((uintptr_t)(a) & ~PAGE_MASK))
ALIGNMENT_ADDR2BASE(a, PAGE)
/* Return the smallest pagesize multiple that is >= s. */
#define PAGE_CEILING(s) \
(((s) + PAGE_MASK) & ~PAGE_MASK)
@ -36,7 +42,7 @@
/* Return the huge page base address for the huge page containing address a. */
#define HUGEPAGE_ADDR2BASE(a) \
((void *)((uintptr_t)(a) & ~HUGEPAGE_MASK))
ALIGNMENT_ADDR2BASE(a, HUGEPAGE)
/* Return the smallest huge page size multiple that is >= s. */
#define HUGEPAGE_CEILING(s) \
(((s) + HUGEPAGE_MASK) & ~HUGEPAGE_MASK)
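Editor's note: a worked example of the masking arithmetic above. Values assume 4 KiB pages; jemalloc derives PAGE from LG_PAGE at configure time, and ALIGNMENT_ADDR2BASE is the generalized form of the same mask. The EX_* names are made up for illustration.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define EX_PAGE      ((size_t)4096)     /* assumed page size */
#define EX_PAGE_MASK (EX_PAGE - 1)

int
main(void) {
    /* ADDR2BASE: clear the low lg(PAGE) bits of the address. */
    uintptr_t a = 0x7f001234;
    assert((a & ~(uintptr_t)EX_PAGE_MASK) == 0x7f001000);

    /* CEILING: round s up to the next page-size multiple. */
    size_t s = 5000;
    assert(((s + EX_PAGE_MASK) & ~EX_PAGE_MASK) == 8192);
    return 0;
}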
@ -99,7 +105,7 @@ typedef enum {
#define THP_MODE_DEFAULT thp_mode_default
extern thp_mode_t opt_thp;
extern thp_mode_t init_system_thp_mode; /* Initial system wide state. */
extern const char *thp_mode_names[];
extern const char *const thp_mode_names[];
void *pages_map(void *addr, size_t size, size_t alignment, bool *commit);
void pages_unmap(void *addr, size_t size);

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_PAI_H
#define JEMALLOC_INTERNAL_PAI_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/tsd_types.h"
/* An interface for page allocation. */
typedef struct pai_s pai_t;

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_PEAK_H
#define JEMALLOC_INTERNAL_PEAK_H
#include "jemalloc/internal/jemalloc_preamble.h"
typedef struct peak_s peak_t;
struct peak_s {
/* The highest recorded peak value, after adjustment (see below). */

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_PEAK_EVENT_H
#define JEMALLOC_INTERNAL_PEAK_EVENT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd_types.h"
/*
* While peak.h contains the simple helper struct that tracks state, this
* contains the allocator tie-ins (and knows about tsd, the event module, etc.).

View File

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_PH_H
#define JEMALLOC_INTERNAL_PH_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bit_util.h"
/*
* A Pairing Heap implementation.
*
@ -73,7 +77,7 @@ struct ph_s {
JEMALLOC_ALWAYS_INLINE phn_link_t *
phn_link_get(void *phn, size_t offset) {
return (phn_link_t *)(((uintptr_t)phn) + offset);
return (phn_link_t *)(((char *)phn) + offset);
}
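Editor's note: the char * cast above is the standard way to do byte-offset arithmetic on an object pointer. A tiny illustration of the offset-based intrusive-link pattern ph relies on (node_t and link_t here are hypothetical stand-ins, not jemalloc types):

#include <assert.h>
#include <stddef.h>

typedef struct { void *prev, *next, *lchild; } link_t;
typedef struct { int key; link_t link; } node_t;

/* Recover the embedded link from a node pointer plus a byte offset,
 * mirroring phn_link_get(). */
static link_t *
link_get(void *node, size_t offset) {
    return (link_t *)((char *)node + offset);
}

int
main(void) {
    node_t n = {0};
    assert(link_get(&n, offsetof(node_t, link)) == &n.link);
    return 0;
}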
JEMALLOC_ALWAYS_INLINE void
@ -127,6 +131,7 @@ phn_merge_ordered(void *phn0, void *phn1, size_t offset,
phn0child = phn_lchild_get(phn0, offset);
phn_next_set(phn1, phn0child, offset);
if (phn0child != NULL) {
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_prev_set(phn0child, phn1, offset);
}
phn_lchild_set(phn0, phn1, offset);
@ -143,6 +148,7 @@ phn_merge(void *phn0, void *phn1, size_t offset, ph_cmp_t cmp) {
phn_merge_ordered(phn0, phn1, offset, cmp);
result = phn0;
} else {
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_merge_ordered(phn1, phn0, offset, cmp);
result = phn1;
}
@ -188,10 +194,12 @@ phn_merge_siblings(void *phn, size_t offset, ph_cmp_t cmp) {
phn_prev_set(phn1, NULL, offset);
phn_next_set(phn1, NULL, offset);
phn0 = phn_merge(phn0, phn1, offset, cmp);
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_next_set(tail, phn0, offset);
tail = phn0;
phn0 = phnrest;
} else {
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_next_set(tail, phn0, offset);
tail = phn0;
phn0 = NULL;
@ -210,6 +218,7 @@ phn_merge_siblings(void *phn, size_t offset, ph_cmp_t cmp) {
if (head == NULL) {
break;
}
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_next_set(tail, phn0, offset);
tail = phn0;
phn0 = head;
@ -298,6 +307,7 @@ ph_try_aux_merge_pair(ph_t *ph, size_t offset, ph_cmp_t cmp) {
phn0 = phn_merge(phn0, phn1, offset, cmp);
phn_next_set(phn0, next_phn1, offset);
if (next_phn1 != NULL) {
/* NOLINTNEXTLINE(readability-suspicious-call-argument) */
phn_prev_set(next_phn1, phn0, offset);
}
phn_next_set(ph->root, phn0, offset);
@ -318,36 +328,37 @@ ph_insert(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
*/
if (ph->root == NULL) {
ph->root = phn;
} else {
/*
* As a special case, check to see if we can replace the root.
* This is practically common in some important cases, and lets
* us defer some insertions (hopefully, until the point where
* some of the items in the aux list have been removed, saving
* us from linking them at all).
*/
if (cmp(phn, ph->root) < 0) {
phn_lchild_set(phn, ph->root, offset);
phn_prev_set(ph->root, phn, offset);
ph->root = phn;
ph->auxcount = 0;
return;
}
ph->auxcount++;
phn_next_set(phn, phn_next_get(ph->root, offset), offset);
if (phn_next_get(ph->root, offset) != NULL) {
phn_prev_set(phn_next_get(ph->root, offset), phn,
offset);
}
phn_prev_set(phn, ph->root, offset);
phn_next_set(ph->root, phn, offset);
return;
}
if (ph->auxcount > 1) {
unsigned nmerges = ffs_zu(ph->auxcount - 1);
bool done = false;
for (unsigned i = 0; i < nmerges && !done; i++) {
done = ph_try_aux_merge_pair(ph, offset, cmp);
}
/*
* As a special case, check to see if we can replace the root.
* This is practically common in some important cases, and lets
* us defer some insertions (hopefully, until the point where
* some of the items in the aux list have been removed, saving
* us from linking them at all).
*/
if (cmp(phn, ph->root) < 0) {
phn_lchild_set(phn, ph->root, offset);
phn_prev_set(ph->root, phn, offset);
ph->root = phn;
ph->auxcount = 0;
return;
}
phn_next_set(phn, phn_next_get(ph->root, offset), offset);
if (phn_next_get(ph->root, offset) != NULL) {
phn_prev_set(phn_next_get(ph->root, offset), phn,
offset);
}
phn_prev_set(phn, ph->root, offset);
phn_next_set(ph->root, phn, offset);
ph->auxcount++;
unsigned nmerges = ffs_zu(ph->auxcount);
bool done = false;
for (unsigned i = 0; i < nmerges && !done; i++) {
done = ph_try_aux_merge_pair(ph, offset, cmp);
}
}
@ -368,9 +379,6 @@ ph_remove_first(ph_t *ph, size_t offset, ph_cmp_t cmp) {
JEMALLOC_ALWAYS_INLINE void
ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
void *replace;
void *parent;
if (ph->root == phn) {
/*
* We can delete from aux list without merging it, but we need
@ -379,9 +387,6 @@ ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
*/
if (phn_lchild_get(phn, offset) == NULL) {
ph->root = phn_next_get(phn, offset);
if (ph->root != NULL) {
phn_prev_set(ph->root, NULL, offset);
}
return;
}
ph_merge_aux(ph, offset, cmp);
@ -391,50 +396,29 @@ ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
}
}
/* Get parent (if phn is leftmost child) before mutating. */
if ((parent = phn_prev_get(phn, offset)) != NULL) {
if (phn_lchild_get(parent, offset) != phn) {
parent = NULL;
}
}
/* Find a possible replacement node, and link to parent. */
replace = ph_merge_children(phn, offset, cmp);
/* Set next/prev for sibling linked list. */
void* prev = phn_prev_get(phn, offset);
void* next = phn_next_get(phn, offset);
/* If we have children, then we integrate them back in the heap. */
void* replace = ph_merge_children(phn, offset, cmp);
if (replace != NULL) {
if (parent != NULL) {
phn_prev_set(replace, parent, offset);
phn_lchild_set(parent, replace, offset);
} else {
phn_prev_set(replace, phn_prev_get(phn, offset),
offset);
if (phn_prev_get(phn, offset) != NULL) {
phn_next_set(phn_prev_get(phn, offset), replace,
offset);
}
}
phn_next_set(replace, phn_next_get(phn, offset), offset);
if (phn_next_get(phn, offset) != NULL) {
phn_prev_set(phn_next_get(phn, offset), replace,
offset);
phn_next_set(replace, next, offset);
if (next != NULL) {
phn_prev_set(next, replace, offset);
}
next = replace;
}
if (next != NULL) {
phn_prev_set(next, prev, offset);
}
assert(prev != NULL);
if (phn_lchild_get(prev, offset) == phn) {
phn_lchild_set(prev, next, offset);
} else {
if (parent != NULL) {
void *next = phn_next_get(phn, offset);
phn_lchild_set(parent, next, offset);
if (next != NULL) {
phn_prev_set(next, parent, offset);
}
} else {
assert(phn_prev_get(phn, offset) != NULL);
phn_next_set(
phn_prev_get(phn, offset),
phn_next_get(phn, offset), offset);
}
if (phn_next_get(phn, offset) != NULL) {
phn_prev_set(
phn_next_get(phn, offset),
phn_prev_get(phn, offset), offset);
}
phn_next_set(prev, next, offset);
}
}

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PRNG_H
#define JEMALLOC_INTERNAL_PRNG_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bit_util.h"
/*

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PROF_DATA_H
#define JEMALLOC_INTERNAL_PROF_DATA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
extern malloc_mutex_t bt2gctx_mtx;
@ -18,9 +19,8 @@ bool prof_bt_keycomp(const void *k1, const void *k2);
bool prof_data_init(tsd_t *tsd);
prof_tctx_t *prof_lookup(tsd_t *tsd, prof_bt_t *bt);
char *prof_thread_name_alloc(tsd_t *tsd, const char *thread_name);
int prof_thread_name_set_impl(tsd_t *tsd, const char *thread_name);
void prof_unbias_map_init();
void prof_unbias_map_init(void);
void prof_dump_impl(tsd_t *tsd, write_cb_t *prof_dump_write, void *cbopaque,
prof_tdata_t *tdata, bool leakcheck);
prof_tdata_t * prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid,

View File

@ -1,12 +1,15 @@
#ifndef JEMALLOC_INTERNAL_PROF_EXTERNS_H
#define JEMALLOC_INTERNAL_PROF_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/prof_hook.h"
extern bool opt_prof;
extern bool opt_prof_active;
extern bool opt_prof_thread_active_init;
extern unsigned opt_prof_bt_max;
extern size_t opt_lg_prof_sample; /* Mean bytes between samples. */
extern ssize_t opt_lg_prof_interval; /* lg(prof_interval). */
extern bool opt_prof_gdump; /* High-water memory dumping. */
@ -50,10 +53,16 @@ extern size_t lg_prof_sample;
extern bool prof_booted;
void prof_backtrace_hook_set(prof_backtrace_hook_t hook);
prof_backtrace_hook_t prof_backtrace_hook_get();
prof_backtrace_hook_t prof_backtrace_hook_get(void);
void prof_dump_hook_set(prof_dump_hook_t hook);
prof_dump_hook_t prof_dump_hook_get();
prof_dump_hook_t prof_dump_hook_get(void);
void prof_sample_hook_set(prof_sample_hook_t hook);
prof_sample_hook_t prof_sample_hook_get(void);
void prof_sample_free_hook_set(prof_sample_free_hook_t hook);
prof_sample_free_hook_t prof_sample_free_hook_get(void);
/* Functions only accessed in prof_inlines.h */
prof_tdata_t *prof_tdata_init(tsd_t *tsd);
@ -62,7 +71,8 @@ prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata);
void prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx);
void prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size,
size_t usize, prof_tctx_t *tctx);
void prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info);
void prof_free_sampled_object(tsd_t *tsd, const void *ptr, size_t usize,
prof_info_t *prof_info);
prof_tctx_t *prof_tctx_create(tsd_t *tsd);
void prof_idump(tsdn_t *tsdn);
bool prof_mdump(tsd_t *tsd, const char *filename);

View File

@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_PROF_HOOK_H
#define JEMALLOC_INTERNAL_PROF_HOOK_H
#include "jemalloc/internal/jemalloc_preamble.h"
/*
* The hook types declared in this file are experimental and undocumented;
* thus the typedefs are located in an 'internal' header.
@ -18,4 +20,10 @@ typedef void (*prof_backtrace_hook_t)(void **, unsigned *, unsigned);
*/
typedef void (*prof_dump_hook_t)(const char *filename);
/* ptr, size, backtrace vector, backtrace vector length */
typedef void (*prof_sample_hook_t)(const void *, size_t, void **, unsigned);
/* ptr, size */
typedef void (*prof_sample_free_hook_t)(const void *, size_t);
#endif /* JEMALLOC_INTERNAL_PROF_HOOK_H */
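Editor's note: callbacks matching the two new hook signatures above might look like the following. The bodies are illustrative only; registration goes through prof_sample_hook_set() / prof_sample_free_hook_set() declared in prof_externs.h.

#include <stddef.h>
#include <stdio.h>

/* (ptr, size, backtrace vector, backtrace vector length) */
static void
my_sample_hook(const void *ptr, size_t size, void **vec, unsigned len) {
    (void)ptr; (void)vec;
    fprintf(stderr, "sampled alloc: %zu bytes, %u backtrace frames\n",
        size, len);
}

/* (ptr, size) */
static void
my_sample_free_hook(const void *ptr, size_t size) {
    (void)ptr;
    fprintf(stderr, "sampled alloc freed: %zu bytes\n", size);
}

int
main(void) {
    void *vec[4] = {0};
    my_sample_hook(vec, 64, vec, 0);
    my_sample_free_hook(vec, 64);
    return 0;
}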

View File

@ -1,12 +1,17 @@
#ifndef JEMALLOC_INTERNAL_PROF_INLINES_H
#define JEMALLOC_INTERNAL_PROF_INLINES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_inlines_b.h"
#include "jemalloc/internal/jemalloc_internal_inlines_c.h"
#include "jemalloc/internal/prof_externs.h"
#include "jemalloc/internal/prof_structs.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/thread_event.h"
JEMALLOC_ALWAYS_INLINE void
prof_active_assert() {
prof_active_assert(void) {
cassert(config_prof);
/*
* If opt_prof is off, then prof_active must always be off, regardless
@ -37,6 +42,22 @@ prof_gdump_get_unlocked(void) {
return prof_gdump_val;
}
JEMALLOC_ALWAYS_INLINE void
prof_thread_name_assert(prof_tdata_t *tdata) {
if (!config_debug) {
return;
}
prof_active_assert();
bool terminated = false;
for (unsigned i = 0; i < PROF_THREAD_NAME_MAX_LEN; i++) {
if (tdata->thread_name[i] == '\0') {
terminated = true;
}
}
assert(terminated);
}
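Editor's note: the assertion above only checks that a terminator exists somewhere in the fixed-size buffer. A hedged sketch of a setter that maintains that invariant (EX_THREAD_NAME_MAX_LEN mirrors PROF_THREAD_NAME_MAX_LEN; this is not the actual prof_thread_name_set_impl):

#include <assert.h>
#include <string.h>

#define EX_THREAD_NAME_MAX_LEN 16   /* mirrors PROF_THREAD_NAME_MAX_LEN */

/* Copy with truncation, always leaving a '\0' inside the buffer. */
static void
thread_name_set(char dst[EX_THREAD_NAME_MAX_LEN], const char *src) {
    strncpy(dst, src, EX_THREAD_NAME_MAX_LEN - 1);
    dst[EX_THREAD_NAME_MAX_LEN - 1] = '\0';
}

int
main(void) {
    char name[EX_THREAD_NAME_MAX_LEN];
    thread_name_set(name, "a-name-longer-than-sixteen-chars");
    assert(strlen(name) == EX_THREAD_NAME_MAX_LEN - 1);
    return 0;
}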
JEMALLOC_ALWAYS_INLINE prof_tdata_t *
prof_tdata_get(tsd_t *tsd, bool create) {
prof_tdata_t *tdata;
@ -58,6 +79,10 @@ prof_tdata_get(tsd_t *tsd, bool create) {
assert(tdata == NULL || tdata->attached);
}
if (tdata != NULL) {
prof_thread_name_assert(tdata);
}
return tdata;
}
@ -81,6 +106,11 @@ prof_info_get_and_reset_recent(tsd_t *tsd, const void *ptr,
arena_prof_info_get(tsd, ptr, alloc_ctx, prof_info, true);
}
JEMALLOC_ALWAYS_INLINE bool
prof_tctx_is_valid(const prof_tctx_t *tctx) {
return tctx != NULL && tctx != PROF_TCTX_SENTINEL;
}
JEMALLOC_ALWAYS_INLINE void
prof_tctx_reset(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
cassert(config_prof);
@ -101,7 +131,7 @@ JEMALLOC_ALWAYS_INLINE void
prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) {
cassert(config_prof);
assert(edata != NULL);
assert((uintptr_t)tctx > (uintptr_t)1U);
assert(prof_tctx_is_valid(tctx));
arena_prof_info_set(tsd, edata, tctx, size);
}
@ -136,7 +166,7 @@ prof_alloc_prep(tsd_t *tsd, bool prof_active, bool sample_event) {
if (!prof_active ||
likely(prof_sample_should_skip(tsd, sample_event))) {
ret = (prof_tctx_t *)(uintptr_t)1U;
ret = PROF_TCTX_SENTINEL;
} else {
ret = prof_tctx_create(tsd);
}
@ -151,7 +181,7 @@ prof_malloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
assert(ptr != NULL);
assert(usize == isalloc(tsd_tsdn(tsd), ptr));
if (unlikely((uintptr_t)tctx > (uintptr_t)1U)) {
if (unlikely(prof_tctx_is_valid(tctx))) {
prof_malloc_sample_object(tsd, ptr, size, usize, tctx);
} else {
prof_tctx_reset(tsd, ptr, alloc_ctx);
@ -165,7 +195,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
bool sampled, old_sampled, moved;
cassert(config_prof);
assert(ptr != NULL || (uintptr_t)tctx <= (uintptr_t)1U);
assert(ptr != NULL || !prof_tctx_is_valid(tctx));
if (prof_active && ptr != NULL) {
assert(usize == isalloc(tsd_tsdn(tsd), ptr));
@ -178,12 +208,12 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
* sample threshold.
*/
prof_alloc_rollback(tsd, tctx);
tctx = (prof_tctx_t *)(uintptr_t)1U;
tctx = PROF_TCTX_SENTINEL;
}
}
sampled = ((uintptr_t)tctx > (uintptr_t)1U);
old_sampled = ((uintptr_t)old_prof_info->alloc_tctx > (uintptr_t)1U);
sampled = prof_tctx_is_valid(tctx);
old_sampled = prof_tctx_is_valid(old_prof_info->alloc_tctx);
moved = (ptr != old_ptr);
if (unlikely(sampled)) {
@ -201,7 +231,7 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
} else {
prof_info_t prof_info;
prof_info_get(tsd, ptr, NULL, &prof_info);
assert((uintptr_t)prof_info.alloc_tctx == (uintptr_t)1U);
assert(prof_info.alloc_tctx == PROF_TCTX_SENTINEL);
}
/*
@ -212,31 +242,28 @@ prof_realloc(tsd_t *tsd, const void *ptr, size_t size, size_t usize,
* counters.
*/
if (unlikely(old_sampled)) {
prof_free_sampled_object(tsd, old_usize, old_prof_info);
prof_free_sampled_object(tsd, old_ptr, old_usize,
old_prof_info);
}
}
JEMALLOC_ALWAYS_INLINE size_t
prof_sample_align(size_t orig_align) {
prof_sample_align(size_t usize, size_t orig_align) {
/*
* Enforce page alignment, so that sampled allocations can be identified
* Enforce alignment, so that sampled allocations can be identified
* w/o metadata lookup.
*/
assert(opt_prof);
return (opt_cache_oblivious && orig_align < PAGE) ? PAGE :
orig_align;
}
JEMALLOC_ALWAYS_INLINE bool
prof_sample_aligned(const void *ptr) {
return ((uintptr_t)ptr & PAGE_MASK) == 0;
return (orig_align < PROF_SAMPLE_ALIGNMENT &&
(sz_can_use_slab(usize) || opt_cache_oblivious)) ?
PROF_SAMPLE_ALIGNMENT : orig_align;
}
JEMALLOC_ALWAYS_INLINE bool
prof_sampled(tsd_t *tsd, const void *ptr) {
prof_info_t prof_info;
prof_info_get(tsd, ptr, NULL, &prof_info);
bool sampled = (uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U;
bool sampled = prof_tctx_is_valid(prof_info.alloc_tctx);
if (sampled) {
assert(prof_sample_aligned(ptr));
}
@ -252,10 +279,24 @@ prof_free(tsd_t *tsd, const void *ptr, size_t usize,
cassert(config_prof);
assert(usize == isalloc(tsd_tsdn(tsd), ptr));
if (unlikely((uintptr_t)prof_info.alloc_tctx > (uintptr_t)1U)) {
if (unlikely(prof_tctx_is_valid(prof_info.alloc_tctx))) {
assert(prof_sample_aligned(ptr));
prof_free_sampled_object(tsd, usize, &prof_info);
prof_free_sampled_object(tsd, ptr, usize, &prof_info);
}
}
JEMALLOC_ALWAYS_INLINE bool
prof_thread_name_empty(prof_tdata_t *tdata) {
prof_active_assert();
return (tdata->thread_name[0] == '\0');
}
JEMALLOC_ALWAYS_INLINE void
prof_thread_name_clear(prof_tdata_t *tdata) {
prof_active_assert();
tdata->thread_name[0] = '\0';
}
#endif /* JEMALLOC_INTERNAL_PROF_INLINES_H */
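Editor's note on the alignment trick used throughout this file: with sampled allocations forced to PROF_SAMPLE_ALIGNMENT (PAGE), the free path can classify a pointer with a single mask and no metadata lookup. A self-contained illustration, assuming 4 KiB pages (the EX_* names are made up):

#include <assert.h>
#include <stdint.h>

#define EX_SAMPLE_ALIGNMENT      ((uintptr_t)4096)   /* assumed PAGE */
#define EX_SAMPLE_ALIGNMENT_MASK (EX_SAMPLE_ALIGNMENT - 1)

/* Mirrors the idea behind prof_sample_aligned(). */
static int
sample_aligned(uintptr_t addr) {
    return (addr & EX_SAMPLE_ALIGNMENT_MASK) == 0;
}

int
main(void) {
    assert(sample_aligned(0x100000));
    assert(!sample_aligned(0x100010));
    return 0;
}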

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PROF_LOG_H
#define JEMALLOC_INTERNAL_PROF_LOG_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
extern malloc_mutex_t log_mtx;

View File

@ -1,13 +1,17 @@
#ifndef JEMALLOC_INTERNAL_PROF_RECENT_H
#define JEMALLOC_INTERNAL_PROF_RECENT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/mutex.h"
extern malloc_mutex_t prof_recent_alloc_mtx;
extern malloc_mutex_t prof_recent_dump_mtx;
bool prof_recent_alloc_prepare(tsd_t *tsd, prof_tctx_t *tctx);
void prof_recent_alloc(tsd_t *tsd, edata_t *edata, size_t size, size_t usize);
void prof_recent_alloc_reset(tsd_t *tsd, edata_t *edata);
bool prof_recent_init();
bool prof_recent_init(void);
void edata_prof_recent_alloc_init(edata_t *edata);
/* Used in unit tests. */
@ -16,7 +20,7 @@ extern prof_recent_list_t prof_recent_alloc_list;
edata_t *prof_recent_alloc_edata_get_no_lock_test(const prof_recent_t *node);
prof_recent_t *edata_prof_recent_alloc_get_no_lock_test(const edata_t *edata);
ssize_t prof_recent_alloc_max_ctl_read();
ssize_t prof_recent_alloc_max_ctl_read(void);
ssize_t prof_recent_alloc_max_ctl_write(tsd_t *tsd, ssize_t max);
void prof_recent_alloc_dump(tsd_t *tsd, write_cb_t *write_cb, void *cbopaque);

View File

@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_PROF_STATS_H
#define JEMALLOC_INTERNAL_PROF_STATS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
typedef struct prof_stats_s prof_stats_t;
struct prof_stats_s {
uint64_t req_sum;

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PROF_STRUCTS_H
#define JEMALLOC_INTERNAL_PROF_STRUCTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/ckh.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/mutex.h"
@ -156,12 +157,6 @@ struct prof_tdata_s {
*/
uint64_t thr_discrim;
/* Included in heap profile dumps if non-NULL. */
char *thread_name;
bool attached;
bool expired;
rb_node(prof_tdata_t) tdata_link;
/*
@ -179,6 +174,9 @@ struct prof_tdata_s {
*/
ckh_t bt2tctx;
/* Included in heap profile dumps if it has content. */
char thread_name[PROF_THREAD_NAME_MAX_LEN];
/* State used to avoid dumping while operating on prof internals. */
bool enq;
bool enq_idump;
@ -198,11 +196,14 @@ struct prof_tdata_s {
*/
bool active;
bool attached;
bool expired;
/* Temporary storage for summation during dump. */
prof_cnt_t cnt_summed;
/* Backtrace vector, used for calls to prof_backtrace(). */
void *vec[PROF_BT_MAX];
void **vec;
};
typedef rb_tree(prof_tdata_t) prof_tdata_tree_t;

View File

@ -1,13 +1,17 @@
#ifndef JEMALLOC_INTERNAL_PROF_SYS_H
#define JEMALLOC_INTERNAL_PROF_SYS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/mutex.h"
extern malloc_mutex_t prof_dump_filename_mtx;
extern base_t *prof_base;
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(tsd_t *tsd, prof_bt_t *bt);
void prof_hooks_init();
void prof_unwind_init();
void prof_hooks_init(void);
void prof_unwind_init(void);
void prof_sys_thread_name_fetch(tsd_t *tsd);
int prof_getpid(void);
void prof_get_default_filename(tsdn_t *tsdn, char *filename, uint64_t ind);
@ -24,7 +28,7 @@ typedef int (prof_dump_open_file_t)(const char *, int);
extern prof_dump_open_file_t *JET_MUTABLE prof_dump_open_file;
typedef ssize_t (prof_dump_write_file_t)(int, const void *, size_t);
extern prof_dump_write_file_t *JET_MUTABLE prof_dump_write_file;
typedef int (prof_dump_open_maps_t)();
typedef int (prof_dump_open_maps_t)(void);
extern prof_dump_open_maps_t *JET_MUTABLE prof_dump_open_maps;
#endif /* JEMALLOC_INTERNAL_PROF_SYS_H */

View File

@ -23,7 +23,12 @@ typedef struct prof_recent_s prof_recent_t;
* is based on __builtin_return_address() necessarily has a hard-coded number
* of backtrace frame handlers, and should be kept in sync with this setting.
*/
#define PROF_BT_MAX 128
#ifdef JEMALLOC_PROF_GCC
# define PROF_BT_MAX_LIMIT 256
#else
# define PROF_BT_MAX_LIMIT UINT_MAX
#endif
#define PROF_BT_MAX_DEFAULT 128
/* Initial hash table size. */
#define PROF_CKH_MINITEMS 64
@ -72,4 +77,18 @@ typedef struct prof_recent_s prof_recent_t;
/* Default number of recent allocations to record. */
#define PROF_RECENT_ALLOC_MAX_DEFAULT 0
/* Thread name storage size limit. */
#define PROF_THREAD_NAME_MAX_LEN 16
/*
* Minimum required alignment for sampled allocations. Over-aligning sampled
* allocations allows us to quickly identify them on the dalloc path without
* resorting to metadata lookup.
*/
#define PROF_SAMPLE_ALIGNMENT PAGE
#define PROF_SAMPLE_ALIGNMENT_MASK PAGE_MASK
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
#define PROF_TCTX_SENTINEL ((prof_tctx_t *)((uintptr_t)1U))
#endif /* JEMALLOC_INTERNAL_PROF_TYPES_H */
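Editor's note: the sentinel constant above encodes "allocated but not sampled" without needing a real metadata pointer. A minimal sketch of the pattern (tctx_t and TCTX_SENTINEL here are stand-ins, not the jemalloc definitions):

#include <assert.h>
#include <stdint.h>

typedef struct tctx_s tctx_t;   /* opaque; never dereferenced via the sentinel */

/* Non-NULL, invalid-by-construction pointer value meaning "not sampled". */
#define TCTX_SENTINEL ((tctx_t *)(uintptr_t)1U)

static int
tctx_is_valid(const tctx_t *tctx) {
    return tctx != NULL && tctx != TCTX_SENTINEL;
}

int
main(void) {
    assert(!tctx_is_valid(NULL));
    assert(!tctx_is_valid(TCTX_SENTINEL));
    return 0;
}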

View File

@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_PSSET_H
#define JEMALLOC_INTERNAL_PSSET_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/hpdata.h"
/*

Some files were not shown because too many files have changed in this diff.