Andy Ritger 2022-11-10 08:39:33 -08:00
parent 7c345b838b
commit 758b4ee818
No known key found for this signature in database
GPG Key ID: 6D466BB75E006CFC
1323 changed files with 262135 additions and 60754 deletions

View File

@ -1,5 +1,22 @@
# Changelog
## Release 525 Entries
### [525.53] 2022-11-10
#### Changed
- GSP firmware is now distributed as multiple firmware files: this release has `gsp_tu10x.bin` and `gsp_ad10x.bin` replacing `gsp.bin` from previous releases.
- Each file is named after a GPU architecture and supports GPUs from one or more architectures. This allows GSP firmware to better leverage each architecture's capabilities.
- The .run installer will continue to install firmware to `/lib/firmware/nvidia/<version>` and the `nvidia.ko` kernel module will load the appropriate firmware for each GPU at runtime.
#### Fixed
- Add support for IBT (indirect branch tracking) on supported platforms, [#256](https://github.com/NVIDIA/open-gpu-kernel-modules/issues/256) by @rnd-ash
- Return EINVAL when failing to allocate memory, [#280](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/280) by @YusufKhan-gamedev
- Fix various typos in nvidia/src/kernel, [#16](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/16) by @alexisgeoffrey
- Added support for rotation in X11, Quadro Sync, Stereo, and YUV 4:2:0 on Turing.
## Release 520 Entries
### [520.56.06] 2022-10-12
@ -29,6 +46,8 @@
- Improved compatibility with new Linux kernel releases
- Fixed possible excessive GPU power draw on an idle X11 or Wayland desktop when driving high resolutions or refresh rates
### [515.65.07] 2022-10-19
### [515.65.01] 2022-08-02
#### Fixed

View File

@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source
This is the source release of the NVIDIA Linux open GPU kernel modules,
version 520.56.06.
version 525.53.
## How to Build
@ -15,9 +15,9 @@ as root:
make modules_install -j$(nproc)
Note that the kernel modules built here must be used with gsp.bin
Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
520.56.06 driver release. This can be achieved by installing
525.53 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,
@ -167,7 +167,7 @@ for the target kernel.
## Compatible GPUs
The open-gpu-kernel-modules can be used on any Turing or later GPU
(see the table below). However, in the 520.56.06 release,
(see the table below). However, in the 525.53 release,
GeForce and Workstation support is still considered alpha-quality.
To enable use of the open kernel modules on GeForce and Workstation GPUs,
@ -175,7 +175,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
parameter to 1. For more details, see the NVIDIA GPU driver end user
README here:
https://us.download.nvidia.com/XFree86/Linux-x86_64/520.56.06/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/525.53/README/kernel_open.html
In the below table, if three IDs are listed, the first is the PCI Device
ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
@ -652,6 +652,17 @@ Subsystem Device ID.
| NVIDIA PG506-232 | 20B6 10DE 1492 |
| NVIDIA A30 | 20B7 10DE 1532 |
| NVIDIA A100-PCIE-40GB | 20F1 10DE 145F |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 179B |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 179C |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 179D |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 179E |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 179F |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 17A0 |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 17A1 |
| NVIDIA A800-SXM4-80GB | 20F3 10DE 17A2 |
| NVIDIA A800 80GB PCIe | 20F5 10DE 1799 |
| NVIDIA A800 80GB PCIe LC | 20F5 10DE 179A |
| NVIDIA A800 40GB PCIe | 20F6 10DE 17A3 |
| NVIDIA GeForce GTX 1660 Ti | 2182 |
| NVIDIA GeForce GTX 1660 | 2184 |
| NVIDIA GeForce GTX 1650 SUPER | 2187 |

View File

@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall -MD $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"520.56.06\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"525.53\"
EXTRA_CFLAGS += -Wno-unused-function
@ -229,6 +229,7 @@ NV_HEADER_PRESENCE_TESTS = \
drm/drm_ioctl.h \
drm/drm_device.h \
drm/drm_mode_config.h \
drm/drm_modeset_lock.h \
dt-bindings/interconnect/tegra_icc_id.h \
generated/autoconf.h \
generated/compile.h \
@ -243,6 +244,8 @@ NV_HEADER_PRESENCE_TESTS = \
linux/log2.h \
linux/of.h \
linux/bug.h \
linux/sched.h \
linux/sched/mm.h \
linux/sched/signal.h \
linux/sched/task.h \
linux/sched/task_stack.h \
@ -286,7 +289,10 @@ NV_HEADER_PRESENCE_TESTS = \
linux/ioasid.h \
linux/stdarg.h \
linux/iosys-map.h \
asm/coco.h
asm/coco.h \
linux/vfio_pci_core.h \
soc/tegra/bpmp-abi.h \
soc/tegra/bpmp.h
# Filename to store the define for the header in $(1); this is only consumed by
# the rule below that concatenates all of these together.

View File

@ -242,7 +242,7 @@
#endif
/* For verification-only features not intended to be included in normal drivers */
#if (defined(NV_MODS) || defined(NV_GSP_MODS)) && defined(DEBUG) && !defined(DISABLE_VERIF_FEATURES)
#if defined(ENABLE_VERIF_FEATURES)
#define NV_VERIF_FEATURES
#endif
@ -276,12 +276,6 @@
#define NV_IS_MODS 0
#endif
#if defined(NV_GSP_MODS)
#define NV_IS_GSP_MODS 1
#else
#define NV_IS_GSP_MODS 0
#endif
#if defined(NV_WINDOWS)
#define NVOS_IS_WINDOWS 1
#else

View File

@ -0,0 +1,132 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef NV_FIRMWARE_H
#define NV_FIRMWARE_H
#include <nvtypes.h>
#include <nvmisc.h>
typedef enum
{
NV_FIRMWARE_TYPE_GSP,
NV_FIRMWARE_TYPE_GSP_LOG
} nv_firmware_type_t;
typedef enum
{
NV_FIRMWARE_CHIP_FAMILY_NULL = 0,
NV_FIRMWARE_CHIP_FAMILY_TU10X = 1,
NV_FIRMWARE_CHIP_FAMILY_TU11X = 2,
NV_FIRMWARE_CHIP_FAMILY_GA100 = 3,
NV_FIRMWARE_CHIP_FAMILY_GA10X = 4,
NV_FIRMWARE_CHIP_FAMILY_AD10X = 5,
NV_FIRMWARE_CHIP_FAMILY_GH100 = 6,
NV_FIRMWARE_CHIP_FAMILY_END,
} nv_firmware_chip_family_t;
static inline const char *nv_firmware_chip_family_to_string(
nv_firmware_chip_family_t fw_chip_family
)
{
switch (fw_chip_family) {
case NV_FIRMWARE_CHIP_FAMILY_GH100: return "gh100";
case NV_FIRMWARE_CHIP_FAMILY_AD10X: return "ad10x";
case NV_FIRMWARE_CHIP_FAMILY_GA10X: return "ga10x";
case NV_FIRMWARE_CHIP_FAMILY_GA100: return "ga100";
case NV_FIRMWARE_CHIP_FAMILY_TU11X: return "tu11x";
case NV_FIRMWARE_CHIP_FAMILY_TU10X: return "tu10x";
case NV_FIRMWARE_CHIP_FAMILY_END: // fall through
case NV_FIRMWARE_CHIP_FAMILY_NULL:
return NULL;
}
return NULL;
}
// The includer (presumably nv.c) may optionally define
// NV_FIRMWARE_PATH_FOR_FILENAME(filename)
// to return a string "path" given a gsp_*.bin or gsp_log_*.bin filename.
//
// The function nv_firmware_path will then be available.
#if defined(NV_FIRMWARE_PATH_FOR_FILENAME)
static inline const char *nv_firmware_path(
nv_firmware_type_t fw_type,
nv_firmware_chip_family_t fw_chip_family
)
{
if (fw_type == NV_FIRMWARE_TYPE_GSP)
{
switch (fw_chip_family)
{
case NV_FIRMWARE_CHIP_FAMILY_AD10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_ad10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_GH100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU11X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_tu10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_END: // fall through
case NV_FIRMWARE_CHIP_FAMILY_NULL:
return "";
}
}
else if (fw_type == NV_FIRMWARE_TYPE_GSP_LOG)
{
switch (fw_chip_family)
{
case NV_FIRMWARE_CHIP_FAMILY_AD10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_log_ad10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_GH100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU11X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_log_tu10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_END: // fall through
case NV_FIRMWARE_CHIP_FAMILY_NULL:
return "";
}
}
return "";
}
#endif // defined(NV_FIRMWARE_PATH_FOR_FILENAME)
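// Illustrative sketch (not part of this header): one way an includer such as
// nv.c might define NV_FIRMWARE_PATH_FOR_FILENAME before including this file
// and then resolve a firmware path at runtime. The exact path prefix is an
// assumption; the .run installer places the files under
// /lib/firmware/nvidia/<version>, and firmware-loader paths are typically
// resolved relative to /lib/firmware. NV_VERSION_STRING is supplied by Kbuild.
//
//     #define NV_FIRMWARE_PATH_FOR_FILENAME(filename) \
//         ("nvidia/" NV_VERSION_STRING "/" filename)
//     #include "nv-firmware.h"
//
//     const char *path = nv_firmware_path(NV_FIRMWARE_TYPE_GSP,
//                                         NV_FIRMWARE_CHIP_FAMILY_GA10X);
//     // path == "nvidia/<version>/gsp_tu10x.bin": GA10X shares the tu10x
//     // firmware image, per the switch statement above.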
// The includer (presumably nv.c) may optionally define
// NV_FIRMWARE_DECLARE_GSP_FILENAME(filename)
// which will then be invoked (at the top level) for each
// gsp_*.bin filename (but not gsp_log_*.bin filenames).
#if defined(NV_FIRMWARE_DECLARE_GSP_FILENAME)
NV_FIRMWARE_DECLARE_GSP_FILENAME("gsp_ad10x.bin")
NV_FIRMWARE_DECLARE_GSP_FILENAME("gsp_tu10x.bin")
#endif // defined(NV_FIRMWARE_DECLARE_GSP_FILENAME)
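// Illustrative sketch (an assumption, not taken from this commit): an includer
// could use the hook above to emit a MODULE_FIRMWARE() annotation for each GSP
// image, so that initramfs generators and similar tooling see the dependency:
//
//     #define NV_FIRMWARE_DECLARE_GSP_FILENAME(filename) \
//         MODULE_FIRMWARE(NV_FIRMWARE_PATH_FOR_FILENAME(filename));
//     #include "nv-firmware.h"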
#endif // NV_FIRMWARE_H

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2020-22 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -91,6 +91,6 @@ static inline void _nv_hash_init(struct hlist_head *ht, unsigned int sz)
* @key: the key of the objects to iterate over
*/
#define nv_hash_for_each_possible(name, obj, member, key) \
nv_hlist_for_each_entry(obj, &name[NV_HASH_MIN(key, NV_HASH_BITS(name))], member)
hlist_for_each_entry(obj, &name[NV_HASH_MIN(key, NV_HASH_BITS(name))], member)
#endif // __NV_HASH_H__

View File

@ -27,15 +27,13 @@
#include <nv-kernel-interface-api.h>
// Enums for supported hypervisor types.
// New hypervisor type should be added before OS_HYPERVISOR_CUSTOM_FORCED
// New hypervisor type should be added before OS_HYPERVISOR_UNKNOWN
typedef enum _HYPERVISOR_TYPE
{
OS_HYPERVISOR_XEN = 0,
OS_HYPERVISOR_VMWARE,
OS_HYPERVISOR_HYPERV,
OS_HYPERVISOR_KVM,
OS_HYPERVISOR_PARALLELS,
OS_HYPERVISOR_CUSTOM_FORCED,
OS_HYPERVISOR_UNKNOWN
} HYPERVISOR_TYPE;

View File

@ -115,11 +115,6 @@ struct nv_kthread_q_item
void *function_args;
};
#if defined(NV_KTHREAD_CREATE_ON_NODE_PRESENT)
#define NV_KTHREAD_Q_SUPPORTS_AFFINITY() 1
#else
#define NV_KTHREAD_Q_SUPPORTS_AFFINITY() 0
#endif
#ifndef NUMA_NO_NODE
#define NUMA_NO_NODE (-1)
@ -142,18 +137,12 @@ struct nv_kthread_q_item
//
// A short prefix of the qname arg will show up in []'s, via the ps(1) utility.
//
// The kernel thread stack is preferably allocated on the specified NUMA node if
// NUMA-affinity (NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1) is supported, but
// fallback to another node is possible because kernel allocators do not
// The kernel thread stack is preferably allocated on the specified NUMA node,
// but fallback to another node is possible because kernel allocators do not
// guarantee affinity. Note that NUMA-affinity applies only to
// the kthread stack. This API does not do anything about limiting the CPU
// affinity of the kthread. That is left to the caller.
//
// On kernels, which do not support NUMA-aware kthread stack allocations
// (NV_KTHTREAD_Q_SUPPORTS_AFFINITY() == 0), the API will return -ENOTSUPP
// if the value supplied for 'preferred_node' is anything other than
// NV_KTHREAD_NO_NODE.
//
// Reusing a queue: once a queue is initialized, it must be safely shut down
// (see "Stopping the queue(s)", below), before it can be reused. So, for
// a simple queue use case, the following will work:
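//    (a minimal sketch, assuming the nv_kthread_q_init()/nv_kthread_q_stop()
//     entry points declared elsewhere in this header; the queue name is made up)
//
//        nv_kthread_q_t q;
//
//        nv_kthread_q_init(&q, "example_q");
//        // ...schedule and run work items...
//        nv_kthread_q_stop(&q);
//
//        // After a clean stop, the same queue object may be initialized again:
//        nv_kthread_q_init(&q, "example_q");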

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2001-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2001-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -191,13 +191,6 @@
*/
#define NV_CURRENT_EUID() (__kuid_val(current->cred->euid))
#if !defined(NV_KUID_T_PRESENT)
static inline uid_t __kuid_val(uid_t uid)
{
return uid;
}
#endif
#if defined(CONFIG_VGA_ARB)
#include <linux/vgaarb.h>
#endif
@ -234,18 +227,6 @@ static inline uid_t __kuid_val(uid_t uid)
#include <asm-generic/pci-dma-compat.h>
#endif
#if defined(NV_EFI_ENABLED_PRESENT) && defined(NV_EFI_ENABLED_ARGUMENT_COUNT)
#if (NV_EFI_ENABLED_ARGUMENT_COUNT == 1)
#define NV_EFI_ENABLED() efi_enabled(EFI_BOOT)
#else
#error "NV_EFI_ENABLED_ARGUMENT_COUNT value unrecognized!"
#endif
#elif (defined(NV_EFI_ENABLED_PRESENT) || defined(efi_enabled))
#define NV_EFI_ENABLED() efi_enabled
#else
#define NV_EFI_ENABLED() 0
#endif
#if defined(CONFIG_CRAY_XT)
#include <cray/cray_nvidia.h>
NV_STATUS nvos_forward_error_to_cray(struct pci_dev *, NvU32,
@ -521,7 +502,7 @@ static inline void *nv_vmalloc(unsigned long size)
return ptr;
}
static inline void nv_vfree(void *ptr, NvU32 size)
static inline void nv_vfree(void *ptr, NvU64 size)
{
NV_MEMDBG_REMOVE(ptr, size);
vfree(ptr);
@ -592,11 +573,7 @@ static NvBool nv_numa_node_has_memory(int node_id)
{
if (node_id < 0 || node_id >= MAX_NUMNODES)
return NV_FALSE;
#if defined(NV_NODE_STATES_N_MEMORY_PRESENT)
return node_state(node_id, N_MEMORY) ? NV_TRUE : NV_FALSE;
#else
return node_state(node_id, N_HIGH_MEMORY) ? NV_TRUE : NV_FALSE;
#endif
}
#define NV_KMALLOC(ptr, size) \
@ -606,6 +583,13 @@ static NvBool nv_numa_node_has_memory(int node_id)
NV_MEMDBG_ADD(ptr, size); \
}
#define NV_KZALLOC(ptr, size) \
{ \
(ptr) = kzalloc(size, NV_GFP_KERNEL); \
if (ptr) \
NV_MEMDBG_ADD(ptr, size); \
}
#define NV_KMALLOC_ATOMIC(ptr, size) \
{ \
(ptr) = kmalloc(size, NV_GFP_ATOMIC); \
@ -838,10 +822,8 @@ static inline dma_addr_t nv_phys_to_dma(struct device *dev, NvU64 pa)
})
#endif
#if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE_PRESENT) // introduced in 3.4.9
#if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE_PRESENT) // introduced in 3.18-rc1 for aarch64
#define NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(pci_dev) pci_stop_and_remove_bus_device(pci_dev)
#elif defined(NV_PCI_REMOVE_BUS_DEVICE_PRESENT) // introduced in 2.6
#define NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(pci_dev) pci_remove_bus_device(pci_dev)
#endif
#define NV_PRINT_AT(nv_debug_level,at) \
@ -1139,11 +1121,14 @@ static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
{
nvidia_stack_t *sp = NULL;
#if defined(NVCPU_X86_64)
sp = NV_KMEM_CACHE_ALLOC(nvidia_stack_t_cache);
if (sp == NULL)
return -ENOMEM;
sp->size = sizeof(sp->stack);
sp->top = sp->stack + sp->size;
if (rm_is_altstack_in_use())
{
sp = NV_KMEM_CACHE_ALLOC(nvidia_stack_t_cache);
if (sp == NULL)
return -ENOMEM;
sp->size = sizeof(sp->stack);
sp->top = sp->stack + sp->size;
}
#endif
*stack = sp;
return 0;
@ -1152,7 +1137,7 @@ static inline int nv_kmem_cache_alloc_stack(nvidia_stack_t **stack)
static inline void nv_kmem_cache_free_stack(nvidia_stack_t *stack)
{
#if defined(NVCPU_X86_64)
if (stack != NULL)
if (stack != NULL && rm_is_altstack_in_use())
{
NV_KMEM_CACHE_FREE(stack, nvidia_stack_t_cache);
}
@ -1386,8 +1371,7 @@ typedef struct nv_dma_map_s {
* xen_swiotlb_map_sg_attrs may try to route to the SWIOTLB. We must only use
* single-page sg elements on Xen Server.
*/
#if defined(NV_SG_ALLOC_TABLE_FROM_PAGES_PRESENT) && \
!defined(NV_DOM0_KERNEL_PRESENT)
#if !defined(NV_DOM0_KERNEL_PRESENT)
#define NV_ALLOC_DMA_SUBMAP_SCATTERLIST(dm, sm, i) \
((sg_alloc_table_from_pages(&sm->sgt, \
&dm->pages[NV_DMA_SUBMAP_IDX_TO_PAGE_IDX(i)], \
@ -1667,6 +1651,27 @@ static inline nv_linux_file_private_t *nv_get_nvlfp_from_nvfp(nv_file_private_t
#define NV_STATE_PTR(nvl) &(((nv_linux_state_t *)(nvl))->nv_state)
static inline nvidia_stack_t *nv_nvlfp_get_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
{
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
down(&nvlfp->fops_sp_lock[which]);
return nvlfp->fops_sp[which];
}
#endif
return NULL;
}
static inline void nv_nvlfp_put_sp(nv_linux_file_private_t *nvlfp, nvidia_entry_point_index_t which)
{
#if defined(NVCPU_X86_64)
if (rm_is_altstack_in_use())
{
up(&nvlfp->fops_sp_lock[which]);
}
#endif
}
#define NV_ATOMIC_READ(data) atomic_read(&(data))
#define NV_ATOMIC_SET(data,val) atomic_set(&(data), (val))
@ -1895,20 +1900,12 @@ static inline NvU32 nv_default_irq_flags(nv_state_t *nv)
#define NV_GET_UNUSED_FD_FLAGS(flags) (-1)
#endif
#if defined(NV_SET_CLOSE_ON_EXEC_PRESENT)
#define NV_SET_CLOSE_ON_EXEC(fd, fdt) __set_close_on_exec(fd, fdt)
#elif defined(NV_LINUX_TIME_H_PRESENT) && defined(FD_SET)
#define NV_SET_CLOSE_ON_EXEC(fd, fdt) FD_SET(fd, fdt->close_on_exec)
#else
#define NV_SET_CLOSE_ON_EXEC(fd, fdt) __set_bit(fd, fdt->close_on_exec)
#endif
#define MODULE_BASE_NAME "nvidia"
#define MODULE_INSTANCE_NUMBER 0
#define MODULE_INSTANCE_STRING ""
#define MODULE_NAME MODULE_BASE_NAME MODULE_INSTANCE_STRING
NvS32 nv_request_soc_irq(nv_linux_state_t *, NvU32, nv_soc_irq_type_t, NvU32, NvU32);
NvS32 nv_request_soc_irq(nv_linux_state_t *, NvU32, nv_soc_irq_type_t, NvU32, NvU32, const char*);
static inline void nv_mutex_destroy(struct mutex *lock)
{

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2013-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2013-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -73,21 +73,4 @@
}
#endif
#if defined(NV_HLIST_FOR_EACH_ENTRY_ARGUMENT_COUNT)
#if NV_HLIST_FOR_EACH_ENTRY_ARGUMENT_COUNT == 3
#define nv_hlist_for_each_entry(pos, head, member) \
hlist_for_each_entry(pos, head, member)
#else
#if !defined(hlist_entry_safe)
#define hlist_entry_safe(ptr, type, member) \
(ptr) ? hlist_entry(ptr, type, member) : NULL
#endif
#define nv_hlist_for_each_entry(pos, head, member) \
for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
pos; \
pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
#endif
#endif // NV_HLIST_FOR_EACH_ENTRY_ARGUMENT_COUNT
#endif // __NV_LIST_HELPERS_H__

View File

@ -29,6 +29,25 @@
typedef int vm_fault_t;
#endif
/* pin_user_pages
* Presence of pin_user_pages() also implies the presence of unpin_user_page().
* Both were added in v5.6-rc1.
*
* pin_user_pages() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6-rc1 (2020-01-30)
*
*/
#include <linux/mm.h>
#include <linux/sched.h>
#if defined(NV_PIN_USER_PAGES_PRESENT)
#define NV_PIN_USER_PAGES pin_user_pages
#define NV_UNPIN_USER_PAGE unpin_user_page
#else
#define NV_PIN_USER_PAGES NV_GET_USER_PAGES
#define NV_UNPIN_USER_PAGE put_page
#endif // NV_PIN_USER_PAGES_PRESENT
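/*
 * Illustrative sketch (not part of this change): pinning and releasing a
 * single user page through the wrappers above. Assumes the caller holds
 * current->mm's mmap lock, as required by both pin_user_pages() and the
 * get_user_pages() fallback.
 */
static inline long nv_example_pin_one_user_page(unsigned long uaddr,
                                                struct page **page)
{
    long ret = NV_PIN_USER_PAGES(uaddr & PAGE_MASK, 1, FOLL_WRITE, page, NULL);

    if (ret < 0)
        return ret;              /* pin failed */
    if (ret != 1)
        return -EFAULT;          /* fewer pages pinned than requested */

    NV_UNPIN_USER_PAGE(*page);   /* release the reference taken above */
    return 0;
}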
/* get_user_pages
*
* The 8-argument version of get_user_pages was deprecated by commit
@ -47,51 +66,57 @@ typedef int vm_fault_t;
*
*/
#if defined(NV_GET_USER_PAGES_HAS_ARGS_WRITE_FORCE)
#if defined(NV_GET_USER_PAGES_HAS_ARGS_FLAGS)
#define NV_GET_USER_PAGES get_user_pages
#elif defined(NV_GET_USER_PAGES_HAS_ARGS_TSK_WRITE_FORCE)
#define NV_GET_USER_PAGES(start, nr_pages, write, force, pages, vmas) \
get_user_pages(current, current->mm, start, nr_pages, write, force, pages, vmas)
#elif defined(NV_GET_USER_PAGES_HAS_ARGS_TSK_FLAGS)
#define NV_GET_USER_PAGES(start, nr_pages, flags, pages, vmas) \
get_user_pages(current, current->mm, start, nr_pages, flags, pages, vmas)
#else
#include <linux/mm.h>
#include <linux/sched.h>
static inline long NV_GET_USER_PAGES(unsigned long start,
unsigned long nr_pages,
int write,
int force,
unsigned int flags,
struct page **pages,
struct vm_area_struct **vmas)
{
unsigned int flags = 0;
int write = flags & FOLL_WRITE;
int force = flags & FOLL_FORCE;
if (write)
flags |= FOLL_WRITE;
if (force)
flags |= FOLL_FORCE;
#if defined(NV_GET_USER_PAGES_HAS_ARGS_TSK_FLAGS)
return get_user_pages(current, current->mm, start, nr_pages, flags,
pages, vmas);
#if defined(NV_GET_USER_PAGES_HAS_ARGS_WRITE_FORCE)
return get_user_pages(start, nr_pages, write, force, pages, vmas);
#else
// remaining definition (NV_GET_USER_PAGES_HAS_ARGS_FLAGS)
return get_user_pages(start, nr_pages, flags, pages, vmas);
#endif
// NV_GET_USER_PAGES_HAS_ARGS_TSK_WRITE_FORCE
return get_user_pages(current, current->mm, start, nr_pages, write,
force, pages, vmas);
#endif // NV_GET_USER_PAGES_HAS_ARGS_WRITE_FORCE
}
#endif
#endif // NV_GET_USER_PAGES_HAS_ARGS_FLAGS
/* pin_user_pages_remote
*
* pin_user_pages_remote() was added by commit eddb1c228f7951d399240
* ("mm/gup: introduce pin_user_pages*() and FOLL_PIN") in v5.6 (2020-01-30)
*
* The 'tsk' parameter was removed from pin_user_pages_remote() by commit
* 64019a2e467a ("mm/gup: remove task_struct pointer for all gup code")
* in v5.9-rc1 (2020-08-11).
*
*/
#if defined(NV_PIN_USER_PAGES_REMOTE_PRESENT)
#if defined (NV_PIN_USER_PAGES_REMOTE_HAS_ARGS_TSK)
#define NV_PIN_USER_PAGES_REMOTE(mm, start, nr_pages, flags, pages, vmas, locked) \
pin_user_pages_remote(NULL, mm, start, nr_pages, flags, pages, vmas, locked)
#else
#define NV_PIN_USER_PAGES_REMOTE pin_user_pages_remote
#endif // NV_PIN_USER_PAGES_REMOTE_HAS_ARGS_TSK
#else
#define NV_PIN_USER_PAGES_REMOTE NV_GET_USER_PAGES_REMOTE
#endif // NV_PIN_USER_PAGES_REMOTE_PRESENT
/*
* get_user_pages_remote() was added by commit 1e9877902dc7
* ("mm/gup: Introduce get_user_pages_remote()") in v4.6 (2016-02-12).
*
* The very next commit cde70140fed8 ("mm/gup: Overload get_user_pages()
* functions") deprecated the 8-argument version of get_user_pages for the
* non-remote case (calling get_user_pages with current and current->mm).
*
* The guidelines are: call NV_GET_USER_PAGES_REMOTE if you need the 8-argument
* version that uses something other than current and current->mm. Use
* NV_GET_USER_PAGES if you are referring to current and current->mm.
*
* Note that get_user_pages_remote() requires the caller to hold a reference on
* the task_struct (if non-NULL and if this API has tsk argument) and the mm_struct.
* This will always be true when using current and current->mm. If the kernel passes
@ -113,66 +138,55 @@ typedef int vm_fault_t;
*/
#if defined(NV_GET_USER_PAGES_REMOTE_PRESENT)
#if defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_WRITE_FORCE)
#define NV_GET_USER_PAGES_REMOTE get_user_pages_remote
#if defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_FLAGS_LOCKED)
#define NV_GET_USER_PAGES_REMOTE get_user_pages_remote
#elif defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_FLAGS_LOCKED)
#define NV_GET_USER_PAGES_REMOTE(mm, start, nr_pages, flags, pages, vmas, locked) \
get_user_pages_remote(NULL, mm, start, nr_pages, flags, pages, vmas, locked)
#elif defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_FLAGS)
#define NV_GET_USER_PAGES_REMOTE(mm, start, nr_pages, flags, pages, vmas, locked) \
get_user_pages_remote(NULL, mm, start, nr_pages, flags, pages, vmas)
#else
static inline long NV_GET_USER_PAGES_REMOTE(struct task_struct *tsk,
struct mm_struct *mm,
// NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_WRITE_FORCE
static inline long NV_GET_USER_PAGES_REMOTE(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
int write,
int force,
unsigned int flags,
struct page **pages,
struct vm_area_struct **vmas)
struct vm_area_struct **vmas,
int *locked)
{
unsigned int flags = 0;
int write = flags & FOLL_WRITE;
int force = flags & FOLL_FORCE;
if (write)
flags |= FOLL_WRITE;
if (force)
flags |= FOLL_FORCE;
#if defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_FLAGS)
return get_user_pages_remote(tsk, mm, start, nr_pages, flags,
return get_user_pages_remote(NULL, mm, start, nr_pages, write, force,
pages, vmas);
#elif defined(NV_GET_USER_PAGES_REMOTE_HAS_ARGS_TSK_FLAGS_LOCKED)
return get_user_pages_remote(tsk, mm, start, nr_pages, flags,
pages, vmas, NULL);
#else
// remaining definition (NV_GET_USER_PAGES_REMOTE_HAS_ARGS_FLAGS_LOCKED)
return get_user_pages_remote(mm, start, nr_pages, flags,
pages, vmas, NULL);
#endif
}
#endif
#endif // NV_GET_USER_PAGES_REMOTE_HAS_ARGS_FLAGS_LOCKED
#else
#if defined(NV_GET_USER_PAGES_HAS_ARGS_TSK_WRITE_FORCE)
#define NV_GET_USER_PAGES_REMOTE get_user_pages
#else
#include <linux/mm.h>
#include <linux/sched.h>
static inline long NV_GET_USER_PAGES_REMOTE(struct task_struct *tsk,
struct mm_struct *mm,
static inline long NV_GET_USER_PAGES_REMOTE(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
int write,
int force,
unsigned int flags,
struct page **pages,
struct vm_area_struct **vmas)
struct vm_area_struct **vmas,
int *locked)
{
unsigned int flags = 0;
int write = flags & FOLL_WRITE;
int force = flags & FOLL_FORCE;
if (write)
flags |= FOLL_WRITE;
if (force)
flags |= FOLL_FORCE;
return get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
return get_user_pages(NULL, mm, start, nr_pages, write, force, pages, vmas);
}
#endif
#endif
#else
#define NV_GET_USER_PAGES_REMOTE(mm, start, nr_pages, flags, pages, vmas, locked) \
get_user_pages(NULL, mm, start, nr_pages, flags, pages, vmas)
#endif // NV_GET_USER_PAGES_HAS_ARGS_TSK_WRITE_FORCE
#endif // NV_GET_USER_PAGES_REMOTE_PRESENT
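/*
 * Illustrative sketch (not part of this change): pinning pages from a foreign
 * address space through the wrapper above. The caller is assumed to hold a
 * reference on 'mm' and its mmap lock, per the guidelines in the comment
 * block above.
 */
static inline long nv_example_gup_remote(struct mm_struct *mm,
                                         unsigned long start,
                                         unsigned long nr_pages,
                                         struct page **pages)
{
    return NV_GET_USER_PAGES_REMOTE(mm, start, nr_pages, FOLL_WRITE,
                                    pages, NULL, NULL);
}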
/*
* The .virtual_address field was effectively renamed to .address, by these

View File

@ -27,17 +27,6 @@
#include <linux/pci.h>
#include "nv-linux.h"
#if defined(NV_DEV_IS_PCI_PRESENT)
#define nv_dev_is_pci(dev) dev_is_pci(dev)
#else
/*
* Non-PCI devices are only supported on kernels which expose the
* dev_is_pci() function. For older kernels, we only support PCI
* devices, hence returning true to take all the PCI code paths.
*/
#define nv_dev_is_pci(dev) (true)
#endif
int nv_pci_register_driver(void);
void nv_pci_unregister_driver(void);
int nv_pci_count_devices(void);

View File

@ -78,13 +78,8 @@ static inline pgprot_t pgprot_modify_writecombine(pgprot_t old_prot)
#define NV_PGPROT_UNCACHED_DEVICE(old_prot) pgprot_noncached(old_prot)
#if defined(NVCPU_AARCH64)
#if defined(NV_MT_DEVICE_GRE_PRESENT)
#define NV_PROT_WRITE_COMBINED_DEVICE (PROT_DEFAULT | PTE_PXN | PTE_UXN | \
PTE_ATTRINDX(MT_DEVICE_GRE))
#else
#define NV_PROT_WRITE_COMBINED_DEVICE (PROT_DEFAULT | PTE_PXN | PTE_UXN | \
PTE_ATTRINDX(MT_DEVICE_nGnRE))
#endif
#define NV_PGPROT_WRITE_COMBINED_DEVICE(old_prot) \
__pgprot_modify(old_prot, PTE_ATTRINDX_MASK, NV_PROT_WRITE_COMBINED_DEVICE)
#define NV_PGPROT_WRITE_COMBINED(old_prot) NV_PGPROT_UNCACHED(old_prot)

View File

@ -74,21 +74,8 @@ typedef struct file_operations nv_proc_ops_t;
__entry; \
})
/*
* proc_mkdir_mode exists in Linux 2.6.9, but isn't exported until Linux 3.0.
* Use the older interface instead unless the newer interface is necessary.
*/
#if defined(NV_PROC_REMOVE_PRESENT)
# define NV_PROC_MKDIR_MODE(name, mode, parent) \
proc_mkdir_mode(name, mode, parent)
#else
# define NV_PROC_MKDIR_MODE(name, mode, parent) \
({ \
struct proc_dir_entry *__entry; \
__entry = create_proc_entry(name, mode, parent); \
__entry; \
})
#endif
#define NV_CREATE_PROC_DIR(name,parent) \
({ \
@ -104,16 +91,6 @@ typedef struct file_operations nv_proc_ops_t;
#define NV_PDE_DATA(inode) PDE_DATA(inode)
#endif
#if defined(NV_PROC_REMOVE_PRESENT)
# define NV_REMOVE_PROC_ENTRY(entry) \
proc_remove(entry);
#else
# define NV_REMOVE_PROC_ENTRY(entry) \
remove_proc_entry(entry->name, entry->parent);
#endif
void nv_procfs_unregister_all(struct proc_dir_entry *entry,
struct proc_dir_entry *delimiter);
#define NV_DEFINE_SINGLE_PROCFS_FILE_HELPER(name, lock) \
static int nv_procfs_open_##name( \
struct inode *inode, \

View File

@ -54,8 +54,6 @@ void nv_free_contig_pages (nv_alloc_t *);
NV_STATUS nv_alloc_system_pages (nv_state_t *, nv_alloc_t *);
void nv_free_system_pages (nv_alloc_t *);
void nv_address_space_init_once (struct address_space *mapping);
int nv_uvm_init (void);
void nv_uvm_exit (void);
NV_STATUS nv_uvm_suspend (void);

View File

@ -40,6 +40,7 @@
#include <nvstatus.h>
#include "nv_stdarg.h"
#include <nv-caps.h>
#include <nv-firmware.h>
#include <nv-ioctl.h>
#include <nvmisc.h>
@ -160,8 +161,14 @@ typedef enum _TEGRASOC_WHICH_CLK
TEGRASOC_WHICH_CLK_MAUD,
TEGRASOC_WHICH_CLK_AZA_2XBIT,
TEGRASOC_WHICH_CLK_AZA_BIT,
TEGRA234_CLK_MIPI_CAL,
TEGRA234_CLK_UART_FST_MIPI_CAL,
TEGRASOC_WHICH_CLK_MIPI_CAL,
TEGRASOC_WHICH_CLK_UART_FST_MIPI_CAL,
TEGRASOC_WHICH_CLK_SOR0_DIV,
TEGRASOC_WHICH_CLK_DISP_ROOT,
TEGRASOC_WHICH_CLK_HUB_ROOT,
TEGRASOC_WHICH_CLK_PLLA_DISP,
TEGRASOC_WHICH_CLK_PLLA_DISPHUB,
TEGRASOC_WHICH_CLK_PLLA,
TEGRASOC_WHICH_CLK_MAX, // TEGRASOC_WHICH_CLK_MAX is defined for boundary checks only.
} TEGRASOC_WHICH_CLK;
@ -304,7 +311,7 @@ typedef struct nv_alloc_mapping_context_s {
typedef enum
{
NV_SOC_IRQ_DISPLAY_TYPE,
NV_SOC_IRQ_DISPLAY_TYPE = 0x1,
NV_SOC_IRQ_DPAUX_TYPE,
NV_SOC_IRQ_GPIO_TYPE,
NV_SOC_IRQ_HDACODEC_TYPE,
@ -368,6 +375,7 @@ typedef struct nv_state_t
nv_aperture_t *mipical_regs;
nv_aperture_t *fb, ud;
nv_aperture_t *simregs;
nv_aperture_t *emc_regs;
NvU32 num_dpaux_instance;
NvU32 interrupt_line;
@ -430,9 +438,6 @@ typedef struct nv_state_t
/* Variable to force allocation of 32-bit addressable memory */
NvBool force_dma32_alloc;
/* Variable to track if device has entered dynamic power state */
NvBool dynamic_power_entered;
/* PCI power state should be D0 during system suspend */
NvBool d0_state_in_suspend;
@ -465,6 +470,9 @@ typedef struct nv_state_t
/* Check if NVPCF DSM function is implemented under NVPCF or GPU device scope */
NvBool nvpcf_dsm_in_gpu_scope;
/* Bool to check if the device received a shutdown notification */
NvBool is_shutdown;
} nv_state_t;
// These define need to be in sync with defines in system.h
@ -473,6 +481,10 @@ typedef struct nv_state_t
#define OS_TYPE_SUNOS 0x3
#define OS_TYPE_VMWARE 0x4
#define NVFP_TYPE_NONE 0x0
#define NVFP_TYPE_REFCOUNTED 0x1
#define NVFP_TYPE_REGISTERED 0x2
struct nv_file_private_t
{
NvHandle *handles;
@ -482,6 +494,7 @@ struct nv_file_private_t
nv_file_private_t *ctl_nvfp;
void *ctl_nvfp_priv;
NvU32 register_or_refcount;
};
// Forward define the gpu ops structures
@ -513,8 +526,9 @@ typedef struct UvmGpuChannelResourceBindParams_tag *nvgpuChannelResourceBindPar
typedef struct UvmGpuPagingChannelAllocParams_tag nvgpuPagingChannelAllocParams_t;
typedef struct UvmGpuPagingChannel_tag *nvgpuPagingChannelHandle_t;
typedef struct UvmGpuPagingChannelInfo_tag *nvgpuPagingChannelInfo_t;
typedef NV_STATUS (*nvPmaEvictPagesCallback)(void *, NvU32, NvU64 *, NvU32, NvU64, NvU64);
typedef NV_STATUS (*nvPmaEvictRangeCallback)(void *, NvU64, NvU64);
typedef enum UvmPmaGpuMemoryType_tag nvgpuGpuMemoryType_t;
typedef NV_STATUS (*nvPmaEvictPagesCallback)(void *, NvU32, NvU64 *, NvU32, NvU64, NvU64, nvgpuGpuMemoryType_t);
typedef NV_STATUS (*nvPmaEvictRangeCallback)(void *, NvU64, NvU64, nvgpuGpuMemoryType_t);
/*
* flags
@ -566,12 +580,6 @@ typedef enum
NV_POWER_STATE_RUNNING
} nv_power_state_t;
typedef enum
{
NV_FIRMWARE_GSP,
NV_FIRMWARE_GSP_LOG
} nv_firmware_t;
#define NV_PRIMARY_VGA(nv) ((nv)->primary_vga)
#define NV_IS_CTL_DEVICE(nv) ((nv)->flags & NV_FLAG_CONTROL)
@ -587,12 +595,6 @@ typedef enum
#define NV_SOC_IS_ISO_IOMMU_PRESENT(nv) \
((nv)->iso_iommu_present)
/*
* NVIDIA ACPI event ID to be passed into the core NVIDIA driver for
* AC/DC event.
*/
#define NV_SYSTEM_ACPI_BATTERY_POWER_EVENT 0x8002
/*
* GPU add/remove events
*/
@ -604,8 +606,6 @@ typedef enum
* to core NVIDIA driver for ACPI events.
*/
#define NV_SYSTEM_ACPI_EVENT_VALUE_DISPLAY_SWITCH_DEFAULT 0
#define NV_SYSTEM_ACPI_EVENT_VALUE_POWER_EVENT_AC 0
#define NV_SYSTEM_ACPI_EVENT_VALUE_POWER_EVENT_BATTERY 1
#define NV_SYSTEM_ACPI_EVENT_VALUE_DOCK_EVENT_UNDOCKED 0
#define NV_SYSTEM_ACPI_EVENT_VALUE_DOCK_EVENT_DOCKED 1
@ -616,14 +616,18 @@ typedef enum
#define NV_EVAL_ACPI_METHOD_NVIF 0x01
#define NV_EVAL_ACPI_METHOD_WMMX 0x02
#define NV_I2C_CMD_READ 1
#define NV_I2C_CMD_WRITE 2
#define NV_I2C_CMD_SMBUS_READ 3
#define NV_I2C_CMD_SMBUS_WRITE 4
#define NV_I2C_CMD_SMBUS_QUICK_WRITE 5
#define NV_I2C_CMD_SMBUS_QUICK_READ 6
#define NV_I2C_CMD_SMBUS_BLOCK_READ 7
#define NV_I2C_CMD_SMBUS_BLOCK_WRITE 8
typedef enum {
NV_I2C_CMD_READ = 1,
NV_I2C_CMD_WRITE,
NV_I2C_CMD_SMBUS_READ,
NV_I2C_CMD_SMBUS_WRITE,
NV_I2C_CMD_SMBUS_QUICK_WRITE,
NV_I2C_CMD_SMBUS_QUICK_READ,
NV_I2C_CMD_SMBUS_BLOCK_READ,
NV_I2C_CMD_SMBUS_BLOCK_WRITE,
NV_I2C_CMD_BLOCK_READ,
NV_I2C_CMD_BLOCK_WRITE
} nv_i2c_cmd_t;
// Flags needed by OSAllocPagesNode
#define NV_ALLOC_PAGES_NODE_NONE 0x0
@ -636,27 +640,33 @@ typedef enum
#define NV_GET_NV_STATE(pGpu) \
(nv_state_t *)((pGpu) ? (pGpu)->pOsGpuInfo : NULL)
#define IS_REG_OFFSET(nv, offset, length) \
(((offset) >= (nv)->regs->cpu_address) && \
(((offset) + ((length)-1)) <= \
(nv)->regs->cpu_address + ((nv)->regs->size-1)))
static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
{
return ((offset >= nv->regs->cpu_address) &&
((offset + (length - 1)) <= (nv->regs->cpu_address + (nv->regs->size - 1))));
}
#define IS_FB_OFFSET(nv, offset, length) \
(((nv)->fb) && ((offset) >= (nv)->fb->cpu_address) && \
(((offset) + ((length)-1)) <= (nv)->fb->cpu_address + ((nv)->fb->size-1)))
static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
{
return ((nv->fb) && (offset >= nv->fb->cpu_address) &&
((offset + (length - 1)) <= (nv->fb->cpu_address + (nv->fb->size - 1))));
}
#define IS_UD_OFFSET(nv, offset, length) \
(((nv)->ud.cpu_address != 0) && ((nv)->ud.size != 0) && \
((offset) >= (nv)->ud.cpu_address) && \
(((offset) + ((length)-1)) <= (nv)->ud.cpu_address + ((nv)->ud.size-1)))
static inline NvBool IS_UD_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
{
return ((nv->ud.cpu_address != 0) && (nv->ud.size != 0) &&
(offset >= nv->ud.cpu_address) &&
((offset + (length - 1)) <= (nv->ud.cpu_address + (nv->ud.size - 1))));
}
#define IS_IMEM_OFFSET(nv, offset, length) \
(((nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) && \
((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) && \
((offset) >= (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) && \
(((offset) + ((length) - 1)) <= \
(nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address + \
((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)))
static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
{
return ((nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&
(nv->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&
(offset >= nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&
((offset + (length - 1)) <= (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +
(nv->bars[NV_GPU_BAR_INDEX_IMEM].size - 1))));
}
#define NV_RM_MAX_MSIX_LINES 8
@ -787,7 +797,7 @@ NV_STATUS NV_API_CALL nv_pci_trigger_recovery (nv_state_t *);
NvBool NV_API_CALL nv_requires_dma_remap (nv_state_t *);
NvBool NV_API_CALL nv_is_rm_firmware_active(nv_state_t *);
const void*NV_API_CALL nv_get_firmware(nv_state_t *, nv_firmware_t, const void **, NvU32 *);
const void*NV_API_CALL nv_get_firmware(nv_state_t *, nv_firmware_type_t, nv_firmware_chip_family_t, const void **, NvU32 *);
void NV_API_CALL nv_put_firmware(const void *);
nv_file_private_t* NV_API_CALL nv_get_file_private(NvS32, NvBool, void **);
@ -828,6 +838,7 @@ NV_STATUS NV_API_CALL nv_acquire_fabric_mgmt_cap (int, int*);
int NV_API_CALL nv_cap_drv_init(void);
void NV_API_CALL nv_cap_drv_exit(void);
NvBool NV_API_CALL nv_is_gpu_accessible(nv_state_t *);
NvBool NV_API_CALL nv_match_gpu_os_info(nv_state_t *, void *);
NvU32 NV_API_CALL nv_get_os_type(void);
@ -916,11 +927,11 @@ NvBool NV_API_CALL rm_is_supported_pci_device(NvU8 pci_class,
void NV_API_CALL rm_i2c_remove_adapters (nvidia_stack_t *, nv_state_t *);
NvBool NV_API_CALL rm_i2c_is_smbus_capable (nvidia_stack_t *, nv_state_t *, void *);
NV_STATUS NV_API_CALL rm_i2c_transfer (nvidia_stack_t *, nv_state_t *, void *, NvU8, NvU8, NvU8, NvU32, NvU8 *);
NV_STATUS NV_API_CALL rm_i2c_transfer (nvidia_stack_t *, nv_state_t *, void *, nv_i2c_cmd_t, NvU8, NvU8, NvU32, NvU8 *);
NV_STATUS NV_API_CALL rm_perform_version_check (nvidia_stack_t *, void *, NvU32);
NV_STATUS NV_API_CALL rm_system_event (nvidia_stack_t *, NvU32, NvU32);
void NV_API_CALL rm_power_source_change_event (nvidia_stack_t *, NvU32);
void NV_API_CALL rm_disable_gpu_state_persistence (nvidia_stack_t *sp, nv_state_t *);
NV_STATUS NV_API_CALL rm_p2p_init_mapping (nvidia_stack_t *, NvU64, NvU64 *, NvU64 *, NvU64 *, NvU64 *, NvU64, NvU64, NvU64, NvU64, void (*)(void *), void *);
@ -944,6 +955,7 @@ void NV_API_CALL rm_kernel_rmapi_op(nvidia_stack_t *sp, void *ops_cmd);
NvBool NV_API_CALL rm_get_device_remove_flag(nvidia_stack_t *sp, NvU32 gpu_id);
NV_STATUS NV_API_CALL rm_gpu_copy_mmu_faults(nvidia_stack_t *, nv_state_t *, NvU32 *);
NV_STATUS NV_API_CALL rm_gpu_copy_mmu_faults_unlocked(nvidia_stack_t *, nv_state_t *, NvU32 *);
NV_STATUS NV_API_CALL rm_gpu_handle_mmu_faults(nvidia_stack_t *, nv_state_t *, NvU32 *);
NvBool NV_API_CALL rm_gpu_need_4k_page_isolation(nv_state_t *);
NvBool NV_API_CALL rm_is_chipset_io_coherent(nv_stack_t *);
NvBool NV_API_CALL rm_init_event_locks(nvidia_stack_t *, nv_state_t *);
@ -969,12 +981,13 @@ const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *,
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
void NV_API_CALL rm_acpi_notify(nvidia_stack_t *, nv_state_t *, NvU32);
NV_STATUS NV_API_CALL rm_get_clientnvpcf_power_limits(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *);
NvBool NV_API_CALL rm_is_altstack_in_use(void);
/* vGPU VFIO specific functions */
NV_STATUS NV_API_CALL nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU32, NvU16 *, NvU32, NvBool *);
NV_STATUS NV_API_CALL nv_vgpu_delete(nvidia_stack_t *, const NvU8 *, NvU16);
NV_STATUS NV_API_CALL nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 **, NvBool);
NV_STATUS NV_API_CALL nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *, NvBool, NvU8, NvBool);
NV_STATUS NV_API_CALL nv_vgpu_get_type_info(nvidia_stack_t *, nv_state_t *, NvU32, char *, int, NvU8);
NV_STATUS NV_API_CALL nv_vgpu_get_bar_info(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU64 *, NvU32, void *);
NV_STATUS NV_API_CALL nv_vgpu_start(nvidia_stack_t *, const NvU8 *, void *, NvS32 *, NvU8 *, NvU32);
@ -987,6 +1000,10 @@ NV_STATUS NV_API_CALL nv_get_usermap_access_params(nv_state_t*, nv_usermap_acces
nv_soc_irq_type_t NV_API_CALL nv_get_current_irq_type(nv_state_t*);
void NV_API_CALL nv_flush_coherent_cpu_cache_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size);
#if defined(NV_VMWARE)
const void* NV_API_CALL rm_get_firmware(nv_firmware_type_t fw_type, const void **fw_buf, NvU32 *fw_size);
#endif
/* Callbacks should occur roughly every 10ms. */
#define NV_SNAPSHOT_TIMER_HZ 100
void NV_API_CALL nv_start_snapshot_timer(void (*snapshot_callback)(void *context));
@ -998,6 +1015,16 @@ static inline const NvU8 *nv_get_cached_uuid(nv_state_t *nv)
return nv->nv_uuid_cache.valid ? nv->nv_uuid_cache.uuid : NULL;
}
/* nanosecond-resolution timer callback structure */
typedef struct nv_nano_timer nv_nano_timer_t;
/* nano timer functions */
void NV_API_CALL nv_create_nano_timer(nv_state_t *, void *pTmrEvent, nv_nano_timer_t **);
void NV_API_CALL nv_start_nano_timer(nv_state_t *nv, nv_nano_timer_t *, NvU64 timens);
NV_STATUS NV_API_CALL rm_run_nano_timer_callback(nvidia_stack_t *, nv_state_t *, void *pTmrEvent);
void NV_API_CALL nv_cancel_nano_timer(nv_state_t *, nv_nano_timer_t *);
void NV_API_CALL nv_destroy_nano_timer(nv_state_t *nv, nv_nano_timer_t *);
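/*
 * Illustrative sketch (not part of this change) of the expected lifecycle of
 * the nanosecond-resolution timer API declared above. 'tmr_event' stands in
 * for an opaque RM timer event, and the delay semantics of 'timens' are
 * assumed to be "nanoseconds until expiry".
 */
static inline void nv_example_nano_timer_usage(nv_state_t *nv, void *tmr_event)
{
    nv_nano_timer_t *t = NULL;

    nv_create_nano_timer(nv, tmr_event, &t);
    nv_start_nano_timer(nv, t, 1000000);   /* arm for ~1 ms, expressed in ns */
    /* ...rm_run_nano_timer_callback() runs when the timer fires... */
    nv_cancel_nano_timer(nv, t);
    nv_destroy_nano_timer(nv, t);
}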
#if defined(NVCPU_X86_64)
static inline NvU64 nv_rdtsc(void)

View File

@ -331,10 +331,14 @@ typedef NV_STATUS (*uvmPmaEvictPagesCallback)(void *callbackData,
NvU64 *pPages,
NvU32 count,
NvU64 physBegin,
NvU64 physEnd);
NvU64 physEnd,
UVM_PMA_GPU_MEMORY_TYPE mem_type);
// Mirrors pmaEvictRangeCb_t, see its documentation in pma.h.
typedef NV_STATUS (*uvmPmaEvictRangeCallback)(void *callbackData, NvU64 physBegin, NvU64 physEnd);
typedef NV_STATUS (*uvmPmaEvictRangeCallback)(void *callbackData,
NvU64 physBegin,
NvU64 physEnd,
UVM_PMA_GPU_MEMORY_TYPE mem_type);
/*******************************************************************************
nvUvmInterfacePmaRegisterEvictionCallbacks

View File

@ -897,6 +897,16 @@ typedef struct UvmGpuAccessCntrConfig_tag
NvU32 threshold;
} UvmGpuAccessCntrConfig;
//
// When modifying this enum, make sure it stays compatible with the mirrored
// MEMORY_PROTECTION enum in phys_mem_allocator.h.
//
typedef enum UvmPmaGpuMemoryType_tag
{
UVM_PMA_GPU_MEMORY_TYPE_UNPROTECTED = 0,
UVM_PMA_GPU_MEMORY_TYPE_PROTECTED = 1
} UVM_PMA_GPU_MEMORY_TYPE;
typedef UvmGpuChannelInfo gpuChannelInfo;
typedef UvmGpuChannelAllocParams gpuChannelAllocParams;
typedef UvmGpuCaps gpuCaps;

View File

@ -150,9 +150,7 @@ typedef struct NvSyncPointFenceRec {
|* *|
\***************************************************************************/
#if !defined(XAPIGEN) /* NvOffset is XAPIGEN builtin type, so skip typedef */
typedef NvU64 NvOffset; /* GPU address */
#endif
#define NvOffset_HI32(n) ((NvU32)(((NvU64)(n)) >> 32))
#define NvOffset_LO32(n) ((NvU32)((NvU64)(n)))

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2014-2015 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2014-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -29,6 +29,7 @@
#include <nvlimits.h>
#define NVKMS_MAX_SUBDEVICES NV_MAX_SUBDEVICES
#define NVKMS_MAX_HEADS_PER_DISP NV_MAX_HEADS
#define NVKMS_LEFT 0
#define NVKMS_RIGHT 1
@ -530,4 +531,78 @@ typedef struct {
NvBool noncoherent;
} NvKmsDispIOCoherencyModes;
enum NvKmsInputColorSpace {
/* Unknown colorspace; no de-gamma will be applied */
NVKMS_INPUT_COLORSPACE_NONE = 0,
/* Linear, Rec.709 [-0.5, 7.5) */
NVKMS_INPUT_COLORSPACE_SCRGB_LINEAR = 1,
/* PQ, Rec.2020 unity */
NVKMS_INPUT_COLORSPACE_BT2100_PQ = 2,
};
enum NvKmsOutputTf {
/*
* NVKMS itself won't apply any OETF (clients are still
* free to provide a custom OLUT)
*/
NVKMS_OUTPUT_TF_NONE = 0,
NVKMS_OUTPUT_TF_TRADITIONAL_GAMMA_SDR = 1,
NVKMS_OUTPUT_TF_PQ = 2,
};
/*!
* HDR Static Metadata Type1 Descriptor as per CEA-861.3 spec.
* This is expected to match exactly with the spec.
*/
struct NvKmsHDRStaticMetadata {
/*!
* Color primaries of the data.
* These are coded as unsigned 16-bit values in units of 0.00002,
* where 0x0000 represents zero and 0xC350 represents 1.0000.
*/
struct {
NvU16 x, y;
} displayPrimaries[3];
/*!
* White point of colorspace data.
* These are coded as unsigned 16-bit values in units of 0.00002,
* where 0x0000 represents zero and 0xC350 represents 1.0000.
*/
struct {
NvU16 x, y;
} whitePoint;
/**
* Maximum mastering display luminance.
* This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
* where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
*/
NvU16 maxDisplayMasteringLuminance;
/*!
* Minimum mastering display luminance.
* This value is coded as an unsigned 16-bit value in units of
* 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF
* represents 6.5535 cd/m2.
*/
NvU16 minDisplayMasteringLuminance;
/*!
* Maximum content light level.
* This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
* where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
*/
NvU16 maxCLL;
/*!
* Maximum frame-average light level.
* This value is coded as an unsigned 16-bit value in units of 1 cd/m2,
* where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2.
*/
NvU16 maxFALL;
};
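/*
 * Illustrative sketch (not part of this change): encoding values into the
 * 0.00002 chromaticity units and luminance units documented above, using
 * BT.709 primaries with a D65 white point and a hypothetical 1000 cd/m2
 * mastering display. For example, 0.3127 / 0.00002 = 15635.
 */
static const struct NvKmsHDRStaticMetadata nvExampleHdrMetadata = {
    .displayPrimaries = {
        { .x = 32000, .y = 16500 },   /* red   (0.640, 0.330) */
        { .x = 15000, .y = 30000 },   /* green (0.300, 0.600) */
        { .x =  7500, .y =  3000 },   /* blue  (0.150, 0.060) */
    },
    .whitePoint = { .x = 15635, .y = 16450 },   /* D65 (0.3127, 0.3290) */
    .maxDisplayMasteringLuminance = 1000,       /* 1000 cd/m2 */
    .minDisplayMasteringLuminance = 50,         /* 0.0050 cd/m2 */
    .maxCLL  = 1000,
    .maxFALL = 400,
};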
#endif /* NVKMS_API_TYPES_H */

View File

@ -86,8 +86,9 @@ enum NvKmsSurfaceMemoryFormat {
NvKmsSurfaceMemoryFormatY12___V12U12_N420 = 32,
NvKmsSurfaceMemoryFormatY8___U8___V8_N444 = 33,
NvKmsSurfaceMemoryFormatY8___U8___V8_N420 = 34,
NvKmsSurfaceMemoryFormatRF16GF16BF16XF16 = 35,
NvKmsSurfaceMemoryFormatMin = NvKmsSurfaceMemoryFormatI8,
NvKmsSurfaceMemoryFormatMax = NvKmsSurfaceMemoryFormatY8___U8___V8_N420,
NvKmsSurfaceMemoryFormatMax = NvKmsSurfaceMemoryFormatRF16GF16BF16XF16,
};
typedef struct NvKmsSurfaceMemoryFormatInfo {

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2015 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2015-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -149,6 +149,7 @@ struct NvKmsKapiDeviceResourcesInfo {
} caps;
NvU64 supportedSurfaceMemoryFormats[NVKMS_KAPI_LAYER_MAX];
NvBool supportsHDR[NVKMS_KAPI_LAYER_MAX];
};
#define NVKMS_KAPI_LAYER_MASK(layerType) (1 << (layerType))
@ -218,6 +219,11 @@ struct NvKmsKapiLayerConfig {
struct NvKmsRRParams rrParams;
struct NvKmsKapiSyncpt syncptParams;
struct NvKmsHDRStaticMetadata hdrMetadata;
NvBool hdrMetadataSpecified;
enum NvKmsOutputTf tf;
NvU8 minPresentInterval;
NvBool tearing;
@ -226,6 +232,8 @@ struct NvKmsKapiLayerConfig {
NvS16 dstX, dstY;
NvU16 dstWidth, dstHeight;
enum NvKmsInputColorSpace inputColorSpace;
};
struct NvKmsKapiLayerRequestedConfig {
@ -277,6 +285,8 @@ struct NvKmsKapiHeadModeSetConfig {
NvKmsKapiDisplay displays[NVKMS_KAPI_MAX_CLONE_DISPLAYS];
struct NvKmsKapiDisplayMode mode;
NvBool vrrEnabled;
};
struct NvKmsKapiHeadRequestedConfig {
@ -368,6 +378,9 @@ struct NvKmsKapiDynamicDisplayParams {
/* [OUT] Connection status */
NvU32 connected;
/* [OUT] VRR status */
NvBool vrrSupported;
/* [IN/OUT] EDID of connected monitor/ Input to override EDID */
struct {
NvU16 bufferSize;
@ -484,6 +497,38 @@ struct NvKmsKapiFunctionsTable {
*/
void (*releaseOwnership)(struct NvKmsKapiDevice *device);
/*!
* Grant modeset permissions for a display to an fd. Only one (dispIndex, head,
* display) tuple is currently supported.
*
* \param [in] fd fd from opening /dev/nvidia-modeset.
*
* \param [in] device A device returned by allocateDevice().
*
* \param [in] head head of display.
*
* \param [in] display The display to grant.
*
* \return NV_TRUE on success, NV_FALSE on failure.
*/
NvBool (*grantPermissions)
(
NvS32 fd,
struct NvKmsKapiDevice *device,
NvU32 head,
NvKmsKapiDisplay display
);
/*!
* Revoke modeset permissions previously granted. This currently applies to all
* previous grant requests for this device.
*
* \param [in] device A device returned by allocateDevice().
*
* \return NV_TRUE on success, NV_FALSE on failure.
*/
NvBool (*revokePermissions)(struct NvKmsKapiDevice *device);
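/*
 * Illustrative sketch (comment only, since this sits inside the functions
 * table): a caller holding a populated table pointer -- assumed here to be
 * named 'nvKms' -- could lease one (head, display) pair to a client fd and
 * later take it back:
 *
 *     if (!nvKms->grantPermissions(fd, device, head, display))
 *         return NV_FALSE;
 *     ...client performs modesets on (head, display) via its fd...
 *     nvKms->revokePermissions(device);
 */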
/*!
* Registers for notification, via
* NvKmsKapiAllocateDeviceParams::eventCallback, of the events specified

View File

@ -234,12 +234,14 @@ extern "C" {
#define DRF_EXTENT(drf) (drf##_HIGH_FIELD)
#define DRF_SHIFT(drf) ((drf##_LOW_FIELD) % 32U)
#define DRF_SHIFT_RT(drf) ((drf##_HIGH_FIELD) % 32U)
#define DRF_SIZE(drf) ((drf##_HIGH_FIELD)-(drf##_LOW_FIELD)+1U)
#define DRF_MASK(drf) (0xFFFFFFFFU >> (31U - ((drf##_HIGH_FIELD) % 32U) + ((drf##_LOW_FIELD) % 32U)))
#else
#define DRF_BASE(drf) (NV_FALSE?drf) // much better
#define DRF_EXTENT(drf) (NV_TRUE?drf) // much better
#define DRF_SHIFT(drf) (((NvU32)DRF_BASE(drf)) % 32U)
#define DRF_SHIFT_RT(drf) (((NvU32)DRF_EXTENT(drf)) % 32U)
#define DRF_SIZE(drf) (DRF_EXTENT(drf)-DRF_BASE(drf)+1U)
#define DRF_MASK(drf) (0xFFFFFFFFU>>(31U - DRF_SHIFT_RT(drf) + DRF_SHIFT(drf)))
#endif
#define DRF_DEF(d,r,f,c) (((NvU32)(NV ## d ## r ## f ## c))<<DRF_SHIFT(NV ## d ## r ## f))
@ -249,12 +251,12 @@ extern "C" {
#define DRF_EXTENT(drf) (1?drf) // much better
#define DRF_SHIFT(drf) ((DRF_ISBIT(0,drf)) % 32)
#define DRF_SHIFT_RT(drf) ((DRF_ISBIT(1,drf)) % 32)
#define DRF_SIZE(drf) (DRF_EXTENT(drf)-DRF_BASE(drf)+1U)
#define DRF_MASK(drf) (0xFFFFFFFFU>>(31-((DRF_ISBIT(1,drf)) % 32)+((DRF_ISBIT(0,drf)) % 32)))
#define DRF_DEF(d,r,f,c) ((NV ## d ## r ## f ## c)<<DRF_SHIFT(NV ## d ## r ## f))
#define DRF_NUM(d,r,f,n) (((n)&DRF_MASK(NV ## d ## r ## f))<<DRF_SHIFT(NV ## d ## r ## f))
#endif
#define DRF_SHIFTMASK(drf) (DRF_MASK(drf)<<(DRF_SHIFT(drf)))
#define DRF_SIZE(drf) (DRF_EXTENT(drf)-DRF_BASE(drf)+1U)
#define DRF_VAL(d,r,f,v) (((v)>>DRF_SHIFT(NV ## d ## r ## f))&DRF_MASK(NV ## d ## r ## f))
#endif
@ -907,6 +909,16 @@ static NV_FORCEINLINE void *NV_NVUPTR_TO_PTR(NvUPtr address)
return uAddr.p;
}
// Get bit at pos (k) from x
#define NV_BIT_GET(k, x) (((x) >> (k)) & 1)
// Get bit at pos (n) from (hi) if >= 64, otherwise from (lo). This is paired with NV_BIT_SET_128 which sets the bit.
#define NV_BIT_GET_128(n, lo, hi) (((n) < 64) ? NV_BIT_GET((n), (lo)) : NV_BIT_GET((n) - 64, (hi)))
//
// Set the bit at pos (b), where (b) < 128. Since (b) can be >= 64, two NvU64 words are needed to hold the full value.
// Use (lo) if (b) is less than 64, and (hi) if >= 64.
//
#define NV_BIT_SET_128(b, lo, hi) { nvAssert( (b) < 128 ); if ( (b) < 64 ) (lo) |= NVBIT64(b); else (hi) |= NVBIT64( b & 0x3F ); }
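//
// Illustrative sketch (not part of this change): exercising the 128-bit bit
// helpers above with a pair of NvU64 words.
//
static NV_FORCEINLINE NvBool nvBit128Example(void)
{
    NvU64 lo = 0, hi = 0;

    NV_BIT_SET_128(3,  lo, hi);   // bit 3 lands in 'lo'
    NV_BIT_SET_128(70, lo, hi);   // bit 70 lands in 'hi' as bit (70 - 64) = 6

    // Bits that were set read back as 1; untouched bits read back as 0.
    return NV_BIT_GET_128(3, lo, hi) &&
           NV_BIT_GET_128(70, lo, hi) &&
           !NV_BIT_GET_128(4, lo, hi);
}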
#ifdef __cplusplus
}
#endif //__cplusplus

View File

@ -24,11 +24,6 @@
#ifndef SDK_NVSTATUS_H
#define SDK_NVSTATUS_H
/* XAPIGEN - this file is not suitable for (nor needed by) xapigen. */
/* Rather than #ifdef out every such include in every sdk */
/* file, punt here. */
#if !defined(XAPIGEN) /* rest of file */
#ifdef __cplusplus
extern "C" {
#endif
@ -125,6 +120,4 @@ const char *nvstatusToString(NV_STATUS nvStatusIn);
}
#endif
#endif // XAPIGEN
#endif /* SDK_NVSTATUS_H */

View File

@ -24,11 +24,6 @@
#ifndef SDK_NVSTATUSCODES_H
#define SDK_NVSTATUSCODES_H
/* XAPIGEN - this file is not suitable for (nor needed by) xapigen. */
/* Rather than #ifdef out every such include in every sdk */
/* file, punt here. */
#if !defined(XAPIGEN) /* rest of file */
NV_STATUS_CODE(NV_OK, 0x00000000, "Success")
NV_STATUS_CODE(NV_ERR_GENERIC, 0x0000FFFF, "Failure: Generic Error")
@ -153,6 +148,7 @@ NV_STATUS_CODE(NV_ERR_NVLINK_CLOCK_ERROR, 0x00000076, "Nvlink Clock
NV_STATUS_CODE(NV_ERR_NVLINK_TRAINING_ERROR, 0x00000077, "Nvlink Training Error")
NV_STATUS_CODE(NV_ERR_NVLINK_CONFIGURATION_ERROR, 0x00000078, "Nvlink Configuration Error")
NV_STATUS_CODE(NV_ERR_RISCV_ERROR, 0x00000079, "Generic RISC-V assert or halt")
NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manager is not loaded")
// Warnings:
NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch")
@ -164,6 +160,4 @@ NV_STATUS_CODE(NV_WARN_NOTHING_TO_DO, 0x00010006, "WARNING Noth
NV_STATUS_CODE(NV_WARN_NULL_OBJECT, 0x00010007, "WARNING NULL object found")
NV_STATUS_CODE(NV_WARN_OUT_OF_RANGE, 0x00010008, "WARNING value out of range")
#endif // XAPIGEN
#endif /* SDK_NVSTATUSCODES_H */

View File

@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 1999-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 1999-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
@ -143,6 +143,14 @@ void NV_API_CALL os_free_semaphore (void *);
NV_STATUS NV_API_CALL os_acquire_semaphore (void *);
NV_STATUS NV_API_CALL os_cond_acquire_semaphore (void *);
NV_STATUS NV_API_CALL os_release_semaphore (void *);
void* NV_API_CALL os_alloc_rwlock (void);
void NV_API_CALL os_free_rwlock (void *);
NV_STATUS NV_API_CALL os_acquire_rwlock_read (void *);
NV_STATUS NV_API_CALL os_acquire_rwlock_write (void *);
NV_STATUS NV_API_CALL os_cond_acquire_rwlock_read (void *);
NV_STATUS NV_API_CALL os_cond_acquire_rwlock_write(void *);
void NV_API_CALL os_release_rwlock_read (void *);
void NV_API_CALL os_release_rwlock_write (void *);
NvBool NV_API_CALL os_semaphore_may_sleep (void);
NV_STATUS NV_API_CALL os_get_version_info (os_version_info*);
NvBool NV_API_CALL os_is_isr (void);

File diff suppressed because it is too large

View File

@ -118,6 +118,11 @@ __nv_drm_detect_encoder(struct NvKmsKapiDynamicDisplayParams *pDetectParams,
return false;
}
#if defined(NV_DRM_CONNECTOR_HAS_VRR_CAPABLE_PROPERTY)
drm_connector_attach_vrr_capable_property(&nv_connector->base);
drm_connector_set_vrr_capable_property(&nv_connector->base, pDetectParams->vrrSupported ? true : false);
#endif
if (pDetectParams->connected) {
if (!pDetectParams->overrideEdid && pDetectParams->edid.bufferSize) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -46,6 +46,35 @@
#include <linux/nvhost.h>
#endif
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
static int
nv_drm_atomic_replace_property_blob_from_id(struct drm_device *dev,
struct drm_property_blob **blob,
uint64_t blob_id,
ssize_t expected_size)
{
struct drm_property_blob *new_blob = NULL;
if (blob_id != 0) {
new_blob = drm_property_lookup_blob(dev, blob_id);
if (new_blob == NULL) {
return -EINVAL;
}
if ((expected_size > 0) &&
(new_blob->length != expected_size)) {
drm_property_blob_put(new_blob);
return -EINVAL;
}
}
drm_property_replace_blob(blob, new_blob);
drm_property_blob_put(new_blob);
return 0;
}
#endif
static void nv_drm_plane_destroy(struct drm_plane *plane)
{
struct nv_drm_plane *nv_plane = to_nv_plane(plane);
@ -84,9 +113,6 @@ cursor_plane_req_config_update(struct drm_plane *plane,
{
struct nv_drm_plane *nv_plane = to_nv_plane(plane);
struct NvKmsKapiCursorRequestedConfig old_config = *req_config;
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state(plane_state);
if (plane_state->fb == NULL) {
cursor_req_config_disable(req_config);
@ -186,7 +212,6 @@ plane_req_config_update(struct drm_plane *plane,
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state(plane_state);
int ret = 0;
if (plane_state->fb == NULL) {
plane_req_config_disable(req_config);
@ -309,6 +334,9 @@ plane_req_config_update(struct drm_plane *plane,
nv_plane->defaultCompositionMode;
#endif
req_config->config.inputColorSpace =
nv_drm_plane_state->input_colorspace;
req_config->config.syncptParams.preSyncptSpecified = false;
req_config->config.syncptParams.postSyncptRequested = false;
@ -320,10 +348,10 @@ plane_req_config_update(struct drm_plane *plane,
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
#if defined(NV_NVHOST_DMA_FENCE_UNPACK_PRESENT)
if (plane_state->fence != NULL) {
ret = nvhost_dma_fence_unpack(
plane_state->fence,
&req_config->config.syncptParams.preSyncptId,
&req_config->config.syncptParams.preSyncptValue);
int ret = nvhost_dma_fence_unpack(
plane_state->fence,
&req_config->config.syncptParams.preSyncptId,
&req_config->config.syncptParams.preSyncptValue);
if (ret != 0) {
return ret;
}
@ -339,6 +367,60 @@ plane_req_config_update(struct drm_plane *plane,
#endif
}
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
if (nv_drm_plane_state->hdr_output_metadata != NULL) {
struct hdr_output_metadata *hdr_metadata =
nv_drm_plane_state->hdr_output_metadata->data;
struct hdr_metadata_infoframe *info_frame =
&hdr_metadata->hdmi_metadata_type1;
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
uint32_t i;
if (hdr_metadata->metadata_type != HDMI_STATIC_METADATA_TYPE1) {
NV_DRM_DEV_LOG_ERR(nv_dev, "Unsupported Metadata Type");
return -1;
}
for (i = 0; i < ARRAY_SIZE(info_frame->display_primaries); i ++) {
req_config->config.hdrMetadata.displayPrimaries[i].x =
info_frame->display_primaries[i].x;
req_config->config.hdrMetadata.displayPrimaries[i].y =
info_frame->display_primaries[i].y;
}
req_config->config.hdrMetadata.whitePoint.x =
info_frame->white_point.x;
req_config->config.hdrMetadata.whitePoint.y =
info_frame->white_point.y;
req_config->config.hdrMetadata.maxDisplayMasteringLuminance =
info_frame->max_display_mastering_luminance;
req_config->config.hdrMetadata.minDisplayMasteringLuminance =
info_frame->min_display_mastering_luminance;
req_config->config.hdrMetadata.maxCLL =
info_frame->max_cll;
req_config->config.hdrMetadata.maxFALL =
info_frame->max_fall;
req_config->config.hdrMetadataSpecified = true;
switch (info_frame->eotf) {
case HDMI_EOTF_SMPTE_ST2084:
req_config->config.tf = NVKMS_OUTPUT_TF_PQ;
break;
case HDMI_EOTF_TRADITIONAL_GAMMA_SDR:
req_config->config.tf =
NVKMS_OUTPUT_TF_TRADITIONAL_GAMMA_SDR;
break;
default:
NV_DRM_DEV_LOG_ERR(nv_dev, "Unsupported EOTF");
return -1;
}
} else {
req_config->config.hdrMetadataSpecified = false;
req_config->config.tf = NVKMS_OUTPUT_TF_NONE;
}
#endif
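For reference, a minimal userspace sketch (not part of this change) of how a DRM client could supply the blob consumed above. It assumes libdrm is available, that `fd` and `plane_id` are valid, and that `prop_id` has already been looked up by name against the `NV_HDR_STATIC_METADATA` plane property created later in this change; the helper name and the luminance values are illustrative only.
/*
 * Illustrative userspace sketch, not driver code. struct hdr_output_metadata
 * comes from the DRM uapi headers (drm_mode.h), pulled in via xf86drm.h.
 */
#include <string.h>
#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>
static int set_plane_hdr_metadata(int fd, uint32_t plane_id, uint32_t prop_id)
{
    struct hdr_output_metadata meta;
    uint32_t blob_id = 0;
    int ret;
    memset(&meta, 0, sizeof(meta));
    meta.metadata_type = 0;                        /* HDMI_STATIC_METADATA_TYPE1 */
    meta.hdmi_metadata_type1.eotf = 2;             /* HDMI_EOTF_SMPTE_ST2084 (PQ) */
    meta.hdmi_metadata_type1.max_display_mastering_luminance = 1000; /* cd/m^2 */
    meta.hdmi_metadata_type1.min_display_mastering_luminance = 1;    /* 0.0001 cd/m^2 units */
    meta.hdmi_metadata_type1.max_cll = 1000;
    meta.hdmi_metadata_type1.max_fall = 400;
    /* The driver copies the blob and checks its size against sizeof(struct hdr_output_metadata). */
    ret = drmModeCreatePropertyBlob(fd, &meta, sizeof(meta), &blob_id);
    if (ret != 0)
        return ret;
    return drmModeObjectSetProperty(fd, plane_id, DRM_MODE_OBJECT_PLANE,
                                    prop_id, blob_id);
}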
/*
* Unconditionally mark the surface as changed, even if nothing changed,
* so that we always get a flip event: a DRM client may flip with
@ -509,9 +591,21 @@ static int nv_drm_plane_atomic_set_property(
nv_drm_plane_state->fd_user_ptr = u64_to_user_ptr(val);
#endif
return 0;
} else {
return -EINVAL;
} else if (property == nv_dev->nv_input_colorspace_property) {
nv_drm_plane_state->input_colorspace = val;
return 0;
}
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
else if (property == nv_dev->nv_hdr_output_metadata_property) {
return nv_drm_atomic_replace_property_blob_from_id(
nv_dev->dev,
&nv_drm_plane_state->hdr_output_metadata,
val,
sizeof(struct hdr_output_metadata));
}
#endif
return -EINVAL;
}
static int nv_drm_plane_atomic_get_property(
@ -521,12 +615,26 @@ static int nv_drm_plane_atomic_get_property(
uint64_t *val)
{
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
const struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state_const(state);
if (property == nv_dev->nv_out_fence_property) {
return 0;
} else {
return -EINVAL;
} else if (property == nv_dev->nv_input_colorspace_property) {
*val = nv_drm_plane_state->input_colorspace;
return 0;
}
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
else if (property == nv_dev->nv_hdr_output_metadata_property) {
const struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state_const(state);
*val = nv_drm_plane_state->hdr_output_metadata ?
nv_drm_plane_state->hdr_output_metadata->base.id : 0;
return 0;
}
#endif
return -EINVAL;
}
static struct drm_plane_state *
@ -544,6 +652,14 @@ nv_drm_plane_atomic_duplicate_state(struct drm_plane *plane)
__drm_atomic_helper_plane_duplicate_state(plane, &nv_plane_state->base);
nv_plane_state->fd_user_ptr = nv_old_plane_state->fd_user_ptr;
nv_plane_state->input_colorspace = nv_old_plane_state->input_colorspace;
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
nv_plane_state->hdr_output_metadata = nv_old_plane_state->hdr_output_metadata;
if (nv_plane_state->hdr_output_metadata) {
drm_property_blob_get(nv_plane_state->hdr_output_metadata);
}
#endif
return &nv_plane_state->base;
}
@ -557,6 +673,12 @@ static inline void __nv_drm_plane_atomic_destroy_state(
#else
__drm_atomic_helper_plane_destroy_state(state);
#endif
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
struct nv_drm_plane_state *nv_drm_plane_state =
to_nv_drm_plane_state(state);
drm_property_blob_put(nv_drm_plane_state->hdr_output_metadata);
#endif
}
static void nv_drm_plane_atomic_destroy_state(
@ -803,7 +925,8 @@ static const struct drm_crtc_helper_funcs nv_crtc_helper_funcs = {
};
static void nv_drm_plane_install_properties(
struct drm_plane *plane)
struct drm_plane *plane,
NvBool supportsHDR)
{
struct nv_drm_device *nv_dev = to_nv_device(plane->dev);
@ -811,6 +934,19 @@ static void nv_drm_plane_install_properties(
drm_object_attach_property(
&plane->base, nv_dev->nv_out_fence_property, 0);
}
if (nv_dev->nv_input_colorspace_property) {
drm_object_attach_property(
&plane->base, nv_dev->nv_input_colorspace_property,
NVKMS_INPUT_COLORSPACE_NONE);
}
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
if (supportsHDR && nv_dev->nv_hdr_output_metadata_property) {
drm_object_attach_property(
&plane->base, nv_dev->nv_hdr_output_metadata_property, 0);
}
#endif
}
static void
@ -990,7 +1126,9 @@ nv_drm_plane_create(struct drm_device *dev,
drm_plane_helper_add(plane, &nv_plane_helper_funcs);
if (plane_type != DRM_PLANE_TYPE_CURSOR) {
nv_drm_plane_install_properties(plane);
nv_drm_plane_install_properties(
plane,
pResInfo->supportsHDR[layer_idx]);
}
__nv_drm_plane_create_alpha_blending_properties(
@ -1141,11 +1279,13 @@ void nv_drm_enumerate_crtcs_and_planes(
}
for (layer = 0; layer < pResInfo->numLayers[i]; layer++) {
struct drm_plane *overlay_plane = NULL;
if (layer == NVKMS_KAPI_LAYER_PRIMARY_IDX) {
continue;
}
struct drm_plane *overlay_plane =
overlay_plane =
nv_drm_plane_create(nv_dev->dev,
DRM_PLANE_TYPE_OVERLAY,
layer,

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -205,6 +205,10 @@ static inline struct nv_drm_plane *to_nv_plane(struct drm_plane *plane)
struct nv_drm_plane_state {
struct drm_plane_state base;
s32 __user *fd_user_ptr;
enum NvKmsInputColorSpace input_colorspace;
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
struct drm_property_blob *hdr_output_metadata;
#endif
};
static inline struct nv_drm_plane_state *to_nv_drm_plane_state(struct drm_plane_state *state)
@ -212,6 +216,11 @@ static inline struct nv_drm_plane_state *to_nv_drm_plane_state(struct drm_plane_
return container_of(state, struct nv_drm_plane_state, base);
}
static inline const struct nv_drm_plane_state *to_nv_drm_plane_state_const(const struct drm_plane_state *state)
{
return container_of(state, const struct nv_drm_plane_state, base);
}
static inline struct nv_drm_crtc *to_nv_crtc(struct drm_crtc *crtc)
{
if (crtc == NULL) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -86,6 +86,23 @@
static struct nv_drm_device *dev_list = NULL;
static const char* nv_get_input_colorspace_name(
enum NvKmsInputColorSpace colorSpace)
{
switch (colorSpace) {
case NVKMS_INPUT_COLORSPACE_NONE:
return "None";
case NVKMS_INPUT_COLORSPACE_SCRGB_LINEAR:
return "IEC 61966-2-2 linear FP";
case NVKMS_INPUT_COLORSPACE_BT2100_PQ:
return "ITU-R BT.2100-PQ YCbCr";
default:
/* We shouldn't hit this */
WARN_ON("Unsupported input colorspace");
return "None";
}
};
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
static void nv_drm_output_poll_changed(struct drm_device *dev)
@ -332,6 +349,15 @@ static void nv_drm_enumerate_encoders_and_connectors
*/
static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
{
struct drm_prop_enum_list enum_list[3] = { };
int i, len = 0;
for (i = 0; i < 3; i++) {
enum_list[len].type = i;
enum_list[len].name = nv_get_input_colorspace_name(i);
len++;
}
#if defined(NV_LINUX_NVHOST_H_PRESENT) && defined(CONFIG_TEGRA_GRHOST)
if (!nv_dev->supportsSyncpts) {
return 0;
@ -345,6 +371,23 @@ static int nv_drm_create_properties(struct nv_drm_device *nv_dev)
}
#endif
nv_dev->nv_input_colorspace_property =
drm_property_create_enum(nv_dev->dev, 0, "NV_INPUT_COLORSPACE",
enum_list, len);
if (nv_dev->nv_input_colorspace_property == NULL) {
NV_DRM_LOG_ERR("Failed to create NV_INPUT_COLORSPACE property");
return -ENOMEM;
}
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
nv_dev->nv_hdr_output_metadata_property =
drm_property_create(nv_dev->dev, DRM_MODE_PROP_BLOB,
"NV_HDR_STATIC_METADATA", 0);
if (nv_dev->nv_hdr_output_metadata_property == NULL) {
return -ENOMEM;
}
#endif
return 0;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -40,9 +40,16 @@ static const u32 nvkms_to_drm_format[] = {
[NvKmsSurfaceMemoryFormatR5G6B5] = DRM_FORMAT_RGB565,
[NvKmsSurfaceMemoryFormatA8R8G8B8] = DRM_FORMAT_ARGB8888,
[NvKmsSurfaceMemoryFormatX8R8G8B8] = DRM_FORMAT_XRGB8888,
[NvKmsSurfaceMemoryFormatX8B8G8R8] = DRM_FORMAT_XBGR8888,
[NvKmsSurfaceMemoryFormatA2B10G10R10] = DRM_FORMAT_ABGR2101010,
[NvKmsSurfaceMemoryFormatX2B10G10R10] = DRM_FORMAT_XBGR2101010,
[NvKmsSurfaceMemoryFormatA8B8G8R8] = DRM_FORMAT_ABGR8888,
#if defined(DRM_FORMAT_ABGR16161616F)
[NvKmsSurfaceMemoryFormatRF16GF16BF16AF16] = DRM_FORMAT_ABGR16161616F,
#endif
#if defined(DRM_FORMAT_XBGR16161616F)
[NvKmsSurfaceMemoryFormatRF16GF16BF16XF16] = DRM_FORMAT_XBGR16161616F,
#endif
[NvKmsSurfaceMemoryFormatY8_U8__Y8_V8_N422] = DRM_FORMAT_YUYV,
[NvKmsSurfaceMemoryFormatU8_Y8__V8_Y8_N422] = DRM_FORMAT_UYVY,

View File

@ -113,7 +113,6 @@ static vm_fault_t __nv_drm_gem_user_memory_handle_vma_fault(
page_offset = vmf->pgoff - drm_vma_node_start(&gem->vma_node);
BUG_ON(page_offset > nv_user_memory->pages_count);
ret = vm_insert_page(vma, address, nv_user_memory->pages[page_offset]);
switch (ret) {
case 0:

View File

@ -93,8 +93,6 @@ int nv_drm_lock_user_pages(unsigned long address,
{
struct mm_struct *mm = current->mm;
struct page **user_pages;
const int write = 1;
const int force = 0;
int pages_pinned;
user_pages = nv_drm_calloc(pages_count, sizeof(*user_pages));
@ -105,7 +103,7 @@ int nv_drm_lock_user_pages(unsigned long address,
nv_mmap_read_lock(mm);
pages_pinned = NV_GET_USER_PAGES(address, pages_count, write, force,
pages_pinned = NV_PIN_USER_PAGES(address, pages_count, FOLL_WRITE,
user_pages, NULL);
nv_mmap_read_unlock(mm);
@ -123,7 +121,7 @@ failed:
int i;
for (i = 0; i < pages_pinned; i++) {
put_page(user_pages[i]);
NV_UNPIN_USER_PAGE(user_pages[i]);
}
}
@ -138,8 +136,7 @@ void nv_drm_unlock_user_pages(unsigned long pages_count, struct page **pages)
for (i = 0; i < pages_count; i++) {
set_page_dirty_lock(pages[i]);
put_page(pages[i]);
NV_UNPIN_USER_PAGE(pages[i]);
}
nv_drm_free(pages);
@ -174,12 +171,7 @@ static void __exit nv_linux_drm_exit(void)
module_init(nv_linux_drm_init);
module_exit(nv_linux_drm_exit);
#if defined(MODULE_LICENSE)
MODULE_LICENSE("Dual MIT/GPL");
#endif
#if defined(MODULE_INFO)
MODULE_INFO(supported, "external");
#endif
#if defined(MODULE_VERSION)
MODULE_VERSION(NV_VERSION_STRING);
#endif
MODULE_INFO(supported, "external");
MODULE_VERSION(NV_VERSION_STRING);

View File

@ -93,9 +93,6 @@ static bool __will_generate_flip_event(struct drm_crtc *crtc,
to_nv_crtc_state(new_crtc_state);
struct drm_plane_state *old_plane_state = NULL;
struct drm_plane *plane = NULL;
struct drm_plane *primary_plane = crtc->primary;
bool primary_event = false;
bool overlay_event = false;
int i;
if (!old_crtc_state->active && !new_crtc_state->active) {
@ -274,6 +271,9 @@ nv_drm_atomic_apply_modeset_config(struct drm_device *dev,
nv_new_crtc_state->nv_flip = NULL;
}
#if defined(NV_DRM_CRTC_STATE_HAS_VRR_ENABLED)
requested_config->headRequestedConfig[nv_crtc->head].modeSetConfig.vrrEnabled = new_crtc_state->vrr_enabled;
#endif
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -122,6 +122,11 @@ struct nv_drm_device {
NvBool supportsSyncpts;
struct drm_property *nv_out_fence_property;
struct drm_property *nv_input_colorspace_property;
#if defined(NV_DRM_HAS_HDR_OUTPUT_METADATA)
struct drm_property *nv_hdr_output_metadata_property;
#endif
struct nv_drm_device *next;
};

View File

@ -59,11 +59,14 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_dev_unref
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_reinit_primary_mode_group
NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_user_pages_remote
NV_CONFTEST_FUNCTION_COMPILE_TESTS += get_user_pages
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pin_user_pages_remote
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pin_user_pages
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_object_lookup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_atomic_state_ref_counting
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_driver_has_gem_prime_res_obj
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_atomic_helper_connector_dpms
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_connector_funcs_have_mode_in_name
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_connector_has_vrr_capable_property
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_framebuffer_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_gem_object_get
@ -100,6 +103,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_gem_object_has_resv
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_crtc_state_has_async_flip
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_crtc_state_has_pageflip_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_crtc_state_has_vrr_enabled
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_format_modifiers_present
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_vma_node_is_allowed_has_tag_arg
@ -115,6 +119,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += drm_plane_atomic_check_has_atomic_state_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_device_has_pdev
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_crtc_state_has_no_vblank
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_config_has_allow_fb_modifiers
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_has_hdr_output_metadata
NV_CONFTEST_TYPE_COMPILE_TESTS += dma_resv_add_fence
NV_CONFTEST_TYPE_COMPILE_TESTS += dma_resv_reserve_fences
NV_CONFTEST_TYPE_COMPILE_TESTS += reservation_object_reserve_shared_has_num_fences_arg

View File

@ -169,7 +169,6 @@ void nv_kthread_q_stop(nv_kthread_q_t *q)
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
#if NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
nv_kthread_q_t *q,
int preferred_node,
@ -217,7 +216,6 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
return thread[i];
}
#endif
int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
@ -231,11 +229,7 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
q->q_kthread = kthread_create(_main_loop, q, q_name);
}
else {
#if NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1
q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
#else
return -ENOTSUPP;
#endif
}
if (IS_ERR(q->q_kthread)) {

View File

@ -35,6 +35,8 @@
#include <linux/list.h>
#include <linux/rwsem.h>
#include <acpi/video.h>
#include "nvstatus.h"
#include "nv-register-module.h"
@ -956,6 +958,12 @@ nvkms_register_backlight(NvU32 gpu_id, NvU32 display_id, void *drv_priv,
struct nvkms_backlight_device *nvkms_bd = NULL;
int i;
#if defined(NV_ACPI_VIDEO_BACKLIGHT_USE_NATIVE)
if (!acpi_video_backlight_use_native()) {
return NULL;
}
#endif
gpu_info = nvkms_alloc(NV_MAX_GPUS * sizeof(*gpu_info), NV_TRUE);
if (gpu_info == NULL) {
return NULL;
@ -1346,29 +1354,7 @@ static void nvkms_proc_exit(void)
return;
}
#if defined(NV_PROC_REMOVE_PRESENT)
proc_remove(nvkms_proc_dir);
#else
/*
* On kernel versions without proc_remove(), we need to explicitly
* remove each proc file beneath nvkms_proc_dir.
* nvkms_proc_init() only creates files directly under
* nvkms_proc_dir, so those are the only files we need to remove
* here: warn if there is any deeper directory nesting.
*/
{
struct proc_dir_entry *entry = nvkms_proc_dir->subdir;
while (entry != NULL) {
struct proc_dir_entry *next = entry->next;
WARN_ON(entry->subdir != NULL);
remove_proc_entry(entry->name, entry->parent);
entry = next;
}
}
remove_proc_entry(nvkms_proc_dir->name, nvkms_proc_dir->parent);
#endif /* NV_PROC_REMOVE_PRESENT */
#endif /* CONFIG_PROC_FS */
}
@ -1630,12 +1616,7 @@ restart:
module_init(nvkms_init);
module_exit(nvkms_exit);
#if defined(MODULE_LICENSE)
MODULE_LICENSE("Dual MIT/GPL");
#endif
#if defined(MODULE_INFO)
MODULE_INFO(supported, "external");
#endif
#if defined(MODULE_VERSION)
MODULE_VERSION(NV_VERSION_STRING);
#endif
MODULE_INFO(supported, "external");
MODULE_VERSION(NV_VERSION_STRING);

View File

@ -85,15 +85,11 @@ $(obj)/$(NVIDIA_MODESET_INTERFACE): $(addprefix $(obj)/,$(NVIDIA_MODESET_OBJECTS
NV_OBJECTS_DEPEND_ON_CONFTEST += $(NVIDIA_MODESET_OBJECTS)
NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations
NV_CONFTEST_TYPE_COMPILE_TESTS += node_states_n_memory
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pde_data
NV_CONFTEST_FUNCTION_COMPILE_TESTS += proc_remove
NV_CONFTEST_FUNCTION_COMPILE_TESTS += timer_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kthread_create_on_node
NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_real_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_kthread_create_on_node
NV_CONFTEST_FUNCTION_COMPILE_TESTS += acpi_video_backlight_use_native

View File

@ -30,8 +30,18 @@ NVIDIA_PEERMEM_CFLAGS += -UDEBUG -U_DEBUG -DNDEBUG -DNV_BUILD_MODULE_INSTANCES=0
# MOFED's Module.symvers is needed for the build
# to find the additional ib_* symbols.
#
# Also, MOFED doesn't use kbuild ARCH names.
# So adapt OFA_ARCH to match MOFED's conventions.
#
ifeq ($(ARCH), arm64)
OFA_ARCH := aarch64
else ifeq ($(ARCH), powerpc)
OFA_ARCH := ppc64le
else
OFA_ARCH := $(ARCH)
endif
OFA_DIR := /usr/src/ofa_kernel
OFA_CANDIDATES = $(OFA_DIR)/$(ARCH)/$(KERNELRELEASE) $(OFA_DIR)/$(KERNELRELEASE) $(OFA_DIR)/default /var/lib/dkms/mlnx-ofed-kernel
OFA_CANDIDATES = $(OFA_DIR)/$(OFA_ARCH)/$(KERNELRELEASE) $(OFA_DIR)/$(KERNELRELEASE) $(OFA_DIR)/default /var/lib/dkms/mlnx-ofed-kernel
MLNX_OFED_KERNEL := $(shell for d in $(OFA_CANDIDATES); do \
if [ -d "$$d" ]; then \
echo "$$d"; \

View File

@ -481,16 +481,6 @@ static int _check_cpu_affinity_test(void)
int result, node;
nv_kthread_q_t local_q;
// If the API does not support CPU affinity, check whether the correct
// error code is returned.
// Non-affinitized queue allocation has been verified by a previous test,
// so just ensure that the affinitized version also works.
if (!NV_KTHREAD_Q_SUPPORTS_AFFINITY()) {
result = nv_kthread_q_init_on_node(&local_q, "should_fail", 0);
TEST_CHECK_RET(result == -ENOTSUPP);
return 0;
}
for_each_online_node(node) {
unsigned i;
const unsigned max_i = 100;

View File

@ -169,7 +169,6 @@ void nv_kthread_q_stop(nv_kthread_q_t *q)
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
#if NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
nv_kthread_q_t *q,
int preferred_node,
@ -217,7 +216,6 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
return thread[i];
}
#endif
int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
@ -231,11 +229,7 @@ int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferr
q->q_kthread = kthread_create(_main_loop, q, q_name);
}
else {
#if NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1
q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
#else
return -ENOTSUPP;
#endif
}
if (IS_ERR(q->q_kthread)) {

View File

@ -67,17 +67,11 @@ endif
NV_OBJECTS_DEPEND_ON_CONFTEST += $(NVIDIA_UVM_OBJECTS)
NV_CONFTEST_FUNCTION_COMPILE_TESTS += address_space_init_once
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vzalloc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += wait_on_bit_lock_argument_count
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pde_data
NV_CONFTEST_FUNCTION_COMPILE_TESTS += proc_remove
NV_CONFTEST_FUNCTION_COMPILE_TESTS += bitmap_clear
NV_CONFTEST_FUNCTION_COMPILE_TESTS += usleep_range
NV_CONFTEST_FUNCTION_COMPILE_TESTS += radix_tree_empty
NV_CONFTEST_FUNCTION_COMPILE_TESTS += radix_tree_replace_slot
NV_CONFTEST_FUNCTION_COMPILE_TESTS += pnv_npu2_init_context
NV_CONFTEST_FUNCTION_COMPILE_TESTS += kthread_create_on_node
NV_CONFTEST_FUNCTION_COMPILE_TESTS += vmf_insert_pfn
NV_CONFTEST_FUNCTION_COMPILE_TESTS += cpumask_of_node
NV_CONFTEST_FUNCTION_COMPILE_TESTS += list_is_first
@ -88,17 +82,16 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations
NV_CONFTEST_TYPE_COMPILE_TESTS += kuid_t
NV_CONFTEST_TYPE_COMPILE_TESTS += address_space
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
NV_CONFTEST_TYPE_COMPILE_TESTS += get_user_pages_remote
NV_CONFTEST_TYPE_COMPILE_TESTS += get_user_pages
NV_CONFTEST_TYPE_COMPILE_TESTS += pin_user_pages_remote
NV_CONFTEST_TYPE_COMPILE_TESTS += pin_user_pages
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_has_address
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_ops_fault_removed_vma_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += node_states_n_memory
NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -41,73 +41,6 @@
static dev_t g_uvm_base_dev;
static struct cdev g_uvm_cdev;
// List of fault service contexts for CPU faults
static LIST_HEAD(g_cpu_service_block_context_list);
static uvm_spinlock_t g_cpu_service_block_context_list_lock;
NV_STATUS uvm_service_block_context_init(void)
{
unsigned num_preallocated_contexts = 4;
uvm_spin_lock_init(&g_cpu_service_block_context_list_lock, UVM_LOCK_ORDER_LEAF);
// Pre-allocate some fault service contexts for the CPU and add them to the global list
while (num_preallocated_contexts-- > 0) {
uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context));
if (!service_context)
return NV_ERR_NO_MEMORY;
list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
}
return NV_OK;
}
void uvm_service_block_context_exit(void)
{
uvm_service_block_context_t *service_context, *service_context_tmp;
// Free fault service contexts for the CPU and clear the global list
list_for_each_entry_safe(service_context, service_context_tmp, &g_cpu_service_block_context_list,
cpu_fault.service_context_list) {
uvm_kvfree(service_context);
}
INIT_LIST_HEAD(&g_cpu_service_block_context_list);
}
// Get a fault service context from the global list or allocate a new one if there are no
// available entries
static uvm_service_block_context_t *uvm_service_block_context_cpu_alloc(void)
{
uvm_service_block_context_t *service_context;
uvm_spin_lock(&g_cpu_service_block_context_list_lock);
service_context = list_first_entry_or_null(&g_cpu_service_block_context_list, uvm_service_block_context_t,
cpu_fault.service_context_list);
if (service_context)
list_del(&service_context->cpu_fault.service_context_list);
uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
if (!service_context)
service_context = uvm_kvmalloc(sizeof(*service_context));
return service_context;
}
// Put a fault service context in the global list
static void uvm_service_block_context_cpu_free(uvm_service_block_context_t *service_context)
{
uvm_spin_lock(&g_cpu_service_block_context_list_lock);
list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
}
static int uvm_open(struct inode *inode, struct file *filp)
{
NV_STATUS status = uvm_global_get_status();
@ -489,139 +422,10 @@ static void uvm_vm_close_managed_entry(struct vm_area_struct *vma)
static vm_fault_t uvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_va_block_t *va_block;
NvU64 fault_addr = nv_page_fault_va(vmf);
bool is_write = vmf->flags & FAULT_FLAG_WRITE;
NV_STATUS status = uvm_global_get_status();
bool tools_enabled;
bool major_fault = false;
uvm_service_block_context_t *service_context;
uvm_global_processor_mask_t gpus_to_check_for_ecc;
if (status != NV_OK)
goto convert_error;
// TODO: Bug 2583279: Lock tracking is disabled for the power management
// lock in order to suppress reporting of a lock policy violation.
// The violation consists in acquiring the power management lock multiple
// times, and it is manifested as an error during release. The
// re-acquisition of the power management locks happens upon re-entry in the
// UVM module, and it is benign in itself, but when combined with certain
// power management scenarios, it is indicative of a potential deadlock.
// Tracking will be re-enabled once the power management locking strategy is
// modified to avoid deadlocks.
if (!uvm_down_read_trylock_no_tracking(&g_uvm_global.pm.lock)) {
status = NV_ERR_BUSY_RETRY;
goto convert_error;
}
service_context = uvm_service_block_context_cpu_alloc();
if (!service_context) {
status = NV_ERR_NO_MEMORY;
goto unlock;
}
service_context->cpu_fault.wakeup_time_stamp = 0;
// The mmap_lock might be held in write mode, but the mode doesn't matter
// for the purpose of lock ordering and we don't rely on it being in write
// anywhere so just record it as read mode in all cases.
uvm_record_lock_mmap_lock_read(vma->vm_mm);
do {
bool do_sleep = false;
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
NvU64 now = NV_GETTIME();
if (now < service_context->cpu_fault.wakeup_time_stamp)
do_sleep = true;
if (do_sleep)
uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
// Drop the VA space lock while we sleep
uvm_va_space_up_read(va_space);
// usleep_range is preferred because msleep has a 20ms granularity
// and udelay uses a busy-wait loop. usleep_range uses high-resolution
// timers and, by adding a range, the Linux scheduler may coalesce
// our wakeup with others, thus saving some interrupts.
if (do_sleep) {
unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
usleep_range(nap_us, nap_us + nap_us / 2);
}
}
uvm_va_space_down_read(va_space);
if (do_sleep)
uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
if (status != NV_OK) {
UVM_ASSERT_MSG(status == NV_ERR_NO_MEMORY, "status: %s\n", nvstatusToString(status));
break;
}
// Watch out, current->mm might not be vma->vm_mm
UVM_ASSERT(vma == uvm_va_range_vma(va_block->va_range));
// Loop until thrashing goes away.
status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);
} while (status == NV_WARN_MORE_PROCESSING_REQUIRED);
if (status != NV_OK) {
UvmEventFatalReason reason;
reason = uvm_tools_status_to_fatal_fault_reason(status);
UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
uvm_tools_record_cpu_fatal_fault(va_space, fault_addr, is_write, reason);
}
tools_enabled = va_space->tools.enabled;
if (status == NV_OK) {
uvm_va_space_global_gpus_in_mask(va_space,
&gpus_to_check_for_ecc,
&service_context->cpu_fault.gpus_to_check_for_ecc);
uvm_global_mask_retain(&gpus_to_check_for_ecc);
}
uvm_va_space_up_read(va_space);
uvm_record_unlock_mmap_lock_read(vma->vm_mm);
if (status == NV_OK) {
status = uvm_global_mask_check_ecc_error(&gpus_to_check_for_ecc);
uvm_global_mask_release(&gpus_to_check_for_ecc);
}
if (tools_enabled)
uvm_tools_flush_events();
// Major faults involve I/O in order to resolve the fault.
// If any pages were DMA'ed between the GPU and host memory, that makes it a major fault.
// A process can also get statistics for major and minor faults by calling readproc().
major_fault = service_context->cpu_fault.did_migrate;
uvm_service_block_context_cpu_free(service_context);
unlock:
// TODO: Bug 2583279: See the comment above the matching lock acquisition
uvm_up_read_no_tracking(&g_uvm_global.pm.lock);
convert_error:
switch (status) {
case NV_OK:
case NV_ERR_BUSY_RETRY:
return VM_FAULT_NOPAGE | (major_fault ? VM_FAULT_MAJOR : 0);
case NV_ERR_NO_MEMORY:
return VM_FAULT_OOM;
default:
return VM_FAULT_SIGBUS;
}
return uvm_va_space_cpu_fault_managed(va_space, vma, vmf);
}
static vm_fault_t uvm_vm_fault_entry(struct vm_area_struct *vma, struct vm_fault *vmf)
{
UVM_ENTRY_RET(uvm_vm_fault(vma, vmf));
@ -986,8 +790,6 @@ bool uvm_file_is_nvidia_uvm(struct file *filp)
NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_BUFFER_PARAMS *params, struct file *filp)
{
long ret;
int write = 1;
int force = 0;
struct page *page;
NV_STATUS status = NV_OK;
@ -998,7 +800,7 @@ NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_B
// are not used because unload_state_buf may be a managed memory pointer and
// therefore a locking assertion from the CPU fault handler could be fired.
nv_mmap_read_lock(current->mm);
ret = NV_GET_USER_PAGES(params->unload_state_buf, 1, write, force, &page, NULL);
ret = NV_PIN_USER_PAGES(params->unload_state_buf, 1, FOLL_WRITE, &page, NULL);
nv_mmap_read_unlock(current->mm);
if (ret < 0)
@ -1008,7 +810,7 @@ NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_B
uvm_mutex_lock(&g_uvm_global.global_lock);
if (g_uvm_global.unload_state.ptr) {
put_page(page);
NV_UNPIN_USER_PAGE(page);
status = NV_ERR_IN_USE;
goto error;
}
@ -1027,7 +829,7 @@ static void uvm_test_unload_state_exit(void)
{
if (g_uvm_global.unload_state.ptr) {
kunmap(g_uvm_global.unload_state.page);
put_page(g_uvm_global.unload_state.page);
NV_UNPIN_USER_PAGE(g_uvm_global.unload_state.page);
}
}

View File

@ -25,9 +25,62 @@
#include "uvm_ats_faults.h"
#include "uvm_migrate_pageable.h"
// TODO: Bug 2103669: Implement a real prefetching policy and remove or adapt
// these experimental parameters. These are intended to help guide that policy.
static unsigned int uvm_exp_perf_prefetch_ats_order_replayable = 0;
module_param(uvm_exp_perf_prefetch_ats_order_replayable, uint, 0644);
MODULE_PARM_DESC(uvm_exp_perf_prefetch_ats_order_replayable,
"Max order of pages (2^N) to prefetch on replayable ATS faults");
static unsigned int uvm_exp_perf_prefetch_ats_order_non_replayable = 0;
module_param(uvm_exp_perf_prefetch_ats_order_non_replayable, uint, 0644);
MODULE_PARM_DESC(uvm_exp_perf_prefetch_ats_order_non_replayable,
"Max order of pages (2^N) to prefetch on non-replayable ATS faults");
// Expand the fault region to the naturally-aligned region with order given by
// the module parameters, clamped to the vma containing fault_addr (if any).
// Note that this means the region contains fault_addr but may not begin at
// fault_addr.
static void expand_fault_region(struct mm_struct *mm,
NvU64 fault_addr,
uvm_fault_client_type_t client_type,
unsigned long *start,
unsigned long *size)
{
struct vm_area_struct *vma;
unsigned int order;
unsigned long outer, aligned_start, aligned_size;
*start = fault_addr;
*size = PAGE_SIZE;
if (client_type == UVM_FAULT_CLIENT_TYPE_HUB)
order = uvm_exp_perf_prefetch_ats_order_non_replayable;
else
order = uvm_exp_perf_prefetch_ats_order_replayable;
if (order == 0)
return;
vma = find_vma_intersection(mm, fault_addr, fault_addr + 1);
if (!vma)
return;
UVM_ASSERT(order < BITS_PER_LONG - PAGE_SHIFT);
aligned_size = (1UL << order) * PAGE_SIZE;
aligned_start = fault_addr & ~(aligned_size - 1);
*start = max(vma->vm_start, aligned_start);
outer = min(vma->vm_end, aligned_start + aligned_size);
*size = outer - *start;
}
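As a concrete illustration of the clamping above, a standalone sketch (not driver code) that reproduces the arithmetic with assumed values for the fault address, the prefetch order, and the VMA bounds:
/* Standalone sketch of the expand_fault_region arithmetic, with assumed inputs. */
#include <stdio.h>
int main(void)
{
    const unsigned long page_size   = 4096UL;
    const unsigned int  order       = 4;                     /* 2^4 pages = 64 KiB window */
    const unsigned long fault_addr  = 0x700000012000UL;      /* assumed fault address */
    const unsigned long vm_start    = 0x700000000000UL;      /* assumed VMA bounds */
    const unsigned long vm_end      = 0x700000014000UL;
    unsigned long aligned_size  = (1UL << order) * page_size;            /* 0x10000 */
    unsigned long aligned_start = fault_addr & ~(aligned_size - 1);      /* 0x700000010000 */
    unsigned long start = aligned_start > vm_start ? aligned_start : vm_start;
    unsigned long outer = (aligned_start + aligned_size) < vm_end ?
                          (aligned_start + aligned_size) : vm_end;
    /* Prints start=0x700000010000 size=0x4000: the window is clamped at the VMA end. */
    printf("start=0x%lx size=0x%lx\n", start, outer - start);
    return 0;
}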
static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
NvU64 fault_addr,
uvm_fault_access_type_t access_type)
uvm_fault_access_type_t access_type,
uvm_fault_client_type_t client_type)
{
uvm_va_space_t *va_space = gpu_va_space->va_space;
struct mm_struct *mm = va_space->va_space_mm.mm;
@ -66,8 +119,6 @@ static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
{
.va_space = va_space,
.mm = mm,
.start = fault_addr,
.length = PAGE_SIZE,
.dst_id = gpu_va_space->gpu->parent->id,
.dst_node_id = -1,
.populate_permissions = write ? UVM_POPULATE_PERMISSIONS_WRITE : UVM_POPULATE_PERMISSIONS_ANY,
@ -79,6 +130,8 @@ static NV_STATUS uvm_ats_service_fault(uvm_gpu_va_space_t *gpu_va_space,
UVM_ASSERT(uvm_ats_can_service_faults(gpu_va_space, mm));
expand_fault_region(mm, fault_addr, client_type, &uvm_migrate_args.start, &uvm_migrate_args.length);
// TODO: Bug 2103669: Service more than a single fault at a time
//
// We are trying to use migrate_vma API in the kernel (if it exists) to
@ -131,7 +184,10 @@ NV_STATUS uvm_ats_service_fault_entry(uvm_gpu_va_space_t *gpu_va_space,
}
else {
// TODO: Bug 2103669: Service more than a single fault at a time
status = uvm_ats_service_fault(gpu_va_space, current_entry->fault_address, service_access_type);
status = uvm_ats_service_fault(gpu_va_space,
current_entry->fault_address,
service_access_type,
current_entry->fault_source.client_type);
}
// Do not flag prefetch faults as fatal unless something fatal happened
@ -155,7 +211,8 @@ NV_STATUS uvm_ats_service_fault_entry(uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_access_type_mask_test(current_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
status = uvm_ats_service_fault(gpu_va_space,
current_entry->fault_address,
UVM_FAULT_ACCESS_TYPE_READ);
UVM_FAULT_ACCESS_TYPE_READ,
current_entry->fault_source.client_type);
// If read accesses are also invalid, cancel the fault. If a
// different error code is returned, exit

View File

@ -24,6 +24,7 @@
#include "uvm_channel.h"
#include "uvm_api.h"
#include "uvm_common.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_procfs.h"
@ -68,6 +69,38 @@ typedef enum
UVM_CHANNEL_UPDATE_MODE_FORCE_ALL
} uvm_channel_update_mode_t;
static void channel_pool_lock_init(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_mutex_init(&pool->mutex, UVM_LOCK_ORDER_CHANNEL);
else
uvm_spin_lock_init(&pool->spinlock, UVM_LOCK_ORDER_CHANNEL);
}
void uvm_channel_pool_lock(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_mutex_lock(&pool->mutex);
else
uvm_spin_lock(&pool->spinlock);
}
void uvm_channel_pool_unlock(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_mutex_unlock(&pool->mutex);
else
uvm_spin_unlock(&pool->spinlock);
}
void uvm_channel_pool_assert_locked(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_assert_mutex_locked(&pool->mutex);
else
uvm_assert_spinlock_locked(&pool->spinlock);
}
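The same dispatch-on-pool-type idea in a standalone form, shown here with pthreads instead of the UVM lock primitives and with illustrative names; presumably the proxy pool moves to a sleepable mutex so that its pushes, which call into RM via nvUvmInterfacePagingChannelPushStream, no longer run under a spinlock, while every other pool keeps a spinlock.
/* Illustrative sketch only; the UVM code above stores both lock types in a union. */
#include <pthread.h>
#include <stdbool.h>
struct pool {
    bool is_proxy;
    union {
        pthread_spinlock_t spinlock;
        pthread_mutex_t mutex;
    };
};
static void pool_lock_init(struct pool *p)
{
    if (p->is_proxy)
        pthread_mutex_init(&p->mutex, NULL);
    else
        pthread_spin_init(&p->spinlock, PTHREAD_PROCESS_PRIVATE);
}
static void pool_lock(struct pool *p)
{
    if (p->is_proxy)
        pthread_mutex_lock(&p->mutex);
    else
        pthread_spin_lock(&p->spinlock);
}
static void pool_unlock(struct pool *p)
{
    if (p->is_proxy)
        pthread_mutex_unlock(&p->mutex);
    else
        pthread_spin_unlock(&p->spinlock);
}
int main(void)
{
    struct pool proxy = { .is_proxy = true };
    pool_lock_init(&proxy);
    pool_lock(&proxy);
    /* ... work that may sleep is now legal under the proxy pool lock ... */
    pool_unlock(&proxy);
    return 0;
}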
// Update channel progress, completing up to max_to_complete entries
static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
NvU32 max_to_complete,
@ -80,7 +113,7 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
NvU64 completed_value = uvm_channel_update_completed_value(channel);
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
// Completed value should never exceed the queued value
UVM_ASSERT_MSG_RELEASE(completed_value <= channel->tracking_sem.queued_value,
@ -108,7 +141,7 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
channel->gpu_get = gpu_get;
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
if (cpu_put >= gpu_get)
pending_gpfifos = cpu_put - gpu_get;
@ -157,7 +190,7 @@ static bool channel_is_available(uvm_channel_t *channel, NvU32 num_gpfifo_entrie
{
NvU32 pending_entries;
uvm_assert_spinlock_locked(&channel->pool->lock);
uvm_channel_pool_assert_locked(channel->pool);
if (channel->cpu_put >= channel->gpu_get)
pending_entries = channel->cpu_put - channel->gpu_get;
@ -174,14 +207,14 @@ static bool try_claim_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
UVM_ASSERT(num_gpfifo_entries > 0);
UVM_ASSERT(num_gpfifo_entries < channel->num_gpfifo_entries);
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
if (channel_is_available(channel, num_gpfifo_entries)) {
channel->current_gpfifo_count += num_gpfifo_entries;
claimed = true;
}
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
return claimed;
}
@ -248,7 +281,8 @@ static NV_STATUS channel_reserve_in_pool(uvm_channel_pool_t *pool, uvm_channel_t
NV_STATUS uvm_channel_reserve_type(uvm_channel_manager_t *manager, uvm_channel_type_t type, uvm_channel_t **channel_out)
{
UVM_ASSERT(type < UVM_CHANNEL_TYPE_COUNT);
UVM_ASSERT(type < UVM_CHANNEL_TYPE_COUNT);
return channel_reserve_in_pool(manager->pool_to_use.default_for_type[type], channel_out);
}
@ -289,14 +323,14 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
{
uvm_push_info_t *push_info;
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
push_info = list_first_entry_or_null(&channel->available_push_infos, uvm_push_info_t, available_list_node);
UVM_ASSERT(push_info != NULL);
UVM_ASSERT(push_info->on_complete == NULL && push_info->on_complete_data == NULL);
list_del(&push_info->available_list_node);
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
return push_info - channel->push_infos;
}
@ -355,10 +389,6 @@ static void proxy_channel_submit_work(uvm_push_t *push, NvU32 push_size)
UVM_ASSERT(uvm_channel_is_proxy(channel));
// nvUvmInterfacePagingChannelPushStream should not sleep, because a
// spinlock is currently held.
uvm_assert_spinlock_locked(&channel->pool->lock);
status = nvUvmInterfacePagingChannelPushStream(channel->proxy.handle, (char *) push->begin, push_size);
if (status != NV_OK) {
@ -409,7 +439,7 @@ void uvm_channel_end_push(uvm_push_t *push)
NvU32 cpu_put;
NvU32 new_cpu_put;
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
new_tracking_value = ++channel->tracking_sem.queued_value;
new_payload = (NvU32)new_tracking_value;
@ -446,7 +476,7 @@ void uvm_channel_end_push(uvm_push_t *push)
// may notice the GPU work to be completed and hence all state tracking the
// push must be updated before that. Notably uvm_pushbuffer_end_push() has
// to be called first.
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
unlock_push(channel);
// This memory barrier is borrowed from CUDA, as it supposedly fixes perf
@ -470,7 +500,7 @@ static void write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_valu
NvU32 new_cpu_put;
uvm_gpu_t *gpu = channel->pool->manager->gpu;
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
cpu_put = channel->cpu_put;
new_cpu_put = (cpu_put + 1) % channel->num_gpfifo_entries;
@ -505,7 +535,7 @@ static void write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_valu
// The moment the channel is unlocked uvm_channel_update_progress_with_max()
// may notice the GPU work to be completed and hence all state tracking the
// push must be updated before that.
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
unlock_push(channel);
// This memory barrier is borrowed from CUDA, as it supposedly fixes perf
@ -591,12 +621,12 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch
if (pending_count == 0)
return NULL;
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
if (channel->gpu_get != channel->cpu_put)
entry = &channel->gpfifo_entries[channel->gpu_get];
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
return entry;
}
@ -720,9 +750,9 @@ static void channel_destroy(uvm_channel_pool_t *pool, uvm_channel_t *channel)
channel_update_progress_all(channel, UVM_CHANNEL_UPDATE_MODE_FORCE_ALL);
}
uvm_procfs_destroy_entry(channel->procfs.pushes);
uvm_procfs_destroy_entry(channel->procfs.info);
uvm_procfs_destroy_entry(channel->procfs.dir);
proc_remove(channel->procfs.pushes);
proc_remove(channel->procfs.info);
proc_remove(channel->procfs.dir);
uvm_kvfree(channel->push_acquire_infos);
uvm_kvfree(channel->push_infos);
@ -977,7 +1007,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
pool->engine_index = engine_index;
pool->pool_type = pool_type;
uvm_spin_lock_init(&pool->lock, UVM_LOCK_ORDER_CHANNEL);
channel_pool_lock_init(pool);
num_channels = channel_pool_type_num_channels(pool_type);
@ -1482,11 +1512,11 @@ void uvm_channel_manager_destroy(uvm_channel_manager_t *channel_manager)
if (channel_manager == NULL)
return;
uvm_procfs_destroy_entry(channel_manager->procfs.pending_pushes);
proc_remove(channel_manager->procfs.pending_pushes);
channel_manager_destroy_pools(channel_manager);
uvm_procfs_destroy_entry(channel_manager->procfs.channels_dir);
proc_remove(channel_manager->procfs.channels_dir);
uvm_pushbuffer_destroy(channel_manager->pushbuffer);
@ -1583,7 +1613,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
uvm_channel_manager_t *manager = channel->pool->manager;
UVM_SEQ_OR_DBG_PRINT(s, "Channel %s\n", channel->name);
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
UVM_SEQ_OR_DBG_PRINT(s, "completed %llu\n", uvm_channel_update_completed_value(channel));
UVM_SEQ_OR_DBG_PRINT(s, "queued %llu\n", channel->tracking_sem.queued_value);
@ -1595,7 +1625,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
}
static void channel_print_push_acquires(uvm_push_acquire_info_t *push_acquire_info, struct seq_file *seq)
@ -1639,7 +1669,7 @@ static void channel_print_pushes(uvm_channel_t *channel, NvU32 finished_pushes_c
NvU64 completed_value = uvm_channel_update_completed_value(channel);
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
cpu_put = channel->cpu_put;
@ -1687,7 +1717,7 @@ static void channel_print_pushes(uvm_channel_t *channel, NvU32 finished_pushes_c
channel_print_push_acquires(push_acquire_info, seq);
}
}
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
}
void uvm_channel_print_pending_pushes(uvm_channel_t *channel)

View File

@ -163,7 +163,11 @@ typedef struct
uvm_channel_pool_type_t pool_type;
// Lock protecting the state of channels in the pool
uvm_spinlock_t lock;
union {
uvm_spinlock_t spinlock;
uvm_mutex_t mutex;
};
} uvm_channel_pool_t;
struct uvm_channel_struct
@ -309,10 +313,20 @@ struct uvm_channel_manager_struct
// Create a channel manager for the GPU
NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **manager_out);
void uvm_channel_pool_lock(uvm_channel_pool_t *pool);
void uvm_channel_pool_unlock(uvm_channel_pool_t *pool);
void uvm_channel_pool_assert_locked(uvm_channel_pool_t *pool);
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
{
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
return pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY;
}
static bool uvm_channel_is_proxy(uvm_channel_t *channel)
{
UVM_ASSERT(channel->pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
return channel->pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE_PROXY;
return uvm_channel_pool_is_proxy(channel->pool);
}
static bool uvm_channel_is_ce(uvm_channel_t *channel)

View File

@ -747,14 +747,14 @@ static NvU32 get_available_gpfifo_entries(uvm_channel_t *channel)
{
NvU32 pending_entries;
uvm_spin_lock(&channel->pool->lock);
uvm_channel_pool_lock(channel->pool);
if (channel->cpu_put >= channel->gpu_get)
pending_entries = channel->cpu_put - channel->gpu_get;
else
pending_entries = channel->cpu_put + channel->num_gpfifo_entries - channel->gpu_get;
uvm_spin_unlock(&channel->pool->lock);
uvm_channel_pool_unlock(channel->pool);
return channel->num_gpfifo_entries - pending_entries - 1;
}

View File

@ -186,8 +186,7 @@ static void uvm_global_remove_parent_gpu(uvm_parent_gpu_t *parent_gpu)
uvm_assert_mutex_locked(&g_uvm_global.global_lock);
uvm_assert_spinlock_locked(&g_uvm_global.gpu_table_lock);
UVM_ASSERT(g_uvm_global.parent_gpus[gpu_index]);
UVM_ASSERT(g_uvm_global.parent_gpus[gpu_index] == parent_gpu);
UVM_ASSERT(g_uvm_global.parent_gpus[gpu_index] == NULL || g_uvm_global.parent_gpus[gpu_index] == parent_gpu);
g_uvm_global.parent_gpus[gpu_index] = NULL;
}

View File

@ -694,7 +694,7 @@ static NV_STATUS init_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
static void deinit_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu)
{
uvm_procfs_destroy_entry(parent_gpu->procfs.dir);
proc_remove(parent_gpu->procfs.dir);
}
static NV_STATUS init_parent_procfs_files(uvm_parent_gpu_t *parent_gpu)
@ -722,8 +722,8 @@ static NV_STATUS init_parent_procfs_files(uvm_parent_gpu_t *parent_gpu)
static void deinit_parent_procfs_files(uvm_parent_gpu_t *parent_gpu)
{
uvm_procfs_destroy_entry(parent_gpu->procfs.access_counters_file);
uvm_procfs_destroy_entry(parent_gpu->procfs.fault_stats_file);
proc_remove(parent_gpu->procfs.access_counters_file);
proc_remove(parent_gpu->procfs.fault_stats_file);
}
static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
@ -774,9 +774,9 @@ static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu)
// The kernel waits on readers to finish before returning from those calls
static void deinit_procfs_dirs(uvm_gpu_t *gpu)
{
uvm_procfs_destroy_entry(gpu->procfs.dir_peers);
uvm_procfs_destroy_entry(gpu->procfs.dir_symlink);
uvm_procfs_destroy_entry(gpu->procfs.dir);
proc_remove(gpu->procfs.dir_peers);
proc_remove(gpu->procfs.dir_symlink);
proc_remove(gpu->procfs.dir);
}
static NV_STATUS init_procfs_files(uvm_gpu_t *gpu)
@ -790,15 +790,15 @@ static NV_STATUS init_procfs_files(uvm_gpu_t *gpu)
static void deinit_procfs_files(uvm_gpu_t *gpu)
{
uvm_procfs_destroy_entry(gpu->procfs.info_file);
proc_remove(gpu->procfs.info_file);
}
static void deinit_procfs_peer_cap_files(uvm_gpu_peer_t *peer_caps)
{
uvm_procfs_destroy_entry(peer_caps->procfs.peer_symlink_file[0]);
uvm_procfs_destroy_entry(peer_caps->procfs.peer_symlink_file[1]);
uvm_procfs_destroy_entry(peer_caps->procfs.peer_file[0]);
uvm_procfs_destroy_entry(peer_caps->procfs.peer_file[1]);
proc_remove(peer_caps->procfs.peer_symlink_file[0]);
proc_remove(peer_caps->procfs.peer_symlink_file[1]);
proc_remove(peer_caps->procfs.peer_file[0]);
proc_remove(peer_caps->procfs.peer_file[1]);
}
static NV_STATUS init_semaphore_pool(uvm_gpu_t *gpu)
@ -3080,41 +3080,41 @@ void uvm_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_add
atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
}
NV_STATUS uvm_gpu_map_cpu_pages(uvm_gpu_t *gpu, struct page *page, size_t size, NvU64 *dma_address_out)
NV_STATUS uvm_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
{
NvU64 dma_addr;
UVM_ASSERT(PAGE_ALIGNED(size));
dma_addr = dma_map_page(&gpu->parent->pci_dev->dev, page, 0, size, DMA_BIDIRECTIONAL);
if (dma_mapping_error(&gpu->parent->pci_dev->dev, dma_addr))
dma_addr = dma_map_page(&parent_gpu->pci_dev->dev, page, 0, size, DMA_BIDIRECTIONAL);
if (dma_mapping_error(&parent_gpu->pci_dev->dev, dma_addr))
return NV_ERR_OPERATING_SYSTEM;
if (dma_addr < gpu->parent->dma_addressable_start ||
dma_addr + size - 1 > gpu->parent->dma_addressable_limit) {
dma_unmap_page(&gpu->parent->pci_dev->dev, dma_addr, size, DMA_BIDIRECTIONAL);
if (dma_addr < parent_gpu->dma_addressable_start ||
dma_addr + size - 1 > parent_gpu->dma_addressable_limit) {
dma_unmap_page(&parent_gpu->pci_dev->dev, dma_addr, size, DMA_BIDIRECTIONAL);
UVM_ERR_PRINT_RL("PCI mapped range [0x%llx, 0x%llx) not in the addressable range [0x%llx, 0x%llx), GPU %s\n",
dma_addr,
dma_addr + (NvU64)size,
gpu->parent->dma_addressable_start,
gpu->parent->dma_addressable_limit + 1,
uvm_gpu_name(gpu));
parent_gpu->dma_addressable_start,
parent_gpu->dma_addressable_limit + 1,
parent_gpu->name);
return NV_ERR_INVALID_ADDRESS;
}
atomic64_add(size, &gpu->parent->mapped_cpu_pages_size);
*dma_address_out = dma_addr_to_gpu_addr(gpu->parent, dma_addr);
atomic64_add(size, &parent_gpu->mapped_cpu_pages_size);
*dma_address_out = dma_addr_to_gpu_addr(parent_gpu, dma_addr);
return NV_OK;
}
void uvm_gpu_unmap_cpu_pages(uvm_gpu_t *gpu, NvU64 dma_address, size_t size)
void uvm_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size)
{
UVM_ASSERT(PAGE_ALIGNED(size));
dma_address = gpu_addr_to_dma_addr(gpu->parent, dma_address);
dma_unmap_page(&gpu->parent->pci_dev->dev, dma_address, size, DMA_BIDIRECTIONAL);
atomic64_sub(size, &gpu->parent->mapped_cpu_pages_size);
dma_address = gpu_addr_to_dma_addr(parent_gpu, dma_address);
dma_unmap_page(&parent_gpu->pci_dev->dev, dma_address, size, DMA_BIDIRECTIONAL);
atomic64_sub(size, &parent_gpu->mapped_cpu_pages_size);
}
// This function implements the UvmRegisterGpu API call, as described in uvm.h.

View File

@ -44,6 +44,7 @@
#include "uvm_va_block_types.h"
#include "uvm_perf_module.h"
#include "uvm_rb_tree.h"
#include "uvm_perf_prefetch.h"
#include "nv-kthread-q.h"
// Buffer length to store uvm gpu id, RM device name and gpu uuid.
@ -159,6 +160,12 @@ struct uvm_service_block_context_struct
// State used by the VA block routines called by the servicing routine
uvm_va_block_context_t block_context;
// Prefetch state hint
uvm_perf_prefetch_hint_t prefetch_hint;
// Prefetch temporary state.
uvm_perf_prefetch_bitmap_tree_t prefetch_bitmap_tree;
};
struct uvm_fault_service_batch_context_struct
@ -374,6 +381,16 @@ struct uvm_access_counter_service_batch_context_struct
// determine at fetch time that all the access counter notifications in the
// batch report the same instance_ptr
bool is_single_instance_ptr;
// Scratch space, used to generate artificial physically addressed notifications.
// Virtual address notifications are always aligned to 64k. This means up to 16
// different physical locations could have been accessed to trigger one notification.
// The sub-granularity mask can correspond to any of them.
struct {
uvm_processor_id_t resident_processors[16];
uvm_gpu_phys_address_t phys_addresses[16];
uvm_access_counter_buffer_entry_t phys_entry;
} scratch;
} virt;
struct
@ -1309,19 +1326,19 @@ NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu);
//
// Returns the physical address of the pages that can be used to access them on
// the GPU.
NV_STATUS uvm_gpu_map_cpu_pages(uvm_gpu_t *gpu, struct page *page, size_t size, NvU64 *dma_address_out);
NV_STATUS uvm_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out);
// Unmap num_pages pages previously mapped with uvm_gpu_map_cpu_pages().
void uvm_gpu_unmap_cpu_pages(uvm_gpu_t *gpu, NvU64 dma_address, size_t size);
void uvm_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size);
static NV_STATUS uvm_gpu_map_cpu_page(uvm_gpu_t *gpu, struct page *page, NvU64 *dma_address_out)
static NV_STATUS uvm_gpu_map_cpu_page(uvm_parent_gpu_t *parent_gpu, struct page *page, NvU64 *dma_address_out)
{
return uvm_gpu_map_cpu_pages(gpu, page, PAGE_SIZE, dma_address_out);
return uvm_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, dma_address_out);
}
static void uvm_gpu_unmap_cpu_page(uvm_gpu_t *gpu, NvU64 dma_address)
static void uvm_gpu_unmap_cpu_page(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address)
{
uvm_gpu_unmap_cpu_pages(gpu, dma_address, PAGE_SIZE);
uvm_gpu_unmap_cpu_pages(parent_gpu, dma_address, PAGE_SIZE);
}
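A call-pattern sketch of the reworked interface shown above. This is driver-internal code, so it only builds inside the UVM module, and the helper name is hypothetical; it simply pairs the new parent-GPU based map and unmap entry points.
// Hypothetical usage sketch against the signatures above, not part of the change.
static NV_STATUS example_map_then_unmap(uvm_gpu_t *gpu, struct page *page)
{
    NvU64 dma_addr;
    NV_STATUS status = uvm_gpu_map_cpu_page(gpu->parent, page, &dma_addr);
    if (status != NV_OK)
        return status;
    // ... program the GPU to access the page at dma_addr ...
    uvm_gpu_unmap_cpu_page(gpu->parent, dma_addr);
    return NV_OK;
}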
// Allocate and map a page of system DMA memory on the GPU for physical access

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -41,6 +41,10 @@
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_MAX ((1 << 16) - 1)
#define UVM_PERF_ACCESS_COUNTER_THRESHOLD_DEFAULT 256
#define UVM_ACCESS_COUNTER_ACTION_NOTIFY 0x1
#define UVM_ACCESS_COUNTER_ACTION_CLEAR 0x2
#define UVM_ACCESS_COUNTER_ON_MANAGED 0x4
// Each page in a tracked physical range may belong to a different VA Block. We
// preallocate an array of reverse map translations. However, access counter
// granularity can be set to up to 16G, which would require an array too large
@ -934,25 +938,6 @@ static void preprocess_virt_notifications(uvm_gpu_t *gpu,
translate_virt_notifications_instance_ptrs(gpu, batch_context);
}
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
// TODO: Bug 1990466: Service virtual notifications. Entries with NULL
// va_space are simply dropped.
if (uvm_enable_builtin_tests) {
NvU32 i;
preprocess_virt_notifications(gpu, batch_context);
for (i = 0; i < batch_context->virt.num_notifications; ++i) {
const bool on_managed = false;
uvm_tools_broadcast_access_counter(gpu, batch_context->virt.notifications[i], on_managed);
}
}
return NV_OK;
}
// GPA notifications provide a physical address and an aperture. Sort
// accesses by aperture to try to coalesce operations on the same target
// processor.
@ -1046,9 +1031,19 @@ static NV_STATUS service_va_block_locked(uvm_processor_id_t processor,
uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
}
// If the underlying VMA is gone, skip HMM migrations.
if (uvm_va_block_is_hmm(va_block)) {
status = uvm_hmm_find_vma(&service_context->block_context, address);
if (status == NV_ERR_INVALID_ADDRESS)
continue;
UVM_ASSERT(status == NV_OK);
}
service_context->block_context.policy = uvm_va_policy_get(va_block, address);
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
page_index,
processor,
uvm_fault_access_type_mask_bit(UVM_FAULT_ACCESS_TYPE_PREFETCH),
@ -1158,7 +1153,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
bool *clear_counter)
unsigned *out_flags)
{
size_t index;
uvm_va_block_t *va_block = reverse_mappings[0].va_block;
@ -1168,7 +1163,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
const uvm_processor_id_t processor = current_entry->counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC?
gpu->id: UVM_ID_CPU;
*clear_counter = false;
*out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
UVM_ASSERT(num_reverse_mappings > 0);
@ -1217,7 +1212,7 @@ static NV_STATUS service_phys_single_va_block(uvm_gpu_t *gpu,
uvm_mutex_unlock(&va_block->lock);
if (status == NV_OK)
*clear_counter = true;
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
}
done:
@ -1238,25 +1233,26 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *current_entry,
const uvm_reverse_map_t *reverse_mappings,
size_t num_reverse_mappings,
bool *clear_counter)
unsigned *out_flags)
{
NV_STATUS status = NV_OK;
size_t index;
*clear_counter = false;
*out_flags &= ~UVM_ACCESS_COUNTER_ACTION_CLEAR;
for (index = 0; index < num_reverse_mappings; ++index) {
bool clear_counter_local = false;
unsigned out_flags_local = 0;
status = service_phys_single_va_block(gpu,
batch_context,
current_entry,
reverse_mappings + index,
1,
&clear_counter_local);
&out_flags_local);
if (status != NV_OK)
break;
*clear_counter = *clear_counter || clear_counter_local;
UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_CLEAR) == 0);
*out_flags |= out_flags_local;
}
// In the case of failure, drop the refcounts for the remaining reverse mappings
@ -1267,18 +1263,13 @@ static NV_STATUS service_phys_va_blocks(uvm_gpu_t *gpu,
}
// Iterate over all regions set in the given sub_granularity mask
#define for_each_sub_granularity_region(region_start, region_end, sub_granularity, config) \
for ((region_start) = find_first_bit(&(sub_granularity), (config)->sub_granularity_regions_per_translation), \
(region_end) = find_next_zero_bit(&(sub_granularity), \
(config)->sub_granularity_regions_per_translation, \
(region_start) + 1); \
(region_start) < config->sub_granularity_regions_per_translation; \
(region_start) = find_next_bit(&(sub_granularity), \
(config)->sub_granularity_regions_per_translation, \
(region_end) + 1), \
(region_end) = find_next_zero_bit(&(sub_granularity), \
(config)->sub_granularity_regions_per_translation, \
(region_start) + 1))
#define for_each_sub_granularity_region(region_start, region_end, sub_granularity, num_regions) \
for ((region_start) = find_first_bit(&(sub_granularity), (num_regions)), \
(region_end) = find_next_zero_bit(&(sub_granularity), (num_regions), (region_start) + 1); \
(region_start) < (num_regions); \
(region_start) = find_next_bit(&(sub_granularity), (num_regions), (region_end) + 1), \
(region_end) = find_next_zero_bit(&(sub_granularity), (num_regions), (region_start) + 1))
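A standalone sketch of what the rewritten macro iterates over, using plain loops in place of the kernel bitmap helpers and an assumed 8-region mask of 0x3C (bits 2 through 5 set):
/* Illustrative only: emulates for_each_sub_granularity_region in userspace. */
#include <stdio.h>
int main(void)
{
    unsigned long sub_granularity = 0x3C;   /* assumed mask: bits 2..5 set */
    unsigned num_regions = 8;               /* assumed regions per translation */
    unsigned start = 0;
    while (start < num_regions) {
        unsigned end;
        /* find_first_bit / find_next_bit equivalent */
        while (start < num_regions && !(sub_granularity & (1UL << start)))
            start++;
        if (start >= num_regions)
            break;
        /* find_next_zero_bit equivalent */
        end = start + 1;
        while (end < num_regions && (sub_granularity & (1UL << end)))
            end++;
        printf("region [%u, %u)\n", start, end);   /* prints "region [2, 6)" */
        start = end + 1;
    }
    return 0;
}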
static bool are_reverse_mappings_on_single_block(const uvm_reverse_map_t *reverse_mappings, size_t num_reverse_mappings)
{
@ -1309,7 +1300,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
NvU64 address,
unsigned long sub_granularity,
size_t *num_reverse_mappings,
bool *clear_counter)
unsigned *out_flags)
{
NV_STATUS status;
NvU32 region_start, region_end;
@ -1318,7 +1309,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
// Get the reverse_map translations for all the regions set in the
// sub_granularity field of the counter.
for_each_sub_granularity_region(region_start, region_end, sub_granularity, config) {
for_each_sub_granularity_region(region_start, region_end, sub_granularity, config->sub_granularity_regions_per_translation) {
NvU64 local_address = address + region_start * config->sub_granularity_region_size;
NvU32 local_translation_size = (region_end - region_start) * config->sub_granularity_region_size;
uvm_reverse_map_t *local_reverse_mappings = batch_context->phys.translations + *num_reverse_mappings;
@ -1350,7 +1341,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
current_entry,
batch_context->phys.translations,
*num_reverse_mappings,
clear_counter);
out_flags);
}
else {
status = service_phys_va_blocks(gpu,
@ -1358,7 +1349,7 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
current_entry,
batch_context->phys.translations,
*num_reverse_mappings,
clear_counter);
out_flags);
}
return status;
@ -1366,7 +1357,8 @@ static NV_STATUS service_phys_notification_translation(uvm_gpu_t *gpu,
static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry)
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
{
NvU64 address;
NvU64 translation_index;
@ -1377,7 +1369,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
size_t total_reverse_mappings = 0;
uvm_gpu_t *resident_gpu = NULL;
NV_STATUS status = NV_OK;
bool clear_counter = false;
unsigned flags = 0;
address = current_entry->address.address;
UVM_ASSERT(address % config->translation_size == 0);
@ -1405,7 +1397,7 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
for (translation_index = 0; translation_index < config->translations_per_counter; ++translation_index) {
size_t num_reverse_mappings;
bool clear_counter_local = false;
unsigned out_flags_local = 0;
status = service_phys_notification_translation(gpu,
resident_gpu,
batch_context,
@ -1414,9 +1406,11 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
address,
sub_granularity,
&num_reverse_mappings,
&clear_counter_local);
&out_flags_local);
total_reverse_mappings += num_reverse_mappings;
clear_counter = clear_counter || clear_counter_local;
UVM_ASSERT((out_flags_local & ~UVM_ACCESS_COUNTER_ACTION_CLEAR) == 0);
flags |= out_flags_local;
if (status != NV_OK)
break;
@ -1425,17 +1419,14 @@ static NV_STATUS service_phys_notification(uvm_gpu_t *gpu,
sub_granularity = sub_granularity >> config->sub_granularity_regions_per_translation;
}
// TODO: Bug 1990466: Here we already have virtual addresses and
// address spaces. Merge virtual and physical notification handling
// Currently we only report events for our tests, not for tools
if (uvm_enable_builtin_tests) {
const bool on_managed = total_reverse_mappings != 0;
uvm_tools_broadcast_access_counter(gpu, current_entry, on_managed);
*out_flags |= UVM_ACCESS_COUNTER_ACTION_NOTIFY;
*out_flags |= ((total_reverse_mappings != 0) ? UVM_ACCESS_COUNTER_ON_MANAGED : 0);
}
if (status == NV_OK && clear_counter)
status = access_counter_clear_targeted(gpu, current_entry);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
*out_flags |= UVM_ACCESS_COUNTER_ACTION_CLEAR;
return status;
}
@ -1450,11 +1441,18 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
for (i = 0; i < batch_context->phys.num_notifications; ++i) {
NV_STATUS status;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->phys.notifications[i];
unsigned flags = 0;
if (!UVM_ID_IS_VALID(current_entry->physical_info.resident_id))
continue;
status = service_phys_notification(gpu, batch_context, current_entry);
status = service_phys_notification(gpu, batch_context, current_entry, &flags);
if (flags & UVM_ACCESS_COUNTER_ACTION_NOTIFY)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
if (status != NV_OK)
return status;
}
@ -1462,6 +1460,191 @@ static NV_STATUS service_phys_notifications(uvm_gpu_t *gpu,
return NV_OK;
}
static int cmp_sort_gpu_phys_addr(const void *_a, const void *_b)
{
return uvm_gpu_phys_addr_cmp(*(uvm_gpu_phys_address_t*)_a,
*(uvm_gpu_phys_address_t*)_b);
}
static bool gpu_phys_same_region(uvm_gpu_phys_address_t a, uvm_gpu_phys_address_t b, NvU64 granularity)
{
if (a.aperture != b.aperture)
return false;
UVM_ASSERT(is_power_of_2(granularity));
return UVM_ALIGN_DOWN(a.address, granularity) == UVM_ALIGN_DOWN(b.address, granularity);
}
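// Worked example (illustrative, not part of this change): with a 2MB
// granularity, vidmem offsets 0x120000 and 0x1f0000 align down to the same
// 2MB region and compare as equal here, while 0x200000 starts the next
// region and compares as different.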
static bool phys_address_in_accessed_sub_region(uvm_gpu_phys_address_t address,
NvU64 region_size,
NvU64 sub_region_size,
NvU32 accessed_mask)
{
const unsigned accessed_index = (address.address % region_size) / sub_region_size;
// accessed_mask is only filled for tracking granularities larger than 64K
if (region_size == UVM_PAGE_SIZE_64K)
return true;
UVM_ASSERT(accessed_index < 32);
return ((1 << accessed_index) & accessed_mask) != 0;
}
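// Illustrative sketch, not part of this change: with a hypothetical 2MB
// tracking granularity split into 64KB sub-regions, the offset 0x30000 within
// the region falls into sub-region index 3 (0x30000 / 0x10000), so only bit 3
// of accessed_mask decides whether that address could have been accessed.
static bool example_sub_region_hit(void)
{
    uvm_gpu_phys_address_t addr = { .aperture = UVM_APERTURE_SYS, .address = 0x30000 };
    const NvU64 region_size = 2 * 1024 * 1024ULL;

    // Bit 3 is set in the mask 0x8, so this returns true
    return phys_address_in_accessed_sub_region(addr, region_size, UVM_PAGE_SIZE_64K, 0x8);
}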
static NV_STATUS service_virt_notification(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context,
const uvm_access_counter_buffer_entry_t *current_entry,
unsigned *out_flags)
{
NV_STATUS status = NV_OK;
NvU64 notification_size;
NvU64 address;
uvm_processor_id_t *resident_processors = batch_context->virt.scratch.resident_processors;
uvm_gpu_phys_address_t *phys_addresses = batch_context->virt.scratch.phys_addresses;
int num_addresses = 0;
int i;
// Virtual address notifications are always 64K aligned
NvU64 region_start = current_entry->address.address;
NvU64 region_end = current_entry->address.address + UVM_PAGE_SIZE_64K;
uvm_access_counter_buffer_info_t *access_counters = &gpu->parent->access_counter_buffer_info;
uvm_access_counter_type_t counter_type = current_entry->counter_type;
const uvm_gpu_access_counter_type_config_t *config = get_config_for_type(access_counters, counter_type);
uvm_va_space_t *va_space = current_entry->virtual_info.va_space;
UVM_ASSERT(counter_type == UVM_ACCESS_COUNTER_TYPE_MIMC);
// Entries with NULL va_space are simply dropped.
if (!va_space)
return NV_OK;
status = config_granularity_to_bytes(config->rm.granularity, &notification_size);
if (status != NV_OK)
return status;
// Collect physical locations that could have been touched
// in the reported 64K VA region. The notification mask can
// correspond to any of them.
uvm_va_space_down_read(va_space);
for (address = region_start; address < region_end;) {
uvm_va_block_t *va_block;
NV_STATUS local_status = uvm_va_block_find(va_space, address, &va_block);
if (local_status == NV_ERR_INVALID_ADDRESS || local_status == NV_ERR_OBJECT_NOT_FOUND) {
address += PAGE_SIZE;
continue;
}
uvm_mutex_lock(&va_block->lock);
while (address < va_block->end && address < region_end) {
const unsigned page_index = uvm_va_block_cpu_page_index(va_block, address);
// UVM va_block always maps the closest resident location to processor
const uvm_processor_id_t res_id = uvm_va_block_page_get_closest_resident(va_block, page_index, gpu->id);
// Add physical location if it's valid and not local vidmem
if (UVM_ID_IS_VALID(res_id) && !uvm_id_equal(res_id, gpu->id)) {
uvm_gpu_phys_address_t phys_address = uvm_va_block_res_phys_page_address(va_block, page_index, res_id, gpu);
if (phys_address_in_accessed_sub_region(phys_address,
notification_size,
config->sub_granularity_region_size,
current_entry->sub_granularity)) {
resident_processors[num_addresses] = res_id;
phys_addresses[num_addresses] = phys_address;
++num_addresses;
}
else {
UVM_DBG_PRINT_RL("Skipping phys address %llx:%s, because it couldn't have been accessed in mask %x",
phys_address.address,
uvm_aperture_string(phys_address.aperture),
current_entry->sub_granularity);
}
}
address += PAGE_SIZE;
}
uvm_mutex_unlock(&va_block->lock);
}
uvm_va_space_up_read(va_space);
// The addresses need to be sorted to aid coalescing.
sort(phys_addresses,
num_addresses,
sizeof(*phys_addresses),
cmp_sort_gpu_phys_addr,
NULL);
for (i = 0; i < num_addresses; ++i) {
uvm_access_counter_buffer_entry_t *fake_entry = &batch_context->virt.scratch.phys_entry;
// Skip the current pointer if the physical region was already handled
if (i > 0 && gpu_phys_same_region(phys_addresses[i - 1], phys_addresses[i], notification_size)) {
UVM_ASSERT(uvm_id_equal(resident_processors[i - 1], resident_processors[i]));
continue;
}
UVM_DBG_PRINT_RL("Faking MIMC address[%i/%i]: %llx (granularity mask: %llx) in aperture %s on device %s\n",
i,
num_addresses,
phys_addresses[i].address,
notification_size - 1,
uvm_aperture_string(phys_addresses[i].aperture),
uvm_gpu_name(gpu));
// Construct a fake phys addr AC entry
fake_entry->counter_type = current_entry->counter_type;
fake_entry->address.address = UVM_ALIGN_DOWN(phys_addresses[i].address, notification_size);
fake_entry->address.aperture = phys_addresses[i].aperture;
fake_entry->address.is_virtual = false;
fake_entry->physical_info.resident_id = resident_processors[i];
fake_entry->counter_value = current_entry->counter_value;
fake_entry->sub_granularity = current_entry->sub_granularity;
status = service_phys_notification(gpu, batch_context, fake_entry, out_flags);
if (status != NV_OK)
break;
}
return status;
}
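// Illustrative sketch, not part of this change: the sort-then-coalesce pattern
// used by service_virt_notification() in isolation. Once the addresses are
// sorted, entries that fall in the same granularity-aligned region collapse
// into a single representative, which is why only one fake physical
// notification is built per region above.
static int example_count_unique_regions(uvm_gpu_phys_address_t *addrs, int count, NvU64 granularity)
{
    int i;
    int unique = 0;

    sort(addrs, count, sizeof(*addrs), cmp_sort_gpu_phys_addr, NULL);

    for (i = 0; i < count; ++i) {
        // Skip entries in the same region as the previous (already counted) one
        if (i > 0 && gpu_phys_same_region(addrs[i - 1], addrs[i], granularity))
            continue;

        ++unique;
    }

    return unique;
}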
static NV_STATUS service_virt_notifications(uvm_gpu_t *gpu,
uvm_access_counter_service_batch_context_t *batch_context)
{
NvU32 i;
NV_STATUS status = NV_OK;
preprocess_virt_notifications(gpu, batch_context);
for (i = 0; i < batch_context->virt.num_notifications; ++i) {
unsigned flags = 0;
uvm_access_counter_buffer_entry_t *current_entry = batch_context->virt.notifications[i];
status = service_virt_notification(gpu, batch_context, current_entry, &flags);
UVM_DBG_PRINT_RL("Processed virt access counter (%d/%d): %sMANAGED (status: %d) clear: %s\n",
i + 1,
batch_context->virt.num_notifications,
(flags & UVM_ACCESS_COUNTER_ON_MANAGED) ? "" : "NOT ",
status,
(flags & UVM_ACCESS_COUNTER_ACTION_CLEAR) ? "YES" : "NO");
if (uvm_enable_builtin_tests)
uvm_tools_broadcast_access_counter(gpu, current_entry, flags & UVM_ACCESS_COUNTER_ON_MANAGED);
if (status == NV_OK && (flags & UVM_ACCESS_COUNTER_ACTION_CLEAR))
status = access_counter_clear_targeted(gpu, current_entry);
if (status != NV_OK)
break;
}
return status;
}
void uvm_gpu_service_access_counters(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2017-2021 NVIDIA Corporation
Copyright (c) 2017-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -338,7 +338,6 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
uvm_processor_id_t new_residency;
bool read_duplicate;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_va_range_t *va_range = va_block->va_range;
uvm_non_replayable_fault_buffer_info_t *non_replayable_faults = &gpu->parent->fault_buffer_info.non_replayable;
UVM_ASSERT(!fault_entry->is_fatal);
@ -365,8 +364,11 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
}
// Check logical permissions
status = uvm_va_range_check_logical_permissions(va_range,
status = uvm_va_block_check_logical_permissions(va_block,
&service_context->block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
fault_entry->fault_access_type,
uvm_range_group_address_migratable(va_space,
fault_entry->fault_address));
@ -386,6 +388,7 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&service_context->block_context,
page_index,
gpu->id,
fault_entry->access_type_mask,
@ -422,7 +425,6 @@ static NV_STATUS service_managed_fault_in_block_locked(uvm_gpu_t *gpu,
}
static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
struct mm_struct *mm,
uvm_va_block_t *va_block,
uvm_fault_buffer_entry_t *fault_entry)
{
@ -432,7 +434,6 @@ static NV_STATUS service_managed_fault_in_block(uvm_gpu_t *gpu,
service_context->operation = UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS;
service_context->num_retries = 0;
service_context->block_context.mm = mm;
uvm_mutex_lock(&va_block->lock);
@ -598,6 +599,7 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
@ -622,12 +624,11 @@ static NV_STATUS service_fault(uvm_gpu_t *gpu, uvm_fault_buffer_entry_t *fault_e
if (!fault_entry->is_fatal) {
status = uvm_va_block_find_create(fault_entry->va_space,
mm,
fault_entry->fault_address,
va_block_context,
&va_block);
if (status == NV_OK)
status = service_managed_fault_in_block(gpu_va_space->gpu, mm, va_block, fault_entry);
status = service_managed_fault_in_block(gpu_va_space->gpu, va_block, fault_entry);
else
status = service_non_managed_fault(gpu_va_space, mm, fault_entry, status);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -1055,13 +1055,17 @@ static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_
// - service_access_type: highest access type that can be serviced.
static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_fault_buffer_entry_t *fault_entry,
bool allow_migration)
{
NV_STATUS perm_status;
perm_status = uvm_va_range_check_logical_permissions(va_block->va_range,
perm_status = uvm_va_block_check_logical_permissions(va_block,
va_block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
fault_entry->fault_access_type,
allow_migration);
if (perm_status == NV_OK)
@ -1083,8 +1087,11 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
// service them before we can cancel the write/atomic faults. So we
// retry with read fault access type.
if (uvm_fault_access_type_mask_test(fault_entry->access_type_mask, UVM_FAULT_ACCESS_TYPE_READ)) {
perm_status = uvm_va_range_check_logical_permissions(va_block->va_range,
perm_status = uvm_va_block_check_logical_permissions(va_block,
va_block_context,
gpu->id,
uvm_va_block_cpu_page_index(va_block,
fault_entry->fault_address),
UVM_FAULT_ACCESS_TYPE_READ,
allow_migration);
if (perm_status == NV_OK)
@ -1156,14 +1163,16 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address >= va_block->start);
UVM_ASSERT(ordered_fault_cache[first_fault_index]->fault_address <= va_block->end);
end = va_block->end;
if (uvm_va_block_is_hmm(va_block))
if (uvm_va_block_is_hmm(va_block)) {
uvm_hmm_find_policy_end(va_block,
&block_context->block_context,
ordered_fault_cache[first_fault_index]->fault_address,
&end);
else
}
else {
block_context->block_context.policy = uvm_va_range_get_policy(va_block->va_range);
end = va_block->end;
}
// Scan the sorted array and notify the fault event for all fault entries
// in the block
@ -1226,7 +1235,11 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
UVM_ASSERT(iter.start <= current_entry->fault_address && iter.end >= current_entry->fault_address);
service_access_type = check_fault_access_permissions(gpu, va_block, current_entry, iter.migratable);
service_access_type = check_fault_access_permissions(gpu,
va_block,
&block_context->block_context,
current_entry,
iter.migratable);
// Do not exit early due to logical errors such as access permission
// violation.
@ -1269,6 +1282,7 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
// Compute new residency and update the masks
new_residency = uvm_va_block_select_residency(va_block,
&block_context->block_context,
page_index,
gpu->id,
service_access_type_mask,
@ -1348,7 +1362,6 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
// See the comments for function service_fault_batch_block_locked for
// implementation details and error codes.
static NV_STATUS service_batch_managed_faults_in_block(uvm_gpu_t *gpu,
struct mm_struct *mm,
uvm_va_block_t *va_block,
NvU32 first_fault_index,
uvm_fault_service_batch_context_t *batch_context,
@ -1361,7 +1374,6 @@ static NV_STATUS service_batch_managed_faults_in_block(uvm_gpu_t *gpu,
fault_block_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
fault_block_context->num_retries = 0;
fault_block_context->block_context.mm = mm;
uvm_mutex_lock(&va_block->lock);
@ -1531,6 +1543,7 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
// to remain valid until we release. If no mm is registered, we
// can only service managed faults, not ATS/HMM faults.
mm = uvm_va_space_mm_retain_lock(va_space);
va_block_context->mm = mm;
uvm_va_space_down_read(va_space);
@ -1576,13 +1589,11 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
// TODO: Bug 2103669: Service more than one ATS fault at a time so we
// don't do an unconditional VA range lookup for every ATS fault.
status = uvm_va_block_find_create(va_space,
mm,
current_entry->fault_address,
va_block_context,
&va_block);
if (status == NV_OK) {
status = service_batch_managed_faults_in_block(gpu_va_space->gpu,
mm,
va_block,
i,
batch_context,

View File

@ -118,6 +118,13 @@ static bool is_canary(NvU32 val)
return (val & ~UVM_SEMAPHORE_CANARY_MASK) == UVM_SEMAPHORE_CANARY_BASE;
}
// Can the GPU access the semaphore, i.e., can Host/Esched address the semaphore
// pool?
static bool gpu_can_access_semaphore_pool(uvm_gpu_t *gpu, uvm_rm_mem_t *rm_mem)
{
return ((uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu) + rm_mem->size - 1) < gpu->parent->max_host_va);
}
static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
{
NV_STATUS status;
@ -142,6 +149,9 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
if (status != NV_OK)
goto error;
// Verify the GPU can access the semaphore pool.
UVM_ASSERT(gpu_can_access_semaphore_pool(pool->gpu, pool_page->memory));
// All semaphores are initially free
bitmap_fill(pool_page->free_semaphores, UVM_SEMAPHORE_COUNT_PER_PAGE);

View File

@ -46,6 +46,7 @@ MODULE_PARM_DESC(uvm_disable_hmm,
#include "uvm_lock.h"
#include "uvm_api.h"
#include "uvm_va_policy.h"
#include "uvm_tools.h"
bool uvm_hmm_is_enabled_system_wide(void)
{
@ -96,6 +97,9 @@ NV_STATUS uvm_hmm_va_space_initialize_test(uvm_va_space_t *va_space)
if (!uvm_hmm_is_enabled_system_wide() || !mm)
return NV_WARN_NOTHING_TO_DO;
if (va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM)
return NV_ERR_INVALID_STATE;
uvm_assert_mmap_lock_locked_write(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
@ -179,12 +183,19 @@ static bool hmm_invalidate(uvm_va_block_t *va_block,
mmu_interval_set_seq(mni, cur_seq);
// Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
// Also note that hmm_invalidate() can be called when a new va_block is not
// yet inserted into the va_space->hmm.blocks table while the original
// va_block is being split. The original va_block may have its end address
// updated before the mmu interval notifier is updated so this invalidate
// may be for a range past the va_block end address.
start = range->start;
end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
if (start < va_block->start)
start = va_block->start;
if (end > va_block->end)
end = va_block->end;
if (start > end)
goto unlock;
if (range->event == MMU_NOTIFY_UNMAP)
uvm_va_policy_clear(va_block, start, end);
@ -266,6 +277,7 @@ static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
UVM_ASSERT(mm);
UVM_ASSERT(!va_block_context || va_block_context->mm == mm);
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
UVM_ASSERT(PAGE_ALIGNED(addr));
@ -294,11 +306,13 @@ static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
// a maximum interval that doesn't overlap any existing UVM va_ranges.
// We know that 'addr' is not within a va_range or
// hmm_va_block_find_create() wouldn't be called.
uvm_range_tree_adjust_interval(&va_space->va_range_tree, addr, &start, &end);
status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
UVM_ASSERT(status == NV_OK);
// Search for existing HMM va_blocks in the start/end interval and create
// a maximum interval that doesn't overlap any existing HMM va_blocks.
uvm_range_tree_adjust_interval(&va_space->hmm.blocks, addr, &start, &end);
status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
UVM_ASSERT(status == NV_OK);
// Create a HMM va_block with a NULL va_range pointer.
status = uvm_va_block_create(NULL, start, end, &va_block);
@ -321,10 +335,7 @@ static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
}
status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
if (status != NV_OK) {
UVM_ASSERT(status != NV_ERR_UVM_ADDRESS_IN_USE);
goto err_unreg;
}
UVM_ASSERT(status == NV_OK);
done:
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
@ -333,9 +344,6 @@ done:
*va_block_ptr = va_block;
return NV_OK;
err_unreg:
mmu_interval_notifier_remove(&va_block->hmm.notifier);
err_release:
uvm_va_block_release(va_block);
@ -352,10 +360,67 @@ NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
return hmm_va_block_find_create(va_space, addr, false, va_block_context, va_block_ptr);
}
NV_STATUS uvm_hmm_find_vma(uvm_va_block_context_t *va_block_context, NvU64 addr)
{
struct mm_struct *mm = va_block_context->mm;
struct vm_area_struct *vma;
if (!mm)
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(mm);
vma = find_vma(mm, addr);
if (!uvm_hmm_vma_is_valid(vma, addr, false))
return NV_ERR_INVALID_ADDRESS;
va_block_context->hmm.vma = vma;
return NV_OK;
}
bool uvm_hmm_va_block_context_vma_is_valid(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
uvm_assert_mutex_locked(&va_block->lock);
if (uvm_va_block_is_hmm(va_block)) {
struct vm_area_struct *vma = va_block_context->hmm.vma;
UVM_ASSERT(vma);
UVM_ASSERT(va_block_context->mm == vma->vm_mm);
uvm_assert_mmap_lock_locked(va_block_context->mm);
UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
}
return true;
}
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
uvm_va_block_test_t *block_test;
uvm_va_block_t *va_block;
NV_STATUS status;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
if (status != NV_OK)
return status;
block_test = uvm_va_block_get_test(va_block);
if (block_test)
block_test->inject_split_error = true;
return NV_OK;
}
typedef struct {
struct mmu_interval_notifier notifier;
uvm_va_block_t *existing_block;
uvm_va_block_t *new_block;
} hmm_split_invalidate_data_t;
static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
@ -363,14 +428,9 @@ static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
unsigned long cur_seq)
{
hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);
uvm_va_block_t *existing_block = split_data->existing_block;
uvm_va_block_t *new_block = split_data->new_block;
if (uvm_ranges_overlap(existing_block->start, existing_block->end, range->start, range->end - 1))
hmm_invalidate(existing_block, range, cur_seq);
if (uvm_ranges_overlap(new_block->start, new_block->end, range->start, range->end - 1))
hmm_invalidate(new_block, range, cur_seq);
uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
hmm_invalidate(split_data->existing_block, range, cur_seq);
return true;
}
@ -404,6 +464,7 @@ static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
uvm_va_space_t *va_space = va_block->hmm.va_space;
struct mm_struct *mm = va_space->va_space_mm.mm;
hmm_split_invalidate_data_t split_data;
NvU64 delay_us;
uvm_va_block_t *new_va_block;
NV_STATUS status;
int ret;
@ -419,22 +480,23 @@ static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
return status;
// Initialize the newly created HMM va_block.
new_va_block->hmm.node.start = new_va_block->start;
new_va_block->hmm.node.end = new_va_block->end;
new_va_block->hmm.va_space = va_space;
uvm_range_tree_init(&new_va_block->hmm.va_policy_tree);
// The MMU interval notifier has to be removed in order to resize it.
// That means there would be a window of time where invalidation callbacks
// could be missed. To handle this case, we register a temporary notifier
// to cover the same address range while resizing the old notifier (it is
// OK to have multiple notifiers for the same range, we may simply try to
// invalidate twice).
split_data.existing_block = va_block;
split_data.new_block = new_va_block;
ret = mmu_interval_notifier_insert(&split_data.notifier,
ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
mm,
va_block->start,
new_va_block->end - va_block->start + 1,
&hmm_notifier_split_ops);
new_va_block->start,
uvm_va_block_size(new_va_block),
&uvm_hmm_notifier_ops);
// Since __mmu_notifier_register() was called when the va_space was
// initially created, we know that mm->notifier_subscriptions is valid
// and mmu_interval_notifier_insert() can't return ENOMEM.
// The only error return is for start + length overflowing but we already
// registered the same address range before so there should be no error.
UVM_ASSERT(!ret);
uvm_mutex_lock(&va_block->lock);
@ -444,40 +506,38 @@ static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
uvm_mutex_unlock(&va_block->lock);
// Since __mmu_notifier_register() was called when the va_space was
// initially created, we know that mm->notifier_subscriptions is valid
// and mmu_interval_notifier_insert() can't return ENOMEM.
// The only error return is for start + length overflowing but we already
// registered the same address range before so there should be no error.
// The MMU interval notifier has to be removed in order to resize it.
// That means there would be a window of time when invalidation callbacks
// could be missed. To handle this case, we register a temporary notifier
// to cover the address range while resizing the old notifier (it is
// OK to have multiple notifiers for the same range, we may simply try to
// invalidate twice).
split_data.existing_block = va_block;
ret = mmu_interval_notifier_insert(&split_data.notifier,
mm,
va_block->start,
new_end - va_block->start + 1,
&hmm_notifier_split_ops);
UVM_ASSERT(!ret);
mmu_interval_notifier_remove(&va_block->hmm.notifier);
// Delay to allow hmm_sanity test to trigger an mmu_notifier during the
// critical window where the split invalidate callback is active.
delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
if (delay_us)
udelay(delay_us);
uvm_range_tree_shrink_node(&va_space->hmm.blocks, &va_block->hmm.node, va_block->start, va_block->end);
mmu_interval_notifier_remove(&va_block->hmm.notifier);
// Enable notifications on the old block with the smaller size.
ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
mm,
va_block->start,
va_block->end - va_block->start + 1,
&uvm_hmm_notifier_ops);
UVM_ASSERT(!ret);
new_va_block->hmm.node.start = new_va_block->start;
new_va_block->hmm.node.end = new_va_block->end;
ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
mm,
new_va_block->start,
new_va_block->end - new_va_block->start + 1,
uvm_va_block_size(va_block),
&uvm_hmm_notifier_ops);
UVM_ASSERT(!ret);
mmu_interval_notifier_remove(&split_data.notifier);
status = uvm_range_tree_add(&va_space->hmm.blocks, &new_va_block->hmm.node);
UVM_ASSERT(status == NV_OK);
if (new_block_ptr)
*new_block_ptr = new_va_block;
@ -485,7 +545,7 @@ static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
err:
uvm_mutex_unlock(&va_block->lock);
mmu_interval_notifier_remove(&split_data.notifier);
mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
uvm_va_block_release(new_va_block);
return status;
}
@ -536,9 +596,9 @@ static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
// page tables. However, it doesn't destroy the va_block because that would
// require calling mmu_interval_notifier_remove() which can't be called from
// the invalidate callback due to Linux locking constraints. If a process
// calls mmap()/munmap() for SAM and then creates a UVM managed allocation,
// calls mmap()/munmap() for SAM and then creates a managed allocation,
// the same VMA range can be picked and there would be a UVM/HMM va_block
// conflict. Creating a UVM managed allocation (or other va_range) calls this
// conflict. Creating a managed allocation (or other va_range) calls this
// function to remove stale HMM va_blocks or split the HMM va_block so there
// is no overlap.
NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
@ -585,6 +645,18 @@ NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
return NV_OK;
}
void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
{
uvm_va_space_t *va_space = existing_va_block->hmm.va_space;
UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
uvm_assert_rwsem_locked_write(&va_space->lock);
uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
&existing_va_block->hmm.node,
&new_block->hmm.node);
}
NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_policy_is_split_needed_t split_needed_cb,
@ -733,7 +805,7 @@ void uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
{
struct vm_area_struct *vma = va_block_context->hmm.vma;
uvm_va_policy_node_t *node;
NvU64 end = *endp;
NvU64 end = va_block->end;
uvm_assert_mmap_lock_locked(vma->vm_mm);
uvm_assert_mutex_locked(&va_block->lock);
@ -747,8 +819,9 @@ void uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
if (end > node->node.end)
end = node->node.end;
}
else
else {
va_block_context->policy = &uvm_va_policy_default;
}
*endp = end;
}
@ -760,7 +833,7 @@ NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
{
struct vm_area_struct *vma;
unsigned long addr;
NvU64 end = va_block->end;
NvU64 end;
uvm_page_index_t outer;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
@ -801,9 +874,9 @@ static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
// before the pinned pages information is destroyed.
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
NULL,
unmap_remote_pinned_pages_from_all_processors(va_block,
block_context,
region));
uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
block_context,
region));
uvm_perf_thrashing_info_destroy(va_block);
@ -839,5 +912,186 @@ NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
return status;
}
uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 address)
{
struct vm_area_struct *vma = va_block_context->hmm.vma;
uvm_va_policy_t *policy = va_block_context->policy;
NvU64 start, end;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
// We need to limit the prefetch region to the VMA.
start = max(va_block->start, (NvU64)vma->vm_start);
end = min(va_block->end, (NvU64)vma->vm_end - 1);
// Also, we need to limit the prefetch region to the policy range.
if (policy == &uvm_va_policy_default) {
NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
address,
&start,
&end);
// We already know the hole exists and covers the fault region.
UVM_ASSERT(status == NV_OK);
}
else {
uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);
start = max(start, node->node.start);
end = min(end, node->node.end);
}
return uvm_va_block_region_from_start_end(va_block, start, end);
}
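// Worked example (illustrative, not from this change): for a hypothetical HMM
// va_block spanning [0x200000, 0x400000), a VMA covering [0x250000, 0x500000)
// and a policy node covering [0x100000, 0x300000), the region returned above
// is clamped to [0x250000, 0x300000): the start comes from the VMA and the
// end from the policy node.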
uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 addr)
{
struct vm_area_struct *vma = va_block_context->hmm.vma;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
uvm_assert_mmap_lock_locked(va_block_context->mm);
UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);
if (!(vma->vm_flags & VM_READ))
return UVM_PROT_NONE;
else if (!(vma->vm_flags & VM_WRITE))
return UVM_PROT_READ_ONLY;
else
return UVM_PROT_READ_WRITE_ATOMIC;
}
NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);
return NV_OK;
}
NV_STATUS uvm_test_hmm_init(UVM_TEST_HMM_INIT_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
struct mm_struct *mm;
NV_STATUS status;
mm = uvm_va_space_mm_or_current_retain(va_space);
if (!mm)
return NV_WARN_NOTHING_TO_DO;
uvm_down_write_mmap_lock(mm);
uvm_va_space_down_write(va_space);
if (va_space->hmm.disable)
status = uvm_hmm_va_space_initialize_test(va_space);
else
status = NV_OK;
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(mm);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;
}
NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
struct mm_struct *mm,
UVM_TEST_VA_RANGE_INFO_PARAMS *params)
{
uvm_range_tree_node_t *tree_node;
uvm_va_policy_node_t *node;
struct vm_area_struct *vma;
uvm_va_block_t *va_block;
if (!mm || !uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
params->va_range_start = 0;
params->va_range_end = ULONG_MAX;
params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
memset(&params->preferred_location, 0, sizeof(params->preferred_location));
params->accessed_by_count = 0;
params->managed.vma_start = 0;
params->managed.vma_end = 0;
params->managed.is_zombie = NV_FALSE;
params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
vma = find_vma(mm, params->lookup_address);
if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
return NV_ERR_INVALID_ADDRESS;
params->va_range_start = vma->vm_start;
params->va_range_end = vma->vm_end - 1;
params->managed.vma_start = vma->vm_start;
params->managed.vma_end = vma->vm_end - 1;
uvm_mutex_lock(&va_space->hmm.blocks_lock);
tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
if (!tree_node) {
UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
&params->va_range_start, &params->va_range_end) == NV_OK);
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
return NV_OK;
}
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
va_block = hmm_va_block_from_node(tree_node);
uvm_mutex_lock(&va_block->lock);
params->va_range_start = va_block->start;
params->va_range_end = va_block->end;
node = uvm_va_policy_node_find(va_block, params->lookup_address);
if (node) {
uvm_processor_id_t processor_id;
if (params->va_range_start < node->node.start)
params->va_range_start = node->node.start;
if (params->va_range_end > node->node.end)
params->va_range_end = node->node.end;
params->read_duplication = node->policy.read_duplication;
if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
for_each_id_in_mask(processor_id, &node->policy.accessed_by)
uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
}
else {
uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
&params->va_range_start, &params->va_range_end);
}
uvm_mutex_unlock(&va_block->lock);
return NV_OK;
}
// TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
// for VMAs other than anonymous private memory.
bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context)
{
struct vm_area_struct *vma = va_block_context->hmm.vma;
uvm_assert_mutex_locked(&va_block->lock);
if (!uvm_va_block_is_hmm(va_block))
return false;
UVM_ASSERT(vma);
UVM_ASSERT(va_block_context->mm == vma->vm_mm);
uvm_assert_mmap_lock_locked(va_block_context->mm);
return !vma_is_anonymous(vma);
}
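// For example (illustrative, not part of this change): faults on a private
// anonymous mapping (mmap() with MAP_PRIVATE | MAP_ANONYMOUS) may still be
// migrated to vidmem, while a file-backed or shared mapping makes the function
// above return true so its pages are kept in system memory until HMM
// migration supports such VMAs.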
#endif // UVM_IS_CONFIG_HMM()

View File

@ -65,6 +65,8 @@ typedef struct
// Initialize HMM for the given va_space for testing.
// Bug 1750144: UVM: Add HMM (Heterogeneous Memory Management) support to
// the UVM driver. Remove this when enough HMM functionality is implemented.
// Locking: the va_space->va_space_mm.mm mmap_lock must be write locked
// and the va_space lock must be held in write mode.
NV_STATUS uvm_hmm_va_space_initialize_test(uvm_va_space_t *va_space);
// Destroy any HMM state for the given va_space.
@ -87,6 +89,10 @@ typedef struct
//
// Return NV_ERR_INVALID_ADDRESS if there is no VMA associated with the
// address 'addr' or the VMA does not have at least PROT_READ permission.
// The caller is also responsible for checking that there is no UVM
// va_range covering the given address before calling this function.
// If va_block_context is not NULL, the VMA is cached in
// va_block_context->hmm.vma.
// Locking: This function must be called with mm retained and locked for
// at least read and the va_space lock at least for read.
NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
@ -94,23 +100,53 @@ typedef struct
uvm_va_block_context_t *va_block_context,
uvm_va_block_t **va_block_ptr);
// Find the VMA for the given address and set va_block_context->hmm.vma.
// Return NV_ERR_INVALID_ADDRESS if va_block_context->mm is NULL or there
// is no VMA associated with the address 'addr' or the VMA does not have at
// least PROT_READ permission.
// Locking: This function must be called with mm retained and locked for
// at least read or mm equal to NULL.
NV_STATUS uvm_hmm_find_vma(uvm_va_block_context_t *va_block_context, NvU64 addr);
// If va_block is a HMM va_block, check that va_block_context->hmm.vma is
// not NULL and covers the given region. This always returns true and is
// intended to only be used with UVM_ASSERT().
// Locking: This function must be called with the va_block lock held and if
// va_block is a HMM block, va_block_context->mm must be retained and
// locked for at least read.
bool uvm_hmm_va_block_context_vma_is_valid(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region);
// Find or create a HMM va_block and mark it so the next va_block split
// will fail for testing purposes.
// Locking: This function must be called with mm retained and locked for
// at least read and the va_space lock at least for read.
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr);
// Reclaim any HMM va_blocks that overlap the given range.
// Note that 'end' is inclusive.
// A HMM va_block can be reclaimed if it doesn't contain any "valid" VMAs.
// See uvm_hmm_vma_is_valid() for details.
// Note that 'end' is inclusive. If mm is NULL, any HMM va_block in the
// range will be reclaimed which assumes that the mm is being torn down
// and was not retained.
// Return values:
// NV_ERR_NO_MEMORY: Reclaim required a block split, which failed.
// NV_OK: There were no HMM blocks in the range, or all HMM
// blocks in the range were successfully reclaimed.
// Locking: If mm is not NULL, it must equal va_space_mm.mm, the caller
// must hold a reference on it, and it must be locked for at least read
// mode. Also, the va_space lock must be held in write mode.
// must retain it with uvm_va_space_mm_or_current_retain() or be sure that
// mm->mm_users is not zero, and it must be locked for at least read mode.
// Also, the va_space lock must be held in write mode.
// TODO: Bug 3372166: add asynchronous va_block reclaim.
NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 start,
NvU64 end);
// This is called to update the va_space tree of HMM va_blocks after an
// existing va_block is split.
// Locking: the va_space lock must be held in write mode.
void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block);
// Find a HMM policy range that needs to be split. The callback function
// 'split_needed_cb' returns true if the policy range needs to be split.
// If a policy range is split, the existing range is truncated to
@ -148,7 +184,7 @@ typedef struct
// Note that 'last_address' is inclusive.
// Locking: the va_space->va_space_mm.mm mmap_lock must be write locked
// and the va_space lock must be held in write mode.
// TODO: Bug 2046423: need to implement read duplication support in Linux.
// TODO: Bug 3660922: need to implement HMM read duplication support.
static NV_STATUS uvm_hmm_set_read_duplication(uvm_va_space_t *va_space,
uvm_read_duplication_policy_t new_policy,
NvU64 base,
@ -159,10 +195,11 @@ typedef struct
return NV_OK;
}
// Set va_block_context->policy to the policy covering the given address
// 'addr' and update the ending address '*endp' to the minimum of *endp,
// va_block_context->hmm.vma->vm_end - 1, and the ending address of the
// policy range.
// This function assigns va_block_context->policy to the policy covering
// the given address 'addr' and assigns the ending address '*endp' to the
// minimum of va_block->end, va_block_context->hmm.vma->vm_end - 1, and the
// ending address of the policy range. Note that va_block_context->hmm.vma
// is expected to be initialized before calling this function.
// Locking: This function must be called with
// va_block_context->hmm.vma->vm_mm retained and locked for at least read and
// the va_block lock held.
@ -171,11 +208,11 @@ typedef struct
unsigned long addr,
NvU64 *endp);
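// Illustrative usage sketch, not part of this change (assumes hmm.vma was
// already set, e.g. by uvm_hmm_va_block_find_create(), and the documented
// locks are held):
//
//     NvU64 end;
//
//     uvm_hmm_find_policy_end(va_block, va_block_context, addr, &end);
//     // va_block_context->policy is now valid and 'end' is clamped to the
//     // VMA, the policy range and va_block->end.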
// Find the VMA for the page index 'page_index',
// set va_block_context->policy to the policy covering the given address,
// and update the ending page range '*outerp' to the minimum of *outerp,
// va_block_context->hmm.vma->vm_end - 1, and the ending address of the
// policy range.
// This function finds the VMA for the page index 'page_index' and assigns
// it to va_block_context->vma, sets va_block_context->policy to the policy
// covering the given address, and sets the ending page range '*outerp'
// to the minimum of *outerp, va_block_context->hmm.vma->vm_end - 1, the
// ending address of the policy range, and va_block->end.
// Return NV_ERR_INVALID_ADDRESS if no VMA is found; otherwise, NV_OK.
// Locking: This function must be called with
// va_block_context->hmm.vma->vm_mm retained and locked for at least read and
@ -189,6 +226,48 @@ typedef struct
// Locking: va_space lock must be held in write mode.
NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space);
// Return the expanded region around 'address' limited to the intersection
// of va_block start/end, vma start/end, and policy start/end.
// va_block_context must not be NULL, va_block_context->hmm.vma must be
// valid (this is usually set by uvm_hmm_va_block_find_create()), and
// va_block_context->policy must be valid.
// Locking: the caller must hold mm->mmap_lock in at least read mode, the
// va_space lock must be held in at least read mode, and the va_block lock
// held.
uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 address);
// Return the logical protection allowed of a HMM va_block for the page at
// the given address.
// va_block_context must not be NULL and va_block_context->hmm.vma must be
// valid (this is usually set by uvm_hmm_va_block_find_create()).
// Locking: the caller must hold va_block_context->mm mmap_lock in at least
// read mode.
uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 addr);
NV_STATUS uvm_test_hmm_init(UVM_TEST_HMM_INIT_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params,
struct file *filp);
NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
struct mm_struct *mm,
UVM_TEST_VA_RANGE_INFO_PARAMS *params);
// Return true if GPU fault new residency location should be system memory.
// va_block_context must not be NULL and va_block_context->hmm.vma must be
// valid (this is usually set by uvm_hmm_va_block_find_create()).
// TODO: Bug 3660968: Remove this hack as soon as HMM migration is
// implemented for VMAs other than anonymous memory.
// Locking: the va_block lock must be held. If the va_block is a HMM
// va_block, the va_block_context->mm must be retained and locked for at least
// read.
bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context);
#else // UVM_IS_CONFIG_HMM()
static bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
@ -230,6 +309,23 @@ typedef struct
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_find_vma(uvm_va_block_context_t *va_block_context, NvU64 addr)
{
return NV_OK;
}
static bool uvm_hmm_va_block_context_vma_is_valid(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
return true;
}
static NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 start,
@ -238,6 +334,10 @@ typedef struct
return NV_OK;
}
static void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
{
}
static NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_policy_is_split_needed_t split_needed_cb,
@ -291,6 +391,44 @@ typedef struct
return NV_OK;
}
static uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 address)
{
return (uvm_va_block_region_t){};
}
static uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
NvU64 addr)
{
return UVM_PROT_NONE;
}
static NV_STATUS uvm_test_hmm_init(UVM_TEST_HMM_INIT_PARAMS *params, struct file *filp)
{
return NV_WARN_NOTHING_TO_DO;
}
static NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params,
struct file *filp)
{
return NV_ERR_INVALID_STATE;
}
static NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
struct mm_struct *mm,
UVM_TEST_VA_RANGE_INFO_PARAMS *params)
{
return NV_ERR_INVALID_ADDRESS;
}
static bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context)
{
return false;
}
#endif // UVM_IS_CONFIG_HMM()
#endif // _UVM_HMM_H_

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -35,7 +35,7 @@ NV_STATUS uvm_test_hmm_sanity(UVM_TEST_HMM_SANITY_PARAMS *params, struct file *f
uvm_va_block_t *hmm_block = NULL;
NV_STATUS status;
mm = uvm_va_space_mm_retain(va_space);
mm = uvm_va_space_mm_or_current_retain(va_space);
if (!mm)
return NV_WARN_NOTHING_TO_DO;
@ -61,7 +61,7 @@ NV_STATUS uvm_test_hmm_sanity(UVM_TEST_HMM_SANITY_PARAMS *params, struct file *f
status = uvm_hmm_va_block_find_create(va_space, 0UL, NULL, &hmm_block);
TEST_CHECK_GOTO(status == NV_ERR_INVALID_ADDRESS, done);
// Try to create an HMM va_block which overlaps a UVM managed block.
// Try to create an HMM va_block which overlaps a managed block.
// It should fail.
status = uvm_hmm_va_block_find_create(va_space, params->uvm_address, NULL, &hmm_block);
TEST_CHECK_GOTO(status == NV_ERR_INVALID_ADDRESS, done);
@ -77,14 +77,14 @@ NV_STATUS uvm_test_hmm_sanity(UVM_TEST_HMM_SANITY_PARAMS *params, struct file *f
done:
uvm_va_space_up_read(va_space);
uvm_up_read_mmap_lock(mm);
uvm_va_space_mm_release(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;
out:
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(mm);
uvm_va_space_mm_release(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;
}

View File

@ -34,31 +34,6 @@
// the (out-of-tree) UVM driver from changes to the upstream Linux kernel.
//
#if !defined(NV_ADDRESS_SPACE_INIT_ONCE_PRESENT)
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
#if defined(NV_ADDRESS_SPACE_HAS_RWLOCK_TREE_LOCK)
//
// The .tree_lock member variable was changed from type rwlock_t, to
// spinlock_t, on 25 July 2008, by mainline commit
// 19fd6231279be3c3bdd02ed99f9b0eb195978064.
//
rwlock_init(&mapping->tree_lock);
#else
spin_lock_init(&mapping->tree_lock);
#endif
spin_lock_init(&mapping->i_mmap_lock);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
}
#endif
#if UVM_CGROUP_ACCOUNTING_SUPPORTED()
void uvm_memcg_context_start(uvm_memcg_context_t *context, struct mm_struct *mm)
{

View File

@ -88,7 +88,7 @@
#include "nv-kthread-q.h"
#if NV_KTHREAD_Q_SUPPORTS_AFFINITY() == 1 && defined(NV_CPUMASK_OF_NODE_PRESENT)
#if defined(NV_CPUMASK_OF_NODE_PRESENT)
#define UVM_THREAD_AFFINITY_SUPPORTED() 1
#else
#define UVM_THREAD_AFFINITY_SUPPORTED() 0
@ -136,8 +136,8 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#endif
// See bug 1707453 for further details about setting the minimum kernel version.
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 32)
# error This driver does not support kernels older than 2.6.32!
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 10, 0)
# error This driver does not support kernels older than 3.10!
#endif
#if !defined(VM_RESERVED)
@ -217,10 +217,6 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#define NV_UVM_GFP_FLAGS (GFP_KERNEL)
#if !defined(NV_ADDRESS_SPACE_INIT_ONCE_PRESENT)
void address_space_init_once(struct address_space *mapping);
#endif
// Develop builds define DEBUG but enable optimization
#if defined(DEBUG) && !defined(NVIDIA_UVM_DEVELOP)
// Wrappers for functions not building correctly without optimizations on,
@ -352,23 +348,6 @@ static inline NvU64 NV_GETTIME(void)
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
#endif
// bitmap_clear was added in 2.6.33 via commit c1a2a962a2ad103846e7950b4591471fabecece7
#if !defined(NV_BITMAP_CLEAR_PRESENT)
static inline void bitmap_clear(unsigned long *map, unsigned int start, int len)
{
unsigned int index = start;
for_each_set_bit_from(index, map, start + len)
__clear_bit(index, map);
}
static inline void bitmap_set(unsigned long *map, unsigned int start, int len)
{
unsigned int index = start;
for_each_clear_bit_from(index, map, start + len)
__set_bit(index, map);
}
#endif
// Added in 2.6.24
#ifndef ACCESS_ONCE
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
@ -439,17 +418,6 @@ static inline NvU64 NV_GETTIME(void)
#define PAGE_ALIGNED(addr) (((addr) & (PAGE_SIZE - 1)) == 0)
#endif
// Added in 2.6.37 via commit e1ca7788dec6773b1a2bce51b7141948f2b8bccf
#if !defined(NV_VZALLOC_PRESENT)
static inline void *vzalloc(unsigned long size)
{
void *p = vmalloc(size);
if (p)
memset(p, 0, size);
return p;
}
#endif
// Changed in 3.17 via commit 743162013d40ca612b4cb53d3a200dff2d9ab26e
#if (NV_WAIT_ON_BIT_LOCK_ARGUMENT_COUNT == 3)
#define UVM_WAIT_ON_BIT_LOCK(word, bit, mode) \
@ -505,21 +473,6 @@ static bool radix_tree_empty(struct radix_tree_root *tree)
#endif
#endif
#if !defined(NV_USLEEP_RANGE_PRESENT)
static void __sched usleep_range(unsigned long min, unsigned long max)
{
unsigned min_msec = min / 1000;
unsigned max_msec = max / 1000;
if (min_msec != 0)
msleep(min_msec);
else if (max_msec != 0)
msleep(max_msec);
else
msleep(1);
}
#endif
typedef struct
{
struct mem_cgroup *new_memcg;

View File

@ -337,7 +337,9 @@
//
// - Channel lock
// Order: UVM_LOCK_ORDER_CHANNEL
// Spinlock (uvm_spinlock_t)
// Spinlock (uvm_spinlock_t) or exclusive lock (mutex)
//
// Lock protecting the state of all the channels in a channel pool.
//
// - Tools global VA space list lock (g_tools_va_space_list_lock)
// Order: UVM_LOCK_ORDER_TOOLS_VA_SPACE_LIST

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -605,7 +605,7 @@ static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_
return NV_ERR_INVALID_ADDRESS;
// The mm needs to be locked in order to remove stale HMM va_blocks.
mm = uvm_va_space_mm_retain_lock(va_space);
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
uvm_va_space_down_write(va_space);
// Create the new external VA range.
@ -619,7 +619,7 @@ static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_
}
uvm_va_space_up_write(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
uvm_va_space_mm_or_current_release_unlock(va_space, mm);
return status;
}
@ -636,6 +636,11 @@ static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
{
uvm_gpu_t *owning_gpu;
if (!mem_info->deviceDescendant && !mem_info->sysmem) {
ext_gpu_map->owning_gpu = NULL;
ext_gpu_map->is_sysmem = false;
return NV_OK;
}
// This is a local or peer allocation, so the owning GPU must have been
// registered.
owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);

View File

@ -523,7 +523,7 @@ static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, struct mm_struct *mm, g
// In case of failure, the caller is required to handle cleanup by calling
// uvm_mem_free
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero)
static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero, bool is_protected)
{
NV_STATUS status;
@ -559,7 +559,7 @@ static NV_STATUS mem_alloc_vidmem_chunks(uvm_mem_t *mem, bool zero)
return NV_OK;
}
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero)
static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zero, bool is_protected)
{
if (uvm_mem_is_sysmem(mem)) {
gfp_t gfp_flags;
@ -581,7 +581,7 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
return status;
}
return mem_alloc_vidmem_chunks(mem, zero);
return mem_alloc_vidmem_chunks(mem, zero, is_protected);
}
static const char *mem_physical_source(uvm_mem_t *mem)
@ -618,6 +618,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
{
NV_STATUS status;
uvm_mem_t *mem = NULL;
bool is_protected = false;
UVM_ASSERT(params->size > 0);
@ -639,7 +640,7 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
mem->physical_allocation_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
mem->chunks_count = mem->physical_allocation_size / mem->chunk_size;
status = mem_alloc_chunks(mem, params->mm, params->zero);
status = mem_alloc_chunks(mem, params->mm, params->zero, is_protected);
if (status != NV_OK)
goto error;
@ -893,7 +894,7 @@ static void sysmem_unmap_gpu_phys(uvm_mem_t *mem, uvm_gpu_t *gpu)
// partial map_gpu_sysmem_iommu() operation.
break;
}
uvm_gpu_unmap_cpu_pages(gpu, dma_addrs[i], mem->chunk_size);
uvm_gpu_unmap_cpu_pages(gpu->parent, dma_addrs[i], mem->chunk_size);
dma_addrs[i] = 0;
}
@ -914,7 +915,7 @@ static NV_STATUS sysmem_map_gpu_phys(uvm_mem_t *mem, uvm_gpu_t *gpu)
return status;
for (i = 0; i < mem->chunks_count; ++i) {
status = uvm_gpu_map_cpu_pages(gpu,
status = uvm_gpu_map_cpu_pages(gpu->parent,
mem->sysmem.pages[i],
mem->chunk_size,
&mem->sysmem.dma_addrs[uvm_global_id_gpu_index(gpu->global_id)][i]);

View File

@ -179,6 +179,8 @@ struct uvm_mem_struct
//
// There is no equivalent mask for vidmem, because only the backing
// GPU can physically access the memory
//
// TODO: Bug 3723779: Share DMA mappings within a single parent GPU
uvm_global_processor_mask_t mapped_on_phys;
struct page **pages;

View File

@ -207,6 +207,8 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_assert_mutex_locked(&va_block->lock);
va_block_context->policy = uvm_va_range_get_policy(va_block->va_range);
if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space)) {
status = uvm_va_block_make_resident_read_duplicate(va_block,
va_block_retry,
@ -466,6 +468,8 @@ static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
{
NvU64 preunmap_range_start = start;
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_range));
should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(va_block_context->policy,
va_range->va_space);
@ -942,10 +946,8 @@ done:
// benchmarks to see if a two-pass approach would be faster (first
// pass pushes all GPU work asynchronously, second pass updates CPU
// mappings synchronously).
if (mm) {
if (mm)
uvm_up_read_mmap_lock_out_of_order(mm);
uvm_va_space_mm_or_current_release(va_space, mm);
}
if (tracker_ptr) {
// If requested, release semaphore
@ -973,6 +975,7 @@ done:
}
uvm_va_space_up_read(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
// If the migration is known to be complete, eagerly dispatch the migration
// events, instead of processing them on a later event flush. Note that an
@ -1043,13 +1046,12 @@ done:
// benchmarks to see if a two-pass approach would be faster (first
// pass pushes all GPU work asynchronously, second pass updates CPU
// mappings synchronously).
if (mm) {
if (mm)
uvm_up_read_mmap_lock_out_of_order(mm);
uvm_va_space_mm_or_current_release(va_space, mm);
}
tracker_status = uvm_tracker_wait_deinit(&local_tracker);
uvm_va_space_up_read(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
// This API is synchronous, so wait for migrations to finish
uvm_tools_flush_events();
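
Both migrate exit paths above now drop the mm reference only after releasing the va_space lock. The resulting teardown order, sketched (the unconditional release with a possibly-NULL mm matches the call sites in these hunks):

    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    // ... wait on or deinit the local tracker ...

    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);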

View File

@ -74,7 +74,7 @@ static NV_STATUS migrate_vma_page_copy_address(struct page *page,
}
else {
// Sysmem/Indirect Peer
NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu, page, &state->dma.addrs[page_index]);
NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);
if (status != NV_OK)
return status;
@ -628,7 +628,7 @@ void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_stat
if (state->dma.num_pages > 0) {
for_each_set_bit(i, state->dma.page_mask, state->num_pages)
uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i], state->dma.addrs[i]);
uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i]->parent, state->dma.addrs[i]);
}
UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));

View File

@ -34,8 +34,8 @@ typedef struct
{
uvm_va_space_t *va_space;
struct mm_struct *mm;
const unsigned long start;
const unsigned long length;
unsigned long start;
unsigned long length;
uvm_processor_id_t dst_id;
// dst_node_id may be clobbered by uvm_migrate_pageable().

View File

@ -132,7 +132,7 @@ static NV_STATUS phys_mem_allocate_sysmem(uvm_page_tree_t *tree, NvLength size,
// Check for fake GPUs from the unit test
if (tree->gpu->parent->pci_dev)
status = uvm_gpu_map_cpu_pages(tree->gpu, out->handle.page, UVM_PAGE_ALIGN_UP(size), &dma_addr);
status = uvm_gpu_map_cpu_pages(tree->gpu->parent, out->handle.page, UVM_PAGE_ALIGN_UP(size), &dma_addr);
else
dma_addr = page_to_phys(out->handle.page);
@ -217,7 +217,7 @@ static void phys_mem_deallocate_sysmem(uvm_page_tree_t *tree, uvm_mmu_page_table
UVM_ASSERT(ptr->addr.aperture == UVM_APERTURE_SYS);
if (tree->gpu->parent->pci_dev)
uvm_gpu_unmap_cpu_pages(tree->gpu, ptr->addr.address, UVM_PAGE_ALIGN_UP(ptr->size));
uvm_gpu_unmap_cpu_pages(tree->gpu->parent, ptr->addr.address, UVM_PAGE_ALIGN_UP(ptr->size));
__free_pages(ptr->handle.page, get_order(ptr->size));
}

View File

@ -50,7 +50,6 @@ NV_STATUS uvm_perf_heuristics_init()
void uvm_perf_heuristics_exit()
{
uvm_perf_access_counters_exit();
uvm_perf_prefetch_exit();
uvm_perf_thrashing_exit();
}
@ -73,9 +72,6 @@ NV_STATUS uvm_perf_heuristics_load(uvm_va_space_t *va_space)
NV_STATUS status;
status = uvm_perf_thrashing_load(va_space);
if (status != NV_OK)
return status;
status = uvm_perf_prefetch_load(va_space);
if (status != NV_OK)
return status;
status = uvm_perf_access_counters_load(va_space);
@ -105,6 +101,5 @@ void uvm_perf_heuristics_unload(uvm_va_space_t *va_space)
uvm_assert_rwsem_locked_write(&va_space->lock);
uvm_perf_access_counters_unload(va_space);
uvm_perf_prefetch_unload(va_space);
uvm_perf_thrashing_unload(va_space);
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -45,7 +45,6 @@
//
// - UVM_PERF_MODULE_TYPE_THRASHING: detects memory thrashing scenarios and
// provides thrashing prevention mechanisms
// - UVM_PERF_MODULE_TYPE_PREFETCH: detects memory prefetching opportunities
// - UVM_PERF_MODULE_TYPE_ACCESS_COUNTERS: migrates memory using access counter
// notifications
typedef enum
@ -54,7 +53,6 @@ typedef enum
UVM_PERF_MODULE_TYPE_TEST = UVM_PERF_MODULE_FIRST_TYPE,
UVM_PERF_MODULE_TYPE_THRASHING,
UVM_PERF_MODULE_TYPE_PREFETCH,
UVM_PERF_MODULE_TYPE_ACCESS_COUNTERS,
UVM_PERF_MODULE_TYPE_COUNT,

View File

@ -30,31 +30,6 @@
#include "uvm_va_range.h"
#include "uvm_test.h"
// Global cache to allocate the per-VA block prefetch detection structures
static struct kmem_cache *g_prefetch_info_cache __read_mostly;
// Per-VA block prefetch detection structure
typedef struct
{
uvm_page_mask_t prefetch_pages;
uvm_page_mask_t migrate_pages;
uvm_va_block_bitmap_tree_t bitmap_tree;
uvm_processor_id_t last_migration_proc_id;
uvm_va_block_region_t region;
size_t big_page_size;
uvm_va_block_region_t big_pages_region;
NvU16 pending_prefetch_pages;
NvU16 fault_migrations_to_last_proc;
} block_prefetch_info_t;
//
// Tunables for prefetch detection/prevention (configurable via module parameters)
//
@ -88,19 +63,54 @@ static bool g_uvm_perf_prefetch_enable;
static unsigned g_uvm_perf_prefetch_threshold;
static unsigned g_uvm_perf_prefetch_min_faults;
// Callback declaration for the performance heuristics events
static void prefetch_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
void uvm_perf_prefetch_bitmap_tree_iter_init(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_page_index_t page_index,
uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
UVM_ASSERT(bitmap_tree->level_count > 0);
UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
"%zd vs %zd",
(size_t)page_index,
(size_t)bitmap_tree->leaf_count);
static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index, block_prefetch_info_t *prefetch_info)
iter->level_idx = bitmap_tree->level_count - 1;
iter->node_idx = page_index;
}
uvm_va_block_region_t uvm_perf_prefetch_bitmap_tree_iter_get_range(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);
UVM_ASSERT(iter->level_idx >= 0);
UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);
return subregion;
}
NvU16 uvm_perf_prefetch_bitmap_tree_iter_get_count(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
const uvm_perf_prefetch_bitmap_tree_iter_t *iter)
{
uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, iter);
return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
}
static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index,
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_va_block_region_t max_prefetch_region)
{
NvU16 counter;
uvm_va_block_bitmap_tree_iter_t iter;
uvm_va_block_bitmap_tree_t *bitmap_tree = &prefetch_info->bitmap_tree;
uvm_va_block_region_t prefetch_region = uvm_va_block_region(bitmap_tree->leaf_count,
bitmap_tree->leaf_count + 1);
uvm_perf_prefetch_bitmap_tree_iter_t iter;
uvm_va_block_region_t prefetch_region = uvm_va_block_region(0, 0);
uvm_va_block_bitmap_tree_traverse_counters(counter, bitmap_tree, page_index, &iter) {
uvm_va_block_region_t subregion = uvm_va_block_bitmap_tree_iter_get_range(bitmap_tree, &iter);
uvm_perf_prefetch_bitmap_tree_traverse_counters(counter,
bitmap_tree,
page_index - max_prefetch_region.first + bitmap_tree->offset,
&iter) {
uvm_va_block_region_t subregion = uvm_perf_prefetch_bitmap_tree_iter_get_range(bitmap_tree, &iter);
NvU16 subregion_pages = uvm_va_block_region_num_pages(subregion);
UVM_ASSERT(counter <= subregion_pages);
@ -109,289 +119,287 @@ static uvm_va_block_region_t compute_prefetch_region(uvm_page_index_t page_index
}
// Clamp prefetch region to actual pages
if (prefetch_region.first < bitmap_tree->leaf_count) {
if (prefetch_region.first < prefetch_info->region.first)
prefetch_region.first = prefetch_info->region.first;
if (prefetch_region.outer) {
prefetch_region.first += max_prefetch_region.first;
if (prefetch_region.first < bitmap_tree->offset) {
prefetch_region.first = bitmap_tree->offset;
}
else {
prefetch_region.first -= bitmap_tree->offset;
if (prefetch_region.first < max_prefetch_region.first)
prefetch_region.first = max_prefetch_region.first;
}
if (prefetch_region.outer > prefetch_info->region.outer)
prefetch_region.outer = prefetch_info->region.outer;
prefetch_region.outer += max_prefetch_region.first;
if (prefetch_region.outer < bitmap_tree->offset) {
prefetch_region.outer = bitmap_tree->offset;
}
else {
prefetch_region.outer -= bitmap_tree->offset;
if (prefetch_region.outer > max_prefetch_region.outer)
prefetch_region.outer = max_prefetch_region.outer;
}
}
return prefetch_region;
}
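
compute_prefetch_region() now works in tree coordinates and converts back to block pages by adding max_prefetch_region.first and subtracting bitmap_tree->offset before clamping. A small helper expressing just that conversion, as a sketch (the helper name and the example numbers are assumptions, not part of this change):

    static uvm_page_index_t tree_leaf_to_block_page(uvm_page_index_t leaf,
                                                    uvm_va_block_region_t max_prefetch_region,
                                                    uvm_page_index_t offset)
    {
        // Example: leaf 12 with max_prefetch_region.first == 10 and offset == 6
        // corresponds to block page 12 + 10 - 6 == 16.
        return leaf + max_prefetch_region.first - offset;
    }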
// Performance heuristics module for prefetch
static uvm_perf_module_t g_module_prefetch;
static uvm_perf_module_event_callback_desc_t g_callbacks_prefetch[] = {
{ UVM_PERF_EVENT_BLOCK_DESTROY, prefetch_block_destroy_cb },
{ UVM_PERF_EVENT_MODULE_UNLOAD, prefetch_block_destroy_cb },
{ UVM_PERF_EVENT_BLOCK_SHRINK, prefetch_block_destroy_cb }
};
// Get the prefetch detection struct for the given block
static block_prefetch_info_t *prefetch_info_get(uvm_va_block_t *va_block)
{
return uvm_perf_module_type_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_PREFETCH);
}
static void prefetch_info_destroy(uvm_va_block_t *va_block)
{
block_prefetch_info_t *prefetch_info = prefetch_info_get(va_block);
if (prefetch_info) {
kmem_cache_free(g_prefetch_info_cache, prefetch_info);
uvm_perf_module_type_unset_data(va_block->perf_modules_data, UVM_PERF_MODULE_TYPE_PREFETCH);
}
}
// Get the prefetch detection struct for the given block or create it if it
// does not exist
static block_prefetch_info_t *prefetch_info_get_create(uvm_va_block_t *va_block)
{
block_prefetch_info_t *prefetch_info = prefetch_info_get(va_block);
if (!prefetch_info) {
// Create some ghost leaves so we can align the tree to big page boundary. We use the
// largest page size to handle the worst-case scenario
size_t big_page_size = UVM_PAGE_SIZE_128K;
uvm_va_block_region_t big_pages_region = uvm_va_block_big_page_region_all(va_block, big_page_size);
size_t num_leaves = uvm_va_block_num_cpu_pages(va_block);
// If the va block is not big enough to fit 128KB pages, maybe it still can fit 64KB pages
if (big_pages_region.outer == 0) {
big_page_size = UVM_PAGE_SIZE_64K;
big_pages_region = uvm_va_block_big_page_region_all(va_block, big_page_size);
}
if (big_pages_region.first > 0)
num_leaves += (big_page_size / PAGE_SIZE - big_pages_region.first);
UVM_ASSERT(num_leaves <= PAGES_PER_UVM_VA_BLOCK);
prefetch_info = nv_kmem_cache_zalloc(g_prefetch_info_cache, NV_UVM_GFP_FLAGS);
if (!prefetch_info)
goto fail;
prefetch_info->last_migration_proc_id = UVM_ID_INVALID;
uvm_va_block_bitmap_tree_init_from_page_count(&prefetch_info->bitmap_tree, num_leaves);
uvm_perf_module_type_set_data(va_block->perf_modules_data, prefetch_info, UVM_PERF_MODULE_TYPE_PREFETCH);
}
return prefetch_info;
fail:
prefetch_info_destroy(va_block);
return NULL;
}
static void grow_fault_granularity_if_no_thrashing(block_prefetch_info_t *prefetch_info,
static void grow_fault_granularity_if_no_thrashing(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_va_block_region_t region,
uvm_page_index_t first,
const uvm_page_mask_t *faulted_pages,
const uvm_page_mask_t *thrashing_pages)
{
if (!uvm_page_mask_region_empty(faulted_pages, region) &&
(!thrashing_pages || uvm_page_mask_region_empty(thrashing_pages, region))) {
region.first += prefetch_info->region.first;
region.outer += prefetch_info->region.first;
uvm_page_mask_region_fill(&prefetch_info->bitmap_tree.pages, region);
UVM_ASSERT(region.first >= first);
region.first = region.first - first + bitmap_tree->offset;
region.outer = region.outer - first + bitmap_tree->offset;
UVM_ASSERT(region.outer <= bitmap_tree->leaf_count);
uvm_page_mask_region_fill(&bitmap_tree->pages, region);
}
}
static void grow_fault_granularity(uvm_va_block_t *va_block,
block_prefetch_info_t *prefetch_info,
static void grow_fault_granularity(uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
NvU32 big_page_size,
uvm_va_block_region_t big_pages_region,
uvm_va_block_region_t max_prefetch_region,
const uvm_page_mask_t *faulted_pages,
const uvm_page_mask_t *thrashing_pages)
{
size_t num_big_pages;
size_t big_page_index;
uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
uvm_page_index_t pages_per_big_page = big_page_size / PAGE_SIZE;
uvm_page_index_t page_index;
// Migrate whole block if no big pages and no page in it is thrashing
if (!big_pages_region.outer) {
grow_fault_granularity_if_no_thrashing(bitmap_tree,
max_prefetch_region,
max_prefetch_region.first,
faulted_pages,
thrashing_pages);
return;
}
// Migrate whole "prefix" if no page in it is thrashing
if (prefetch_info->big_pages_region.first > 0) {
uvm_va_block_region_t prefix_region = uvm_va_block_region(0, prefetch_info->big_pages_region.first);
if (big_pages_region.first > max_prefetch_region.first) {
uvm_va_block_region_t prefix_region = uvm_va_block_region(max_prefetch_region.first, big_pages_region.first);
grow_fault_granularity_if_no_thrashing(prefetch_info, prefix_region, faulted_pages, thrashing_pages);
grow_fault_granularity_if_no_thrashing(bitmap_tree,
prefix_region,
max_prefetch_region.first,
faulted_pages,
thrashing_pages);
}
// Migrate whole big pages if they are not thrashing
num_big_pages = uvm_va_block_num_big_pages(va_block, prefetch_info->big_page_size);
for (big_page_index = 0; big_page_index < num_big_pages; ++big_page_index) {
uvm_va_block_region_t big_region = uvm_va_block_big_page_region(va_block,
big_page_index,
prefetch_info->big_page_size);
for (page_index = big_pages_region.first;
page_index < big_pages_region.outer;
page_index += pages_per_big_page) {
uvm_va_block_region_t big_region = uvm_va_block_region(page_index,
page_index + pages_per_big_page);
grow_fault_granularity_if_no_thrashing(prefetch_info, big_region, faulted_pages, thrashing_pages);
grow_fault_granularity_if_no_thrashing(bitmap_tree,
big_region,
max_prefetch_region.first,
faulted_pages,
thrashing_pages);
}
// Migrate whole "suffix" if no page in it is thrashing
if (prefetch_info->big_pages_region.outer < block_region.outer) {
uvm_va_block_region_t suffix_region = uvm_va_block_region(prefetch_info->big_pages_region.outer,
block_region.outer);
if (big_pages_region.outer < max_prefetch_region.outer) {
uvm_va_block_region_t suffix_region = uvm_va_block_region(big_pages_region.outer,
max_prefetch_region.outer);
grow_fault_granularity_if_no_thrashing(prefetch_info, suffix_region, faulted_pages, thrashing_pages);
grow_fault_granularity_if_no_thrashing(bitmap_tree,
suffix_region,
max_prefetch_region.first,
faulted_pages,
thrashing_pages);
}
}
// Within a block we only allow prefetching to a single processor. Therefore, if two processors
// are accessing non-overlapping regions within the same block they won't benefit from
// prefetching.
// Within a block we only allow prefetching to a single processor. Therefore,
// if two processors are accessing non-overlapping regions within the same
// block they won't benefit from prefetching.
//
// TODO: Bug 1778034: [uvm] Explore prefetching to different processors within a VA block
void uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t new_residency,
const uvm_page_mask_t *faulted_pages,
uvm_va_block_region_t region)
// TODO: Bug 1778034: [uvm] Explore prefetching to different processors within
// a VA block.
static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t new_residency,
const uvm_page_mask_t *faulted_pages,
uvm_va_block_region_t faulted_region,
uvm_page_mask_t *prefetch_pages,
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree)
{
uvm_page_index_t page_index;
block_prefetch_info_t *prefetch_info;
const uvm_page_mask_t *resident_mask = NULL;
const uvm_page_mask_t *thrashing_pages = NULL;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_va_policy_t *policy = va_block_context->policy;
uvm_va_block_region_t max_prefetch_region;
NvU32 big_page_size;
uvm_va_block_region_t big_pages_region;
uvm_assert_rwsem_locked(&va_space->lock);
if (!g_uvm_perf_prefetch_enable)
return;
prefetch_info = prefetch_info_get_create(va_block);
if (!prefetch_info)
return;
if (!uvm_id_equal(prefetch_info->last_migration_proc_id, new_residency)) {
prefetch_info->last_migration_proc_id = new_residency;
prefetch_info->fault_migrations_to_last_proc = 0;
if (!uvm_id_equal(va_block->prefetch_info.last_migration_proc_id, new_residency)) {
va_block->prefetch_info.last_migration_proc_id = new_residency;
va_block->prefetch_info.fault_migrations_to_last_proc = 0;
}
prefetch_info->pending_prefetch_pages = 0;
// Compute the expanded region that prefetching is allowed from.
if (uvm_va_block_is_hmm(va_block)) {
max_prefetch_region = uvm_hmm_get_prefetch_region(va_block,
va_block_context,
uvm_va_block_region_start(va_block, faulted_region));
}
else {
max_prefetch_region = uvm_va_block_region_from_block(va_block);
}
uvm_page_mask_zero(prefetch_pages);
if (UVM_ID_IS_CPU(new_residency) || va_block->gpus[uvm_id_gpu_index(new_residency)] != NULL)
resident_mask = uvm_va_block_resident_mask_get(va_block, new_residency);
// If this is a first-touch fault and the destination processor is the
// preferred location, populate the whole VA block
// preferred location, populate the whole max_prefetch_region.
if (uvm_processor_mask_empty(&va_block->resident) &&
uvm_id_equal(new_residency, policy->preferred_location)) {
uvm_page_mask_region_fill(&prefetch_info->prefetch_pages, uvm_va_block_region_from_block(va_block));
uvm_page_mask_region_fill(prefetch_pages, max_prefetch_region);
goto done;
}
if (resident_mask)
uvm_page_mask_or(&prefetch_info->bitmap_tree.pages, resident_mask, faulted_pages);
uvm_page_mask_or(&bitmap_tree->pages, resident_mask, faulted_pages);
else
uvm_page_mask_copy(&prefetch_info->bitmap_tree.pages, faulted_pages);
uvm_page_mask_copy(&bitmap_tree->pages, faulted_pages);
// Get the big page size for the new residency
// If we are using a subregion of the va_block, align bitmap_tree
uvm_page_mask_shift_right(&bitmap_tree->pages, &bitmap_tree->pages, max_prefetch_region.first);
// Get the big page size for the new residency.
// Assume 64K size if the new residency is the CPU or no GPU va space is
// registered in the current process for this GPU.
if (UVM_ID_IS_GPU(new_residency) &&
uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, new_residency)) {
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, new_residency);
prefetch_info->big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
big_page_size = uvm_va_block_gpu_big_page_size(va_block, gpu);
}
else {
prefetch_info->big_page_size = UVM_PAGE_SIZE_64K;
big_page_size = UVM_PAGE_SIZE_64K;
}
big_pages_region = uvm_va_block_big_page_region_subset(va_block, max_prefetch_region, big_page_size);
// Adjust the prefetch tree to big page granularity to make sure that we
// get big page-friendly prefetching hints
prefetch_info->big_pages_region = uvm_va_block_big_page_region_all(va_block, prefetch_info->big_page_size);
if (prefetch_info->big_pages_region.first > 0) {
prefetch_info->region.first = prefetch_info->big_page_size / PAGE_SIZE - prefetch_info->big_pages_region.first;
if (big_pages_region.first - max_prefetch_region.first > 0) {
bitmap_tree->offset = big_page_size / PAGE_SIZE - (big_pages_region.first - max_prefetch_region.first);
bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region) + bitmap_tree->offset;
uvm_page_mask_shift_left(&prefetch_info->bitmap_tree.pages,
&prefetch_info->bitmap_tree.pages,
prefetch_info->region.first);
UVM_ASSERT(bitmap_tree->offset < big_page_size / PAGE_SIZE);
UVM_ASSERT(bitmap_tree->leaf_count <= PAGES_PER_UVM_VA_BLOCK);
uvm_page_mask_shift_left(&bitmap_tree->pages, &bitmap_tree->pages, bitmap_tree->offset);
}
else {
prefetch_info->region.first = 0;
bitmap_tree->offset = 0;
bitmap_tree->leaf_count = uvm_va_block_region_num_pages(max_prefetch_region);
}
prefetch_info->region.outer = prefetch_info->region.first + uvm_va_block_num_cpu_pages(va_block);
bitmap_tree->level_count = ilog2(roundup_pow_of_two(bitmap_tree->leaf_count)) + 1;
thrashing_pages = uvm_perf_thrashing_get_thrashing_pages(va_block);
// Assume big pages by default. Prefetch the rest of 4KB subregions within the big page
// region unless there is thrashing.
grow_fault_granularity(va_block, prefetch_info, faulted_pages, thrashing_pages);
// Assume big pages by default. Prefetch the rest of 4KB subregions within
// the big page region unless there is thrashing.
grow_fault_granularity(bitmap_tree,
big_page_size,
big_pages_region,
max_prefetch_region,
faulted_pages,
thrashing_pages);
// Do not compute prefetch regions with faults on pages that are thrashing
if (thrashing_pages)
uvm_page_mask_andnot(&prefetch_info->migrate_pages, faulted_pages, thrashing_pages);
uvm_page_mask_andnot(&va_block_context->scratch_page_mask, faulted_pages, thrashing_pages);
else
uvm_page_mask_copy(&prefetch_info->migrate_pages, faulted_pages);
uvm_page_mask_copy(&va_block_context->scratch_page_mask, faulted_pages);
// Update the tree using the migration mask to compute the pages to prefetch
uvm_page_mask_zero(&prefetch_info->prefetch_pages);
for_each_va_block_page_in_region_mask(page_index, &prefetch_info->migrate_pages, region) {
uvm_va_block_region_t prefetch_region = compute_prefetch_region(page_index + prefetch_info->region.first,
prefetch_info);
uvm_page_mask_region_fill(&prefetch_info->prefetch_pages, prefetch_region);
// Update the tree using the scratch mask to compute the pages to prefetch
for_each_va_block_page_in_region_mask(page_index, &va_block_context->scratch_page_mask, faulted_region) {
uvm_va_block_region_t region = compute_prefetch_region(page_index, bitmap_tree, max_prefetch_region);
uvm_page_mask_region_fill(prefetch_pages, region);
// Early out if we have already prefetched until the end of the VA block
if (prefetch_region.outer == prefetch_info->region.outer)
if (region.outer == max_prefetch_region.outer)
break;
}
// Adjust prefetching page mask
if (prefetch_info->region.first > 0) {
uvm_page_mask_shift_right(&prefetch_info->prefetch_pages,
&prefetch_info->prefetch_pages,
prefetch_info->region.first);
}
done:
// Do not prefetch pages that are going to be migrated/populated due to a
// fault
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
&prefetch_info->prefetch_pages,
faulted_pages);
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, faulted_pages);
// TODO: Bug 1765432: prefetching pages that are already mapped on the CPU
// would trigger a remap, which may cause a large overhead. Therefore,
// exclude them from the mask.
if (UVM_ID_IS_CPU(new_residency)) {
// For HMM, we don't know what pages are mapped by the CPU unless we try to
// migrate them. Prefetch pages will only be opportunistically migrated.
if (UVM_ID_IS_CPU(new_residency) && !uvm_va_block_is_hmm(va_block)) {
uvm_page_mask_and(&va_block_context->scratch_page_mask,
resident_mask,
&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
&prefetch_info->prefetch_pages,
&va_block_context->scratch_page_mask);
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, &va_block_context->scratch_page_mask);
}
// Avoid prefetching pages that are thrashing
if (thrashing_pages) {
uvm_page_mask_andnot(&prefetch_info->prefetch_pages,
&prefetch_info->prefetch_pages,
thrashing_pages);
}
if (thrashing_pages)
uvm_page_mask_andnot(prefetch_pages, prefetch_pages, thrashing_pages);
prefetch_info->fault_migrations_to_last_proc += uvm_page_mask_region_weight(faulted_pages, region);
prefetch_info->pending_prefetch_pages = uvm_page_mask_weight(&prefetch_info->prefetch_pages);
va_block->prefetch_info.fault_migrations_to_last_proc += uvm_page_mask_region_weight(faulted_pages, faulted_region);
return uvm_page_mask_weight(prefetch_pages);
}
uvm_perf_prefetch_hint_t uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
const uvm_page_mask_t *new_residency_mask)
void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t new_residency,
const uvm_page_mask_t *faulted_pages,
uvm_va_block_region_t faulted_region,
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_perf_prefetch_hint_t *out_hint)
{
uvm_perf_prefetch_hint_t ret = UVM_PERF_PREFETCH_HINT_NONE();
block_prefetch_info_t *prefetch_info;
uvm_va_policy_t *policy = va_block_context->policy;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_page_mask_t *prefetch_pages = &out_hint->prefetch_pages_mask;
NvU32 pending_prefetch_pages;
uvm_assert_rwsem_locked(&va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, faulted_region));
UVM_ASSERT(uvm_hmm_va_block_context_vma_is_valid(va_block, va_block_context, faulted_region));
out_hint->residency = UVM_ID_INVALID;
if (!g_uvm_perf_prefetch_enable)
return ret;
return;
if (!va_space->test.page_prefetch_enabled)
return ret;
return;
prefetch_info = prefetch_info_get(va_block);
if (!prefetch_info)
return ret;
pending_prefetch_pages = uvm_perf_prefetch_prenotify_fault_migrations(va_block,
va_block_context,
new_residency,
faulted_pages,
faulted_region,
prefetch_pages,
bitmap_tree);
if (prefetch_info->fault_migrations_to_last_proc >= g_uvm_perf_prefetch_min_faults &&
prefetch_info->pending_prefetch_pages > 0) {
if (va_block->prefetch_info.fault_migrations_to_last_proc >= g_uvm_perf_prefetch_min_faults &&
pending_prefetch_pages > 0) {
bool changed = false;
uvm_range_group_range_t *rgr;
@ -402,62 +410,19 @@ uvm_perf_prefetch_hint_t uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
max(rgr->node.start, va_block->start),
min(rgr->node.end, va_block->end));
if (uvm_page_mask_region_empty(new_residency_mask, region) &&
!uvm_page_mask_region_empty(&prefetch_info->prefetch_pages, region)) {
uvm_page_mask_region_clear(&prefetch_info->prefetch_pages, region);
if (uvm_page_mask_region_empty(faulted_pages, region) &&
!uvm_page_mask_region_empty(prefetch_pages, region)) {
uvm_page_mask_region_clear(prefetch_pages, region);
changed = true;
}
}
if (changed)
prefetch_info->pending_prefetch_pages = uvm_page_mask_weight(&prefetch_info->prefetch_pages);
pending_prefetch_pages = uvm_page_mask_weight(prefetch_pages);
if (prefetch_info->pending_prefetch_pages > 0) {
ret.residency = prefetch_info->last_migration_proc_id;
ret.prefetch_pages_mask = &prefetch_info->prefetch_pages;
}
if (pending_prefetch_pages > 0)
out_hint->residency = va_block->prefetch_info.last_migration_proc_id;
}
return ret;
}
void prefetch_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
uvm_va_block_t *va_block;
UVM_ASSERT(g_uvm_perf_prefetch_enable);
UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_DESTROY ||
event_id == UVM_PERF_EVENT_MODULE_UNLOAD ||
event_id == UVM_PERF_EVENT_BLOCK_SHRINK);
if (event_id == UVM_PERF_EVENT_BLOCK_DESTROY)
va_block = event_data->block_destroy.block;
else if (event_id == UVM_PERF_EVENT_BLOCK_SHRINK)
va_block = event_data->block_shrink.block;
else
va_block = event_data->module_unload.block;
if (!va_block)
return;
prefetch_info_destroy(va_block);
}
NV_STATUS uvm_perf_prefetch_load(uvm_va_space_t *va_space)
{
if (!g_uvm_perf_prefetch_enable)
return NV_OK;
return uvm_perf_module_load(&g_module_prefetch, va_space);
}
void uvm_perf_prefetch_unload(uvm_va_space_t *va_space)
{
if (!g_uvm_perf_prefetch_enable)
return;
uvm_perf_module_unload(&g_module_prefetch, va_space);
}
NV_STATUS uvm_perf_prefetch_init()
@ -467,13 +432,6 @@ NV_STATUS uvm_perf_prefetch_init()
if (!g_uvm_perf_prefetch_enable)
return NV_OK;
uvm_perf_module_init("perf_prefetch", UVM_PERF_MODULE_TYPE_PREFETCH, g_callbacks_prefetch,
ARRAY_SIZE(g_callbacks_prefetch), &g_module_prefetch);
g_prefetch_info_cache = NV_KMEM_CACHE_CREATE("block_prefetch_info_t", block_prefetch_info_t);
if (!g_prefetch_info_cache)
return NV_ERR_NO_MEMORY;
if (uvm_perf_prefetch_threshold <= 100) {
g_uvm_perf_prefetch_threshold = uvm_perf_prefetch_threshold;
}
@ -498,14 +456,6 @@ NV_STATUS uvm_perf_prefetch_init()
return NV_OK;
}
void uvm_perf_prefetch_exit()
{
if (!g_uvm_perf_prefetch_enable)
return;
kmem_cache_destroy_safe(&g_prefetch_info_cache);
}
NV_STATUS uvm_test_set_page_prefetch_policy(UVM_TEST_SET_PAGE_PREFETCH_POLICY_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -30,32 +30,66 @@
typedef struct
{
const uvm_page_mask_t *prefetch_pages_mask;
uvm_page_mask_t prefetch_pages_mask;
uvm_processor_id_t residency;
} uvm_perf_prefetch_hint_t;
// Global initialization/cleanup functions
// Encapsulates a counter tree built on top of a page mask bitmap in which each
// leaf represents a page in the block. It contains leaf_count and level_count
// so that it can use some macros for perf trees.
typedef struct
{
uvm_page_mask_t pages;
uvm_page_index_t offset;
NvU16 leaf_count;
NvU8 level_count;
} uvm_perf_prefetch_bitmap_tree_t;
// Iterator for the bitmap tree. It contains level_idx and node_idx so that it
// can use some macros for perf trees.
typedef struct
{
s8 level_idx;
uvm_page_index_t node_idx;
} uvm_perf_prefetch_bitmap_tree_iter_t;
// Global initialization function (no clean up needed).
NV_STATUS uvm_perf_prefetch_init(void);
void uvm_perf_prefetch_exit(void);
// VA space Initialization/cleanup functions
NV_STATUS uvm_perf_prefetch_load(uvm_va_space_t *va_space);
void uvm_perf_prefetch_unload(uvm_va_space_t *va_space);
// Return a hint with the pages that may be prefetched in the block.
// The faulted_pages mask and faulted_region are the pages being migrated to
// the given residency.
// va_block_context must not be NULL, va_block_context->policy must be valid,
// and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
// which also means the va_block_context->mm is not NULL, retained, and locked
// for at least read.
// Locking: The caller must hold the va_space lock and va_block lock.
void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t new_residency,
const uvm_page_mask_t *faulted_pages,
uvm_va_block_region_t faulted_region,
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_perf_prefetch_hint_t *out_hint);
// Obtain a hint with the pages that may be prefetched in the block
uvm_perf_prefetch_hint_t uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
const uvm_page_mask_t *new_residency_mask);
void uvm_perf_prefetch_bitmap_tree_iter_init(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_page_index_t page_index,
uvm_perf_prefetch_bitmap_tree_iter_t *iter);
uvm_va_block_region_t uvm_perf_prefetch_bitmap_tree_iter_get_range(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
const uvm_perf_prefetch_bitmap_tree_iter_t *iter);
NvU16 uvm_perf_prefetch_bitmap_tree_iter_get_count(const uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
const uvm_perf_prefetch_bitmap_tree_iter_t *iter);
// Notify that the given mask of pages within region is going to migrate to
// the given residency. The caller must hold the va_space lock.
void uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t new_residency,
const uvm_page_mask_t *migrate_pages,
uvm_va_block_region_t region);
#define UVM_PERF_PREFETCH_HINT_NONE() \
(uvm_perf_prefetch_hint_t){ NULL, UVM_ID_INVALID }
#define uvm_perf_prefetch_bitmap_tree_traverse_counters(counter,tree,page,iter) \
for (uvm_perf_prefetch_bitmap_tree_iter_init((tree), (page), (iter)), \
(counter) = uvm_perf_prefetch_bitmap_tree_iter_get_count((tree), (iter)); \
(iter)->level_idx >= 0; \
(counter) = --(iter)->level_idx < 0? 0: \
uvm_perf_prefetch_bitmap_tree_iter_get_count((tree), (iter)))
#endif
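
A minimal calling sketch for the reworked hint API declared above, assuming the caller satisfies the documented locking and va_block_context requirements; where the bitmap tree storage actually lives in real callers is an assumption here:

    uvm_perf_prefetch_bitmap_tree_t bitmap_tree;
    uvm_perf_prefetch_hint_t hint;

    uvm_perf_prefetch_get_hint(va_block,
                               va_block_context,
                               new_residency,
                               faulted_pages,
                               faulted_region,
                               &bitmap_tree,
                               &hint);

    // The hint is only actionable when a residency was selected.
    if (!uvm_id_equal(hint.residency, UVM_ID_INVALID)) {
        // Migrate the pages in &hint.prefetch_pages_mask to hint.residency
        // alongside the faulted pages.
    }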

View File

@ -458,7 +458,7 @@ static void cpu_thrashing_stats_exit(void)
{
if (g_cpu_thrashing_stats.procfs_file) {
UVM_ASSERT(uvm_procfs_is_debug_enabled());
uvm_procfs_destroy_entry(g_cpu_thrashing_stats.procfs_file);
proc_remove(g_cpu_thrashing_stats.procfs_file);
g_cpu_thrashing_stats.procfs_file = NULL;
}
}
@ -522,7 +522,7 @@ static void gpu_thrashing_stats_destroy(uvm_gpu_t *gpu)
uvm_perf_module_type_unset_data(gpu->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
if (gpu_thrashing->procfs_file)
uvm_procfs_destroy_entry(gpu_thrashing->procfs_file);
proc_remove(gpu_thrashing->procfs_file);
uvm_kvfree(gpu_thrashing);
}
@ -652,7 +652,6 @@ done:
static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 address, NvU64 bytes);
// Destroy the thrashing detection struct for the given block
void uvm_perf_thrashing_info_destroy(uvm_va_block_t *va_block)
{
block_thrashing_info_t *block_thrashing = thrashing_info_get(va_block);
@ -1066,11 +1065,11 @@ static void thrashing_reset_pages_in_region(uvm_va_block_t *va_block, NvU64 addr
// Unmap remote mappings from the given processors on the pinned pages
// described by region and block_thrashing->pinned pages.
static NV_STATUS unmap_remote_pinned_pages_from_processors(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
block_thrashing_info_t *block_thrashing,
uvm_va_block_region_t region,
const uvm_processor_mask_t *unmap_processors)
static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
block_thrashing_info_t *block_thrashing,
uvm_va_block_region_t region,
const uvm_processor_mask_t *unmap_processors)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
@ -1116,17 +1115,16 @@ static NV_STATUS unmap_remote_pinned_pages_from_processors(uvm_va_block_t *va_bl
return status;
}
// Unmap remote mappings from all processors on the pinned pages
// described by region and block_thrashing->pinned pages.
NV_STATUS unmap_remote_pinned_pages_from_all_processors(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
block_thrashing_info_t *block_thrashing;
uvm_processor_mask_t unmap_processors;
uvm_va_policy_t *policy;
uvm_va_policy_t *policy = va_block_context->policy;
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region));
block_thrashing = thrashing_info_get(va_block);
if (!block_thrashing || !block_thrashing->pages)
@ -1137,15 +1135,9 @@ NV_STATUS unmap_remote_pinned_pages_from_all_processors(uvm_va_block_t *va_block
// Unmap all mapped processors (that are not SetAccessedBy) with
// no copy of the page
policy = uvm_va_policy_get(va_block, uvm_va_block_region_start(va_block, region));
uvm_processor_mask_andnot(&unmap_processors, &va_block->mapped, &policy->accessed_by);
return unmap_remote_pinned_pages_from_processors(va_block,
va_block_context,
block_thrashing,
region,
&unmap_processors);
return unmap_remote_pinned_pages(va_block, va_block_context, block_thrashing, region, &unmap_processors);
}
// Check that we are not migrating pages away from its pinned location and
@ -1246,7 +1238,7 @@ void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_
if (!va_space_thrashing->params.enable)
return;
// TODO: Bug 2046423: HMM will need to look up the policy when
// TODO: Bug 3660922: HMM will need to look up the policy when
// read duplication is supported.
read_duplication = uvm_va_block_is_hmm(va_block) ?
UVM_READ_DUPLICATION_UNSET :
@ -1796,6 +1788,7 @@ static void thrashing_unpin_pages(struct work_struct *work)
struct delayed_work *dwork = to_delayed_work(work);
va_space_thrashing_info_t *va_space_thrashing = container_of(dwork, va_space_thrashing_info_t, pinned_pages.dwork);
uvm_va_space_t *va_space = va_space_thrashing->va_space;
uvm_va_block_context_t *va_block_context = &va_space_thrashing->pinned_pages.va_block_context;
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
@ -1857,12 +1850,13 @@ static void thrashing_unpin_pages(struct work_struct *work)
UVM_ASSERT(block_thrashing);
UVM_ASSERT(uvm_page_mask_test(&block_thrashing->pinned_pages.mask, page_index));
va_space_thrashing->pinned_pages.va_block_context.policy =
uvm_va_block_context_init(va_block_context, NULL);
va_block_context->policy =
uvm_va_policy_get(va_block, uvm_va_block_cpu_page_address(va_block, page_index));
unmap_remote_pinned_pages_from_all_processors(va_block,
&va_space_thrashing->pinned_pages.va_block_context,
uvm_va_block_region_for_page(page_index));
uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
va_block_context,
uvm_va_block_region_for_page(page_index));
thrashing_reset_page(va_space_thrashing, va_block, block_thrashing, page_index);
}
@ -2105,11 +2099,10 @@ NV_STATUS uvm_test_set_page_thrashing_policy(UVM_TEST_SET_PAGE_THRASHING_POLICY_
// Unmap may split PTEs and require a retry. Needs to be called
// before the pinned pages information is destroyed.
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
NULL,
unmap_remote_pinned_pages_from_all_processors(va_block,
block_context,
va_block_region));
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
block_context,
va_block_region));
uvm_perf_thrashing_info_destroy(va_block);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -108,8 +108,11 @@ void uvm_perf_thrashing_info_destroy(uvm_va_block_t *va_block);
// Unmap remote mappings from all processors on the pinned pages
// described by region and block_thrashing->pinned pages.
NV_STATUS unmap_remote_pinned_pages_from_all_processors(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region);
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid() in uvm_va_block.h.
// Locking: the va_block lock must be held.
NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region);
#endif
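
A minimal calling sketch for the renamed unmap helper, mirroring the uvm_test_set_page_thrashing_policy() call site in this change (setting the policy through uvm_va_policy_get() follows the unpin worker above; locals are illustrative):

    va_block_context->policy = uvm_va_policy_get(va_block,
                                                 uvm_va_block_region_start(va_block, region));

    status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
                                       uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
                                                                                        va_block_context,
                                                                                        region));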

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -23,6 +23,7 @@
#include "uvm_perf_utils.h"
#include "uvm_va_block.h"
#include "uvm_perf_prefetch.h"
#include "uvm_test.h"
static NV_STATUS test_saturating_counter_basic(void)
@ -681,10 +682,12 @@ fail:
static NV_STATUS test_bitmap_tree_traversal(void)
{
int value;
uvm_va_block_bitmap_tree_t tree;
uvm_va_block_bitmap_tree_iter_t iter;
uvm_perf_prefetch_bitmap_tree_t tree;
uvm_perf_prefetch_bitmap_tree_iter_t iter;
uvm_va_block_bitmap_tree_init_from_page_count(&tree, 9);
tree.leaf_count = 9;
tree.level_count = ilog2(roundup_pow_of_two(tree.leaf_count)) + 1;
uvm_page_mask_zero(&tree.pages);
TEST_CHECK_RET(tree.level_count == 5);
TEST_CHECK_RET(tree.leaf_count == 9);
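For reference, the level count asserted above falls straight out of the formula the test now uses: roundup_pow_of_two(9) is 16 and ilog2(16) is 4, so level_count is 4 + 1 = 5 for a 9-leaf tree.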
@ -695,7 +698,7 @@ static NV_STATUS test_bitmap_tree_traversal(void)
uvm_page_mask_set(&tree.pages, 7);
uvm_page_mask_set(&tree.pages, 8);
uvm_va_block_bitmap_tree_traverse_counters(value, &tree, 6, &iter) {
uvm_perf_prefetch_bitmap_tree_traverse_counters(value, &tree, 6, &iter) {
if (iter.level_idx == 4)
TEST_CHECK_RET(value == 0);
else if (iter.level_idx == 3)

View File

@ -591,19 +591,16 @@ error:
return status;
}
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
size_t num_chunks,
uvm_chunk_size_t chunk_size,
uvm_pmm_alloc_flags_t flags,
uvm_gpu_chunk_t **chunks,
uvm_tracker_t *out_tracker)
static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
size_t num_chunks,
uvm_chunk_size_t chunk_size,
uvm_pmm_gpu_memory_type_t memory_type,
uvm_pmm_alloc_flags_t flags,
uvm_gpu_chunk_t **chunks,
uvm_tracker_t *out_tracker)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
NV_STATUS status;
size_t i;
uvm_pmm_gpu_memory_type_t memory_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
NV_STATUS status = uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
if (status != NV_OK)
return status;
@ -618,6 +615,18 @@ NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
return NV_OK;
}
NV_STATUS uvm_pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
size_t num_chunks,
uvm_chunk_size_t chunk_size,
uvm_pmm_alloc_flags_t flags,
uvm_gpu_chunk_t **chunks,
uvm_tracker_t *out_tracker)
{
uvm_pmm_gpu_memory_type_t memory_type = UVM_PMM_GPU_MEMORY_TYPE_KERNEL;
return pmm_gpu_alloc_kernel(pmm, num_chunks, chunk_size, memory_type, flags, chunks, out_tracker);
}
static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
@ -1174,7 +1183,7 @@ static void root_chunk_unmap_indirect_peer(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chun
if (status != NV_OK)
UVM_ASSERT(uvm_global_get_status() != NV_OK);
uvm_gpu_unmap_cpu_pages(other_gpu, indirect_peer->dma_addrs[index], UVM_CHUNK_SIZE_MAX);
uvm_gpu_unmap_cpu_pages(other_gpu->parent, indirect_peer->dma_addrs[index], UVM_CHUNK_SIZE_MAX);
uvm_processor_mask_clear(&root_chunk->indirect_peers_mapped, other_gpu->id);
new_count = atomic64_dec_return(&indirect_peer->map_count);
UVM_ASSERT(new_count >= 0);
@ -1304,7 +1313,7 @@ NV_STATUS uvm_pmm_gpu_indirect_peer_map(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chu
root_chunk_lock(pmm, root_chunk);
if (!uvm_processor_mask_test(&root_chunk->indirect_peers_mapped, accessing_gpu->id)) {
status = uvm_gpu_map_cpu_pages(accessing_gpu,
status = uvm_gpu_map_cpu_pages(accessing_gpu->parent,
uvm_gpu_chunk_to_page(pmm, &root_chunk->chunk),
UVM_CHUNK_SIZE_MAX,
&indirect_peer->dma_addrs[index]);
@ -2705,7 +2714,8 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_pages(void *void_pmm,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
NvU64 phys_end)
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
NV_STATUS status;
uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
@ -2804,14 +2814,15 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper(void *void_pmm,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
NvU64 phys_end)
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
NV_STATUS status;
// RM invokes the eviction callbacks with its API lock held, but not its GPU
// lock.
uvm_record_lock_rm_api();
status = uvm_pmm_gpu_pma_evict_pages(void_pmm, page_size, pages, num_pages_to_evict, phys_start, phys_end);
status = uvm_pmm_gpu_pma_evict_pages(void_pmm, page_size, pages, num_pages_to_evict, phys_start, phys_end, mem_type);
uvm_record_unlock_rm_api();
return status;
}
@ -2821,19 +2832,24 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_pages_wrapper_entry(void *void_pmm,
NvU64 *pages,
NvU32 num_pages_to_evict,
NvU64 phys_start,
NvU64 phys_end)
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_pages_wrapper(void_pmm,
page_size,
pages,
num_pages_to_evict,
phys_start,
phys_end));
phys_end,
mem_type));
}
// See the documentation of pmaEvictRangeCb_t in pma.h for details of the
// expected semantics.
static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm, NvU64 phys_begin, NvU64 phys_end)
static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm,
NvU64 phys_begin,
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
NV_STATUS status;
uvm_pmm_gpu_t *pmm = (uvm_pmm_gpu_t *)void_pmm;
@ -2922,21 +2938,27 @@ static NV_STATUS uvm_pmm_gpu_pma_evict_range(void *void_pmm, NvU64 phys_begin, N
return NV_OK;
}
static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper(void *void_pmm, NvU64 phys_begin, NvU64 phys_end)
static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper(void *void_pmm,
NvU64 phys_begin,
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
NV_STATUS status;
// RM invokes the eviction callbacks with its API lock held, but not its GPU
// lock.
uvm_record_lock_rm_api();
status = uvm_pmm_gpu_pma_evict_range(void_pmm, phys_begin, phys_end);
status = uvm_pmm_gpu_pma_evict_range(void_pmm, phys_begin, phys_end, mem_type);
uvm_record_unlock_rm_api();
return status;
}
static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper_entry(void *void_pmm, NvU64 phys_begin, NvU64 phys_end)
static NV_STATUS uvm_pmm_gpu_pma_evict_range_wrapper_entry(void *void_pmm,
NvU64 phys_begin,
NvU64 phys_end,
UVM_PMA_GPU_MEMORY_TYPE mem_type)
{
UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_range_wrapper(void_pmm, phys_begin, phys_end));
UVM_ENTRY_RET(uvm_pmm_gpu_pma_evict_range_wrapper(void_pmm, phys_begin, phys_end, mem_type));
}
static void deinit_chunk_split_cache(uvm_pmm_gpu_t *pmm)
@ -3420,12 +3442,13 @@ NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file
params->evicted_physical_address = 0;
params->chunk_size_backing_virtual = 0;
mm = uvm_va_space_mm_retain_lock(va_space);
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
uvm_va_space_down_read(va_space);
gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
if (!gpu || !uvm_gpu_supports_eviction(gpu)) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_or_current_release_unlock(va_space, mm);
return NV_ERR_INVALID_DEVICE;
}
pmm = &gpu->pmm;
@ -3436,13 +3459,24 @@ NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file
// For virtual mode, look up and retain the block first so that eviction can
// be started without the VA space lock held.
if (params->eviction_mode == UvmTestEvictModeVirtual) {
status = uvm_va_block_find_create(va_space, mm, params->address, NULL, &block);
if (status != NV_OK) {
uvm_va_block_context_t *block_context;
block_context = uvm_va_block_context_alloc(mm);
if (!block_context) {
status = NV_ERR_NO_MEMORY;
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
goto out;
}
status = uvm_va_block_find_create(va_space, params->address, block_context, &block);
uvm_va_block_context_free(block_context);
if (status != NV_OK) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_or_current_release_unlock(va_space, mm);
goto out;
}
// Retain the block before unlocking the VA space lock so that we can
// safely access it later.
uvm_va_block_retain(block);
@ -3451,7 +3485,7 @@ NV_STATUS uvm_test_evict_chunk(UVM_TEST_EVICT_CHUNK_PARAMS *params, struct file
// Unlock the VA space to emulate real eviction better where a VA space lock
// may not be held or may be held for a different VA space.
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
uvm_va_space_mm_or_current_release_unlock(va_space, mm);
if (params->eviction_mode == UvmTestEvictModeVirtual) {
UVM_ASSERT(block);

View File

@ -428,10 +428,10 @@ uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void)
return uvm_cpu_chunk_allocation_sizes & UVM_CPU_CHUNK_SIZES;
}
static void uvm_cpu_chunk_set_phys_size(uvm_cpu_chunk_t *chunk, uvm_chunk_size_t size)
static void uvm_cpu_chunk_set_size(uvm_cpu_chunk_t *chunk, uvm_chunk_size_t size)
{
#if !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
chunk->log2_phys_size = ilog2(size);
chunk->log2_size = ilog2(size);
#endif
}
@ -440,13 +440,7 @@ uvm_chunk_size_t uvm_cpu_chunk_get_size(uvm_cpu_chunk_t *chunk)
#if UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
return PAGE_SIZE;
#else
uvm_chunk_size_t chunk_size;
UVM_ASSERT(chunk);
UVM_ASSERT(uvm_cpu_chunk_get_phys_size(chunk));
chunk_size = uvm_va_block_region_size(chunk->region);
UVM_ASSERT(uvm_cpu_chunk_get_phys_size(chunk) >= chunk_size);
return chunk_size;
return ((uvm_chunk_size_t)1) << chunk->log2_size;
#endif
}
@ -1036,8 +1030,7 @@ void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *
return;
};
uvm_page_mask_region_clear(&va_block->cpu.allocated,
uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk)));
uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk->region);
if (uvm_page_mask_empty(&va_block->cpu.allocated)) {
if (UVM_CPU_STORAGE_GET_TYPE(va_block) != UVM_CPU_CHUNK_STORAGE_CHUNK)
@ -1191,7 +1184,7 @@ NV_STATUS uvm_cpu_chunk_alloc(uvm_va_block_t *va_block,
}
chunk->page = page;
uvm_cpu_chunk_set_phys_size(chunk, alloc_size);
uvm_cpu_chunk_set_size(chunk, alloc_size);
chunk->region = region;
nv_kref_init(&chunk->refcount);
uvm_spin_lock_init(&chunk->lock, UVM_LOCK_ORDER_LEAF);
@ -1224,13 +1217,15 @@ error:
return status;
}
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_chunk_size_t new_size)
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_chunk_size_t new_size,
uvm_page_index_t page_index,
uvm_cpu_chunk_t **new_chunks)
{
NV_STATUS status = NV_OK;
NV_STATUS insert_status;
uvm_cpu_chunk_t *new_chunk;
uvm_page_index_t running_page_index = chunk->region.first;
uvm_page_index_t next_page_index;
uvm_page_index_t running_page_index = page_index;
size_t num_new_chunks;
size_t num_subchunk_pages;
size_t i;
@ -1238,21 +1233,13 @@ NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk,
UVM_ASSERT(chunk);
UVM_ASSERT(is_power_of_2(new_size));
UVM_ASSERT(new_size < uvm_cpu_chunk_get_size(chunk));
UVM_ASSERT(new_chunks);
// We subtract 1 from the computed number of subchunks because we always
// keep the original chunk as the first in the block's list. This is so we
// don't lose the physical chunk.
// All new subchunks will point to the original chunk as their parent.
num_new_chunks = (uvm_cpu_chunk_get_size(chunk) / new_size) - 1;
num_new_chunks = uvm_cpu_chunk_get_size(chunk) / new_size;
num_subchunk_pages = new_size / PAGE_SIZE;
running_page_index += num_subchunk_pages;
// Remove the existing chunk from the block first. We re-insert it after
// the split.
uvm_cpu_chunk_remove_from_block(va_block, chunk, chunk->region.first);
for (i = 0; i < num_new_chunks; i++) {
uvm_page_index_t relative_page_index = running_page_index - chunk->region.first;
uvm_page_index_t relative_page_index = running_page_index - page_index;
uvm_gpu_id_t id;
new_chunk = uvm_kvmalloc_zero(sizeof(*new_chunk));
@ -1264,10 +1251,10 @@ NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk,
new_chunk->page = chunk->page + relative_page_index;
new_chunk->offset = chunk->offset + relative_page_index;
new_chunk->region = uvm_va_block_region(running_page_index, running_page_index + num_subchunk_pages);
uvm_cpu_chunk_set_phys_size(new_chunk, new_size);
uvm_cpu_chunk_set_size(new_chunk, new_size);
nv_kref_init(&new_chunk->refcount);
// This lock is unused for logical blocks but initialize it for
// This lock is unused for logical chunks but initialize it for
// consistency.
uvm_spin_lock_init(&new_chunk->lock, UVM_LOCK_ORDER_LEAF);
new_chunk->parent = chunk;
@ -1286,109 +1273,64 @@ NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk,
parent_dma_addr + (relative_page_index * PAGE_SIZE));
}
status = uvm_cpu_chunk_insert_in_block(va_block, new_chunk, new_chunk->region.first);
if (status != NV_OK) {
uvm_cpu_chunk_put(new_chunk);
goto error;
}
new_chunks[i] = new_chunk;
running_page_index += num_subchunk_pages;
}
chunk->region = uvm_va_block_region(chunk->region.first, chunk->region.first + num_subchunk_pages);
// Drop the original reference count on the parent (from its creation). This
// is done so the parent's reference count goes to 0 when all the children
// are released.
uvm_cpu_chunk_put(chunk);
error:
// Re-insert the split chunk. This is done unconditionally in both the
// success and error paths. The difference is that on the success path,
// the chunk's region has been updated.
// This operation should never fail with NV_ERR_NO_MEMORY since all
// state memory should already be allocated. Failing with other errors
// is a programmer error.
insert_status = uvm_cpu_chunk_insert_in_block(va_block, chunk, chunk->region.first);
UVM_ASSERT(insert_status != NV_ERR_INVALID_ARGUMENT && insert_status != NV_ERR_INVALID_STATE);
if (status != NV_OK) {
for_each_cpu_chunk_in_block_region_safe(new_chunk,
running_page_index,
next_page_index,
va_block,
chunk->region) {
uvm_cpu_chunk_remove_from_block(va_block, new_chunk, new_chunk->region.first);
while (i--)
uvm_cpu_chunk_put(new_chunk);
}
}
return status;
}
uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk)
NV_STATUS uvm_cpu_chunk_merge(uvm_va_block_t *va_block,
uvm_cpu_chunk_t **chunks,
size_t num_merge_chunks,
uvm_chunk_size_t merge_size,
uvm_cpu_chunk_t **merged_chunk)
{
uvm_cpu_chunk_t *parent;
uvm_cpu_chunk_t *subchunk;
uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t merge_chunk_size;
uvm_chunk_size_t parent_phys_size;
uvm_chunk_size_t chunk_size;
uvm_va_block_region_t subchunk_region;
uvm_page_index_t page_index;
uvm_page_index_t next_page_index;
NV_STATUS insert_status;
size_t i;
UVM_ASSERT(chunk);
parent = chunk->parent;
UVM_ASSERT(chunks);
UVM_ASSERT(num_merge_chunks > 0);
UVM_ASSERT(merged_chunk);
// If the chunk does not have a parent, a merge cannot be done.
parent = chunks[0]->parent;
if (!parent)
return NULL;
return NV_WARN_NOTHING_TO_DO;
chunk_size = uvm_cpu_chunk_get_size(chunk);
parent_phys_size = uvm_cpu_chunk_get_phys_size(parent);
chunk_size = uvm_cpu_chunk_get_size(chunks[0]);
// Remove all sizes above the parent's physical size.
merge_sizes &= parent_phys_size | (parent_phys_size - 1);
UVM_ASSERT(uvm_cpu_chunk_get_size(parent) == merge_size);
UVM_ASSERT(merge_size > chunk_size);
// Remove all sizes including and below the chunk's current size.
merge_sizes &= ~(chunk_size | (chunk_size - 1));
for (i = 1; i < num_merge_chunks; i++) {
if (chunks[i]->parent != parent || uvm_cpu_chunk_get_size(chunks[i]) != chunk_size)
return NV_ERR_INVALID_ARGUMENT;
// Find the largest size that is fully contained within the VA block.
for_each_chunk_size_rev(merge_chunk_size, merge_sizes) {
NvU64 parent_start = uvm_cpu_chunk_get_virt_addr(va_block, parent);
NvU64 parent_end = parent_start + parent_phys_size - 1;
if (uvm_va_block_contains_address(va_block, parent_start) &&
uvm_va_block_contains_address(va_block, parent_start + merge_chunk_size - 1) &&
IS_ALIGNED(parent_start, merge_chunk_size) &&
IS_ALIGNED(parent_end + 1, merge_chunk_size))
break;
UVM_ASSERT(nv_kref_read(&chunks[i]->refcount) == 1);
}
if (merge_chunk_size == UVM_CHUNK_SIZE_INVALID)
return NULL;
// Take a reference on the parent chunk so it doesn't get released when all
// of the children are released below.
uvm_cpu_chunk_get(parent);
if (uvm_cpu_chunk_get_size(parent) == merge_chunk_size)
return NULL;
for (i = 0; i < num_merge_chunks; i++)
uvm_cpu_chunk_put(chunks[i]);
UVM_ASSERT(chunk_size == uvm_cpu_chunk_get_size(parent));
UVM_ASSERT(IS_ALIGNED(merge_chunk_size, chunk_size));
*merged_chunk = parent;
subchunk_region = uvm_va_block_region(parent->region.first + uvm_cpu_chunk_num_pages(parent),
parent->region.first + (merge_chunk_size / PAGE_SIZE));
// Remove the first (parent) subchunk. It will be re-inserted later with an
// updated region.
uvm_cpu_chunk_remove_from_block(va_block, parent, parent->region.first);
for_each_cpu_chunk_in_block_region_safe(subchunk, page_index, next_page_index, va_block, subchunk_region) {
UVM_ASSERT(subchunk);
uvm_cpu_chunk_remove_from_block(va_block, subchunk, subchunk->region.first);
uvm_cpu_chunk_put(subchunk);
}
parent->region = uvm_va_block_region(parent->region.first, parent->region.first + (merge_chunk_size / PAGE_SIZE));
insert_status = uvm_cpu_chunk_insert_in_block(va_block, parent, parent->region.first);
UVM_ASSERT(insert_status != NV_ERR_INVALID_ARGUMENT && insert_status != NV_ERR_INVALID_STATE);
return parent;
return NV_OK;
}
static uvm_cpu_chunk_t *get_parent_cpu_chunk(uvm_cpu_chunk_t *chunk)
@ -1414,7 +1356,7 @@ static void check_cpu_dirty_flag(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_i
// compound pages.
page = chunk->page + page_index;
if (PageDirty(page)) {
bitmap_fill(chunk->dirty_bitmap, uvm_cpu_chunk_get_phys_size(chunk) / PAGE_SIZE);
bitmap_fill(chunk->dirty_bitmap, uvm_cpu_chunk_get_size(chunk) / PAGE_SIZE);
ClearPageDirty(page);
}
}
@ -1432,7 +1374,7 @@ static uvm_cpu_chunk_t *get_parent_and_page_index(uvm_cpu_chunk_t *chunk, uvm_pa
page_index = chunk->offset + (page_index - chunk->region.first);
parent = get_parent_cpu_chunk(chunk);
UVM_ASSERT(page_index < uvm_cpu_chunk_get_phys_size(parent) / PAGE_SIZE);
UVM_ASSERT(page_index < uvm_cpu_chunk_get_size(parent) / PAGE_SIZE);
*out_page_index = page_index;
return parent;
}
@ -1442,7 +1384,7 @@ void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_inde
uvm_cpu_chunk_t *parent;
parent = get_parent_and_page_index(chunk, &page_index);
if (uvm_cpu_chunk_get_phys_size(parent) == PAGE_SIZE) {
if (uvm_cpu_chunk_get_size(parent) == PAGE_SIZE) {
SetPageDirty(parent->page);
return;
}
@ -1457,7 +1399,7 @@ void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_inde
uvm_cpu_chunk_t *parent;
parent = get_parent_and_page_index(chunk, &page_index);
if (uvm_cpu_chunk_get_phys_size(parent) == PAGE_SIZE) {
if (uvm_cpu_chunk_get_size(parent) == PAGE_SIZE) {
ClearPageDirty(parent->page);
return;
}
@ -1474,7 +1416,7 @@ bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
bool dirty;
parent = get_parent_and_page_index(chunk, &page_index);
if (uvm_cpu_chunk_get_phys_size(parent) == PAGE_SIZE)
if (uvm_cpu_chunk_get_size(parent) == PAGE_SIZE)
return PageDirty(parent->page);
uvm_spin_lock(&parent->lock);

View File

@ -181,6 +181,9 @@ size_t uvm_pmm_sysmem_mappings_dma_to_virt(uvm_pmm_sysmem_mappings_t *sysmem_map
#if UVM_CPU_CHUNK_SIZES == PAGE_SIZE
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 1
typedef struct page uvm_cpu_chunk_t;
#define UVM_CPU_CHUNK_PAGE_INDEX(chunk, page_index) (page_index)
#else
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 0
typedef struct uvm_cpu_chunk_struct uvm_cpu_chunk_t;
@ -224,13 +227,10 @@ struct uvm_cpu_chunk_struct
// parent.
nv_kref_t refcount;
// Size of the chunk at the time of its creation.
// For chunks, which are the result of a split, this
// value will be the size of the chunk prior to the
// split.
// For chunks resulting from page allocations (physical),
// Size of the chunk.
// For chunks resulting from page allocations (physical chunks),
// this value is the size of the physical allocation.
size_t log2_phys_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);
size_t log2_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);
struct {
// Per-GPU array of DMA mapping addresses for the chunk.
@ -252,6 +252,8 @@ struct uvm_cpu_chunk_struct
// for logical chunks this will be NULL;
unsigned long *dirty_bitmap;
};
#define UVM_CPU_CHUNK_PAGE_INDEX(chunk, page_index) (chunk->region.first)
#endif // UVM_CPU_CHUNK_SIZES == PAGE_SIZE
// Return the set of allowed CPU chunk allocation sizes.
@ -302,22 +304,6 @@ void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *
// NULL is returned.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *block, uvm_page_index_t page_index);
// Return the physical size of the CPU chunk.
// The physical size of the CPU chunk is the size of the physical CPU
// memory backing the CPU chunk. It is set at CPU chunk allocation time.
static uvm_chunk_size_t uvm_cpu_chunk_get_phys_size(uvm_cpu_chunk_t *chunk)
{
#if UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
return (uvm_chunk_size_t)PAGE_SIZE;
#else
return ((uvm_chunk_size_t)1) << chunk->log2_phys_size;
#endif
}
// Return the size of the CPU chunk. While the physical size of the CPU
// chunk reflects the size of the physical memory backing the chunk, this
// size is the effective size of the chunk and changes as a result of CPU
// chunk splits.
uvm_chunk_size_t uvm_cpu_chunk_get_size(uvm_cpu_chunk_t *chunk);
// Return the number of base system pages covered by the CPU chunk.
@ -370,35 +356,27 @@ NvU64 uvm_cpu_chunk_get_gpu_mapping_addr(uvm_va_block_t *block,
// new_size has to be one of the supported CPU chunk allocation sizes and has to
// be smaller than the current size of chunk.
//
// On success, NV_OK is returned. All new chunks will have chunk as parent and
// chunk's size will have been updated to new_size.
//
// Note that due to the way CPU chunks are managed and split, the number of
// newly created chunks will be (size_of(chunk) / new_size) - 1.
//
// On failure NV_ERR_NO_MEMORY will be returned. chunk's size will not be
// modified.
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_chunk_size_t new_size);
// On success, NV_OK is returned. On failure NV_ERR_NO_MEMORY will be returned.
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_chunk_size_t new_size,
uvm_page_index_t page_index,
uvm_cpu_chunk_t **new_chunks);
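A hypothetical caller sketch for the new signature. The choice of chunk->region.first as the page_index argument and the expectation that new_chunks holds uvm_cpu_chunk_get_size(chunk) / new_size entries are assumptions made for illustration, not taken from this header:

// Hypothetical wrapper, not part of the driver: split "chunk" into new_size
// pieces using a caller-provided array.
static NV_STATUS example_split_chunk(uvm_va_block_t *va_block,
                                     uvm_cpu_chunk_t *chunk,
                                     uvm_chunk_size_t new_size,
                                     uvm_cpu_chunk_t **new_chunks)
{
    // new_size must be one of the supported allocation sizes and smaller than
    // the chunk's current size (see the comment above).
    UVM_ASSERT(new_size < uvm_cpu_chunk_get_size(chunk));

    return uvm_cpu_chunk_split(va_block, chunk, new_size, chunk->region.first, new_chunks);
}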
// Merge chunk's parent to the highest possible CPU chunk size fully contained
// within the parent's owning VA block.
// Merge chunks to merge_size.
//
// The size to which chunks are merged is determined by finding the largest
// size from the set of allowed CPU chunk sizes that satisfies both criteria
// below:
// * The VA range of the parent chunk resulting from the merge has to be
// fully contained within the VA block.
// * The start and end VA addresses of the parent based on its physical
// size have to be aligned to the merge size.
// All input chunks must have the same parent and size. If not,
// NV_ERR_INVALID_ARGUMENT is returned.
//
// It is possible that a merge cannot be done if chunk does not have a parent
// (it is a physical chunk), chunk's owning VA block is not the same as
// its parent's owning VA block, or there is no chunk size that satisfied both
// the above criteria.
// If a merge cannot be done, NV_WARN_NOTHING_TO_DO is returned.
//
// Return a pointer to the merged chunk. If a merge could not be done, return
// NULL.
uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk);
// On success, NV_OK is returned and merged_chunk is set to point to the
// merged chunk.
NV_STATUS uvm_cpu_chunk_merge(uvm_va_block_t *va_block,
uvm_cpu_chunk_t **chunks,
size_t num_merge_chunks,
uvm_chunk_size_t merge_size,
uvm_cpu_chunk_t **merged_chunk);
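A hedged sketch of how a caller might drive the new merge signature, mapping the status codes described above; the wrapper itself is hypothetical and assumes chunks/num_chunks came from an earlier split:

static NV_STATUS example_merge_chunks(uvm_va_block_t *va_block,
                                      uvm_cpu_chunk_t **chunks,
                                      size_t num_chunks,
                                      uvm_chunk_size_t merge_size)
{
    uvm_cpu_chunk_t *merged_chunk;
    NV_STATUS status = uvm_cpu_chunk_merge(va_block, chunks, num_chunks, merge_size, &merged_chunk);

    if (status == NV_WARN_NOTHING_TO_DO)
        return NV_OK;   // Parent-less (physical) chunk: nothing to merge.

    if (status != NV_OK)
        return status;  // e.g. NV_ERR_INVALID_ARGUMENT on mismatched parent or size.

    // merged_chunk now covers the combined region at merge_size.
    return NV_OK;
}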
// Mark the CPU sub-page page_index in the CPU chunk as dirty.
// page_index has to be a page within the chunk's region.
@ -414,14 +392,22 @@ bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
#else // UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
static NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_chunk_size_t new_size)
static NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_chunk_size_t new_size,
uvm_page_index_t page_index,
uvm_cpu_chunk_t **new_chunks)
{
return NV_OK;
}
static uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk)
static NV_STATUS uvm_cpu_chunk_merge(uvm_va_block_t *va_block,
uvm_cpu_chunk_t **chunk,
size_t num_merge_chunks,
uvm_chunk_size_t merge_size,
uvm_cpu_chunk_t **merged_chunk)
{
return NULL;
return NV_WARN_NOTHING_TO_DO;
}
static void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)

View File

@ -101,7 +101,7 @@ static NV_STATUS split_as_needed(uvm_va_space_t *va_space,
UVM_ASSERT(PAGE_ALIGNED(addr));
// Look for UVM managed allocations first, then look for HMM policies.
// Look for managed allocations first, then look for HMM policies.
va_range = uvm_va_range_find(va_space, addr);
if (!va_range)
return uvm_hmm_split_as_needed(va_space, addr, split_needed_cb, data);
@ -203,6 +203,10 @@ NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context)
{
uvm_assert_mutex_locked(&va_block->lock);
// TODO: Bug 1750144: remove this restriction when HMM handles setting
// the preferred location semantics instead of just recording the policy.
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
uvm_va_block_mark_cpu_dirty(va_block);
@ -432,10 +436,9 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
va_block_context->policy = uvm_va_range_get_policy(va_block->va_range);
// Read duplication takes precedence over SetAccesedBy. Do not add mappings
// Read duplication takes precedence over SetAccessedBy. Do not add mappings
// if read duplication is enabled.
if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space))
return NV_OK;
@ -617,6 +620,10 @@ NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
NV_STATUS status;
uvm_va_block_retry_t va_block_retry;
// TODO: Bug 3660922: need to implement HMM read duplication support.
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
va_block_set_read_duplication_locked(va_block,
&va_block_retry,
@ -714,6 +721,9 @@ NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
NV_STATUS status = NV_OK;
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
// Restore all SetAccessedBy mappings
status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
va_block_unset_read_duplication_locked(va_block,

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2018-2021 NVIDIA Corporation
Copyright (c) 2018-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -54,7 +54,7 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
{
unsigned long vma_num_pages;
unsigned long outer = start + length;
const bool is_writable = is_write_populate(vma, populate_permissions);
unsigned int gup_flags = is_write_populate(vma, populate_permissions) ? FOLL_WRITE : 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long vm_flags = vma->vm_flags;
bool uvm_managed_vma;
@ -97,7 +97,10 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
if (uvm_managed_vma)
uvm_record_unlock_mmap_lock_read(mm);
ret = NV_GET_USER_PAGES_REMOTE(NULL, mm, start, vma_num_pages, is_writable, 0, pages, NULL);
if (touch)
ret = NV_PIN_USER_PAGES_REMOTE(mm, start, vma_num_pages, gup_flags, pages, NULL, NULL);
else
ret = NV_GET_USER_PAGES_REMOTE(mm, start, vma_num_pages, gup_flags, pages, NULL, NULL);
if (uvm_managed_vma)
uvm_record_lock_mmap_lock_read(mm);
@ -114,7 +117,7 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
for (i = 0; i < ret; i++) {
UVM_ASSERT(pages[i]);
put_page(pages[i]);
NV_UNPIN_USER_PAGE(pages[i]);
}
}
@ -127,7 +130,7 @@ NV_STATUS uvm_populate_pageable_vma(struct vm_area_struct *vma,
for (i = 0; i < vma_num_pages; i++) {
uvm_touch_page(pages[i]);
put_page(pages[i]);
NV_UNPIN_USER_PAGE(pages[i]);
}
}
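For reference, a minimal sketch of the pin/unpin pairing this hunk introduces, reusing the NV_* compatibility wrappers exactly as they appear above; the helper itself, its return-type choice, and the absence of mmap_lock handling are simplifications for illustration only:

// Sketch only: pin nr_pages for write, then drop every pin with the matching
// unpin call. Assumes the caller manages mm references and mmap_lock, as the
// function above does.
static long example_pin_and_release(struct mm_struct *mm,
                                    unsigned long start,
                                    unsigned long nr_pages,
                                    struct page **pages)
{
    long i;
    long pinned = NV_PIN_USER_PAGES_REMOTE(mm, start, nr_pages, FOLL_WRITE, pages, NULL, NULL);

    if (pinned <= 0)
        return pinned;

    for (i = 0; i < pinned; i++)
        NV_UNPIN_USER_PAGE(pages[i]);   // Pair every successful pin with an unpin.

    return pinned;
}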

View File

@ -68,31 +68,7 @@ NV_STATUS uvm_procfs_init()
void uvm_procfs_exit()
{
uvm_procfs_destroy_entry(uvm_proc_dir);
}
// TODO: Bug 1767237: Copied from nv-procfs.c. Refactor it out to
// nv-procfs-common.c.
static void procfs_destroy_entry_with_root(struct proc_dir_entry *entry, struct proc_dir_entry *delimiter)
{
#if defined(NV_PROC_REMOVE_PRESENT)
proc_remove(entry);
#else
while (entry) {
struct proc_dir_entry *next = entry->next;
if (entry->subdir)
procfs_destroy_entry_with_root(entry->subdir, delimiter);
remove_proc_entry(entry->name, entry->parent);
if (entry == delimiter)
break;
entry = next;
}
#endif
}
void uvm_procfs_destroy_entry(struct proc_dir_entry *entry)
{
procfs_destroy_entry_with_root(entry, entry);
proc_remove(uvm_proc_dir);
}
struct proc_dir_entry *uvm_procfs_get_gpu_base_dir()

View File

@ -53,8 +53,6 @@ static bool uvm_procfs_is_debug_enabled(void)
struct proc_dir_entry *uvm_procfs_get_gpu_base_dir(void);
struct proc_dir_entry *uvm_procfs_get_cpu_base_dir(void);
void uvm_procfs_destroy_entry(struct proc_dir_entry *entry);
int uvm_procfs_open_callback(void);
void uvm_procfs_close_callback(void);

View File

@ -121,7 +121,7 @@ NV_STATUS uvm_pushbuffer_create(uvm_channel_manager_t *channel_manager, uvm_push
goto error;
// Verify the GPU can access the pushbuffer.
UVM_ASSERT(uvm_pushbuffer_get_gpu_va_base(pushbuffer) + UVM_PUSHBUFFER_SIZE < gpu->parent->max_host_va);
UVM_ASSERT((uvm_pushbuffer_get_gpu_va_base(pushbuffer) + UVM_PUSHBUFFER_SIZE - 1) < gpu->parent->max_host_va);
bitmap_fill(pushbuffer->idle_chunks, UVM_PUSHBUFFER_CHUNKS);
bitmap_fill(pushbuffer->available_chunks, UVM_PUSHBUFFER_CHUNKS);
@ -372,7 +372,7 @@ void uvm_pushbuffer_destroy(uvm_pushbuffer_t *pushbuffer)
if (pushbuffer == NULL)
return;
uvm_procfs_destroy_entry(pushbuffer->procfs.info_file);
proc_remove(pushbuffer->procfs.info_file);
uvm_rm_mem_free(pushbuffer->memory);
uvm_kvfree(pushbuffer);
@ -448,7 +448,7 @@ void uvm_pushbuffer_end_push(uvm_pushbuffer_t *pushbuffer, uvm_push_t *push, uvm
{
uvm_pushbuffer_chunk_t *chunk = gpfifo_to_chunk(pushbuffer, gpfifo);
uvm_assert_spinlock_locked(&push->channel->pool->lock);
uvm_channel_pool_assert_locked(push->channel->pool);
uvm_spin_lock(&pushbuffer->lock);

View File

@ -166,30 +166,6 @@ void uvm_range_tree_shrink_node(uvm_range_tree_t *tree, uvm_range_tree_node_t *n
node->end = new_end;
}
void uvm_range_tree_adjust_interval(uvm_range_tree_t *tree,
NvU64 addr,
NvU64 *startp,
NvU64 *endp)
{
uvm_range_tree_node_t *node;
NvU64 start = *startp;
NvU64 end = *endp;
uvm_range_tree_for_each_in(node, tree, start, end) {
if (node->start > addr) {
end = node->start - 1;
break;
}
else if (node->end < addr)
start = node->end + 1;
else
UVM_ASSERT_MSG(0, "Found node at address 0x%llx\n", addr);
}
*startp = start;
*endp = end;
}
void uvm_range_tree_split(uvm_range_tree_t *tree,
uvm_range_tree_node_t *existing,
uvm_range_tree_node_t *new)
@ -261,3 +237,55 @@ uvm_range_tree_node_t *uvm_range_tree_iter_first(uvm_range_tree_t *tree, NvU64 s
return NULL;
}
NV_STATUS uvm_range_tree_find_hole(uvm_range_tree_t *tree, NvU64 addr, NvU64 *start, NvU64 *end)
{
uvm_range_tree_node_t *node;
// Find the first node on or after addr, if any
node = uvm_range_tree_iter_first(tree, addr, ULLONG_MAX);
if (node) {
if (node->start <= addr)
return NV_ERR_UVM_ADDRESS_IN_USE;
// node->start can't be 0, otherwise it would contain addr
if (end)
*end = node->start - 1;
node = uvm_range_tree_prev(tree, node);
}
else {
// All nodes in the tree must come before addr, if any exist
node = uvm_range_tree_last(tree);
if (end)
*end = ULLONG_MAX;
}
if (start) {
if (node)
*start = node->end + 1;
else
*start = 0;
}
return NV_OK;
}
NV_STATUS uvm_range_tree_find_hole_in(uvm_range_tree_t *tree, NvU64 addr, NvU64 *start, NvU64 *end)
{
NvU64 temp_start, temp_end;
NV_STATUS status;
UVM_ASSERT(start);
UVM_ASSERT(end);
UVM_ASSERT(*start <= addr);
UVM_ASSERT(*end >= addr);
status = uvm_range_tree_find_hole(tree, addr, &temp_start, &temp_end);
if (status == NV_OK) {
*start = max(temp_start, *start);
*end = min(temp_end, *end);
}
return status;
}

View File

@ -73,11 +73,6 @@ static void uvm_range_tree_remove(uvm_range_tree_t *tree, uvm_range_tree_node_t
// lesser or equal to node->end.
void uvm_range_tree_shrink_node(uvm_range_tree_t *tree, uvm_range_tree_node_t *node, NvU64 new_start, NvU64 new_end);
// Adjust start and end to be the largest contiguous interval surrounding addr
// between *startp and *endp and without overlapping an existing tree node.
// This function assumes there is no node that includes addr.
void uvm_range_tree_adjust_interval(uvm_range_tree_t *tree, NvU64 addr, NvU64 *startp, NvU64 *endp);
// Splits an existing node into two pieces, with the new node always after the
// existing node. The caller must set new->start before calling this function.
// existing should not be modified by the caller. On return, existing will
@ -100,6 +95,16 @@ uvm_range_tree_node_t *uvm_range_tree_merge_next(uvm_range_tree_t *tree, uvm_ran
// Returns the node containing addr, if any
uvm_range_tree_node_t *uvm_range_tree_find(uvm_range_tree_t *tree, NvU64 addr);
// Find the largest hole containing addr but not containing any nodes. If addr
// is contained by a node, NV_ERR_UVM_ADDRESS_IN_USE is returned.
//
// start and end may be NULL.
NV_STATUS uvm_range_tree_find_hole(uvm_range_tree_t *tree, NvU64 addr, NvU64 *start, NvU64 *end);
// Like uvm_range_tree_find_hole, but start and end are in/out parameters that
// clamp the range.
NV_STATUS uvm_range_tree_find_hole_in(uvm_range_tree_t *tree, NvU64 addr, NvU64 *start, NvU64 *end);
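The two hole-finding helpers are easiest to see side by side. A minimal sketch, assuming an already-populated tree; the addresses and variable names are hypothetical, and only the two functions and their status codes come from the declarations above:

static void example_find_holes(uvm_range_tree_t *tree)
{
    NvU64 hole_start, hole_end;
    NvU64 clamp_start = 0x200000;
    NvU64 clamp_end   = 0x2fffff;

    // Largest node-free interval containing 0x280000, bounded only by the
    // neighboring nodes (or by 0/ULLONG_MAX at the edges of the space).
    if (uvm_range_tree_find_hole(tree, 0x280000, &hole_start, &hole_end) == NV_OK)
        UVM_ASSERT(hole_start <= 0x280000 && hole_end >= 0x280000);

    // Same lookup, but the result is additionally clamped to the caller's
    // [clamp_start, clamp_end] window, which must contain the address.
    if (uvm_range_tree_find_hole_in(tree, 0x280000, &clamp_start, &clamp_end) == NV_OK)
        UVM_ASSERT(clamp_start >= 0x200000 && clamp_end <= 0x2fffff);

    // Either call returns NV_ERR_UVM_ADDRESS_IN_USE if a node contains the
    // address.
}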
// Returns the prev/next node in address order, or NULL if none exists
static uvm_range_tree_node_t *uvm_range_tree_prev(uvm_range_tree_t *tree, uvm_range_tree_node_t *node)
{
@ -118,17 +123,6 @@ static uvm_range_tree_node_t *uvm_range_tree_next(uvm_range_tree_t *tree, uvm_ra
// Returns the first node in the range [start, end], if any
uvm_range_tree_node_t *uvm_range_tree_iter_first(uvm_range_tree_t *tree, NvU64 start, NvU64 end);
// Return true if the range tree is empty.
static bool uvm_range_tree_empty(uvm_range_tree_t *tree)
{
return list_empty(&tree->head);
}
static NvU64 uvm_range_tree_node_size(uvm_range_tree_node_t *node)
{
return node->end - node->start + 1;
}
// Returns the node following the provided node in address order, if that node's
// start <= the provided end.
static uvm_range_tree_node_t *uvm_range_tree_iter_next(uvm_range_tree_t *tree, uvm_range_tree_node_t *node, NvU64 end)
@ -139,6 +133,25 @@ static uvm_range_tree_node_t *uvm_range_tree_iter_next(uvm_range_tree_t *tree, u
return NULL;
}
// Return true if the range tree is empty.
static bool uvm_range_tree_empty(uvm_range_tree_t *tree)
{
return list_empty(&tree->head);
}
// Return the last node in the tree, or NULL if none exists
static uvm_range_tree_node_t *uvm_range_tree_last(uvm_range_tree_t *tree)
{
if (list_empty(&tree->head))
return NULL;
return list_last_entry(&tree->head, uvm_range_tree_node_t, list);
}
static NvU64 uvm_range_tree_node_size(uvm_range_tree_node_t *node)
{
return node->end - node->start + 1;
}
#define uvm_range_tree_for_each(node, tree) list_for_each_entry((node), &(tree)->head, list)
#define uvm_range_tree_for_each_safe(node, next, tree) \

View File

@ -303,10 +303,93 @@ error:
return status;
}
static NV_STATUS rtt_check_between(rtt_state_t *state, uvm_range_tree_node_t *lower, uvm_range_tree_node_t *upper)
{
bool hole_exists = true;
NvU64 hole_start = 0, hole_end = ULLONG_MAX;
NvU64 test_start, test_end;
if (lower) {
if (lower->end == ULLONG_MAX) {
UVM_ASSERT(!upper);
hole_exists = false;
}
else {
hole_start = lower->end + 1;
}
}
if (upper) {
if (upper->start == 0) {
UVM_ASSERT(!lower);
hole_exists = false;
}
else {
hole_end = upper->start - 1;
}
}
if (hole_start > hole_end)
hole_exists = false;
if (hole_exists) {
size_t i;
NvU64 hole_mid = hole_start + ((hole_end - hole_start) / 2);
NvU64 inputs[] = {hole_start, hole_mid, hole_end};
for (i = 0; i < ARRAY_SIZE(inputs); i++) {
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, inputs[i]) == NULL);
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, inputs[i], &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == hole_end);
test_start = 0;
test_end = ULLONG_MAX;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, inputs[i], &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == hole_end);
test_start = hole_start;
test_end = inputs[i];
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, inputs[i], &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == inputs[i]);
test_start = inputs[i];
test_end = hole_end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, inputs[i], &test_start, &test_end));
TEST_CHECK_RET(test_start == inputs[i]);
TEST_CHECK_RET(test_end == hole_end);
}
}
else {
test_start = 0;
test_end = ULLONG_MAX;
if (lower) {
MEM_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, lower->end, NULL, NULL),
NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, lower->end, &test_start, &test_end),
NV_ERR_UVM_ADDRESS_IN_USE);
}
if (upper) {
MEM_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, upper->start, NULL, NULL),
NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, upper->start, &test_start, &test_end),
NV_ERR_UVM_ADDRESS_IN_USE);
}
}
return NV_OK;
}
static NV_STATUS rtt_check_node(rtt_state_t *state, uvm_range_tree_node_t *node)
{
uvm_range_tree_node_t *temp, *prev, *next;
NvU64 start, mid, end;
NvU64 hole_start = 0, hole_end = ULLONG_MAX;
start = node->start;
end = node->end;
@ -320,6 +403,18 @@ static NV_STATUS rtt_check_node(rtt_state_t *state, uvm_range_tree_node_t *node)
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, start) == node);
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, mid) == node);
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, end) == node);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, start, NULL, NULL), NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, mid, NULL, NULL), NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, end, NULL, NULL), NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, start, &hole_start, &hole_end),
NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, mid, &hole_start, &hole_end),
NV_ERR_UVM_ADDRESS_IN_USE);
MEM_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, end, &hole_start, &hole_end),
NV_ERR_UVM_ADDRESS_IN_USE);
TEST_CHECK_RET(uvm_range_tree_node_size(node) == end - start + 1);
if (end < ULLONG_MAX)
@ -327,6 +422,8 @@ static NV_STATUS rtt_check_node(rtt_state_t *state, uvm_range_tree_node_t *node)
uvm_range_tree_for_each_in(temp, &state->tree, start, end)
TEST_CHECK_RET(temp == node);
uvm_range_tree_for_each_in_safe(temp, next, &state->tree, start, end)
TEST_CHECK_RET(temp == node);
prev = uvm_range_tree_prev(&state->tree, node);
if (prev) {
@ -341,11 +438,16 @@ static NV_STATUS rtt_check_node(rtt_state_t *state, uvm_range_tree_node_t *node)
if (next) {
TEST_CHECK_RET(node->end < next->start);
TEST_CHECK_RET(uvm_range_tree_prev(&state->tree, next) == node);
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) != node);
}
else {
TEST_CHECK_RET(uvm_range_tree_iter_next(&state->tree, node, ULLONG_MAX) == NULL);
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == node);
}
TEST_NV_CHECK_RET(rtt_check_between(state, prev, node));
TEST_NV_CHECK_RET(rtt_check_between(state, node, next));
return NV_OK;
}
@ -362,13 +464,17 @@ static NV_STATUS rtt_check_iterator_all(rtt_state_t *state)
TEST_CHECK_RET(prev->end < node->start);
TEST_CHECK_RET(uvm_range_tree_prev(&state->tree, node) == prev);
TEST_NV_CHECK_RET(rtt_check_between(state, prev, node));
++iter_count;
prev = node;
expected = uvm_range_tree_next(&state->tree, node);
}
TEST_CHECK_RET(expected == NULL);
TEST_CHECK_RET(expected == NULL);
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == prev);
TEST_CHECK_RET(iter_count == state->count);
TEST_NV_CHECK_RET(rtt_check_between(state, prev, NULL));
iter_count = 0;
expected = NULL;
@ -381,13 +487,17 @@ static NV_STATUS rtt_check_iterator_all(rtt_state_t *state)
TEST_CHECK_RET(prev->end < node->start);
TEST_CHECK_RET(uvm_range_tree_prev(&state->tree, node) == prev);
// Skip rtt_check_between since it was done in the loop above
++iter_count;
prev = node;
expected = uvm_range_tree_next(&state->tree, node);
}
TEST_CHECK_RET(expected == NULL);
TEST_CHECK_RET(expected == NULL);
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == prev);
TEST_CHECK_RET(iter_count == state->count);
return NV_OK;
}
@ -424,20 +534,32 @@ static NV_STATUS rtt_range_add_check(rtt_state_t *state, rtt_range_t *range)
}
}
status = rtt_range_add(state, range, &node);
// Verify tree state
if (overlap) {
// Verify failure
MEM_NV_CHECK_RET(status, NV_ERR_UVM_ADDRESS_IN_USE);
// The tree said there's already a range there. Check whether its
// internal state is consistent.
node = uvm_range_tree_iter_first(&state->tree, range->start, range->end);
TEST_CHECK_RET(node);
TEST_CHECK_RET(rtt_range_overlaps_node(node, range));
}
else {
// Verify success
NvU64 hole_start, hole_end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, range->start, &hole_start, &hole_end));
TEST_CHECK_RET(hole_start <= range->start);
TEST_CHECK_RET(hole_end >= range->end);
hole_start = range->start;
hole_end = range->end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, range->start, &hole_start, &hole_end));
TEST_CHECK_RET(hole_start == range->start);
TEST_CHECK_RET(hole_end == range->end);
}
status = rtt_range_add(state, range, &node);
if (overlap) {
MEM_NV_CHECK_RET(status, NV_ERR_UVM_ADDRESS_IN_USE);
}
else {
MEM_NV_CHECK_RET(status, NV_OK);
status = rtt_check_node(state, node);
}
@ -450,6 +572,7 @@ static NV_STATUS rtt_index_remove_check(rtt_state_t *state, size_t index)
{
uvm_range_tree_node_t *node, *prev, *next;
NvU64 start, end;
NvU64 hole_start, hole_end;
NV_STATUS status;
TEST_CHECK_RET(index < state->count);
@ -472,12 +595,35 @@ static NV_STATUS rtt_index_remove_check(rtt_state_t *state, size_t index)
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, start) == NULL);
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, end) == NULL);
TEST_CHECK_RET(uvm_range_tree_iter_first(&state->tree, start, end) == NULL);
if (prev)
hole_start = start;
hole_end = end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, start, &hole_start, &hole_end));
TEST_CHECK_RET(hole_start == start);
TEST_CHECK_RET(hole_end == end);
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, start, &hole_start, &hole_end));
TEST_CHECK_RET(hole_start <= start);
TEST_CHECK_RET(hole_end >= end);
if (prev) {
TEST_CHECK_RET(uvm_range_tree_next(&state->tree, prev) == next);
if (next)
TEST_CHECK_RET(hole_start == prev->end + 1);
}
if (next) {
TEST_CHECK_RET(uvm_range_tree_prev(&state->tree, next) == prev);
TEST_CHECK_RET(hole_end == next->start - 1);
}
else {
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == prev);
}
if (!prev && !next) {
TEST_CHECK_RET(uvm_range_tree_empty(&state->tree));
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == NULL);
TEST_CHECK_RET(hole_start == 0);
TEST_CHECK_RET(hole_end == ULLONG_MAX);
TEST_CHECK_RET(state->count == 0);
}
else {
@ -749,10 +895,11 @@ static NV_STATUS rtt_index_merge_check_next_val(rtt_state_t *state, NvU64 addr)
static NV_STATUS rtt_directed(rtt_state_t *state)
{
uvm_range_tree_node_t *node;
uvm_range_tree_node_t *node, *next;
// Empty tree
TEST_CHECK_RET(uvm_range_tree_empty(&state->tree));
TEST_CHECK_RET(uvm_range_tree_last(&state->tree) == NULL);
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, 0) == NULL);
TEST_CHECK_RET(uvm_range_tree_find(&state->tree, ULLONG_MAX) == NULL);
uvm_range_tree_for_each(node, &state->tree)
@ -763,6 +910,13 @@ static NV_STATUS rtt_directed(rtt_state_t *state)
TEST_CHECK_RET(0);
uvm_range_tree_for_each_in(node, &state->tree, ULLONG_MAX, ULLONG_MAX)
TEST_CHECK_RET(0);
uvm_range_tree_for_each_in_safe(node, next, &state->tree, 0, 0)
TEST_CHECK_RET(0);
uvm_range_tree_for_each_in_safe(node, next, &state->tree, 0, ULLONG_MAX)
TEST_CHECK_RET(0);
uvm_range_tree_for_each_in_safe(node, next, &state->tree, ULLONG_MAX, ULLONG_MAX)
TEST_CHECK_RET(0);
TEST_NV_CHECK_RET(rtt_check_between(state, NULL, NULL));
// Consume entire range
MEM_NV_CHECK_RET(rtt_range_add_check_val(state, 0, ULLONG_MAX), NV_OK);
@ -1038,8 +1192,8 @@ static NV_STATUS rtt_batch_remove(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM
return NV_OK;
}
// Attempts to shrink a randomly-selected range in the tree. On selecting a range
// of size 1, the attempt is repeated with another range up to the
// Attempts to shrink a randomly-selected range in the tree. On selecting a
// range of size 1, the attempt is repeated with another range up to the
// params->max_attempts threshold.
static NV_STATUS rtt_rand_shrink(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM_PARAMS *params)
{
@ -1151,11 +1305,12 @@ static NV_STATUS rtt_rand_split(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM_P
return NV_OK;
}
// Attempts to merge a randomly-selected range in the tree in a randomly-selected
// direction (next or prev). On selecting a range with a non-adjacent neighbor,
// the attempt is repeated with another range up to the params->max_attempts
// threshold. On reaching the attempt threshold the RNG probabilities are
// adjusted to prefer split operations and NV_ERR_BUSY_RETRY is returned.
// Attempts to merge a randomly-selected range in the tree in a randomly-
// selected direction (next or prev). On selecting a range with a non-adjacent
// neighbor, the attempt is repeated with another range up to the
// params->max_attempts threshold. On reaching the attempt threshold the RNG
// probabilities are adjusted to prefer split operations and NV_ERR_BUSY_RETRY
// is returned.
static NV_STATUS rtt_rand_merge(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM_PARAMS *params)
{
uvm_range_tree_node_t *node;
@ -1236,20 +1391,113 @@ static NV_STATUS rtt_rand_collision_check(rtt_state_t *state, NvU64 max_end)
// in that range in order.
static NV_STATUS rtt_rand_iterator_check(rtt_state_t *state, NvU64 max_end)
{
uvm_range_tree_node_t *node, *prev = NULL;
uvm_range_tree_node_t *node;
uvm_range_tree_node_t *prev = NULL, *first = NULL, *last = NULL, *next = NULL;
size_t i, target_count = 0, iter_count = 0;
NvU64 hole_start, hole_end, test_start, test_end;
rtt_range_t range;
// Generate the range to check
rtt_get_rand_range(&state->rng, max_end, &range);
// Phase 1: Iterate through the unordered list, counting how many nodes we
// ought to see from the tree iterator.
for (i = 0; i < state->count; i++)
target_count += rtt_range_overlaps_node(state->nodes[i], &range);
// ought to see from the tree iterator and finding the boundary nodes.
for (i = 0; i < state->count; i++) {
node = state->nodes[i];
if (rtt_range_overlaps_node(node, &range)) {
++target_count;
// first is the lowest node with any overlap
if (!first || first->start > node->start)
first = node;
// last is the highest node with any overlap
if (!last || last->end < node->end)
last = node;
}
else {
// prev is the highest node with end < range.start
if (node->end < range.start && (!prev || node->end > prev->end))
prev = node;
// next is the lowest node with start > range.end
if (node->start > range.end && (!next || node->start < next->start))
next = node;
}
}
// Phase 2: Use the tree iterators
// The holes between the nodes will be checked within the iterator loop.
// Here we check the holes at the start and end of the range, if any.
if (first) {
if (range.start < first->start) {
// Check hole at range.start
hole_start = prev ? prev->end + 1 : 0;
hole_end = first->start - 1;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, range.start, &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == hole_end);
test_start = range.start;
test_end = ULLONG_MAX;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, range.start, &test_start, &test_end));
TEST_CHECK_RET(test_start == range.start);
TEST_CHECK_RET(test_end == hole_end);
}
// Else, no hole at start
}
else {
// No nodes intersect the range
UVM_ASSERT(target_count == 0);
UVM_ASSERT(!last);
hole_start = prev ? prev->end + 1 : 0;
hole_end = next ? next->start - 1 : ULLONG_MAX;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, range.start, &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == hole_end);
test_start = range.start;
test_end = range.end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, range.start, &test_start, &test_end));
TEST_CHECK_RET(test_start == range.start);
TEST_CHECK_RET(test_end == range.end);
}
if (last && range.end > last->end) {
// Check hole at range.end
hole_start = last->end + 1;
hole_end = next ? next->start - 1 : ULLONG_MAX;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole(&state->tree, range.end, &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == hole_end);
test_start = 0;
test_end = range.end;
TEST_NV_CHECK_RET(uvm_range_tree_find_hole_in(&state->tree, range.end, &test_start, &test_end));
TEST_CHECK_RET(test_start == hole_start);
TEST_CHECK_RET(test_end == range.end);
}
// Phase 2: Use the tree iterator
uvm_range_tree_for_each_in(node, &state->tree, range.start, range.end) {
TEST_CHECK_RET(rtt_range_overlaps_node(node, &range));
if (prev) {
TEST_CHECK_RET(prev->end < node->start);
TEST_NV_CHECK_RET(rtt_check_between(state, prev, node));
}
++iter_count;
prev = node;
}
TEST_CHECK_RET(iter_count == target_count);
prev = NULL;
iter_count = 0;
uvm_range_tree_for_each_in_safe(node, next, &state->tree, range.start, range.end) {
TEST_CHECK_RET(rtt_range_overlaps_node(node, &range));
if (prev)
TEST_CHECK_RET(prev->end < node->start);
@ -1277,9 +1525,9 @@ static rtt_op_t rtt_get_rand_op(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM_P
if (state->count == 1 && state->count == params->max_ranges)
return RTT_OP_REMOVE;
// r_group selects between the two groups of operations, either {add/remove/shrink}
// or {merge/split}. r_sub selects the sub operation within that group based
// on the current probability settings.
// r_group selects between the two groups of operations, either {add/remove/
// shrink} or {merge/split}. r_sub selects the sub operation within that
// group based on the current probability settings.
r_group = uvm_test_rng_range_32(&state->rng, 1, 100);
r_sub = uvm_test_rng_range_32(&state->rng, 1, 100);
@ -1287,7 +1535,9 @@ static rtt_op_t rtt_get_rand_op(rtt_state_t *state, UVM_TEST_RANGE_TREE_RANDOM_P
if (r_group <= params->add_remove_shrink_group_probability) {
if (r_sub <= state->shrink_probability)
return RTT_OP_SHRINK;
// After giving shrink a chance, redo the randomization for add/remove.
// After giving shrink a chance, redo the randomization for add/
// remove.
r_sub = uvm_test_rng_range_32(&state->rng, 1, 100);
if (r_sub <= state->add_chance)

View File

@ -60,10 +60,22 @@ static NV_STATUS map_cpu(uvm_rm_mem_t *rm_mem)
return NV_OK;
}
static NV_STATUS check_alignment(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, NvU64 alignment)
{
// Alignment requirements only apply to mappings in the UVM-owned VA space
if (alignment != 0) {
bool is_proxy_va_space = false;
NvU64 gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, is_proxy_va_space);
TEST_CHECK_RET(IS_ALIGNED(gpu_va, alignment));
}
return NV_OK;
}
static NV_STATUS map_gpu_owner(uvm_rm_mem_t *rm_mem, NvU64 alignment)
{
uvm_gpu_t *gpu = rm_mem->gpu_owner;
NvU64 gpu_va;
// The memory should have been automatically mapped in the GPU owner
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu(rm_mem, gpu));
@ -73,9 +85,7 @@ static NV_STATUS map_gpu_owner(uvm_rm_mem_t *rm_mem, NvU64 alignment)
// located in vidmem.
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu) == uvm_gpu_uses_proxy_channel_pool(gpu));
gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
if (alignment)
TEST_CHECK_RET(IS_ALIGNED(gpu_va, alignment));
TEST_NV_CHECK_RET(check_alignment(rm_mem, gpu, alignment));
// Explicitly mapping or unmapping to the GPU that owns the allocation is
// not allowed, so the testing related to GPU owners is simpler than that of
@ -87,7 +97,6 @@ static NV_STATUS map_other_gpus(uvm_rm_mem_t *rm_mem, uvm_va_space_t *va_space,
{
uvm_gpu_t *gpu_owner = rm_mem->gpu_owner;
uvm_gpu_t *gpu;
NvU64 gpu_va;
for_each_va_space_gpu(gpu, va_space) {
if (gpu == gpu_owner)
@ -119,9 +128,7 @@ static NV_STATUS map_other_gpus(uvm_rm_mem_t *rm_mem, uvm_va_space_t *va_space,
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu) == uvm_gpu_uses_proxy_channel_pool(gpu));
gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
if (alignment)
TEST_CHECK_RET(IS_ALIGNED(gpu_va, alignment));
TEST_NV_CHECK_RET(check_alignment(rm_mem, gpu, alignment));
}
return NV_OK;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -247,6 +247,7 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CHANNEL_STRESS, uvm_test_channel_stress);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CE_SANITY, uvm_test_ce_sanity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_HOST_SANITY, uvm_test_host_sanity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN, uvm_test_va_space_mm_or_current_retain);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_BLOCK_INFO, uvm_test_va_block_info);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_LOCK_SANITY, uvm_test_lock_sanity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_PERF_UTILS_SANITY, uvm_test_perf_utils_sanity);
@ -328,6 +329,8 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
uvm_test_va_range_inject_add_gpu_va_space_error);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY, uvm_test_destroy_gpu_va_space_delay);
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_HMM_INIT, uvm_test_hmm_init);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
}
return -EINVAL;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVidia Corporation
Copyright (c) 2015-2022 NVidia Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -23,9 +23,7 @@
#ifndef __UVM_TEST_IOCTL_H__
#define __UVM_TEST_IOCTL_H__
#ifndef __KERNEL__
#endif
#include "uvm_types.h"
#include "uvm_ioctl.h"
#include "nv_uvm_types.h"
@ -151,6 +149,14 @@ typedef enum
UVM_TEST_VA_RANGE_TYPE_MAX
} UVM_TEST_VA_RANGE_TYPE;
typedef enum
{
UVM_TEST_RANGE_SUBTYPE_INVALID = 0,
UVM_TEST_RANGE_SUBTYPE_UVM,
UVM_TEST_RANGE_SUBTYPE_HMM,
UVM_TEST_RANGE_SUBTYPE_MAX
} UVM_TEST_RANGE_SUBTYPE;
// Keep this in sync with uvm_read_duplication_t in uvm_va_range.h
typedef enum
{
@ -169,6 +175,7 @@ typedef struct
NvBool is_zombie; // Out
// Note: if this is a zombie, this field is meaningless.
NvBool owned_by_calling_process; // Out
NvU32 subtype; // Out (UVM_TEST_RANGE_SUBTYPE)
} UVM_TEST_VA_RANGE_INFO_MANAGED;
#define UVM_TEST_VA_RANGE_INFO UVM_TEST_IOCTL_BASE(4)
@ -176,6 +183,10 @@ typedef struct
{
NvU64 lookup_address NV_ALIGN_BYTES(8); // In
// For HMM ranges va_range_start/end will contain the lookup address but not
// necessarily the maximal range over which the returned policy applies.
// For example there could be adjacent ranges with the same policy, implying
// the returned range could be as small as a page in the worst case for HMM.
NvU64 va_range_start NV_ALIGN_BYTES(8); // Out
NvU64 va_range_end NV_ALIGN_BYTES(8); // Out, inclusive
NvU32 read_duplication; // Out (UVM_TEST_READ_DUPLICATION_POLICY)
@ -536,12 +547,14 @@ typedef struct
// If user_pages_allocation_retry_force_count is non-0 then the next count user
// memory allocations under the VA block will be forced to do allocation-retry.
//
// If cpu_pages_allocation_error_count is not zero, the subsequent operations
// that need to allocate CPU pages will fail with NV_ERR_NO_MEMORY the next
// cpu_pages_allocation_error_count times. If cpu_pages_allocation_error_count
// is equal to ~0U, the count is infinite.
//
// If eviction_failure is NV_TRUE, the next eviction attempt from the VA block
// will fail with NV_ERR_NO_MEMORY.
//
// If cpu_pages_allocation_error is NV_TRUE, the subsequent operations that
// need to allocate CPU pages will fail with NV_ERR_NO_MEMORY.
//
// If populate_failure is NV_TRUE, a retry error will be injected after the next
// successful user memory allocation under the VA block but before that
// allocation is used by the block. This is similar to
@ -558,8 +571,8 @@ typedef struct
NvU32 page_table_allocation_retry_force_count; // In
NvU32 user_pages_allocation_retry_force_count; // In
NvU32 cpu_chunk_allocation_size_mask; // In
NvU32 cpu_pages_allocation_error_count; // In
NvBool eviction_error; // In
NvBool cpu_pages_allocation_error; // In
NvBool populate_error; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS;
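A hypothetical test-side sketch of filling the updated struct; only the fields visible in this hunk are touched, the remaining fields stay zero-initialized, and issuing the ioctl itself is out of scope:

static void example_request_cpu_alloc_failures(void)
{
    UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS params = {0};

    params.cpu_pages_allocation_error_count = 3;      // Fail the next 3 CPU page allocations.
    // params.cpu_pages_allocation_error_count = ~0U; // Or: fail indefinitely.
    (void)params;                                     // The ioctl call is not shown here.
}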
@ -1111,10 +1124,14 @@ typedef struct
//
// If migrate_vma_allocation_fail_nth is greater than 0, the nth page
// allocation within migrate_vma will fail.
//
// If va_block_allocation_fail_nth is greater than 0, the nth call to
// uvm_va_block_find_create() will fail with NV_ERR_NO_MEMORY.
#define UVM_TEST_VA_SPACE_INJECT_ERROR UVM_TEST_IOCTL_BASE(72)
typedef struct
{
NvU32 migrate_vma_allocation_fail_nth; // In
NvU32 va_block_allocation_fail_nth; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_VA_SPACE_INJECT_ERROR_PARAMS;
@ -1341,6 +1358,28 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_HOST_SANITY_PARAMS;
// Calls uvm_va_space_mm_or_current_retain() on a VA space,
// then releases the va_space_mm and returns.
#define UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN UVM_TEST_IOCTL_BASE(89)
typedef struct
{
// User address of a flag to act as a semaphore. If non-NULL, the address
// is set to 1 after successful retain but before the sleep.
NvU64 retain_done_ptr NV_ALIGN_BYTES(8); // In
// Approximate duration for which to sleep with the va_space_mm retained.
NvU64 sleep_us NV_ALIGN_BYTES(8); // In
// On success, this contains the value of mm->mm_users before mmput() is
// called.
NvU64 mm_users NV_ALIGN_BYTES(8); // Out
// NV_ERR_PAGE_TABLE_NOT_AVAIL Could not retain va_space_mm
// (uvm_va_space_mm_or_current_retain returned
// NULL)
NV_STATUS rmStatus; // Out
} UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN_PARAMS;
#define UVM_TEST_GET_USER_SPACE_END_ADDRESS UVM_TEST_IOCTL_BASE(90)
typedef struct
{
@ -1396,6 +1435,19 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED_PARAMS;
#define UVM_TEST_HMM_INIT UVM_TEST_IOCTL_BASE(97)
typedef struct
{
NV_STATUS rmStatus; // Out
} UVM_TEST_HMM_INIT_PARAMS;
#define UVM_TEST_SPLIT_INVALIDATE_DELAY UVM_TEST_IOCTL_BASE(98)
typedef struct
{
NvU64 delay_us; // In
NV_STATUS rmStatus; // Out
} UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS;
#ifdef __cplusplus
}
#endif

View File

@ -430,10 +430,12 @@ static bool thread_context_non_interrupt_add(uvm_thread_context_t *thread_contex
if (thread_context->array_index == UVM_THREAD_CONTEXT_ARRAY_SIZE) {
NvU64 old = atomic64_cmpxchg(&array_entry->task, 0, task);
// Task already added a different thread context. There is nothing
// to undo because the current thread context has not been inserted.
if (old == task)
// Task already added a different thread context. The current thread
// context has not been inserted but needs to be freed.
if (old == task) {
thread_context_non_interrupt_deinit(thread_context);
return false;
}
// Speculatively add the current thread context.
if (old == 0)
@ -444,6 +446,7 @@ static bool thread_context_non_interrupt_add(uvm_thread_context_t *thread_contex
// Task already added a different thread context to the array, so
// undo the speculative insertion
atomic64_set(&table_entry->array[thread_context->array_index].task, 0);
thread_context_non_interrupt_deinit(thread_context);
return false;
}
@ -474,6 +477,9 @@ static bool thread_context_non_interrupt_add(uvm_thread_context_t *thread_contex
added = true;
}
if (!added)
thread_context_non_interrupt_deinit(thread_context);
spin_unlock_irqrestore(&table_entry->tree_lock, flags);
return added;
}

View File

@ -218,7 +218,7 @@ static void uvm_put_user_pages_dirty(struct page **pages, NvU64 page_count)
for (i = 0; i < page_count; i++) {
set_page_dirty(pages[i]);
put_page(pages[i]);
NV_UNPIN_USER_PAGE(pages[i]);
}
}
@ -262,7 +262,7 @@ static NV_STATUS map_user_pages(NvU64 user_va, NvU64 size, void **addr, struct p
}
nv_mmap_read_lock(current->mm);
ret = NV_GET_USER_PAGES(user_va, num_pages, 1, 0, *pages, vmas);
ret = NV_PIN_USER_PAGES(user_va, num_pages, FOLL_WRITE, *pages, vmas);
nv_mmap_read_unlock(current->mm);
if (ret != num_pages) {
status = NV_ERR_INVALID_ARGUMENT;
@ -1116,6 +1116,19 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
uvm_tools_broadcast_event(&entry);
}
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space)
{
UvmEventEntry entry;
if (!va_space->tools.enabled)
return;
entry.testEventData.splitInvalidate.eventType = UvmEventTypeTestHmmSplitInvalidate;
uvm_down_read(&va_space->tools.lock);
uvm_tools_record_event(va_space, &entry);
uvm_up_read(&va_space->tools.lock);
}
// This function is used as a begin marker to group all migrations within a VA
// block that are performed in the same call to
// block_copy_resident_pages_between. All of these are pushed to the same
@ -2101,8 +2114,7 @@ exit:
uvm_global_mask_release(retained_global_gpus);
if (mm)
uvm_va_space_mm_or_current_release(va_space, mm);
uvm_va_space_mm_or_current_release(va_space, mm);
uvm_kvfree(global_gpus);
uvm_kvfree(retained_global_gpus);

View File

@ -115,6 +115,8 @@ void uvm_tools_broadcast_access_counter(uvm_gpu_t *gpu,
const uvm_access_counter_buffer_entry_t *buffer_entry,
bool on_managed);
void uvm_tools_test_hmm_split_invalidate(uvm_va_space_t *va_space);
// schedules completed events and then waits for them to be dispatched
void uvm_tools_flush_events(void);

View File

@ -34,9 +34,6 @@
#include "nvstatus.h"
#include "nvCpuUuid.h"
#ifndef __KERNEL__
#endif
/*******************************************************************************
UVM stream types
@ -359,9 +356,10 @@ typedef enum
UvmEventNumTypes,
// ---- Private event types for uvm tests
UvmEventTestTypesFirst = 63,
UvmEventTestTypesFirst = 62,
UvmEventTypeTestAccessCounter = UvmEventTestTypesFirst,
UvmEventTypeTestHmmSplitInvalidate = UvmEventTestTypesFirst,
UvmEventTypeTestAccessCounter = UvmEventTestTypesFirst + 1,
UvmEventTestTypesLast = UvmEventTypeTestAccessCounter,
@ -387,6 +385,7 @@ typedef enum
#define UVM_EVENT_ENABLE_MAP_REMOTE ((NvU64)1 << UvmEventTypeMapRemote)
#define UVM_EVENT_ENABLE_EVICTION ((NvU64)1 << UvmEventTypeEviction)
#define UVM_EVENT_ENABLE_TEST_ACCESS_COUNTER ((NvU64)1 << UvmEventTypeTestAccessCounter)
#define UVM_EVENT_ENABLE_TEST_HMM_SPLIT_INVALIDATE ((NvU64)1 << UvmEventTypeTestHmmSplitInvalidate)
//------------------------------------------------------------------------------
// Information associated with a memory violation event
@ -977,6 +976,11 @@ typedef struct
NvU64 instancePtr;
} UvmEventTestAccessCounterInfo;
typedef struct
{
NvU8 eventType;
} UvmEventTestSplitInvalidateInfo;
//------------------------------------------------------------------------------
// Entry added in the event queue buffer when an enabled event occurs. For
// compatibility with all tools ensure that this structure is 64 bit aligned.
@ -1010,6 +1014,7 @@ typedef struct
NvU8 eventType;
UvmEventTestAccessCounterInfo accessCounter;
UvmEventTestSplitInvalidateInfo splitInvalidate;
} testEventData;
};
} UvmEventEntry;

View File

@ -618,7 +618,7 @@ static NV_STATUS uvm_register_channel(uvm_va_space_t *va_space,
uvm_va_space_up_read_rm(va_space);
// The mm needs to be locked in order to remove stale HMM va_blocks.
mm = uvm_va_space_mm_retain_lock(va_space);
mm = uvm_va_space_mm_or_current_retain_lock(va_space);
// We have the RM objects now so we know what the VA range layout should be.
// Re-take the VA space lock in write mode to create and insert them.
@ -653,10 +653,8 @@ static NV_STATUS uvm_register_channel(uvm_va_space_t *va_space,
if (status != NV_OK)
goto error_under_write;
if (mm) {
if (mm)
uvm_up_read_mmap_lock_out_of_order(mm);
uvm_va_space_mm_release(va_space);
}
// The subsequent mappings will need to call into RM, which means we must
// downgrade the VA space lock to read mode. Although we're in read mode no
@ -681,6 +679,7 @@ static NV_STATUS uvm_register_channel(uvm_va_space_t *va_space,
goto error_under_read;
uvm_va_space_up_read_rm(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
uvm_gpu_release(gpu);
return NV_OK;
@ -688,7 +687,7 @@ error_under_write:
if (user_channel->gpu_va_space)
uvm_user_channel_detach(user_channel, &deferred_free_list);
uvm_va_space_up_write(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
uvm_va_space_mm_or_current_release_unlock(va_space, mm);
uvm_deferred_free_object_list(&deferred_free_list);
uvm_gpu_release(gpu);
return status;
@ -714,10 +713,12 @@ error_under_read:
if (user_channel->gpu_va_space) {
uvm_user_channel_detach(user_channel, &deferred_free_list);
uvm_va_space_up_write(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
uvm_deferred_free_object_list(&deferred_free_list);
}
else {
uvm_va_space_up_write(va_space);
uvm_va_space_mm_or_current_release(va_space, mm);
}
uvm_user_channel_release(user_channel);

File diff suppressed because it is too large

View File

@ -249,7 +249,7 @@ struct uvm_va_block_struct
// Lock protecting the block. See the comment at the top of uvm.c.
uvm_mutex_t lock;
// Parent VA range. UVM managed blocks have this set. HMM blocks will have
// Parent VA range. Managed blocks have this set. HMM blocks will have
// va_range set to NULL and hmm.va_space set instead. Dead blocks that are
// waiting for the last ref count to be removed have va_range and
// hmm.va_space set to NULL (could be either type of block).
@ -437,13 +437,22 @@ struct uvm_va_block_struct
uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
// Prefetch information that is updated while holding the va_block lock but
// records state while the lock is not held.
struct
{
uvm_processor_id_t last_migration_proc_id;
NvU16 fault_migrations_to_last_proc;
} prefetch_info;
#if UVM_IS_CONFIG_HMM()
struct
{
// The MMU notifier is registered per va_block.
struct mmu_interval_notifier notifier;
// Parent VA space pointer. It is NULL for UVM managed blocks or if
// Parent VA space pointer. It is NULL for managed blocks or if
// the HMM block is dead. This field can be read while holding the
// block lock and is only modified while holding the va_space write
// lock and va_block lock (same as the va_range pointer).
@ -488,21 +497,27 @@ struct uvm_va_block_wrapper_struct
// uvm_cpu_chunk_allocation_sizes module parameter.
NvU32 cpu_chunk_allocation_size_mask;
// Force the next eviction attempt on this block to fail. Used for
// testing only.
bool inject_eviction_error;
// Subsequent operations that need to allocate CPU pages will fail. As
// opposed to other error injection settings, this one is persistent.
// opposed to other error injection settings, this one fails N times
// and then succeeds instead of failing on the Nth try. A value of ~0u
// means fail indefinitely.
// This is because this error is supposed to be fatal and tests verify
// the state of the VA blocks after the failure. However, some tests
// use kernels to trigger migrations and a fault replay could trigger
// a successful migration if this error flag is cleared.
bool inject_cpu_pages_allocation_error;
NvU32 inject_cpu_pages_allocation_error_count;
// Force the next eviction attempt on this block to fail. Used for
// testing only.
bool inject_eviction_error;
// Force the next successful chunk allocation to then fail. Used for testing
// only to simulate driver metadata allocation failure.
bool inject_populate_error;
// Force the next split on this block to fail.
// Set by error injection ioctl for testing purposes only.
bool inject_split_error;
} test;
};
@ -639,8 +654,18 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
memset(va_block_context, 0xff, sizeof(*va_block_context));
va_block_context->mm = mm;
#if UVM_IS_CONFIG_HMM()
va_block_context->hmm.vma = NULL;
#endif
}
// Check that a single policy covers the given region for the given va_block.
// This always returns true and is intended to only be used with UVM_ASSERT().
// Locking: the va_block lock must be held.
bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
uvm_va_policy_t *policy,
uvm_va_block_region_t region);
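As the comment notes, the check is meant to live inside an assert. A one-line usage sketch; the surrounding wrapper is hypothetical:

static void example_assert_single_policy(uvm_va_block_t *va_block,
                                         uvm_va_block_context_t *va_block_context,
                                         uvm_va_block_region_t region)
{
    UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
}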
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
// and page masks could simplify the below APIs and their implementations
// at the cost of having to scan the whole mask for small regions.
@ -651,8 +676,10 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// pages in the region which are present in the mask.
//
// prefetch_page_mask may be passed as a subset of page_mask when cause is
// UVM_MAKE_RESIDENT_CAUSE_FAULT to indicate pages that have been pulled due
// to automatic page prefetching heuristics. For pages in this mask,
// UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT,
// UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT, or
// UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER to indicate pages that have been
// pulled due to automatic page prefetching heuristics. For pages in this mask,
// UVM_MAKE_RESIDENT_CAUSE_PREFETCH will be reported in migration events,
// instead.
//
@ -674,20 +701,24 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// block's lock has been unlocked and relocked as part of the call and that the
// whole sequence of operations performed under the block's lock needs to be
// attempted again. To facilitate that, the caller needs to provide the same
// va_block_retry struct for each attempt that has been initialized before the first
// attempt and needs to be deinitialized after the last one. Most callers can
// just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the caller.
// va_block_retry struct for each attempt that has been initialized before the
// first attempt and needs to be deinitialized after the last one. Most callers
// can just use UVM_VA_BLOCK_LOCK_RETRY() that takes care of that for the
// caller.
//
// If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
// user memory is guaranteed not to happen. Allocation-retry of page tables can
// still occur though.
//
// va_block_context must be non-NULL. This function will set a bit in
// va_block_context must not be NULL. This function will set a bit in
// va_block_context->make_resident.pages_changed_residency for each page that
// changed residency (due to a migration or first population) as a result of the
// operation. This function only sets bits in that mask. It is the caller's
// responsibility to decide whether to zero the mask first.
//
// va_block_context->policy must also be set by the caller for the given region.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// Notably any status other than NV_OK indicates that the block's lock might
// have been unlocked and relocked.
//
@ -710,6 +741,8 @@ NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
// where they are unmapped
// - All remote mappings (due to either SetAccessedBy or performance heuristics)
// are broken
// - Only managed va_blocks are supported.
// TODO: Bug 3660922: need to implement HMM read duplication support.
// - LOCKING: If va_block_context->mm != NULL, va_block_context->mm->mmap_lock
// must be held in at least read mode.
NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
@ -721,6 +754,34 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
const uvm_page_mask_t *prefetch_page_mask,
uvm_make_resident_cause_t cause);
// Similar to uvm_va_block_make_resident() (read documentation there). The
// difference is that source pages are only copied to the destination and the
// residency is not updated until uvm_va_block_make_resident_post() is called.
// Otherwise, the combination of uvm_va_block_make_resident_pre() and
// uvm_va_block_make_resident_post() should be the same as just calling
// uvm_va_block_make_resident().
// This split is needed when using migrate_vma_setup() and migrate_vma_pages()
// so that when migrate_vma_pages() indicates a page is not migrating, the
// va_block state is not updated.
// LOCKING: The caller must hold the va_block lock.
NV_STATUS uvm_va_block_make_resident_pre(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask,
const uvm_page_mask_t *prefetch_page_mask,
uvm_make_resident_cause_t cause);
// The page_mask must be the same as, or a subset of, the page_mask passed to
// uvm_va_block_make_resident_pre(). This step updates the residency and breaks
// read duplication.
// LOCKING: The caller must hold the va_block lock.
void uvm_va_block_make_resident_post(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask);
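// Illustrative sketch, not part of the driver: the intended ordering of the
// pre/post split around the Linux migrate_vma API, per the comments above.
// Populating struct migrate_vma (from <linux/migrate.h>) is omitted, the
// uvm_page_mask_clear() helper and the src-array indexing relative to
// region.first are assumptions of this sketch, and the NV_ERR_NO_MEMORY
// return for a setup failure is an arbitrary choice.
// LOCKING: assumes the caller holds the va_block lock across the sequence.
static NV_STATUS example_make_resident_with_migrate_vma(uvm_va_block_t *va_block,
                                                        uvm_va_block_retry_t *va_block_retry,
                                                        uvm_va_block_context_t *va_block_context,
                                                        uvm_processor_id_t dest_id,
                                                        uvm_va_block_region_t region,
                                                        uvm_page_mask_t *page_mask,
                                                        uvm_make_resident_cause_t cause,
                                                        struct migrate_vma *args)
{
    uvm_page_index_t page_index;
    NV_STATUS status;

    if (migrate_vma_setup(args) != 0)
        return NV_ERR_NO_MEMORY;

    // Stage the copies to dest_id without updating residency yet.
    status = uvm_va_block_make_resident_pre(va_block,
                                            va_block_retry,
                                            va_block_context,
                                            dest_id,
                                            region,
                                            page_mask,
                                            NULL, // no prefetch mask
                                            cause);
    if (status != NV_OK) {
        // Abort the migration and restore the source pages.
        migrate_vma_finalize(args);
        return status;
    }

    // Ask the kernel to commit the migration; it may refuse some pages.
    migrate_vma_pages(args);

    // Drop refused pages from the mask so residency is only updated for pages
    // that actually migrated.
    for_each_va_block_page_in_region(page_index, region) {
        if (!(args->src[page_index - region.first] & MIGRATE_PFN_MIGRATE))
            uvm_page_mask_clear(page_mask, page_index);
    }

    // Now update residency and break read duplication for the migrated pages.
    uvm_va_block_make_resident_post(va_block, va_block_context, region, page_mask);

    migrate_vma_finalize(args);

    return NV_OK;
}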
// Creates or upgrades a mapping from the input processor to the given virtual
// address region. Pages which already have new_prot permissions or higher are
// skipped, so this call ensures that the range is mapped with at least new_prot
@ -749,7 +810,8 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
// pages because the earlier operation can cause a PTE split or merge which is
// assumed by the later operation.
//
// va_block_context must not be NULL.
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@ -805,7 +867,7 @@ NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
// pages because the earlier operation can cause a PTE split or merge which is
// assumed by the later operation.
//
// va_block_context must not be NULL.
// va_block_context must not be NULL. The va_block_context->policy is unused.
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@ -837,12 +899,20 @@ NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
// up-to-date data.
// - Unmap the preferred location's processor from any pages in this region
// which are not resident on the preferred location.
//
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: The caller must hold the VA block lock.
NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context);
// Maps the given processor to all resident pages in this block, as allowed by
// location and policy. Waits for the operation to complete before returning.
// This function should only be called with managed va_blocks.
//
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
// != NULL, va_block_context->mm->mmap_lock must be held in at least
@ -852,8 +922,10 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
uvm_processor_id_t processor_id);
// Breaks SetAccessedBy and remote mappings
// This function should only be called with managed va_blocks.
//
// va_block_context must NOT be NULL
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
// != NULL, va_block_context->mm->mmap_lock must be held in at least
@ -862,8 +934,10 @@ NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context);
// Restores SetAccessedBy mappings
// This function should only be called with managed va_blocks.
//
// va_block_context must NOT be NULL
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: This takes and releases the VA block lock. If va_block_context->mm
// != NULL, va_block_context->mm->mmap_lock must be held in at least
@ -871,6 +945,29 @@ NV_STATUS uvm_va_block_set_read_duplication(uvm_va_block_t *va_block,
NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context);
// Check if processor_id is allowed to access the va_block with access_type
// permissions. Return values:
//
// NV_ERR_INVALID_ADDRESS The VA block is logically dead (zombie)
// NV_ERR_INVALID_ACCESS_TYPE The vma corresponding to the VA range does not
// allow access_type permissions, or migration is
// disallowed and processor_id cannot access the
// range remotely (UVM-Lite).
// NV_ERR_INVALID_OPERATION The access would violate the policies specified
// by UvmPreventMigrationRangeGroups.
//
// va_block_context must not be NULL, va_block_context->policy must be valid,
// and if the va_block is an HMM block, va_block_context->hmm.vma must be valid,
// which also means va_block_context->mm is not NULL, retained, and locked
// for at least read.
// Locking: the va_block lock must be held.
NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_page_index_t page_index,
uvm_fault_type_t access_type,
bool allow_migration);
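// Illustrative sketch, not part of the driver: one way a fault servicing path
// could act on the return values documented above. The helper name and the
// boolean result are inventions of this sketch; real servicing code would
// translate these statuses into specific fault cancellation reasons.
// LOCKING: assumes the va_block lock is held, as required above.
static bool example_fault_is_serviceable(uvm_va_block_t *va_block,
                                         uvm_va_block_context_t *va_block_context,
                                         uvm_processor_id_t processor_id,
                                         uvm_page_index_t page_index,
                                         uvm_fault_type_t access_type,
                                         bool allow_migration)
{
    NV_STATUS status = uvm_va_block_check_logical_permissions(va_block,
                                                              va_block_context,
                                                              processor_id,
                                                              page_index,
                                                              access_type,
                                                              allow_migration);

    switch (status) {
        case NV_OK:
            // Permissions are fine: service the fault.
            return true;
        case NV_ERR_INVALID_ADDRESS:
            // The block is a zombie: cancel the fault as a fatal address error.
            return false;
        case NV_ERR_INVALID_ACCESS_TYPE:
        case NV_ERR_INVALID_OPERATION:
            // The VMA permissions or the range-group migration policy forbid
            // this access: cancel with a permission error.
            return false;
        default:
            // Any other status is unexpected here; treat it as fatal.
            return false;
    }
}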
// API for access privilege revocation
//
// Revoke prot_to_revoke access permissions for the given processor.
@ -898,7 +995,7 @@ NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
// different pages because the earlier operation can cause a PTE split or merge
// which is assumed by the later operation.
//
// va_block_context must not be NULL.
// va_block_context must not be NULL. The va_block_context->policy is unused.
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the entries in the
@ -938,7 +1035,8 @@ NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
// processor_id, which triggered the migration and should have already been
// mapped).
//
// va_block_context must not be NULL.
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// This function acquires/waits for the va_block tracker and updates that
// tracker with any new work pushed.
@ -968,7 +1066,8 @@ NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
// Note that this can return NV_ERR_MORE_PROCESSING_REQUIRED just like
// uvm_va_block_map() indicating that the operation needs to be retried.
//
// va_block_context must not be NULL.
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: The caller must hold the va block lock. If va_block_context->mm !=
// NULL, va_block_context->mm->mmap_lock must be held in at least read
@ -989,6 +1088,8 @@ NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_spa
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// va_block_context must not be NULL. The va_block_context->policy is unused.
//
// LOCKING: The caller must hold the va_block lock. If block_context->mm is not
// NULL, the caller must hold mm->mmap_lock in at least read mode.
void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
@ -1057,10 +1158,7 @@ NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
// Exactly the same split semantics as uvm_va_block_split, including error
// handling except the existing_va_block block lock needs to be held and
// the new_va_block has to be preallocated.
//
// new_va_block's va_range is set to new_va_range before any reverse mapping is
// established to the new block, but the caller is responsible for inserting the
// new block into the range.
// Also note that the existing_va_block lock may be dropped and re-acquired.
NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
NvU64 new_end,
uvm_va_block_t *new_va_block,
@ -1076,6 +1174,7 @@ NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
// - va_space lock must be held in at least read mode
//
// service_context->block_context.mm is ignored and vma->vm_mm is used instead.
// service_context->block_context.policy is set by this function.
//
// Returns NV_ERR_INVALID_ACCESS_TYPE if a CPU mapping to fault_addr cannot be
// accessed, for example because it's within a range group which is non-
@ -1089,6 +1188,8 @@ NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
// (migrations, cache invalidates, etc.) in response to the given service block
// context
//
// service_context->block_context.policy is set by this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
// read mode, if valid.
@ -1132,10 +1233,18 @@ static inline NvU64 uvm_va_block_cpu_page_address(uvm_va_block_t *block, uvm_pag
return block->start + PAGE_SIZE * page_index;
}
// Get the physical address on the given GPU for the given residency
uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_id_t residency,
uvm_gpu_t *gpu);
// Get the page physical address on the given GPU
//
// This will assert that GPU state is indeed present.
uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_gpu_t *gpu);
uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_gpu_t *gpu);
static bool uvm_va_block_contains_address(uvm_va_block_t *block, NvU64 address)
{
@ -1191,26 +1300,28 @@ NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t
// Same as uvm_va_block_find except that the block is created if not found.
// If addr is covered by a UVM_VA_RANGE_TYPE_MANAGED va_range, a managed block
// will be created. Otherwise, if addr is not covered by any va_range, mm is
// non-NULL, and HMM is enabled in the va_space, an HMM block will be created.
// In either case, if mm is non-NULL, it must be retained and locked in at
// least read mode. Return values:
// will be created. Otherwise, if addr is not covered by any va_range, HMM is
// enabled in the va_space, and va_block_context and va_block_context->mm are
// non-NULL, then an HMM block will be created and va_block_context->hmm.vma is
// set to the VMA covering 'addr'. The va_block_context->policy field is left
// unchanged.
// In either case, if va_block_context->mm is non-NULL, it must be retained and
// locked in at least read mode. Return values:
// NV_ERR_INVALID_ADDRESS addr is not a UVM_VA_RANGE_TYPE_MANAGED va_range nor
// an HMM-enabled VMA.
// NV_ERR_NO_MEMORY memory could not be allocated.
NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 addr,
uvm_va_block_context_t *va_block_context,
uvm_va_block_t **out_block);
// Same as uvm_va_block_find_create except that only UVM managed va_blocks are
// Same as uvm_va_block_find_create except that only managed va_blocks are
// created if not already present in the VA range.
static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_block_t **out_block)
{
return uvm_va_block_find_create(va_space, NULL, addr, NULL, out_block);
return uvm_va_block_find_create(va_space, addr, NULL, out_block);
}
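// Illustrative sketch, not part of the driver: the documented difference
// between the two lookup paths above. Passing a NULL va_block_context (or
// using the managed-only wrapper) restricts creation to managed blocks, while
// a context with a retained, read-locked mm also allows HMM blocks to be
// created. The helper name is an invention of this sketch; va_space and mm
// locking are the caller's responsibility and are omitted here.
static NV_STATUS example_lookup_or_create_block(uvm_va_space_t *va_space,
                                                uvm_va_block_context_t *va_block_context,
                                                NvU64 addr,
                                                uvm_va_block_t **out_block)
{
    if (!va_block_context) {
        // Managed ranges only: NV_ERR_INVALID_ADDRESS if addr is not covered
        // by a UVM_VA_RANGE_TYPE_MANAGED va_range.
        return uvm_va_block_find_create_managed(va_space, addr, out_block);
    }

    // Managed or HMM: on success for an HMM block, va_block_context->hmm.vma
    // is set to the VMA covering addr.
    return uvm_va_block_find_create(va_space, addr, va_block_context, out_block);
}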
// Look up a chunk backing a specific address within the VA block. Returns NULL if none.
@ -1232,7 +1343,8 @@ typedef enum
// The caller needs to handle allocation-retry. va_block_retry can be NULL if
// the destination is the CPU.
//
// va_block_context must not be NULL.
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
// NULL, va_block_context->mm->mmap_lock must be held in at least
@ -1249,6 +1361,9 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
//
// The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
//
// va_block_context must not be NULL. The caller is not required to set
// va_block_context->policy.
//
// The caller needs to support allocation-retry of page tables.
//
// LOCKING: The caller must hold the va_block lock
@ -1317,6 +1432,8 @@ void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block);
// successful, NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case the
// block's lock was unlocked and relocked.
//
// va_block_context must not be NULL. The va_block_context->policy is unused.
//
// LOCKING: The caller must hold the va_block lock.
NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu);
@ -1396,6 +1513,26 @@ static uvm_va_block_region_t uvm_va_block_region_from_block(uvm_va_block_t *va_b
return uvm_va_block_region(0, uvm_va_block_num_cpu_pages(va_block));
}
// Create a block region from a va block and page mask. Note that the region
// covers the first through the last set bit and may have unset bits in between.
static uvm_va_block_region_t uvm_va_block_region_from_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask)
{
uvm_va_block_region_t region;
uvm_page_index_t outer = uvm_va_block_num_cpu_pages(va_block);
region.first = find_first_bit(page_mask->bitmap, outer);
if (region.first >= outer) {
region = uvm_va_block_region(0, 0);
}
else {
// At least one bit is set so find_last_bit() should not return 'outer'.
region.outer = find_last_bit(page_mask->bitmap, outer) + 1;
UVM_ASSERT(region.outer <= outer);
}
return region;
}
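// Illustrative sketch, not part of the driver: because the region computed
// above can contain unset bits, callers typically still test the mask for each
// page inside it. Counting set pages stands in for real per-page work here;
// the helper name is an invention of this sketch.
static size_t example_count_pages_in_mask(uvm_va_block_t *va_block, const uvm_page_mask_t *page_mask)
{
    uvm_va_block_region_t region = uvm_va_block_region_from_mask(va_block, page_mask);
    uvm_page_index_t page_index;
    size_t count = 0;

    for_each_va_block_page_in_region(page_index, region) {
        if (uvm_page_mask_test(page_mask, page_index))
            count++;
    }

    return count;
}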
static bool uvm_page_mask_test(const uvm_page_mask_t *mask, uvm_page_index_t page_index)
{
UVM_ASSERT(page_index < PAGES_PER_UVM_VA_BLOCK);
@ -1715,61 +1852,6 @@ static NvU64 uvm_reverse_map_end(const uvm_reverse_map_t *reverse_map)
#define for_each_va_block_page(page_index, va_block) \
for_each_va_block_page_in_region((page_index), uvm_va_block_region_from_block(va_block))
static void uvm_va_block_bitmap_tree_init_from_page_count(uvm_va_block_bitmap_tree_t *bitmap_tree, size_t page_count)
{
bitmap_tree->leaf_count = page_count;
bitmap_tree->level_count = ilog2(roundup_pow_of_two(page_count)) + 1;
uvm_page_mask_zero(&bitmap_tree->pages);
}
static void uvm_va_block_bitmap_tree_init(uvm_va_block_bitmap_tree_t *bitmap_tree, uvm_va_block_t *va_block)
{
size_t num_pages = uvm_va_block_num_cpu_pages(va_block);
uvm_va_block_bitmap_tree_init_from_page_count(bitmap_tree, num_pages);
}
static void uvm_va_block_bitmap_tree_iter_init(const uvm_va_block_bitmap_tree_t *bitmap_tree,
uvm_page_index_t page_index,
uvm_va_block_bitmap_tree_iter_t *iter)
{
UVM_ASSERT(bitmap_tree->level_count > 0);
UVM_ASSERT_MSG(page_index < bitmap_tree->leaf_count,
"%zd vs %zd",
(size_t)page_index,
(size_t)bitmap_tree->leaf_count);
iter->level_idx = bitmap_tree->level_count - 1;
iter->node_idx = page_index;
}
static uvm_va_block_region_t uvm_va_block_bitmap_tree_iter_get_range(const uvm_va_block_bitmap_tree_t *bitmap_tree,
const uvm_va_block_bitmap_tree_iter_t *iter)
{
NvU16 range_leaves = uvm_perf_tree_iter_leaf_range(bitmap_tree, iter);
NvU16 range_start = uvm_perf_tree_iter_leaf_range_start(bitmap_tree, iter);
uvm_va_block_region_t subregion = uvm_va_block_region(range_start, range_start + range_leaves);
UVM_ASSERT(iter->level_idx >= 0);
UVM_ASSERT(iter->level_idx < bitmap_tree->level_count);
return subregion;
}
static NvU16 uvm_va_block_bitmap_tree_iter_get_count(const uvm_va_block_bitmap_tree_t *bitmap_tree,
const uvm_va_block_bitmap_tree_iter_t *iter)
{
uvm_va_block_region_t subregion = uvm_va_block_bitmap_tree_iter_get_range(bitmap_tree, iter);
return uvm_page_mask_region_weight(&bitmap_tree->pages, subregion);
}
#define uvm_va_block_bitmap_tree_traverse_counters(counter,tree,page,iter) \
for (uvm_va_block_bitmap_tree_iter_init((tree), (page), (iter)), \
(counter) = uvm_va_block_bitmap_tree_iter_get_count((tree), (iter)); \
(iter)->level_idx >= 0; \
(counter) = --(iter)->level_idx < 0? 0: \
uvm_va_block_bitmap_tree_iter_get_count((tree), (iter)))
// Return the block region covered by the given chunk size. page_index must be
// any page within the block known to be covered by the chunk.
static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
@ -1898,6 +1980,12 @@ uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block,
// returned.
uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size);
// Returns the largest sub-region of 'region' which can fit big pages.
// If the region cannot fit any big pages, an invalid region (0, 0) is returned.
uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
NvU32 big_page_size);
// Returns the big page index (the bit index within
// uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
// page_index cannot be covered by a big PTE due to alignment or block size,
@ -1907,7 +1995,14 @@ size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t pa
// Returns the new residency for a page that faulted or triggered access
// counter notifications. The read_duplicate output parameter indicates if the
// page meets the requirements to be read-duplicated.
//
// va_block_context must not be NULL, va_block_context->policy must be valid,
// and if the va_block is an HMM block, va_block_context->hmm.vma must be valid,
// which also means va_block_context->mm is not NULL, retained, and locked
// for at least read. See the comments for uvm_va_block_check_policy_is_valid()
// and uvm_hmm_va_block_context_vma_is_valid() in uvm_hmm.h.
// Locking: the va_block lock must be held.
uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_index_t page_index,
uvm_processor_id_t processor_id,
NvU32 access_type_mask,

View File

@ -75,28 +75,6 @@ typedef struct
DECLARE_BITMAP(bitmap, PAGES_PER_UVM_VA_BLOCK);
} uvm_page_mask_t;
// Encapsulates a counter tree built on top of a page mask bitmap in
// which each leaf represents a page in the block. It contains
// leaf_count and level_count so that it can use some macros for
// perf trees
typedef struct
{
uvm_page_mask_t pages;
NvU16 leaf_count;
NvU8 level_count;
} uvm_va_block_bitmap_tree_t;
// Iterator for the bitmap tree. It contains level_idx and node_idx so
// that it can use some macros for perf trees
typedef struct
{
s8 level_idx;
uvm_page_index_t node_idx;
} uvm_va_block_bitmap_tree_iter_t;
// When updating GPU PTEs, this struct describes the new arrangement of PTE
// sizes. It is calculated before the operation is applied so we know which PTE
// sizes to allocate.
@ -127,11 +105,6 @@ typedef struct
// that region should be 4k, and that some of those 4k PTEs will be written
// by the operation.
DECLARE_BITMAP(big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
// These are the big PTE regions which will no longer have any valid
// mappings after the operation. Only the bits which are set in
// big_ptes_covered are valid.
DECLARE_BITMAP(big_ptes_fully_unmapped, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
} uvm_va_block_new_pte_state_t;
// Event that triggered the call to uvm_va_block_make_resident/
@ -269,7 +242,8 @@ typedef struct
typedef enum
{
UVM_VA_BLOCK_TRANSFER_MODE_MOVE = 1,
UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2
UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2,
UVM_VA_BLOCK_TRANSFER_MODE_COPY_ONLY = 3
} uvm_va_block_transfer_mode_t;
struct uvm_reverse_map_struct

View File

@ -49,8 +49,9 @@ uvm_va_policy_t *uvm_va_policy_get(uvm_va_block_t *va_block, NvU64 addr)
return node ? &node->policy : &uvm_va_policy_default;
}
else
else {
return uvm_va_range_get_policy(va_block->va_range);
}
}
#if UVM_IS_CONFIG_HMM()

View File

@ -50,7 +50,7 @@ typedef enum
//
// A policy covers one or more contiguous Linux VMAs, or a portion of a VMA, and
// does not cover non-existent VMAs.
// The VA range is determined from either the uvm_va_range_t for UVM managed
// The VA range is determined from either the uvm_va_range_t for managed
// allocations or the uvm_va_policy_node_t for HMM allocations.
//
typedef struct uvm_va_policy_struct
@ -94,6 +94,12 @@ bool uvm_va_policy_is_read_duplicate(uvm_va_policy_t *policy, uvm_va_space_t *va
// Locking: The va_block lock must be held.
uvm_va_policy_t *uvm_va_policy_get(uvm_va_block_t *va_block, NvU64 addr);
// Return a uvm_va_policy_node_t given a uvm_va_policy_t pointer.
static uvm_va_policy_node_t *uvm_va_policy_node_from_policy(uvm_va_policy_t *policy)
{
return container_of(policy, uvm_va_policy_node_t, policy);
}
#if UVM_IS_CONFIG_HMM()
// Module load/exit
@ -239,6 +245,11 @@ static NV_STATUS uvm_va_policy_set_range(uvm_va_block_t *va_block,
return NV_OK;
}
static uvm_va_policy_node_t *uvm_va_policy_node_iter_first(uvm_va_block_t *va_block, NvU64 start, NvU64 end)
{
return NULL;
}
#endif // UVM_IS_CONFIG_HMM()
#endif // __UVM_VA_POLICY_H__

Some files were not shown because too many files have changed in this diff.