530.30.02

Commit 4397463e73 (parent e598191e8e)
Author: Andy Ritger, 2023-02-28 11:12:44 -08:00
928 changed files with 124728 additions and 88525 deletions

View File

@ -1,5 +1,13 @@
# Changelog
## Release 530 Entries
### [530.30.02] 2023-02-28
#### Fixed
- Add support for resizable BAR on Linux when NVreg_EnableResizableBar=1 module param is set. [#3](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/3) by @sjkelly
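A minimal usage sketch (the parameter name comes from the entry above; everything else is illustrative and assumes the module is not already loaded):

    # Enable resizable BAR support when loading the nvidia module
    modprobe nvidia NVreg_EnableResizableBar=1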
## Release 525 Entries
### [525.89.02] 2023-02-08
@ -18,6 +26,10 @@
### [525.60.11] 2022-11-28
#### Fixed
- Fixed nvenc compatibility with usermode clients [#104](https://github.com/NVIDIA/open-gpu-kernel-modules/issues/104)
### [525.53] 2022-11-10
#### Changed

View File

@ -1,7 +1,7 @@
# NVIDIA Linux Open GPU Kernel Module Source
This is the source release of the NVIDIA Linux open GPU kernel modules,
version 525.89.02.
version 530.30.02.
## How to Build
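A minimal sketch of the typical build and install sequence, assuming the repository's standard `modules` and `modules_install` Makefile targets:

    # Build the open kernel modules against the running kernel (targets assumed)
    make modules -j$(nproc)

    # Install the built modules (run as root)
    make modules_install -j$(nproc)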
@ -17,7 +17,7 @@ as root:
Note that the kernel modules built here must be used with GSP
firmware and user-space NVIDIA GPU driver components from a corresponding
525.89.02 driver release. This can be achieved by installing
530.30.02 driver release. This can be achieved by installing
the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
option. E.g.,
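A sketch of that invocation, with an illustrative installer filename:

    # Install only the user-space driver components, keeping the locally
    # built open kernel modules (filename is illustrative)
    sh ./NVIDIA-Linux-x86_64-530.30.02.run --no-kernel-modules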
@ -167,7 +167,7 @@ for the target kernel.
## Compatible GPUs
The open-gpu-kernel-modules can be used on any Turing or later GPU
(see the table below). However, in the 525.89.02 release,
(see the table below). However, in the 530.30.02 release,
GeForce and Workstation support is still considered alpha-quality.
To enable use of the open kernel modules on GeForce and Workstation GPUs,
@ -175,7 +175,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
parameter to 1. For more details, see the NVIDIA GPU driver end user
README here:
https://us.download.nvidia.com/XFree86/Linux-x86_64/525.89.02/README/kernel_open.html
https://us.download.nvidia.com/XFree86/Linux-x86_64/530.30.02/README/kernel_open.html
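As a usage sketch (the configuration file name is illustrative), the parameter can be set persistently through a modprobe configuration file:

    # Opt in to the open kernel modules on GeForce/Workstation GPUs (run as root)
    echo "options nvidia NVreg_OpenRmEnableUnsupportedGpus=1" \
        > /etc/modprobe.d/nvidia-open.conf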
In the below table, if three IDs are listed, the first is the PCI Device
ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI

View File

@ -70,9 +70,13 @@ $(foreach _module, $(NV_KERNEL_MODULES), \
EXTRA_CFLAGS += -I$(src)/common/inc
EXTRA_CFLAGS += -I$(src)
EXTRA_CFLAGS += -Wall -MD $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"525.89.02\"
EXTRA_CFLAGS += -DNV_VERSION_STRING=\"530.30.02\"
ifneq ($(SYSSRCHOST1X),)
EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
endif
EXTRA_CFLAGS += -Wno-unused-function
@ -259,6 +263,7 @@ NV_HEADER_PRESENCE_TESTS = \
linux/platform/tegra/dce/dce-client-ipc.h \
linux/nvhost.h \
linux/nvhost_t194.h \
linux/host1x-next.h \
asm/book3s/64/hash-64k.h \
asm/set_memory.h \
asm/prom.h \

View File

@ -81,12 +81,12 @@ static inline const char *nv_firmware_path(
{
switch (fw_chip_family)
{
case NV_FIRMWARE_CHIP_FAMILY_AD10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_ad10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_AD10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_ga10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_GH100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU11X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_tu10x.bin");
@ -100,12 +100,12 @@ static inline const char *nv_firmware_path(
{
switch (fw_chip_family)
{
case NV_FIRMWARE_CHIP_FAMILY_AD10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_log_ad10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_AD10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_log_ga10x.bin");
case NV_FIRMWARE_CHIP_FAMILY_GH100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA100: // fall through
case NV_FIRMWARE_CHIP_FAMILY_GA10X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU11X: // fall through
case NV_FIRMWARE_CHIP_FAMILY_TU10X:
return NV_FIRMWARE_PATH_FOR_FILENAME("gsp_log_tu10x.bin");
@ -125,7 +125,7 @@ static inline const char *nv_firmware_path(
// which will then be invoked (at the top-level) for each
// gsp_*.bin (but not gsp_log_*.bin)
#if defined(NV_FIRMWARE_DECLARE_GSP_FILENAME)
NV_FIRMWARE_DECLARE_GSP_FILENAME("gsp_ad10x.bin")
NV_FIRMWARE_DECLARE_GSP_FILENAME("gsp_ga10x.bin")
NV_FIRMWARE_DECLARE_GSP_FILENAME("gsp_tu10x.bin")
#endif // defined(NV_FIRMWARE_DECLARE_GSP_FILENAME)

View File

@ -90,30 +90,6 @@ typedef enum VGPU_DEVICE_STATE_E
NV_VGPU_DEV_IN_USE = 2
} VGPU_DEVICE_STATE;
typedef enum _VMBUS_CMD_TYPE
{
VMBUS_CMD_TYPE_INVALID = 0,
VMBUS_CMD_TYPE_SETUP = 1,
VMBUS_CMD_TYPE_SENDPACKET = 2,
VMBUS_CMD_TYPE_CLEANUP = 3,
} VMBUS_CMD_TYPE;
typedef struct
{
NvU32 request_id;
NvU32 page_count;
NvU64 *pPfns;
void *buffer;
NvU32 bufferlen;
} vmbus_send_packet_cmd_params;
typedef struct
{
NvU32 override_sint;
NvU8 *nv_guid;
} vmbus_setup_cmd_params;
/*
* Function prototypes
*/

View File

@ -104,7 +104,7 @@ typedef struct nv_ioctl_rm_api_version
#define NV_RM_API_VERSION_CMD_STRICT 0
#define NV_RM_API_VERSION_CMD_RELAXED '1'
#define NV_RM_API_VERSION_CMD_OVERRIDE '2'
#define NV_RM_API_VERSION_CMD_QUERY '2'
#define NV_RM_API_VERSION_REPLY_UNRECOGNIZED 0
#define NV_RM_API_VERSION_REPLY_RECOGNIZED 1

View File

@ -633,6 +633,26 @@ static NvBool nv_numa_node_has_memory(int node_id)
free_pages(ptr, order); \
}
static inline pgprot_t nv_adjust_pgprot(pgprot_t vm_prot, NvU32 extra)
{
pgprot_t prot = __pgprot(pgprot_val(vm_prot) | extra);
#if defined(CONFIG_AMD_MEM_ENCRYPT) && defined(NV_PGPROT_DECRYPTED_PRESENT)
/*
* When AMD memory encryption is enabled, device memory mappings with the
* C-bit set read as 0xFF, so ensure the bit is cleared for user mappings.
*
* If cc_mkdec() is present, then pgprot_decrypted() can't be used.
*/
#if defined(NV_CC_MKDEC_PRESENT)
prot = __pgprot(__sme_clr(pgprot_val(vm_prot)));
#else
prot = pgprot_decrypted(prot);
#endif
#endif
return prot;
}
#if defined(PAGE_KERNEL_NOENC)
#if defined(__pgprot_mask)
#define NV_PAGE_KERNEL_NOCACHE_NOENC __pgprot_mask(__PAGE_KERNEL_NOCACHE)
@ -654,7 +674,8 @@ static inline NvUPtr nv_vmap(struct page **pages, NvU32 page_count,
#if defined(PAGE_KERNEL_NOENC)
if (unencrypted)
{
prot = cached ? PAGE_KERNEL_NOENC : NV_PAGE_KERNEL_NOCACHE_NOENC;
prot = cached ? nv_adjust_pgprot(PAGE_KERNEL_NOENC, 0) :
nv_adjust_pgprot(NV_PAGE_KERNEL_NOCACHE_NOENC, 0);
}
else
#endif
@ -939,26 +960,6 @@ static inline int nv_remap_page_range(struct vm_area_struct *vma,
return ret;
}
static inline pgprot_t nv_adjust_pgprot(pgprot_t vm_prot, NvU32 extra)
{
pgprot_t prot = __pgprot(pgprot_val(vm_prot) | extra);
#if defined(CONFIG_AMD_MEM_ENCRYPT) && defined(NV_PGPROT_DECRYPTED_PRESENT)
/*
* When AMD memory encryption is enabled, device memory mappings with the
* C-bit set read as 0xFF, so ensure the bit is cleared for user mappings.
*
* If cc_mkdec() is present, then pgprot_decrypted() can't be used.
*/
#if defined(NV_CC_MKDEC_PRESENT)
prot = __pgprot(__sme_clr(pgprot_val(vm_prot)));
#else
prot = pgprot_decrypted(prot);
#endif
#endif
return prot;
}
static inline int nv_io_remap_page_range(struct vm_area_struct *vma,
NvU64 phys_addr, NvU64 size, NvU32 extra_prot)
{
@ -1182,7 +1183,7 @@ typedef struct nv_alloc_s {
NvBool zeroed : 1;
NvBool aliased : 1;
NvBool user : 1;
NvBool node0 : 1;
NvBool node : 1;
NvBool peer_io : 1;
NvBool physical : 1;
NvBool unencrypted : 1;
@ -1196,6 +1197,7 @@ typedef struct nv_alloc_s {
unsigned int pid;
struct page **user_pages;
NvU64 guest_id; /* id of guest VM */
NvS32 node_id; /* Node id for memory allocation when node is set in flags */
void *import_priv;
struct sg_table *import_sgt;
} nv_alloc_t;
@ -1436,6 +1438,24 @@ struct nv_dma_device {
NvBool nvlink;
};
#if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
/*
* acpi data storage structure
*
* This structure retains the pointer to the device,
* and any other baggage we want to carry along
*
*/
typedef struct
{
nvidia_stack_t *sp;
struct acpi_device *device;
struct acpi_handle *handle;
void *notifier_data;
int notify_handler_installed;
} nv_acpi_t;
#endif
/* linux-specific version of old nv_state_t */
/* this is a general os-specific state structure. the first element *must* be
the general state structure, for the generic unix-based code */
@ -1530,8 +1550,13 @@ typedef struct nv_linux_state_s {
/* Per-device notifier block for ACPI events */
struct notifier_block acpi_nb;
#if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
nv_acpi_t* nv_acpi_object;
#endif
/* Lock serializing ISRs for different SOC vectors */
nv_spinlock_t soc_isr_lock;
void *soc_bh_mutex;
struct nv_timer snapshot_timer;
nv_spinlock_t snapshot_timer_lock;
@ -1577,24 +1602,6 @@ extern struct rw_semaphore nv_system_pm_lock;
extern NvBool nv_ats_supported;
#if defined(NV_LINUX_ACPI_EVENTS_SUPPORTED)
/*
* acpi data storage structure
*
* This structure retains the pointer to the device,
* and any other baggage we want to carry along
*
*/
typedef struct
{
nvidia_stack_t *sp;
struct acpi_device *device;
struct acpi_handle *handle;
int notify_handler_installed;
} nv_acpi_t;
#endif
/*
* file-private data
* hide a pointer to our data structures in a file-private ptr
@ -1744,6 +1751,7 @@ static inline NV_STATUS nv_check_gpu_state(nv_state_t *nv)
extern NvU32 NVreg_EnableUserNUMAManagement;
extern NvU32 NVreg_RegisterPCIDriver;
extern NvU32 NVreg_EnableResizableBar;
extern NvU32 num_probed_nv_devices;
extern NvU32 num_nv_devices;

View File

@ -27,6 +27,9 @@
#include <linux/pci.h>
#include "nv-linux.h"
#define NV_GPU_BAR1 1
#define NV_GPU_BAR3 3
int nv_pci_register_driver(void);
void nv_pci_unregister_driver(void);
int nv_pci_count_devices(void);

View File

@ -315,6 +315,7 @@ typedef enum
NV_SOC_IRQ_DPAUX_TYPE,
NV_SOC_IRQ_GPIO_TYPE,
NV_SOC_IRQ_HDACODEC_TYPE,
NV_SOC_IRQ_TCPC2DISP_TYPE,
NV_SOC_IRQ_INVALID_TYPE
} nv_soc_irq_type_t;
@ -329,6 +330,7 @@ typedef struct nv_soc_irq_info_s {
NvU32 gpio_num;
NvU32 dpaux_instance;
} irq_data;
NvS32 ref_count;
} nv_soc_irq_info_t;
#define NV_MAX_SOC_IRQS 6
@ -384,9 +386,11 @@ typedef struct nv_state_t
NvS32 current_soc_irq;
NvU32 num_soc_irqs;
NvU32 hdacodec_irq;
NvU32 tcpc2disp_irq;
NvU8 *soc_dcb_blob;
NvU32 soc_dcb_size;
NvU32 disp_sw_soc_chip_id;
NvBool soc_is_dpalt_mode_supported;
NvU32 igpu_stall_irq[NV_IGPU_MAX_STALL_IRQS];
NvU32 igpu_nonstall_irq;
@ -649,7 +653,8 @@ static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
{
return ((nv->fb) && (offset >= nv->fb->cpu_address) &&
return ((nv->fb) && (nv->fb->size != 0) &&
(offset >= nv->fb->cpu_address) &&
((offset + (length - 1)) >= offset) &&
((offset + (length - 1)) <= (nv->fb->cpu_address + (nv->fb->size - 1))));
}
@ -739,7 +744,7 @@ nv_state_t* NV_API_CALL nv_get_ctl_state (void);
void NV_API_CALL nv_set_dma_address_size (nv_state_t *, NvU32 );
NV_STATUS NV_API_CALL nv_alias_pages (nv_state_t *, NvU32, NvU32, NvU32, NvU64, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvBool, NvU32, NvBool, NvBool, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_alloc_pages (nv_state_t *, NvU32, NvBool, NvU32, NvBool, NvBool, NvS32, NvU64 *, void **);
NV_STATUS NV_API_CALL nv_free_pages (nv_state_t *, NvU32, NvBool, NvU32, void *);
NV_STATUS NV_API_CALL nv_register_user_pages (nv_state_t *, NvU64, NvU64 *, void *, void **);
@ -915,7 +920,6 @@ NV_STATUS NV_API_CALL rm_write_registry_string (nvidia_stack_t *, nv_state_t *
void NV_API_CALL rm_parse_option_string (nvidia_stack_t *, const char *);
char* NV_API_CALL rm_remove_spaces (const char *);
char* NV_API_CALL rm_string_token (char **, const char);
void NV_API_CALL rm_vgpu_vfio_set_driver_vm(nvidia_stack_t *, NvBool);
NV_STATUS NV_API_CALL rm_run_rc_callback (nvidia_stack_t *, nv_state_t *);
void NV_API_CALL rm_execute_work_item (nvidia_stack_t *, void *);
@ -985,11 +989,12 @@ const char* NV_API_CALL rm_get_dynamic_power_management_status(nvidia_stack_t *,
const char* NV_API_CALL rm_get_gpu_gcx_support(nvidia_stack_t *, nv_state_t *, NvBool);
void NV_API_CALL rm_acpi_notify(nvidia_stack_t *, nv_state_t *, NvU32);
void NV_API_CALL rm_acpi_nvpcf_notify(nvidia_stack_t *);
NvBool NV_API_CALL rm_is_altstack_in_use(void);
/* vGPU VFIO specific functions */
NV_STATUS NV_API_CALL nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU32, NvU16 *, NvU32);
NV_STATUS NV_API_CALL nv_vgpu_create_request(nvidia_stack_t *, nv_state_t *, const NvU8 *, NvU32, NvU16 *, NvU32, NvBool *);
NV_STATUS NV_API_CALL nv_vgpu_delete(nvidia_stack_t *, const NvU8 *, NvU16);
NV_STATUS NV_API_CALL nv_vgpu_get_type_ids(nvidia_stack_t *, nv_state_t *, NvU32 *, NvU32 *, NvBool, NvU8, NvBool);
NV_STATUS NV_API_CALL nv_vgpu_get_type_info(nvidia_stack_t *, nv_state_t *, NvU32, char *, int, NvU8);

View File

@ -921,6 +921,23 @@ NV_STATUS nvUvmInterfaceGetNonReplayableFaults(UvmGpuFaultInfo *pFaultInfo,
void *pFaultBuffer,
NvU32 *numFaults);
/*******************************************************************************
nvUvmInterfaceFlushReplayableFaultBuffer
This function sends an RPC to GSP in order to flush the HW replayable fault buffer.
NOTES:
- This function DOES NOT acquire the RM API or GPU locks. That is because
it is called during fault servicing, which could produce deadlocks.
Arguments:
device[IN] - Device handle associated with the gpu
Error codes:
NV_ERR_INVALID_ARGUMENT
*/
NV_STATUS nvUvmInterfaceFlushReplayableFaultBuffer(uvmGpuDeviceHandle device);
/*******************************************************************************
nvUvmInterfaceInitAccessCntrInfo
@ -1054,11 +1071,13 @@ void nvUvmInterfaceP2pObjectDestroy(uvmGpuSessionHandle session,
hMemory[IN] - Memory handle.
offset [IN] - Offset from the beginning of the allocation
where PTE mappings should begin.
Should be aligned with pagesize associated
Should be aligned with mappingPagesize
in gpuExternalMappingInfo associated
with the allocation.
size [IN] - Length of the allocation for which PTEs
should be built.
Should be aligned with pagesize associated
Should be aligned with mappingPagesize
in gpuExternalMappingInfo associated
with the allocation.
size = 0 will be interpreted as the total size
of the allocation.

View File

@ -110,7 +110,7 @@ typedef struct UvmGpuMemoryInfo_tag
NvBool deviceDescendant;
// Out: Page size associated with the phys alloc.
NvU32 pageSize;
NvU64 pageSize;
// Out: Set to TRUE, if the allocation is contiguous.
NvBool contig;
@ -306,6 +306,7 @@ typedef struct UvmGpuChannelAllocParams_tag
// interpreted as UVM_GPU_CHANNEL_ENGINE_TYPE
NvU32 engineType;
} UvmGpuChannelAllocParams;
typedef struct UvmGpuPagingChannelAllocParams_tag
@ -371,7 +372,6 @@ typedef enum
UVM_LINK_TYPE_NVLINK_2,
UVM_LINK_TYPE_NVLINK_3,
UVM_LINK_TYPE_NVLINK_4,
UVM_LINK_TYPE_C2C,
} UVM_LINK_TYPE;
typedef struct UvmGpuCaps_tag
@ -409,7 +409,7 @@ typedef struct UvmGpuCaps_tag
typedef struct UvmGpuAddressSpaceInfo_tag
{
NvU32 bigPageSize;
NvU64 bigPageSize;
NvBool atsEnabled;
@ -430,7 +430,7 @@ typedef struct UvmGpuAddressSpaceInfo_tag
typedef struct UvmGpuAllocInfo_tag
{
NvU64 gpuPhysOffset; // Returns gpuPhysOffset if contiguous requested
NvU32 pageSize; // default is RM big page size - 64K or 128 K" else use 4K or 2M
NvU64 pageSize; // default is RM big page size - 64K or 128 K" else use 4K or 2M
NvU64 alignment; // Virtual alignment
NvBool bContiguousPhysAlloc; // Flag to request contiguous physical allocation
NvBool bMemGrowsDown; // Causes RM to reserve physical heap from top of FB
@ -516,6 +516,13 @@ typedef struct UvmGpuExternalMappingInfo_tag
// In: Size of the buffer to store PTEs (in bytes).
NvU64 pteBufferSize;
// In: Page size for mapping
// If this field is passed as 0, the page size
// of the allocation is used for mapping.
// nvUvmInterfaceGetExternalAllocPtes must pass
// this field as zero.
NvU64 mappingPageSize;
// In: Pointer to a buffer to store PTEs.
// Out: The interface will fill the buffer with PTEs
NvU64 *pteBuffer;
@ -826,6 +833,7 @@ typedef struct UvmGpuFaultInfo_tag
// Preallocated stack for functions called from the UVM isr bottom half
void *isr_bh_sp;
} nonReplayable;
NvHandle faultBufferHandle;
} UvmGpuFaultInfo;

View File

@ -520,14 +520,23 @@ struct NvKmsKapiFunctionsTable {
);
/*!
* Revoke modeset permissions previously granted. This currently applies for all
* previous grant requests for this device.
* Revoke permissions previously granted. Only one (dispIndex, head,
* display) is currently supported.
*
* \param [in] device A device returned by allocateDevice().
*
* \param [in] head head of display.
*
* \param [in] display The display to revoke.
*
* \return NV_TRUE on success, NV_FALSE on failure.
*/
NvBool (*revokePermissions)(struct NvKmsKapiDevice *device);
NvBool (*revokePermissions)
(
struct NvKmsKapiDevice *device,
NvU32 head,
NvKmsKapiDisplay display
);
/*!
* Registers for notification, via

View File

@ -181,7 +181,6 @@ NV_STATUS NV_API_CALL os_put_page (NvU64 address);
NvU32 NV_API_CALL os_get_page_refcount (NvU64 address);
NvU32 NV_API_CALL os_count_tail_pages (NvU64 address);
void NV_API_CALL os_free_pages_phys (NvU64, NvU32);
NV_STATUS NV_API_CALL os_call_nv_vmbus (NvU32, void *);
NV_STATUS NV_API_CALL os_open_temporary_file (void **);
void NV_API_CALL os_close_file (void *);
NV_STATUS NV_API_CALL os_write_file (void *, NvU8 *, NvU64, NvU64);

View File

@ -74,6 +74,7 @@ NV_STATUS NV_API_CALL rm_gpu_ops_own_page_fault_intr(nvidia_stack_t *, nvgpuDevi
NV_STATUS NV_API_CALL rm_gpu_ops_init_fault_info(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuFaultInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_destroy_fault_info(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuFaultInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_get_non_replayable_faults(nvidia_stack_t *, nvgpuFaultInfo_t, void *, NvU32 *);
NV_STATUS NV_API_CALL rm_gpu_ops_flush_replayable_fault_buffer(nvidia_stack_t *, nvgpuDeviceHandle_t);
NV_STATUS NV_API_CALL rm_gpu_ops_has_pending_non_replayable_faults(nvidia_stack_t *, nvgpuFaultInfo_t, NvBool *);
NV_STATUS NV_API_CALL rm_gpu_ops_init_access_cntr_info(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuAccessCntrInfo_t);
NV_STATUS NV_API_CALL rm_gpu_ops_destroy_access_cntr_info(nvidia_stack_t *, nvgpuDeviceHandle_t, nvgpuAccessCntrInfo_t);

View File

@ -8,19 +8,11 @@ cd $SCRIPTDIR
CC="$1"
ARCH=$2
ISYSTEM=`$CC -print-file-name=include 2> /dev/null`
SOURCES=$3
HEADERS=$SOURCES/include
OUTPUT=$4
XEN_PRESENT=1
PREEMPT_RT_PRESENT=0
KERNEL_ARCH="$ARCH"
if [ "$ARCH" = "i386" -o "$ARCH" = "x86_64" ]; then
if [ -d "$SOURCES/arch/x86" ]; then
KERNEL_ARCH="x86"
fi
fi
# VGX_BUILD parameter defined only for VGX builds (vGPU Host driver)
# VGX_KVM_BUILD parameter defined only vGPU builds on KVM hypervisor
@ -69,16 +61,10 @@ test_header_presence() {
# NV_LINUX_FENCE_H_PRESENT, and that is either defined or undefined, in the
# output (which goes to stdout, just like the rest of this file).
# -MG or -MD can interfere with the use of -M and -M -MG for testing file
# existence; filter out any occurrences from CFLAGS. CFLAGS is intentionally
# wrapped with whitespace in the input to sed(1) so the regex can match zero
# or more occurrences of "-MD" or "-MG", surrounded by whitespace to avoid
# accidental matches with tokens that happen to contain either of those
# strings, without special handling of the beginning or the end of the line.
TEST_CFLAGS=`echo "-E -M $CFLAGS " | sed -e 's/\( -M[DG]\)* / /g'`
TEST_CFLAGS="-E -M $CFLAGS"
file="$1"
file_define=NV_`echo $file | tr '/.' '_' | tr '-' '_' | tr 'a-z' 'A-Z'`_PRESENT
file_define=NV_`echo $file | tr '/.\-a-z' '___A-Z'`_PRESENT
CODE="#include <$file>"
@ -99,6 +85,7 @@ test_header_presence() {
}
build_cflags() {
ISYSTEM=`$CC -print-file-name=include 2> /dev/null`
BASE_CFLAGS="-O2 -D__KERNEL__ \
-DKBUILD_BASENAME=\"#conftest$$\" -DKBUILD_MODNAME=\"#conftest$$\" \
-nostdinc -isystem $ISYSTEM \
@ -125,6 +112,14 @@ build_cflags() {
MACH_CFLAGS="-I$HEADERS/asm/mach-xen"
fi
KERNEL_ARCH="$ARCH"
if [ "$ARCH" = "i386" -o "$ARCH" = "x86_64" ]; then
if [ -d "$SOURCES/arch/x86" ]; then
KERNEL_ARCH="x86"
fi
fi
SOURCE_HEADERS="$HEADERS"
SOURCE_ARCH_HEADERS="$SOURCES/arch/$KERNEL_ARCH/include"
OUTPUT_HEADERS="$OUTPUT/include"
@ -764,24 +759,6 @@ compile_test() {
compile_check_conftest "$CODE" "NV_VFIO_INFO_ADD_CAPABILITY_HAS_CAP_TYPE_ID_ARGS" "" "types"
;;
vmbus_channel_has_ringbuffer_page)
#
# Check if ringbuffer_page field exist in vmbus_channel structure
#
# Changed in commit 52a42c2a90226dc61c99bbd0cb096deeb52c334b
# ("vmbus: keep pointer to ring buffer page") in v5.0 (2018-09-14)
#
CODE="
#include <linux/hyperv.h>
int conftest_vmbus_channel_has_ringbuffer_page(void) {
return offsetof(struct vmbus_channel, ringbuffer_page);
}"
compile_check_conftest "$CODE" "NV_VMBUS_CHANNEL_HAS_RING_BUFFER_PAGE" "" "types"
;;
nvidia_grid_build)
if [ -n "$GRID_BUILD" ]; then
echo "#define NV_GRID_BUILD" | append_conftest "generic"
@ -2691,6 +2668,24 @@ compile_test() {
fi
;;
enable_apicv)
#
# Determine if enable_apicv boolean is exported by kernel.
#
# Added by commit fdf513e37a3bd ("KVM: x86: Use common 'enable_apicv'
# variable for both APICv and AVIC")
#
CODE="
$CONFTEST_PREAMBLE
#include <asm/kvm_host.h>
bool is_enable_apicv_present() {
return enable_apicv;
}"
compile_check_conftest "$CODE" "NV_ENABLE_APICV_PRESENT" "" "types"
;;
pci_driver_has_driver_managed_dma)
#
# Determine if "struct pci_driver" has .driver_managed_dma member.
@ -2774,6 +2769,63 @@ compile_test() {
compile_check_conftest "$CODE" "NV_DRM_MASTER_DROP_HAS_FROM_RELEASE_ARG" "" "types"
;;
drm_master_has_leases)
#
# Determine if drm_master has 'leases', 'lessor', 'lessee_idr' fields.
# Also checks for struct drm_mode_revoke_lease.
#
# Added by commits 2ed077e467ee ("drm: Add drm_object lease infrastructure [v5]")
# and 62884cd386b8 ("drm: Add four ioctls for managing drm mode object leases [v7]")
# in v4.15 (2017-10-24)
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
#include <drm/drmP.h>
#endif
#if defined(NV_DRM_DRM_AUTH_H_PRESENT)
#include <drm/drm_auth.h>
#endif
#include <uapi/drm/drm_mode.h>
int conftest_drm_master_leases(void) {
return offsetof(struct drm_master, leases);
}
int conftest_drm_master_lessor(void) {
return offsetof(struct drm_master, lessor);
}
int conftest_drm_master_lessee_idr(void) {
return offsetof(struct drm_master, lessee_idr);
}
int conftest_drm_mode_revoke_lease(void) {
return offsetof(struct drm_mode_revoke_lease, lessee_id);
}"
compile_check_conftest "$CODE" "NV_DRM_MASTER_HAS_LEASES" "" "types"
;;
drm_file_get_master)
#
# Determine if function drm_file_get_master() is present.
#
# Added by commit 56f0729a510f ("drm: protect drm_master pointers in drm_lease.c")
# in v5.15 (2021-07-20)
#
CODE="
#if defined(NV_DRM_DRMP_H_PRESENT)
#include <drm/drmP.h>
#endif
#if defined(NV_DRM_DRM_AUTH_H_PRESENT)
#include <drm/drm_auth.h>
#endif
void conftest_drm_file_get_master(void) {
drm_file_get_master();
}"
compile_check_conftest "$CODE" "NV_DRM_FILE_GET_MASTER_PRESENT" "" "functions"
;;
drm_connector_lookup)
#
# Determine if function drm_connector_lookup() is present.
@ -3819,6 +3871,40 @@ compile_test() {
compile_check_conftest "$CODE" "NV_TEGRA_BPMP_SEND_RECEIVE" "" "functions"
;;
cmd_uphy_display_port_init)
#
# Determine if CMD_UPHY_DISPLAY_PORT_INIT enum present in bpmp-abi header
# This enum is used only in Tegra down-stream kernel.
#
CODE="
#include <stdint.h>
#include <soc/tegra/bpmp-abi.h>
int conftest_cmd_uphy_display_port_init(void) {
return CMD_UPHY_DISPLAY_PORT_INIT;
}
"
compile_check_conftest "$CODE" "NV_CMD_UPHY_DISPLAY_PORT_INIT_PRESENT" "" "generic"
;;
cmd_uphy_display_port_off)
#
# Determine if CMD_UPHY_DISPLAY_PORT_OFF enum present in bpmp-abi header
# This enum is used only in Tegra down-stream kernel.
#
CODE="
#include <stdint.h>
#include <soc/tegra/bpmp-abi.h>
int conftest_cmd_uphy_display_port_off(void) {
return CMD_UPHY_DISPLAY_PORT_OFF;
}
"
compile_check_conftest "$CODE" "NV_CMD_UPHY_DISPLAY_PORT_OFF_PRESENT" "" "generic"
;;
drm_alpha_blending_available)
#
# Determine if the DRM subsystem supports alpha blending
@ -4744,6 +4830,21 @@ compile_test() {
compile_check_conftest "$CODE" "NV_REMOVE_MEMORY_HAS_NID_ARG" "" "types"
;;
offline_and_remove_memory)
#
# Determine if the offline_and_remove_memory function is present.
#
# Added by commit 08b3acd7a68fc179 ("mm/memory_hotplug: Introduce
# offline_and_remove_memory()") in v5.8-rc1 (2020-06-05)
#
CODE="
#include <linux/memory_hotplug.h>
void conftest_offline_and_remove_memory() {
offline_and_remove_memory();
}"
compile_check_conftest "$CODE" "NV_OFFLINE_AND_REMOVE_MEMORY_PRESENT" "" "functions"
;;
device_property_read_u64)
#
@ -4825,6 +4926,38 @@ compile_test() {
fi
;;
of_property_read_variable_u32_array)
#
# Determine if of_property_read_variable_u32_array is present
#
# Added by commit 1df09bcof (" Move OF property and graph API from
# base.c to property.c"
#
# Test if linux/of.h header file inclusion is successful or not,
# depending on that, check for of_property_read_variable_u32_array
# presence
#
echo "$CONFTEST_PREAMBLE
#include <linux/of.h>
" > conftest$$.c
$CC $CFLAGS -c conftest$$.c > /dev/null 2>&1
rm -f conftest$$.c
if [ -f conftest$$.o ]; then
rm -f conftest$$.o
CODE="
#include <linux/of.h>
void conftest_of_property_read_variable_u32_array() {
of_property_read_variable_u32_array();
}"
compile_check_conftest "$CODE" "NV_OF_PROPERTY_READ_VARIABLE_U32_ARRAY_PRESENT" "" "functions"
else
echo "#undef NV_OF_PROPERTY_READ_VARIABLE_U32_ARRAY_PRESENT" | append_conftest "functions"
fi
;;
devm_of_platform_populate)
#
# Determine if devm_of_platform_populate() function is present
@ -5168,6 +5301,22 @@ compile_test() {
compile_check_conftest "$CODE" "NV_MAKE_DEVICE_EXCLUSIVE_RANGE_PRESENT" "" "functions"
;;
migrate_device_range)
#
# Determine if the migrate_device_range() function is present
#
# migrate_device_range() function was added by commit
# e778406b40dbb ("mm/migrate_device.c: add migrate_device_range()")
# in v6.1 (2022-09-28).
CODE="
#include <linux/migrate.h>
int conftest_migrate_device_range(void) {
migrate_device_range();
}"
compile_check_conftest "$CODE" "NV_MIGRATE_DEVICE_RANGE_PRESENT" "" "functions"
;;
ioasid_get)
#
# Determine if ioasid_get() function is present
@ -5186,6 +5335,25 @@ compile_test() {
compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
;;
mm_pasid_set)
#
# Determine if mm_pasid_set() function is present
#
# mm_pasid_set() function was added by commit
# 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
# PASID to mm on PASID allocation and free it on mm exit) in v5.18.
# (2022-02-15).
CODE="
#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
#include <linux/sched/mm.h>
#endif
void conftest_mm_pasid_set(void) {
mm_pasid_set();
}"
compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
;;
drm_crtc_state_has_no_vblank)
#
# Determine if the 'drm_crtc_state' structure has 'no_vblank'.
@ -5453,6 +5621,22 @@ compile_test() {
compile_check_conftest "$CODE" "NV_ACPI_VIDEO_BACKLIGHT_USE_NATIVE" "" "functions"
;;
pci_rebar_get_possible_sizes)
#
# Determine if the pci_rebar_get_possible_sizes() function is present.
#
# Added by commit 8fbdbb66f8c10 ("PCI: Add resizable BAR infrastructure
# ") in v5.12
#
CODE="
#include <linux/pci.h>
void conftest_pci_rebar_get_possible_sizes(void) {
pci_rebar_get_possible_sizes();
}"
compile_check_conftest "$CODE" "NV_PCI_REBAR_GET_POSSIBLE_SIZES_PRESENT" "" "functions"
;;
drm_connector_has_override_edid)
#
# Determine if 'struct drm_connector' has an 'override_edid' member.
@ -5475,6 +5659,27 @@ compile_test() {
compile_check_conftest "$CODE" "NV_DRM_CONNECTOR_HAS_OVERRIDE_EDID" "" "types"
;;
iommu_sva_bind_device_has_drvdata_arg)
#
# Check if iommu_sva_bind_device() has drvdata parameter.
#
# Removed by commit be51b1d6bbff48c7d1943a8ff1e5a55777807f6e
# ("iommu/sva: Refactoring iommu_sva_bind/unbind_device()")
# in v6.2 (2022-10-31)
#
CODE="
#include <linux/iommu.h>
#include <linux/mm_types.h>
#include <linux/device.h>
void conftest_iommu_sva_bind_device_has_drvdata_arg(struct device *dev,
struct mm_struct *mm,
void *drvdata) {
(void) iommu_sva_bind_device(dev, mm, drvdata);
}"
compile_check_conftest "$CODE" "NV_IOMMU_SVA_BIND_DEVICE_HAS_DRVDATA_ARG" "" "types"
;;
# When adding a new conftest entry, please use the correct format for
# specifying the relevant upstream Linux kernel commit.
#
@ -5743,10 +5948,6 @@ case "$5" in
for i in $*; do compile_test $i; done
for file in conftest*.d; do
rm -f $file > /dev/null 2>&1
done
exit 0
;;
@ -5882,10 +6083,6 @@ case "$5" in
test_header_presence "${7}"
for file in conftest*.d; do
rm -f $file > /dev/null 2>&1
done
exit $?
;;

View File

@ -27,6 +27,7 @@
#include "nvidia-drm-helper.h"
#include "nvidia-drm-priv.h"
#include "nvidia-drm-connector.h"
#include "nvidia-drm-crtc.h"
#include "nvidia-drm-utils.h"
#include "nvidia-drm-encoder.h"
@ -207,6 +208,11 @@ done:
nv_drm_free(pDetectParams);
if (status == connector_status_disconnected &&
nv_connector->modeset_permission_filep) {
nv_drm_connector_revoke_permissions(dev, nv_connector);
}
return status;
}
@ -372,6 +378,8 @@ nv_drm_connector_new(struct drm_device *dev,
nv_connector->physicalIndex = physicalIndex;
nv_connector->type = type;
nv_connector->internal = internal;
nv_connector->modeset_permission_filep = NULL;
nv_connector->modeset_permission_crtc = NULL;
strcpy(nv_connector->dpAddress, dpAddress);
@ -474,4 +482,26 @@ done:
return connector;
}
/*
* Revoke the permissions on this connector.
*/
bool nv_drm_connector_revoke_permissions(struct drm_device *dev,
struct nv_drm_connector* nv_connector)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
bool ret = true;
if (nv_connector->modeset_permission_crtc) {
if (nv_connector->nv_detected_encoder) {
ret = nvKms->revokePermissions(
nv_dev->pDevice, nv_connector->modeset_permission_crtc->head,
nv_connector->nv_detected_encoder->hDisplay);
}
nv_connector->modeset_permission_crtc->modeset_permission_filep = NULL;
nv_connector->modeset_permission_crtc = NULL;
}
nv_connector->modeset_permission_filep = NULL;
return ret;
}
#endif

View File

@ -51,6 +51,20 @@ struct nv_drm_connector {
atomic_t connection_status_dirty;
/**
* @modeset_permission_filep:
*
* The filep using this connector with DRM_IOCTL_NVIDIA_GRANT_PERMISSIONS.
*/
struct drm_file *modeset_permission_filep;
/**
* @modeset_permission_crtc:
*
* The crtc using this connector with DRM_IOCTL_NVIDIA_GRANT_PERMISSIONS.
*/
struct nv_drm_crtc *modeset_permission_crtc;
struct drm_connector base;
};
@ -84,6 +98,9 @@ nv_drm_get_connector(struct drm_device *dev,
NvBool internal,
char dpAddress[NVKMS_DP_ADDRESS_STRING_LENGTH]);
bool nv_drm_connector_revoke_permissions(struct drm_device *dev,
struct nv_drm_connector *nv_connector);
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
#endif /* __NVIDIA_DRM_CONNECTOR_H__ */

View File

@ -1181,6 +1181,7 @@ static struct drm_crtc *__nv_drm_crtc_create(struct nv_drm_device *nv_dev,
nv_crtc->head = head;
INIT_LIST_HEAD(&nv_crtc->flip_list);
spin_lock_init(&nv_crtc->flip_list_lock);
nv_crtc->modeset_permission_filep = NULL;
ret = drm_crtc_init_with_planes(nv_dev->dev,
&nv_crtc->base,
@ -1329,7 +1330,7 @@ int nv_drm_get_crtc_crc32_v2_ioctl(struct drm_device *dev,
return -ENOENT;
}
crtc = nv_drm_crtc_find(dev, params->crtc_id);
crtc = nv_drm_crtc_find(dev, filep, params->crtc_id);
if (!crtc) {
return -ENOENT;
}
@ -1357,7 +1358,7 @@ int nv_drm_get_crtc_crc32_ioctl(struct drm_device *dev,
return -ENOENT;
}
crtc = nv_drm_crtc_find(dev, params->crtc_id);
crtc = nv_drm_crtc_find(dev, filep, params->crtc_id);
if (!crtc) {
return -ENOENT;
}

View File

@ -35,38 +35,9 @@
#include <drm/drm_crtc.h>
#if defined(NV_DRM_ALPHA_BLENDING_AVAILABLE) || defined(NV_DRM_ROTATION_AVAILABLE)
/* For DRM_ROTATE_* , DRM_REFLECT_* */
#include <drm/drm_blend.h>
#endif
#if defined(NV_DRM_ROTATION_AVAILABLE)
/* For DRM_MODE_ROTATE_* and DRM_MODE_REFLECT_* */
#include <uapi/drm/drm_mode.h>
#endif
#include "nvtypes.h"
#include "nvkms-kapi.h"
#if defined(NV_DRM_ROTATION_AVAILABLE)
/*
* 19-05-2017 c2c446ad29437bb92b157423c632286608ebd3ec has added
* DRM_MODE_ROTATE_* and DRM_MODE_REFLECT_* to UAPI and removed
* DRM_ROTATE_* and DRM_MODE_REFLECT_*
*/
#if !defined(DRM_MODE_ROTATE_0)
#define DRM_MODE_ROTATE_0 DRM_ROTATE_0
#define DRM_MODE_ROTATE_90 DRM_ROTATE_90
#define DRM_MODE_ROTATE_180 DRM_ROTATE_180
#define DRM_MODE_ROTATE_270 DRM_ROTATE_270
#define DRM_MODE_REFLECT_X DRM_REFLECT_X
#define DRM_MODE_REFLECT_Y DRM_REFLECT_Y
#define DRM_MODE_ROTATE_MASK DRM_ROTATE_MASK
#define DRM_MODE_REFLECT_MASK DRM_REFLECT_MASK
#endif
#endif //NV_DRM_ROTATION_AVAILABLE
struct nv_drm_crtc {
NvU32 head;
@ -85,6 +56,13 @@ struct nv_drm_crtc {
*/
spinlock_t flip_list_lock;
/**
* @modeset_permission_filep:
*
* The filep using this crtc with DRM_IOCTL_NVIDIA_GRANT_PERMISSIONS.
*/
struct drm_file *modeset_permission_filep;
struct drm_crtc base;
};

View File

@ -30,7 +30,7 @@
#include "nvidia-drm-connector.h"
#include "nvidia-drm-gem.h"
#include "nvidia-drm-crtc.h"
#include "nvidia-drm-prime-fence.h"
#include "nvidia-drm-fence.h"
#include "nvidia-drm-helper.h"
#include "nvidia-drm-gem-nvkms-memory.h"
#include "nvidia-drm-gem-user-memory.h"
@ -706,6 +706,16 @@ static int nv_drm_get_dev_info_ioctl(struct drm_device *dev,
return 0;
}
static int nv_drm_dmabuf_supported_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
/* check the pDevice since this only gets set if modeset = 1
* which is a requirement for the dma_buf extension to work
*/
struct nv_drm_device *nv_dev = to_nv_device(dev);
return nv_dev->pDevice ? 0 : -EINVAL;
}
static
int nv_drm_get_client_capability_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
@ -735,6 +745,455 @@ int nv_drm_get_client_capability_ioctl(struct drm_device *dev,
return 0;
}
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
static bool nv_drm_connector_is_dpy_id(struct drm_connector *connector,
NvU32 dpyId)
{
struct nv_drm_connector *nv_connector = to_nv_connector(connector);
return nv_connector->nv_detected_encoder &&
nv_connector->nv_detected_encoder->hDisplay == dpyId;
}
static int nv_drm_get_dpy_id_for_connector_id_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct drm_nvidia_get_dpy_id_for_connector_id_params *params = data;
// Importantly, drm_connector_lookup (with filep) will only return the
// connector if we are master, a lessee with the connector, or not master at
// all. It will return NULL if we are a lessee with other connectors.
struct drm_connector *connector =
nv_drm_connector_lookup(dev, filep, params->connectorId);
struct nv_drm_connector *nv_connector;
int ret = 0;
if (!connector) {
return -EINVAL;
}
nv_connector = to_nv_connector(connector);
if (!nv_connector) {
ret = -EINVAL;
goto done;
}
if (!nv_connector->nv_detected_encoder) {
ret = -EINVAL;
goto done;
}
params->dpyId = nv_connector->nv_detected_encoder->hDisplay;
done:
nv_drm_connector_put(connector);
return ret;
}
static int nv_drm_get_connector_id_for_dpy_id_ioctl(struct drm_device *dev,
void *data,
struct drm_file *filep)
{
struct drm_nvidia_get_connector_id_for_dpy_id_params *params = data;
struct drm_connector *connector;
int ret = -EINVAL;
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
struct drm_connector_list_iter conn_iter;
nv_drm_connector_list_iter_begin(dev, &conn_iter);
#endif
/* Lookup for existing connector with same dpyId */
nv_drm_for_each_connector(connector, &conn_iter, dev) {
if (nv_drm_connector_is_dpy_id(connector, params->dpyId)) {
params->connectorId = connector->base.id;
ret = 0;
break;
}
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_end(&conn_iter);
#endif
return ret;
}
static NvU32 nv_drm_get_head_bit_from_connector(struct drm_connector *connector)
{
struct nv_drm_connector *nv_connector = to_nv_connector(connector);
if (connector->state && connector->state->crtc) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(connector->state->crtc);
return NVBIT(nv_crtc->head);
} else if (nv_connector->nv_detected_encoder &&
nv_connector->nv_detected_encoder->base.crtc) {
struct nv_drm_crtc *nv_crtc =
to_nv_crtc(nv_connector->nv_detected_encoder->base.crtc);
return NVBIT(nv_crtc->head);
}
return 0;
}
static int nv_drm_grant_permission_ioctl(struct drm_device *dev, void *data,
struct drm_file *filep)
{
struct drm_nvidia_grant_permissions_params *params = data;
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct nv_drm_connector *target_nv_connector = NULL;
struct nv_drm_crtc *target_nv_crtc = NULL;
struct drm_connector *connector, *target_connector = NULL;
struct drm_crtc *crtc;
NvU32 head = 0, freeHeadBits, targetHeadBit, possible_crtcs;
int ret = 0;
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
struct drm_connector_list_iter conn_iter;
#endif
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
#else
mutex_lock(&dev->mode_config.mutex);
#endif
/* Get the connector for the dpyId. */
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_begin(dev, &conn_iter);
#endif
nv_drm_for_each_connector(connector, &conn_iter, dev) {
if (nv_drm_connector_is_dpy_id(connector, params->dpyId)) {
target_connector =
nv_drm_connector_lookup(dev, filep, connector->base.id);
break;
}
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_end(&conn_iter);
#endif
// Importantly, drm_connector_lookup/drm_crtc_find (with filep) will only
// return the object if we are master, a lessee with the object, or not
// master at all. It will return NULL if we are a lessee with other objects.
if (!target_connector) {
ret = -EINVAL;
goto done;
}
target_nv_connector = to_nv_connector(target_connector);
possible_crtcs =
target_nv_connector->nv_detected_encoder->base.possible_crtcs;
/* Target connector must not be previously granted. */
if (target_nv_connector->modeset_permission_filep) {
ret = -EINVAL;
goto done;
}
/* Add all heads that are owned and not already granted. */
freeHeadBits = 0;
nv_drm_for_each_crtc(crtc, dev) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
if (nv_drm_crtc_find(dev, filep, crtc->base.id) &&
!nv_crtc->modeset_permission_filep &&
(drm_crtc_mask(crtc) & possible_crtcs)) {
freeHeadBits |= NVBIT(nv_crtc->head);
}
}
targetHeadBit = nv_drm_get_head_bit_from_connector(target_connector);
if (targetHeadBit & freeHeadBits) {
/* If a crtc is already being used by this connector, use it. */
freeHeadBits = targetHeadBit;
} else {
/* Otherwise, remove heads that are in use by other connectors. */
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_begin(dev, &conn_iter);
#endif
nv_drm_for_each_connector(connector, &conn_iter, dev) {
freeHeadBits &= ~nv_drm_get_head_bit_from_connector(connector);
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_end(&conn_iter);
#endif
}
/* Fail if no heads are available. */
if (!freeHeadBits) {
ret = -EINVAL;
goto done;
}
/*
* Loop through the crtc again and find a matching head.
* Record the filep that is using the crtc and the connector.
*/
nv_drm_for_each_crtc(crtc, dev) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
if (freeHeadBits & NVBIT(nv_crtc->head)) {
target_nv_crtc = nv_crtc;
head = nv_crtc->head;
break;
}
}
if (!nvKms->grantPermissions(params->fd, nv_dev->pDevice, head,
params->dpyId)) {
ret = -EINVAL;
goto done;
}
target_nv_connector->modeset_permission_crtc = target_nv_crtc;
target_nv_connector->modeset_permission_filep = filep;
target_nv_crtc->modeset_permission_filep = filep;
done:
if (target_connector) {
nv_drm_connector_put(target_connector);
}
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
mutex_unlock(&dev->mode_config.mutex);
#endif
return ret;
}
static bool nv_drm_revoke_connector(struct nv_drm_device *nv_dev,
struct nv_drm_connector *nv_connector)
{
bool ret = true;
if (nv_connector->modeset_permission_crtc) {
if (nv_connector->nv_detected_encoder) {
ret = nvKms->revokePermissions(
nv_dev->pDevice, nv_connector->modeset_permission_crtc->head,
nv_connector->nv_detected_encoder->hDisplay);
}
nv_connector->modeset_permission_crtc->modeset_permission_filep = NULL;
nv_connector->modeset_permission_crtc = NULL;
}
nv_connector->modeset_permission_filep = NULL;
return ret;
}
static int nv_drm_revoke_permission(struct drm_device *dev,
struct drm_file *filep, NvU32 dpyId)
{
struct drm_connector *connector;
struct drm_crtc *crtc;
int ret = 0;
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
struct drm_connector_list_iter conn_iter;
#endif
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
#else
mutex_lock(&dev->mode_config.mutex);
#endif
/*
* If dpyId is set, only revoke those specific resources. Otherwise,
* it is from closing the file so revoke all resources for that filep.
*/
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_begin(dev, &conn_iter);
#endif
nv_drm_for_each_connector(connector, &conn_iter, dev) {
struct nv_drm_connector *nv_connector = to_nv_connector(connector);
if (nv_connector->modeset_permission_filep == filep &&
(!dpyId || nv_drm_connector_is_dpy_id(connector, dpyId))) {
if (!nv_drm_connector_revoke_permissions(dev, nv_connector)) {
ret = -EINVAL;
// Continue trying to revoke as much as possible.
}
}
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_end(&conn_iter);
#endif
nv_drm_for_each_crtc(crtc, dev) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
if (nv_crtc->modeset_permission_filep == filep && !dpyId) {
nv_crtc->modeset_permission_filep = NULL;
}
}
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
mutex_unlock(&dev->mode_config.mutex);
#endif
return ret;
}
static int nv_drm_revoke_permission_ioctl(struct drm_device *dev, void *data,
struct drm_file *filep)
{
struct drm_nvidia_revoke_permissions_params *params = data;
if (!params->dpyId) {
return -EINVAL;
}
return nv_drm_revoke_permission(dev, filep, params->dpyId);
}
static void nv_drm_postclose(struct drm_device *dev, struct drm_file *filep)
{
/*
* Some systems like android can reach here without initializing the
* device, so check for that.
*/
if (dev->mode_config.num_crtc > 0 &&
dev->mode_config.crtc_list.next != NULL &&
dev->mode_config.crtc_list.prev != NULL &&
dev->mode_config.num_connector > 0 &&
dev->mode_config.connector_list.next != NULL &&
dev->mode_config.connector_list.prev != NULL) {
nv_drm_revoke_permission(dev, filep, 0);
}
}
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
#if defined(NV_DRM_MASTER_HAS_LEASES)
static struct drm_master *nv_drm_find_lessee(struct drm_master *master,
int lessee_id)
{
int object;
void *entry;
while (master->lessor != NULL) {
master = master->lessor;
}
idr_for_each_entry(&master->lessee_idr, entry, object)
{
if (object == lessee_id) {
return entry;
}
}
return NULL;
}
static void nv_drm_get_revoked_objects(struct drm_device *dev,
struct drm_file *filep, unsigned int cmd,
unsigned long arg, int **objects,
int *objects_count)
{
unsigned int ioc_size;
struct drm_mode_revoke_lease revoke_lease;
struct drm_master *lessor, *lessee;
void *entry;
int *objs;
int obj, obj_count, obj_i;
ioc_size = _IOC_SIZE(cmd);
if (ioc_size > sizeof(revoke_lease)) {
return;
}
if (copy_from_user(&revoke_lease, (void __user *)arg, ioc_size) != 0) {
return;
}
lessor = nv_drm_file_get_master(filep);
if (lessor == NULL) {
return;
}
mutex_lock(&dev->mode_config.idr_mutex);
lessee = nv_drm_find_lessee(lessor, revoke_lease.lessee_id);
if (lessee == NULL) {
goto done;
}
obj_count = 0;
idr_for_each_entry(&lessee->leases, entry, obj) {
++obj_count;
}
if (obj_count == 0) {
goto done;
}
objs = nv_drm_calloc(obj_count, sizeof(int));
if (objs == NULL) {
goto done;
}
obj_i = 0;
idr_for_each_entry(&lessee->leases, entry, obj) {
objs[obj_i++] = obj;
}
*objects = objs;
*objects_count = obj_count;
done:
mutex_unlock(&dev->mode_config.idr_mutex);
drm_master_put(&lessor);
}
static bool nv_drm_is_in_objects(int object, int *objects, int objects_count)
{
int i;
for (i = 0; i < objects_count; ++i) {
if (objects[i] == object) {
return true;
}
}
return false;
}
static void nv_drm_finish_revoking_objects(struct drm_device *dev,
struct drm_file *filep, int *objects,
int objects_count)
{
struct drm_connector *connector;
struct drm_crtc *crtc;
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
struct drm_connector_list_iter conn_iter;
#endif
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
int ret = 0;
struct drm_modeset_acquire_ctx ctx;
DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, DRM_MODESET_ACQUIRE_INTERRUPTIBLE,
ret);
#else
mutex_lock(&dev->mode_config.mutex);
#endif
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_begin(dev, &conn_iter);
#endif
nv_drm_for_each_connector(connector, &conn_iter, dev) {
struct nv_drm_connector *nv_connector = to_nv_connector(connector);
if (nv_connector->modeset_permission_filep &&
nv_drm_is_in_objects(connector->base.id, objects, objects_count)) {
nv_drm_connector_revoke_permissions(dev, nv_connector);
}
}
#if defined(NV_DRM_CONNECTOR_LIST_ITER_PRESENT)
nv_drm_connector_list_iter_end(&conn_iter);
#endif
nv_drm_for_each_crtc(crtc, dev) {
struct nv_drm_crtc *nv_crtc = to_nv_crtc(crtc);
if (nv_crtc->modeset_permission_filep &&
nv_drm_is_in_objects(crtc->base.id, objects, objects_count)) {
nv_crtc->modeset_permission_filep = NULL;
}
}
#if NV_DRM_MODESET_LOCK_ALL_END_ARGUMENT_COUNT == 3
DRM_MODESET_LOCK_ALL_END(dev, ctx, ret);
#else
mutex_unlock(&dev->mode_config.mutex);
#endif
}
#endif /* NV_DRM_MASTER_HAS_LEASES */
#if defined(NV_DRM_BUS_PRESENT)
#if defined(NV_DRM_BUS_HAS_GET_IRQ)
@ -766,12 +1225,50 @@ static struct drm_bus nv_drm_bus = {
#endif /* NV_DRM_BUS_PRESENT */
/*
* Wrapper around drm_ioctl to hook in to upstream ioctl.
*
* Currently used to add additional handling to REVOKE_LEASE.
*/
static long nv_drm_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
long retcode;
#if defined(NV_DRM_MASTER_HAS_LEASES)
struct drm_file *file_priv = filp->private_data;
struct drm_device *dev = file_priv->minor->dev;
int *objects = NULL;
int objects_count = 0;
if (cmd == DRM_IOCTL_MODE_REVOKE_LEASE) {
// Save the revoked objects before revoking.
nv_drm_get_revoked_objects(dev, file_priv, cmd, arg, &objects,
&objects_count);
}
#endif
retcode = drm_ioctl(filp, cmd, arg);
#if defined(NV_DRM_MASTER_HAS_LEASES)
if (cmd == DRM_IOCTL_MODE_REVOKE_LEASE && objects) {
if (retcode == 0) {
// If revoking was successful, finish revoking the objects.
nv_drm_finish_revoking_objects(dev, file_priv, objects,
objects_count);
}
nv_drm_free(objects);
}
#endif
return retcode;
}
static const struct file_operations nv_drm_fops = {
.owner = THIS_MODULE,
.open = drm_open,
.release = drm_release,
.unlocked_ioctl = drm_ioctl,
.unlocked_ioctl = nv_drm_ioctl,
#if defined(CONFIG_COMPAT)
.compat_ioctl = drm_compat_ioctl,
#endif
@ -807,11 +1304,11 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_FENCE_SUPPORTED,
nv_drm_fence_supported_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_FENCE_CONTEXT_CREATE,
nv_drm_fence_context_create_ioctl,
DRM_IOCTL_DEF_DRV(NVIDIA_PRIME_FENCE_CONTEXT_CREATE,
nv_drm_prime_fence_context_create_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GEM_FENCE_ATTACH,
nv_drm_gem_fence_attach_ioctl,
DRM_IOCTL_DEF_DRV(NVIDIA_GEM_PRIME_FENCE_ATTACH,
nv_drm_gem_prime_fence_attach_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
#endif
@ -837,6 +1334,21 @@ static const struct drm_ioctl_desc nv_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(NVIDIA_GEM_IDENTIFY_OBJECT,
nv_drm_gem_identify_object_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_DMABUF_SUPPORTED,
nv_drm_dmabuf_supported_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GET_DPY_ID_FOR_CONNECTOR_ID,
nv_drm_get_dpy_id_for_connector_id_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GET_CONNECTOR_ID_FOR_DPY_ID,
nv_drm_get_connector_id_for_dpy_id_ioctl,
DRM_RENDER_ALLOW|DRM_UNLOCKED),
DRM_IOCTL_DEF_DRV(NVIDIA_GRANT_PERMISSIONS,
nv_drm_grant_permission_ioctl,
DRM_UNLOCKED|DRM_MASTER),
DRM_IOCTL_DEF_DRV(NVIDIA_REVOKE_PERMISSIONS,
nv_drm_revoke_permission_ioctl,
DRM_UNLOCKED|DRM_MASTER),
#endif /* NV_DRM_ATOMIC_MODESET_AVAILABLE */
};
@ -879,6 +1391,9 @@ static struct drm_driver nv_drm_driver = {
.load = nv_drm_load,
.unload = nv_drm_unload,
#if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE)
.postclose = nv_drm_postclose,
#endif
.fops = &nv_drm_fops,

View File

@ -31,17 +31,28 @@
#include "nvidia-drm-priv.h"
#include "nvidia-drm-ioctl.h"
#include "nvidia-drm-gem.h"
#include "nvidia-drm-prime-fence.h"
#include "nvidia-drm-fence.h"
#include "nvidia-dma-resv-helper.h"
#if defined(NV_DRM_FENCE_AVAILABLE)
#include "nvidia-dma-fence-helper.h"
struct nv_drm_fence_context {
struct nv_drm_device *nv_dev;
struct nv_drm_fence_context;
struct nv_drm_fence_context_ops {
void (*destroy)(struct nv_drm_fence_context *nv_fence_context);
};
struct nv_drm_fence_context {
const struct nv_drm_fence_context_ops *ops;
struct nv_drm_device *nv_dev;
uint32_t context;
};
struct nv_drm_prime_fence_context {
struct nv_drm_fence_context base;
NvU64 fenceSemIndex; /* Index into semaphore surface */
@ -53,10 +64,10 @@ struct nv_drm_fence_context {
spinlock_t lock;
/*
* Software signaling structures. __nv_drm_fence_context_new()
* allocates channel event and __nv_drm_fence_context_destroy() frees it.
* There are no simultaneous read/write access to 'cb', therefore it does
* not require spin-lock protection.
* Software signaling structures. __nv_drm_prime_fence_context_new()
* allocates channel event and __nv_drm_prime_fence_context_destroy() frees
* it. There are no simultaneous read/write access to 'cb', therefore it
* does not require spin-lock protection.
*/
struct NvKmsKapiChannelEvent *cb;
@ -79,7 +90,7 @@ struct nv_drm_prime_fence *to_nv_drm_prime_fence(nv_dma_fence_t *fence)
}
static const char*
nv_drm_gem_prime_fence_op_get_driver_name(nv_dma_fence_t *fence)
nv_drm_gem_fence_op_get_driver_name(nv_dma_fence_t *fence)
{
return "NVIDIA";
}
@ -122,7 +133,7 @@ nv_drm_gem_prime_fence_op_wait(nv_dma_fence_t *fence,
}
static const nv_dma_fence_ops_t nv_drm_gem_prime_fence_ops = {
.get_driver_name = nv_drm_gem_prime_fence_op_get_driver_name,
.get_driver_name = nv_drm_gem_fence_op_get_driver_name,
.get_timeline_name = nv_drm_gem_prime_fence_op_get_timeline_name,
.enable_signaling = nv_drm_gem_prime_fence_op_enable_signaling,
.release = nv_drm_gem_prime_fence_op_release,
@ -138,7 +149,7 @@ __nv_drm_prime_fence_signal(struct nv_drm_prime_fence *nv_fence)
}
static void nv_drm_gem_prime_force_fence_signal(
struct nv_drm_fence_context *nv_fence_context)
struct nv_drm_prime_fence_context *nv_fence_context)
{
WARN_ON(!spin_is_locked(&nv_fence_context->lock));
@ -158,7 +169,7 @@ static void nv_drm_gem_prime_fence_event
NvU32 dataU32
)
{
struct nv_drm_fence_context *nv_fence_context = dataPtr;
struct nv_drm_prime_fence_context *nv_fence_context = dataPtr;
spin_lock(&nv_fence_context->lock);
@ -187,11 +198,53 @@ static void nv_drm_gem_prime_fence_event
spin_unlock(&nv_fence_context->lock);
}
static inline struct nv_drm_fence_context *__nv_drm_fence_context_new(
struct nv_drm_device *nv_dev,
struct drm_nvidia_fence_context_create_params *p)
static inline struct nv_drm_prime_fence_context*
to_prime_fence_context(struct nv_drm_fence_context *nv_fence_context) {
return (struct nv_drm_prime_fence_context *)nv_fence_context;
}
static void __nv_drm_prime_fence_context_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_fence_context *nv_fence_context;
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
struct nv_drm_prime_fence_context *nv_prime_fence_context =
to_prime_fence_context(nv_fence_context);
/*
* Free channel event before destroying the fence context, otherwise event
* callback continue to get called.
*/
nvKms->freeChannelEvent(nv_dev->pDevice, nv_prime_fence_context->cb);
/* Force signal all pending fences and empty pending list */
spin_lock(&nv_prime_fence_context->lock);
nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context);
spin_unlock(&nv_prime_fence_context->lock);
/* Free nvkms resources */
nvKms->unmapMemory(nv_dev->pDevice,
nv_prime_fence_context->pSemSurface,
NVKMS_KAPI_MAPPING_TYPE_KERNEL,
(void *) nv_prime_fence_context->pLinearAddress);
nvKms->freeMemory(nv_dev->pDevice, nv_prime_fence_context->pSemSurface);
nv_drm_free(nv_fence_context);
}
static struct nv_drm_fence_context_ops nv_drm_prime_fence_context_ops = {
.destroy = __nv_drm_prime_fence_context_destroy,
};
static inline struct nv_drm_prime_fence_context *
__nv_drm_prime_fence_context_new(
struct nv_drm_device *nv_dev,
struct drm_nvidia_prime_fence_context_create_params *p)
{
struct nv_drm_prime_fence_context *nv_prime_fence_context;
struct NvKmsKapiMemory *pSemSurface;
NvU32 *pLinearAddress;
@ -225,9 +278,9 @@ static inline struct nv_drm_fence_context *__nv_drm_fence_context_new(
* event for it.
*/
if ((nv_fence_context = nv_drm_calloc(
if ((nv_prime_fence_context = nv_drm_calloc(
1,
sizeof(*nv_fence_context))) == NULL) {
sizeof(*nv_prime_fence_context))) == NULL) {
goto failed_alloc_fence_context;
}
@ -236,17 +289,18 @@ static inline struct nv_drm_fence_context *__nv_drm_fence_context_new(
* to check a return value.
*/
*nv_fence_context = (struct nv_drm_fence_context) {
.nv_dev = nv_dev,
.context = nv_dma_fence_context_alloc(1),
*nv_prime_fence_context = (struct nv_drm_prime_fence_context) {
.base.ops = &nv_drm_prime_fence_context_ops,
.base.nv_dev = nv_dev,
.base.context = nv_dma_fence_context_alloc(1),
.pSemSurface = pSemSurface,
.pLinearAddress = pLinearAddress,
.fenceSemIndex = p->index,
};
INIT_LIST_HEAD(&nv_fence_context->pending);
INIT_LIST_HEAD(&nv_prime_fence_context->pending);
spin_lock_init(&nv_fence_context->lock);
spin_lock_init(&nv_prime_fence_context->lock);
/*
* Except 'cb', the fence context should be completely initialized
@ -256,22 +310,22 @@ static inline struct nv_drm_fence_context *__nv_drm_fence_context_new(
* There are no simultaneous read/write access to 'cb', therefore it does
* not require spin-lock protection.
*/
nv_fence_context->cb =
nv_prime_fence_context->cb =
nvKms->allocateChannelEvent(nv_dev->pDevice,
nv_drm_gem_prime_fence_event,
nv_fence_context,
nv_prime_fence_context,
p->event_nvkms_params_ptr,
p->event_nvkms_params_size);
if (!nv_fence_context->cb) {
if (!nv_prime_fence_context->cb) {
NV_DRM_DEV_LOG_ERR(nv_dev,
"Failed to allocate fence signaling event");
goto failed_to_allocate_channel_event;
}
return nv_fence_context;
return nv_prime_fence_context;
failed_to_allocate_channel_event:
nv_drm_free(nv_fence_context);
nv_drm_free(nv_prime_fence_context);
failed_alloc_fence_context:
@ -287,38 +341,8 @@ failed:
return NULL;
}
static void __nv_drm_fence_context_destroy(
struct nv_drm_fence_context *nv_fence_context)
{
struct nv_drm_device *nv_dev = nv_fence_context->nv_dev;
/*
* Free channel event before destroying the fence context, otherwise event
* callback continue to get called.
*/
nvKms->freeChannelEvent(nv_dev->pDevice, nv_fence_context->cb);
/* Force signal all pending fences and empty pending list */
spin_lock(&nv_fence_context->lock);
nv_drm_gem_prime_force_fence_signal(nv_fence_context);
spin_unlock(&nv_fence_context->lock);
/* Free nvkms resources */
nvKms->unmapMemory(nv_dev->pDevice,
nv_fence_context->pSemSurface,
NVKMS_KAPI_MAPPING_TYPE_KERNEL,
(void *) nv_fence_context->pLinearAddress);
nvKms->freeMemory(nv_dev->pDevice, nv_fence_context->pSemSurface);
nv_drm_free(nv_fence_context);
}
static nv_dma_fence_t *__nv_drm_fence_context_create_fence(
struct nv_drm_fence_context *nv_fence_context,
static nv_dma_fence_t *__nv_drm_prime_fence_context_create_fence(
struct nv_drm_prime_fence_context *nv_prime_fence_context,
unsigned int seqno)
{
struct nv_drm_prime_fence *nv_fence;
@ -329,14 +353,14 @@ static nv_dma_fence_t *__nv_drm_fence_context_create_fence(
goto out;
}
spin_lock(&nv_fence_context->lock);
spin_lock(&nv_prime_fence_context->lock);
/*
* If seqno wrapped, force signal fences to make sure none of them
* get stuck.
*/
if (seqno < nv_fence_context->last_seqno) {
nv_drm_gem_prime_force_fence_signal(nv_fence_context);
if (seqno < nv_prime_fence_context->last_seqno) {
nv_drm_gem_prime_force_fence_signal(nv_prime_fence_context);
}
INIT_LIST_HEAD(&nv_fence->list_entry);
@ -344,17 +368,17 @@ static nv_dma_fence_t *__nv_drm_fence_context_create_fence(
spin_lock_init(&nv_fence->lock);
nv_dma_fence_init(&nv_fence->base, &nv_drm_gem_prime_fence_ops,
&nv_fence->lock, nv_fence_context->context,
&nv_fence->lock, nv_prime_fence_context->base.context,
seqno);
/* The context maintains a reference to any pending fences. */
nv_dma_fence_get(&nv_fence->base);
list_add_tail(&nv_fence->list_entry, &nv_fence_context->pending);
list_add_tail(&nv_fence->list_entry, &nv_prime_fence_context->pending);
nv_fence_context->last_seqno = seqno;
nv_prime_fence_context->last_seqno = seqno;
spin_unlock(&nv_fence_context->lock);
spin_unlock(&nv_prime_fence_context->lock);
out:
return ret != 0 ? ERR_PTR(ret) : &nv_fence->base;
@ -388,12 +412,15 @@ static inline struct nv_drm_gem_fence_context *to_gem_fence_context(
* because the teardown sequence flushes all existing
* worker threads.
*/
static void __nv_drm_gem_fence_context_free(struct nv_drm_gem_object *nv_gem)
static void
__nv_drm_gem_fence_context_free(struct nv_drm_gem_object *nv_gem)
{
struct nv_drm_gem_fence_context *nv_gem_fence_context =
to_gem_fence_context(nv_gem);
struct nv_drm_fence_context *nv_fence_context =
nv_gem_fence_context->nv_fence_context;
__nv_drm_fence_context_destroy(nv_gem_fence_context->nv_fence_context);
nv_fence_context->ops->destroy(nv_fence_context);
nv_drm_free(nv_gem_fence_context);
}
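To make the refactor above easier to follow, here is a minimal sketch of the ops-table pattern this change introduces; the names are illustrative placeholders, not the driver's, and this is not part of the patch itself. The GEM wrapper destroys any fence-context flavor through a destroy callback instead of calling one concrete destructor.

struct example_fence_context;

struct example_fence_context_ops {
    void (*destroy)(struct example_fence_context *ctx);
};

struct example_fence_context {
    const struct example_fence_context_ops *ops;
};

static inline void example_fence_context_free(struct example_fence_context *ctx)
{
    ctx->ops->destroy(ctx);   /* dispatch to the concrete implementation */
}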
@ -403,7 +430,8 @@ const struct nv_drm_gem_object_funcs nv_gem_fence_context_ops = {
};
static inline
struct nv_drm_gem_fence_context *__nv_drm_gem_object_fence_context_lookup(
struct nv_drm_gem_fence_context *
__nv_drm_gem_object_fence_context_lookup(
struct drm_device *dev,
struct drm_file *filp,
u32 handle)
@ -419,11 +447,13 @@ struct nv_drm_gem_fence_context *__nv_drm_gem_object_fence_context_lookup(
return to_gem_fence_context(nv_gem);
}
int nv_drm_fence_context_create_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
static int
__nv_drm_gem_fence_context_create(struct drm_device *dev,
struct nv_drm_fence_context *nv_fence_context,
u32 *handle,
struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_fence_context_create_params *p = data;
struct nv_drm_gem_fence_context *nv_gem_fence_context = NULL;
if ((nv_gem_fence_context = nv_drm_calloc(
@ -432,10 +462,7 @@ int nv_drm_fence_context_create_ioctl(struct drm_device *dev,
goto done;
}
if ((nv_gem_fence_context->nv_fence_context =
__nv_drm_fence_context_new(nv_dev, p)) == NULL) {
goto fence_context_new_failed;
}
nv_gem_fence_context->nv_fence_context = nv_fence_context;
nv_drm_gem_object_init(nv_dev,
&nv_gem_fence_context->base,
@ -445,21 +472,45 @@ int nv_drm_fence_context_create_ioctl(struct drm_device *dev,
return nv_drm_gem_handle_create_drop_reference(filep,
&nv_gem_fence_context->base,
&p->handle);
fence_context_new_failed:
nv_drm_free(nv_gem_fence_context);
handle);
done:
return -ENOMEM;
}
int nv_drm_gem_fence_attach_ioctl(struct drm_device *dev,
int nv_drm_prime_fence_context_create_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_prime_fence_context_create_params *p = data;
struct nv_drm_prime_fence_context *nv_prime_fence_context =
__nv_drm_prime_fence_context_new(nv_dev, p);
int err;
if (!nv_prime_fence_context) {
goto done;
}
err = __nv_drm_gem_fence_context_create(dev,
&nv_prime_fence_context->base,
&p->handle,
filep);
if (err) {
__nv_drm_prime_fence_context_destroy(&nv_prime_fence_context->base);
}
return err;
done:
return -ENOMEM;
}
int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep)
{
int ret = -EINVAL;
struct nv_drm_device *nv_dev = to_nv_device(dev);
struct drm_nvidia_gem_fence_attach_params *p = data;
struct drm_nvidia_gem_prime_fence_attach_params *p = data;
struct nv_drm_gem_object *nv_gem;
struct nv_drm_gem_fence_context *nv_gem_fence_context;
@ -490,9 +541,22 @@ int nv_drm_gem_fence_attach_ioctl(struct drm_device *dev,
goto fence_context_lookup_failed;
}
if (IS_ERR(fence = __nv_drm_fence_context_create_fence(
nv_gem_fence_context->nv_fence_context,
p->sem_thresh))) {
if (nv_gem_fence_context->nv_fence_context->ops !=
&nv_drm_prime_fence_context_ops) {
NV_DRM_DEV_LOG_ERR(
nv_dev,
"Wrong fence context type: 0x%08x",
p->fence_context_handle);
goto fence_context_create_fence_failed;
}
fence = __nv_drm_prime_fence_context_create_fence(
to_prime_fence_context(nv_gem_fence_context->nv_fence_context),
p->sem_thresh);
if (IS_ERR(fence)) {
ret = PTR_ERR(fence);
NV_DRM_DEV_LOG_ERR(

View File

@ -35,10 +35,10 @@ struct drm_device;
int nv_drm_fence_supported_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep);
int nv_drm_fence_context_create_ioctl(struct drm_device *dev,
int nv_drm_prime_fence_context_create_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep);
int nv_drm_gem_fence_attach_ioctl(struct drm_device *dev,
int nv_drm_gem_prime_fence_attach_ioctl(struct drm_device *dev,
void *data, struct drm_file *filep);
#endif /* NV_DRM_FENCE_AVAILABLE */

View File

@ -26,7 +26,7 @@
#include "nvidia-drm-priv.h"
#include "nvidia-drm-ioctl.h"
#include "nvidia-drm-prime-fence.h"
#include "nvidia-drm-fence.h"
#include "nvidia-drm-gem.h"
#include "nvidia-drm-gem-nvkms-memory.h"
#include "nvidia-drm-gem-user-memory.h"

View File

@ -28,6 +28,8 @@
*/
#include "nvidia-drm-helper.h"
#include "nvidia-drm-priv.h"
#include "nvidia-drm-crtc.h"
#include "nvmisc.h"
@ -148,6 +150,18 @@ int nv_drm_atomic_helper_disable_all(struct drm_device *dev,
goto free;
}
#if defined(NV_DRM_ROTATION_AVAILABLE)
nv_drm_for_each_plane(plane, dev) {
plane_state = drm_atomic_get_plane_state(state, plane);
if (IS_ERR(plane_state)) {
ret = PTR_ERR(plane_state);
goto free;
}
plane_state->rotation = DRM_MODE_ROTATE_0;
}
#endif
nv_drm_for_each_connector_in_state(state, conn, conn_state, i) {
ret = drm_atomic_set_crtc_for_connector(conn_state, NULL);
if (ret < 0)

View File

@ -35,6 +35,35 @@
#include <drm/drm_drv.h>
#endif
#if defined(NV_DRM_ALPHA_BLENDING_AVAILABLE) || defined(NV_DRM_ROTATION_AVAILABLE)
/* For DRM_ROTATE_* , DRM_REFLECT_* */
#include <drm/drm_blend.h>
#endif
#if defined(NV_DRM_ROTATION_AVAILABLE)
/* For DRM_MODE_ROTATE_* and DRM_MODE_REFLECT_* */
#include <uapi/drm/drm_mode.h>
#endif
#if defined(NV_DRM_ROTATION_AVAILABLE)
/*
* 19-05-2017 c2c446ad29437bb92b157423c632286608ebd3ec has added
* DRM_MODE_ROTATE_* and DRM_MODE_REFLECT_* to UAPI and removed
* DRM_ROTATE_* and DRM_REFLECT_*
*/
#if !defined(DRM_MODE_ROTATE_0)
#define DRM_MODE_ROTATE_0 DRM_ROTATE_0
#define DRM_MODE_ROTATE_90 DRM_ROTATE_90
#define DRM_MODE_ROTATE_180 DRM_ROTATE_180
#define DRM_MODE_ROTATE_270 DRM_ROTATE_270
#define DRM_MODE_REFLECT_X DRM_REFLECT_X
#define DRM_MODE_REFLECT_Y DRM_REFLECT_Y
#define DRM_MODE_ROTATE_MASK DRM_ROTATE_MASK
#define DRM_MODE_REFLECT_MASK DRM_REFLECT_MASK
#endif
#endif //NV_DRM_ROTATION_AVAILABLE
/*
* drm_dev_put() is added by commit 9a96f55034e41b4e002b767e9218d55f03bdff7d
* (2017-09-26) and drm_dev_unref() is removed by
@ -277,11 +306,33 @@ int nv_drm_atomic_helper_disable_all(struct drm_device *dev,
for_each_plane_in_state(__state, plane, plane_state, __i)
#endif
static inline struct drm_crtc *nv_drm_crtc_find(struct drm_device *dev,
static inline struct drm_connector *
nv_drm_connector_lookup(struct drm_device *dev, struct drm_file *filep,
uint32_t id)
{
#if !defined(NV_DRM_CONNECTOR_LOOKUP_PRESENT)
return drm_connector_find(dev, id);
#elif defined(NV_DRM_MODE_OBJECT_FIND_HAS_FILE_PRIV_ARG)
return drm_connector_lookup(dev, filep, id);
#else
return drm_connector_lookup(dev, id);
#endif
}
static inline void nv_drm_connector_put(struct drm_connector *connector)
{
#if defined(NV_DRM_CONNECTOR_PUT_PRESENT)
drm_connector_put(connector);
#elif defined(NV_DRM_CONNECTOR_LOOKUP_PRESENT)
drm_connector_unreference(connector);
#endif
}
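A brief usage sketch of the two helpers above (a kernel-side fragment assuming only the definitions in this header, not part of the patch): on kernels that provide drm_connector_lookup(), the returned connector holds a reference, so every successful lookup must be balanced with nv_drm_connector_put() on all paths.

static int example_query_connector(struct drm_device *dev,
                                   struct drm_file *filep, uint32_t id)
{
    struct drm_connector *connector = nv_drm_connector_lookup(dev, filep, id);

    if (connector == NULL)
        return -EINVAL;

    /* ... inspect the connector here ... */

    nv_drm_connector_put(connector);   /* no-op on kernels without refcounted lookup */
    return 0;
}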
static inline struct drm_crtc *
nv_drm_crtc_find(struct drm_device *dev, struct drm_file *filep, uint32_t id)
{
#if defined(NV_DRM_MODE_OBJECT_FIND_HAS_FILE_PRIV_ARG)
return drm_crtc_find(dev, NULL /* file_priv */, id);
return drm_crtc_find(dev, filep, id);
#else
return drm_crtc_find(dev, id);
#endif
@ -297,6 +348,30 @@ static inline struct drm_encoder *nv_drm_encoder_find(struct drm_device *dev,
#endif
}
#if defined(NV_DRM_DRM_AUTH_H_PRESENT)
#include <drm/drm_auth.h>
#endif
#if defined(NV_DRM_DRM_FILE_H_PRESENT)
#include <drm/drm_file.h>
#endif
/*
* drm_file_get_master() added by commit 56f0729a510f ("drm: protect drm_master
* pointers in drm_lease.c") in v5.15 (2021-07-20)
*/
static inline struct drm_master *nv_drm_file_get_master(struct drm_file *filep)
{
#if defined(NV_DRM_FILE_GET_MASTER_PRESENT)
return drm_file_get_master(filep);
#else
if (filep->master) {
return drm_master_get(filep->master);
} else {
return NULL;
}
#endif
}
/*
* drm_connector_for_each_possible_encoder() is added by commit
* 83aefbb887b59df0b3520965c3701e01deacfc52 which was Signed-off-by:

View File

@ -34,8 +34,8 @@
#define DRM_NVIDIA_GEM_IMPORT_USERSPACE_MEMORY 0x02
#define DRM_NVIDIA_GET_DEV_INFO 0x03
#define DRM_NVIDIA_FENCE_SUPPORTED 0x04
#define DRM_NVIDIA_FENCE_CONTEXT_CREATE 0x05
#define DRM_NVIDIA_GEM_FENCE_ATTACH 0x06
#define DRM_NVIDIA_PRIME_FENCE_CONTEXT_CREATE 0x05
#define DRM_NVIDIA_GEM_PRIME_FENCE_ATTACH 0x06
#define DRM_NVIDIA_GET_CLIENT_CAPABILITY 0x08
#define DRM_NVIDIA_GEM_EXPORT_NVKMS_MEMORY 0x09
#define DRM_NVIDIA_GEM_MAP_OFFSET 0x0a
@ -43,6 +43,11 @@
#define DRM_NVIDIA_GET_CRTC_CRC32_V2 0x0c
#define DRM_NVIDIA_GEM_EXPORT_DMABUF_MEMORY 0x0d
#define DRM_NVIDIA_GEM_IDENTIFY_OBJECT 0x0e
#define DRM_NVIDIA_DMABUF_SUPPORTED 0x0f
#define DRM_NVIDIA_GET_DPY_ID_FOR_CONNECTOR_ID 0x10
#define DRM_NVIDIA_GET_CONNECTOR_ID_FOR_DPY_ID 0x11
#define DRM_NVIDIA_GRANT_PERMISSIONS 0x12
#define DRM_NVIDIA_REVOKE_PERMISSIONS 0x13
#define DRM_IOCTL_NVIDIA_GEM_IMPORT_NVKMS_MEMORY \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IMPORT_NVKMS_MEMORY), \
@ -65,17 +70,20 @@
#if defined(NV_LINUX)
#define DRM_IOCTL_NVIDIA_FENCE_SUPPORTED \
DRM_IO(DRM_COMMAND_BASE + DRM_NVIDIA_FENCE_SUPPORTED)
#define DRM_IOCTL_NVIDIA_DMABUF_SUPPORTED \
DRM_IO(DRM_COMMAND_BASE + DRM_NVIDIA_DMABUF_SUPPORTED)
#else
#define DRM_IOCTL_NVIDIA_FENCE_SUPPORTED 0
#define DRM_IOCTL_NVIDIA_DMABUF_SUPPORTED 0
#endif
#define DRM_IOCTL_NVIDIA_FENCE_CONTEXT_CREATE \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_FENCE_CONTEXT_CREATE), \
struct drm_nvidia_fence_context_create_params)
#define DRM_IOCTL_NVIDIA_PRIME_FENCE_CONTEXT_CREATE \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_PRIME_FENCE_CONTEXT_CREATE),\
struct drm_nvidia_prime_fence_context_create_params)
#define DRM_IOCTL_NVIDIA_GEM_FENCE_ATTACH \
DRM_IOW((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_FENCE_ATTACH), \
struct drm_nvidia_gem_fence_attach_params)
#define DRM_IOCTL_NVIDIA_GEM_PRIME_FENCE_ATTACH \
DRM_IOW((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_PRIME_FENCE_ATTACH), \
struct drm_nvidia_gem_prime_fence_attach_params)
#define DRM_IOCTL_NVIDIA_GET_CLIENT_CAPABILITY \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GET_CLIENT_CAPABILITY), \
@ -109,6 +117,22 @@
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GEM_IDENTIFY_OBJECT), \
struct drm_nvidia_gem_identify_object_params)
#define DRM_IOCTL_NVIDIA_GET_DPY_ID_FOR_CONNECTOR_ID \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GET_DPY_ID_FOR_CONNECTOR_ID),\
struct drm_nvidia_get_dpy_id_for_connector_id_params)
#define DRM_IOCTL_NVIDIA_GET_CONNECTOR_ID_FOR_DPY_ID \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GET_CONNECTOR_ID_FOR_DPY_ID),\
struct drm_nvidia_get_connector_id_for_dpy_id_params)
#define DRM_IOCTL_NVIDIA_GRANT_PERMISSIONS \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_GRANT_PERMISSIONS), \
struct drm_nvidia_grant_permissions_params)
#define DRM_IOCTL_NVIDIA_REVOKE_PERMISSIONS \
DRM_IOWR((DRM_COMMAND_BASE + DRM_NVIDIA_REVOKE_PERMISSIONS), \
struct drm_nvidia_revoke_permissions_params)
struct drm_nvidia_gem_import_nvkms_memory_params {
uint64_t mem_size; /* IN */
@ -136,7 +160,7 @@ struct drm_nvidia_get_dev_info_params {
uint32_t sector_layout; /* OUT */
};
struct drm_nvidia_fence_context_create_params {
struct drm_nvidia_prime_fence_context_create_params {
uint32_t handle; /* OUT GEM handle to fence context */
uint32_t index; /* IN Index of semaphore to use for fencing */
@ -151,7 +175,7 @@ struct drm_nvidia_fence_context_create_params {
uint64_t event_nvkms_params_size; /* IN */
};
struct drm_nvidia_gem_fence_attach_params {
struct drm_nvidia_gem_prime_fence_attach_params {
uint32_t handle; /* IN GEM handle to attach fence to */
uint32_t fence_context_handle; /* IN GEM handle to fence context on which fence is run on */
uint32_t sem_thresh; /* IN Semaphore value to reach before signal */
@ -232,4 +256,23 @@ struct drm_nvidia_gem_identify_object_params {
drm_nvidia_gem_object_type object_type; /* OUT GEM object type */
};
struct drm_nvidia_get_dpy_id_for_connector_id_params {
uint32_t connectorId; /* IN */
uint32_t dpyId; /* OUT */
};
struct drm_nvidia_get_connector_id_for_dpy_id_params {
uint32_t dpyId; /* IN */
uint32_t connectorId; /* OUT */
};
struct drm_nvidia_grant_permissions_params {
int32_t fd; /* IN */
uint32_t dpyId; /* IN */
};
struct drm_nvidia_revoke_permissions_params {
uint32_t dpyId; /* IN */
};
#endif /* _UAPI_NVIDIA_DRM_IOCTL_H_ */
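For context, a hypothetical user-space sketch exercising the new display-ID query ioctl. The device path, the connector ID value, and the assumption that this header pulls in its libdrm dependencies are all placeholders, not part of the change.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "nvidia-drm-ioctl.h"

int main(void)
{
    struct drm_nvidia_get_dpy_id_for_connector_id_params params = {
        .connectorId = 95,   /* placeholder DRM connector object ID */
    };
    int fd = open("/dev/dri/card0", O_RDWR);

    if (fd < 0)
        return 1;

    if (ioctl(fd, DRM_IOCTL_NVIDIA_GET_DPY_ID_FOR_CONNECTOR_ID, &params) == 0)
        printf("connector %u -> dpyId 0x%08x\n", params.connectorId, params.dpyId);

    close(fd);
    return 0;
}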

View File

@ -16,7 +16,7 @@ NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-connector.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-gem.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-fb.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-modeset.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-prime-fence.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-fence.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-linux.c
NVIDIA_DRM_SOURCES += nvidia-drm/nvidia-drm-helper.c
NVIDIA_DRM_SOURCES += nvidia-drm/nv-pci-table.c
@ -124,3 +124,8 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += dma_resv_add_fence
NV_CONFTEST_TYPE_COMPILE_TESTS += dma_resv_reserve_fences
NV_CONFTEST_TYPE_COMPILE_TESTS += reservation_object_reserve_shared_has_num_fences_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_has_override_edid
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_master_has_leases
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_file_get_master
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_modeset_lock_all_end
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_lookup
NV_CONFTEST_TYPE_COMPILE_TESTS += drm_connector_put

View File

@ -75,6 +75,15 @@ MODULE_PARM_DESC(malloc_verbose, "Report information about malloc calls on modul
static bool malloc_verbose = false;
module_param_named(malloc_verbose, malloc_verbose, bool, 0400);
/* This parameter is used to find the dpy override conf file */
#define NVKMS_CONF_FILE_SPECIFIED (nvkms_conf != NULL)
MODULE_PARM_DESC(config_file,
"Path to the nvidia-modeset configuration file "
"(default: disabled)");
static char *nvkms_conf = NULL;
module_param_named(config_file, nvkms_conf, charp, 0400);
static atomic_t nvkms_alloc_called_count;
NvBool nvkms_output_rounding_fix(void)
@ -1362,6 +1371,117 @@ static void nvkms_proc_exit(void)
#endif /* CONFIG_PROC_FS */
}
/*************************************************************************
* NVKMS Config File Read
************************************************************************/
static NvBool nvkms_fs_mounted(void)
{
return current->fs != NULL;
}
static size_t nvkms_config_file_open
(
char *fname,
char ** const buff
)
{
int i = 0;
struct file *file;
struct inode *file_inode;
size_t file_size = 0;
size_t read_size = 0;
#if defined(NV_KERNEL_READ_HAS_POINTER_POS_ARG)
loff_t pos = 0;
#endif
if (!nvkms_fs_mounted()) {
printk(KERN_ERR NVKMS_LOG_PREFIX "ERROR: Filesystems not mounted\n");
return 0;
}
file = filp_open(fname, O_RDONLY, 0);
if (file == NULL || IS_ERR(file)) {
printk(KERN_WARNING NVKMS_LOG_PREFIX "WARNING: Failed to open %s\n",
fname);
return 0;
}
file_inode = file->f_inode;
if (file_inode == NULL || IS_ERR(file_inode)) {
printk(KERN_WARNING NVKMS_LOG_PREFIX "WARNING: Inode is invalid\n");
goto done;
}
file_size = file_inode->i_size;
if (file_size > NVKMS_READ_FILE_MAX_SIZE) {
printk(KERN_WARNING NVKMS_LOG_PREFIX "WARNING: File exceeds maximum size\n");
goto done;
}
*buff = nvkms_alloc(file_size, NV_FALSE);
if (*buff == NULL) {
printk(KERN_WARNING NVKMS_LOG_PREFIX "WARNING: Out of memory\n");
goto done;
}
/*
* TODO: Once we have access to GPL symbols, this can be replaced with
* kernel_read_file for kernels >= 4.6
*/
while ((read_size < file_size) && (i++ < NVKMS_READ_FILE_MAX_LOOPS)) {
#if defined(NV_KERNEL_READ_HAS_POINTER_POS_ARG)
ssize_t ret = kernel_read(file, *buff + read_size,
file_size - read_size, &pos);
#else
ssize_t ret = kernel_read(file, read_size,
*buff + read_size,
file_size - read_size);
#endif
if (ret <= 0) {
break;
}
read_size += ret;
}
if (read_size != file_size) {
printk(KERN_WARNING NVKMS_LOG_PREFIX "WARNING: Failed to read %s\n",
fname);
goto done;
}
filp_close(file, current->files);
return file_size;
done:
nvkms_free(*buff, file_size);
filp_close(file, current->files);
return 0;
}
/* must be called with nvkms_lock locked */
static void nvkms_read_config_file_locked(void)
{
char *buffer = NULL;
size_t buf_size = 0;
/* only read the config file if the kernel parameter is set */
if (!NVKMS_CONF_FILE_SPECIFIED) {
return;
}
buf_size = nvkms_config_file_open(nvkms_conf, &buffer);
if (buf_size == 0) {
return;
}
if (nvKmsReadConf(buffer, buf_size, nvkms_config_file_open)) {
printk(KERN_INFO NVKMS_LOG_PREFIX "Successfully read %s\n",
nvkms_conf);
}
nvkms_free(buffer, buf_size);
}
/*************************************************************************
* NVKMS KAPI functions
************************************************************************/
@ -1533,10 +1653,12 @@ static int __init nvkms_init(void)
if (!nvKmsModuleLoad()) {
ret = -ENOMEM;
}
up(&nvkms_lock);
if (ret != 0) {
up(&nvkms_lock);
goto fail_module_load;
}
nvkms_read_config_file_locked();
up(&nvkms_lock);
nvkms_proc_init();

View File

@ -43,14 +43,9 @@ enum NvKmsSyncPtOp {
NVKMS_SYNCPT_OP_ALLOC,
NVKMS_SYNCPT_OP_GET,
NVKMS_SYNCPT_OP_PUT,
NVKMS_SYNCPT_OP_INCR_MAX,
NVKMS_SYNCPT_OP_CPU_INCR,
NVKMS_SYNCPT_OP_FD_TO_ID_AND_THRESH,
NVKMS_SYNCPT_OP_ID_AND_THRESH_TO_FD,
NVKMS_SYNCPT_OP_READ_MINVAL,
NVKMS_SYNCPT_OP_READ_MAXVAL,
NVKMS_SYNCPT_OP_SET_MIN_EQ_MAX,
NVKMS_SYNCPT_OP_SET_MAXVAL,
};
typedef struct {
@ -60,24 +55,10 @@ typedef struct {
NvU32 id; /* out */
} alloc;
struct {
NvU32 id; /* in */
} get;
struct {
NvU32 id; /* in */
} put;
struct {
NvU32 id; /* in */
NvU32 incr; /* in */
NvU32 value; /* out */
} incr_max;
struct {
NvU32 id; /* in */
} cpu_incr;
struct {
NvS32 fd; /* in */
NvU32 id; /* out */
@ -94,20 +75,6 @@ typedef struct {
NvU32 id; /* in */
NvU32 minval; /* out */
} read_minval;
struct {
NvU32 id; /* in */
NvU32 maxval; /* out */
} read_maxval;
struct {
NvU32 id; /* in */
} set_min_eq_max;
struct {
NvU32 id; /* in */
NvU32 val; /* in */
} set_maxval;
} NvKmsSyncPtOpParams;
NvBool nvkms_output_rounding_fix(void);

View File

@ -42,6 +42,20 @@ typedef void nvkms_procfs_proc_t(void *data,
char *buffer, size_t size,
nvkms_procfs_out_string_func_t *outString);
/* max number of loops to prevent hanging the kernel if an edge case is hit */
#define NVKMS_READ_FILE_MAX_LOOPS 1000
/* max size for any file read by the config system */
#define NVKMS_READ_FILE_MAX_SIZE 8192
/*
* The read file callback should allocate a buffer pointed to by *buff, fill it
* with the contents of fname, and return the size of the buffer. Buffer is not
* guaranteed to be null-terminated. The caller is responsible for freeing the
* buffer with nvkms_free, not nvFree.
*/
typedef size_t nvkms_config_read_file_func_t(char *fname,
char ** const buff);
typedef struct {
const char *name;
nvkms_procfs_proc_t *func;
@ -74,6 +88,9 @@ void nvKmsResume(NvU32 gpuId);
void nvKmsGetProcFiles(const nvkms_procfs_file_t **ppProcFiles);
NvBool nvKmsReadConf(const char *buff, size_t size,
nvkms_config_read_file_func_t readfile);
void nvKmsKapiHandleEventQueueChange
(
struct NvKmsKapiDevice *device

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -69,6 +69,9 @@ extern "C" {
#define NVC3B5_LAUNCH_DMA_FLUSH_ENABLE 2:2
#define NVC3B5_LAUNCH_DMA_FLUSH_ENABLE_FALSE (0x00000000)
#define NVC3B5_LAUNCH_DMA_FLUSH_ENABLE_TRUE (0x00000001)
#define NVC3B5_LAUNCH_DMA_FLUSH_TYPE 25:25
#define NVC3B5_LAUNCH_DMA_FLUSH_TYPE_SYS (0x00000000)
#define NVC3B5_LAUNCH_DMA_FLUSH_TYPE_GL (0x00000001)
#define NVC3B5_LAUNCH_DMA_SEMAPHORE_TYPE 4:3
#define NVC3B5_LAUNCH_DMA_SEMAPHORE_TYPE_NONE (0x00000000)
#define NVC3B5_LAUNCH_DMA_SEMAPHORE_TYPE_RELEASE_ONE_WORD_SEMAPHORE (0x00000001)

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2013-2021 NVIDIA Corporation
Copyright (c) 2013-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to

View File

@ -1,12 +1,6 @@
NVIDIA_UVM_SOURCES ?=
NVIDIA_UVM_SOURCES_CXX ?=
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_fault_buffer.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ada.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_common.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_linux.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_debug_optimized.c
@ -58,6 +52,7 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_pascal_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_pascal_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_pascal_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_pascal_fault_buffer.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_volta_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_volta_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_volta_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_volta.c
@ -72,6 +67,12 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ampere_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_fault_buffer.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_ce.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_host.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hopper_mmu.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_ada.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_policy.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_perf_utils.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_kvmalloc.c
@ -94,7 +95,6 @@ NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_test_rng.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_tree_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_range_allocator_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_gpu_semaphore_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_hmm_sanity_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_mem_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_rm_mem_test.c
NVIDIA_UVM_SOURCES += nvidia-uvm/uvm_page_tree_test.c

View File

@ -36,7 +36,7 @@ NVIDIA_UVM_KO = nvidia-uvm/nvidia-uvm.ko
#
ifeq ($(UVM_BUILD_TYPE),debug)
NVIDIA_UVM_CFLAGS += -DDEBUG $(call cc-option,-Og,-O0) -g
NVIDIA_UVM_CFLAGS += -DDEBUG -O1 -g
else
ifeq ($(UVM_BUILD_TYPE),develop)
# -DDEBUG is required, in order to allow pr_devel() print statements to
@ -81,8 +81,10 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
NV_CONFTEST_FUNCTION_COMPILE_TESTS += iommu_sva_bind_device_has_drvdata_arg
NV_CONFTEST_TYPE_COMPILE_TESTS += backing_dev_info
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_context_t
@ -100,6 +102,6 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_vma_added_flags
NV_CONFTEST_TYPE_COMPILE_TESTS += make_device_exclusive_range
NV_CONFTEST_TYPE_COMPILE_TESTS += migrate_device_range
NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_int_active_memcg

View File

@ -35,26 +35,95 @@
#include "uvm_linux_ioctl.h"
#include "uvm_hmm.h"
#include "uvm_mem.h"
#include "uvm_kvmalloc.h"
#define NVIDIA_UVM_DEVICE_NAME "nvidia-uvm"
static dev_t g_uvm_base_dev;
static struct cdev g_uvm_cdev;
static const struct file_operations uvm_fops;
static int uvm_open(struct inode *inode, struct file *filp)
bool uvm_file_is_nvidia_uvm(struct file *filp)
{
NV_STATUS status = uvm_global_get_status();
return (filp != NULL) && (filp->f_op == &uvm_fops);
}
if (status == NV_OK) {
if (!uvm_down_read_trylock(&g_uvm_global.pm.lock))
return -EAGAIN;
uvm_fd_type_t uvm_fd_type(struct file *filp, void **ptr_val)
{
unsigned long uptr;
uvm_fd_type_t type;
void *ptr;
status = uvm_va_space_create(inode, filp);
UVM_ASSERT(uvm_file_is_nvidia_uvm(filp));
uvm_up_read(&g_uvm_global.pm.lock);
uptr = atomic_long_read_acquire((atomic_long_t *) (&filp->private_data));
type = (uvm_fd_type_t)(uptr & UVM_FD_TYPE_MASK);
ptr = (void *)(uptr & ~UVM_FD_TYPE_MASK);
BUILD_BUG_ON(UVM_FD_COUNT > UVM_FD_TYPE_MASK + 1);
switch (type) {
case UVM_FD_UNINITIALIZED:
case UVM_FD_INITIALIZING:
UVM_ASSERT(!ptr);
break;
case UVM_FD_VA_SPACE:
UVM_ASSERT(ptr);
BUILD_BUG_ON(__alignof__(uvm_va_space_t) < (1UL << UVM_FD_TYPE_BITS));
break;
default:
UVM_ASSERT(0);
}
if (ptr_val)
*ptr_val = ptr;
return type;
}
// Called when opening /dev/nvidia-uvm. This code doesn't take any UVM locks, so
// there's no need to acquire g_uvm_global.pm.lock, but if that changes the PM
// lock will need to be taken.
static int uvm_open(struct inode *inode, struct file *filp)
{
struct address_space *mapping;
NV_STATUS status = uvm_global_get_status();
if (status != NV_OK)
return -nv_status_to_errno(status);
mapping = uvm_kvmalloc(sizeof(*mapping));
if (!mapping)
return -ENOMEM;
// By default all struct files on the same inode share the same
// address_space structure (the inode's) across all processes. This means
// unmap_mapping_range would unmap virtual mappings across all processes on
// that inode.
//
// Since the UVM driver uses the mapping offset as the VA of the file's
// process, we need to isolate the mappings to each process.
address_space_init_once(mapping);
mapping->host = inode;
// Some paths in the kernel, for example force_page_cache_readahead which
// can be invoked from user-space via madvise MADV_WILLNEED and fadvise
// POSIX_FADV_WILLNEED, check the function pointers within
// file->f_mapping->a_ops for validity. However, those paths assume that a_ops
// itself is always valid. Handle that by using the inode's a_ops pointer,
// which is what f_mapping->a_ops would point to anyway if we weren't re-
// assigning f_mapping.
mapping->a_ops = inode->i_mapping->a_ops;
#if defined(NV_ADDRESS_SPACE_HAS_BACKING_DEV_INFO)
mapping->backing_dev_info = inode->i_mapping->backing_dev_info;
#endif
filp->private_data = NULL;
filp->f_mapping = mapping;
return NV_OK;
}
static int uvm_open_entry(struct inode *inode, struct file *filp)
@ -80,9 +149,18 @@ static void uvm_release_deferred(void *data)
static int uvm_release(struct inode *inode, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_va_space_t *va_space;
uvm_fd_type_t fd_type;
int ret;
fd_type = uvm_fd_type(filp, (void **)&va_space);
UVM_ASSERT(fd_type != UVM_FD_INITIALIZING);
if (fd_type == UVM_FD_UNINITIALIZED) {
uvm_kvfree(filp->f_mapping);
return 0;
}
UVM_ASSERT(fd_type == UVM_FD_VA_SPACE);
filp->private_data = NULL;
filp->f_mapping = NULL;
@ -100,7 +178,7 @@ static int uvm_release(struct inode *inode, struct file *filp)
// been destroyed, and va_space->mapping won't be used again. Still,
// the va_space survives the inode if its destruction is deferred, in
// which case the references are rendered stale.
address_space_init_once(&va_space->mapping);
address_space_init_once(va_space->mapping);
nv_kthread_q_item_init(&va_space->deferred_release_q_item, uvm_release_deferred, va_space);
ret = nv_kthread_q_schedule_q_item(&g_uvm_global.deferred_release_q, &va_space->deferred_release_q_item);
@ -363,14 +441,12 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
static void uvm_vm_close_managed(struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
uvm_gpu_t *gpu;
uvm_processor_id_t gpu_id;
bool make_zombie = false;
if (current->mm != NULL)
uvm_record_lock_mmap_lock_write(current->mm);
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
// current->mm will be NULL on process teardown, in which case we have
// special handling.
if (current->mm == NULL) {
@ -400,13 +476,11 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
uvm_destroy_vma_managed(vma, make_zombie);
// Notify GPU address spaces that the fault buffer needs to be flushed to avoid finding stale entries
// that can be attributed to new VA ranges reallocated at the same address
for_each_va_space_gpu_in_mask(gpu, va_space, &va_space->registered_gpu_va_spaces) {
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
UVM_ASSERT(gpu_va_space);
gpu_va_space->needs_fault_buffer_flush = true;
// Notify GPU address spaces that the fault buffer needs to be flushed to
// avoid finding stale entries that can be attributed to new VA ranges
// reallocated at the same address.
for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
}
uvm_va_space_up_write(va_space);
@ -556,7 +630,7 @@ static struct vm_operations_struct uvm_vm_ops_semaphore_pool =
static int uvm_mmap(struct file *filp, struct vm_area_struct *vma)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_va_space_t *va_space;
uvm_va_range_t *va_range;
NV_STATUS status = uvm_global_get_status();
int ret = 0;
@ -565,8 +639,8 @@ static int uvm_mmap(struct file *filp, struct vm_area_struct *vma)
if (status != NV_OK)
return -nv_status_to_errno(status);
status = uvm_va_space_initialized(va_space);
if (status != NV_OK)
va_space = uvm_fd_va_space(filp);
if (!va_space)
return -EBADFD;
// When the VA space is associated with an mm, all vmas under the VA space
@ -618,7 +692,11 @@ static int uvm_mmap(struct file *filp, struct vm_area_struct *vma)
// Using VM_DONTCOPY would be nice, but madvise(MADV_DOFORK) can reset that
// so we have to handle vm_open on fork anyway. We could disable MADV_DOFORK
// with VM_IO, but that causes other mapping issues.
vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND;
// Make the default behavior be VM_DONTCOPY to avoid the performance impact
// of removing CPU mappings in the parent on fork()+exec(). Users can call
// madvise(MADV_DOFORK) if the child process requires access to the
// allocation.
vma->vm_flags |= VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTCOPY;
vma->vm_ops = &uvm_vm_ops_managed;
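As the comment above suggests, a process can opt back in to child access after this change. A hypothetical user-space sketch, where addr and size are placeholders for a UVM allocation:

#include <sys/mman.h>

int example_allow_child_access(void *addr, size_t size)
{
    /* Clears VM_DONTCOPY on the range so the mapping is copied on fork(). */
    return madvise(addr, size, MADV_DOFORK);
}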
@ -685,7 +763,53 @@ static int uvm_mmap_entry(struct file *filp, struct vm_area_struct *vma)
static NV_STATUS uvm_api_initialize(UVM_INITIALIZE_PARAMS *params, struct file *filp)
{
return uvm_va_space_initialize(uvm_va_space_get(filp), params->flags);
uvm_va_space_t *va_space;
NV_STATUS status;
uvm_fd_type_t old_fd_type;
// Normally we expect private_data == UVM_FD_UNINITIALIZED. However multiple
// threads may call this ioctl concurrently so we have to be careful to
// avoid initializing multiple va_spaces and/or leaking memory. To do this
// we do an atomic compare and swap. Only one thread will observe
// UVM_FD_UNINITIALIZED and that thread will allocate and setup the
// va_space.
//
// Other threads will either see UVM_FD_INITIALIZING or UVM_FD_VA_SPACE. In
// the case of UVM_FD_VA_SPACE we return success if and only if the
// initialization flags match. If another thread is still initializing the
// va_space we return NV_ERR_BUSY_RETRY.
//
// If va_space initialization fails we return the failure code and reset the
// FD state back to UVM_FD_UNINITIALIZED to allow another initialization
// attempt to be made. This is safe because other threads will have only had
// a chance to observe UVM_FD_INITIALIZING and not UVM_FD_VA_SPACE in this
// case.
old_fd_type = nv_atomic_long_cmpxchg((atomic_long_t *)&filp->private_data,
UVM_FD_UNINITIALIZED, UVM_FD_INITIALIZING);
old_fd_type &= UVM_FD_TYPE_MASK;
if (old_fd_type == UVM_FD_UNINITIALIZED) {
status = uvm_va_space_create(filp->f_mapping, &va_space, params->flags);
if (status != NV_OK) {
atomic_long_set_release((atomic_long_t *)&filp->private_data, UVM_FD_UNINITIALIZED);
return status;
}
atomic_long_set_release((atomic_long_t *)&filp->private_data, (long)va_space | UVM_FD_VA_SPACE);
}
else if (old_fd_type == UVM_FD_VA_SPACE) {
va_space = uvm_va_space_get(filp);
if (params->flags != va_space->initialization_flags)
status = NV_ERR_INVALID_ARGUMENT;
else
status = NV_OK;
}
else {
UVM_ASSERT(old_fd_type == UVM_FD_INITIALIZING);
status = NV_ERR_BUSY_RETRY;
}
return status;
}
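The compare-and-swap scheme described in the comment above can be reduced to a small stand-alone sketch (user-space C11 with placeholder names, not the driver's code): the low bits of one atomic word carry the FD state, and exactly one thread wins the UNINITIALIZED-to-INITIALIZING transition before publishing the object pointer.

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

enum {
    FD_UNINITIALIZED = 0,
    FD_INITIALIZING  = 1,
    FD_OBJECT        = 2,
    FD_TYPE_MASK     = 3,
};

static _Atomic uintptr_t fd_word;   /* stands in for filp->private_data */

int example_initialize(void)
{
    uintptr_t expected = FD_UNINITIALIZED;

    if (atomic_compare_exchange_strong(&fd_word, &expected, FD_INITIALIZING)) {
        void *obj = malloc(64);     /* stands in for the va_space */

        if (obj == NULL) {
            /* Roll back so a later caller can retry the initialization. */
            atomic_store(&fd_word, FD_UNINITIALIZED);
            return -1;
        }

        /* Publish the pointer and the type tag together. */
        atomic_store(&fd_word, (uintptr_t)obj | FD_OBJECT);
        return 0;
    }

    /* Another thread either finished (FD_OBJECT) or is still initializing. */
    return ((expected & FD_TYPE_MASK) == FD_INITIALIZING) ? -2 : 0;
}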
static NV_STATUS uvm_api_pageable_mem_access(UVM_PAGEABLE_MEM_ACCESS_PARAMS *params, struct file *filp)
@ -782,11 +906,6 @@ static const struct file_operations uvm_fops =
.owner = THIS_MODULE,
};
bool uvm_file_is_nvidia_uvm(struct file *filp)
{
return (filp != NULL) && (filp->f_op == &uvm_fops);
}
NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_BUFFER_PARAMS *params, struct file *filp)
{
long ret;

View File

@ -1746,17 +1746,20 @@ NV_STATUS UvmCreateExternalRange(void *base,
// GPUs. The external allocation can be unmapped from a specific GPU using
// UvmUnmapExternal or from all GPUs using UvmFree.
//
// The virtual address range specified by (base, length) must be aligned to the
// allocation's physical page size and must fall within a VA range previously
// created with UvmCreateExternalRange. A GPU VA space must have been registered
// for each GPU in the list. The offset in the physical allocation at which the
// allocation must be mapped should also be aligned to the allocation's physical
// page size. The (base, length) range must lie within the largest possible
// virtual address supported by the specified GPUs.
// The virtual address range specified by (base, length) must fall within a VA
// range previously created with UvmCreateExternalRange. A GPU VA space must
// have been registered for each GPU in the list. The (base, length) range must
// lie within the largest possible virtual address supported by the specified
// GPUs.
//
// The page size used for the mapping is the largest supported page size less
// than or equal to the alignments of base, length, offset, and the allocation
// page size.
//
// If the range specified by (base, length) falls within any existing mappings,
// the behavior is the same as if UvmUnmapExternal with the range specified by
// (base, length) had been called first.
// (base, length) had been called first, provided that base and length are
// aligned to the page size used for the existing mapping.
//
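// Worked illustration (not part of the original header): if base and length
// are 2MB-aligned, offset is only 64KB-aligned, and the allocation's
// physical page size is 2MB, the largest page size satisfying all four
// constraints is 64KB, so the mapping would use 64KB pages (assuming the
// GPU supports that page size).
//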
// If the allocation resides in GPU memory, that GPU must have been registered
// via UvmRegisterGpu. If the allocation resides in GPU memory and a mapping is
@ -1838,8 +1841,9 @@ NV_STATUS UvmCreateExternalRange(void *base,
// - The requested address range does not fall entirely within an
// existing external VA range created with a single call to
// UvmCreateExternalRange.
// - At least one of base and length is not aligned to the allocation's
// physical page size.
// - The mapping page size allowed by the alignments of base, length,
// and offset is smaller than the minimum supported page size on the
// GPU.
// - base or base + length fall within an existing mapping but are not
// aligned to that mapping's page size.
//
@ -1848,8 +1852,7 @@ NV_STATUS UvmCreateExternalRange(void *base,
// address supported by one or more of the specified GPUs.
//
// NV_ERR_INVALID_OFFSET:
// offset is not aligned to the allocation's physical page size or
// offset+length exceeds the allocation size.
// - offset+length exceeds the allocation size.
//
// NV_ERR_INVALID_DEVICE:
// One of the following occurred:
@ -3758,6 +3761,7 @@ NV_STATUS UvmToolsDisableCounters(UvmToolsCountersHandle counters,
// NV_ERR_INVALID_ARGUMENT:
// Read spans more than a single target process allocation.
//
//
//------------------------------------------------------------------------------
NV_STATUS UvmToolsReadProcessMemory(UvmToolsSessionHandle session,
void *buffer,

View File

@ -27,7 +27,7 @@
#include "clc7b5.h"
#include "clc56f.h" // Needed because HAL ce_init pushes SET_OBJECT
bool uvm_hal_ampere_ce_method_validate_c6b5(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
bool uvm_hal_ampere_ce_method_is_valid_c6b5(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
{
if (!uvm_channel_is_proxy(push->channel))
return true;
@ -112,7 +112,7 @@ NvU32 uvm_hal_ampere_ce_plc_mode_c7b5(void)
return HWCONST(C7B5, LAUNCH_DMA, DISABLE_PLC, TRUE);
}
bool uvm_hal_ampere_ce_memcopy_validate_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
bool uvm_hal_ampere_ce_memcopy_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
NvU64 push_begin_gpu_va;
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
@ -183,7 +183,7 @@ void uvm_hal_ampere_ce_memcopy_patch_src_c6b5(uvm_push_t *push, uvm_gpu_address_
src->address -= uvm_pushbuffer_get_gpu_va_for_push(push->channel->pool->manager->pushbuffer, push);
}
bool uvm_hal_ampere_ce_memset_validate_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);

View File

@ -29,7 +29,7 @@
#include "clc56f.h"
#include "clc076.h"
bool uvm_hal_ampere_host_method_validate(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
bool uvm_hal_ampere_host_method_is_valid(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
@ -82,7 +82,7 @@ bool uvm_hal_ampere_host_method_validate(uvm_push_t *push, NvU32 method_address,
return true;
}
bool uvm_hal_ampere_host_sw_method_validate(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
bool uvm_hal_ampere_host_sw_method_is_valid(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
{
if (!uvm_channel_is_proxy(push->channel))
return true;

View File

@ -25,6 +25,7 @@
#define __UVM_API_H__
#include "uvm_types.h"
#include "uvm_common.h"
#include "uvm_ioctl.h"
#include "uvm_linux.h"
#include "uvm_lock.h"
@ -51,8 +52,10 @@
\
params.rmStatus = uvm_global_get_status(); \
if (params.rmStatus == NV_OK) { \
if (do_init_check) \
params.rmStatus = uvm_va_space_initialized(uvm_va_space_get(filp)); \
if (do_init_check) { \
if (!uvm_fd_va_space(filp)) \
params.rmStatus = NV_ERR_ILLEGAL_ACTION; \
} \
if (likely(params.rmStatus == NV_OK)) \
params.rmStatus = function_name(&params, filp); \
} \
@ -88,8 +91,10 @@
\
params->rmStatus = uvm_global_get_status(); \
if (params->rmStatus == NV_OK) { \
if (do_init_check) \
params->rmStatus = uvm_va_space_initialized(uvm_va_space_get(filp)); \
if (do_init_check) { \
if (!uvm_fd_va_space(filp)) \
params->rmStatus = NV_ERR_ILLEGAL_ACTION; \
} \
if (likely(params->rmStatus == NV_OK)) \
params->rmStatus = function_name(params, filp); \
} \

View File

@ -24,6 +24,7 @@
#include "uvm_channel.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_kvmalloc.h"
#include "uvm_push.h"
#include "uvm_test.h"
#include "uvm_tracker.h"
@ -655,8 +656,10 @@ static NV_STATUS test_ce(uvm_va_space_t *va_space, bool skipTimestampTest)
TEST_NV_CHECK_RET(test_memcpy_and_memset(gpu));
TEST_NV_CHECK_RET(test_semaphore_reduction_inc(gpu));
TEST_NV_CHECK_RET(test_semaphore_release(gpu));
if (!skipTimestampTest)
TEST_NV_CHECK_RET(test_semaphore_timestamp(gpu));
}
return NV_OK;

View File

@ -77,7 +77,7 @@ static void channel_pool_lock_init(uvm_channel_pool_t *pool)
uvm_spin_lock_init(&pool->spinlock, UVM_LOCK_ORDER_CHANNEL);
}
void uvm_channel_pool_lock(uvm_channel_pool_t *pool)
static void channel_pool_lock(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_mutex_lock(&pool->mutex);
@ -85,7 +85,7 @@ void uvm_channel_pool_lock(uvm_channel_pool_t *pool)
uvm_spin_lock(&pool->spinlock);
}
void uvm_channel_pool_unlock(uvm_channel_pool_t *pool)
static void channel_pool_unlock(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_mutex_unlock(&pool->mutex);
@ -93,14 +93,6 @@ void uvm_channel_pool_unlock(uvm_channel_pool_t *pool)
uvm_spin_unlock(&pool->spinlock);
}
void uvm_channel_pool_assert_locked(uvm_channel_pool_t *pool)
{
if (uvm_channel_pool_is_proxy(pool))
uvm_assert_mutex_locked(&pool->mutex);
else
uvm_assert_spinlock_locked(&pool->spinlock);
}
// Update channel progress, completing up to max_to_complete entries
static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
NvU32 max_to_complete,
@ -113,12 +105,14 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
NvU64 completed_value = uvm_channel_update_completed_value(channel);
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
// Completed value should never exceed the queued value
UVM_ASSERT_MSG_RELEASE(completed_value <= channel->tracking_sem.queued_value,
"GPU %s channel %s unexpected completed_value 0x%llx > queued_value 0x%llx\n",
channel->pool->manager->gpu->parent->name, channel->name, completed_value,
channel->pool->manager->gpu->parent->name,
channel->name,
completed_value,
channel->tracking_sem.queued_value);
cpu_put = channel->cpu_put;
@ -141,7 +135,7 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
channel->gpu_get = gpu_get;
uvm_channel_pool_unlock(channel->pool);
channel_pool_unlock(channel->pool);
if (cpu_put >= gpu_get)
pending_gpfifos = cpu_put - gpu_get;
@ -154,7 +148,8 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
NvU32 uvm_channel_update_progress(uvm_channel_t *channel)
{
// By default, don't complete too many entries at a time to spread the cost
// of doing so across callers and avoid holding a spin lock for too long.
// of doing so across callers and avoid potentially holding a spin lock for
// too long.
return uvm_channel_update_progress_with_max(channel, 8, UVM_CHANNEL_UPDATE_MODE_COMPLETED);
}
@ -186,71 +181,96 @@ NvU32 uvm_channel_manager_update_progress(uvm_channel_manager_t *channel_manager
return pending_gpfifos;
}
static bool channel_is_available(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
static NvU32 channel_get_available_gpfifo_entries(uvm_channel_t *channel)
{
NvU32 pending_entries;
NvU32 available = channel->num_gpfifo_entries;
uvm_channel_pool_assert_locked(channel->pool);
if (channel->cpu_put >= channel->gpu_get)
pending_entries = channel->cpu_put - channel->gpu_get;
else
pending_entries = channel->cpu_put + channel->num_gpfifo_entries - channel->gpu_get;
// Remove sentinel entry
available -= 1;
return (pending_entries + channel->current_gpfifo_count + num_gpfifo_entries < channel->num_gpfifo_entries);
// Remove entries of ongoing pushes
available -= channel->current_gpfifo_count;
// Remove pending entries
if (channel->cpu_put >= channel->gpu_get)
available -= (channel->cpu_put - channel->gpu_get);
else
available -= (channel->cpu_put + channel->num_gpfifo_entries - channel->gpu_get);
UVM_ASSERT(available < channel->num_gpfifo_entries);
return available;
}
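A stand-alone restatement of the accounting above (plain C with placeholder names, offered only as a sketch): one sentinel slot always stays free, slots claimed by ongoing pushes are reserved, and entries queued but not yet retired by the GPU are pending.

#include <assert.h>

unsigned example_available_entries(unsigned num_entries, unsigned cpu_put,
                                   unsigned gpu_get, unsigned claimed)
{
    unsigned pending = (cpu_put >= gpu_get) ? (cpu_put - gpu_get)
                                            : (cpu_put + num_entries - gpu_get);
    unsigned available = num_entries - 1;   /* keep the sentinel entry free */

    available -= claimed;                   /* entries claimed by ongoing pushes */
    available -= pending;                   /* submitted but not yet retired */

    assert(available < num_entries);        /* mirrors the UVM_ASSERT above */
    return available;
}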
static bool try_claim_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
NvU32 uvm_channel_get_available_gpfifo_entries(uvm_channel_t *channel)
{
NvU32 available;
channel_pool_lock(channel->pool);
available = channel_get_available_gpfifo_entries(channel);
channel_pool_unlock(channel->pool);
return available;
}
static bool try_claim_channel_locked(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
bool claimed = false;
UVM_ASSERT(num_gpfifo_entries > 0);
UVM_ASSERT(num_gpfifo_entries < channel->num_gpfifo_entries);
uvm_channel_pool_lock(channel->pool);
uvm_channel_pool_assert_locked(channel->pool);
if (channel_is_available(channel, num_gpfifo_entries)) {
if (channel_get_available_gpfifo_entries(channel) >= num_gpfifo_entries) {
channel->current_gpfifo_count += num_gpfifo_entries;
claimed = true;
}
uvm_channel_pool_unlock(channel->pool);
return claimed;
}
static bool try_claim_channel(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
{
bool claimed;
channel_pool_lock(channel->pool);
claimed = try_claim_channel_locked(channel, num_gpfifo_entries);
channel_pool_unlock(channel->pool);
return claimed;
}
static void lock_push(uvm_channel_t *channel)
static void unlock_channel_for_push(uvm_channel_t *channel)
{
}
static void unlock_push(uvm_channel_t *channel)
static bool is_channel_locked_for_push(uvm_channel_t *channel)
{
}
static bool trylock_push(uvm_channel_t *channel)
{
// For CE and proxy channels, we always return that the channel is locked,
// which has no functional impact in the UVM channel code-flow; this is only
// used in UVM_ASSERTs.
return true;
}
// Reserve a channel in the specified pool
static NV_STATUS channel_reserve_in_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out)
// Reserve a channel in the specified CE pool
static NV_STATUS channel_reserve_in_ce_pool(uvm_channel_pool_t *pool, uvm_channel_t **channel_out)
{
uvm_channel_t *channel;
uvm_spin_loop_t spin;
UVM_ASSERT(pool);
UVM_ASSERT(uvm_channel_pool_is_ce(pool));
uvm_for_each_channel_in_pool(channel, pool) {
// TODO: Bug 1764953: Prefer idle/less busy channels
if (trylock_push(channel)) {
if (try_claim_channel(channel, 1)) {
*channel_out = channel;
return NV_OK;
}
else {
unlock_push(channel);
}
}
}
uvm_spin_loop_init(&spin);
@ -261,7 +281,6 @@ static NV_STATUS channel_reserve_in_pool(uvm_channel_pool_t *pool, uvm_channel_t
uvm_channel_update_progress(channel);
if (try_claim_channel(channel, 1)) {
lock_push(channel);
*channel_out = channel;
return NV_OK;
@ -281,9 +300,12 @@ static NV_STATUS channel_reserve_in_pool(uvm_channel_pool_t *pool, uvm_channel_t
NV_STATUS uvm_channel_reserve_type(uvm_channel_manager_t *manager, uvm_channel_type_t type, uvm_channel_t **channel_out)
{
uvm_channel_pool_t *pool = manager->pool_to_use.default_for_type[type];
UVM_ASSERT(pool != NULL);
UVM_ASSERT(type < UVM_CHANNEL_TYPE_COUNT);
return channel_reserve_in_pool(manager->pool_to_use.default_for_type[type], channel_out);
return channel_reserve_in_ce_pool(pool, channel_out);
}
NV_STATUS uvm_channel_reserve_gpu_to_gpu(uvm_channel_manager_t *manager,
@ -299,7 +321,7 @@ NV_STATUS uvm_channel_reserve_gpu_to_gpu(uvm_channel_manager_t *manager,
UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
return channel_reserve_in_pool(pool, channel_out);
return channel_reserve_in_ce_pool(pool, channel_out);
}
NV_STATUS uvm_channel_manager_wait(uvm_channel_manager_t *manager)
@ -323,14 +345,14 @@ static NvU32 channel_get_available_push_info_index(uvm_channel_t *channel)
{
uvm_push_info_t *push_info;
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
push_info = list_first_entry_or_null(&channel->available_push_infos, uvm_push_info_t, available_list_node);
UVM_ASSERT(push_info != NULL);
UVM_ASSERT(push_info->on_complete == NULL && push_info->on_complete_data == NULL);
list_del(&push_info->available_list_node);
uvm_channel_pool_unlock(channel->pool);
channel_pool_unlock(channel->pool);
return push_info - channel->push_infos;
}
@ -345,6 +367,8 @@ NV_STATUS uvm_channel_begin_push(uvm_channel_t *channel, uvm_push_t *push)
manager = channel->pool->manager;
UVM_ASSERT(is_channel_locked_for_push(channel));
status = uvm_pushbuffer_begin_push(manager->pushbuffer, push);
if (status != NV_OK)
return status;
@ -439,7 +463,7 @@ void uvm_channel_end_push(uvm_push_t *push)
NvU32 cpu_put;
NvU32 new_cpu_put;
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
new_tracking_value = ++channel->tracking_sem.queued_value;
new_payload = (NvU32)new_tracking_value;
@ -476,8 +500,8 @@ void uvm_channel_end_push(uvm_push_t *push)
// may notice the GPU work to be completed and hence all state tracking the
// push must be updated before that. Notably uvm_pushbuffer_end_push() has
// to be called first.
uvm_channel_pool_unlock(channel->pool);
unlock_push(channel);
unlock_channel_for_push(channel);
channel_pool_unlock(channel->pool);
// This memory barrier is borrowed from CUDA, as it supposedly fixes perf
// issues on some systems. Comment from CUDA: "fixes throughput-related
@ -500,7 +524,7 @@ static void write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_valu
NvU32 new_cpu_put;
uvm_gpu_t *gpu = channel->pool->manager->gpu;
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
cpu_put = channel->cpu_put;
new_cpu_put = (cpu_put + 1) % channel->num_gpfifo_entries;
@ -534,9 +558,10 @@ static void write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_entry_valu
// The moment the channel is unlocked uvm_channel_update_progress_with_max()
// may notice the GPU work to be completed and hence all state tracking the
// push must be updated before that.
uvm_channel_pool_unlock(channel->pool);
unlock_push(channel);
// push must be updated before that. Note that we do not call
// unlock_channel_for_push() because a control GPFIFO is followed by a
// semaphore release, where the channel is unlocked.
channel_pool_unlock(channel->pool);
// This memory barrier is borrowed from CUDA, as it supposedly fixes perf
// issues on some systems. Comment from CUDA: "fixes throughput-related
@ -593,7 +618,7 @@ NV_STATUS uvm_channel_reserve(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
uvm_spin_loop_t spin;
if (try_claim_channel(channel, num_gpfifo_entries))
goto out;
return NV_OK;
uvm_channel_update_progress(channel);
@ -604,10 +629,6 @@ NV_STATUS uvm_channel_reserve(uvm_channel_t *channel, NvU32 num_gpfifo_entries)
uvm_channel_update_progress(channel);
}
out:
if (status == NV_OK)
lock_push(channel);
return status;
}
@ -621,12 +642,12 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch
if (pending_count == 0)
return NULL;
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
if (channel->gpu_get != channel->cpu_put)
entry = &channel->gpfifo_entries[channel->gpu_get];
uvm_channel_pool_unlock(channel->pool);
channel_pool_unlock(channel->pool);
return entry;
}
@ -780,18 +801,17 @@ static NV_STATUS internal_channel_create(uvm_channel_t *channel, unsigned engine
uvm_channel_manager_t *manager = channel->pool->manager;
uvm_gpu_t *gpu = manager->gpu;
if (uvm_channel_is_ce(channel)) {
UVM_ASSERT(channel->pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
}
memset(&channel_alloc_params, 0, sizeof(channel_alloc_params));
channel_alloc_params.numGpFifoEntries = manager->conf.num_gpfifo_entries;
channel_alloc_params.gpFifoLoc = manager->conf.gpfifo_loc;
channel_alloc_params.gpPutLoc = manager->conf.gpput_loc;
channel_alloc_params.engineIndex = engine_index;
if (uvm_channel_is_ce(channel))
if (uvm_channel_is_ce(channel)) {
UVM_ASSERT(channel->pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE);
channel_alloc_params.engineType = UVM_GPU_CHANNEL_ENGINE_TYPE_CE;
}
status = uvm_rm_locked_call(nvUvmInterfaceChannelAllocate(gpu->rm_address_space,
&channel_alloc_params,
@ -923,7 +943,7 @@ NvU64 uvm_channel_tracking_semaphore_get_gpu_va_in_channel(uvm_channel_t *semaph
return uvm_gpu_semaphore_get_gpu_va(semaphore, gpu, uvm_channel_is_proxy(access_channel));
}
static NV_STATUS init_channel(uvm_channel_t *channel)
static NV_STATUS channel_init(uvm_channel_t *channel)
{
uvm_push_t push;
uvm_gpu_t *gpu = uvm_channel_get_gpu(channel);
@ -1010,6 +1030,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
channel_pool_lock_init(pool);
num_channels = channel_pool_type_num_channels(pool_type);
UVM_ASSERT(num_channels <= UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL);
pool->channels = uvm_kvmalloc_zero(sizeof(*pool->channels) * num_channels);
if (!pool->channels)
@ -1024,7 +1045,7 @@ static NV_STATUS channel_pool_add(uvm_channel_manager_t *channel_manager,
if (status != NV_OK)
goto error;
status = init_channel(channel);
status = channel_init(channel);
if (status != NV_OK)
goto error;
}
@ -1404,15 +1425,55 @@ static void init_channel_manager_conf(uvm_channel_manager_t *manager)
manager->conf.gpput_loc = string_to_buffer_location(gpput_loc_value);
}
// A pool is created for each usable CE, even if it has not been selected as the
// preferred CE for any type, because as more information is discovered (for
// example, a pair of peer GPUs is added) we may start using the previously idle
// channels.
// Returns the maximum number of pools that are needed in the current
// configuration. The implementation may choose to create a smaller number of
// pools.
static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager)
{
unsigned num_channel_pools;
unsigned num_used_ce = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
// Create one CE channel pool per usable CE
num_channel_pools = num_used_ce;
// CE proxy channel pool.
if (uvm_gpu_uses_proxy_channel_pool(manager->gpu))
num_channel_pools++;
return num_channel_pools;
}
static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce)
{
unsigned ce;
// A pool is created for each usable CE, even if it has not been selected as
// the preferred CE for any type, because as more information is discovered
// (for example, a pair of peer GPUs is added) we may start using the
// previously idle pools.
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
NV_STATUS status;
unsigned type;
uvm_channel_pool_t *pool = NULL;
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool);
if (status != NV_OK)
return status;
for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) {
if (preferred_ce[type] == ce)
manager->pool_to_use.default_for_type[type] = pool;
}
}
return NV_OK;
}
static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager)
{
NV_STATUS status;
unsigned ce, type;
unsigned num_channel_pools;
uvm_channel_type_t type;
unsigned max_channel_pools;
unsigned preferred_ce[UVM_CHANNEL_TYPE_CE_COUNT];
uvm_channel_pool_t *pool = NULL;
@ -1423,36 +1484,21 @@ static NV_STATUS channel_manager_create_pools(uvm_channel_manager_t *manager)
if (status != NV_OK)
return status;
// CE channel pools
num_channel_pools = bitmap_weight(manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
max_channel_pools = channel_manager_get_max_pools(manager);
// CE proxy channel pool.
if (uvm_gpu_uses_proxy_channel_pool(manager->gpu))
num_channel_pools++;
manager->channel_pools = uvm_kvmalloc_zero(sizeof(*manager->channel_pools) * num_channel_pools);
manager->channel_pools = uvm_kvmalloc_zero(sizeof(*manager->channel_pools) * max_channel_pools);
if (!manager->channel_pools)
return NV_ERR_NO_MEMORY;
for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) {
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool);
status = channel_manager_create_ce_pools(manager, preferred_ce);
if (status != NV_OK)
return status;
}
// Assign channel types to pools
for (type = 0; type < ARRAY_SIZE(preferred_ce); type++) {
unsigned ce = preferred_ce[type];
UVM_ASSERT(test_bit(ce, manager->ce_mask));
manager->pool_to_use.default_for_type[type] = channel_manager_ce_pool(manager, ce);
}
// In SR-IOV heavy, add an additional, single-channel, pool that is
// dedicated to the MEMOPS type.
if (uvm_gpu_uses_proxy_channel_pool(manager->gpu)) {
uvm_channel_type_t channel_type = uvm_channel_proxy_channel_type();
status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE_PROXY, preferred_ce[channel_type], &pool);
if (status != NV_OK)
return status;
@ -1613,7 +1659,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
uvm_channel_manager_t *manager = channel->pool->manager;
UVM_SEQ_OR_DBG_PRINT(s, "Channel %s\n", channel->name);
uvm_channel_pool_lock(channel->pool);
channel_pool_lock(channel->pool);
UVM_SEQ_OR_DBG_PRINT(s, "completed %llu\n", uvm_channel_update_completed_value(channel));
UVM_SEQ_OR_DBG_PRINT(s, "queued %llu\n", channel->tracking_sem.queued_value);
@ -1625,7 +1671,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA 0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA 0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
channel_pool_unlock(channel->pool);
}
static void channel_print_push_acquires(uvm_push_acquire_info_t *push_acquire_info, struct seq_file *seq)
@ -1669,7 +1715,7 @@ static void channel_print_pushes(uvm_channel_t *channel, NvU32 finished_pushes_c
NvU64 completed_value = uvm_channel_update_completed_value(channel);
channel_pool_lock(channel->pool);
cpu_put = channel->cpu_put;
@ -1717,7 +1763,7 @@ static void channel_print_pushes(uvm_channel_t *channel, NvU32 finished_pushes_c
channel_print_push_acquires(push_acquire_info, seq);
}
}
channel_pool_unlock(channel->pool);
}
void uvm_channel_print_pending_pushes(uvm_channel_t *channel)

View File

@ -50,6 +50,9 @@
#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN 32
#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
// Maximum number of channels per pool.
#define UVM_CHANNEL_MAX_NUM_CHANNELS_PER_POOL 8
// Semaphore payloads cannot advance too much between calls to
// uvm_gpu_tracking_semaphore_update_completed_value(). In practice the jumps
// are bound by gpfifo sizing as we have to update the completed value to
@ -61,6 +64,14 @@
// uvm_channel.h includes uvm_gpu_semaphore.h.
#define UVM_GPU_SEMAPHORE_MAX_JUMP (2 * UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX)
#define uvm_channel_pool_assert_locked(pool) ( \
{ \
if (uvm_channel_pool_is_proxy(pool)) \
uvm_assert_mutex_locked(&(pool)->mutex); \
else \
uvm_assert_spinlock_locked(&(pool)->spinlock); \
})
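As an illustration only (not part of the driver source), a minimal sketch of how a helper that touches channel state guarded by the pool lock could use this macro; channel_peek_cpu_put() is a hypothetical name:
// Hypothetical helper: read a field that the pool lock protects. The macro
// checks whichever lock variant (spinlock or mutex) the pool actually uses.
static NvU32 channel_peek_cpu_put(uvm_channel_t *channel)
{
    uvm_channel_pool_assert_locked(channel->pool);
    return channel->cpu_put;
}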
// Channel types
typedef enum
{
@ -162,7 +173,20 @@ typedef struct
// Pool type: Refer to the uvm_channel_pool_type_t enum.
uvm_channel_pool_type_t pool_type;
// Lock protecting the state of channels in the pool
// Lock protecting the state of channels in the pool.
//
// There are two pool lock types available: spinlock and mutex. The mutex
// variant is required when the thread holding the pool lock must
// sleep (ex: acquire another mutex) deeper in the call stack, either in UVM
// or RM. For example, work submission to proxy channels in SR-IOV heavy
// entails calling an RM API that acquires a mutex, so the proxy channel
// pool must use the mutex variant.
//
// Unless the mutex is required, the spinlock is preferred. This is because,
// other than for proxy channels, work submission takes little time and does
// not involve any RM calls, so UVM can avoid any invocation that may result
// in a sleep. All non-proxy channel pools use the spinlock variant, even in
// SR-IOV heavy.
union {
uvm_spinlock_t spinlock;
uvm_mutex_t mutex;
@ -275,7 +299,7 @@ struct uvm_channel_manager_struct
unsigned num_channel_pools;
// Mask containing the indexes of the usable Copy Engines. Each usable CE
// has at least one pool associated with it.
DECLARE_BITMAP(ce_mask, UVM_COPY_ENGINE_COUNT_MAX);
struct
@ -313,10 +337,6 @@ struct uvm_channel_manager_struct
// Create a channel manager for the GPU
NV_STATUS uvm_channel_manager_create(uvm_gpu_t *gpu, uvm_channel_manager_t **manager_out);
void uvm_channel_pool_lock(uvm_channel_pool_t *pool);
void uvm_channel_pool_unlock(uvm_channel_pool_t *pool);
void uvm_channel_pool_assert_locked(uvm_channel_pool_t *pool);
static bool uvm_channel_pool_is_proxy(uvm_channel_pool_t *pool)
{
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
@ -329,10 +349,16 @@ static bool uvm_channel_is_proxy(uvm_channel_t *channel)
return uvm_channel_pool_is_proxy(channel->pool);
}
static bool uvm_channel_pool_is_ce(uvm_channel_pool_t *pool)
{
UVM_ASSERT(pool->pool_type < UVM_CHANNEL_POOL_TYPE_MASK);
return (pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE) || uvm_channel_pool_is_proxy(pool);
}
static bool uvm_channel_is_ce(uvm_channel_t *channel)
{
return uvm_channel_pool_is_ce(channel->pool);
}
// Proxy channels are used to push page tree related methods, so their channel
@ -449,6 +475,10 @@ NV_STATUS uvm_channel_write_ctrl_gpfifo(uvm_channel_t *channel, NvU64 ctrl_fifo_
const char *uvm_channel_type_to_string(uvm_channel_type_t channel_type);
const char *uvm_channel_pool_type_to_string(uvm_channel_pool_type_t channel_pool_type);
// Returns the number of available GPFIFO entries. The function internally
// acquires the channel pool lock.
NvU32 uvm_channel_get_available_gpfifo_entries(uvm_channel_t *channel);
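For illustration (a hypothetical caller, not taken from the source), the declaration above could guard a push that needs free ring space; the threshold of 2 and the error code are only examples:
// Hypothetical guard: bail out early if fewer than two GPFIFO entries are
// free. The call takes and releases the channel pool lock internally.
if (uvm_channel_get_available_gpfifo_entries(channel) < 2)
    return NV_ERR_NO_MEMORY;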
void uvm_channel_print_pending_pushes(uvm_channel_t *channel);
static uvm_gpu_t *uvm_channel_get_gpu(uvm_channel_t *channel)

View File

@ -153,7 +153,6 @@ done:
static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
{
NV_STATUS status;
uvm_gpu_t *gpu;
for_each_va_space_gpu(gpu, va_space) {
@ -168,11 +167,12 @@ static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
completed_value = uvm_channel_update_completed_value(channel);
uvm_gpu_semaphore_set_payload(&channel->tracking_sem.semaphore, (NvU32)completed_value + 1);
TEST_NV_CHECK_RET(uvm_global_get_status());
uvm_channel_update_progress_all(channel);
TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);
uvm_channel_manager_destroy(gpu->channel_manager);
// Destruction will hit the error again, so clear one more time.
uvm_global_reset_fatal_error();
@ -743,22 +743,6 @@ NV_STATUS test_write_ctrl_gpfifo_and_pushes(uvm_va_space_t *va_space)
return NV_OK;
}
static NvU32 get_available_gpfifo_entries(uvm_channel_t *channel)
{
NvU32 pending_entries;
uvm_channel_pool_lock(channel->pool);
if (channel->cpu_put >= channel->gpu_get)
pending_entries = channel->cpu_put - channel->gpu_get;
else
pending_entries = channel->cpu_put + channel->num_gpfifo_entries - channel->gpu_get;
uvm_channel_pool_unlock(channel->pool);
return channel->num_gpfifo_entries - pending_entries - 1;
}
NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
{
NV_STATUS status = NV_OK;
@ -771,9 +755,10 @@ NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
NvU64 entry;
uvm_push_t push;
gpu = uvm_va_space_find_first_gpu(va_space);
for_each_va_space_gpu(gpu, va_space) {
uvm_channel_manager_t *manager = gpu->channel_manager;
gpu = manager->gpu;
TEST_NV_CHECK_RET(uvm_rm_mem_alloc_and_map_cpu(gpu, UVM_RM_MEM_TYPE_SYS, sizeof(*cpu_ptr), 0, &mem));
cpu_ptr = uvm_rm_mem_get_cpu_va(mem);
@ -791,6 +776,12 @@ NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
gpu->parent->host_hal->semaphore_acquire(&push, gpu_va, 1);
uvm_push_end(&push);
// Flush all completed entries from the GPFIFO ring buffer. This test
// requires this flush because we verify (below with
// uvm_channel_get_available_gpfifo_entries) the number of free entries
// in the channel.
uvm_channel_update_progress_all(channel);
// Populate the remaining GPFIFO entries, leaving 2 slots available.
// 2 available entries + 1 semaphore acquire (above) + 1 spare entry to
// indicate a terminal condition for the GPFIFO ringbuffer, therefore we
@ -800,7 +791,7 @@ NV_STATUS test_write_ctrl_gpfifo_tight(uvm_va_space_t *va_space)
uvm_push_end(&push);
}
TEST_CHECK_GOTO(uvm_channel_get_available_gpfifo_entries(channel) == 2, error);
// We should have room for the control GPFIFO and the subsequent
// semaphore release.
@ -936,7 +927,7 @@ done:
static NV_STATUS uvm_test_channel_stress_stream(uvm_va_space_t *va_space,
const UVM_TEST_CHANNEL_STRESS_PARAMS *params)
{
NV_STATUS status = NV_OK;
if (params->iterations == 0 || params->num_streams == 0)
return NV_ERR_INVALID_PARAMETER;
@ -951,10 +942,7 @@ static NV_STATUS uvm_test_channel_stress_stream(uvm_va_space_t *va_space,
params->iterations,
params->seed,
params->verbose);
if (status != NV_OK)
goto done;
done:
uvm_va_space_up_read_rm(va_space);
uvm_mutex_unlock(&g_uvm_global.global_lock);

View File

@ -347,6 +347,21 @@ typedef struct
NvHandle user_object;
} uvm_rm_user_object_t;
typedef enum
{
UVM_FD_UNINITIALIZED,
UVM_FD_INITIALIZING,
UVM_FD_VA_SPACE,
UVM_FD_COUNT
} uvm_fd_type_t;
// This should be large enough to fit the valid values from uvm_fd_type_t above.
// Note we can't use order_base_2(UVM_FD_COUNT) to define this because our code
// coverage tool fails when the preprocessor expands that to a huge mess of
// ternary operators.
#define UVM_FD_TYPE_BITS 2
#define UVM_FD_TYPE_MASK ((1UL << UVM_FD_TYPE_BITS) - 1)
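A minimal sketch, illustrative only and not the driver's actual implementation, of how a uvm_fd_type_t value could be packed into the low UVM_FD_TYPE_BITS bits of filp->private_data, assuming the stored pointer is aligned to at least (1 << UVM_FD_TYPE_BITS) bytes; both helper names are hypothetical:
// Hypothetical pack/unpack helpers for a type-tagged private_data pointer.
static void *uvm_fd_type_pack(void *ptr, uvm_fd_type_t type)
{
    // ptr must have its low UVM_FD_TYPE_BITS bits clear (sufficient alignment).
    return (void *)((unsigned long)ptr | (unsigned long)type);
}

static uvm_fd_type_t uvm_fd_type_unpack(void *packed, void **ptr_out)
{
    if (ptr_out)
        *ptr_out = (void *)((unsigned long)packed & ~UVM_FD_TYPE_MASK);

    return (uvm_fd_type_t)((unsigned long)packed & UVM_FD_TYPE_MASK);
}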
// Macro used to compare two values for types that support less than operator.
// It returns -1 if a < b, 1 if a > b, and 0 if a == b.
#define UVM_CMP_DEFAULT(a,b) \
@ -369,6 +384,10 @@ typedef struct
// file. A NULL input returns false.
bool uvm_file_is_nvidia_uvm(struct file *filp);
// Returns the type of data that filp->private_data contains and, if ptr_val is
// not NULL, also returns the pointer value in *ptr_val.
uvm_fd_type_t uvm_fd_type(struct file *filp, void **ptr_val);
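A short usage sketch (hypothetical caller) of the declaration above:
// Hypothetical caller: only proceed once the fd has been initialized as a
// VA space handle.
void *ptr;
if (uvm_fd_type(filp, &ptr) == UVM_FD_VA_SPACE) {
    uvm_va_space_t *va_space = (uvm_va_space_t *)ptr;
    // ... operate on va_space ...
}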
// Reads the first word in the supplied struct page.
static inline void uvm_touch_page(struct page *page)
{

View File

@ -28,6 +28,8 @@ typedef struct uvm_global_struct uvm_global_t;
typedef struct uvm_gpu_struct uvm_gpu_t;
typedef struct uvm_parent_gpu_struct uvm_parent_gpu_t;
typedef struct uvm_gpu_chunk_struct uvm_gpu_chunk_t;
typedef struct uvm_cpu_chunk_struct uvm_cpu_chunk_t;
typedef struct uvm_rm_mem_struct uvm_rm_mem_t;
typedef struct uvm_mem_struct uvm_mem_t;
typedef struct uvm_host_hal_struct uvm_host_hal_t;
@ -56,6 +58,7 @@ typedef struct uvm_va_range_struct uvm_va_range_t;
typedef struct uvm_va_block_struct uvm_va_block_t;
typedef struct uvm_va_block_test_struct uvm_va_block_test_t;
typedef struct uvm_va_block_wrapper_struct uvm_va_block_wrapper_t;
typedef struct uvm_va_block_retry_struct uvm_va_block_retry_t;
typedef struct uvm_va_space_struct uvm_va_space_t;
typedef struct uvm_va_space_mm_struct uvm_va_space_mm_t;

View File

@ -191,6 +191,16 @@ static void uvm_global_remove_parent_gpu(uvm_parent_gpu_t *parent_gpu)
g_uvm_global.parent_gpus[gpu_index] = NULL;
}
// Get a parent gpu by its id.
// Returns a pointer to the parent GPU object, or NULL if not found.
//
// LOCKING: requires that you hold the gpu_table_lock, the global lock, or have
// retained at least one of the child GPUs.
static uvm_parent_gpu_t *uvm_parent_gpu_get(uvm_gpu_id_t id)
{
return g_uvm_global.parent_gpus[uvm_id_gpu_index(id)];
}
// Get a gpu by its global id.
// Returns a pointer to the GPU object, or NULL if not found.
//

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -94,8 +94,6 @@ static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
return UVM_GPU_LINK_NVLINK_3;
case UVM_LINK_TYPE_NVLINK_4:
return UVM_GPU_LINK_NVLINK_4;
case UVM_LINK_TYPE_C2C:
return UVM_GPU_LINK_C2C;
default:
return UVM_GPU_LINK_INVALID;
}
@ -210,27 +208,12 @@ static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu)
return parent_gpu->rm_info.subdeviceCount == 1;
}
static bool platform_uses_canonical_form_address(void)
{
// PPC64LE doesn't use canonical form addresses.
if (NVCPU_IS_PPC64LE)
return false;
return true;
}
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
@ -239,6 +222,9 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// the canonical address form.
NvU64 max_va_lower;
NvU64 addr_end = addr + size - 1;
NvU8 gpu_addr_shift;
NvU8 cpu_addr_shift;
NvU8 addr_shift;
// Watch out for calling this too early in init
UVM_ASSERT(gpu->address_space_tree.hal);
@ -246,6 +232,10 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
UVM_ASSERT(addr <= addr_end);
UVM_ASSERT(size > 0);
gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
addr_shift = gpu_addr_shift;
// Pascal+ GPUs are capable of accessing kernel pointers in various modes
// by applying the same upper-bit checks that x86, ARM, and Power
// processors do. x86 and ARM use canonical form addresses. For ARM, even
@ -255,13 +245,15 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// mapped (or addressed) by the GPU/CPU when the CPU uses canonical form.
// (C) regions are only accessible by the CPU. Similarly, (G) regions
// are only accessible by the GPU. (X) regions are not addressable.
// Note that we only consider (V) regions, i.e., address ranges that are
// addressable by both the CPU and the GPU.
//
// GPU MAX VA < CPU MAX VA GPU MAX VA >= CPU MAX VA
// 0xF..F +----------------+ 0xF..F +----------------+
// |CCCCCCCCCCCCCCCC| |VVVVVVVVVVVVVVVV|
// |CCCCCCCCCCCCCCCC| |VVVVVVVVVVVVVVVV|
// |CCCCCCCCCCCCCCCC| |VVVVVVVVVVVVVVVV|
// |CCCCCCCCCCCCCCCC| CPU MIN UPPER VA|----------------|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// GPU MIN UPPER VA|----------------| CPU MIN UPPER VA|----------------|
// |CCCCCCCCCCCCCCCC| |GGGGGGGGGGGGGGGG|
// |CCCCCCCCCCCCCCCC| |GGGGGGGGGGGGGGGG|
// CPU MIN UPPER VA|----------------| GPU MIN UPPER VA|----------------|
@ -270,32 +262,83 @@ bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
// CPU MAX LOWER VA|----------------| GPU MAX LOWER VA|----------------|
// |CCCCCCCCCCCCCCCC| |GGGGGGGGGGGGGGGG|
// |CCCCCCCCCCCCCCCC| |GGGGGGGGGGGGGGGG|
// GPU MAX LOWER VA|----------------| CPU MAX LOWER VA|----------------|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// |VVVVVVVVVVVVVVVV| |VVVVVVVVVVVVVVVV|
// 0 +----------------+ 0 +----------------+
// On canonical form address platforms and Pascal+ GPUs.
if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
NvU64 min_va_upper;
// On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
// 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
// wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
// See uvm_parent_gpu_canonical_address() for more details.
if (cpu_addr_shift > gpu_addr_shift)
addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
else if (gpu_addr_shift == 57)
addr_shift = gpu_addr_shift;
else
addr_shift = cpu_addr_shift;
min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
max_va_lower = 1ULL << (addr_shift - 1);
return (addr_end < max_va_lower) || (addr >= min_va_upper);
}
else {
max_va_lower = 1ULL << addr_shift;
return addr_end < max_va_lower;
}
}
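As a worked illustration of the bounds computed above (the concrete numbers are added here, not taken from the source), for addr_shift == 48, e.g. an x86-64 CPU with 4-level paging:
// max_va_lower = 1ULL << 47                       = 0x0000800000000000
// min_va_upper = (NvS64)(1ULL << 63) >> (64 - 48) = 0xFFFF800000000000
// so [0, 0x00007FFFFFFFFFFF] and [0xFFFF800000000000, ~0ULL] are addressable,
// and the non-canonical hole in between is rejected.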
// The internal UVM VAS does not use canonical form addresses.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size)
{
NvU64 addr_end = addr + size - 1;
NvU64 max_gpu_va;
// Watch out for calling this too early in init
UVM_ASSERT(gpu->address_space_tree.hal);
UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64);
UVM_ASSERT(addr <= addr_end);
UVM_ASSERT(size > 0);
max_gpu_va = 1ULL << gpu->address_space_tree.hal->num_va_bits();
return addr_end < max_gpu_va;
}
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr)
{
NvU8 gpu_addr_shift;
NvU8 cpu_addr_shift;
NvU8 addr_shift;
NvU64 input_addr = addr;
if (platform_uses_canonical_form_address()) {
// When the CPU VA width is larger than the GPU's, it means that:
// On ARM: the CPU is in LVA mode and the GPU is pre-Hopper.
// On x86: the CPU uses 5-level paging and the GPU is pre-Hopper.
// We sign-extend from bit 48 on ARM and from bit 47 on x86 to mirror the
// behavior of CPUs with smaller (than GPU) VA widths.
gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits();
cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
if (cpu_addr_shift > gpu_addr_shift)
addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
else if (gpu_addr_shift == 57)
addr_shift = gpu_addr_shift;
else
addr_shift = cpu_addr_shift;
addr = (NvU64)((NvS64)(addr << (64 - addr_shift)) >> (64 - addr_shift));
// This protection applies when the address is not covered by the GPU's
// OOR_ADDR_CHECK. This can only happen when OOR_ADDR_CHECK is in
// permissive (NO_CHECK) mode.
if ((addr << (64 - gpu_addr_shift)) != (input_addr << (64 - gpu_addr_shift)))
return input_addr;
}
return addr;
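A small worked example of the sign extension above (illustrative only, not from the source), assuming addr_shift == 48:
// input  addr = 0x0000FFFF00000000 (bit 47 set)
// addr << 16  = 0xFFFF000000000000
// arithmetic >> 16 sign-extends to 0xFFFFFFFF00000000, the canonical
// upper-half form of the same 48-bit address; an input with bit 47 clear
// is returned unchanged.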
@ -351,7 +394,7 @@ static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode)
static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
{
BUILD_BUG_ON(UVM_GPU_LINK_MAX != 6);
switch (link_type) {
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_INVALID);
@ -360,7 +403,6 @@ static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type)
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_2);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_3);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_4);
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_C2C);
UVM_ENUM_STRING_DEFAULT();
}
}
@ -866,6 +908,7 @@ static NV_STATUS alloc_parent_gpu(const NvProcessorUuid *gpu_uuid,
uvm_parent_gpu_t **parent_gpu_out)
{
uvm_parent_gpu_t *parent_gpu;
NV_STATUS status;
parent_gpu = uvm_kvmalloc_zero(sizeof(*parent_gpu));
if (!parent_gpu)
@ -882,11 +925,14 @@ static NV_STATUS alloc_parent_gpu(const NvProcessorUuid *gpu_uuid,
uvm_rb_tree_init(&parent_gpu->instance_ptr_table);
uvm_rb_tree_init(&parent_gpu->tsg_table);
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
status = errno_to_nv_status(nv_kthread_q_init(&parent_gpu->lazy_free_q, "vidmem lazy free"));
nv_kref_init(&parent_gpu->gpu_kref);
*parent_gpu_out = parent_gpu;
return status;
}
// Allocates a uvm_gpu_t struct and initializes the basic fields and leaves all
@ -1539,6 +1585,8 @@ static void uvm_parent_gpu_destroy(nv_kref_t *nv_kref)
UVM_ASSERT(parent_gpu->num_retained_gpus == 0);
UVM_ASSERT(bitmap_empty(parent_gpu->valid_gpus, UVM_ID_MAX_SUB_PROCESSORS));
nv_kthread_q_stop(&parent_gpu->lazy_free_q);
for (sub_processor_index = 0; sub_processor_index < UVM_ID_MAX_SUB_PROCESSORS; sub_processor_index++)
UVM_ASSERT(!parent_gpu->gpus[sub_processor_index]);
@ -2165,12 +2213,9 @@ static NV_STATUS init_peer_access(uvm_gpu_t *gpu0,
{
NV_STATUS status;
UVM_ASSERT(p2p_caps_params->p2pLink != UVM_LINK_TYPE_C2C);
// check for peer-to-peer compatibility (PCI-E or NvLink).
peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink);
if (peer_caps->link_type == UVM_GPU_LINK_INVALID)
return NV_ERR_NOT_SUPPORTED;
@ -2553,7 +2598,10 @@ uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu
uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu)
{
// See comment in page_tree_set_location
if (uvm_gpu_is_virt_mode_sriov_heavy(gpu))
return UVM_APERTURE_VID;
return UVM_APERTURE_DEFAULT;
}
uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr)
@ -2964,9 +3012,6 @@ NV_STATUS uvm_gpu_fault_entry_to_va_space(uvm_gpu_t *gpu,
exit_unlock:
uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
if (status == NV_OK)
UVM_ASSERT(uvm_va_space_initialized(*out_va_space) == NV_OK);
return status;
}
@ -3005,9 +3050,6 @@ NV_STATUS uvm_gpu_access_counter_entry_to_va_space(uvm_gpu_t *gpu,
exit_unlock:
uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock);
if (status == NV_OK)
UVM_ASSERT(uvm_va_space_initialized(*out_va_space) == NV_OK);
return status;
}

View File

@ -386,7 +386,8 @@ struct uvm_access_counter_service_batch_context_struct
// Virtual address notifications are always aligned to 64k. This means up to 16
// different physical locations could have been accessed to trigger one notification.
// The sub-granularity mask can correspond to any of them.
struct
{
uvm_processor_id_t resident_processors[16];
uvm_gpu_phys_address_t phys_addresses[16];
uvm_access_counter_buffer_entry_t phys_entry;
@ -523,7 +524,6 @@ typedef enum
UVM_GPU_LINK_NVLINK_2,
UVM_GPU_LINK_NVLINK_3,
UVM_GPU_LINK_NVLINK_4,
UVM_GPU_LINK_C2C,
UVM_GPU_LINK_MAX
} uvm_gpu_link_type_t;
@ -957,6 +957,10 @@ struct uvm_parent_gpu_struct
// NUMA info, mainly for ATS
uvm_numa_info_t numa_info;
// PMM lazy free processing queue.
// TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue.
nv_kthread_q_t lazy_free_q;
// Access counter buffer info. This is only valid if supports_access_counters is set to true
uvm_access_counter_buffer_info_t access_counter_buffer_info;
@ -1120,7 +1124,8 @@ struct uvm_gpu_peer_struct
// deletion.
NvHandle p2p_handle;
struct
{
struct proc_dir_entry *peer_file[2];
struct proc_dir_entry *peer_symlink_file[2];
@ -1364,6 +1369,16 @@ void uvm_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_add
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// Returns whether the given range is within the GPU's addressable VA ranges in
// the internal GPU VA "kernel" address space, which is a linear address space.
// Therefore, the input 'addr' must not be in canonical form, even platforms
// that use to the canonical form addresses, i.e., ARM64, and x86.
// Warning: This only checks whether the GPU's MMU can support the given
// address. Some HW units on that GPU might only support a smaller range.
//
// The GPU must be initialized before calling this function.
bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size);
// Returns addr's canonical form for host systems that use canonical form
// addresses.
NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr);

View File

@ -459,8 +459,6 @@ static void kill_channel_delayed(void *_user_channel)
uvm_user_channel_t *user_channel = (uvm_user_channel_t *)_user_channel;
uvm_va_space_t *va_space = user_channel->kill_channel.va_space;
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
uvm_va_space_down_read_rm(va_space);
if (user_channel->gpu_va_space) {
// RM handles the fault, which will do the correct fault reporting in the

View File

@ -1034,6 +1034,61 @@ static NV_STATUS preprocess_fault_batch(uvm_gpu_t *gpu, uvm_fault_service_batch_
return NV_OK;
}
static bool check_fault_entry_duplicate(const uvm_fault_buffer_entry_t *current_entry,
const uvm_fault_buffer_entry_t *previous_entry)
{
bool is_duplicate = false;
if (previous_entry) {
is_duplicate = (current_entry->va_space == previous_entry->va_space) &&
(current_entry->fault_address == previous_entry->fault_address);
}
return is_duplicate;
}
static void fault_entry_duplicate_flags(uvm_fault_buffer_entry_t *current_entry,
const uvm_fault_buffer_entry_t *previous_entry)
{
UVM_ASSERT(previous_entry);
UVM_ASSERT(check_fault_entry_duplicate(current_entry, previous_entry));
// Propagate the is_invalid_prefetch flag across all prefetch faults
// on the page
if (previous_entry->is_invalid_prefetch)
current_entry->is_invalid_prefetch = true;
// If a page is throttled, all faults on the page must be skipped
if (previous_entry->is_throttled)
current_entry->is_throttled = true;
}
static void update_batch_context(uvm_fault_service_batch_context_t *batch_context,
uvm_fault_buffer_entry_t *current_entry,
const uvm_fault_buffer_entry_t *previous_entry)
{
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
UVM_ASSERT(utlb->num_pending_faults > 0);
if (is_duplicate)
batch_context->num_duplicate_faults += current_entry->num_instances;
else
batch_context->num_duplicate_faults += current_entry->num_instances - 1;
if (current_entry->is_invalid_prefetch)
batch_context->num_invalid_prefetch_faults += current_entry->num_instances;
if (current_entry->is_fatal) {
utlb->has_fatal_faults = true;
batch_context->has_fatal_faults = true;
}
if (current_entry->is_throttled)
batch_context->has_throttled_faults = true;
}
// This function computes the maximum access type that can be serviced for the
// reported fault instances given the logical permissions of the VA range. If
// none of the fault instances can be serviced UVM_FAULT_ACCESS_TYPE_COUNT is
@ -1122,11 +1177,11 @@ static uvm_fault_access_type_t check_fault_access_permissions(uvm_gpu_t *gpu,
// - NV_ERR_MORE_PROCESSING_REQUIRED if servicing needs allocation retry
// - NV_ERR_NO_MEMORY if the faults could not be serviced due to OOM
// - Any other value is a UVM-global error
static NV_STATUS service_fault_batch_block_locked(uvm_gpu_t *gpu,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
{
NV_STATUS status = NV_OK;
@ -1200,7 +1255,7 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
if (i > first_fault_index) {
previous_entry = ordered_fault_cache[i - 1];
is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
}
if (block_context->num_retries == 0) {
@ -1215,12 +1270,7 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
// Service the most intrusive fault per page, only. Waive the rest
if (is_duplicate) {
fault_entry_duplicate_flags(current_entry, previous_entry);
// The previous fault was non-fatal so the page has been already
// serviced
@ -1316,25 +1366,8 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
// Only update counters the first time since logical permissions cannot
// change while we hold the VA space lock
// TODO: Bug 1750144: That might not be true with HMM.
if (block_context->num_retries == 0)
update_batch_context(batch_context, current_entry, previous_entry);
}
// Apply the changes computed in the fault service block context, if there
@ -1361,10 +1394,10 @@ static NV_STATUS service_batch_managed_faults_in_block_locked(uvm_gpu_t *gpu,
//
// See the comments for function service_fault_batch_block_locked for
// implementation details and error codes.
static NV_STATUS service_fault_batch_block(uvm_gpu_t *gpu,
uvm_va_block_t *va_block,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
{
NV_STATUS status;
@ -1378,11 +1411,11 @@ static NV_STATUS service_batch_managed_faults_in_block(uvm_gpu_t *gpu,
uvm_mutex_lock(&va_block->lock);
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, &va_block_retry,
service_fault_batch_block_locked(gpu,
va_block,
&va_block_retry,
batch_context,
first_fault_index,
block_faults));
tracker_status = uvm_tracker_add_tracker_safe(&batch_context->tracker, &va_block->tracker);
@ -1402,59 +1435,65 @@ typedef enum
FAULT_SERVICE_MODE_CANCEL,
} fault_service_mode_t;
static NV_STATUS service_fault_batch_ats(uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
{
NV_STATUS status;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
if (is_duplicate)
fault_entry_duplicate_flags(current_entry, previous_entry);
// Generate fault events for all fault packets
uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
NULL,
gpu->id,
UVM_ID_INVALID,
current_entry,
batch_context->batch_id,
is_duplicate);
// The VA isn't managed. See if ATS knows about it, unless it is a
// duplicate and the previous fault was non-fatal so the page has
// already been serviced
//
// TODO: Bug 2103669: Service more than one ATS fault at a time so we
// don't do an unconditional VA range lookup for every ATS fault.
if (!is_duplicate || previous_entry->is_fatal)
status = uvm_ats_service_fault_entry(gpu_va_space, current_entry, ats_invalidate);
else
status = NV_OK;
(*block_faults)++;
update_batch_context(batch_context, current_entry, previous_entry);
return status;
}
static void service_fault_batch_fatal(uvm_gpu_t *gpu,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NV_STATUS status,
NvU32 *block_faults)
{
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
const uvm_fault_buffer_entry_t *previous_entry = first_fault_index > 0 ?
batch_context->ordered_fault_cache[first_fault_index - 1] : NULL;
bool is_duplicate = check_fault_entry_duplicate(current_entry, previous_entry);
if (is_duplicate)
fault_entry_duplicate_flags(current_entry, previous_entry);
// The VA block cannot be found, set the fatal fault flag,
// unless it is a prefetch fault
if (current_entry->fault_access_type == UVM_FAULT_ACCESS_TYPE_PREFETCH) {
current_entry->is_invalid_prefetch = true;
@ -1465,31 +1504,59 @@ static NV_STATUS service_non_managed_fault(uvm_fault_buffer_entry_t *current_ent
current_entry->replayable.cancel_va_mode = UVM_FAULT_CANCEL_VA_MODE_ALL;
}
update_batch_context(batch_context, current_entry, previous_entry);
uvm_perf_event_notify_gpu_fault(&current_entry->va_space->perf_events,
NULL,
gpu->id,
UVM_ID_INVALID,
current_entry,
batch_context->batch_id,
is_duplicate);
(*block_faults)++;
}
static NV_STATUS service_fault_batch_dispatch(uvm_va_space_t *va_space,
uvm_gpu_va_space_t *gpu_va_space,
uvm_fault_service_batch_context_t *batch_context,
NvU32 first_fault_index,
NvU32 *block_faults)
{
NV_STATUS status;
uvm_va_range_t *va_range;
uvm_va_block_t *va_block;
uvm_gpu_t *gpu = gpu_va_space->gpu;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[first_fault_index];
struct mm_struct *mm = va_block_context->mm;
NvU64 fault_address = current_entry->fault_address;
(*block_faults) = 0;
va_range = uvm_va_range_find(va_space, fault_address);
status = uvm_va_block_find_create_in_range(va_space, va_range, fault_address, va_block_context, &va_block);
if (status == NV_OK) {
status = service_fault_batch_block(gpu, va_block, batch_context, first_fault_index, block_faults);
}
else if ((status == NV_ERR_INVALID_ADDRESS) && uvm_ats_can_service_faults(gpu_va_space, mm)) {
status = service_fault_batch_ats(gpu_va_space, mm, batch_context, first_fault_index, block_faults);
}
else {
service_fault_batch_fatal(gpu_va_space->gpu, batch_context, first_fault_index, status, block_faults);
// Do not fail due to logical errors
status = NV_OK;
}
return status;
}
// Scan the ordered view of faults and group them by different va_blocks.
// Service faults for each va_block, in batch.
// Scan the ordered view of faults and group them by different va_blocks
// (managed faults) and service faults for each va_block, in batch.
// Service non-managed faults one at a time as they are encountered during the
// scan.
//
// This function returns NV_WARN_MORE_PROCESSING_REQUIRED if the fault buffer
// was flushed because the needs_fault_buffer_flush flag was set on some GPU VA
@ -1503,9 +1570,9 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
uvm_va_space_t *va_space = NULL;
uvm_gpu_va_space_t *gpu_va_space = NULL;
uvm_ats_fault_invalidate_t *ats_invalidate = &gpu->parent->fault_buffer_info.replayable.ats_invalidate;
const bool replay_per_va_block = service_mode != FAULT_SERVICE_MODE_CANCEL &&
gpu->parent->fault_buffer_info.replayable.replay_policy == UVM_PERF_FAULT_REPLAY_POLICY_BLOCK;
struct mm_struct *mm = NULL;
uvm_va_block_context_t *va_block_context =
&gpu->parent->fault_buffer_info.replayable.block_service_context.block_context;
@ -1514,7 +1581,6 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
ats_invalidate->write_faults_in_batch = false;
for (i = 0; i < batch_context->num_coalesced_faults;) {
NvU32 block_faults;
uvm_fault_buffer_entry_t *current_entry = batch_context->ordered_fault_cache[i];
uvm_fault_utlb_info_t *utlb = &batch_context->utlbs[current_entry->fault_source.utlb_id];
@ -1548,14 +1614,11 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
uvm_va_space_down_read(va_space);
gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
// flush if required and clear the flush flag
if (uvm_processor_mask_test_and_clear_atomic(&va_space->needs_fault_buffer_flush, gpu->id)) {
status = fault_buffer_flush_locked(gpu,
UVM_GPU_BUFFER_FLUSH_MODE_UPDATE_PUT,
UVM_FAULT_REPLAY_TYPE_START,
batch_context);
if (status == NV_OK)
status = NV_WARN_MORE_PROCESSING_REQUIRED;
@ -1586,49 +1649,22 @@ static NV_STATUS service_fault_batch(uvm_gpu_t *gpu,
continue;
}
status = service_fault_batch_dispatch(va_space, gpu_va_space, batch_context, i, &block_faults);
// TODO: Bug 3900733: clean up locking in service_fault_batch().
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
uvm_va_space_up_read(va_space);
uvm_va_space_mm_release_unlock(va_space, mm);
mm = NULL;
va_space = NULL;
continue;
}
if (status != NV_OK)
goto fail;
i += block_faults;
// Don't issue replays in cancel mode
if (replay_per_va_block && !batch_context->has_fatal_faults) {
status = push_replay_on_gpu(gpu, UVM_FAULT_REPLAY_TYPE_START, batch_context);
if (status != NV_OK)
goto fail;

View File

@ -131,6 +131,7 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
uvm_gpu_semaphore_pool_page_t *pool_page;
NvU32 *payloads;
size_t i;
uvm_rm_mem_type_t rm_mem_type = UVM_RM_MEM_TYPE_SYS;
uvm_assert_mutex_locked(&pool->mutex);
@ -142,7 +143,7 @@ static NV_STATUS pool_alloc_page(uvm_gpu_semaphore_pool_t *pool)
pool_page->pool = pool;
status = uvm_rm_mem_alloc_and_map_all(pool->gpu,
rm_mem_type,
UVM_SEMAPHORE_PAGE_SIZE,
0,
&pool_page->memory);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -44,6 +44,10 @@
#include "clc86f.h"
#include "clc8b5.h"
static int uvm_downgrade_force_membar_sys = 1;
module_param(uvm_downgrade_force_membar_sys, uint, 0644);
MODULE_PARM_DESC(uvm_downgrade_force_membar_sys, "Force all TLB invalidation downgrades to use MEMBAR_SYS");
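For reference only (not stated in the source): assuming the usual nvidia-uvm module name, this parameter would typically be set at load time, for example "modprobe nvidia-uvm uvm_downgrade_force_membar_sys=0", or, given the 0644 permissions above, adjusted at runtime through /sys/module/nvidia_uvm/parameters/uvm_downgrade_force_membar_sys.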
#define CE_OP_COUNT (sizeof(uvm_ce_hal_t) / sizeof(void *))
#define HOST_OP_COUNT (sizeof(uvm_host_hal_t) / sizeof(void *))
#define ARCH_OP_COUNT (sizeof(uvm_arch_hal_t) / sizeof(void *))
@ -61,7 +65,7 @@ static uvm_hal_class_ops_t ce_table[] =
.id = MAXWELL_DMA_COPY_A,
.u.ce_ops = {
.init = uvm_hal_maxwell_ce_init,
.method_validate = uvm_hal_method_validate_stub,
.method_is_valid = uvm_hal_method_is_valid_stub,
.semaphore_release = uvm_hal_maxwell_ce_semaphore_release,
.semaphore_timestamp = uvm_hal_maxwell_ce_semaphore_timestamp,
.semaphore_reduction_inc = uvm_hal_maxwell_ce_semaphore_reduction_inc,
@ -69,11 +73,11 @@ static uvm_hal_class_ops_t ce_table[] =
.offset_in_out = uvm_hal_maxwell_ce_offset_in_out,
.phys_mode = uvm_hal_maxwell_ce_phys_mode,
.plc_mode = uvm_hal_maxwell_ce_plc_mode,
.memcopy_validate = uvm_hal_ce_memcopy_validate_stub,
.memcopy_is_valid = uvm_hal_ce_memcopy_is_valid_stub,
.memcopy_patch_src = uvm_hal_ce_memcopy_patch_src_stub,
.memcopy = uvm_hal_maxwell_ce_memcopy,
.memcopy_v_to_v = uvm_hal_maxwell_ce_memcopy_v_to_v,
.memset_validate = uvm_hal_ce_memset_validate_stub,
.memset_is_valid = uvm_hal_ce_memset_is_valid_stub,
.memset_1 = uvm_hal_maxwell_ce_memset_1,
.memset_4 = uvm_hal_maxwell_ce_memset_4,
.memset_8 = uvm_hal_maxwell_ce_memset_8,
@ -99,7 +103,15 @@ static uvm_hal_class_ops_t ce_table[] =
{
.id = VOLTA_DMA_COPY_A,
.parent_id = PASCAL_DMA_COPY_B,
.u.ce_ops = {
.semaphore_release = uvm_hal_volta_ce_semaphore_release,
.semaphore_timestamp = uvm_hal_volta_ce_semaphore_timestamp,
.semaphore_reduction_inc = uvm_hal_volta_ce_semaphore_reduction_inc,
.memcopy = uvm_hal_volta_ce_memcopy,
.memset_1 = uvm_hal_volta_ce_memset_1,
.memset_4 = uvm_hal_volta_ce_memset_4,
.memset_8 = uvm_hal_volta_ce_memset_8,
},
},
{
.id = TURING_DMA_COPY_A,
@ -110,22 +122,22 @@ static uvm_hal_class_ops_t ce_table[] =
.id = AMPERE_DMA_COPY_A,
.parent_id = TURING_DMA_COPY_A,
.u.ce_ops = {
.method_validate = uvm_hal_ampere_ce_method_validate_c6b5,
.method_is_valid = uvm_hal_ampere_ce_method_is_valid_c6b5,
.phys_mode = uvm_hal_ampere_ce_phys_mode,
.memcopy_validate = uvm_hal_ampere_ce_memcopy_validate_c6b5,
.memcopy_is_valid = uvm_hal_ampere_ce_memcopy_is_valid_c6b5,
.memcopy_patch_src = uvm_hal_ampere_ce_memcopy_patch_src_c6b5,
.memset_validate = uvm_hal_ampere_ce_memset_validate_c6b5,
.memset_is_valid = uvm_hal_ampere_ce_memset_is_valid_c6b5,
},
},
{
.id = AMPERE_DMA_COPY_B,
.parent_id = AMPERE_DMA_COPY_A,
.u.ce_ops = {
.method_validate = uvm_hal_method_validate_stub,
.method_is_valid = uvm_hal_method_is_valid_stub,
.plc_mode = uvm_hal_ampere_ce_plc_mode_c7b5,
.memcopy_validate = uvm_hal_ce_memcopy_validate_stub,
.memcopy_is_valid = uvm_hal_ce_memcopy_is_valid_stub,
.memcopy_patch_src = uvm_hal_ce_memcopy_patch_src_stub,
.memset_validate = uvm_hal_ce_memset_validate_stub,
.memset_is_valid = uvm_hal_ce_memset_is_valid_stub,
},
},
{
@ -140,6 +152,8 @@ static uvm_hal_class_ops_t ce_table[] =
.memset_1 = uvm_hal_hopper_ce_memset_1,
.memset_4 = uvm_hal_hopper_ce_memset_4,
.memset_8 = uvm_hal_hopper_ce_memset_8,
.memcopy_is_valid = uvm_hal_hopper_ce_memcopy_is_valid,
.memset_is_valid = uvm_hal_hopper_ce_memset_is_valid,
},
},
};
@ -152,8 +166,8 @@ static uvm_hal_class_ops_t host_table[] =
.id = KEPLER_CHANNEL_GPFIFO_B,
.u.host_ops = {
.init = uvm_hal_maxwell_host_init_noop,
.method_validate = uvm_hal_method_validate_stub,
.sw_method_validate = uvm_hal_method_validate_stub,
.method_is_valid = uvm_hal_method_is_valid_stub,
.sw_method_is_valid = uvm_hal_method_is_valid_stub,
.wait_for_idle = uvm_hal_maxwell_host_wait_for_idle,
.membar_sys = uvm_hal_maxwell_host_membar_sys,
// No MEMBAR GPU until Pascal, just do a MEMBAR SYS.
@ -235,8 +249,8 @@ static uvm_hal_class_ops_t host_table[] =
.id = AMPERE_CHANNEL_GPFIFO_A,
.parent_id = TURING_CHANNEL_GPFIFO_A,
.u.host_ops = {
.method_validate = uvm_hal_ampere_host_method_validate,
.sw_method_validate = uvm_hal_ampere_host_sw_method_validate,
.method_is_valid = uvm_hal_ampere_host_method_is_valid,
.sw_method_is_valid = uvm_hal_ampere_host_sw_method_is_valid,
.clear_faulted_channel_sw_method = uvm_hal_ampere_host_clear_faulted_channel_sw_method,
.clear_faulted_channel_register = uvm_hal_ampere_host_clear_faulted_channel_register,
.tlb_invalidate_all = uvm_hal_ampere_host_tlb_invalidate_all,
@ -248,8 +262,8 @@ static uvm_hal_class_ops_t host_table[] =
.id = HOPPER_CHANNEL_GPFIFO_A,
.parent_id = AMPERE_CHANNEL_GPFIFO_A,
.u.host_ops = {
.method_validate = uvm_hal_method_validate_stub,
.sw_method_validate = uvm_hal_method_validate_stub,
.method_is_valid = uvm_hal_method_is_valid_stub,
.sw_method_is_valid = uvm_hal_method_is_valid_stub,
.semaphore_acquire = uvm_hal_hopper_host_semaphore_acquire,
.semaphore_release = uvm_hal_hopper_host_semaphore_release,
.semaphore_timestamp = uvm_hal_hopper_host_semaphore_timestamp,
@ -637,14 +651,20 @@ NV_STATUS uvm_hal_init_gpu(uvm_parent_gpu_t *parent_gpu)
return NV_OK;
}
static void hal_override_properties(uvm_parent_gpu_t *parent_gpu)
{
// Access counters are currently not supported in vGPU.
//
// TODO: Bug 200692962: Add support for access counters in vGPU
if (parent_gpu->virt_mode != UVM_VIRT_MODE_NONE)
parent_gpu->access_counters_supported = false;
}
void uvm_hal_init_properties(uvm_parent_gpu_t *parent_gpu)
{
parent_gpu->arch_hal->init_properties(parent_gpu);
hal_override_properties(parent_gpu);
}
void uvm_hal_tlb_invalidate_membar(uvm_push_t *push, uvm_membar_t membar)
@ -663,6 +683,44 @@ void uvm_hal_tlb_invalidate_membar(uvm_push_t *push, uvm_membar_t membar)
uvm_hal_membar(gpu, push, membar);
}
bool uvm_hal_membar_before_semaphore(uvm_push_t *push)
{
uvm_membar_t membar = uvm_push_get_and_reset_membar_flag(push);
if (membar == UVM_MEMBAR_NONE) {
// No MEMBAR requested, don't use a flush.
return false;
}
if (membar == UVM_MEMBAR_GPU) {
// MEMBAR GPU requested, do it on the HOST and skip the engine flush as
// it doesn't have this capability.
uvm_hal_wfi_membar(push, UVM_MEMBAR_GPU);
return false;
}
// By default do a MEMBAR SYS and for that we can just use flush on the
// semaphore operation.
return true;
}
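Illustrative only (hypothetical caller, simplified): the helper above is meant to be called while building a semaphore release on engines that lack a FLUSH_TYPE field, as in the sketch below.
// Hypothetical usage pattern: decide whether the semaphore release method
// itself must carry the engine flush (MEMBAR_SYS), or whether the membar was
// already handled (Host WFI + membar.gpu) or not requested at all.
bool needs_flush = uvm_hal_membar_before_semaphore(push);
if (needs_flush) {
    // set the engine's FLUSH/MEMBAR_SYS bit on the release method
}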
uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem)
{
// If the mapped memory was local, and we're not using a coherence protocol,
// we only need a GPU-local membar. This is because all accesses to this
// memory, including those from other processors like the CPU or peer GPUs,
// must come through this GPU's L2. In all current architectures, MEMBAR_GPU
// is sufficient to resolve ordering at the L2 level.
if (is_local_vidmem && !gpu->parent->numa_info.enabled && !uvm_downgrade_force_membar_sys)
return UVM_MEMBAR_GPU;
// If the mapped memory was remote, or if a coherence protocol can cache
// this GPU's memory, then there are external ways for other processors to
// access the memory without always going through the local GPU's L2, so we must use a
// MEMBAR_SYS.
return UVM_MEMBAR_SYS;
}
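A short usage sketch (hypothetical caller; the TLB invalidate call itself is elided):
// Pick the membar for a TLB invalidate that downgrades PTEs which all mapped
// this GPU's own vidmem, then pass it to the host HAL invalidate.
uvm_membar_t membar = uvm_hal_downgrade_membar_type(gpu, true /* is_local_vidmem */);
// gpu->parent->host_hal->tlb_invalidate_va(..., membar);  // arguments elided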
const char *uvm_aperture_string(uvm_aperture_t aperture)
{
BUILD_BUG_ON(UVM_APERTURE_MAX != 12);
@ -823,12 +881,12 @@ void uvm_hal_print_access_counter_buffer_entry(const uvm_access_counter_buffer_e
UVM_DBG_PRINT(" tag %x\n", entry->tag);
}
bool uvm_hal_method_validate_stub(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
bool uvm_hal_method_is_valid_stub(uvm_push_t *push, NvU32 method_address, NvU32 method_data)
{
return true;
}
bool uvm_hal_ce_memcopy_validate_stub(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
bool uvm_hal_ce_memcopy_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
return true;
}
@ -837,7 +895,7 @@ void uvm_hal_ce_memcopy_patch_src_stub(uvm_push_t *push, uvm_gpu_address_t *src)
{
}
bool uvm_hal_ce_memset_validate_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
{
return true;
}

View File

@ -34,7 +34,7 @@
// A dummy method validation that always returns true; it can be used to skip
// CE/Host/SW method validations for a given architecture
bool uvm_hal_method_validate_stub(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_method_is_valid_stub(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
typedef void (*uvm_hal_init_t)(uvm_push_t *push);
void uvm_hal_maxwell_ce_init(uvm_push_t *push);
@ -42,12 +42,12 @@ void uvm_hal_maxwell_host_init_noop(uvm_push_t *push);
void uvm_hal_pascal_host_init(uvm_push_t *push);
// Host method validation
typedef bool (*uvm_hal_host_method_validate)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_host_method_validate(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
typedef bool (*uvm_hal_host_method_is_valid)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_host_method_is_valid(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
// SW method validation
typedef bool (*uvm_hal_host_sw_method_validate)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_host_sw_method_validate(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
typedef bool (*uvm_hal_host_sw_method_is_valid)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_host_sw_method_is_valid(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
// Wait for idle
typedef void (*uvm_hal_wait_for_idle_t)(uvm_push_t *push);
@ -208,6 +208,7 @@ typedef void (*uvm_hal_semaphore_release_t)(uvm_push_t *push, NvU64 gpu_va, NvU3
void uvm_hal_maxwell_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_pascal_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_volta_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_turing_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
@ -220,6 +221,7 @@ void uvm_hal_hopper_host_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32
typedef void (*uvm_hal_semaphore_timestamp_t)(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_maxwell_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_pascal_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_volta_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
void uvm_hal_maxwell_host_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va);
@ -272,16 +274,17 @@ NvU32 uvm_hal_maxwell_ce_plc_mode(void);
NvU32 uvm_hal_ampere_ce_plc_mode_c7b5(void);
// CE method validation
typedef bool (*uvm_hal_ce_method_validate)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_ce_method_validate_c6b5(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
typedef bool (*uvm_hal_ce_method_is_valid)(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
bool uvm_hal_ampere_ce_method_is_valid_c6b5(uvm_push_t *push, NvU32 method_address, NvU32 method_data);
// Memcopy validation.
// The validation happens at the start of the memcopy (uvm_hal_memcopy_t)
// execution. Use uvm_hal_ce_memcopy_is_valid_stub to skip the validation for
// a given architecture.
typedef bool (*uvm_hal_ce_memcopy_validate)(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
bool uvm_hal_ce_memcopy_validate_stub(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
bool uvm_hal_ampere_ce_memcopy_validate_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
typedef bool (*uvm_hal_ce_memcopy_is_valid)(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
bool uvm_hal_ce_memcopy_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
bool uvm_hal_ampere_ce_memcopy_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src);
// Patching of the memcopy source; if not needed for a given architecture use
// the (empty) uvm_hal_ce_memcopy_patch_src_stub implementation
@ -296,6 +299,7 @@ void uvm_hal_ampere_ce_memcopy_patch_src_c6b5(uvm_push_t *push, uvm_gpu_address_
// UVM_PUSH_FLAG_NEXT_CE_* flags with uvm_push_set_flag().
typedef void (*uvm_hal_memcopy_t)(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size);
void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size);
void uvm_hal_volta_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src, size_t size);
// Simple wrapper for uvm_hal_memcopy_t with both addresses being virtual
typedef void (*uvm_hal_memcopy_v_to_v_t)(uvm_push_t *push, NvU64 dst, NvU64 src, size_t size);
@ -303,11 +307,12 @@ void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst, NvU64 src, s
// Memset validation.
// The validation happens at the start of the memset (uvm_hal_memset_*_t)
// execution. Use uvm_hal_ce_memset_is_valid_stub to skip the validation for
// a given architecture.
typedef bool (*uvm_hal_ce_memset_validate)(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
bool uvm_hal_ce_memset_validate_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
bool uvm_hal_ampere_ce_memset_validate_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
typedef bool (*uvm_hal_ce_memset_is_valid)(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
bool uvm_hal_ce_memset_is_valid_stub(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
bool uvm_hal_ampere_ce_memset_is_valid_c6b5(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size);
// Memset size bytes at dst to a given N-byte input value.
//
@ -329,6 +334,10 @@ void uvm_hal_maxwell_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32
void uvm_hal_maxwell_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size);
void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value, size_t size);
void uvm_hal_volta_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size);
void uvm_hal_volta_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size);
void uvm_hal_volta_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size);
void uvm_hal_hopper_ce_memset_1(uvm_push_t *push, uvm_gpu_address_t dst, NvU8 value, size_t size);
void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 value, size_t size);
void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size);
@ -342,6 +351,7 @@ void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 v
typedef void (*uvm_hal_semaphore_reduction_inc_t)(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_maxwell_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_pascal_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_volta_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload);
// Initialize GPU architecture dependent properties
@ -579,8 +589,8 @@ void uvm_hal_turing_clear_access_counter_notifications(uvm_parent_gpu_t *parent_
struct uvm_host_hal_struct
{
uvm_hal_init_t init;
uvm_hal_host_method_validate method_validate;
uvm_hal_host_sw_method_validate sw_method_validate;
uvm_hal_host_method_is_valid method_is_valid;
uvm_hal_host_sw_method_is_valid sw_method_is_valid;
uvm_hal_wait_for_idle_t wait_for_idle;
uvm_hal_membar_sys_t membar_sys;
uvm_hal_membar_gpu_t membar_gpu;
@ -612,18 +622,18 @@ struct uvm_host_hal_struct
struct uvm_ce_hal_struct
{
uvm_hal_init_t init;
uvm_hal_ce_method_validate method_validate;
uvm_hal_ce_method_is_valid method_is_valid;
uvm_hal_semaphore_release_t semaphore_release;
uvm_hal_semaphore_timestamp_t semaphore_timestamp;
uvm_hal_ce_offset_out_t offset_out;
uvm_hal_ce_offset_in_out_t offset_in_out;
uvm_hal_ce_phys_mode_t phys_mode;
uvm_hal_ce_plc_mode_t plc_mode;
uvm_hal_ce_memcopy_validate memcopy_validate;
uvm_hal_ce_memcopy_is_valid memcopy_is_valid;
uvm_hal_ce_memcopy_patch_src memcopy_patch_src;
uvm_hal_memcopy_t memcopy;
uvm_hal_memcopy_v_to_v_t memcopy_v_to_v;
uvm_hal_ce_memset_validate memset_validate;
uvm_hal_ce_memset_is_valid memset_is_valid;
uvm_hal_memset_1_t memset_1;
uvm_hal_memset_4_t memset_4;
uvm_hal_memset_8_t memset_8;
@ -726,4 +736,20 @@ static void uvm_hal_wfi_membar(uvm_push_t *push, uvm_membar_t membar)
// appropriate Host membar(s) after a TLB invalidate.
void uvm_hal_tlb_invalidate_membar(uvm_push_t *push, uvm_membar_t membar);
// Internal helper used by architectures/engines that don't support a FLUSH
// operation with a FLUSH_TYPE on the semaphore release method, e.g., pre-Volta
// CE. It inspects and clears the MEMBAR push flags, issues a Host WFI +
// membar.gpu for MEMBAR_GPU, or returns true to indicate that the caller
// should use the engine's FLUSH for MEMBAR_SYS.
bool uvm_hal_membar_before_semaphore(uvm_push_t *push);
// Determine the appropriate membar to use on TLB invalidates for GPU PTE
// permissions downgrades.
//
// gpu is the GPU on which the TLB invalidate is happening.
//
// is_local_vidmem indicates whether all mappings being invalidated pointed to
// the local GPU's memory.
uvm_membar_t uvm_hal_downgrade_membar_type(uvm_gpu_t *gpu, bool is_local_vidmem);
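//
// As an illustration only (not the actual implementation, which may also
// factor in coherence/NUMA configuration), the selection matches how the
// callers in this change use the result:
//
//     return is_local_vidmem ? UVM_MEMBAR_GPU : UVM_MEMBAR_SYS;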
#endif // __UVM_HAL_H__

File diff suppressed because it is too large

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2022 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -37,19 +37,10 @@ typedef struct
// This stores pointers to uvm_va_block_t for HMM blocks.
uvm_range_tree_t blocks;
uvm_mutex_t blocks_lock;
// TODO: Bug 3351822: [UVM-HMM] Remove temporary testing changes.
// This flag is set true by default for each va_space so most processes
// don't see partially implemented UVM-HMM behavior but can be enabled by
// test code for a given va_space so the test process can do some interim
// testing. It needs to be a separate flag instead of modifying
// uvm_disable_hmm or va_space->flags since those are user inputs and are
// visible/checked by test code.
// Remove this when UVM-HMM is fully integrated into chips_a.
bool disable;
} uvm_hmm_va_space_t;
#if UVM_IS_CONFIG_HMM()
// Tells whether HMM is enabled for the given va_space.
// If it is not enabled, all of the functions below are no-ops.
bool uvm_hmm_is_enabled(uvm_va_space_t *va_space);
@ -62,17 +53,25 @@ typedef struct
// and the va_space lock must be held in write mode.
NV_STATUS uvm_hmm_va_space_initialize(uvm_va_space_t *va_space);
// Initialize HMM for the given the va_space for testing.
// Bug 1750144: UVM: Add HMM (Heterogeneous Memory Management) support to
// the UVM driver. Remove this when enough HMM functionality is implemented.
// Locking: the va_space->va_space_mm.mm mmap_lock must be write locked
// and the va_space lock must be held in write mode.
NV_STATUS uvm_hmm_va_space_initialize_test(uvm_va_space_t *va_space);
// Destroy any HMM state for the given va_space.
// Locking: va_space lock must be held in write mode.
void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space);
// Unmap all page tables in this VA space which map memory owned by this
// GPU. Any memory still resident on this GPU will be evicted to system
// memory. Note that 'mm' can be NULL (e.g., when closing the UVM file)
// in which case any GPU memory is simply freed.
// Locking: if mm is not NULL, the caller must hold mm->mmap_lock in at
// least read mode and the va_space lock must be held in write mode.
void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm);
// Destroy the VA space's mappings on the GPU, if it has any.
// Locking: if mm is not NULL, the caller must hold mm->mmap_lock in at
// least read mode and the va_space lock must be held in write mode.
void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm);
// Find an existing HMM va_block.
// This function can be called without having retained and locked the mm,
// but in that case, the only allowed operations on the returned block are
@ -85,6 +84,25 @@ typedef struct
NvU64 addr,
uvm_va_block_t **va_block_ptr);
// Find an existing HMM va_block when processing a CPU fault and try to
// isolate and lock the faulting page.
// Return NV_ERR_INVALID_ADDRESS if the block is not found,
// NV_ERR_BUSY_RETRY if the page could not be locked, and
// NV_OK if the block is found and the page is locked. Also,
// uvm_hmm_cpu_fault_finish() must be called if NV_OK is returned.
// Locking: This must be called with the vma->vm_mm locked and the va_space
// read locked.
NV_STATUS uvm_hmm_va_block_cpu_find(uvm_va_space_t *va_space,
uvm_service_block_context_t *service_context,
struct vm_fault *vmf,
uvm_va_block_t **va_block_ptr);
// This must be called after uvm_va_block_cpu_fault() if
// uvm_hmm_va_block_cpu_find() returns NV_OK.
// Locking: This must be called with the vma->vm_mm locked and the va_space
// read locked.
void uvm_hmm_cpu_fault_finish(uvm_service_block_context_t *service_context);
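//
// A minimal usage sketch of the pairing described above (locking and error
// handling elided; the uvm_va_block_cpu_fault() call and its arguments are
// assumed from the CPU fault servicing path rather than defined here):
//
//     status = uvm_hmm_va_block_cpu_find(va_space, service_context, vmf, &va_block);
//     if (status == NV_OK) {
//         status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);
//         uvm_hmm_cpu_fault_finish(service_context);
//     }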
// Find or create a new HMM va_block.
//
// Return NV_ERR_INVALID_ADDRESS if there is no VMA associated with the
@ -114,7 +132,7 @@ typedef struct
// Locking: This function must be called with the va_block lock held and if
// va_block is a HMM block, va_block_context->mm must be retained and
// locked for at least read.
bool uvm_hmm_va_block_context_vma_is_valid(uvm_va_block_t *va_block,
bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region);
@ -168,7 +186,8 @@ typedef struct
NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
uvm_processor_id_t preferred_location,
NvU64 base,
NvU64 last_address);
NvU64 last_address,
uvm_tracker_t *out_tracker);
// Set the accessed by policy for the given range. This also tries to
// map the range. Note that 'last_address' is inclusive.
@ -178,7 +197,17 @@ typedef struct
uvm_processor_id_t processor_id,
bool set_bit,
NvU64 base,
NvU64 last_address);
NvU64 last_address,
uvm_tracker_t *out_tracker);
// Deferred work item to reestablish accessed by mappings after eviction. On
// GPUs with access counters enabled, the evicted GPU will also get remote
// mappings.
// Locking: the va_space->va_space_mm.mm mmap_lock must be locked
// and the va_space lock must be held in at least read mode.
void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context);
// Set the read duplication policy for the given range.
// Note that 'last_address' is inclusive.
@ -248,7 +277,104 @@ typedef struct
uvm_va_block_context_t *va_block_context,
NvU64 addr);
NV_STATUS uvm_test_hmm_init(UVM_TEST_HMM_INIT_PARAMS *params, struct file *filp);
// This is called to service a GPU fault.
// Locking: the va_space->va_space_mm.mm mmap_lock must be locked,
// the va_space read lock must be held, and the va_block lock held.
NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_processor_id_t new_residency,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context);
// This is called to migrate a region within a HMM va_block.
// va_block_context must not be NULL and va_block_context->policy and
// va_block_context->hmm.vma must be valid.
// Locking: the va_block_context->mm must be retained, mmap_lock must be
// locked, and the va_block lock held.
NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region,
uvm_make_resident_cause_t cause);
// This is called to migrate an address range of HMM allocations via
// UvmMigrate().
//
// va_block_context must not be NULL. The caller is not required to set
// va_block_context->policy or va_block_context->hmm.vma.
//
// Locking: the va_space->va_space_mm.mm mmap_lock must be locked and
// the va_space read lock must be held.
NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
uvm_va_block_context_t *va_block_context,
NvU64 base,
NvU64 length,
uvm_processor_id_t dest_id,
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker);
// This sets the va_block_context->hmm.src_pfns[] to the ZONE_DEVICE private
// PFN for the GPU chunk memory.
NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_gpu_chunk_t *gpu_chunk,
uvm_va_block_region_t chunk_region);
// Migrate pages to system memory for the given page mask.
// Note that the mmap lock is not held and there is no MM retained.
// This must be called after uvm_hmm_va_block_evict_chunk_prep() has
// initialized va_block_context->hmm.src_pfns[] for the source GPU physical
// PFNs being migrated. Note that the input mask 'pages_to_evict' can be
// modified. If any of the evicted pages has the accessed by policy set,
// then record that by setting out_accessed_by_set.
// Locking: the va_block lock must be locked.
NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region,
bool *out_accessed_by_set);
// Migrate pages from the given GPU to system memory for the given page
// mask and region. va_block_context must not be NULL.
// Note that the mmap lock is not held and there is no MM retained.
// Locking: the va_block lock must be locked.
NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region);
// Migrate a GPU chunk to system memory. This is called to remove CPU page
// table references to device private struct pages for the given GPU after
// all other references in va_blocks have been released and the GPU is
// in the process of being removed/torn down. Note that there is no mm,
// VMA, va_block or any user channel activity on this GPU.
NV_STATUS uvm_hmm_pmm_gpu_evict_chunk(uvm_gpu_t *gpu,
uvm_gpu_chunk_t *gpu_chunk);
// This returns what would be the intersection of va_block start/end and
// VMA start/end-1 for the given 'lookup_address' if
// uvm_hmm_va_block_find_create() was called.
// Locking: the caller must hold mm->mmap_lock in at least read mode and
// the va_space lock must be held in at least read mode.
NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,
NvU64 *startp,
NvU64 *endp,
UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params);
// This updates the HMM va_block CPU residency information for a single
// page at 'lookup_address' by calling hmm_range_fault(). If 'populate' is
// true, the CPU page will be faulted in read/write or read-only
// (depending on the permission of the underlying VMA at lookup_address).
// Locking: the caller must hold mm->mmap_lock in at least read mode and
// the va_space lock must be held in at least read mode.
NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
struct mm_struct *mm,
NvU64 lookup_address,
bool populate);
NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params,
struct file *filp);
@ -285,12 +411,17 @@ typedef struct
return NV_OK;
}
static NV_STATUS uvm_hmm_va_space_initialize_test(uvm_va_space_t *va_space)
static void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
{
return NV_WARN_NOTHING_TO_DO;
}
static void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
static void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
{
}
static void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm)
{
}
@ -301,6 +432,18 @@ typedef struct
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_cpu_find(uvm_va_space_t *va_space,
uvm_service_block_context_t *service_context,
struct vm_fault *vmf,
uvm_va_block_t **va_block_ptr)
{
return NV_ERR_INVALID_ADDRESS;
}
static void uvm_hmm_cpu_fault_finish(uvm_service_block_context_t *service_context)
{
}
static NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_block_context_t *va_block_context,
@ -314,7 +457,7 @@ typedef struct
return NV_OK;
}
static bool uvm_hmm_va_block_context_vma_is_valid(uvm_va_block_t *va_block,
static bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
@ -349,7 +492,8 @@ typedef struct
static NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
uvm_processor_id_t preferred_location,
NvU64 base,
NvU64 last_address)
NvU64 last_address,
uvm_tracker_t *out_tracker)
{
return NV_ERR_INVALID_ADDRESS;
}
@ -358,11 +502,18 @@ typedef struct
uvm_processor_id_t processor_id,
bool set_bit,
NvU64 base,
NvU64 last_address)
NvU64 last_address,
uvm_tracker_t *out_tracker)
{
return NV_ERR_INVALID_ADDRESS;
}
static void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context)
{
}
static NV_STATUS uvm_hmm_set_read_duplication(uvm_va_space_t *va_space,
uvm_read_duplication_policy_t new_policy,
NvU64 base,
@ -405,9 +556,84 @@ typedef struct
return UVM_PROT_NONE;
}
static NV_STATUS uvm_test_hmm_init(UVM_TEST_HMM_INIT_PARAMS *params, struct file *filp)
static NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_processor_id_t new_residency,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
return NV_WARN_NOTHING_TO_DO;
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region,
uvm_make_resident_cause_t cause)
{
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
uvm_va_block_context_t *va_block_context,
NvU64 base,
NvU64 length,
uvm_processor_id_t dest_id,
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker)
{
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_gpu_chunk_t *gpu_chunk,
uvm_va_block_region_t chunk_region)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region,
bool *out_accessed_by_set)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_pmm_gpu_evict_chunk(uvm_gpu_t *gpu,
uvm_gpu_chunk_t *gpu_chunk)
{
return NV_OK;
}
static NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,
NvU64 *startp,
NvU64 *endp,
UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
{
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
struct mm_struct *mm,
NvU64 lookup_address,
bool populate)
{
return NV_ERR_INVALID_ADDRESS;
}
static NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params,

View File

@ -1,90 +0,0 @@
/*******************************************************************************
Copyright (c) 2021-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_test.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_hmm.h"
NV_STATUS uvm_test_hmm_sanity(UVM_TEST_HMM_SANITY_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
struct mm_struct *mm;
uvm_va_block_t *hmm_block = NULL;
NV_STATUS status;
mm = uvm_va_space_mm_or_current_retain(va_space);
if (!mm)
return NV_WARN_NOTHING_TO_DO;
uvm_down_write_mmap_lock(mm);
uvm_va_space_down_write(va_space);
// TODO: Bug 3351822: [UVM-HMM] Remove temporary testing changes.
// By default, HMM is enabled system wide but disabled per va_space.
// This will initialize the va_space for HMM.
status = uvm_hmm_va_space_initialize_test(va_space);
if (status != NV_OK)
goto out;
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(mm);
uvm_down_read_mmap_lock(mm);
uvm_va_space_down_read(va_space);
// Try to create an HMM va_block to virtual address zero (NULL).
// It should fail. There should be no VMA but a va_block for range
// [0x0 0x1fffff] is possible.
status = uvm_hmm_va_block_find_create(va_space, 0UL, NULL, &hmm_block);
TEST_CHECK_GOTO(status == NV_ERR_INVALID_ADDRESS, done);
// Try to create an HMM va_block which overlaps a managed block.
// It should fail.
status = uvm_hmm_va_block_find_create(va_space, params->uvm_address, NULL, &hmm_block);
TEST_CHECK_GOTO(status == NV_ERR_INVALID_ADDRESS, done);
// Try to create an HMM va_block; it should succeed.
status = uvm_hmm_va_block_find_create(va_space, params->hmm_address, NULL, &hmm_block);
TEST_CHECK_GOTO(status == NV_OK, done);
// Try to find an existing HMM va_block; it should succeed.
status = uvm_hmm_va_block_find(va_space, params->hmm_address, &hmm_block);
TEST_CHECK_GOTO(status == NV_OK, done);
done:
uvm_va_space_up_read(va_space);
uvm_up_read_mmap_lock(mm);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;
out:
uvm_va_space_up_write(va_space);
uvm_up_write_mmap_lock(mm);
uvm_va_space_mm_or_current_release(va_space, mm);
return status;
}

View File

@ -54,6 +54,7 @@ void uvm_hal_hopper_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
parent_gpu->uvm_mem_va_base = parent_gpu->rm_va_size + 384ull * 1024 * 1024 * 1024 * 1024;
parent_gpu->uvm_mem_va_size = UVM_MEM_VA_SIZE;
parent_gpu->peer_copy_mode = g_uvm_global.peer_copy_mode;
// All GR context buffers may be mapped to 57b wide VAs. All "compute" units

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2020 NVIDIA Corporation
Copyright (c) 2020-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -23,25 +23,9 @@
#include "uvm_hal.h"
#include "uvm_push.h"
#include "uvm_mem.h"
#include "clc8b5.h"
static void hopper_membar_after_transfer(uvm_push_t *push)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
return;
// TODO: [UVM-Volta] Remove Host WFI + Membar WAR for CE flush-only bug
// http://nvbugs/1734761
gpu->parent->host_hal->wait_for_idle(push);
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
gpu->parent->host_hal->membar_gpu(push);
else
gpu->parent->host_hal->membar_sys(push);
}
static NvU32 ce_aperture(uvm_aperture_t aperture)
{
BUILD_BUG_ON(HWCONST(C8B5, SET_SRC_PHYS_MODE, TARGET, LOCAL_FB) !=
@ -78,45 +62,32 @@ void uvm_hal_hopper_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 of
OFFSET_OUT_LOWER, HWVALUE(C8B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}
// Perform an appropriate membar before a semaphore operation. Returns whether
// the semaphore operation should include a flush.
static bool hopper_membar_before_semaphore(uvm_push_t *push)
// Return the LAUNCH_DMA flush enablement and, when a flush is used, the flush type.
static NvU32 hopper_get_flush_value(uvm_push_t *push)
{
uvm_gpu_t *gpu;
NvU32 flush_value;
uvm_membar_t membar = uvm_push_get_and_reset_membar_flag(push);
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) {
if (membar == UVM_MEMBAR_NONE) {
// No MEMBAR requested, don't use a flush.
return false;
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
}
else {
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
if (membar == UVM_MEMBAR_GPU)
flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, GL);
else
flush_value |= HWCONST(C8B5, LAUNCH_DMA, FLUSH_TYPE, SYS);
}
if (!uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) {
// By default do a MEMBAR SYS and for that we can just use flush on the
// semaphore operation.
return true;
}
// TODO: Bug 1734761: Remove the HOST WFI+membar WAR, i.e, perform the CE
// flush when MEMBAR GPU is requested.
gpu = uvm_push_get_gpu(push);
gpu->parent->host_hal->wait_for_idle(push);
gpu->parent->host_hal->membar_gpu(push);
return false;
return flush_value;
}
void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
NvU32 flush_value;
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = hopper_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
else
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
@ -124,7 +95,7 @@ void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 p
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
NV_PUSH_1U(C8B5, LAUNCH_DMA, flush_value |
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
launch_dma_plc_mode);
@ -133,16 +104,7 @@ void uvm_hal_hopper_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 p
void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
NvU32 flush_value;
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = hopper_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
else
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
@ -150,7 +112,7 @@ void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, N
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
NV_PUSH_1U(C8B5, LAUNCH_DMA, flush_value |
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_ONE_WORD_SEMAPHORE) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_REDUCTION, INC) |
@ -162,16 +124,7 @@ void uvm_hal_hopper_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, N
void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
{
uvm_gpu_t *gpu;
NvU32 flush_value;
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = hopper_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
else
flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
NV_PUSH_3U(C8B5, SET_SEMAPHORE_A, HWVALUE(C8B5, SET_SEMAPHORE_A, UPPER, NvOffset_HI32(gpu_va)),
SET_SEMAPHORE_B, HWVALUE(C8B5, SET_SEMAPHORE_B, LOWER, NvOffset_LO32(gpu_va)),
@ -180,7 +133,7 @@ void uvm_hal_hopper_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
gpu = uvm_push_get_gpu(push);
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
NV_PUSH_1U(C8B5, LAUNCH_DMA, flush_value |
NV_PUSH_1U(C8B5, LAUNCH_DMA, hopper_get_flush_value(push) |
HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NONE) |
HWCONST(C8B5, LAUNCH_DMA, SEMAPHORE_TYPE, RELEASE_FOUR_WORD_SEMAPHORE) |
launch_dma_plc_mode);
@ -218,8 +171,9 @@ static void hopper_memset_common(uvm_push_t *push,
NvU32 launch_dma_plc_mode;
NvU32 launch_dma_remap_enable;
NvU32 launch_dma_scrub_enable;
NvU32 flush_value = HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE);
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_validate(push, dst, memset_element_size),
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
"Memset validation failed in channel %s, GPU %s",
push->channel->name,
uvm_gpu_name(gpu));
@ -252,6 +206,10 @@ static void hopper_memset_common(uvm_push_t *push,
do {
NvU32 memset_this_time = (NvU32)min(num_elements, max_single_memset);
// In the last operation, a flush/membar may be issued after the memset.
if (num_elements == memset_this_time)
flush_value = hopper_get_flush_value(push);
gpu->parent->ce_hal->offset_out(push, dst.address);
NV_PUSH_1U(C8B5, LINE_LENGTH_IN, memset_this_time);
@ -260,7 +218,7 @@ static void hopper_memset_common(uvm_push_t *push,
HWCONST(C8B5, LAUNCH_DMA, SRC_MEMORY_LAYOUT, PITCH) |
HWCONST(C8B5, LAUNCH_DMA, DST_MEMORY_LAYOUT, PITCH) |
HWCONST(C8B5, LAUNCH_DMA, MULTI_LINE_ENABLE, FALSE) |
HWCONST(C8B5, LAUNCH_DMA, FLUSH_ENABLE, FALSE) |
flush_value |
launch_dma_remap_enable |
launch_dma_scrub_enable |
launch_dma_dst_type |
@ -269,10 +227,8 @@ static void hopper_memset_common(uvm_push_t *push,
dst.address += memset_this_time * memset_element_size;
num_elements -= memset_this_time;
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
pipelined_value = HWCONST(C8B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
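// Each loop iteration targets a disjoint destination range, so subsequent
// LAUNCH_DMAs can presumably be pipelined behind the previous one instead
// of serializing on its completion.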
} while (num_elements > 0);
hopper_membar_after_transfer(push);
}
void uvm_hal_hopper_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64 value, size_t size)
@ -337,3 +293,16 @@ void uvm_hal_hopper_ce_memset_4(uvm_push_t *push, uvm_gpu_address_t dst, NvU32 v
hopper_memset_common(push, dst, size, 4);
}
bool uvm_hal_hopper_ce_memset_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, size_t element_size)
{
return true;
}
bool uvm_hal_hopper_ce_memcopy_is_valid(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu_address_t src)
{
return true;
}

View File

@ -108,7 +108,7 @@ static inline const struct cpumask *uvm_cpumask_of_node(int node)
#endif
}
#if defined(CONFIG_HMM_MIRROR) && defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MAKE_DEVICE_EXCLUSIVE_RANGE_PRESENT)
#if defined(CONFIG_HMM_MIRROR) && defined(CONFIG_DEVICE_PRIVATE) && defined(NV_MIGRATE_DEVICE_RANGE_PRESENT)
#define UVM_IS_CONFIG_HMM() 1
#else
#define UVM_IS_CONFIG_HMM() 0
@ -404,6 +404,7 @@ static inline NvU64 NV_GETTIME(void)
// 654672d4ba1a6001c365833be895f9477c4d5eab ("locking/atomics:
// Add _{acquire|release|relaxed}() variants of some atomic operations") in v4.3
// (2015-08-06).
// TODO: Bug 3849079: We always use this definition on newer kernels.
#ifndef atomic_read_acquire
#define atomic_read_acquire(p) smp_load_acquire(&(p)->counter)
#endif
@ -412,6 +413,24 @@ static inline NvU64 NV_GETTIME(void)
#define atomic_set_release(p, v) smp_store_release(&(p)->counter, v)
#endif
// atomic_long_read_acquire and atomic_long_set_release were added in commit
// b5d47ef9ea5c5fe31d7eabeb79f697629bd9e2cb ("locking/atomics: Switch to
// generated atomic-long") in v5.1 (2019-05-05).
// TODO: Bug 3849079: We always use these definitions on newer kernels.
#define atomic_long_read_acquire uvm_atomic_long_read_acquire
static inline long uvm_atomic_long_read_acquire(atomic_long_t *p)
{
long val = atomic_long_read(p);
smp_mb();
return val;
}
#define atomic_long_set_release uvm_atomic_long_set_release
static inline void uvm_atomic_long_set_release(atomic_long_t *p, long v)
{
smp_mb();
atomic_long_set(p, v);
}
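// Note: both fallbacks above use a full smp_mb(), which is stronger than
// acquire/release ordering strictly requires; presumably that is an
// acceptable cost for the older kernels that take this path.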
// Added in 3.11
#ifndef PAGE_ALIGNED

View File

@ -75,26 +75,29 @@
//
// Protects:
// - gpu->parent->isr.replayable_faults.service_lock:
// Changes to the state of a GPU as it transitions from top-half to bottom-half
// interrupt handler for replayable faults. This lock is acquired for that GPU,
// in the ISR top-half. Then a bottom-half is scheduled (to run in a workqueue).
// Then the bottom-half releases the lock when that GPU's processing appears to
// be done.
// Changes to the state of a GPU as it transitions from top-half to
// bottom-half interrupt handler for replayable faults. This lock is
// acquired for that GPU, in the ISR top-half. Then a bottom-half is
// scheduled (to run in a workqueue). Then the bottom-half releases the
// lock when that GPU's processing appears to be done.
//
// - gpu->parent->isr.non_replayable_faults.service_lock:
// Changes to the state of a GPU in the bottom-half for non-replayable faults.
// Non-replayable faults are handed-off from RM instead of directly from the GPU
// hardware. This means that we do not keep receiving interrupts after RM pops
// out the faults from the HW buffer. In order not to miss fault notifications,
// we will always schedule a bottom-half for non-replayable faults if there are
// faults ready to be consumed in the buffer, even if there already is some
// bottom-half running or scheduled. This lock serializes all scheduled bottom
// halves per GPU which service non-replayable faults.
// Changes to the state of a GPU in the bottom-half for non-replayable
// faults. Non-replayable faults are handed-off from RM instead of
// directly from the GPU hardware. This means that we do not keep
// receiving interrupts after RM pops out the faults from the HW buffer.
// In order not to miss fault notifications, we will always schedule a
// bottom-half for non-replayable faults if there are faults ready to be
// consumed in the buffer, even if there already is some bottom-half
// running or scheduled. This lock serializes all scheduled bottom halves
// per GPU which service non-replayable faults.
//
// - gpu->parent->isr.access_counters.service_lock:
// Changes to the state of a GPU as it transitions from top-half to bottom-half
// interrupt handler for access counter notifications. This lock is acquired for
// that GPU, in the ISR top-half. Then a bottom-half is scheduled (to run in a
// workqueue). Then the bottom-half releases the lock when that GPU's processing
// appears to be done.
// Changes to the state of a GPU as it transitions from top-half to
// bottom-half interrupt handler for access counter notifications. This
// lock is acquired for that GPU, in the ISR top-half. Then a bottom-half
// is scheduled (to run in a workqueue). Then the bottom-half releases
// the lock when that GPU's processing appears to be done.
//
// - mmap_lock (mmap_sem in kernels < 5.8)
// Order: UVM_LOCK_ORDER_MMAP_LOCK
@ -339,7 +342,9 @@
// Order: UVM_LOCK_ORDER_CHANNEL
// Spinlock (uvm_spinlock_t) or exclusive lock (mutex)
//
// Lock protecting the state of all the channels in a channel pool.
// Lock protecting the state of all the channels in a channel pool. The
// channel pool lock documentation contains the guidelines about which lock
// type (mutex or spinlock) to use.
//
// - Tools global VA space list lock (g_tools_va_space_list_lock)
// Order: UVM_LOCK_ORDER_TOOLS_VA_SPACE_LIST

View File

@ -106,6 +106,9 @@ static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
pte_buffer->mapping_info.formatType = map_rm_params->format_type;
pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
pte_buffer->mapping_info.mappingPageSize = page_size;
pte_buffer->page_size = page_size;
pte_buffer->pte_size = uvm_mmu_pte_size(tree, page_size);
num_all_ptes = uvm_div_pow2_64(length, page_size);
@ -341,9 +344,8 @@ static NV_STATUS map_rm_pt_range(uvm_page_tree_t *tree,
static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_gpu_map_t *ext_gpu_map)
{
if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
if (va_range->channel.aperture == UVM_APERTURE_VID)
return UVM_MEMBAR_GPU;
return UVM_MEMBAR_SYS;
return uvm_hal_downgrade_membar_type(va_range->channel.gpu_va_space->gpu,
va_range->channel.aperture == UVM_APERTURE_VID);
}
// If there is no mem_handle, this is a sparse mapping.
@ -353,9 +355,8 @@ static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_
if (!ext_gpu_map->mem_handle)
return UVM_MEMBAR_GPU;
if (ext_gpu_map->is_sysmem || ext_gpu_map->gpu != ext_gpu_map->owning_gpu)
return UVM_MEMBAR_SYS;
return UVM_MEMBAR_GPU;
return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
!ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}
NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
@ -398,9 +399,7 @@ NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
page_tree = &gpu_va_space->page_tables;
// Verify that the GPU VA space supports this page size
if ((mem_info->pageSize & page_tree->hal->page_sizes()) == 0)
return NV_ERR_INVALID_ADDRESS;
UVM_ASSERT(uvm_mmu_page_size_supported(page_tree, mem_info->pageSize));
if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
// We should be never called with ext_gpu_map == NULL
@ -414,13 +413,12 @@ NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
pt_range_vec = &va_range->channel.pt_range_vec;
}
if (!IS_ALIGNED(map_offset, mem_info->pageSize) ||
map_offset + uvm_range_tree_node_size(node) > mem_info->size)
if (map_offset + uvm_range_tree_node_size(node) > mem_info->size)
return NV_ERR_INVALID_OFFSET;
// Consolidate input checks for API-level callers
if (!IS_ALIGNED(node->start, mem_info->pageSize) || !IS_ALIGNED(node->end + 1, mem_info->pageSize))
return NV_ERR_INVALID_ADDRESS;
UVM_ASSERT(IS_ALIGNED(node->start, mem_info->pageSize) &&
IS_ALIGNED(node->end + 1, mem_info->pageSize) &&
IS_ALIGNED(map_offset, mem_info->pageSize));
status = uvm_pte_buffer_init(va_range,
mapping_gpu,
@ -845,6 +843,10 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
uvm_ext_gpu_map_t *ext_gpu_map = NULL;
uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
UvmGpuMemoryInfo mem_info;
uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
NvU32 mapping_page_size;
NvU64 alignments;
NvU32 smallest_alignment;
NV_STATUS status;
uvm_assert_rwsem_locked_read(&va_space->lock);
@ -915,12 +917,25 @@ static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
if (status != NV_OK)
goto error;
status = uvm_va_range_map_rm_allocation(va_range,
mapping_gpu,
&mem_info,
map_rm_params,
ext_gpu_map,
out_tracker);
// Determine the proper mapping page size.
// This is the largest supported page size that does not exceed the smallest
// alignment among the base VA address, the length, the map offset, and the
// allocation's page size.
alignments = mem_info.pageSize | base | length | map_rm_params->map_offset;
smallest_alignment = alignments & ~(alignments - 1);
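// (Illustration with hypothetical values: pageSize 0x200000, base 0x10000000,
// length 0x20000 and map_offset 0 give alignments = 0x10220000, whose lowest
// set bit -- and therefore smallest_alignment -- is 0x20000.)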
// Check that alignment bits did not get truncated.
UVM_ASSERT(smallest_alignment);
mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables, smallest_alignment);
if (!mapping_page_size) {
status = NV_ERR_INVALID_ADDRESS;
goto error;
}
mem_info.pageSize = mapping_page_size;
status = uvm_va_range_map_rm_allocation(va_range, mapping_gpu, &mem_info, map_rm_params, ext_gpu_map, out_tracker);
if (status != NV_OK)
goto error;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2021 NVIDIA Corporation
Copyright (c) 2021-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -50,38 +50,12 @@ void uvm_hal_maxwell_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 o
OFFSET_OUT_LOWER, HWVALUE(B0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}
// Perform an appropriate membar before a semaphore operation. Returns whether
// the semaphore operation should include a flush.
static bool maxwell_membar_before_semaphore(uvm_push_t *push)
{
uvm_gpu_t *gpu;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) {
// No MEMBAR requested, don't use a flush.
return false;
}
if (!uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) {
// By default do a MEMBAR SYS and for that we can just use flush on the
// semaphore operation.
return true;
}
// MEMBAR GPU requested, do it on the HOST and skip the CE flush as CE
// doesn't have this capability.
gpu = uvm_push_get_gpu(push);
gpu->parent->host_hal->wait_for_idle(push);
gpu->parent->host_hal->membar_gpu(push);
return false;
}
void uvm_hal_maxwell_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
NvU32 flush_value;
bool use_flush;
use_flush = maxwell_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
@ -102,7 +76,7 @@ void uvm_hal_maxwell_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va,
NvU32 flush_value;
bool use_flush;
use_flush = maxwell_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
@ -126,7 +100,7 @@ void uvm_hal_maxwell_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
NvU32 flush_value;
bool use_flush;
use_flush = maxwell_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(B0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
@ -221,10 +195,9 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
NvU32 pipelined_value;
NvU32 launch_dma_src_dst_type;
NvU32 launch_dma_plc_mode;
bool first_operation = true;
UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_validate(push, dst, src),
"Memcopy validation failed in channel %s, GPU %s",
UVM_ASSERT_MSG(gpu->parent->ce_hal->memcopy_is_valid(push, dst, src),
"Memcopy validation failed in channel %s, GPU %s.\n",
push->channel->name,
uvm_gpu_name(gpu));
@ -233,14 +206,14 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
launch_dma_src_dst_type = gpu->parent->ce_hal->phys_mode(push, dst, src);
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
do {
NvU32 copy_this_time = (NvU32)min(size, max_single_copy_size);
if (first_operation && uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
else
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
do {
NvU32 copy_this_time = (NvU32)min(size, max_single_copy_size);
gpu->parent->ce_hal->offset_in_out(push, src.address, dst.address);
NV_PUSH_1U(B0B5, LINE_LENGTH_IN, copy_this_time);
@ -255,10 +228,10 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
launch_dma_plc_mode |
pipelined_value);
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
dst.address += copy_this_time;
src.address += copy_this_time;
size -= copy_this_time;
first_operation = false;
} while (size > 0);
maxwell_membar_after_transfer(push);
@ -266,11 +239,14 @@ void uvm_hal_maxwell_ce_memcopy(uvm_push_t *push, uvm_gpu_address_t dst, uvm_gpu
void uvm_hal_maxwell_ce_memcopy_v_to_v(uvm_push_t *push, NvU64 dst_va, NvU64 src_va, size_t size)
{
uvm_hal_maxwell_ce_memcopy(push, uvm_gpu_address_virtual(dst_va), uvm_gpu_address_virtual(src_va), size);
uvm_push_get_gpu(push)->parent->ce_hal->memcopy(push,
uvm_gpu_address_virtual(dst_va),
uvm_gpu_address_virtual(src_va),
size);
}
// Push SET_DST_PHYS mode if needed and return LAUNCH_DMA_DST_TYPE flags
static NvU32 memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
static NvU32 maxwell_memset_push_phys_mode(uvm_push_t *push, uvm_gpu_address_t dst)
{
if (dst.is_virtual)
return HWCONST(B0B5, LAUNCH_DMA, DST_TYPE, VIRTUAL);
@ -290,12 +266,12 @@ static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size,
NvU32 launch_dma_dst_type;
NvU32 launch_dma_plc_mode;
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_validate(push, dst, memset_element_size),
"Memset validation failed in channel %s, GPU %s",
UVM_ASSERT_MSG(gpu->parent->ce_hal->memset_is_valid(push, dst, memset_element_size),
"Memset validation failed in channel %s, GPU %s.\n",
push->channel->name,
uvm_gpu_name(gpu));
launch_dma_dst_type = memset_push_phys_mode(push, dst);
launch_dma_dst_type = maxwell_memset_push_phys_mode(push, dst);
launch_dma_plc_mode = gpu->parent->ce_hal->plc_mode();
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED))
@ -322,7 +298,7 @@ static void memset_common(uvm_push_t *push, uvm_gpu_address_t dst, size_t size,
dst.address += memset_this_time * memset_element_size;
size -= memset_this_time;
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, NON_PIPELINED);
pipelined_value = HWCONST(B0B5, LAUNCH_DMA, DATA_TRANSFER_TYPE, PIPELINED);
} while (size > 0);
maxwell_membar_after_transfer(push);
@ -373,5 +349,6 @@ void uvm_hal_maxwell_ce_memset_8(uvm_push_t *push, uvm_gpu_address_t dst, NvU64
void uvm_hal_maxwell_ce_memset_v_4(uvm_push_t *push, NvU64 dst_va, NvU32 value, size_t size)
{
uvm_hal_maxwell_ce_memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);
uvm_push_get_gpu(push)->parent->ce_hal->memset_4(push, uvm_gpu_address_virtual(dst_va), value, size);
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2021 NVIDIA Corporation
Copyright (c) 2016-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -455,7 +455,7 @@ static gfp_t sysmem_allocation_gfp_flags(int order, bool zero)
//
// In case of failure, the caller is required to handle cleanup by calling
// uvm_mem_free
static NV_STATUS mem_alloc_sysmem_dma_chunks(uvm_mem_t *mem, struct mm_struct *mm, gfp_t gfp_flags)
static NV_STATUS mem_alloc_sysmem_dma_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
{
size_t i;
NV_STATUS status;
@ -500,7 +500,7 @@ error:
// In case of failure, the caller is required to handle cleanup by calling
// uvm_mem_free
static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, struct mm_struct *mm, gfp_t gfp_flags)
static NV_STATUS mem_alloc_sysmem_chunks(uvm_mem_t *mem, gfp_t gfp_flags)
{
size_t i;
int order;
@ -573,9 +573,9 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
uvm_memcg_context_start(&memcg_context, mm);
if (uvm_mem_is_sysmem_dma(mem))
status = mem_alloc_sysmem_dma_chunks(mem, mm, gfp_flags);
status = mem_alloc_sysmem_dma_chunks(mem, gfp_flags);
else
status = mem_alloc_sysmem_chunks(mem, mm, gfp_flags);
status = mem_alloc_sysmem_chunks(mem, gfp_flags);
uvm_memcg_context_end(&memcg_context);
return status;
@ -584,14 +584,6 @@ static NV_STATUS mem_alloc_chunks(uvm_mem_t *mem, struct mm_struct *mm, bool zer
return mem_alloc_vidmem_chunks(mem, zero, is_protected);
}
static const char *mem_physical_source(uvm_mem_t *mem)
{
if (uvm_mem_is_vidmem(mem))
return uvm_gpu_name(mem->backing_gpu);
return "CPU";
}
NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_global_processor_mask_t *mask)
{
uvm_gpu_t *gpu;
@ -617,6 +609,7 @@ NV_STATUS uvm_mem_map_kernel(uvm_mem_t *mem, const uvm_global_processor_mask_t *
NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_out)
{
NV_STATUS status;
NvU64 physical_size;
uvm_mem_t *mem = NULL;
bool is_protected = false;
@ -637,8 +630,8 @@ NV_STATUS uvm_mem_alloc(const uvm_mem_alloc_params_t *params, uvm_mem_t **mem_ou
UVM_ASSERT(mem->chunk_size > 0);
mem->physical_allocation_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
mem->chunks_count = mem->physical_allocation_size / mem->chunk_size;
physical_size = UVM_ALIGN_UP(mem->size, mem->chunk_size);
mem->chunks_count = physical_size / mem->chunk_size;
status = mem_alloc_chunks(mem, params->mm, params->zero, is_protected);
if (status != NV_OK)
@ -665,7 +658,7 @@ static NV_STATUS mem_init_user_mapping(uvm_mem_t *mem, uvm_va_space_t *user_va_s
}
UVM_ASSERT(IS_ALIGNED((NvU64)user_addr, mem->chunk_size));
UVM_ASSERT(mem->physical_allocation_size == mem->size);
UVM_ASSERT(uvm_mem_physical_size(mem) == mem->size);
mem->user = uvm_kvmalloc_zero(sizeof(*mem->user));
if (mem->user == NULL)
@ -691,7 +684,7 @@ static void mem_deinit_user_mapping(uvm_mem_t *mem)
static NvU64 reserved_gpu_va(uvm_mem_t *mem, uvm_gpu_t *gpu)
{
UVM_ASSERT(mem->kernel.range_alloc.aligned_start + mem->physical_allocation_size < gpu->parent->uvm_mem_va_size);
UVM_ASSERT(mem->kernel.range_alloc.aligned_start + uvm_mem_physical_size(mem) < gpu->parent->uvm_mem_va_size);
return gpu->parent->uvm_mem_va_base + mem->kernel.range_alloc.aligned_start;
}
@ -709,7 +702,7 @@ static struct page *mem_cpu_page(uvm_mem_t *mem, NvU64 offset)
static NV_STATUS mem_map_cpu_to_sysmem_kernel(uvm_mem_t *mem)
{
struct page **pages = mem->sysmem.pages;
size_t num_pages = mem->physical_allocation_size / PAGE_SIZE;
size_t num_pages = uvm_mem_physical_size(mem) / PAGE_SIZE;
pgprot_t prot = PAGE_KERNEL;
UVM_ASSERT(uvm_mem_is_sysmem(mem));
@ -743,7 +736,7 @@ static NV_STATUS mem_map_cpu_to_vidmem_kernel(uvm_mem_t *mem)
{
struct page **pages;
size_t num_chunk_pages = mem->chunk_size / PAGE_SIZE;
size_t num_pages = mem->physical_allocation_size / PAGE_SIZE;
size_t num_pages = uvm_mem_physical_size(mem) / PAGE_SIZE;
size_t page_index;
size_t chunk_index;
@ -798,7 +791,7 @@ static NV_STATUS mem_map_cpu_to_sysmem_user(uvm_mem_t *mem, struct vm_area_struc
// compound pages in order to be able to use vm_insert_page on them. This
// is not currently being exercised because the only allocations using this
// are semaphore pools (which typically use a single page).
for (offset = 0; offset < mem->physical_allocation_size; offset += PAGE_SIZE) {
for (offset = 0; offset < uvm_mem_physical_size(mem); offset += PAGE_SIZE) {
int ret = vm_insert_page(vma, (unsigned long)mem->user->addr + offset, mem_cpu_page(mem, offset));
if (ret) {
UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
@ -810,7 +803,7 @@ static NV_STATUS mem_map_cpu_to_sysmem_user(uvm_mem_t *mem, struct vm_area_struc
return NV_OK;
error:
unmap_mapping_range(&mem->user->va_space->mapping, (size_t)mem->user->addr, mem->physical_allocation_size, 1);
unmap_mapping_range(mem->user->va_space->mapping, (size_t)mem->user->addr, uvm_mem_physical_size(mem), 1);
return status;
}
@ -819,7 +812,7 @@ void uvm_mem_unmap_cpu_user(uvm_mem_t *mem)
if (!uvm_mem_mapped_on_cpu_user(mem))
return;
unmap_mapping_range(&mem->user->va_space->mapping, (size_t)mem->user->addr, mem->physical_allocation_size, 1);
unmap_mapping_range(mem->user->va_space->mapping, (size_t)mem->user->addr, uvm_mem_physical_size(mem), 1);
mem_clear_mapped_on_cpu_user(mem);
mem_deinit_user_mapping(mem);
}
@ -959,21 +952,17 @@ static uvm_gpu_phys_address_t mem_gpu_physical_sysmem(uvm_mem_t *mem, uvm_gpu_t
return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr + offset % mem->chunk_size);
}
static bool mem_check_range(uvm_mem_t *mem, NvU64 offset, NvU64 size)
bool uvm_mem_is_physically_contiguous(uvm_mem_t *mem, NvU64 offset, NvU64 size)
{
UVM_ASSERT(size != 0);
UVM_ASSERT_MSG(UVM_ALIGN_DOWN(offset, mem->chunk_size) == UVM_ALIGN_DOWN(offset + size - 1, mem->chunk_size),
"offset %llu size %llu page_size %u\n",
offset,
size,
mem->chunk_size);
UVM_ASSERT_MSG(offset / mem->chunk_size < mem->chunks_count, "offset %llu\n", offset);
return true;
UVM_ASSERT((offset + size) <= uvm_mem_physical_size(mem));
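// The range is physically contiguous exactly when it does not cross a chunk
// boundary, i.e. when its first and last bytes land in the same chunk.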
return UVM_ALIGN_DOWN(offset, mem->chunk_size) == UVM_ALIGN_DOWN(offset + size - 1, mem->chunk_size);
}
uvm_gpu_phys_address_t uvm_mem_gpu_physical(uvm_mem_t *mem, uvm_gpu_t *gpu, NvU64 offset, NvU64 size)
{
UVM_ASSERT(mem_check_range(mem, offset, size));
UVM_ASSERT(uvm_mem_is_physically_contiguous(mem, offset, size));
if (uvm_mem_is_vidmem(mem)) {
UVM_ASSERT(uvm_mem_is_local_vidmem(mem, gpu));
@ -990,7 +979,7 @@ uvm_gpu_address_t uvm_mem_gpu_address_copy(uvm_mem_t *mem, uvm_gpu_t *accessing_
size_t chunk_offset;
uvm_gpu_chunk_t *chunk;
UVM_ASSERT(mem_check_range(mem, offset, size));
UVM_ASSERT(uvm_mem_is_physically_contiguous(mem, offset, size));
if (uvm_mem_is_sysmem(mem) || uvm_mem_is_local_vidmem(mem, accessing_gpu))
return uvm_mem_gpu_address_physical(mem, accessing_gpu, offset, size);
@ -1024,13 +1013,8 @@ static NvU64 mem_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset,
static void mem_unmap_gpu(uvm_mem_t *mem, uvm_gpu_t *gpu, uvm_page_table_range_vec_t **range_vec)
{
NV_STATUS status;
uvm_membar_t tlb_membar = UVM_MEMBAR_SYS;
if (uvm_mem_is_local_vidmem(mem, gpu))
tlb_membar = UVM_MEMBAR_GPU;
status = uvm_page_table_range_vec_clear_ptes(*range_vec, tlb_membar);
uvm_membar_t tlb_membar = uvm_hal_downgrade_membar_type(gpu, uvm_mem_is_local_vidmem(mem, gpu));
NV_STATUS status = uvm_page_table_range_vec_clear_ptes(*range_vec, tlb_membar);
if (status != NV_OK)
UVM_ERR_PRINT("Clearing PTEs failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
@ -1054,22 +1038,19 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
.attrs = attrs
};
if (!uvm_gpu_can_address(gpu, gpu_va, mem->size))
return NV_ERR_OUT_OF_RANGE;
page_size = mem_pick_gpu_page_size(mem, gpu, tree);
UVM_ASSERT_MSG(uvm_mmu_page_size_supported(tree, page_size), "page_size 0x%x\n", page_size);
status = uvm_page_table_range_vec_create(tree,
gpu_va,
mem->physical_allocation_size,
uvm_mem_physical_size(mem),
page_size,
pmm_flags,
range_vec);
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to init page mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
gpu_va,
gpu_va + mem->physical_allocation_size,
gpu_va + uvm_mem_physical_size(mem),
nvstatusToString(status),
uvm_gpu_name(gpu));
return status;
@ -1079,7 +1060,7 @@ static NV_STATUS mem_map_gpu(uvm_mem_t *mem,
if (status != NV_OK) {
UVM_ERR_PRINT("Failed to write PTEs for mapping at [0x%llx, 0x%llx): %s, GPU %s\n",
gpu_va,
gpu_va + mem->physical_allocation_size,
gpu_va + uvm_mem_physical_size(mem),
nvstatusToString(status),
uvm_gpu_name(gpu));
goto error;
@ -1098,7 +1079,7 @@ static NV_STATUS mem_init_gpu_kernel_range(uvm_mem_t *mem)
return NV_OK;
return uvm_range_allocator_alloc(&g_free_ranges,
mem->physical_allocation_size,
uvm_mem_physical_size(mem),
mem->chunk_size,
&mem->kernel.range_alloc);
}
@ -1139,7 +1120,7 @@ NV_STATUS uvm_mem_map_gpu_kernel(uvm_mem_t *mem, uvm_gpu_t *gpu)
if (status != NV_OK)
return status;
gpu_va = uvm_parent_gpu_canonical_address(gpu->parent, reserved_gpu_va(mem, gpu));
gpu_va = reserved_gpu_va(mem, gpu);
range_vec = &mem->kernel.range_vecs[uvm_global_id_gpu_index(gpu->global_id)];
status = mem_map_gpu(mem, gpu, gpu_va, &gpu->address_space_tree, &attrs, range_vec);
@ -1165,6 +1146,7 @@ NV_STATUS uvm_mem_map_gpu_user(uvm_mem_t *mem,
NV_STATUS status;
uvm_gpu_va_space_t *gpu_va_space;
uvm_page_table_range_vec_t **range_vec;
NvU64 gpu_va;
UVM_ASSERT(mem_can_be_mapped_on_gpu_user(mem, gpu));
uvm_assert_rwsem_locked(&user_va_space->lock);
@ -1172,6 +1154,10 @@ NV_STATUS uvm_mem_map_gpu_user(uvm_mem_t *mem,
if (uvm_mem_mapped_on_gpu_user(mem, gpu))
return NV_OK;
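// The user-supplied VA is validated here rather than in mem_map_gpu(), which
// no longer checks it, presumably because kernel mappings always come from
// the reserved UVM-internal VA range and are therefore always addressable.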
gpu_va = (NvU64)user_addr;
if (!uvm_gpu_can_address(gpu, gpu_va, mem->size))
return NV_ERR_OUT_OF_RANGE;
status = uvm_mem_map_gpu_phys(mem, gpu);
if (status != NV_OK)
return status;
@ -1183,7 +1169,7 @@ NV_STATUS uvm_mem_map_gpu_user(uvm_mem_t *mem,
gpu_va_space = uvm_gpu_va_space_get(mem->user->va_space, gpu);
range_vec = &mem->user->range_vecs[uvm_global_id_gpu_index(gpu->global_id)];
status = mem_map_gpu(mem, gpu, (NvU64)mem->user->addr, &gpu_va_space->page_tables, attrs, range_vec);
status = mem_map_gpu(mem, gpu, gpu_va, &gpu_va_space->page_tables, attrs, range_vec);
if (status != NV_OK)
goto cleanup;

View File

@ -163,9 +163,6 @@ struct uvm_mem_struct
uvm_gpu_t *dma_owner;
// Size of the physical chunks.
NvU32 chunk_size;
union
{
struct
@ -194,12 +191,12 @@ struct uvm_mem_struct
// Count of chunks (vidmem) or CPU pages (sysmem) above
size_t chunks_count;
// Size of each physical chunk (vidmem) or CPU page (sysmem)
NvU32 chunk_size;
// Size of the allocation
NvU64 size;
// Size of the physical allocation backing
NvU64 physical_allocation_size;
uvm_mem_user_mapping_t *user;
// Information specific to allocations mapped in UVM internal VA space.
@ -235,6 +232,20 @@ NV_STATUS uvm_mem_translate_gpu_attributes(const UvmGpuMappingAttributes *attrs,
uvm_chunk_sizes_mask_t uvm_mem_kernel_chunk_sizes(uvm_gpu_t *gpu);
// Size of all the physical allocations backing the given memory.
static inline NvU64 uvm_mem_physical_size(const uvm_mem_t *mem)
{
NvU64 physical_size = mem->chunks_count * mem->chunk_size;
UVM_ASSERT(mem->size <= physical_size);
return physical_size;
}
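// For example (hypothetical numbers), a 5000-byte allocation backed by two
// 4096-byte chunks reports 8192 bytes here, i.e. the logical size rounded up
// to the chunk size.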
// Returns true if the memory is physically contiguous in the
// [offset, offset + size) interval.
bool uvm_mem_is_physically_contiguous(uvm_mem_t *mem, NvU64 offset, NvU64 size);
// Allocate memory according to the given allocation parameters.
//
// In the case of sysmem, the memory is immediately physically accessible from

View File

@ -62,7 +62,7 @@ static NV_STATUS check_accessible_from_gpu(uvm_gpu_t *gpu, uvm_mem_t *mem)
verif_size = UVM_ALIGN_UP(verif_size, sizeof(*sys_verif));
UVM_ASSERT(mem->physical_allocation_size >= verif_size);
UVM_ASSERT(uvm_mem_physical_size(mem) >= verif_size);
UVM_ASSERT(verif_size >= sizeof(*sys_verif));
TEST_NV_CHECK_GOTO(__alloc_map_sysmem(verif_size, gpu, &sys_mem), done);
@ -185,7 +185,7 @@ static NV_STATUS test_map_gpu(uvm_mem_t *mem, uvm_gpu_t *gpu)
gpu_va = uvm_mem_get_gpu_va_kernel(mem, gpu);
TEST_CHECK_RET(gpu_va >= gpu->parent->uvm_mem_va_base);
TEST_CHECK_RET(gpu_va + mem->physical_allocation_size <= gpu->parent->uvm_mem_va_base + gpu->parent->uvm_mem_va_size);
TEST_CHECK_RET(gpu_va + uvm_mem_physical_size(mem) <= gpu->parent->uvm_mem_va_base + gpu->parent->uvm_mem_va_size);
// Mapping if already mapped is OK
TEST_NV_CHECK_RET(uvm_mem_map_gpu_kernel(mem, gpu));
@ -370,6 +370,7 @@ static NV_STATUS test_all(uvm_va_space_t *va_space)
static const int max_supported_page_sizes = 4 + 1;
int i;
gpu_count = uvm_processor_mask_get_gpu_count(&va_space->registered_gpus);
// +1 for the CPU

View File

@ -86,7 +86,8 @@ static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
// Only map those pages that are not already mapped on destination
for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
UVM_ASSERT(prot != UVM_PROT_NONE);
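// A page may now legitimately have no permission on dest_id (presumably an
// HMM page that was not migrated), so skip it instead of asserting.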
if (prot == UVM_PROT_NONE)
continue;
if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);
@ -206,7 +207,17 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
NV_STATUS status, tracker_status = NV_OK;
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context, region));
if (uvm_va_block_is_hmm(va_block)) {
status = uvm_hmm_va_block_migrate_locked(va_block,
va_block_retry,
va_block_context,
dest_id,
region,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
}
else {
va_block_context->policy = uvm_va_range_get_policy(va_block->va_range);
if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space)) {
@ -229,6 +240,7 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
NULL,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
}
}
if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
// block_migrate_add_mappings will acquire the work from the above
@ -316,7 +328,8 @@ static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
// read-duplication is enabled in the VA range. This is because, when migrating
// read-duplicated VA blocks, the source processor doesn't need to be unmapped
// (though it may need write access revoked).
static bool va_range_should_do_cpu_preunmap(uvm_va_policy_t *policy, uvm_va_space_t *va_space)
static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
uvm_va_space_t *va_space)
{
return !uvm_va_policy_is_read_duplicate(policy, va_space);
}
@ -406,7 +419,7 @@ static void preunmap_multi_block(uvm_va_range_t *va_range,
}
if (num_unmap_pages > 0)
unmap_mapping_range(&va_range->va_space->mapping, start, end - start + 1, 1);
unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
}
static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
@ -524,6 +537,17 @@ static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
NV_STATUS status = NV_OK;
bool skipped_migrate = false;
if (!first_va_range) {
// For HMM, we iterate over va_blocks since there is no va_range.
return uvm_hmm_migrate_ranges(va_space,
va_block_context,
base,
length,
dest_id,
mode,
out_tracker);
}
UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));
va_range_last = NULL;
@ -594,10 +618,10 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
NvU64 length,
uvm_processor_id_t dest_id,
NvU32 migrate_flags,
uvm_va_range_t *first_va_range,
uvm_tracker_t *out_tracker)
{
NV_STATUS status = NV_OK;
uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, base, base);
uvm_va_block_context_t *va_block_context;
bool do_mappings;
bool do_two_passes;
@ -606,9 +630,6 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
uvm_assert_rwsem_locked(&va_space->lock);
if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
return NV_ERR_INVALID_ADDRESS;
// If the GPU has its memory disabled, just skip the migration and let
// faults take care of things.
if (!uvm_va_space_processor_has_memory(va_space, dest_id))
@ -616,6 +637,7 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
if (mm)
uvm_assert_mmap_lock_locked(mm);
va_block_context = uvm_va_block_context_alloc(mm);
if (!va_block_context)
return NV_ERR_NO_MEMORY;
@ -638,7 +660,9 @@ static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
// 1- Transfer all VA blocks (do not add mappings)
// 2- Go block by block reexecuting the transfer (in case someone moved it
// since the first pass), and adding the mappings.
is_single_block = is_migration_single_block(first_va_range, base, length);
//
// For HMM (!first_va_range), we always do a single pass.
is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
do_two_passes = do_mappings && !is_single_block;
@ -854,6 +878,7 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
UVM_INFO_PRINT("TEMP\n");
return NV_ERR_INVALID_ARGUMENT;
}
@ -916,6 +941,9 @@ NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
params->length,
(dest_gpu ? dest_gpu->id : UVM_ID_CPU),
params->flags,
uvm_va_space_iter_first(va_space,
params->base,
params->base),
tracker_ptr);
}
else if (status == NV_WARN_NOTHING_TO_DO) {
@ -1029,10 +1057,26 @@ NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, st
NvU64 start = rgr->node.start;
NvU64 length = rgr->node.end - rgr->node.start + 1;
if (gpu && !uvm_gpu_can_address(gpu, start, length))
if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
status = NV_ERR_OUT_OF_RANGE;
else
status = uvm_migrate(va_space, mm, start, length, dest_id, migrate_flags, &local_tracker);
}
else {
uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);
if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
status = NV_ERR_INVALID_ADDRESS;
goto done;
}
status = uvm_migrate(va_space,
mm,
start,
length,
dest_id,
migrate_flags,
first_va_range,
&local_tracker);
}
if (status != NV_OK)
goto done;

View File

@ -727,16 +727,19 @@ error:
//
static void page_tree_set_location(uvm_page_tree_t *tree, uvm_aperture_t location)
{
bool should_location_be_vidmem;
UVM_ASSERT(tree->gpu != NULL);
UVM_ASSERT_MSG((location == UVM_APERTURE_VID) ||
(location == UVM_APERTURE_SYS) ||
(location == UVM_APERTURE_DEFAULT),
"Invalid location %s (%d)\n", uvm_aperture_string(location), (int)location);
// The tree must be explicitly initialized in vidmem when in SR-IOV heavy.
// The only exceptions are "fake" GPUs used during page tree testing, which
// can be identified by having no channel manager.
if ((tree->gpu->channel_manager != NULL) && uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu))
should_location_be_vidmem = uvm_gpu_is_virt_mode_sriov_heavy(tree->gpu);
// The page tree of a "fake" GPU used during page tree testing can be in
// sysmem even if should_location_be_vidmem is true. A fake GPU can be
// identified by having no channel manager.
if ((tree->gpu->channel_manager != NULL) && should_location_be_vidmem)
UVM_ASSERT(location == UVM_APERTURE_VID);
if (location == UVM_APERTURE_DEFAULT) {
@ -874,6 +877,7 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
// traverse until we hit an in-use page, or the root
while (dir->host_parent != NULL && dir->ref_count == 0) {
uvm_page_directory_t *parent = dir->host_parent;
uvm_membar_t this_membar;
if (free_count == 0) {
@ -902,10 +906,9 @@ void uvm_page_tree_put_ptes_async(uvm_page_tree_t *tree, uvm_page_table_range_t
invalidate_depth = dir->host_parent->depth;
// If any of the pointed to PDEs were in sysmem then a SYS membar is
// required after the TLB invalidate.
if (dir->phys_alloc.addr.aperture == UVM_APERTURE_SYS)
membar_after_invalidate = UVM_MEMBAR_SYS;
// Take the membar with the widest scope of any of the pointed-to PDEs
this_membar = uvm_hal_downgrade_membar_type(tree->gpu, dir->phys_alloc.addr.aperture == UVM_APERTURE_VID);
membar_after_invalidate = max(membar_after_invalidate, this_membar);
// If any of the cleared PDEs were in sysmem then a SYS membar is
// required after the clears and before the TLB invalidate.
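The membar_after_invalidate update above folds each PDE's requirement into one barrier by taking the maximum scope. A self-contained sketch of why max() is sufficient, assuming the scope values are ordered from narrowest to widest (which is what makes max() meaningful here); this is plain C, not driver code:

```c
#include <assert.h>

typedef enum {
    MEMBAR_NONE = 0,  // no barrier needed
    MEMBAR_GPU  = 1,  // orders accesses to local vidmem only
    MEMBAR_SYS  = 2,  // orders accesses system-wide (covers sysmem PDEs)
} membar_t;

static membar_t membar_max(membar_t a, membar_t b)
{
    return a > b ? a : b;
}

int main(void)
{
    membar_t required = MEMBAR_NONE;

    // One PDE backed by vidmem, one by sysmem: the widest scope wins.
    required = membar_max(required, MEMBAR_GPU);
    required = membar_max(required, MEMBAR_SYS);
    assert(required == MEMBAR_SYS);
    return 0;
}
```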
@ -978,7 +981,8 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
{
uvm_mmu_mode_hal_t *hal = tree->hal;
// bit index just beyond the most significant bit used to index the current entry
// bit index just beyond the most significant bit used to index the current
// entry
NvU32 addr_bit_shift = hal->num_va_bits();
// track depth upon which the invalidate occurred
@ -997,19 +1001,28 @@ static NV_STATUS try_get_ptes(uvm_page_tree_t *tree,
// ensure that the caller has specified a valid page size
UVM_ASSERT((page_size & hal->page_sizes()) != 0);
// This algorithm will work with unaligned ranges, but the caller's intent is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0, "start 0x%llx size 0x%zx page_size 0x%x",
start, (size_t)size, page_size);
// This algorithm will work with unaligned ranges, but the caller's intent
// is unclear
UVM_ASSERT_MSG(start % page_size == 0 && size % page_size == 0,
"start 0x%llx size 0x%zx page_size 0x%x\n",
start,
(size_t)size,
page_size);
// The GPU should be capable of addressing the passed range
if (tree->type == UVM_PAGE_TREE_TYPE_USER)
UVM_ASSERT(uvm_gpu_can_address(tree->gpu, start, size));
else
UVM_ASSERT(uvm_gpu_can_address_kernel(tree->gpu, start, size));
while (true) {
// index of the entry, for the first byte of the range, within its containing directory
// index of the entry, for the first byte of the range, within its
// containing directory
NvU32 start_index;
// index of the entry, for the last byte of the range, within its containing directory
// index of the entry, for the last byte of the range, within its
// containing directory
NvU32 end_index;
// pointer to PDE/PTE
@ -2280,9 +2293,9 @@ static NV_STATUS create_dynamic_sysmem_mapping(uvm_gpu_t *gpu)
UVM_ASSERT(gpu->parent->flat_sysmem_va_base != 0);
// The DMA addressable window is the maximum system physical memory
// addressable by the GPU (this limit is 128TB in Pascal-Ampere). The
// virtual mapping to sysmem is linear, so its size matches that of the
// physical address space.
// addressable by the GPU (this limit is 128TB in Pascal-Ada). The virtual
// mapping to sysmem is linear, so its size matches that of the physical
// address space.
flat_sysmem_va_size = gpu->parent->dma_addressable_limit + 1 - gpu->parent->dma_addressable_start;
// The optimal mapping granularity is dependent on multiple factors:
@ -2351,7 +2364,7 @@ NV_STATUS uvm_mmu_sysmem_map(uvm_gpu_t *gpu, NvU64 pa, NvU64 size)
// because in the common case the VA block lock is held.
pmm_flags = UVM_PMM_ALLOC_FLAGS_NONE;
sysmem_mapping->base = uvm_parent_gpu_canonical_address(gpu->parent, virtual_address.address);
sysmem_mapping->base = virtual_address.address;
status = create_identity_mapping(gpu,
sysmem_mapping->base,

View File

@ -36,7 +36,9 @@
#define UVM_PAGE_SIZE_AGNOSTIC 0
// Memory layout of UVM's kernel VA space.
// The following memory regions are not to scale.
// The following memory regions are not to scale. The memory layout is linear,
// i.e., no canonical form address conversion.
//
// Hopper:
// +----------------+ 128PB
// | |
@ -57,7 +59,7 @@
// | |
// +----------------+ 0 (rm_va_base)
//
// Pascal-Ampere:
// Pascal-Ada:
// +----------------+ 512TB
// | |
// | (not used) |
@ -592,13 +594,18 @@ static bool uvm_mmu_page_size_supported(uvm_page_tree_t *tree, NvU32 page_size)
static NvU32 uvm_mmu_biggest_page_size_up_to(uvm_page_tree_t *tree, NvU32 max_page_size)
{
NvU32 gpu_page_sizes = tree->hal->page_sizes();
NvU32 smallest_gpu_page_size = gpu_page_sizes & ~(gpu_page_sizes - 1);
NvU32 page_sizes;
NvU32 page_size;
UVM_ASSERT_MSG(is_power_of_2(max_page_size), "0x%x\n", max_page_size);
if (max_page_size < smallest_gpu_page_size)
return 0;
// Calculate the supported page sizes that are not larger than the max
page_sizes = tree->hal->page_sizes() & (max_page_size | (max_page_size - 1));
page_sizes = gpu_page_sizes & (max_page_size | (max_page_size - 1));
// And pick the biggest one of them
page_size = 1 << __fls(page_sizes);
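The two lines above are a compact bit trick: `max_page_size | (max_page_size - 1)` is a mask of every bit up to and including the cap, so AND-ing it with the supported-size bitmask keeps only sizes that do not exceed the cap, and `__fls()` then picks the largest survivor (the new early return covers caps below the smallest supported size). A self-contained sketch, with `__builtin_clz` standing in for the kernel's `__fls()`:

```c
#include <assert.h>
#include <stdint.h>

static uint32_t biggest_page_size_up_to(uint32_t supported, uint32_t max)
{
    uint32_t allowed = supported & (max | (max - 1)); // sizes <= max only

    if (allowed == 0)
        return 0;                                     // cap below smallest size

    // Highest remaining set bit.
    return 1u << (31 - __builtin_clz(allowed));
}

int main(void)
{
    uint32_t sizes = (4u << 10) | (64u << 10) | (2u << 20) | (512u << 20);

    assert(biggest_page_size_up_to(sizes, 1u << 20) == (64u << 10)); // 1 MiB cap -> 64 KiB
    assert(biggest_page_size_up_to(sizes, 2u << 20) == (2u << 20));  // exact match
    assert(biggest_page_size_up_to(sizes, 2u << 10) == 0);           // below smallest
    return 0;
}
```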

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2021 NVIDIA Corporation
Copyright (c) 2015-2022 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -305,18 +305,25 @@ static NV_STATUS test_page_tree_init(uvm_gpu_t *gpu, NvU32 big_page_size, uvm_pa
return uvm_page_tree_init(gpu, NULL, UVM_PAGE_TREE_TYPE_USER, big_page_size, UVM_APERTURE_SYS, tree);
}
static NV_STATUS test_page_tree_init_kernel(uvm_gpu_t *gpu, NvU32 big_page_size, uvm_page_tree_t *tree)
{
return uvm_page_tree_init(gpu, NULL, UVM_PAGE_TREE_TYPE_KERNEL, big_page_size, UVM_APERTURE_SYS, tree);
}
static NV_STATUS test_page_tree_get_ptes(uvm_page_tree_t *tree,
NvU32 page_size,
NvU64 start,
NvLength size,
uvm_page_table_range_t *range)
{
return uvm_page_tree_get_ptes(tree,
page_size,
uvm_parent_gpu_canonical_address(tree->gpu->parent, start),
size,
UVM_PMM_ALLOC_FLAGS_NONE,
range);
uvm_mmu_mode_hal_t *hal = tree->gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// Maxwell GPUs don't use canonical form address even on platforms that
// support it.
start = (tree->type == UVM_PAGE_TREE_TYPE_USER) && (hal->num_va_bits() > 40) ?
uvm_parent_gpu_canonical_address(tree->gpu->parent, start) :
start;
return uvm_page_tree_get_ptes(tree, page_size, start, size, UVM_PMM_ALLOC_FLAGS_NONE, range);
}
static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
@ -324,11 +331,13 @@ static NV_STATUS test_page_tree_get_entry(uvm_page_tree_t *tree,
NvU64 start,
uvm_page_table_range_t *single)
{
return uvm_page_tree_get_entry(tree,
page_size,
uvm_parent_gpu_canonical_address(tree->gpu->parent, start),
UVM_PMM_ALLOC_FLAGS_NONE,
single);
uvm_mmu_mode_hal_t *hal = tree->gpu->parent->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K);
// See comment above (test_page_tree_get_ptes)
start = (tree->type == UVM_PAGE_TREE_TYPE_USER) && (hal->num_va_bits() > 40) ?
uvm_parent_gpu_canonical_address(tree->gpu->parent, start) :
start;
return uvm_page_tree_get_entry(tree, page_size, start, UVM_PMM_ALLOC_FLAGS_NONE, single);
}
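The helpers above canonicalize user-tree addresses only when the GPU exposes more than 40 VA bits. A self-contained sketch of what canonicalization means, assuming a 49-bit user VA width (a value picked just for the example): every bit above the VA width must replicate the top VA bit.

```c
#include <assert.h>
#include <stdint.h>

// Sign-extend addr from num_va_bits to a full 64-bit canonical address.
static uint64_t canonical_address(uint64_t addr, unsigned num_va_bits)
{
    unsigned shift = 64 - num_va_bits;
    return (uint64_t)(((int64_t)(addr << shift)) >> shift);
}

int main(void)
{
    // Below the sign bit the address is unchanged...
    assert(canonical_address(0x7FFFFFFFFFFFull, 49) == 0x7FFFFFFFFFFFull);
    // ...but once bit 48 is set, bits 49-63 must all be ones. This is why the
    // tests below that target very high VAs switch to a kernel-type page tree,
    // which does not apply canonical-form conversion.
    assert(canonical_address(1ull << 48, 49) == 0xFFFF000000000000ull);
    return 0;
}
```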
static NV_STATUS test_page_tree_alloc_table(uvm_page_tree_t *tree,
@ -411,7 +420,10 @@ static NV_STATUS alloc_64k_memory_57b_va(uvm_gpu_t *gpu)
uvm_page_table_range_t range;
NvLength size = 64 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
// We use a kernel-type page tree to decouple the test from the CPU VA width
// and canonical form address limits.
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, 0x100000000000000ULL, size, &range), NV_OK);
TEST_CHECK_RET(range.entry_count == 1);
TEST_CHECK_RET(range.table->depth == 5);
@ -532,7 +544,7 @@ static NV_STATUS allocate_then_free_8_8_64k(uvm_gpu_t *gpu)
NvLength start = stride * 248 + 256LL * 1024 * 1024 * 1024 + (1LL << 47);
int i;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
for (i = 0; i < 16; i++)
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_64K, start + i * stride, size, range + i), NV_OK);
@ -652,7 +664,7 @@ static NV_STATUS get_entire_table_4k(uvm_gpu_t *gpu)
NvLength size = 1 << 21;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_4K, start, size, &range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[1]->entries[0]->entries[0]->entries[1]);
@ -672,13 +684,13 @@ static NV_STATUS get_entire_table_512m(uvm_gpu_t *gpu)
uvm_page_tree_t tree;
uvm_page_table_range_t range;
NvU64 start = 1UL << 47;
NvU64 start = 1UL << 48;
NvLength size = 512UL * 512 * 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range), NV_OK);
TEST_CHECK_RET(range.table == tree.root->entries[1]->entries[0]);
TEST_CHECK_RET(range.table == tree.root->entries[2]->entries[0]);
TEST_CHECK_RET(range.entry_count == 512);
TEST_CHECK_RET(range.table->depth == 2);
TEST_CHECK_RET(range.page_size == UVM_PAGE_SIZE_512M);
@ -701,7 +713,7 @@ static NV_STATUS split_4k_from_2m(uvm_gpu_t *gpu)
NvU64 start = 1UL << 48;
NvLength size = 1 << 21;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start, size, &range_2m), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_2M, start + size, size, &range_adj), NV_OK);
@ -749,7 +761,7 @@ static NV_STATUS split_2m_from_512m(uvm_gpu_t *gpu)
NvU64 start = 1UL << 48;
NvLength size = 512UL * 1024 * 1024;
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start, size, &range_512m), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_get_ptes(&tree, UVM_PAGE_SIZE_512M, start + size, size, &range_adj), NV_OK);
@ -1025,7 +1037,7 @@ static NV_STATUS test_tlb_invalidates(uvm_gpu_t *gpu)
// Depth 1
NvU64 extent_pde2 = extent_pde1 * (1ull << 9);
MEM_NV_CHECK_RET(test_page_tree_init(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, BIG_PAGE_SIZE_PASCAL, &tree), NV_OK);
fake_tlb_invals_enable();
@ -1270,7 +1282,7 @@ static NV_STATUS test_range_vec(uvm_gpu_t *gpu, NvU32 big_page_size, NvU32 page_
NvU32 i;
NvU64 offsets[4];
MEM_NV_CHECK_RET(test_page_tree_init(gpu, big_page_size, &tree), NV_OK);
MEM_NV_CHECK_RET(test_page_tree_init_kernel(gpu, big_page_size, &tree), NV_OK);
pde_coverage = uvm_mmu_pde_coverage(&tree, page_size);
page_table_entries = pde_coverage / page_size;

View File

@ -39,32 +39,6 @@ void uvm_hal_pascal_ce_offset_in_out(uvm_push_t *push, NvU64 offset_in, NvU64 of
OFFSET_OUT_LOWER, HWVALUE(C0B5, OFFSET_OUT_LOWER, VALUE, NvOffset_LO32(offset_out)));
}
// Perform an appropriate membar before a semaphore operation. Returns whether
// the semaphore operation should include a flush.
static bool pascal_membar_before_semaphore(uvm_push_t *push)
{
uvm_gpu_t *gpu;
if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE)) {
// No MEMBAR requested, don't use a flush.
return false;
}
if (!uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU)) {
// By default do a MEMBAR SYS and for that we can just use flush on the
// semaphore operation.
return true;
}
// MEMBAR GPU requested, do it on the HOST and skip the CE flush as CE
// doesn't have this capability.
gpu = uvm_push_get_gpu(push);
gpu->parent->host_hal->wait_for_idle(push);
gpu->parent->host_hal->membar_gpu(push);
return false;
}
void uvm_hal_pascal_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 payload)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
@ -72,7 +46,7 @@ void uvm_hal_pascal_ce_semaphore_release(uvm_push_t *push, NvU64 gpu_va, NvU32 p
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = pascal_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
@ -98,7 +72,7 @@ void uvm_hal_pascal_ce_semaphore_reduction_inc(uvm_push_t *push, NvU64 gpu_va, N
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = pascal_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);
@ -127,7 +101,7 @@ void uvm_hal_pascal_ce_semaphore_timestamp(uvm_push_t *push, NvU64 gpu_va)
NvU32 launch_dma_plc_mode;
bool use_flush;
use_flush = pascal_membar_before_semaphore(push);
use_flush = uvm_hal_membar_before_semaphore(push);
if (use_flush)
flush_value = HWCONST(C0B5, LAUNCH_DMA, FLUSH_ENABLE, TRUE);

View File

@ -54,6 +54,9 @@ typedef enum
// Locking: uvm_va_space: write
UVM_PERF_EVENT_BLOCK_SHRINK,
// Locking: HMM uvm_va_block lock
UVM_PERF_EVENT_BLOCK_MUNMAP,
// Locking: uvm_va_space: write
UVM_PERF_EVENT_RANGE_DESTROY,
@ -89,6 +92,12 @@ typedef union
uvm_va_block_t *block;
} block_shrink;
struct
{
uvm_va_block_t *block;
uvm_va_block_region_t region;
} block_munmap;
struct
{
uvm_va_range_t *range;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2022 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -236,7 +236,7 @@ static NvU32 uvm_perf_prefetch_prenotify_fault_migrations(uvm_va_block_t *va_blo
const uvm_page_mask_t *resident_mask = NULL;
const uvm_page_mask_t *thrashing_pages = NULL;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_va_block_region_t max_prefetch_region;
NvU32 big_page_size;
uvm_va_block_region_t big_pages_region;
@ -372,7 +372,7 @@ void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
uvm_perf_prefetch_bitmap_tree_t *bitmap_tree,
uvm_perf_prefetch_hint_t *out_hint)
{
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_page_mask_t *prefetch_pages = &out_hint->prefetch_pages_mask;
NvU32 pending_prefetch_pages;
@ -380,9 +380,10 @@ void uvm_perf_prefetch_get_hint(uvm_va_block_t *va_block,
uvm_assert_rwsem_locked(&va_space->lock);
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, faulted_region));
UVM_ASSERT(uvm_hmm_va_block_context_vma_is_valid(va_block, va_block_context, faulted_region));
UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context, faulted_region));
out_hint->residency = UVM_ID_INVALID;
uvm_page_mask_zero(prefetch_pages);
if (!g_uvm_perf_prefetch_enable)
return;

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2022 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -398,11 +398,13 @@ static uvm_perf_module_t g_module_thrashing;
// Callback declaration for the performance heuristics events
static void thrashing_event_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
static void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
static void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data);
static uvm_perf_module_event_callback_desc_t g_callbacks_thrashing[] = {
{ UVM_PERF_EVENT_BLOCK_DESTROY, thrashing_block_destroy_cb },
{ UVM_PERF_EVENT_MODULE_UNLOAD, thrashing_block_destroy_cb },
{ UVM_PERF_EVENT_BLOCK_SHRINK , thrashing_block_destroy_cb },
{ UVM_PERF_EVENT_BLOCK_MUNMAP , thrashing_block_munmap_cb },
{ UVM_PERF_EVENT_MIGRATION, thrashing_event_cb },
{ UVM_PERF_EVENT_REVOCATION, thrashing_event_cb }
};
@ -533,7 +535,7 @@ static void gpu_thrashing_stats_destroy(uvm_gpu_t *gpu)
// VA space lock needs to be held
static va_space_thrashing_info_t *va_space_thrashing_info_get_or_null(uvm_va_space_t *va_space)
{
uvm_assert_rwsem_locked(&va_space->lock);
// TODO: Bug 3898454: check locking requirement for UVM-HMM.
return uvm_perf_module_type_data(va_space->perf_modules_data, UVM_PERF_MODULE_TYPE_THRASHING);
}
@ -689,6 +691,20 @@ void thrashing_block_destroy_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t
uvm_perf_thrashing_info_destroy(va_block);
}
void thrashing_block_munmap_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data)
{
uvm_va_block_t *va_block = event_data->block_munmap.block;
uvm_va_block_region_t region = event_data->block_munmap.region;
UVM_ASSERT(g_uvm_perf_thrashing_enable);
UVM_ASSERT(event_id == UVM_PERF_EVENT_BLOCK_MUNMAP);
UVM_ASSERT(va_block);
thrashing_reset_pages_in_region(va_block,
uvm_va_block_region_start(va_block, region),
uvm_va_block_region_size(region));
}
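For context, the callback above is reached through an event-id-plus-payload-union dispatch, as suggested by the g_callbacks_thrashing table earlier in this file. A self-contained sketch of that pattern with simplified, made-up names and payload fields (not the driver's actual event API):

```c
#include <assert.h>
#include <stddef.h>

typedef enum { EVENT_BLOCK_SHRINK, EVENT_BLOCK_MUNMAP, EVENT_COUNT } event_id_t;

typedef union {
    struct { void *block; } block_shrink;
    struct { void *block; size_t first_page, num_pages; } block_munmap;
} event_data_t;

typedef void (*event_cb_t)(event_id_t id, event_data_t *data);

static size_t munmap_pages_seen;

static void on_munmap(event_id_t id, event_data_t *data)
{
    assert(id == EVENT_BLOCK_MUNMAP);
    munmap_pages_seen += data->block_munmap.num_pages;
}

// One callback slot per event id, filled in by interested modules.
static event_cb_t callbacks[EVENT_COUNT] = {
    [EVENT_BLOCK_MUNMAP] = on_munmap,
};

static void notify(event_id_t id, event_data_t *data)
{
    if (callbacks[id])
        callbacks[id](id, data);
}

int main(void)
{
    event_data_t data = { .block_munmap = { NULL, 0, 512 } };

    notify(EVENT_BLOCK_MUNMAP, &data);
    assert(munmap_pages_seen == 512);
    return 0;
}
```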
// Sanity checks of the thrashing tracking state
static bool thrashing_state_checks(uvm_va_block_t *va_block,
block_thrashing_info_t *block_thrashing,
@ -1075,7 +1091,7 @@ static NV_STATUS unmap_remote_pinned_pages(uvm_va_block_t *va_block,
NV_STATUS tracker_status;
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
uvm_processor_id_t processor_id;
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_assert_mutex_locked(&va_block->lock);
@ -1121,7 +1137,7 @@ NV_STATUS uvm_perf_thrashing_unmap_remote_pinned_pages_all(uvm_va_block_t *va_bl
{
block_thrashing_info_t *block_thrashing;
uvm_processor_mask_t unmap_processors;
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region));
@ -1425,7 +1441,7 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_processor_id_t do_not_throttle_processor = page_thrashing->do_not_throttle_processor_id;
uvm_processor_id_t pinned_residency = page_thrashing->pinned_residency_id;
uvm_va_policy_t *policy;
const uvm_va_policy_t *policy;
uvm_processor_id_t preferred_location;
policy = uvm_va_policy_get(va_block, uvm_va_block_cpu_page_address(va_block, page_index));
@ -1519,7 +1535,8 @@ static uvm_perf_thrashing_hint_t get_hint_for_migration_thrashing(va_space_thras
}
}
}
else if (uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(page_thrashing->pinned_residency_id)], requester)) {
else if (uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(pinned_residency)], requester)) {
if (!uvm_va_block_is_hmm(va_block))
UVM_ASSERT(uvm_id_equal(closest_resident_id, pinned_residency));
hint.type = UVM_PERF_THRASHING_HINT_TYPE_PIN;
@ -1790,8 +1807,6 @@ static void thrashing_unpin_pages(struct work_struct *work)
uvm_va_space_t *va_space = va_space_thrashing->va_space;
uvm_va_block_context_t *va_block_context = &va_space_thrashing->pinned_pages.va_block_context;
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
// Take the VA space lock so that VA blocks don't go away during this
// operation.
uvm_va_space_down_read(va_space);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -100,10 +100,10 @@
// All allocated user memory root chunks are tracked in an LRU list
// (root_chunks.va_block_used). A root chunk is moved to the tail of that list
// whenever any of its subchunks is allocated (unpinned) by a VA block (see
// uvm_pmm_gpu_unpin_temp()). When a root chunk is selected for eviction, it has
// the eviction flag set (see pick_root_chunk_to_evict()). This flag affects
// many of the PMM operations on all of the subchunks of the root chunk being
// evicted. See usage of (root_)chunk_is_in_eviction(), in particular in
// uvm_pmm_gpu_unpin_allocated()). When a root chunk is selected for eviction,
// it has the eviction flag set (see pick_root_chunk_to_evict()). This flag
// affects many of the PMM operations on all of the subchunks of the root chunk
// being evicted. See usage of (root_)chunk_is_in_eviction(), in particular in
// chunk_free_locked() and claim_free_chunk().
//
// To evict a root chunk, all of its free subchunks are pinned, then all
@ -394,7 +394,8 @@ static bool chunk_is_root_chunk_pinned(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chun
return chunk->suballoc->pinned_leaf_chunks > 0;
}
// Pin a chunk and update its root chunk's pinned leaf chunks count if the chunk is not a root chunk
// Pin a chunk and update its root chunk's pinned leaf chunks count if the
// chunk is not a root chunk.
static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
@ -406,17 +407,20 @@ static void chunk_pin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
if (chunk_is_root_chunk(chunk))
return;
// For subchunks, update the pinned leaf chunks count tracked in the suballoc of the root chunk.
// For subchunks, update the pinned leaf chunks count tracked in the
// suballoc of the root chunk.
chunk = &root_chunk->chunk;
// The passed-in subchunk is not the root chunk so the root chunk has to be split
// The passed-in subchunk is not the root chunk so the root chunk has to be
// split.
UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
uvm_pmm_gpu_chunk_state_string(chunk->state));
chunk->suballoc->pinned_leaf_chunks++;
}
// Unpin a chunk and update its root chunk's pinned leaf chunks count if the chunk is not a root chunk
// Unpin a chunk and update its root chunk's pinned leaf chunks count if the
// chunk is not a root chunk.
static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_chunk_state_t new_state)
{
uvm_gpu_root_chunk_t *root_chunk = root_chunk_from_chunk(pmm, chunk);
@ -432,10 +436,12 @@ static void chunk_unpin(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_pmm_gpu_
if (chunk_is_root_chunk(chunk))
return;
// For subchunks, update the pinned leaf chunks count tracked in the suballoc of the root chunk.
// For subchunks, update the pinned leaf chunks count tracked in the
// suballoc of the root chunk.
chunk = &root_chunk->chunk;
// The passed-in subchunk is not the root chunk so the root chunk has to be split
// The passed-in subchunk is not the root chunk so the root chunk has to be
// split.
UVM_ASSERT_MSG(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT, "chunk state %s\n",
uvm_pmm_gpu_chunk_state_string(chunk->state));
@ -609,6 +615,7 @@ static NV_STATUS pmm_gpu_alloc_kernel(uvm_pmm_gpu_t *pmm,
uvm_spin_lock(&pmm->list_lock);
chunk_unpin(pmm, chunks[i], UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
chunks[i]->is_referenced = false;
uvm_spin_unlock(&pmm->list_lock);
}
@ -653,7 +660,10 @@ static void chunk_update_lists_locked(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk
list_del_init(&chunk->list);
}
void uvm_pmm_gpu_unpin_temp(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
static void gpu_unpin_temp(uvm_pmm_gpu_t *pmm,
uvm_gpu_chunk_t *chunk,
uvm_va_block_t *va_block,
bool is_referenced)
{
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
UVM_ASSERT(uvm_pmm_gpu_memory_type_is_user(chunk->type));
@ -667,16 +677,26 @@ void uvm_pmm_gpu_unpin_temp(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_b
UVM_ASSERT(chunk->va_block_page_index < uvm_va_block_num_cpu_pages(va_block));
chunk_unpin(pmm, chunk, UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
chunk->is_referenced = is_referenced;
chunk->va_block = va_block;
chunk_update_lists_locked(pmm, chunk);
uvm_spin_unlock(&pmm->list_lock);
}
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
{
gpu_unpin_temp(pmm, chunk, va_block, false);
}
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block)
{
gpu_unpin_temp(pmm, chunk, va_block, true);
}
void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t *tracker)
{
NV_STATUS status;
uvm_gpu_root_chunk_t *root_chunk;
if (!chunk)
return;
@ -684,11 +704,12 @@ void uvm_pmm_gpu_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_tracker_t
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED ||
chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
root_chunk = root_chunk_from_chunk(pmm, chunk);
if (tracker) {
uvm_gpu_root_chunk_t *root_chunk;
uvm_tracker_remove_completed(tracker);
root_chunk = root_chunk_from_chunk(pmm, chunk);
root_chunk_lock(pmm, root_chunk);
// Remove any completed entries from the root tracker to prevent it from
@ -756,6 +777,7 @@ static bool assert_chunk_mergeable(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
UVM_ASSERT(child->va_block == child_va_block);
UVM_ASSERT(child->va_block_page_index ==
prev_child->va_block_page_index + uvm_gpu_chunk_get_size(prev_child) / PAGE_SIZE);
UVM_ASSERT(child->is_referenced == prev_child->is_referenced);
}
}
@ -799,6 +821,7 @@ static void merge_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
UVM_ASSERT(subchunk->va_block);
chunk->va_block = subchunk->va_block;
chunk->va_block_page_index = subchunk->va_block_page_index;
chunk->is_referenced = subchunk->is_referenced;
}
else if (child_state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
UVM_ASSERT(root_chunk->chunk.suballoc->pinned_leaf_chunks >= num_sub);
@ -2013,8 +2036,8 @@ static NV_STATUS alloc_chunk_with_splits(uvm_pmm_gpu_t *pmm,
UVM_PANIC();
}
// Allocates a single chunk of a given size. If needed splits a chunk of bigger size
// or, if that is not possible, allocates from PMA or evicts.
// Allocates a single chunk of a given size. If needed, splits a chunk of
// bigger size or, if that is not possible, allocates from PMA or evicts.
NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
uvm_pmm_gpu_memory_type_t type,
uvm_chunk_size_t chunk_size,
@ -2027,8 +2050,7 @@ NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
chunk = claim_free_chunk(pmm, type, chunk_size);
if (chunk) {
// A free chunk could be claimed, we are done.
*out_chunk = chunk;
return NV_OK;
goto out;
}
if (chunk_size == UVM_CHUNK_SIZE_MAX) {
@ -2039,11 +2061,11 @@ NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
if (status != NV_OK)
return status;
*out_chunk = chunk;
return NV_OK;
goto out;
}
// We didn't find a free chunk and we will require splits so acquire the PMM lock.
// We didn't find a free chunk and we will require splits so acquire the
// PMM lock.
uvm_mutex_lock(&pmm->lock);
status = alloc_chunk_with_splits(pmm, type, chunk_size, flags, &chunk);
@ -2055,6 +2077,7 @@ NV_STATUS alloc_chunk(uvm_pmm_gpu_t *pmm,
return status;
}
out:
*out_chunk = chunk;
return NV_OK;
@ -2273,7 +2296,8 @@ void free_root_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_root_chunk_t *root_chunk, free_
}
// Splits the input chunk into subchunks of the next size down. The chunk state
// can be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED or UVM_PMM_GPU_CHUNK_STATE_ALLOCATED.
// can be UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED or
// UVM_PMM_GPU_CHUNK_STATE_ALLOCATED.
//
// UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED: This is a split for allocation.
//
@ -2339,6 +2363,7 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
uvm_assert_mutex_locked(&chunk->va_block->lock);
subchunk->va_block = chunk->va_block;
subchunk->va_block_page_index = chunk->va_block_page_index + (i * subchunk_size) / PAGE_SIZE;
subchunk->is_referenced = chunk->is_referenced;
}
}
@ -2354,6 +2379,7 @@ NV_STATUS split_gpu_chunk(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
chunk->va_block = NULL;
chunk->va_block_page_index = PAGES_PER_UVM_VA_BLOCK;
chunk->is_referenced = false;
}
else if (chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED) {
// -1 for the parent chunk that is going to transition into the split state.
@ -2511,6 +2537,8 @@ static bool try_chunk_free(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
uvm_spin_lock(&pmm->list_lock);
UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED || !chunk->is_referenced);
chunk->inject_split_error = false;
// Chunks that are the last allocated child need to trigger a merge and are
@ -3254,6 +3282,270 @@ NvU32 uvm_pmm_gpu_phys_to_virt(uvm_pmm_gpu_t *pmm, NvU64 phys_addr, NvU64 region
return num_mappings;
}
#if UVM_IS_CONFIG_HMM()
static uvm_pmm_gpu_t *devmem_page_to_pmm(struct page *page)
{
return container_of(page->pgmap, uvm_pmm_gpu_t, devmem.pagemap);
}
static uvm_gpu_chunk_t *devmem_page_to_chunk_locked(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
NvU64 chunk_addr = ((NvU64)page_to_pfn(page) << PAGE_SHIFT) - pmm->devmem.pagemap.range.start;
size_t index = chunk_addr / UVM_CHUNK_SIZE_MAX;
uvm_gpu_chunk_t *root_chunk;
uvm_gpu_chunk_t *chunk;
uvm_gpu_chunk_t *parent;
uvm_chunk_size_t chunk_size;
UVM_ASSERT(index < pmm->root_chunks.count);
root_chunk = &pmm->root_chunks.array[index].chunk;
UVM_ASSERT(root_chunk->address == UVM_ALIGN_DOWN(chunk_addr, UVM_CHUNK_SIZE_MAX));
// Find the uvm_gpu_chunk_t that corresponds to the device private struct
// page's PFN. The loop is only 0, 1, or 2 iterations.
for (chunk = root_chunk;
uvm_gpu_chunk_get_size(chunk) != page_size(page);
chunk = parent->suballoc->subchunks[index]) {
parent = chunk;
UVM_ASSERT(parent->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(parent->suballoc);
chunk_size = uvm_gpu_chunk_get_size(parent->suballoc->subchunks[0]);
index = (size_t)uvm_div_pow2_64(chunk_addr - parent->address, chunk_size);
UVM_ASSERT(index < num_subchunks(parent));
}
UVM_ASSERT(chunk->address == chunk_addr);

UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(chunk->is_referenced);
return chunk;
}
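The walk above re-derives a child index at every split level from the page's offset within the parent. A self-contained sketch of that index arithmetic (the 2 MiB root and 64 KiB child sizes are made-up values for the example):

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t root_base  = 0;                  // root chunk base address
    uint64_t root_size  = 2ull << 20;         // 2 MiB root chunk
    uint64_t child_size = 64ull << 10;        // split into 64 KiB children
    uint64_t addr       = root_base + 5 * child_size + 4096; // inside child 5

    uint64_t index = (addr - root_base) / child_size;

    assert(index == 5);
    assert(index < root_size / child_size);   // mirrors "index < num_subchunks(parent)"
    return 0;
}
```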
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_chunk_t *chunk;
UVM_ASSERT(is_device_private_page(page));
uvm_spin_lock(&pmm->list_lock);
chunk = devmem_page_to_chunk_locked(page);
uvm_spin_unlock(&pmm->list_lock);
return chunk;
}
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
UVM_ASSERT(is_device_private_page(page));
return gpu->id;
}
static void evict_orphan_pages(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
NvU32 i;
UVM_ASSERT(chunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT);
UVM_ASSERT(chunk->suballoc);
for (i = 0; i < num_subchunks(chunk); i++) {
uvm_gpu_chunk_t *subchunk = chunk->suballoc->subchunks[i];
uvm_spin_lock(&pmm->list_lock);
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT) {
uvm_spin_unlock(&pmm->list_lock);
evict_orphan_pages(pmm, subchunk);
continue;
}
if (subchunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED && subchunk->is_referenced) {
uvm_spin_unlock(&pmm->list_lock);
uvm_hmm_pmm_gpu_evict_chunk(uvm_pmm_to_gpu(pmm), subchunk);
continue;
}
uvm_spin_unlock(&pmm->list_lock);
}
}
void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm)
{
size_t i;
if (!pmm->initialized)
return;
// Scan all the root chunks looking for subchunks which are still
// referenced. This is slow, but we only do it when unregistering a GPU,
// so it is not critical for performance.
for (i = 0; i < pmm->root_chunks.count; i++) {
uvm_gpu_root_chunk_t *root_chunk = &pmm->root_chunks.array[i];
root_chunk_lock(pmm, root_chunk);
if (root_chunk->chunk.state == UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
evict_orphan_pages(pmm, &root_chunk->chunk);
root_chunk_unlock(pmm, root_chunk);
}
}
static void devmem_page_free(struct page *page)
{
uvm_pmm_gpu_t *pmm = devmem_page_to_pmm(page);
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
uvm_gpu_chunk_t *chunk;
page->zone_device_data = NULL;
// We should be calling free_chunk() except that it acquires a mutex and
// we may be in an interrupt context where we can't do that. Instead,
// do a lazy free. Note that we have to use a "normal" spin lock because
// the UVM context is not available.
spin_lock(&pmm->list_lock.lock);
chunk = devmem_page_to_chunk_locked(page);
UVM_ASSERT(chunk->is_referenced);
chunk->is_referenced = false;
list_add_tail(&chunk->list, &pmm->root_chunks.va_block_lazy_free);
spin_unlock(&pmm->list_lock.lock);
nv_kthread_q_schedule_q_item(&gpu->parent->lazy_free_q,
&pmm->root_chunks.va_block_lazy_free_q_item);
}
// This is called by HMM when the CPU faults on a ZONE_DEVICE private entry.
static vm_fault_t devmem_fault(struct vm_fault *vmf)
{
uvm_va_space_t *va_space = vmf->page->zone_device_data;
if (!va_space)
return VM_FAULT_SIGBUS;
return uvm_va_space_cpu_fault_hmm(va_space, vmf->vma, vmf);
}
static vm_fault_t devmem_fault_entry(struct vm_fault *vmf)
{
UVM_ENTRY_RET(devmem_fault(vmf));
}
static const struct dev_pagemap_ops uvm_pmm_devmem_ops =
{
.page_free = devmem_page_free,
.migrate_to_ram = devmem_fault_entry,
};
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
{
unsigned long size = pmm->root_chunks.count * UVM_CHUNK_SIZE_MAX;
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
struct resource *res;
void *ptr;
NV_STATUS status;
if (!uvm_hmm_is_enabled_system_wide()) {
devmem->pagemap.owner = NULL;
return NV_OK;
}
res = request_free_mem_region(&iomem_resource, size, "nvidia-uvm-hmm");
if (IS_ERR(res)) {
UVM_ERR_PRINT("request_free_mem_region() err %ld\n", PTR_ERR(res));
status = errno_to_nv_status(PTR_ERR(res));
goto err;
}
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
devmem->pagemap.range.start = res->start;
devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
devmem->pagemap.ops = &uvm_pmm_devmem_ops;
devmem->pagemap.owner = &g_uvm_global;
// Numa node ID doesn't matter for ZONE_DEVICE private pages.
ptr = memremap_pages(&devmem->pagemap, NUMA_NO_NODE);
if (IS_ERR(ptr)) {
UVM_ERR_PRINT("memremap_pages() err %ld\n", PTR_ERR(ptr));
status = errno_to_nv_status(PTR_ERR(ptr));
goto err_release;
}
return NV_OK;
err_release:
release_mem_region(res->start, resource_size(res));
err:
devmem->pagemap.owner = NULL;
return status;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
{
uvm_pmm_gpu_devmem_t *devmem = &pmm->devmem;
if (!devmem->pagemap.owner)
return;
memunmap_pages(&devmem->pagemap);
release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
}
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk)
{
return (pmm->devmem.pagemap.range.start + chunk->address) >> PAGE_SHIFT;
}
#endif // UVM_IS_CONFIG_HMM()
#if !UVM_IS_CONFIG_HMM()
static NV_STATUS devmem_init(uvm_pmm_gpu_t *pmm)
{
return NV_OK;
}
static void devmem_deinit(uvm_pmm_gpu_t *pmm)
{
}
#endif // UVM_IS_CONFIG_HMM()
static void process_lazy_free(uvm_pmm_gpu_t *pmm)
{
uvm_gpu_chunk_t *chunk;
uvm_spin_lock(&pmm->list_lock);
// Note: We can't use list_for_each_safe_entry() because we drop the lock
// in the loop. Instead, just keep removing the first entry until the list
// is empty.
while (!list_empty(&pmm->root_chunks.va_block_lazy_free)) {
chunk = list_first_entry(&pmm->root_chunks.va_block_lazy_free, uvm_gpu_chunk_t, list);
list_del_init(&chunk->list);
uvm_spin_unlock(&pmm->list_lock);
free_chunk(pmm, chunk);
uvm_spin_lock(&pmm->list_lock);
}
uvm_spin_unlock(&pmm->list_lock);
}
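The drain above follows a common kernel pattern: detach one entry while holding the lock, drop the lock for the heavyweight free, then re-acquire and re-check. A self-contained user-space sketch of the same pattern, with a POSIX mutex standing in for the UVM spin lock:

```c
#include <assert.h>
#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void drain(void)
{
    pthread_mutex_lock(&lock);
    while (head) {
        struct node *n = head;
        head = n->next;              // detach, like list_del_init()
        pthread_mutex_unlock(&lock); // drop the lock for the expensive work
        free(n);                     // free_chunk() stand-in, may block
        pthread_mutex_lock(&lock);   // re-acquire and re-check the list
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    for (int i = 0; i < 4; i++) {
        struct node *n = malloc(sizeof(*n));
        n->next = head;
        head = n;
    }
    drain();
    assert(head == NULL);
    return 0;
}
```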
static void process_lazy_free_entry(void *args)
{
UVM_ENTRY_VOID(process_lazy_free(args));
}
NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
{
uvm_gpu_t *gpu = uvm_pmm_to_gpu(pmm);
@ -3279,6 +3571,8 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
}
INIT_LIST_HEAD(&pmm->root_chunks.va_block_used);
INIT_LIST_HEAD(&pmm->root_chunks.va_block_unused);
INIT_LIST_HEAD(&pmm->root_chunks.va_block_lazy_free);
nv_kthread_q_item_init(&pmm->root_chunks.va_block_lazy_free_q_item, process_lazy_free_entry, pmm);
uvm_mutex_init(&pmm->lock, UVM_LOCK_ORDER_PMM);
uvm_init_rwsem(&pmm->pma_lock, UVM_LOCK_ORDER_PMM_PMA);
@ -3354,6 +3648,10 @@ NV_STATUS uvm_pmm_gpu_init(uvm_pmm_gpu_t *pmm)
}
}
status = devmem_init(pmm);
if (status != NV_OK)
goto cleanup;
return NV_OK;
cleanup:
uvm_pmm_gpu_deinit(pmm);
@ -3387,9 +3685,11 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
if (!pmm->initialized)
return;
gpu = uvm_pmm_to_gpu(pmm);
nv_kthread_q_flush(&gpu->parent->lazy_free_q);
UVM_ASSERT(list_empty(&pmm->root_chunks.va_block_lazy_free));
release_free_root_chunks(pmm);
gpu = uvm_pmm_to_gpu(pmm);
if (gpu->mem_info.size != 0 && gpu_supports_pma_eviction(gpu))
nvUvmInterfacePmaUnregisterEvictionCallbacks(pmm->pma);
@ -3425,6 +3725,8 @@ void uvm_pmm_gpu_deinit(uvm_pmm_gpu_t *pmm)
deinit_caches(pmm);
devmem_deinit(pmm);
pmm->initialized = false;
}

View File

@ -59,6 +59,9 @@
#include "uvm_linux.h"
#include "uvm_types.h"
#include "nv_uvm_types.h"
#if UVM_IS_CONFIG_HMM()
#include <linux/memremap.h>
#endif
typedef enum
{
@ -195,7 +198,35 @@ typedef uvm_chunk_size_t uvm_chunk_sizes_mask_t;
typedef struct uvm_pmm_gpu_chunk_suballoc_struct uvm_pmm_gpu_chunk_suballoc_t;
typedef struct uvm_gpu_chunk_struct uvm_gpu_chunk_t;
#if UVM_IS_CONFIG_HMM()
typedef struct uvm_pmm_gpu_struct uvm_pmm_gpu_t;
typedef struct
{
struct dev_pagemap pagemap;
} uvm_pmm_gpu_devmem_t;
// Return the GPU chunk for a given device private struct page.
uvm_gpu_chunk_t *uvm_pmm_devmem_page_to_chunk(struct page *page);
// Return the GPU id for a given device private struct page.
uvm_gpu_id_t uvm_pmm_devmem_page_to_gpu_id(struct page *page);
// Return the PFN of the device private struct page for the given GPU chunk.
unsigned long uvm_pmm_gpu_devmem_get_pfn(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Free any orphan pages.
// This should be called as part of removing a GPU: after all work is stopped
// and all va_blocks have been destroyed. There normally won't be any
// device private struct page references left but there can be cases after
// fork() where a child process still holds a reference. This function searches
// for pages that still have a reference and migrates the page to the GPU in
// order to release the reference in the CPU page table.
void uvm_pmm_gpu_free_orphan_pages(uvm_pmm_gpu_t *pmm);
#endif
struct uvm_gpu_chunk_struct
{
// Physical address of GPU chunk. This may be removed to save memory
@ -226,6 +257,10 @@ struct uvm_gpu_chunk_struct
// chunk.
bool is_zero : 1;
// This flag indicates an allocated chunk is referenced by a device
// private struct page PTE and therefore expects a page_free() callback.
bool is_referenced : 1;
uvm_pmm_gpu_chunk_state_t state : order_base_2(UVM_PMM_GPU_CHUNK_STATE_COUNT + 1);
size_t log2_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);
@ -309,7 +344,7 @@ typedef struct
atomic64_t map_count;
} uvm_gpu_root_chunk_indirect_peer_t;
typedef struct
typedef struct uvm_pmm_gpu_struct
{
// Sizes of the MMU
uvm_chunk_sizes_mask_t chunk_sizes[UVM_PMM_GPU_MEMORY_TYPE_COUNT];
@ -348,9 +383,19 @@ typedef struct
// List of root chunks used by VA blocks
struct list_head va_block_used;
// List of chunks needing to be lazily freed and a queue for processing
// the list. TODO: Bug 3881835: revisit whether to use nv_kthread_q_t
// or workqueue.
struct list_head va_block_lazy_free;
nv_kthread_q_item_t va_block_lazy_free_q_item;
uvm_gpu_root_chunk_indirect_peer_t indirect_peer[UVM_ID_MAX_GPUS];
} root_chunks;
#if UVM_IS_CONFIG_HMM()
uvm_pmm_gpu_devmem_t devmem;
#endif
// Lock protecting PMA allocation, freeing and eviction
uvm_rw_semaphore_t pma_lock;
@ -410,16 +455,17 @@ struct page *uvm_gpu_chunk_to_page(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk);
// Allocates num_chunks chunks of size chunk_size in caller-supplied array (chunks).
//
// Returned chunks are in the TEMP_PINNED state, requiring a call to either
// uvm_pmm_gpu_unpin_temp or uvm_pmm_gpu_free. If a tracker is passed in, all
// uvm_pmm_gpu_unpin_allocated, uvm_pmm_gpu_unpin_referenced, or
// uvm_pmm_gpu_free. If a tracker is passed in, all
// the pending operations on the allocated chunks will be added to it
// guaranteeing that all the entries come from the same GPU as the PMM.
// Otherwise, when tracker is NULL, all the pending operations will be
// synchronized before returning to the caller.
//
// Each of the allocated chunks list nodes (uvm_gpu_chunk_t::list) can be used
// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_temp) or freed
// (uvm_pmm_gpu_free). If used, the list node has to be returned to a valid
// state before calling either of the APIs.
// by the caller until the chunk is unpinned (uvm_pmm_gpu_unpin_allocated,
// uvm_pmm_gpu_unpin_referenced) or freed (uvm_pmm_gpu_free). If used, the list
// node has to be returned to a valid state before calling either of the APIs.
//
// In case of an error, the chunks array is guaranteed to be cleared.
NV_STATUS uvm_pmm_gpu_alloc(uvm_pmm_gpu_t *pmm,
@ -459,10 +505,17 @@ static NV_STATUS uvm_pmm_gpu_alloc_user(uvm_pmm_gpu_t *pmm,
return uvm_pmm_gpu_alloc(pmm, num_chunks, chunk_size, UVM_PMM_GPU_MEMORY_TYPE_USER, flags, chunks, out_tracker);
}
// Unpin a temporarily pinned chunk and set its reverse map to a VA block
// Unpin a temporarily pinned chunk, set its reverse map to a VA block, and
// mark it as allocated.
//
// Can only be used on user memory.
void uvm_pmm_gpu_unpin_temp(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
void uvm_pmm_gpu_unpin_allocated(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
// Unpin a temporarily pinned chunk, set its reverse map to a VA block, and
// mark it as referenced.
//
// Can only be used on user memory.
void uvm_pmm_gpu_unpin_referenced(uvm_pmm_gpu_t *pmm, uvm_gpu_chunk_t *chunk, uvm_va_block_t *va_block);
// Frees the chunk. This also unpins the chunk if it is temporarily pinned.
//

File diff suppressed because it is too large

View File

@ -28,6 +28,7 @@
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_lock.h"
#include "uvm_pmm_gpu.h"
// Module to handle per-GPU user mappings to sysmem physical memory. Notably,
// this implements a reverse map of the DMA address to {va_block, virt_addr}.
@ -176,17 +177,25 @@ size_t uvm_pmm_sysmem_mappings_dma_to_virt(uvm_pmm_sysmem_mappings_t *sysmem_map
uvm_reverse_map_t *out_mappings,
size_t max_out_mappings);
#define UVM_CPU_CHUNK_SIZES PAGE_SIZE
#define UVM_CPU_CHUNK_SIZES (UVM_PAGE_SIZE_2M | UVM_PAGE_SIZE_64K | PAGE_SIZE)
#if UVM_CPU_CHUNK_SIZES == PAGE_SIZE
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 1
typedef struct page uvm_cpu_chunk_t;
typedef enum
{
UVM_CPU_CHUNK_ALLOC_FLAGS_NONE = 0,
#define UVM_CPU_CHUNK_PAGE_INDEX(chunk, page_index) (page_index)
// Zero the chunk.
UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO = (1 << 0),
#else
#define UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE() 0
typedef struct uvm_cpu_chunk_struct uvm_cpu_chunk_t;
// Account for the chunk in the cgroup context.
UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT = (1 << 1),
} uvm_cpu_chunk_alloc_flags_t;
typedef enum
{
UVM_CPU_CHUNK_TYPE_PHYSICAL,
UVM_CPU_CHUNK_TYPE_LOGICAL,
UVM_CPU_CHUNK_TYPE_HMM
} uvm_cpu_chunk_type_t;
// CPU memory chunk descriptor.
// CPU memory chunks represent a physically contiguous CPU memory
@ -197,6 +206,22 @@ typedef struct uvm_cpu_chunk_struct uvm_cpu_chunk_t;
// splitting are referred to as "logical chunks".
struct uvm_cpu_chunk_struct
{
uvm_cpu_chunk_type_t type:2;
// Size of the chunk.
// For chunks resulting from page allocations (physical chunks),
// this value is the size of the physical allocation.
size_t log2_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);
// Chunk reference count used when a CPU chunk is split. Each
// child sub-chunk will increment the reference count of its
// parent.
// The reference count is set to 1 when the chunk is created.
// This initial reference is dropped if the chunk is split in
// order to automatically destroy the chunk when all logical
// chunks resulting from the split are destroyed.
nv_kref_t refcount;
// Pointer to the CPU page backing this CPU chunk.
// For physical chunks, this will point to the head page. Physical
// chunk allocation will set the reference count for the struct
@ -209,100 +234,110 @@ struct uvm_cpu_chunk_struct
// reference counted, there is no need to take separate references
// to the struct page for logical chunks.
struct page *page;
};
// For logical chunks, this points to the parent chunk (which
// could also be a logical chunk). For physical chunks, this
// is NULL.
uvm_cpu_chunk_t *parent;
typedef struct
{
NvU64 dma_addr;
NvU32 map_count;
} uvm_cpu_phys_mapping_t;
// Page offset of this chunk within the physical size of
// the parent.
uvm_page_index_t offset;
typedef struct
{
uvm_cpu_chunk_t common;
// Region within the VA block covered by this CPU chunk.
uvm_va_block_region_t region;
// Lock protecting dirty_bitmap and gpu_mappings.
uvm_mutex_t lock;
// Chunk reference count used when a CPU chunk is split. Each
// child sub-chunk will increment the reference count of its
// parent.
nv_kref_t refcount;
// Size of the chunk.
// For chunks resulting from page allocations (physical chunks),
// this value is the size of the physical allocation.
size_t log2_size : order_base_2(UVM_CHUNK_SIZE_MASK_SIZE);
struct {
struct
{
// Per-GPU array of DMA mapping addresses for the chunk.
// The DMA mapping addresses for logical chunks are adjusted
// to the correct offset within the parent chunk.
union {
NvU64 static_entry;
NvU64 *dynamic_entries;
union
{
uvm_cpu_phys_mapping_t static_entry;
uvm_cpu_phys_mapping_t *dynamic_entries;
};
// Maximum number of physical mapping entries available.
// The initial value is 1 since the static_entry is always
// available.
// When using the dynamic_entries, it holds the size of the
// dynamic_entries array. This may be more than the number
// of GPUs with active mappings. The number of active entries
// is the number of set bits in dma_addrs_mask.
size_t max_entries;
// The set of GPU ID's that have an active physical mapping.
// Since physical mappings are shared by all GPUs under a
// parent GPU, this mask only needs to track uvm_parent_gpu_t.
uvm_processor_mask_t dma_addrs_mask;
} gpu_mappings;
// Lock protecting dirty_bitmap
uvm_spinlock_t lock;
// A dynamically allocated bitmap (one per PAGE_SIZE page) used
// to track dirty state of each PAGE_SIZE page.
// Dirty state is tracked only by physical chunks. Therefore,
// for logical chunks this will be NULL;
// Large CPU chunks are allocated as compound pages. For such
// pages, the kernel keeps dirtiness state with a single bit
// (in the compound page head) that covers the entire compound
// page.
//
// In the case of UVM-Lite GPUs, using the dirty bit of the
// the compound page will cause performance regression due to
// the copying of extra data. We mitigate this by using this
// bitmap to track which base pages are dirty.
unsigned long *dirty_bitmap;
};
#define UVM_CPU_CHUNK_PAGE_INDEX(chunk, page_index) (chunk->region.first)
#endif // UVM_CPU_CHUNK_SIZES == PAGE_SIZE
} uvm_cpu_physical_chunk_t;
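The gpu_mappings member of the physical chunk above keeps a single inline mapping entry and only switches to a heap-allocated array when more GPUs need distinct DMA addresses. A self-contained sketch of that small-array optimization; the real code's rule for telling the two representations apart is not shown in this diff, so the sketch simply treats `max_entries == 1` as "still using the inline slot", which is an assumption made for illustration:

```c
#include <assert.h>
#include <stdlib.h>

struct mapping { unsigned long dma_addr; unsigned map_count; };

struct gpu_mappings {
    union {
        struct mapping  static_entry;     // inline slot for the common case
        struct mapping *dynamic_entries;  // heap array once more slots are needed
    };
    size_t max_entries;                   // 1 while the inline slot is in use
};

static struct mapping *entry(struct gpu_mappings *m, size_t i)
{
    assert(i < m->max_entries);
    return m->max_entries == 1 ? &m->static_entry : &m->dynamic_entries[i];
}

static void grow(struct gpu_mappings *m, size_t new_max)
{
    struct mapping *arr = calloc(new_max, sizeof(*arr));

    arr[0] = m->static_entry;             // migrate the inline entry
    m->dynamic_entries = arr;
    m->max_entries = new_max;
}

int main(void)
{
    struct gpu_mappings m = { .max_entries = 1 };

    entry(&m, 0)->dma_addr = 0x1000;
    grow(&m, 4);                          // a second GPU needs its own slot
    entry(&m, 3)->dma_addr = 0x2000;

    assert(entry(&m, 0)->dma_addr == 0x1000);
    assert(entry(&m, 3)->dma_addr == 0x2000);
    free(m.dynamic_entries);
    return 0;
}
```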
typedef struct
{
uvm_cpu_chunk_t common;
// Pointer to the parent chunk (which could also be a logical chunk).
uvm_cpu_chunk_t *parent;
uvm_processor_mask_t mapped_gpus;
} uvm_cpu_logical_chunk_t;
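The dirty_bitmap field of the physical chunk above compensates for compound pages having only one kernel dirty bit: UVM tracks dirtiness per PAGE_SIZE page itself so that UVM-Lite copies only the pages that actually changed. A self-contained sketch of that bookkeeping (the 2 MiB chunk and 4 KiB page sizes are illustrative):

```c
#include <assert.h>
#include <limits.h>
#include <stdlib.h>

#define PAGE_SIZE_SKETCH 4096ul
#define BITS_PER_LONG    (sizeof(unsigned long) * CHAR_BIT)

int main(void)
{
    unsigned long chunk_size = 2ul << 20;                     // 2 MiB physical chunk
    unsigned long num_pages  = chunk_size / PAGE_SIZE_SKETCH;  // 512 tracked pages
    unsigned long *dirty     = calloc((num_pages + BITS_PER_LONG - 1) / BITS_PER_LONG,
                                      sizeof(unsigned long));

    // Mark page 37 dirty, then query it (set_bit()/test_bit() equivalents).
    dirty[37 / BITS_PER_LONG] |= 1ul << (37 % BITS_PER_LONG);
    assert(dirty[37 / BITS_PER_LONG] & (1ul << (37 % BITS_PER_LONG)));
    assert(!(dirty[38 / BITS_PER_LONG] & (1ul << (38 % BITS_PER_LONG))));

    free(dirty);
    return 0;
}
```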
// Return the set of allowed CPU chunk allocation sizes.
uvm_chunk_sizes_mask_t uvm_cpu_chunk_get_allocation_sizes(void);
// Allocate a physical CPU chunk for the specified page index and owned by
// va_block.
// Allocate a physical CPU chunk of the specified size.
//
// The size of the allocated CPU chunk may be any of the allowed sizes and
// depends on several factors:
// * Allocation will be attempted in reverse order - highest to lowest - in
// order to ensure that the highest possible size is used.
// * An allocation size will be used if:
// - the VA region within the block covered by the allocation size is
// aligned to that allocation size,
// - the VA block region corresponding to the allocation size is empty
// (has no previously populated pages), and
// - the system allows a page allocation of that size.
//
// If mm is not NULL, the chunk's memory will be added to the mm's memory cgroup.
//
// If a CPU chunk allocation succeeds, NV_OK is returned. If new_chunk is not
// NULL it will be set to point to the newly allocated chunk. On failure,
// NV_ERR_NO_MEMORY is returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct mm_struct *mm,
// If a CPU chunk allocation succeeds, NV_OK is returned. new_chunk will be set
// to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
NV_STATUS uvm_cpu_chunk_alloc(uvm_chunk_size_t alloc_size,
uvm_cpu_chunk_alloc_flags_t flags,
uvm_cpu_chunk_t **new_chunk);
// Insert a CPU chunk in the va_block's storage structures.
// Allocate a HMM CPU chunk.
//
// On success, NV_OK is returned. On error,
// - NV_ERR_NO_MEMORY is returned if memory allocation for any of the internal
// structures did not succeed.
// - NV_ERR_INVALID_ARGUMENT is returned if the size of the chunk to be inserted
// is invalid.
// - NV_ERR_INVALID_STATE is returned if a matching chunk already exists in the
// block.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// HMM chunks differ from normal CPU chunks in that the kernel has already
// allocated the page for them. This means we don't allocate any CPU memory
// here. It also means the kernel holds the reference to the page, so we
// shouldn't call put_page() when freeing the chunk.
//
// If a CPU chunk allocation succeeds NV_OK is returned and new_chunk will be
// set to point to the newly allocated chunk. On failure, NV_ERR_NO_MEMORY is
// returned.
//
// Note that the kernel retains logical ownership of the page. This means page
// properties should not be directly modified by UVM. In particular page flags
// such as PageDirty should not be modified by UVM, nor can UVM directly free
// the page. The kernel is also responsible for mapping/unmapping the page on
// the CPU. We create a CPU chunk for the page primarily to allow GPU mappings
// for the page to be created.
NV_STATUS uvm_cpu_chunk_alloc_hmm(struct page *page,
uvm_cpu_chunk_t **new_chunk);
// Remove a CPU chunk from the va_block's storage structures.
// The chunk is not freed, only removed from the block's storage structures.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Convert a physical chunk to an HMM chunk.
static void uvm_cpu_chunk_make_hmm(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
// Return the CPU chunk backing page_index within the VA block.
// If page_index is beyond the boundary of the VA block or a CPU chunk for
// the specified page has not been allocated and/or inserted into the block,
// NULL is returned.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *block, uvm_page_index_t page_index);
chunk->type = UVM_CPU_CHUNK_TYPE_HMM;
}
uvm_chunk_size_t uvm_cpu_chunk_get_size(uvm_cpu_chunk_t *chunk);
@ -313,158 +348,105 @@ static size_t uvm_cpu_chunk_num_pages(uvm_cpu_chunk_t *chunk)
return uvm_cpu_chunk_get_size(chunk) / PAGE_SIZE;
}
static inline bool uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_t *chunk)
{
return chunk->type == UVM_CPU_CHUNK_TYPE_HMM;
}
static bool uvm_cpu_chunk_is_physical(uvm_cpu_chunk_t *chunk)
{
#if UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
return true;
#else
return chunk->parent == NULL;
#endif
return (chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL || uvm_cpu_chunk_is_hmm(chunk));
}
// Return a pointer to the struct page backing page_index within the owning
// VA block.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
static bool uvm_cpu_chunk_is_logical(uvm_cpu_chunk_t *chunk)
{
return chunk->type == UVM_CPU_CHUNK_TYPE_LOGICAL;
}
// Take a reference to the CPU chunk.
void uvm_cpu_chunk_get(uvm_cpu_chunk_t *chunk);
static uvm_cpu_physical_chunk_t *uvm_cpu_chunk_to_physical(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(uvm_cpu_chunk_is_physical(chunk));
return container_of((chunk), uvm_cpu_physical_chunk_t, common);
}
// Release a reference to the CPU chunk. When the reference count
// drops to zero, the CPU chunk will be freed. Physical CPU chunks
// will also free the CPU pages backing the chunk.
void uvm_cpu_chunk_put(uvm_cpu_chunk_t *chunk);
static uvm_cpu_logical_chunk_t *uvm_cpu_chunk_to_logical(uvm_cpu_chunk_t *chunk)
{
UVM_ASSERT(uvm_cpu_chunk_is_logical(chunk));
return container_of((chunk), uvm_cpu_logical_chunk_t, common);
}
NV_STATUS uvm_cpu_chunk_gpu_mapping_alloc(uvm_va_block_t *va_block, uvm_gpu_id_t id);
void uvm_cpu_chunk_gpu_mapping_split(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t id);
void uvm_cpu_chunk_gpu_mapping_free(uvm_va_block_t *va_block, uvm_gpu_id_t id);
// Free a CPU chunk.
// This may not result in the immediate freeing of the physical pages of the
// chunk if this is a logical chunk and there are other logical chunks holding
// references to the physical chunk.
// If any DMA mappings to this chunk are still active, they are implicitly
// destroyed.
void uvm_cpu_chunk_free(uvm_cpu_chunk_t *chunk);
// Set the CPU chunk's DMA mapping address for the specified GPU ID.
NV_STATUS uvm_cpu_chunk_set_gpu_mapping_addr(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_cpu_chunk_t *chunk,
uvm_gpu_id_t id,
NvU64 dma_addr);
// In some configurations such as SR-IOV heavy, a CPU chunk cannot be
// referenced using its physical address. There needs to be a kernel virtual
// mapping created.
//
// This helper function creates a DMA mapping on the GPU (see
// uvm_cpu_chunk_map_gpu()) and if necessary a kernel virtual mapping for the
// chunk. The virtual mapping persists until GPU deinitialization, such that no
// unmap functionality is exposed. For more details see uvm_mmu_sysmem_map().
//
// Note that unlike uvm_cpu_chunk_map_gpu(), this helper requires the GPU
// object instead of the parent GPU object.
NV_STATUS uvm_cpu_chunk_map_gpu(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu);
// Destroy a CPU chunk's DMA mapping for the parent GPU.
// If chunk is a logical chunk, this call may not necessarily destroy the DMA
// mapping of the parent physical chunk since all logical chunks share the
// parent's DMA mapping.
void uvm_cpu_chunk_unmap_gpu_phys(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
// Get the CPU chunk's DMA mapping address for the specified GPU ID.
NvU64 uvm_cpu_chunk_get_gpu_mapping_addr(uvm_va_block_t *block,
uvm_page_index_t page_index,
uvm_cpu_chunk_t *chunk,
uvm_gpu_id_t id);
// If there is no mapping for the GPU, 0 is returned.
NvU64 uvm_cpu_chunk_get_gpu_phys_addr(uvm_cpu_chunk_t *chunk, uvm_parent_gpu_t *parent_gpu);
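//
// Usage sketch (illustrative only; "chunk" and "gpu" are assumed to be a
// valid CPU chunk and a registered GPU, respectively):
//
//     if (uvm_cpu_chunk_map_gpu(chunk, gpu) == NV_OK) {
//         NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
//
//         // ... use dma_addr to build GPU-side accesses ...
//
//         uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
//     }
//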
#if !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
// Split a CPU chunk into a set of CPU chunks of size new_size.
// new_size has to be one of the supported CPU chunk allocation sizes and has to
// be smaller than the current size of chunk.
// Split a CPU chunk into a set of CPU chunks of the next size down from the set
// of enabled CPU chunk sizes.
//
// On success, NV_OK is returned. On failure NV_ERR_NO_MEMORY will be returned.
NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_chunk_size_t new_size,
uvm_page_index_t page_index,
uvm_cpu_chunk_t **new_chunks);
// This function expects that the chunk to be split is larger than the minimum
// enabled chunk size and that new_chunks has enough space for all chunks
// resulting from the split.
//
// On success, NV_OK is returned and the caller-provided new_chunks array will
// be filled out with the newly-created logical chunks.
//
// After a successful split, the input chunk can no longer be used.
//
// On failure NV_ERR_NO_MEMORY will be returned.
//
// Should never be called for HMM chunks as these don't need splitting (they can
// only be PAGE_SIZE) and even if larger chunks could exist UVM could not split
// them without kernel interaction which currently isn't exported. Will return
// NV_ERR_INVALID_ARGUMENT for an HMM chunk.
// TODO: Bug 3368756: add support for transparent huge page (THP)
NV_STATUS uvm_cpu_chunk_split(uvm_cpu_chunk_t *chunk, uvm_cpu_chunk_t **new_chunks);
// Merge chunks to merge_size.
//
// All input chunks must have the same parent and size. If not,
// NV_ERR_INVALID_ARGUMENT is returned.
//
// If a merge cannot be done, NV_WARN_NOTHING_TO_DO is returned.
//
// On success, NV_OK is returned and merged_chunk is set to point to the
// merged chunk.
NV_STATUS uvm_cpu_chunk_merge(uvm_va_block_t *va_block,
uvm_cpu_chunk_t **chunks,
size_t num_merge_chunks,
uvm_chunk_size_t merge_size,
uvm_cpu_chunk_t **merged_chunk);
// Merge an array of logical chunks into their parent chunk. All chunks have to
// have the same size, parent, and set of mapped GPUs.
uvm_cpu_chunk_t *uvm_cpu_chunk_merge(uvm_cpu_chunk_t **chunks);
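//
// Split/merge round trip, as a sketch (it mirrors the pattern used by the
// uvm_pmm_sysmem tests below and assumes "chunk" is larger than the smallest
// enabled chunk size; error handling is trimmed for brevity):
//
//     uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
//     uvm_chunk_size_t split_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), size);
//     size_t num_split_chunks = size / split_size;
//     uvm_cpu_chunk_t **split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
//
//     if (split_chunks && uvm_cpu_chunk_split(chunk, split_chunks) == NV_OK) {
//         uvm_cpu_chunk_t *merged_chunk = uvm_cpu_chunk_merge(split_chunks);
//         UVM_ASSERT(merged_chunk == chunk);
//     }
//
//     uvm_kvfree(split_chunks);
//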
// Mark the CPU sub-page page_index in the CPU chunk as dirty.
// page_index has to be a page within the chunk's region.
// Mark the page_index sub-page of the chunk as dirty.
// page_index is an offset into the chunk.
//
// Note that dirty status for HMM chunks should not be modified directly from
// UVM. Instead the kernel will mark the backing struct pages dirty either on
// fault when written to from the CPU, or when the PTE is mirrored to the GPU
// using hmm_range_fault().
void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Mark the CPU sub-pages page_index in the CPU chunk as clean.
// page_index has to be a page within the chunk's region.
// Mark the page_index sub-page of the chunk as clean.
// page_index is an offset into the chunk.
void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
// Return true if the CPU sub-pages page_index in the CPU chunk are dirty.
// page_index has to be a page within the chunk's region.
// Return true if the page_index base page of the CPU chunk is dirty.
bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index);
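//
// Minimal sketch of the dirty-tracking calls above, applied to base page 0
// of an already allocated "chunk":
//
//     uvm_cpu_chunk_mark_dirty(chunk, 0);
//     UVM_ASSERT(uvm_cpu_chunk_is_dirty(chunk, 0));
//     uvm_cpu_chunk_mark_clean(chunk, 0);
//     UVM_ASSERT(!uvm_cpu_chunk_is_dirty(chunk, 0));
//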
#else // UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
static NV_STATUS uvm_cpu_chunk_split(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_chunk_size_t new_size,
uvm_page_index_t page_index,
uvm_cpu_chunk_t **new_chunks)
{
return NV_OK;
}
static NV_STATUS uvm_cpu_chunk_merge(uvm_va_block_t *va_block,
uvm_cpu_chunk_t **chunk,
size_t num_merge_chunks,
uvm_chunk_size_t merge_size,
uvm_cpu_chunk_t **merged_chunk)
{
return NV_WARN_NOTHING_TO_DO;
}
static void uvm_cpu_chunk_mark_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
SetPageDirty(chunk);
}
static void uvm_cpu_chunk_mark_clean(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
ClearPageDirty(chunk);
}
static bool uvm_cpu_chunk_is_dirty(uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
return PageDirty(chunk);
}
#endif // !UVM_CPU_CHUNK_SIZE_IS_PAGE_SIZE()
// Return the first CPU chunk in the block. If no CPU chunks have been
// allocated and/or inserted into the block, NULL is returned.
// If not NULL, page_index will be set to the first page of the block covered by
// the returned chunk.
uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_block(uvm_va_block_t *va_block, uvm_page_index_t *out_page_index);
// Return the next CPU chunk in the block owning chunk.
// previous_page_index is the index after which to start searching. Its value
// will be updated with the starting page index of the next chunk in the block.
uvm_cpu_chunk_t *uvm_cpu_chunk_next(uvm_va_block_t *va_block, uvm_page_index_t *previous_page_index);
#define for_each_cpu_chunk_in_block(chunk, page_index, va_block) \
for ((chunk) = uvm_cpu_chunk_first_in_block((va_block), &(page_index)); \
(chunk) != NULL; \
(page_index) += uvm_cpu_chunk_num_pages(chunk) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)))
#define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block) \
for ((chunk) = uvm_cpu_chunk_first_in_block((va_block), &(page_index)), \
(next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages(chunk) : 0); \
(chunk) != NULL; \
(page_index) = (next_page_index) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)), \
(next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages(chunk) : 0))
// Use a special symbol for the region so it does not replace the chunk's region
// structure member.
#define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, __region) \
for ((page_index) = uvm_va_block_first_page_in_mask((__region), &(va_block)->cpu.allocated), \
(chunk) = uvm_cpu_chunk_get_chunk_for_page((va_block), (page_index)); \
(chunk) != NULL && page_index < (__region).outer; \
(page_index) += uvm_cpu_chunk_num_pages(chunk) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)))
#define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, __region) \
for ((page_index) = uvm_va_block_first_page_in_mask((__region), &(va_block)->cpu.allocated), \
(chunk) = uvm_cpu_chunk_get_chunk_for_page((va_block), (page_index)), \
(next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0); \
(chunk) != NULL && page_index < (__region).outer; \
(page_index) = (next_page_index) - 1, (chunk) = uvm_cpu_chunk_next((va_block), &(page_index)), \
(next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0))
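//
// Iteration sketch (illustrative only; holding the va_block lock is an
// assumption made by this example, matching how the tests below use the
// per-block accessors):
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//     size_t total_pages = 0;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block)
//         total_pages += uvm_cpu_chunk_num_pages(chunk);
//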
static NV_STATUS uvm_test_get_cpu_chunk_allocation_sizes(UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES_PARAMS *params,
struct file *filp)
{

View File

@ -30,6 +30,10 @@
#include "uvm_va_block.h"
#include "uvm_va_range.h"
#include "uvm_va_space.h"
#include "uvm_kvmalloc.h"
#include "uvm_hal.h"
#include "uvm_push.h"
#include "uvm_processors.h"
// Pre-allocated array used for dma-to-virt translations
static uvm_reverse_map_t g_sysmem_translations[PAGES_PER_UVM_VA_BLOCK];
@ -576,3 +580,640 @@ NV_STATUS uvm_test_pmm_sysmem(UVM_TEST_PMM_SYSMEM_PARAMS *params, struct file *f
return status;
}
static NV_STATUS cpu_chunk_map_on_cpu(uvm_cpu_chunk_t *chunk, void **cpu_addr)
{
struct page **pages;
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
size_t num_pages = uvm_cpu_chunk_num_pages(chunk);
NV_STATUS status = NV_OK;
UVM_ASSERT(cpu_addr);
// Map the CPU chunk on the CPU.
if (chunk_size > PAGE_SIZE) {
size_t i;
pages = uvm_kvmalloc(num_pages * sizeof(*pages));
if (!pages)
return NV_ERR_NO_MEMORY;
for (i = 0; i < num_pages; i++)
pages[i] = chunk->page + i;
}
else {
pages = &chunk->page;
}
*cpu_addr = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
if (!*cpu_addr)
status = NV_ERR_NO_MEMORY;
if (chunk_size > PAGE_SIZE)
uvm_kvfree(pages);
return status;
}
static NV_STATUS test_cpu_chunk_mapping_access(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
NvU64 dma_addr;
uvm_gpu_phys_address_t gpu_phys_addr;
uvm_gpu_address_t gpu_addr;
uvm_push_t push;
NvU32 *cpu_addr;
uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
size_t i;
NV_STATUS status = NV_OK;
TEST_NV_CHECK_RET(cpu_chunk_map_on_cpu(chunk, (void **)&cpu_addr));
memset(cpu_addr, 0, chunk_size);
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
gpu_phys_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
if (uvm_mmu_gpu_needs_dynamic_sysmem_mapping(gpu))
gpu_addr = uvm_gpu_address_virtual_from_sysmem_phys(gpu, gpu_phys_addr.address);
else
gpu_addr = uvm_gpu_address_from_phys(gpu_phys_addr);
TEST_NV_CHECK_GOTO(uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
NULL,
&push,
"GPU -> CPU {%s, %llx} %u bytes",
uvm_gpu_address_aperture_string(gpu_addr),
gpu_addr.address,
chunk_size),
done);
gpu->parent->ce_hal->memset_4(&push, gpu_addr, 0xdeadc0de, chunk_size);
TEST_NV_CHECK_GOTO(uvm_push_end_and_wait(&push), done);
for (i = 0; i < chunk_size / sizeof(*cpu_addr); i++) {
if (cpu_addr[i] != 0xdeadc0de) {
UVM_TEST_PRINT("GPU write of {%s, 0x%llx} %u bytes expected pattern 0x%08x, but offset %zu is 0x%08x\n",
uvm_gpu_address_aperture_string(gpu_addr),
gpu_addr.address,
chunk_size,
0xdeadc0de,
i * sizeof(*cpu_addr),
cpu_addr[i]);
status = NV_ERR_INVALID_STATE;
break;
}
}
done:
vunmap(cpu_addr);
return status;
}
static NV_STATUS test_cpu_chunk_alloc(uvm_chunk_size_t size,
uvm_cpu_chunk_alloc_flags_t flags,
uvm_cpu_chunk_t **out_chunk)
{
uvm_cpu_chunk_t *chunk;
NV_STATUS status = NV_OK;
size_t i;
UVM_ASSERT(out_chunk);
// It is possible that the allocation fails due to lack of large pages
// rather than an API issue, which will result in a false negative.
// However, that should be very rare.
TEST_NV_CHECK_RET(uvm_cpu_chunk_alloc(size, flags, &chunk));
// Check general state of the chunk:
// - chunk should be a physical chunk,
// - chunk should have the correct size,
// - chunk should have the correct number of base pages, and
// - if allocated with UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, its pages should be
// zero-filled and marked dirty (verified below).
TEST_CHECK_GOTO(uvm_cpu_chunk_is_physical(chunk), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(chunk) == size, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_num_pages(chunk) == size / PAGE_SIZE, done);
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO) {
NvU64 *cpu_addr;
TEST_NV_CHECK_GOTO(cpu_chunk_map_on_cpu(chunk, (void **)&cpu_addr), done);
for (i = 0; i < size / sizeof(*cpu_addr); i++)
TEST_CHECK_GOTO(cpu_addr[i] == 0, done);
vunmap(cpu_addr);
}
for (i = 0; i < size / PAGE_SIZE; i++) {
if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO)
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(chunk, i), done);
else
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(chunk, i), done);
}
done:
if (status == NV_OK)
*out_chunk = chunk;
else
uvm_cpu_chunk_free(chunk);
return status;
}
static NV_STATUS test_cpu_chunk_mapping_basic_verify(uvm_gpu_t *gpu,
uvm_cpu_chunk_alloc_flags_t flags,
uvm_chunk_size_t size)
{
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr;
NV_STATUS status = NV_OK;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, flags, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
// Check state of the physical chunk:
// - gpu_mappings.max_entries should be 1 (for the static entry),
// - gpu_mappings.dma_addrs_mask should be 0.
// - no GPU mapping address.
TEST_CHECK_GOTO(phys_chunk->gpu_mappings.max_entries == 1, done);
TEST_CHECK_GOTO(uvm_processor_mask_get_gpu_count(&phys_chunk->gpu_mappings.dma_addrs_mask) == 0, done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0, done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
// Test basic access.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
// Test double map is harmless.
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == dma_addr, done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
// Test unmap, remap.
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0, done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done);
done:
// Test free with mapped GPUs still works.
uvm_cpu_chunk_free(chunk);
return status;
}
static NV_STATUS test_cpu_chunk_mapping_basic(uvm_gpu_t *gpu, uvm_cpu_chunk_alloc_flags_t flags)
{
uvm_chunk_sizes_mask_t chunk_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t size;
for_each_chunk_size(size, chunk_sizes)
TEST_NV_CHECK_RET(test_cpu_chunk_mapping_basic_verify(gpu, flags, size));
return NV_OK;
}
static NV_STATUS test_cpu_chunk_mapping_array(uvm_gpu_t *gpu1, uvm_gpu_t *gpu2, uvm_gpu_t *gpu3)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_cpu_physical_chunk_t *phys_chunk;
NvU64 dma_addr_gpu2;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(PAGE_SIZE, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu3), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu3), done);
dma_addr_gpu2 = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu3->parent);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu1), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu2), done);
// DMA mapping addresses for different GPUs live in different IOMMU spaces,
// so it would be perfectly legal for them to have the same IOVA, and even
// if they lived in the same space we freed GPU3's address so it would be
// available for reuse.
// What we need to ensure is that GPU2's address didn't change after we map
// GPU1. It's true that we may get a false negative if both addresses
// happened to alias and we had a bug in how the addresses are shifted in
// the dense array, but that's better than intermittent failure.
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu2->parent) == dma_addr_gpu2, done);
done:
uvm_cpu_chunk_free(chunk);
return status;
}
static NV_STATUS do_test_cpu_chunk_split_and_merge(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t num_split_chunks;
uvm_cpu_chunk_t **split_chunks;
uvm_cpu_chunk_t *merged_chunk;
uvm_chunk_size_t split_size;
NvU64 phys_dma_addr;
size_t map_chunk;
size_t i;
split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
UVM_ASSERT(split_size != UVM_CHUNK_SIZE_INVALID);
num_split_chunks = size / split_size;
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
if (!split_chunks)
return NV_ERR_NO_MEMORY;
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(chunk, gpu), done_free);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(chunk, gpu), done_free);
uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
TEST_CHECK_GOTO(nv_kref_read(&chunk->refcount) == num_split_chunks, done);
for (i = 0; i < num_split_chunks; i++) {
TEST_CHECK_GOTO(split_chunks[i], done);
TEST_CHECK_GOTO(uvm_cpu_chunk_is_logical(split_chunks[i]), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[i]) == split_size, done);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu), done);
}
// Test CPU chunk merging.
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
// Since all logical chunks were mapped, the entire merged chunk should
// be accessible without needing to map it.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu), done_free);
// Test that GPU mappings are transferred after a split
phys_dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
for (i = 0; i < num_split_chunks; i++) {
NvU64 dma_addr;
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[i], gpu), done);
dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu->parent);
TEST_CHECK_GOTO(dma_addr == phys_dma_addr + (i * split_size), done);
uvm_cpu_chunk_unmap_gpu_phys(split_chunks[i], gpu->parent);
}
// Test that mapping one logical chunk does not affect others.
map_chunk = num_split_chunks / 2;
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[map_chunk], gpu), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(split_chunks[map_chunk], gpu), done);
for (i = 0; i < num_split_chunks; i++) {
if (i != map_chunk)
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[i], gpu->parent) == 0, done);
}
if (split_size > PAGE_SIZE) {
for (i = 0; i < num_split_chunks; i++)
TEST_NV_CHECK_GOTO(do_test_cpu_chunk_split_and_merge(split_chunks[i], gpu), done);
}
// Map all chunks before merging.
for (i = 0; i < num_split_chunks; i++)
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
// Test CPU chunk merging.
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
// At this point, all split chunks have been merged.
num_split_chunks = 0;
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(merged_chunk) == size, done_free);
TEST_CHECK_GOTO(merged_chunk == chunk, done_free);
// Since all logical chunks were mapped, the entire merged chunk should
// be accessible without needing to map it.
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_access(merged_chunk, gpu), done_free);
done:
for (i = 0; i < num_split_chunks; i++)
uvm_cpu_chunk_free(split_chunks[i]);
done_free:
uvm_kvfree(split_chunks);
return status;
}
static NV_STATUS test_cpu_chunk_split_and_merge(uvm_gpu_t *gpu)
{
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_chunk_size_t size;
size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
status = do_test_cpu_chunk_split_and_merge(chunk, gpu);
uvm_cpu_chunk_free(chunk);
if (status != NV_OK)
return status;
}
return NV_OK;
}
static NV_STATUS test_cpu_chunk_dirty_split(uvm_cpu_chunk_t *chunk)
{
uvm_chunk_size_t size = uvm_cpu_chunk_get_size(chunk);
uvm_chunk_size_t split_size;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
uvm_cpu_chunk_t **split_chunks;
uvm_cpu_chunk_t *merged_chunk;
size_t num_pages = size / PAGE_SIZE;
size_t num_split_chunks;
size_t num_split_chunk_pages;
size_t i;
NV_STATUS status = NV_OK;
split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
UVM_ASSERT(split_size != UVM_CHUNK_SIZE_INVALID);
num_split_chunks = size / split_size;
num_split_chunk_pages = split_size / PAGE_SIZE;
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
if (!split_chunks)
return NV_ERR_NO_MEMORY;
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
// The parent chunk had only the even pages set as dirty. Make sure
// that's still the case after the split.
for (i = 0; i < num_split_chunks; i++) {
uvm_page_index_t chunk_page;
for (chunk_page = 0; chunk_page < num_split_chunk_pages; chunk_page++) {
if (((i * num_split_chunk_pages) + chunk_page) % 2)
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(split_chunks[i], chunk_page), done);
else
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(split_chunks[i], chunk_page), done);
}
}
if (split_size > PAGE_SIZE) {
for (i = 0; i < num_split_chunks; i++)
TEST_NV_CHECK_GOTO(test_cpu_chunk_dirty_split(split_chunks[i]), done);
}
merged_chunk = uvm_cpu_chunk_merge(split_chunks);
num_split_chunks = 0;
for (i = 0; i < num_pages; i++) {
if (i % 2)
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(merged_chunk, i), done_free);
else
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(merged_chunk, i), done_free);
}
done:
for (i = 0; i < num_split_chunks; i++)
uvm_cpu_chunk_free(split_chunks[i]);
done_free:
uvm_kvfree(split_chunks);
return status;
}
static NV_STATUS test_cpu_chunk_dirty(uvm_gpu_t *gpu)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t *chunk;
uvm_chunk_size_t size;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t i;
for_each_chunk_size(size, alloc_sizes) {
uvm_cpu_physical_chunk_t *phys_chunk;
size_t num_pages;
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
for (i = 0; i < num_pages; i++)
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE)
TEST_CHECK_GOTO(bitmap_empty(phys_chunk->dirty_bitmap, num_pages), done);
uvm_cpu_chunk_free(chunk);
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO, &chunk));
phys_chunk = uvm_cpu_chunk_to_physical(chunk);
num_pages = uvm_cpu_chunk_num_pages(chunk);
// Allocating the chunk with UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO will set the
// entire chunk as dirty.
for (i = 0; i < num_pages; i++)
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE)
TEST_CHECK_GOTO(bitmap_full(phys_chunk->dirty_bitmap, num_pages), done);
// For chunks larger than PAGE_SIZE, marking individual pages in a
// physical chunk should not affect the entire chunk.
for (i = 0; i < num_pages; i++) {
uvm_cpu_chunk_mark_clean(chunk, i);
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE) {
TEST_CHECK_GOTO(bitmap_empty(phys_chunk->dirty_bitmap, i + 1), done);
TEST_CHECK_GOTO(bitmap_weight(phys_chunk->dirty_bitmap, num_pages) == num_pages - (i + 1), done);
}
}
for (i = 0; i < num_pages; i++) {
uvm_cpu_chunk_mark_dirty(chunk, i);
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE) {
TEST_CHECK_GOTO(bitmap_full(phys_chunk->dirty_bitmap, i + 1), done);
TEST_CHECK_GOTO(bitmap_weight(phys_chunk->dirty_bitmap, num_pages) == i + 1, done);
}
}
// Leave only even pages as dirty
for (i = 1; i < num_pages; i += 2)
uvm_cpu_chunk_mark_clean(chunk, i);
for (i = 0; i < num_pages; i++) {
if (i % 2) {
TEST_CHECK_GOTO(!uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE)
TEST_CHECK_GOTO(!test_bit(i, phys_chunk->dirty_bitmap), done);
}
else {
TEST_CHECK_GOTO(uvm_cpu_chunk_is_dirty(chunk, i), done);
if (size > PAGE_SIZE)
TEST_CHECK_GOTO(test_bit(i, phys_chunk->dirty_bitmap), done);
}
}
if (size > PAGE_SIZE)
TEST_NV_CHECK_GOTO(test_cpu_chunk_dirty_split(chunk), done);
done:
uvm_cpu_chunk_free(chunk);
if (status != NV_OK)
break;
}
return status;
}
NV_STATUS do_test_cpu_chunk_free(uvm_cpu_chunk_t *chunk, uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
{
NV_STATUS status = NV_OK;
uvm_cpu_chunk_t **split_chunks;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t size = uvm_cpu_chunk_get_size(chunk);
uvm_chunk_size_t split_size = uvm_chunk_find_prev_size(alloc_sizes, size);
size_t num_split_chunks = size / split_size;
uvm_gpu_t *gpu;
size_t i;
size_t j;
split_chunks = uvm_kvmalloc_zero(num_split_chunks * sizeof(*split_chunks));
if (!split_chunks) {
UVM_TEST_PRINT("Failed to allocate split chunk array memory");
status = NV_ERR_NO_MEMORY;
goto done_free;
}
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_split(chunk, split_chunks), done_free);
// The caller does not free the input chunk, so it has to be freed by this
// function. However, beyond this point the input chunk is freed implicitly
// by freeing the split chunks.
chunk = NULL;
// Map every other chunk.
// The call to uvm_cpu_chunk_unmap_gpu_phys() is here in case this is part
// of a double split (see below). In that case, the parent chunk would be
// either mapped or unmapped.
//
// If it is mapped, we have to unmap the subchunks in
// order for the mapping check below to succeed. If it is unmapped, the
// calls are noops.
for (i = 0; i < num_split_chunks; i++) {
for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) {
if (i & (1 << uvm_id_gpu_index(gpu->id)))
TEST_NV_CHECK_GOTO(uvm_cpu_chunk_map_gpu(split_chunks[i], gpu), done);
else
uvm_cpu_chunk_unmap_gpu_phys(split_chunks[i], gpu->parent);
}
}
// Do a double split if we can
if (split_size > PAGE_SIZE) {
size_t chunk_to_be_resplit;
// Test an even (mapped) chunk.
chunk_to_be_resplit = num_split_chunks / 2;
TEST_NV_CHECK_GOTO(do_test_cpu_chunk_free(split_chunks[chunk_to_be_resplit], va_space, test_gpus), done);
// The chunk would have been freed by do_test_cpu_chunk_free().
split_chunks[chunk_to_be_resplit] = NULL;
// Test an odd (unmapped) chunk.
chunk_to_be_resplit += 1;
TEST_NV_CHECK_GOTO(do_test_cpu_chunk_free(split_chunks[chunk_to_be_resplit], va_space, test_gpus), done);
split_chunks[chunk_to_be_resplit] = NULL;
}
for (i = 0; i < num_split_chunks; i++) {
if (!split_chunks[i])
continue;
uvm_cpu_chunk_free(split_chunks[i]);
split_chunks[i] = NULL;
for (j = i + 1; j < num_split_chunks; j++) {
if (!split_chunks[j])
continue;
TEST_CHECK_GOTO(uvm_cpu_chunk_is_logical(split_chunks[j]), done);
TEST_CHECK_GOTO(uvm_cpu_chunk_get_size(split_chunks[j]) == split_size, done);
for_each_va_space_gpu_in_mask(gpu, va_space, test_gpus) {
if (j & (1 << uvm_id_gpu_index(gpu->id)))
TEST_CHECK_GOTO(uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu->parent), done);
else
TEST_CHECK_GOTO(!uvm_cpu_chunk_get_gpu_phys_addr(split_chunks[j], gpu->parent), done);
}
}
}
done:
for (i = 0; i < num_split_chunks; i++) {
if (split_chunks[i])
uvm_cpu_chunk_free(split_chunks[i]);
}
done_free:
if (chunk)
uvm_cpu_chunk_free(chunk);
uvm_kvfree(split_chunks);
return status;
}
NV_STATUS test_cpu_chunk_free(uvm_va_space_t *va_space, uvm_processor_mask_t *test_gpus)
{
uvm_cpu_chunk_t *chunk;
uvm_chunk_sizes_mask_t alloc_sizes = uvm_cpu_chunk_get_allocation_sizes();
size_t size = uvm_chunk_find_next_size(alloc_sizes, PAGE_SIZE);
for_each_chunk_size_from(size, alloc_sizes) {
TEST_NV_CHECK_RET(test_cpu_chunk_alloc(size, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE, &chunk));
TEST_NV_CHECK_RET(do_test_cpu_chunk_free(chunk, va_space, test_gpus));
}
return NV_OK;
}
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp)
{
uvm_va_space_t *va_space = uvm_va_space_get(filp);
uvm_processor_mask_t test_gpus;
uvm_gpu_t *gpu;
NV_STATUS status = NV_OK;
uvm_va_space_down_read(va_space);
uvm_processor_mask_and(&test_gpus,
&va_space->registered_gpus,
&va_space->accessible_from[uvm_id_value(UVM_ID_CPU)]);
for_each_va_space_gpu_in_mask(gpu, va_space, &test_gpus) {
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_basic(gpu, UVM_CPU_CHUNK_ALLOC_FLAGS_NONE), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_basic(gpu, UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_split_and_merge(gpu), done);
TEST_NV_CHECK_GOTO(test_cpu_chunk_dirty(gpu), done);
}
TEST_NV_CHECK_GOTO(test_cpu_chunk_free(va_space, &test_gpus), done);
if (uvm_processor_mask_get_gpu_count(&test_gpus) >= 3) {
uvm_gpu_t *gpu2, *gpu3;
gpu = uvm_processor_mask_find_first_va_space_gpu(&test_gpus, va_space);
gpu2 = uvm_processor_mask_find_next_va_space_gpu(&test_gpus, va_space, gpu);
gpu3 = uvm_processor_mask_find_next_va_space_gpu(&test_gpus, va_space, gpu2);
TEST_NV_CHECK_GOTO(test_cpu_chunk_mapping_array(gpu, gpu2, gpu3), done);
}
done:
uvm_va_space_up_read(va_space);
return status;
}

View File

@ -141,7 +141,7 @@ static NV_STATUS split_span_as_needed(uvm_va_space_t *va_space,
return split_as_needed(va_space, end_addr, split_needed_cb, data);
}
static bool preferred_location_is_split_needed(uvm_va_policy_t *policy, void *data)
static bool preferred_location_is_split_needed(const uvm_va_policy_t *policy, void *data)
{
uvm_processor_id_t processor_id;
@ -152,12 +152,13 @@ static bool preferred_location_is_split_needed(uvm_va_policy_t *policy, void *da
}
static NV_STATUS preferred_location_unmap_remote_pages(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context)
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
NV_STATUS status = NV_OK;
NV_STATUS tracker_status;
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_processor_id_t preferred_location = policy->preferred_location;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
const uvm_page_mask_t *mapped_mask;
@ -185,7 +186,7 @@ static NV_STATUS preferred_location_unmap_remote_pages(uvm_va_block_t *va_block,
status = uvm_va_block_unmap(va_block,
va_block_context,
preferred_location,
uvm_va_block_region_from_block(va_block),
region,
&va_block_context->caller_page_mask,
&local_tracker);
@ -200,17 +201,15 @@ done:
}
NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context)
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region)
{
uvm_assert_mutex_locked(&va_block->lock);
// TODO: Bug 1750144: remove this restriction when HMM handles setting
// the preferred location semantics instead of just recording the policy.
UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
if (!uvm_va_block_is_hmm(va_block))
uvm_va_block_mark_cpu_dirty(va_block);
return preferred_location_unmap_remote_pages(va_block, va_block_context);
return preferred_location_unmap_remote_pages(va_block, va_block_context, region);
}
static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
@ -278,7 +277,7 @@ static NV_STATUS preferred_location_set(uvm_va_space_t *va_space,
return NV_OK;
}
return uvm_hmm_set_preferred_location(va_space, preferred_location, base, last_address);
return uvm_hmm_set_preferred_location(va_space, preferred_location, base, last_address, out_tracker);
}
NV_STATUS uvm_api_set_preferred_location(const UVM_SET_PREFERRED_LOCATION_PARAMS *params, struct file *filp)
@ -405,20 +404,22 @@ NV_STATUS uvm_api_unset_preferred_location(const UVM_UNSET_PREFERRED_LOCATION_PA
return status == NV_OK ? tracker_status : status;
}
static NV_STATUS va_block_set_accessed_by_locked(uvm_va_block_t *va_block,
NV_STATUS uvm_va_block_set_accessed_by_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_va_block_region_t region,
uvm_tracker_t *out_tracker)
{
NV_STATUS status;
NV_STATUS tracker_status;
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
status = uvm_va_block_add_mappings(va_block,
va_block_context,
processor_id,
uvm_va_block_region_from_block(va_block),
region,
NULL,
UvmEventMapRemoteCausePolicy);
@ -432,6 +433,7 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
uvm_processor_id_t processor_id)
{
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
NV_STATUS status;
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
@ -443,10 +445,12 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space))
return NV_OK;
status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
va_block_set_accessed_by_locked(va_block,
status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
NULL,
uvm_va_block_set_accessed_by_locked(va_block,
va_block_context,
processor_id,
region,
&local_tracker));
// TODO: Bug 1767224: Combine all accessed_by operations into single tracker
@ -463,7 +467,7 @@ typedef struct
bool set_bit;
} accessed_by_split_params_t;
static bool accessed_by_is_split_needed(uvm_va_policy_t *policy, void *data)
static bool accessed_by_is_split_needed(const uvm_va_policy_t *policy, void *data)
{
accessed_by_split_params_t *params = (accessed_by_split_params_t*)data;
@ -560,7 +564,8 @@ static NV_STATUS accessed_by_set(uvm_va_space_t *va_space,
processor_id,
set_bit,
base,
last_address);
last_address,
&local_tracker);
done:
tracker_status = uvm_tracker_wait_deinit(&local_tracker);
@ -641,7 +646,7 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
uvm_processor_id_t processor_id;
uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
uvm_page_mask_t *break_read_duplication_pages = &va_block_context->caller_page_mask;
uvm_va_policy_t *policy = va_block_context->policy;
const uvm_va_policy_t *policy = va_block_context->policy;
uvm_processor_id_t preferred_location = policy->preferred_location;
uvm_processor_mask_t accessed_by = policy->accessed_by;
@ -703,9 +708,10 @@ static NV_STATUS va_block_unset_read_duplication_locked(uvm_va_block_t *va_block
// 2- Re-establish SetAccessedBy mappings
for_each_id_in_mask(processor_id, &accessed_by) {
status = va_block_set_accessed_by_locked(va_block,
status = uvm_va_block_set_accessed_by_locked(va_block,
va_block_context,
processor_id,
block_region,
out_tracker);
if (status != NV_OK)
return status;
@ -738,7 +744,7 @@ NV_STATUS uvm_va_block_unset_read_duplication(uvm_va_block_t *va_block,
return status;
}
static bool read_duplication_is_split_needed(uvm_va_policy_t *policy, void *data)
static bool read_duplication_is_split_needed(const uvm_va_policy_t *policy, void *data)
{
uvm_read_duplication_policy_t new_policy;

View File

@ -314,6 +314,14 @@ static bool uvm_id_equal(uvm_processor_id_t id1, uvm_processor_id_t id2)
return id1.val == id2.val;
}
static int uvm_global_id_cmp(uvm_global_processor_id_t id1, uvm_global_processor_id_t id2)
{
UVM_GLOBAL_ID_CHECK_BOUNDS(id1);
UVM_GLOBAL_ID_CHECK_BOUNDS(id2);
return UVM_CMP_DEFAULT(id1.val, id2.val);
}
static bool uvm_global_id_equal(uvm_global_processor_id_t id1, uvm_global_processor_id_t id2)
{
UVM_GLOBAL_ID_CHECK_BOUNDS(id1);

View File

@ -447,16 +447,16 @@ NvU64 *uvm_push_timestamp(uvm_push_t *push)
return timestamp;
}
bool uvm_push_method_validate(uvm_push_t *push, NvU8 subch, NvU32 method_address, NvU32 method_data)
bool uvm_push_method_is_valid(uvm_push_t *push, NvU8 subch, NvU32 method_address, NvU32 method_data)
{
uvm_gpu_t *gpu = uvm_push_get_gpu(push);
if (subch == UVM_SUBCHANNEL_CE)
return gpu->parent->ce_hal->method_validate(push, method_address, method_data);
return gpu->parent->ce_hal->method_is_valid(push, method_address, method_data);
else if (subch == UVM_SUBCHANNEL_HOST)
return gpu->parent->host_hal->method_validate(push, method_address, method_data);
return gpu->parent->host_hal->method_is_valid(push, method_address, method_data);
else if (subch == UVM_SW_OBJ_SUBCHANNEL)
return gpu->parent->host_hal->sw_method_validate(push, method_address, method_data);
return gpu->parent->host_hal->sw_method_is_valid(push, method_address, method_data);
UVM_ERR_PRINT("Unsupported subchannel 0x%x\n", subch);
return false;

View File

@ -149,7 +149,8 @@ struct uvm_push_info_struct
char description[128];
// Procedure to be called when the corresponding push is complete.
// This procedure is called with the UVM_LOCK_ORDER_CHANNEL spin lock held.
// This procedure is called with the channel pool lock held, which
// may be a spinlock.
void (*on_complete)(void *);
void *on_complete_data;
};
@ -438,7 +439,7 @@ static uvm_gpu_t *uvm_push_get_gpu(uvm_push_t *push)
// Validate that the given method can be pushed to the underlying channel. The
// method contents can be used to further validate individual fields.
bool uvm_push_method_validate(uvm_push_t *push, NvU8 subch, NvU32 method_address, NvU32 method_data);
bool uvm_push_method_is_valid(uvm_push_t *push, NvU8 subch, NvU32 method_address, NvU32 method_data);
// Retrieve the push info object for a push that has already started
static uvm_push_info_t *uvm_push_info_from_push(uvm_push_t *push)

View File

@ -34,6 +34,7 @@
#include "clb06f.h"
#define HWMASK(d, r, f) DRF_MASK(NV ## d ## _ ## r ## _ ## f)
#define HWSHIFT(d, r, f) DRF_SHIFT(NV ## d ## _ ## r ## _ ## f)
#define HWSHIFTMASK(d, r, f) DRF_SHIFTMASK(NV ## d ## _ ## r ## _ ## f)
#define HWSIZE(d, r, f) DRF_SIZE(NV ## d ## _ ## r ## _ ## f)
#define HWCONST(d, r, f, c) DRF_DEF(d, _ ## r, _ ## f, _ ## c)
@ -92,6 +93,8 @@
#define UVM_SUBCHANNEL_C0B5 UVM_SUBCHANNEL_CE
#define UVM_SUBCHANNEL_C36F UVM_SUBCHANNEL_HOST
#define UVM_SUBCHANNEL_C3B5 UVM_SUBCHANNEL_CE
#define UVM_SUBCHANNEL_C46F UVM_SUBCHANNEL_HOST
#define UVM_SUBCHANNEL_C56F UVM_SUBCHANNEL_HOST
@ -143,7 +146,7 @@
do { \
__NV_PUSH_0U(subch, count, a1); \
push->next[0] = d1; \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a1, d1), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a1, d1), \
"Method validation failed in channel %s\n", \
push->channel->name); \
++push->next; \
@ -153,7 +156,7 @@
do { \
__UVM_ASSERT_CONTIGUOUS_METHODS(a1, a2); \
__NV_PUSH_1U(subch, count, a1,d1); \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a2, d2), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a2, d2), \
"Method validation failed in channel %s\n", \
push->channel->name); \
push->next[0] = d2; \
@ -164,7 +167,7 @@
do { \
__UVM_ASSERT_CONTIGUOUS_METHODS(a2, a3); \
__NV_PUSH_2U(subch, count, a1,d1, a2,d2); \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a3, d3), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a3, d3), \
"Method validation failed in channel %s\n", \
push->channel->name); \
push->next[0] = d3; \
@ -175,7 +178,7 @@
do { \
__UVM_ASSERT_CONTIGUOUS_METHODS(a3, a4); \
__NV_PUSH_3U(subch, count, a1,d1, a2,d2, a3,d3); \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a4, d4), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a4, d4), \
"Method validation failed in channel %s\n", \
push->channel->name); \
push->next[0] = d4; \
@ -186,7 +189,7 @@
do { \
__UVM_ASSERT_CONTIGUOUS_METHODS(a4, a5); \
__NV_PUSH_4U(subch, count, a1,d1, a2,d2, a3,d3, a4,d4); \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a5, d5), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a5, d5), \
"Method validation failed in channel %s\n", \
push->channel->name); \
push->next[0] = d5; \
@ -197,7 +200,7 @@
do { \
__UVM_ASSERT_CONTIGUOUS_METHODS(a5, a6); \
__NV_PUSH_5U(subch, count, a1,d1, a2,d2, a3,d3, a4,d4, a5,d5); \
UVM_ASSERT_MSG(uvm_push_method_validate(push, subch, a6, d6), \
UVM_ASSERT_MSG(uvm_push_method_is_valid(push, subch, a6, d6), \
"Method validation failed in channel %s\n", \
push->channel->name); \
push->next[0] = d6; \

View File

@ -48,40 +48,46 @@ static NvU32 get_push_end_size(uvm_channel_t *channel)
static NV_STATUS test_push_end_size(uvm_va_space_t *va_space)
{
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;
NvU32 push_size;
NvU32 i;
for_each_va_space_gpu(gpu, va_space) {
for (i = 0; i < UVM_CHANNEL_TYPE_COUNT; ++i) {
uvm_channel_type_t type;
for (type = 0; type < UVM_CHANNEL_TYPE_COUNT; ++type) {
uvm_push_t push;
NvU32 push_end_size;
uvm_channel_type_t type = i;
NvU32 push_size_before;
NvU32 push_end_size_observed, push_end_size_expected;
status = uvm_push_begin(gpu->channel_manager, type, &push, "type %u\n", (unsigned)type);
TEST_CHECK_GOTO(status == NV_OK, done);
TEST_NV_CHECK_RET(uvm_push_begin(gpu->channel_manager,
type,
&push,
"type %s\n",
uvm_channel_type_to_string(type)));
push_end_size = get_push_end_size(push.channel);
push_size = uvm_push_get_size(&push);
push_size_before = uvm_push_get_size(&push);
uvm_push_end(&push);
if (uvm_push_get_size(&push) - push_size != push_end_size) {
UVM_TEST_PRINT("push_end_size incorrect, %u instead of %u for GPU %s\n",
uvm_push_get_size(&push) - push_size,
push_end_size,
push_end_size_expected = get_push_end_size(push.channel);
push_end_size_observed = uvm_push_get_size(&push) - push_size_before;
if (push_end_size_observed != push_end_size_expected) {
UVM_TEST_PRINT("push_end_size incorrect, %u instead of %u on channel type %s for GPU %s\n",
push_end_size_observed,
push_end_size_expected,
uvm_channel_type_to_string(type),
uvm_gpu_name(gpu));
status = NV_ERR_INVALID_STATE;
goto done;
// The size mismatch error gets precedence over a wait error
(void) uvm_push_wait(&push);
return NV_ERR_INVALID_STATE;
}
TEST_NV_CHECK_RET(uvm_push_wait(&push));
}
}
done:
for_each_va_space_gpu(gpu, va_space) {
uvm_channel_manager_wait(gpu->channel_manager);
}
return status;
return NV_OK;
}
typedef enum {
@ -201,6 +207,7 @@ static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
NvU32 i;
uvm_push_t *pushes;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
uvm_channel_type_t channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
// As noted above, this test does unsafe things that would be detected by
// lock tracking, opt-out.
@ -213,9 +220,10 @@ static NV_STATUS test_concurrent_pushes(uvm_va_space_t *va_space)
}
for_each_va_space_gpu(gpu, va_space) {
for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
uvm_push_t *push = &pushes[i];
status = uvm_push_begin(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, push, "concurrent push %u", i);
status = uvm_push_begin(gpu->channel_manager, channel_type, push, "concurrent push %u", i);
TEST_CHECK_GOTO(status == NV_OK, done);
}
for (i = 0; i < UVM_PUSH_MAX_CONCURRENT_PUSHES; ++i) {
@ -760,6 +768,7 @@ static NV_STATUS test_push_gpu_to_gpu(uvm_va_space_t *va_space)
bool waive = true;
for_each_va_space_gpu(gpu_a, va_space) {
for_each_va_space_gpu(gpu_b, va_space) {
if (can_do_peer_copies(va_space, gpu_a, gpu_b)) {
waive = false;

View File

@ -284,12 +284,12 @@ NV_STATUS uvm_rm_mem_map_gpu(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, NvU64 gpu_ali
UVM_ASSERT(rm_mem);
UVM_ASSERT(gpu);
// Peer mappings not supported yet
UVM_ASSERT(rm_mem->type == UVM_RM_MEM_TYPE_SYS);
if (uvm_rm_mem_mapped_on_gpu(rm_mem, gpu))
return NV_OK;
// Peer mappings are not supported yet
UVM_ASSERT(rm_mem->type == UVM_RM_MEM_TYPE_SYS);
gpu_owner = rm_mem->gpu_owner;
gpu_owner_va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu_owner);
@ -334,8 +334,9 @@ void uvm_rm_mem_unmap_gpu(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu)
UVM_ASSERT(rm_mem);
UVM_ASSERT(gpu);
// Cannot unmap from the gpu that owns the allocation.
UVM_ASSERT_MSG(rm_mem->gpu_owner != gpu, "GPU %s\n", uvm_gpu_name(gpu));
// The GPU owner mapping remains valid until the memory is freed.
if (gpu == rm_mem->gpu_owner)
return;
rm_mem_unmap_gpu(rm_mem, gpu);
}

View File

@ -120,8 +120,9 @@ NV_STATUS uvm_rm_mem_alloc_and_map_all(uvm_gpu_t *gpu,
// Map/Unmap on UVM's internal address space of a GPU. In SR-IOV heavy the
// operation is also applied on the GPU's proxy address space.
//
// Supported only for sysmem (UVM_RM_MEM_TYPE_SYS). The GPU has to be different
// from the one the memory was originally allocated for.
// Mapping/unmapping on the GPU owner, or mapping on an already mapped GPU, are
// no-ops. Mapping/unmapping on a GPU different from the owner is only supported
// for system memory.
//
// Locking same as uvm_rm_mem_alloc()
NV_STATUS uvm_rm_mem_map_gpu(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, NvU64 gpu_alignment);
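//
// Usage sketch (illustrative only; "rm_mem" and "gpu" are assumed to come
// from uvm_rm_mem_alloc() and GPU registration elsewhere, and an alignment
// of 0 means no additional alignment requirement):
//
//     if (uvm_rm_mem_map_gpu(rm_mem, gpu, 0) == NV_OK) {
//         NvU64 gpu_va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
//
//         // ... use gpu_va in work targeting UVM's internal VA space ...
//
//         uvm_rm_mem_unmap_gpu(rm_mem, gpu);
//     }
//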

View File

@ -64,10 +64,9 @@ static NV_STATUS check_alignment(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, NvU64 ali
{
// Alignment requirements only apply to mappings in the UVM-owned VA space
if (alignment != 0) {
bool is_proxy_va_space = false;
NvU64 gpu_va = uvm_rm_mem_get_gpu_va(rm_mem, gpu, is_proxy_va_space);
NvU64 gpu_uvm_va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
TEST_CHECK_RET(IS_ALIGNED(gpu_va, alignment));
TEST_CHECK_RET(IS_ALIGNED(gpu_uvm_va, alignment));
}
return NV_OK;
@ -76,20 +75,51 @@ static NV_STATUS check_alignment(uvm_rm_mem_t *rm_mem, uvm_gpu_t *gpu, NvU64 ali
static NV_STATUS map_gpu_owner(uvm_rm_mem_t *rm_mem, NvU64 alignment)
{
uvm_gpu_t *gpu = rm_mem->gpu_owner;
NvU64 gpu_uvm_va;
NvU64 gpu_proxy_va = 0;
// The memory should have been automatically mapped in the GPU owner
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu(rm_mem, gpu));
gpu_uvm_va = uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu);
// In SR-IOV heavy, there are two VA spaces per GPU, so there are two
// mappings for a single rm_mem object on a GPU, even if the memory is
// located in vidmem.
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu) == uvm_gpu_uses_proxy_channel_pool(gpu));
if (uvm_gpu_uses_proxy_channel_pool(gpu)) {
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
gpu_proxy_va = uvm_rm_mem_get_gpu_proxy_va(rm_mem, gpu);
}
else {
TEST_CHECK_RET(!uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
}
TEST_NV_CHECK_RET(check_alignment(rm_mem, gpu, alignment));
// Explicitly mapping or unmapping to the GPU that owns the allocation is
// not allowed, so the testing related to GPU owners is simpler than that of
// other GPUs.
// Mappings are not ref counted, so additional map calls are no-ops; the
// GPU VA should remain the same for all the applicable VA spaces.
TEST_NV_CHECK_RET(uvm_rm_mem_map_gpu(rm_mem, gpu, alignment));
TEST_CHECK_RET(gpu_uvm_va == uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu));
if (uvm_gpu_uses_proxy_channel_pool(gpu))
TEST_CHECK_RET(gpu_proxy_va == uvm_rm_mem_get_gpu_proxy_va(rm_mem, gpu));
// Unmapping the GPU owner is a no-op
uvm_rm_mem_unmap_gpu(rm_mem, gpu);
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu(rm_mem, gpu));
TEST_CHECK_RET(gpu_uvm_va == uvm_rm_mem_get_gpu_uvm_va(rm_mem, gpu));
if (uvm_gpu_uses_proxy_channel_pool(gpu)) {
TEST_CHECK_RET(uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
TEST_CHECK_RET(gpu_proxy_va == uvm_rm_mem_get_gpu_proxy_va(rm_mem, gpu));
}
else {
TEST_CHECK_RET(!uvm_rm_mem_mapped_on_gpu_proxy(rm_mem, gpu));
}
return NV_OK;
}

View File

@ -324,13 +324,12 @@ long uvm_test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_RB_TREE_RANDOM, uvm_test_rb_tree_random);
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_GET_USER_SPACE_END_ADDRESS, uvm_test_get_user_space_end_address);
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES, uvm_test_get_cpu_chunk_allocation_sizes);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_HMM_SANITY, uvm_test_hmm_sanity);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR,
uvm_test_va_range_inject_add_gpu_va_space_error);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY, uvm_test_destroy_gpu_va_space_delay);
UVM_ROUTE_CMD_STACK_NO_INIT_CHECK(UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED, uvm_test_cgroup_accounting_supported);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_HMM_INIT, uvm_test_hmm_init);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_SPLIT_INVALIDATE_DELAY, uvm_test_split_invalidate_delay);
UVM_ROUTE_CMD_STACK_INIT_CHECK(UVM_TEST_CPU_CHUNK_API, uvm_test_cpu_chunk_api);
}
return -EINVAL;

View File

@ -187,5 +187,5 @@ NV_STATUS uvm_test_tools_flush_replay_events(UVM_TEST_TOOLS_FLUSH_REPLAY_EVENTS_
NV_STATUS uvm_test_register_unload_state_buffer(UVM_TEST_REGISTER_UNLOAD_STATE_BUFFER_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_rb_tree_directed(UVM_TEST_RB_TREE_DIRECTED_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_rb_tree_random(UVM_TEST_RB_TREE_RANDOM_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_hmm_sanity(UVM_TEST_HMM_SANITY_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_cpu_chunk_api(UVM_TEST_CPU_CHUNK_API_PARAMS *params, struct file *filp);
#endif

View File

@ -616,6 +616,7 @@ typedef struct
// Array of processors which have a virtual mapping covering lookup_address.
NvProcessorUuid mapped_on[UVM_MAX_PROCESSORS]; // Out
NvU32 mapping_type[UVM_MAX_PROCESSORS]; // Out
NvU64 mapping_physical_address[UVM_MAX_PROCESSORS] NV_ALIGN_BYTES(8); // Out
NvU32 mapped_on_count; // Out
// The size of the virtual mapping covering lookup_address on each
@ -1394,16 +1395,6 @@ typedef struct
NvU32 rmStatus; // Out
} UVM_TEST_GET_CPU_CHUNK_ALLOC_SIZES_PARAMS;
#define UVM_TEST_HMM_SANITY UVM_TEST_IOCTL_BASE(92)
typedef struct
{
NvU64 hmm_address NV_ALIGN_BYTES(8); // In
NvU64 hmm_length NV_ALIGN_BYTES(8); // In
NvU64 uvm_address NV_ALIGN_BYTES(8); // In
NvU64 uvm_length NV_ALIGN_BYTES(8); // In
NV_STATUS rmStatus; // Out
} UVM_TEST_HMM_SANITY_PARAMS;
// Forces the next range covering the lookup_address to fail in
// uvm_va_range_add_gpu_va_space() with an out-of-memory error. Only the next
// uvm_va_range_add_gpu_va_space() will fail. Subsequent ones will succeed.
@ -1435,12 +1426,6 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_CGROUP_ACCOUNTING_SUPPORTED_PARAMS;
#define UVM_TEST_HMM_INIT UVM_TEST_IOCTL_BASE(97)
typedef struct
{
NV_STATUS rmStatus; // Out
} UVM_TEST_HMM_INIT_PARAMS;
#define UVM_TEST_SPLIT_INVALIDATE_DELAY UVM_TEST_IOCTL_BASE(98)
typedef struct
{
@ -1448,6 +1433,11 @@ typedef struct
NV_STATUS rmStatus; // Out
} UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS;
#define UVM_TEST_CPU_CHUNK_API UVM_TEST_IOCTL_BASE(100)
typedef struct
{
NV_STATUS rmStatus; // Out
} UVM_TEST_CPU_CHUNK_API_PARAMS;
#ifdef __cplusplus
}
#endif

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2019 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -498,6 +498,7 @@ bool uvm_thread_context_add(uvm_thread_context_t *thread_context)
uvm_thread_context_global_init();
thread_context->task = current;
thread_context->ignore_hmm_invalidate_va_block = NULL;
table_entry = thread_context_non_interrupt_table_entry(&array_index);
return thread_context_non_interrupt_add(thread_context, table_entry, array_index);
}

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2019 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -70,6 +70,12 @@ struct uvm_thread_context_struct
// This field is ignored in interrupt paths
NvU32 array_index;
// Set if uvm_hmm_invalidate() callbacks should be ignored on this va_block.
// This needs to be set whenever the va_block lock is held and
// migrate_vma_setup() needs to be called since the "slow path" which
// calls try_to_migrate() doesn't pass the pgmap_owner.
uvm_va_block_t *ignore_hmm_invalidate_va_block;
// Pointer to enclosing node (if any) in red-black tree
//
// This field is ignored in interrupt paths

View File

@ -208,7 +208,6 @@ static uvm_va_space_t *tools_event_tracker_va_space(uvm_tools_event_tracker_t *e
uvm_va_space_t *va_space;
UVM_ASSERT(event_tracker->uvm_file);
va_space = uvm_va_space_get(event_tracker->uvm_file);
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
return va_space;
}
@ -1614,10 +1613,10 @@ NV_STATUS uvm_api_tools_init_event_tracker(UVM_TOOLS_INIT_EVENT_TRACKER_PARAMS *
goto fail;
}
status = uvm_va_space_initialized(uvm_va_space_get(event_tracker->uvm_file));
if (status != NV_OK) {
if (!uvm_fd_va_space(event_tracker->uvm_file)) {
fput(event_tracker->uvm_file);
event_tracker->uvm_file = NULL;
status = NV_ERR_ILLEGAL_ACTION;
goto fail;
}
@ -1758,7 +1757,6 @@ static NV_STATUS tools_update_status(uvm_va_space_t *va_space)
uvm_assert_rwsem_locked_write(&g_tools_va_space_list_lock);
uvm_assert_rwsem_locked_write(&va_space->perf_events.lock);
uvm_assert_rwsem_locked_write(&va_space->tools.lock);
UVM_ASSERT(uvm_va_space_initialized(va_space) == NV_OK);
status = tools_update_perf_events_callbacks(va_space);
if (status != NV_OK)
@ -2016,13 +2014,11 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
if (status != NV_OK)
goto exit;
if (is_write) {
block_context = uvm_va_block_context_alloc(mm);
if (!block_context) {
status = NV_ERR_NO_MEMORY;
goto exit;
}
}
stage_addr = uvm_mem_get_cpu_addr_kernel(stage_mem);
*bytes = 0;
@ -2044,11 +2040,16 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
}
}
if (mm)
uvm_down_read_mmap_lock(mm);
// The RM flavor of the lock is needed to perform ECC checks.
uvm_va_space_down_read_rm(va_space);
status = uvm_va_block_find_create_managed(va_space, target_va_start, &block);
status = uvm_va_block_find_create(va_space, UVM_ALIGN_DOWN(target_va_start, PAGE_SIZE), block_context, &block);
if (status != NV_OK) {
uvm_va_space_up_read_rm(va_space);
if (mm)
uvm_up_read_mmap_lock(mm);
goto exit;
}
@ -2070,6 +2071,22 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
status = uvm_mem_map_gpu_kernel(stage_mem, gpu);
if (status != NV_OK) {
uvm_va_space_up_read_rm(va_space);
if (mm)
uvm_up_read_mmap_lock(mm);
goto exit;
}
}
// Make sure a CPU resident page has an up to date struct page pointer.
if (uvm_va_block_is_hmm(block)) {
status = uvm_hmm_va_block_update_residency_info(block,
mm,
UVM_ALIGN_DOWN(target_va_start, PAGE_SIZE),
true);
if (status != NV_OK) {
uvm_va_space_up_read_rm(va_space);
if (mm)
uvm_up_read_mmap_lock(mm);
goto exit;
}
}
@ -2082,6 +2099,9 @@ static NV_STATUS tools_access_process_memory(uvm_va_space_t *va_space,
status = uvm_global_mask_check_ecc_error(global_gpus);
uvm_va_space_up_read_rm(va_space);
if (mm)
uvm_up_read_mmap_lock(mm);
if (status != NV_OK)
goto exit;
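The tools_access_process_memory() changes above follow a strict ordering: take the target mm's mmap_lock, then the va_space lock in its RM flavor (required for the ECC check), look up or create the block, and release in reverse order on every exit path. A condensed sketch of just that ordering, using the helpers shown in the diff and abbreviating the intermediate copy and error handling:

    if (mm)
        uvm_down_read_mmap_lock(mm);

    /* The RM flavor of the va_space lock is needed to perform ECC checks. */
    uvm_va_space_down_read_rm(va_space);

    status = uvm_va_block_find_create(va_space,
                                      UVM_ALIGN_DOWN(target_va_start, PAGE_SIZE),
                                      block_context,
                                      &block);
    if (status == NV_OK)
        status = uvm_global_mask_check_ecc_error(global_gpus);

    uvm_va_space_up_read_rm(va_space);
    if (mm)
        uvm_up_read_mmap_lock(mm);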

View File

@ -984,14 +984,13 @@ NV_STATUS uvm_test_check_channel_va_space(UVM_TEST_CHECK_CHANNEL_VA_SPACE_PARAMS
goto out;
}
va_space = uvm_va_space_get(va_space_filp);
uvm_va_space_down_read(va_space);
// We can do this query outside of the lock, but doing it within the lock
// simplifies error handling.
status = uvm_va_space_initialized(va_space);
if (status != NV_OK)
va_space = uvm_fd_va_space(va_space_filp);
if (!va_space) {
status = NV_ERR_INVALID_ARGUMENT;
goto out;
}
uvm_va_space_down_read(va_space);
gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);
if (!gpu || !uvm_processor_mask_test(&va_space->faultable_processors, gpu->id)) {

File diff suppressed because it is too large

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2015-2022 NVIDIA Corporation
Copyright (c) 2015-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -43,6 +43,7 @@
#include "nv-kthread-q.h"
#include <linux/mmu_notifier.h>
#include <linux/wait.h>
// VA blocks are the leaf nodes in the uvm_va_space tree for managed allocations
// (VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED):
@ -328,7 +329,7 @@ struct uvm_va_block_struct
// allocations. See uvm_pmm_sysmem.h for more details on CPU chunks.
// This member is meant to hold an opaque value indicating the CPU
// chunk storage method. For more details on CPU chunk storage,
// see uvm_cpu_chunk_storage_type_t in uvm_pmm_sysmem.c.
// see uvm_cpu_chunk_storage_type_t in uvm_va_block.c.
unsigned long chunks;
// Per-page allocation bit vector.
@ -352,12 +353,18 @@ struct uvm_va_block_struct
// UVM_PTE_BITS_CPU_WRITE is not set, the mapping is read-only.
// Otherwise, the mapping is read-write.
//
// Note that this is the maximum permissions a PTE could have, but not
// necessarily the actual current permissions of the CPU PTEs. The UVM
// driver will never change the PTEs without updating this state, but
// the kernel can downgrade our CPU mappings at any time without
// notifying the UVM driver (for example in response to user space
// calling madvise with MADV_DONTNEED).
// For managed allocations, this is the maximum permissions a PTE
// could have, but not necessarily the actual current permissions of the
// CPU PTEs. The UVM driver will never change the PTEs without updating
// this state, but the kernel can downgrade our CPU mappings at any time
// without notifying the UVM driver (for example in response to user
// space calling madvise with MADV_DONTNEED).
//
// For HMM allocations, this is the minimum permission the CPU has since
// Linux can upgrade a read-only PTE to read-write without notifying
// the UVM driver. This is why read duplication isn't currently
// supported.
// TODO: Bug 3660922: Need to handle read duplication at some point.
uvm_page_mask_t pte_bits[UVM_PTE_BITS_CPU_MAX];
// Whether the CPU has ever mapped a page on this VA block. This is
@ -452,6 +459,19 @@ struct uvm_va_block_struct
// The MMU notifier is registered per va_block.
struct mmu_interval_notifier notifier;
// Wait queue for GPU atomic operations to system memory.
struct wait_queue_head atomic_waitq;
// Mask of pages being migrated to system memory for GPU atomic access.
// It is used so other threads don't try to migrate those pages while
// make_device_exclusive_range() is called without holding the va_block
// lock.
uvm_page_mask_t atomic_busy;
// Sequence number to tell if any changes were made to the va_block
// while not holding the block lock and calling hmm_range_fault().
unsigned long changed;
// Parent VA space pointer. It is NULL for managed blocks or if
// the HMM block is dead. This field can be read while holding the
// block lock and is only modified while holding the va_space write
@ -522,7 +542,7 @@ struct uvm_va_block_wrapper_struct
};
// Tracking needed for supporting allocation-retry of user GPU memory
typedef struct
struct uvm_va_block_retry_struct
{
// A tracker used for all allocations from PMM.
uvm_tracker_t tracker;
@ -537,7 +557,7 @@ typedef struct
// can contain chunks from multiple GPUs. All the used chunks are unpinned
// when the operation is finished with uvm_va_block_retry_deinit().
struct list_head used_chunks;
} uvm_va_block_retry_t;
};
// Module load/exit
NV_STATUS uvm_va_block_init(void);
@ -635,6 +655,12 @@ uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block);
// is held in write mode.
uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block);
// Return true if the VA space has access counter migrations enabled and should
// remote map pages evicted to system memory. This is OK since access counters
// can pull the data back to vidmem if sufficient accesses trigger a migration.
// The caller must ensure that the VA space cannot go away.
bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space);
// Dynamic cache-based allocation for uvm_va_block_context_t.
//
// See uvm_va_block_context_init() for a description of the mm parameter.
@ -663,7 +689,7 @@ static void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context,
// This always returns true and is intended to only be used with UVM_ASSERT().
// Locking: the va_block lock must be held.
bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
uvm_va_policy_t *policy,
const uvm_va_policy_t *policy,
uvm_va_block_region_t region);
// TODO: Bug 1766480: Using only page masks instead of a combination of regions
@ -697,7 +723,7 @@ bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
//
// Allocation-retry: this operation may need to perform eviction to be able to
// allocate GPU memory successfully and if that happens,
// NV_WARN_MORE_PROCESSING_REQUIRED will be returned. That also means that the
// NV_ERR_MORE_PROCESSING_REQUIRED will be returned. That also means that the
// block's lock has been unlocked and relocked as part of the call and that the
// whole sequence of operations performed under the block's lock needs to be
// attempted again. To facilitate that, the caller needs to provide the same
@ -707,14 +733,15 @@ bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
// caller.
//
// If dest_id is the CPU then va_block_retry can be NULL and allocation-retry of
// user memory is guaranteed not to happen. Allocation-retry of page tables can
// still occur though.
// user memory is guaranteed not to happen. Allocation-retry of GPU page tables
// can still occur though.
//
// va_block_context must not be NULL. This function will set a bit in
// va_block_context->make_resident.pages_changed_residency for each page that
// changed residency (due to a migration or first population) as a result of the
// operation. This function only sets bits in that mask. It is the caller's
// responsiblity to zero the mask or not first.
// operation and va_block_context->make_resident.all_involved_processors for
// each processor involved in the copy. This function only sets bits in those
// masks. It is the caller's responsibility to zero the masks or not first.
//
// va_block_context->policy must also be set by the caller for the given region.
// See the comments for uvm_va_block_check_policy_is_valid().
@ -723,6 +750,8 @@ bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
// have been unlocked and relocked.
//
// LOCKING: The caller must hold the va_block lock.
// If va_block_context->mm != NULL, va_block_context->mm->mmap_lock must be
// held in at least read mode.
NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
@ -743,8 +772,6 @@ NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
// are broken
// - Only managed va_blocks are supported.
// TODO: Bug 3660922: need to implement HMM read duplication support.
// - LOCKING: If va_block_context->mm != NULL, va_block_context->mm->mmap_lock
// must be held in at least read mode.
NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
@ -756,15 +783,19 @@ NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
// Similar to uvm_va_block_make_resident() (read documentation there). The
// difference is that source pages are only copied to the destination and the
// residency is not updated until uvm_va_block_make_resident_post() is called.
// Otherwise, the combination of uvm_va_block_make_resident_pre() and
// uvm_va_block_make_resident_post() should be the same as just calling
// uvm_va_block_make_resident().
// residency is not updated until uvm_va_block_make_resident_finish() is called.
// Otherwise, the combination of uvm_va_block_make_resident_copy() and
// uvm_va_block_make_resident_finish() is the same as just calling
// uvm_va_block_make_resident(). Note, however, that the va_block lock must be
// held across the two calls for the operation to be complete. The va_block
// lock can be dropped after calling uvm_va_block_make_resident_copy() but
// uvm_va_block_make_resident_copy() must be called again after relocking the
// va_block lock and before calling uvm_va_block_make_resident_finish().
// This split is needed when using migrate_vma_setup() and migrate_vma_pages()
// so that when migrate_vma_pages() indicates a page is not migrating, the
// va_block state is not updated.
// LOCKING: The caller must hold the va_block lock.
NV_STATUS uvm_va_block_make_resident_pre(uvm_va_block_t *va_block,
NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
@ -774,10 +805,10 @@ NV_STATUS uvm_va_block_make_resident_pre(uvm_va_block_t *va_block,
uvm_make_resident_cause_t cause);
// The page_mask must be the same or a subset of the page_mask passed to
// uvm_va_block_make_resident_pre(). This step updates the residency and breaks
// uvm_va_block_make_resident_copy(). This step updates the residency and breaks
// read duplication.
// LOCKING: The caller must hold the va_block lock.
void uvm_va_block_make_resident_post(uvm_va_block_t *va_block,
void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region,
const uvm_page_mask_t *page_mask);
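A minimal usage sketch of the copy/finish split documented above, assuming the va_block lock is held across both calls; the elided middle parameters of uvm_va_block_make_resident_copy() are assumed to match uvm_va_block_make_resident(), and the migrate_vma step in the middle is only indicated:

    status = uvm_va_block_make_resident_copy(va_block,
                                             va_block_retry,
                                             va_block_context,
                                             dest_id,
                                             region,
                                             page_mask,
                                             prefetch_page_mask,
                                             cause);
    if (status != NV_OK)
        return status;

    /* For HMM blocks, migrate_vma_pages() runs here; pages that did not
     * migrate are cleared from page_mask before finishing. */

    uvm_va_block_make_resident_finish(va_block,
                                      va_block_context,
                                      region,
                                      page_mask);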
@ -905,7 +936,8 @@ NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
//
// LOCKING: The caller must hold the VA block lock.
NV_STATUS uvm_va_block_set_preferred_location_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context);
uvm_va_block_context_t *va_block_context,
uvm_va_block_region_t region);
// Maps the given processor to all resident pages in this block, as allowed by
// location and policy. Waits for the operation to complete before returning.
@ -921,6 +953,22 @@ NV_STATUS uvm_va_block_set_accessed_by(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id);
// Maps given processor to all resident pages in this block and region, as
// allowed by location and policy. The caller is responsible for waiting for
// the tracker after all mappings have been started.
// This function can be called with HMM and managed va_blocks.
//
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
//
// LOCKING: The caller must hold the va_block lock and
// va_block_context->mm->mmap_lock must be held in at least read mode.
NV_STATUS uvm_va_block_set_accessed_by_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
uvm_va_block_region_t region,
uvm_tracker_t *out_tracker);
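A short sketch of the deferred-wait pattern this declaration describes: mappings are queued under the va_block lock and the caller waits on the tracker afterwards. The local tracker handling here is illustrative:

    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();

    status = uvm_va_block_set_accessed_by_locked(va_block,
                                                 va_block_context,
                                                 processor_id,
                                                 region,
                                                 &local_tracker);

    /* The caller is responsible for waiting once all mappings are queued. */
    if (status == NV_OK)
        status = uvm_tracker_wait(&local_tracker);

    uvm_tracker_deinit(&local_tracker);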
// Breaks SetAccessedBy and remote mappings
// This function should only be called with managed va_blocks.
//
@ -1124,6 +1172,11 @@ void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uv
// must hold mm->mmap_lock in at least read mode.
void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
// Same as uvm_va_block_unregister_gpu() but the VA block lock must be held.
// Note that this handles allocation-retry internally and hence might unlock
// and relock block's lock.
void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm);
// Unmaps all memory associated with the block and drops the ref count of the
// block. This allows the caller to free resources associated with this block
// regardless of the block's current ref count. Most importantly it allows the
@ -1186,9 +1239,13 @@ NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
// Performs any operations necessary to establish a coherent mapping
// (migrations, cache invalidates, etc.) in response to the given service block
// context
// context.
//
// service_context->block_context.policy is set by this function.
// service_context must not be NULL and service_context->block_context.policy
// must be valid. See the comments for uvm_va_block_check_policy_is_valid().
// If va_block is a HMM block, va_block_context->hmm.vma must be valid.
// See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// service_context->prefetch_hint is set by this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
@ -1197,8 +1254,8 @@ NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
// - va_block lock must be held
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock was
// unlocked and relocked.
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
// was unlocked and relocked.
//
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
// and the performance heuristics logic decided to throttle execution.
@ -1209,6 +1266,77 @@ NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_va_block_retry_t *block_retry,
uvm_service_block_context_t *service_context);
// Performs population of the destination pages, unmapping and copying source
// pages to new_residency.
//
// service_context must not be NULL and service_context->block_context.policy
// must be valid. See the comments for uvm_va_block_check_policy_is_valid().
// If va_block is a HMM block, va_block_context->hmm.vma must be valid.
// See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// service_context->prefetch_hint should be set before calling this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
// read mode, if valid.
// - va_space lock must be held in at least read mode
// - va_block lock must be held
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
// was unlocked and relocked.
//
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
// and the performance heuristics logic decided to throttle execution.
// Any other error code different than NV_OK indicates OOM or a global fatal
// error.
NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
uvm_processor_id_t new_residency,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *block_retry,
uvm_service_block_context_t *service_context);
// This updates the va_block residency state and maps the faulting processor_id
// to the new residency (which may be remote).
//
// service_context must not be NULL and service_context->block_context.policy
// must be valid. See the comments for uvm_va_block_check_policy_is_valid().
// If va_block is a HMM block, va_block_context->hmm.vma must be valid.
// See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// service_context must be initialized by calling uvm_va_block_service_copy()
// before calling this function.
//
// Locking:
// - service_context->block_context.mm->mmap_lock must be held in at least
// read mode, if valid.
// - va_space lock must be held in at least read mode
// - va_block lock must be held
// - the mmap lock and va_space lock must be held across the calls to
// uvm_va_block_service_copy() and this function. If the va_block lock is
// dropped in between, special care is needed to check for eviction and
// invalidation callbacks.
//
// If allocation-retry was required as part of the operation and was successful,
// NV_ERR_MORE_PROCESSING_REQUIRED is returned. In this case, the block's lock
// was unlocked and relocked.
//
// NV_WARN_MORE_PROCESSING_REQUIRED indicates that thrashing has been detected
// and the performance heuristics logic decided to throttle execution.
// Any other error code different than NV_OK indicates OOM or a global fatal
// error.
NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
uvm_va_block_t *va_block,
uvm_service_block_context_t *service_context);
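A condensed sketch of how the two service phases above are sequenced by a fault handler, assuming the mmap_lock, va_space lock, and va_block lock are held as documented; retry and thrashing handling are omitted:

    /* Phase 1: populate destination pages, unmap, and copy to new_residency. */
    status = uvm_va_block_service_copy(processor_id,
                                       new_residency,
                                       va_block,
                                       block_retry,
                                       service_context);
    if (status != NV_OK)
        return status;

    /* Phase 2: commit the residency change and map the faulting processor.
     * The mmap and va_space locks must be held across both phases. */
    status = uvm_va_block_service_finish(processor_id, va_block, service_context);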
// Allocate GPU state for the given va_block and registered GPUs.
// Locking: The block lock must be held.
NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block);
// Release any GPU or policy data associated with the given region in response
// to munmap().
// Locking: The va_block lock must be held.
void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
uvm_va_block_region_t region);
// Size of the block in bytes. Guaranteed to be a page-aligned value between
// PAGE_SIZE and UVM_VA_BLOCK_SIZE.
static inline NvU64 uvm_va_block_size(uvm_va_block_t *block)
@ -1275,7 +1403,7 @@ const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_proc
// VA block lookup functions. There are a number of permutations which might be
// useful, such as looking up the block from {va_space, va_range} x {addr,
// block index}. The ones implemented here and in uvm_va_range.h support the
// primary two use cases, which are:
// primary three use cases, which are:
// 1) Iterating over all VA blocks in a VA range. This uses block indices on the
// VA range:
// uvm_va_range_num_blocks
@ -1286,6 +1414,9 @@ const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_proc
// VA space and address:
// uvm_va_block_find
// uvm_va_block_find_create
// 3) Operating on a single VA block (fault). This looks up the block using the
// supplied VA range and address:
// uvm_va_block_find_create_in_range
// Finds the UVM or HMM VA block containing addr, if any. The va_space->lock
// must be held in at least read mode. Return values:
@ -1315,6 +1446,15 @@ NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
uvm_va_block_context_t *va_block_context,
uvm_va_block_t **out_block);
// Same as uvm_va_block_find_create except that va_range lookup was already done
// by the caller. If the supplied va_range is NULL, this function behaves just
// like when the va_range lookup in uvm_va_block_find_create is NULL.
NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
uvm_va_range_t *va_range,
NvU64 addr,
uvm_va_block_context_t *va_block_context,
uvm_va_block_t **out_block);
// Same as uvm_va_block_find_create except that only managed va_blocks are
// created if not already present in the VA range.
static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
@ -1324,15 +1464,10 @@ static NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
return uvm_va_block_find_create(va_space, addr, NULL, out_block);
}
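A minimal sketch of use case 3 from the list above, where the fault handler has already resolved the enclosing va_range (possibly NULL for HMM addresses) before creating the block; the surrounding fault-handler variables are assumed:

    status = uvm_va_block_find_create_in_range(va_space,
                                               va_range,
                                               fault_addr,
                                               va_block_context,
                                               &va_block);
    if (status != NV_OK)
        return status;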
// Look up a chunk backing a specific address within the VA block. Returns NULL if none.
// Look up a chunk backing a specific address within the VA block.
// Returns NULL if none.
uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address);
typedef enum
{
UVM_MIGRATE_MODE_MAKE_RESIDENT,
UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
} uvm_migrate_mode_t;
// Implementation of the UvmMigrate() API at the VA block scope.
//
// The out_tracker can be NULL.
@ -1345,6 +1480,8 @@ typedef enum
//
// va_block_context must not be NULL and va_block_context->policy must be valid.
// See the comments for uvm_va_block_check_policy_is_valid().
// If va_block is a HMM block, va_block_context->hmm.vma must be valid.
// See the comments for uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
//
// LOCKING: The caller must hold the va_block lock. If va_block_context->mm !=
// NULL, va_block_context->mm->mmap_lock must be held in at least
@ -1362,7 +1499,7 @@ NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
// The [dst, dst + size) range has to fit within a single PAGE_SIZE page.
//
// va_block_context must not be NULL. The caller is not required to set
// va_block_context->policy.
// va_block_context->policy or va_block_context->hmm.vma.
//
// The caller needs to support allocation-retry of page tables.
//
@ -1416,6 +1553,8 @@ static uvm_page_index_t uvm_va_block_cpu_page_index(uvm_va_block_t *va_block, Nv
// Computes the size and index in the gpu_state chunks array of the GPU chunk
// which corresponds to the given page_index of the VA region.
// Note this is only used for testing and does not work on HMM va_blocks as it
// returns incorrect results for those.
size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
NvU64 size,
uvm_gpu_t *gpu,
@ -1868,77 +2007,66 @@ static uvm_va_block_region_t uvm_va_block_chunk_region(uvm_va_block_t *block,
// Helpers for page state (permissions, size, residency)
//
// Compute the gpus that have at least the given access permissions for the
// range described by region and page_mask. The function sets the bit if any
// page in the region has the permissions.
void uvm_va_block_region_authorized_gpus(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_prot_t access_permission,
uvm_processor_mask_t *authorized_gpus);
// Compute the processors that have at least the given access permissions for the
// range described by region and page_mask. The function sets the bit if any
// page in the region has the permissions.
void uvm_va_block_region_authorized_processors(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_prot_t access_permission,
uvm_processor_mask_t *authorized_processors);
void uvm_va_block_page_authorized_gpus(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_prot_t access_permission,
uvm_processor_mask_t *authorized_gpus);
void uvm_va_block_page_authorized_processors(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_prot_t access_permission,
uvm_processor_mask_t *authorized_processors);
bool uvm_va_block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_gpu_id_t gpu_id,
uvm_prot_t required_prot);
bool uvm_va_block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_processor_id_t processor_id,
uvm_prot_t required_prot);
bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_gpu_id_t gpu_id,
uvm_prot_t required_prot);
bool uvm_va_block_page_is_processor_authorized(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_id_t processor_id,
uvm_prot_t required_prot);
// Compute the gpus that have a copy of the given page resident in their memory
void uvm_va_block_page_resident_gpus(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_mask_t *resident_gpus);
// Compute the processors that have a copy of the given page resident in their memory
// Compute the processors that have a copy of the given page resident in their
// memory.
void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_mask_t *resident_processors);
// Count how many processors have a copy of the given page resident in their memory
// Count how many processors have a copy of the given page resident in their
// memory.
NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index);
// Get the processor with a resident copy of a page closest to the given processor
// Get the processor with a resident copy of a page closest to the given
// processor.
uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_id_t processor);
uvm_processor_id_t uvm_va_block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
uvm_processor_id_t processor,
const uvm_processor_mask_t *processor_mask);
// Insert a CPU chunk at the given page_index into the va_block.
// Locking: The va_block lock must be held.
NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
// Remove a CPU chunk at the given page_index from the va_block.
// Locking: The va_block lock must be held.
void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Locking: The va_block lock must be held.
uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Return the CPU chunk at the given page_index from the va_block.
// Locking: The va_block lock must be held.
struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Physically map a CPU chunk so it is DMA'able from all registered GPUs.
// Locking: The va_block lock must be held.
NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Physically unmap a CPU chunk from all registered GPUs.
// Locking: The va_block lock must be held.
void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
uvm_page_index_t page_index);
// Remove any CPU chunks in the given region.
// Locking: The va_block lock must be held.
void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region);
// Get CPU page size or 0 if it is not mapped
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index);
NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block,
uvm_page_index_t page_index);
// Get GPU page size or 0 if it is not mapped on the given GPU
NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index);
@ -1999,14 +2127,14 @@ size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t pa
// and if the va_block is a HMM block, va_block_context->hmm.vma must be valid
// which also means the va_block_context->mm is not NULL, retained, and locked
// for at least read. See the comments for uvm_va_block_check_policy_is_valid()
// and uvm_hmm_va_block_context_vma_is_valid() in uvm_hmm.h.
// and uvm_hmm_check_context_vma_is_valid() in uvm_hmm.h.
// Locking: the va_block lock must be held.
uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_page_index_t page_index,
uvm_processor_id_t processor_id,
NvU32 access_type_mask,
uvm_va_policy_t *policy,
const uvm_va_policy_t *policy,
const uvm_perf_thrashing_hint_t *thrashing_hint,
uvm_service_operation_t operation,
bool *read_duplicate);

View File

@ -1,5 +1,5 @@
/*******************************************************************************
Copyright (c) 2016-2019 NVIDIA Corporation
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
@ -27,6 +27,11 @@
#include "uvm_common.h"
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "uvm_forward_decl.h"
#if UVM_IS_CONFIG_HMM()
#include <linux/migrate.h>
#endif
// UVM_VA_BLOCK_BITS is 21, meaning the maximum block size is 2MB. Rationale:
// - 2MB matches the largest Pascal GPU page size so it's a natural fit
@ -161,8 +166,12 @@ typedef struct
{
// Masks used internally
uvm_page_mask_t page_mask;
uvm_page_mask_t copy_resident_pages_between_mask;
uvm_page_mask_t copy_resident_pages_mask;
uvm_page_mask_t pages_staged;
// This is used to store which pages were successfully copied to the
// destination processor and used by uvm_va_block_make_resident_finish()
// to update the va_block state.
uvm_page_mask_t pages_migrated;
// Out mask filled in by uvm_va_block_make_resident to indicate which
@ -225,13 +234,24 @@ typedef struct
// the mm, such as creating CPU mappings.
struct mm_struct *mm;
uvm_va_policy_t *policy;
const uvm_va_policy_t *policy;
#if UVM_IS_CONFIG_HMM()
struct
{
// These are used for migrate_vma_*(), hmm_range_fault(), and
// make_device_exclusive_range() handling.
unsigned long src_pfns[PAGES_PER_UVM_VA_BLOCK];
union {
unsigned long dst_pfns[PAGES_PER_UVM_VA_BLOCK];
struct page *pages[PAGES_PER_UVM_VA_BLOCK];
};
// Cached VMA pointer. This is only valid while holding the mmap_lock.
struct vm_area_struct *vma;
// Used for migrate_vma_*() to migrate pages to/from GPU/CPU.
struct migrate_vma migrate_vma_args;
} hmm;
#endif
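A minimal sketch of how the hmm fields above are typically wired into the kernel's migrate_vma API; the pgmap_owner value and selection flag are assumptions, error paths are simplified, and the allocate/copy step is only indicated:

    struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args;
    int ret;

    args->vma         = va_block_context->hmm.vma;
    args->start       = start;
    args->end         = end;
    args->src         = va_block_context->hmm.src_pfns;
    args->dst         = va_block_context->hmm.dst_pfns;
    args->pgmap_owner = &g_uvm_global;              /* assumed owner token */
    args->flags       = MIGRATE_VMA_SELECT_SYSTEM;  /* e.g. migrating CPU pages */

    ret = migrate_vma_setup(args);
    if (ret)
        return errno_to_nv_status(ret);

    /* ... allocate destination pages, fill dst_pfns, and copy the data ... */

    migrate_vma_pages(args);
    /* Update va_block state only for pages that actually migrated, then: */
    migrate_vma_finalize(args);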
@ -242,8 +262,7 @@ typedef struct
typedef enum
{
UVM_VA_BLOCK_TRANSFER_MODE_MOVE = 1,
UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2,
UVM_VA_BLOCK_TRANSFER_MODE_COPY_ONLY = 3
UVM_VA_BLOCK_TRANSFER_MODE_COPY = 2
} uvm_va_block_transfer_mode_t;
struct uvm_reverse_map_struct
@ -266,4 +285,10 @@ typedef enum
UVM_SERVICE_OPERATION_ACCESS_COUNTERS,
} uvm_service_operation_t;
typedef enum
{
UVM_MIGRATE_MODE_MAKE_RESIDENT,
UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
} uvm_migrate_mode_t;
#endif

View File

@ -29,23 +29,23 @@
#include "uvm_va_space.h"
#include "uvm_va_range.h"
uvm_va_policy_t uvm_va_policy_default __read_mostly = {
const uvm_va_policy_t uvm_va_policy_default = {
.preferred_location = UVM_ID_INVALID,
.read_duplication = UVM_READ_DUPLICATION_UNSET,
};
bool uvm_va_policy_is_read_duplicate(uvm_va_policy_t *policy, uvm_va_space_t *va_space)
bool uvm_va_policy_is_read_duplicate(const uvm_va_policy_t *policy, uvm_va_space_t *va_space)
{
return policy->read_duplication == UVM_READ_DUPLICATION_ENABLED &&
uvm_va_space_can_read_duplicate(va_space, NULL);
}
uvm_va_policy_t *uvm_va_policy_get(uvm_va_block_t *va_block, NvU64 addr)
const uvm_va_policy_t *uvm_va_policy_get(uvm_va_block_t *va_block, NvU64 addr)
{
uvm_assert_mutex_locked(&va_block->lock);
if (uvm_va_block_is_hmm(va_block)) {
uvm_va_policy_node_t *node = uvm_va_policy_node_find(va_block, addr);
const uvm_va_policy_node_t *node = uvm_va_policy_node_find(va_block, addr);
return node ? &node->policy : &uvm_va_policy_default;
}
@ -63,14 +63,6 @@ static uvm_va_policy_node_t *uvm_va_policy_node_container(uvm_range_tree_node_t
return container_of(tree_node, uvm_va_policy_node_t, node);
}
static uvm_va_policy_t *uvm_va_policy_container(uvm_range_tree_node_t *tree_node)
{
if (!tree_node)
return NULL;
return &uvm_va_policy_node_container(tree_node)->policy;
}
NV_STATUS uvm_va_policy_init(void)
{
g_uvm_va_policy_node_cache = NV_KMEM_CACHE_CREATE("uvm_va_policy_node_t", uvm_va_policy_node_t);
@ -173,7 +165,7 @@ uvm_va_policy_node_t *uvm_va_policy_node_iter_next(uvm_va_block_t *va_block,
return uvm_va_policy_node_container(tree_node);
}
uvm_va_policy_t *uvm_va_policy_iter_first(uvm_va_block_t *va_block,
const uvm_va_policy_t *uvm_va_policy_iter_first(uvm_va_block_t *va_block,
NvU64 start,
NvU64 end,
uvm_va_policy_node_t **out_node,
@ -181,7 +173,7 @@ uvm_va_policy_t *uvm_va_policy_iter_first(uvm_va_block_t *va_block,
{
uvm_range_tree_node_t *tree_node;
uvm_va_policy_node_t *node;
uvm_va_policy_t *policy;
const uvm_va_policy_t *policy;
uvm_va_block_region_t region;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
@ -219,8 +211,8 @@ uvm_va_policy_t *uvm_va_policy_iter_first(uvm_va_block_t *va_block,
return policy;
}
uvm_va_policy_t *uvm_va_policy_iter_next(uvm_va_block_t *va_block,
uvm_va_policy_t *policy,
const uvm_va_policy_t *uvm_va_policy_iter_next(uvm_va_block_t *va_block,
const uvm_va_policy_t *policy,
NvU64 end,
uvm_va_policy_node_t **inout_node,
uvm_va_block_region_t *inout_region)
@ -234,7 +226,7 @@ uvm_va_policy_t *uvm_va_policy_iter_next(uvm_va_block_t *va_block,
next = uvm_va_policy_node_iter_next(va_block, node, end);
if (policy == &uvm_va_policy_default) {
if (uvm_va_policy_is_default(policy)) {
// We haven't used the current policy node yet so use it now.
next = node;
policy = &node->policy;
@ -563,4 +555,39 @@ NV_STATUS uvm_va_policy_set_range(uvm_va_block_t *va_block,
return NV_OK;
}
const uvm_va_policy_t *uvm_va_policy_set_preferred_location(uvm_va_block_t *va_block,
uvm_va_block_region_t region,
uvm_processor_id_t processor_id,
const uvm_va_policy_t *old_policy)
{
NvU64 start = uvm_va_block_region_start(va_block, region);
NvU64 end = uvm_va_block_region_end(va_block, region);
uvm_va_policy_node_t *node;
if (uvm_va_policy_is_default(old_policy)) {
UVM_ASSERT(!UVM_ID_IS_INVALID(processor_id));
UVM_ASSERT(!uvm_range_tree_iter_first(&va_block->hmm.va_policy_tree, start, end));
node = uvm_va_policy_node_create(va_block, start, end);
if (!node)
return NULL;
}
else {
// Since the old_policy isn't the constant default policy, we know it
// is an allocated uvm_va_policy_node_t and can be cast.
node = container_of((uvm_va_policy_t *)old_policy, uvm_va_policy_node_t, policy);
// The caller guarantees that the policy node doesn't require splitting
// and that the policy is changing.
UVM_ASSERT(node->node.start >= start);
UVM_ASSERT(node->node.end <= end);
UVM_ASSERT(!uvm_id_equal(node->policy.preferred_location, processor_id));
}
node->policy.preferred_location = processor_id;
return &node->policy;
}
#endif // UVM_IS_CONFIG_HMM()
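A short sketch of how a caller might use uvm_va_policy_set_preferred_location() above after splitting any existing policy nodes at the region boundaries; the surrounding iteration and split handling are omitted:

    const uvm_va_policy_t *old_policy;
    const uvm_va_policy_t *new_policy;

    old_policy = uvm_va_policy_get(va_block,
                                   uvm_va_block_region_start(va_block, region));

    new_policy = uvm_va_policy_set_preferred_location(va_block,
                                                      region,
                                                      preferred_location,
                                                      old_policy);
    if (!new_policy)
        return NV_ERR_NO_MEMORY;    /* policy node allocation failed */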

Some files were not shown because too many files have changed in this diff