/* _NVRM_COPYRIGHT_BEGIN_
 *
 * Copyright 1999-2019 by NVIDIA Corporation.  All rights reserved.  All
 * information contained herein is proprietary and confidential to NVIDIA
 * Corporation.  Any use, reproduction, or disclosure without the written
 * permission of NVIDIA Corporation is prohibited.
 *
 * _NVRM_COPYRIGHT_END_
 */

#include "nv-misc.h"
#include "nvmisc.h"
#include "os-interface.h"
#include "nv-linux.h"
#include "nv-p2p.h"
#include "nv-reg.h"
#include "rmil.h"
#include "nv-instance.h"
#include "nv-msi.h"
#include "nvlink_proto.h"

#if defined(NV_UVM_ENABLE)
#include "nv_uvm_interface.h"
#endif

#if defined(NV_VGPU_KVM_BUILD)
#include "nv-vgpu-vfio-interface.h"
#endif

#include "nv-frontend.h"
#include "nv-hypervisor.h"
#include "nv-ibmnpu.h"
#include "nv-rsync.h"
#include "nv-kthread-q.h"
#include "nv-pat.h"

#if !defined(CONFIG_RETPOLINE)
#include "nv-retpoline.h"
#endif

/*
 * The module information macros for Linux single-module builds
 * are present in nv-frontend.c.
 */

#if (NV_BUILD_MODULE_INSTANCES != 0)
#if defined(MODULE_LICENSE)
MODULE_LICENSE("NVIDIA");
#endif
#if defined(MODULE_INFO)
MODULE_INFO(supported, "external");
#endif
#if defined(MODULE_VERSION)
MODULE_VERSION(NV_VERSION_STRING);
#endif
#ifdef MODULE_ALIAS_CHARDEV_MAJOR
MODULE_ALIAS_CHARDEV_MAJOR(NV_MAJOR_DEVICE_NUMBER);
#endif
#endif

#include "conftest/patches.h"

/*
 * our global state; one per device
 */

static NvU32 num_nv_devices = 0;
NvU32 num_probed_nv_devices = 0;

NvU32 nv_assign_gpu_count = 0;
nv_pci_info_t nv_assign_gpu_pci_info[NV_MAX_DEVICES];

nv_linux_state_t *nv_linux_devices;

/*
 * And one for the control device
 */

nv_linux_state_t nv_ctl_device = { { 0 } };
wait_queue_head_t nv_ctl_waitqueue;

nv_kthread_q_t nv_kthread_q;
nv_kthread_q_t nv_deferred_close_kthread_q;

nv_cpu_type_t nv_cpu_type = NV_CPU_TYPE_UNKNOWN;

struct rw_semaphore nv_system_pm_lock;

#if defined(CONFIG_PM)
static nv_power_state_t nv_system_power_state;
static nv_pm_action_depth_t nv_system_pm_action_depth;
struct semaphore nv_system_power_state_lock;
#endif

void *nvidia_p2p_page_t_cache;
static void *nvidia_pte_t_cache;
void *nvidia_stack_t_cache;
static nvidia_stack_t *__nv_init_sp;

static int nv_tce_bypass_mode = NV_TCE_BYPASS_MODE_DEFAULT;

struct semaphore nv_linux_devices_lock;

static NvTristate nv_chipset_is_io_coherent = NV_TRISTATE_INDETERMINATE;

// True if all the successfully probed devices support ATS
// Assigned at device probe (module init) time
NvBool nv_ats_supported = NVCPU_IS_PPC64LE;

// allow an easy way to convert all debug printfs related to events
// back and forth between 'info' and 'errors'
#if defined(NV_DBG_EVENTS)
#define NV_DBG_EVENTINFO NV_DBG_ERRORS
#else
#define NV_DBG_EVENTINFO NV_DBG_INFO
#endif

/***
 *** STATIC functions, only in this file
 ***/

/* nvos_ functions.. do not take a state device parameter  */
static int      nvos_count_devices(nvidia_stack_t *);

static nv_alloc_t  *nvos_create_alloc(struct device *, int);
static int          nvos_free_alloc(nv_alloc_t *);

/* lock-related functions that should only be called from this file */
static NvBool nv_lock_init_locks(nvidia_stack_t *sp, nv_state_t *nv);
static void   nv_lock_destroy_locks(nvidia_stack_t *sp, nv_state_t *nv);


/***
 *** EXPORTS to Linux Kernel
 ***/

static long          nvidia_unlocked_ioctl  (struct file *, unsigned int, unsigned long);
static irqreturn_t   nvidia_isr_common_bh   (void *);
static void          nvidia_isr_bh_unlocked (void *);
static int           nvidia_ctl_open        (struct inode *, struct file *);
static int           nvidia_ctl_close       (struct inode *, struct file *);

#if defined(NV_PCI_ERROR_RECOVERY)
static pci_ers_result_t nvidia_pci_error_detected   (struct pci_dev *, enum pci_channel_state);
static pci_ers_result_t nvidia_pci_mmio_enabled     (struct pci_dev *);

struct pci_error_handlers nv_pci_error_handlers = {
    .error_detected = nvidia_pci_error_detected,
    .mmio_enabled   = nvidia_pci_mmio_enabled,
};
#endif

/***
 *** see nv.h for functions exported to other parts of resman
 ***/

/***
 *** STATIC functions
 ***/

static
nv_alloc_t *nvos_create_alloc(
    struct device *dev,
    int num_pages
)
{
    nv_alloc_t *at;
    unsigned int pt_size, i;

    NV_KMALLOC(at, sizeof(nv_alloc_t));
    if (at == NULL)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate alloc info\n");
        return NULL;
    }

    memset(at, 0, sizeof(nv_alloc_t));

    at->dev = dev;
    pt_size = num_pages *  sizeof(nvidia_pte_t *);
    if (os_alloc_mem((void **)&at->page_table, pt_size) != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate page table\n");
        NV_KFREE(at, sizeof(nv_alloc_t));
        return NULL;
    }

    memset(at->page_table, 0, pt_size);
    at->num_pages = num_pages;
    NV_ATOMIC_SET(at->usage_count, 0);

    for (i = 0; i < at->num_pages; i++)
    {
        at->page_table[i] = NV_KMEM_CACHE_ALLOC(nvidia_pte_t_cache);
        if (at->page_table[i] == NULL)
        {
            nv_printf(NV_DBG_ERRORS,
                      "NVRM: failed to allocate page table entry\n");
            nvos_free_alloc(at);
            return NULL;
        }
        memset(at->page_table[i], 0, sizeof(nvidia_pte_t));
    }

    at->pid = os_get_current_process();

    return at;
}

static
int nvos_free_alloc(
    nv_alloc_t *at
)
{
    unsigned int i;

    if (at == NULL)
        return -1;

    if (NV_ATOMIC_READ(at->usage_count))
        return 1;

    for (i = 0; i < at->num_pages; i++)
    {
        if (at->page_table[i] != NULL)
            NV_KMEM_CACHE_FREE(at->page_table[i], nvidia_pte_t_cache);
    }
    os_free_mem(at->page_table);

    NV_KFREE(at, sizeof(nv_alloc_t));

    return 0;
}

NvU8 nv_find_pci_capability(struct pci_dev *pci_dev, NvU8 capability)
{
    u16 status = 0;
    u8  cap_ptr = 0, cap_id = 0xff;

    pci_read_config_word(pci_dev, PCI_STATUS, &status);
    status &= PCI_STATUS_CAP_LIST;
    if (!status)
        return 0;

    switch (pci_dev->hdr_type) {
        case PCI_HEADER_TYPE_NORMAL:
        case PCI_HEADER_TYPE_BRIDGE:
            pci_read_config_byte(pci_dev, PCI_CAPABILITY_LIST, &cap_ptr);
            break;
        default:
            return 0;
    }

    do {
        cap_ptr &= 0xfc;
        pci_read_config_byte(pci_dev, cap_ptr + PCI_CAP_LIST_ID, &cap_id);
        if (cap_id == capability)
            return cap_ptr;
        pci_read_config_byte(pci_dev, cap_ptr + PCI_CAP_LIST_NEXT, &cap_ptr);
    } while (cap_ptr && cap_id != 0xff);

    return 0;
}

/*!
 * @brief This function accepts pci information corresponding to a GPU
 * and returns a reference to the nv_linux_state_t corresponding to that GPU.
 *
 * @param[in] domain            Pci domain number for the GPU to be found.
 * @param[in] bus               Pci bus number for the GPU to be found.
 * @param[in] slot              Pci slot number for the GPU to be found.
 * @param[in] function          Pci function number for the GPU to be found.
 *
 * @return Pointer to nv_linux_state_t for the GPU if it is found, or NULL otherwise.
 */
nv_linux_state_t * find_pci(NvU32 domain, NvU8 bus, NvU8 slot, NvU8 function)
{
    nv_linux_state_t *nvl = NULL;

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        nv_state_t *nv = NV_STATE_PTR(nvl);

        if (nv->pci_info.domain == domain &&
            nv->pci_info.bus == bus &&
            nv->pci_info.slot == slot &&
            nv->pci_info.function == function)
        {
            break;
        }
    }

    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

static void
nv_module_resources_exit(nv_stack_t *sp)
{
    nv_mem_pool_destroy();
    nv_heap_destroy();

    nv_kmem_cache_free_stack(sp);

    NV_KMEM_CACHE_DESTROY(nvidia_p2p_page_t_cache);
    NV_KMEM_CACHE_DESTROY(nvidia_pte_t_cache);
    NV_KMEM_CACHE_DESTROY(nvidia_stack_t_cache);
}

static int
nv_module_resources_init(nv_stack_t **sp)
{
    int rc = -ENOMEM;

    nvidia_stack_t_cache = NV_KMEM_CACHE_CREATE(nvidia_stack_cache_name,
                                                nvidia_stack_t);
    if (nvidia_stack_t_cache == NULL)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: nvidia_stack_t cache allocation failed.\n");
        goto exit;
    }

    nvidia_pte_t_cache = NV_KMEM_CACHE_CREATE(nvidia_pte_cache_name,
                                              nvidia_pte_t);
    if (nvidia_pte_t_cache == NULL)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: nvidia_pte_t cache allocation failed.\n");
        goto exit;
    }

    if (!nv_multiple_kernel_modules)
    {
        nvidia_p2p_page_t_cache = NV_KMEM_CACHE_CREATE(nvidia_p2p_page_cache_name,
                                                       nvidia_p2p_page_t);
        if (nvidia_p2p_page_t_cache == NULL)
        {
            nv_printf(NV_DBG_ERRORS,
                      "NVRM: nvidia_p2p_page_t cache allocation failed.\n");
            goto exit;
        }
    }

    rc = nv_kmem_cache_alloc_stack(sp);
    if (rc < 0)
    {
        goto exit;
    }

    rc = nv_heap_create();
    if (rc < 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: heap creation failed.\n");
        goto exit;
    }

    rc = nv_mem_pool_create();
    if (rc < 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: memory pool creation failed.\n");
        nv_heap_destroy();
    }

exit:
    if (rc < 0)
    {
        nv_kmem_cache_free_stack(*sp);

        NV_KMEM_CACHE_DESTROY(nvidia_p2p_page_t_cache);
        NV_KMEM_CACHE_DESTROY(nvidia_pte_t_cache);
        NV_KMEM_CACHE_DESTROY(nvidia_stack_t_cache);
    }

    return rc;
}

static void
nvlink_drivers_exit(void)
{
    nv_destroy_rsync_info();


#if NVCPU_IS_64_BITS
    nvswitch_exit();
#endif


#if defined(NVCPU_PPC64LE)
    ibmnpu_exit();
#endif

    nvlink_core_exit();
}

static int
nvlink_drivers_init(void)
{
    int rc;

    rc = nvlink_core_init();
    if (rc < 0)
    {
        nv_printf(NV_DBG_INFO, "NVRM: NVLink core init failed.\n");
        return rc;
    }

#if defined(NVCPU_PPC64LE)
    rc = ibmnpu_init();
    if (rc < 0)
    {
        nv_printf(NV_DBG_INFO, "NVRM: IBM NPU init failed.\n");
        nvlink_core_exit();
        return rc;
    }
#endif


#if NVCPU_IS_64_BITS
    rc = nvswitch_init();
    if (rc < 0)
    {
        nv_printf(NV_DBG_INFO, "NVRM: NVSwitch init failed.\n");
#if defined(NVCPU_PPC64LE)
        ibmnpu_exit();
#endif
        nvlink_core_exit();
    }
#endif


    nv_init_rsync_info();

    return 0;
}

static void
nv_module_state_exit(nv_stack_t *sp)
{
    nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);

    nv_teardown_pat_support();

    nv_kthread_q_stop(&nv_deferred_close_kthread_q);
    nv_kthread_q_stop(&nv_kthread_q);

    nv_lock_destroy_locks(sp, nv);
}

static int
nv_module_state_init(nv_stack_t *sp)
{
    int rc;
    nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);

    nv->os_state = (void *)&nv_ctl_device;

    if (!nv_lock_init_locks(sp, nv))
    {
        return -ENOMEM;
    }

    rc = nv_kthread_q_init(&nv_kthread_q, "nv_queue");
    if (rc != 0)
    {
        goto exit;
    }

    rc = nv_kthread_q_init(&nv_deferred_close_kthread_q, "nv_queue");
    if (rc != 0)
    {
        nv_kthread_q_stop(&nv_kthread_q);
        goto exit;
    }

    rc = nv_init_pat_support(sp);
    if (rc < 0)
    {
        nv_kthread_q_stop(&nv_deferred_close_kthread_q);
        nv_kthread_q_stop(&nv_kthread_q);
        goto exit;
    }

    nv_linux_devices = NULL;
    NV_INIT_MUTEX(&nv_linux_devices_lock);
    init_rwsem(&nv_system_pm_lock);

#if defined(CONFIG_PM)
    NV_INIT_MUTEX(&nv_system_power_state_lock);
    nv_system_power_state = NV_POWER_STATE_RUNNING;
    nv_system_pm_action_depth = NV_PM_ACTION_DEPTH_DEFAULT;
#endif

exit:
    if (rc < 0)
    {
        nv_lock_destroy_locks(sp, nv);
    }

    return rc;
}

static void
nv_registry_keys_init(nv_stack_t *sp)
{
    NV_STATUS status;
    nv_state_t *nv = NV_STATE_PTR(&nv_ctl_device);
    NvU32 data;

    /*
     * Determine the TCE bypass mode here so it can be used during
     * device probe.  Also determine whether we should allow
     * user-mode NUMA onlining of device memory.
     */
    if (NVCPU_IS_PPC64LE)
    {
        status = rm_read_registry_dword(sp, nv,
                                        "NVreg", NV_REG_TCE_BYPASS_MODE,
                                        &data);
        if ((status == NV_OK) && ((int)data != NV_TCE_BYPASS_MODE_DEFAULT))
        {
            nv_tce_bypass_mode = data;
        }

        if (NVreg_EnableUserNUMAManagement)
        {
            /* Force on the core RM registry key to match. */
            status = rm_write_registry_dword(sp, nv,
                                             "NVreg", "RMNumaOnlining", 1);
            WARN_ON(status != NV_OK);
        }
    }

    /* Parse and set any per-GPU registry keys specified. */
    nv_parse_per_device_option_string(sp);
}

static void
nv_report_applied_patches(void)
{
    unsigned i;

    for (i = 0; __nv_patches[i].short_description; i++)
    {
        if (i == 0)
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: Applied patches:\n");
        }

        nv_printf(NV_DBG_ERRORS,
            "NVRM:    Patch #%d: %s\n", i + 1, __nv_patches[i].short_description);
    }
}

static void
nv_drivers_and_procfs_exit(void)
{
    pci_unregister_driver(&nv_pci_driver);

    nv_unregister_procfs();

    nv_unregister_chrdev((void *)&nv_fops);
}

static int
nv_drivers_and_procfs_init(void)
{
    int rc;

    rc = nv_register_chrdev((void *)&nv_fops);
    if (rc < 0)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: failed to register character device.\n");
        return rc;
    }

    rc = nv_register_procfs();
    if (rc < 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to register procfs.\n");
        goto exit;
    }

    rc = nv_pci_register_driver(&nv_pci_driver);
    if (rc < 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA devices found.\n");
        nv_unregister_procfs();
        rc = -ENODEV;
    }

exit:
    if (rc < 0)
    {
        nv_unregister_chrdev((void *)&nv_fops);
    }

    return rc;
}

static void
nv_module_exit(nv_stack_t *sp)
{
    nv_module_state_exit(sp);

    rm_shutdown_rm(sp);

    nvlink_drivers_exit();

    nv_module_resources_exit(sp);
}

static int
nv_module_init(nv_stack_t **sp)
{
    int rc;

    rc = nv_module_resources_init(sp);
    if (rc < 0)
    {
        return rc;
    }

    rc = nvlink_drivers_init();
    if (rc < 0)
    {
        goto exit;
    }

    if (!rm_init_rm(*sp))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: rm_init_rm() failed!\n");
        nvlink_drivers_exit();
        goto exit;
    }

    rc = nv_module_state_init(*sp);
    if (rc < 0)
    {
        rm_shutdown_rm(*sp);
        nvlink_drivers_exit();
    }

exit:
    if (rc < 0)
    {
        nv_module_resources_exit(*sp);
    }

    return rc;
}

/* 
 * In this function we check for the cases where GPU blacklist is not
 * honored, and issue a warning.
 *
 * Only GPUs that support a mechanism to query UUID prior to
 * initializing the GPU can be blacklisted, so that we can detect and
 * blacklist them during device probe.  This function checks than an
 * initialized GPU was not specified in the blacklist, and issues a
 * warning if so.
 */
static void
nv_assert_not_in_gpu_blacklist(
    nvidia_stack_t *sp,
    nv_state_t *nv
)
{
    NvU8 *uuid;
    NvU32 len;
    NV_STATUS rmStatus;

    if (NV_IS_GVI_DEVICE(nv))
        return;

    rmStatus = rm_get_gpu_uuid(sp, nv, &uuid, &len);
    if (rmStatus != NV_OK)
    {
        NV_DEV_PRINTF_STATUS(NV_DBG_INFO, nv, rmStatus, "Unable to read UUID");
        return;
    }

    if (nv_is_uuid_in_gpu_blacklist(uuid))
    {
        NV_DEV_PRINTF(NV_DBG_WARNINGS, nv,
                      "Could not blacklist %s because PBI is not supported\n",
                      uuid);
        WARN_ON(1);
    }

    os_free_mem(uuid);

    return;
}

static void
nv_check_and_blacklist_gpu(
    nvidia_stack_t *sp,
    nv_state_t *nv
)
{
    NV_STATUS rm_status;
    NvU8 *uuid_str;

    if (NV_IS_GVI_DEVICE(nv))
        return;

    rm_status = rm_get_gpu_uuid(sp, nv, &uuid_str, NULL);
    if (rm_status != NV_OK)
    {
        NV_DEV_PRINTF_STATUS(NV_DBG_INFO, nv, rm_status, "Unable to read UUID");
        return;
    }

    if (nv_is_uuid_in_gpu_blacklist(uuid_str))
    {
        rm_status = rm_blacklist_adapter(sp, nv);
        if (rm_status != NV_OK)
        {
            NV_DEV_PRINTF_STATUS(NV_DBG_ERRORS, nv, rm_status,
                          "Failed to blacklist %s", uuid_str);
            goto done;
        }
        nv->flags |= NV_FLAG_BLACKLIST;
        NV_DEV_PRINTF(NV_DBG_INFO, nv, "Blacklisted %s successfully\n",
                      uuid_str);
    }

done:
    os_free_mem(uuid_str);
}


int __init nvidia_init_module(void)
{
    int rc;
    NvU32 count;
    nvidia_stack_t *sp = NULL;

    if (nv_multiple_kernel_modules)
    {
        nv_printf(NV_DBG_INFO, "NVRM: nvidia module instance %d\n",
                  nv_module_instance);
    }

    nv_user_map_init();
    nv_memdbg_init();

    rc = nv_module_init(&sp);
    if (rc < 0)
    {
        return rc;
    }

    count = nvos_count_devices(sp);
    if (count == 0)
    {
        if (NV_IS_ASSIGN_GPU_PCI_INFO_SPECIFIED())
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: The requested GPU assignments are invalid. Please ensure\n"
                "NVRM: that the GPUs you wish to assign to this kernel module\n"
                "NVRM: are present and available.\n");
        }
        else
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA graphics adapter found!\n");
        }
        rc = -ENODEV;
        goto exit;
    }

    rc = nv_drivers_and_procfs_init();
    if (rc < 0)
    {
        goto exit;
    }

    if (num_probed_nv_devices != count)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: The NVIDIA probe routine was not called for %d device(s).\n",
            count - num_probed_nv_devices);
        nv_printf(NV_DBG_ERRORS,
            "NVRM: This can occur when a driver such as: \n"
            "NVRM: nouveau, rivafb, nvidiafb or rivatv "
#if (NV_BUILD_MODULE_INSTANCES != 0)
            "NVRM: or another NVIDIA kernel module "
#endif
            "\nNVRM: was loaded and obtained ownership of the NVIDIA device(s).\n");
        nv_printf(NV_DBG_ERRORS,
            "NVRM: Try unloading the conflicting kernel module (and/or\n"
            "NVRM: reconfigure your kernel without the conflicting\n"
            "NVRM: driver(s)), then try loading the NVIDIA kernel module\n"
            "NVRM: again.\n");
    }

    if (num_probed_nv_devices == 0)
    {
        rc = -ENODEV;
        nv_printf(NV_DBG_ERRORS, "NVRM: No NVIDIA devices probed.\n");
        nv_drivers_and_procfs_exit();
        goto exit;
    }

    if (num_probed_nv_devices != num_nv_devices)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: The NVIDIA probe routine failed for %d device(s).\n",
            num_probed_nv_devices - num_nv_devices);
    }

    if (num_nv_devices == 0)
    {
        rc = -ENODEV;
        nv_printf(NV_DBG_ERRORS,
            "NVRM: None of the NVIDIA devices were initialized.\n");
        nv_drivers_and_procfs_exit();
        goto exit;
    }

    /*
     * Initialize registry keys after PCI driver registration has
     * completed successfully to support per-device module
     * parameters.
     */
    nv_registry_keys_init(sp);

    nv_report_applied_patches();

    nv_printf(NV_DBG_ERRORS, "NVRM: loading %s\n", pNVRM_ID);

#if defined(NVCPU_X86_64) && defined(CONFIG_IA32_EMULATION) && \
  !defined(NV_FILE_OPERATIONS_HAS_COMPAT_IOCTL)
    rm_register_compatible_ioctls(sp);
#endif

#if defined(NV_UVM_ENABLE)
    rc = nv_uvm_init();
    if (rc != 0)
    {
        nv_drivers_and_procfs_exit();
        goto exit;
    }
#endif

    __nv_init_sp = sp;

exit:
    if (rc < 0)
    {
        nv_module_exit(sp);
    }

    return rc;
}

void nvidia_exit_module(void)
{
    nvidia_stack_t *sp = __nv_init_sp;

#if defined(NV_UVM_ENABLE)
    nv_uvm_exit();
#endif

#if defined(NVCPU_X86_64) && defined(CONFIG_IA32_EMULATION) && \
  !defined(NV_FILE_OPERATIONS_HAS_COMPAT_IOCTL)
    rm_unregister_compatible_ioctls(sp);
#endif

    nv_drivers_and_procfs_exit();

    nv_module_exit(sp);

    nv_memdbg_exit();
}

/*
 * Module entry and exit functions for Linux single-module builds
 * are present in nv-frontend.c.
 */

#if (NV_BUILD_MODULE_INSTANCES != 0)
module_init(nvidia_init_module);
module_exit(nvidia_exit_module);
#endif

void *nv_alloc_file_private(void)
{
    nv_file_private_t *nvfp;
    unsigned int i;

    NV_KMALLOC(nvfp, sizeof(nv_file_private_t));
    if (!nvfp)
        return NULL;

    memset(nvfp, 0, sizeof(nv_file_private_t));

    for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
    {
        NV_INIT_MUTEX(&nvfp->fops_sp_lock[i]);
    }
    init_waitqueue_head(&nvfp->waitqueue);
    NV_SPIN_LOCK_INIT(&nvfp->fp_lock);

    return nvfp;
}

void nv_free_file_private(nv_file_private_t *nvfp)
{
    nvidia_event_t *nvet;

    if (nvfp == NULL)
        return;

    for (nvet = nvfp->event_head; nvet != NULL; nvet = nvfp->event_head)
    {
        nvfp->event_head = nvfp->event_head->next;
        NV_KFREE(nvet, sizeof(nvidia_event_t));
    }

    if (nvfp->mmap_context.page_array != NULL)
    {
        os_free_mem(nvfp->mmap_context.page_array);
    }

    NV_KFREE(nvfp, sizeof(nv_file_private_t));
}


static int nv_is_control_device(
    struct inode *inode
)
{
    return (minor((inode)->i_rdev) == \
            (NV_CONTROL_DEVICE_MINOR - nv_module_instance));
}

/*
 * Search the global list of nv devices for the one with the given minor device
 * number. If found, nvl is returned with nvl->ldata_lock taken.
 */
static nv_linux_state_t *find_minor(NvU32 minor)
{
    nv_linux_state_t *nvl;

    LOCK_NV_LINUX_DEVICES();
    nvl = nv_linux_devices;
    while (nvl != NULL)
    {
        if (nvl->minor_num == minor)
        {
            down(&nvl->ldata_lock);
            break;
        }
        nvl = nvl->next;
    }

    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

/*
 * Search the global list of nv devices for the one with the given gpu_id.
 * If found, nvl is returned with nvl->ldata_lock taken.
 */
static nv_linux_state_t *find_gpu_id(NvU32 gpu_id)
{
    nv_linux_state_t *nvl;

    LOCK_NV_LINUX_DEVICES();
    nvl = nv_linux_devices;
    while (nvl != NULL)
    {
        nv_state_t *nv = NV_STATE_PTR(nvl);
        if (nv->gpu_id == gpu_id)
        {
            down(&nvl->ldata_lock);
            break;
        }
        nvl = nvl->next;
    }

    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

/*
 * Search the global list of nv devices for the one with the given UUID. Devices
 * with missing UUID information are ignored. If found, nvl is returned with
 * nvl->ldata_lock taken.
 */
static nv_linux_state_t *find_uuid(const NvU8 *uuid)
{
    nv_linux_state_t *nvl = NULL;
    nv_state_t *nv;
    const NvU8 *dev_uuid;

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl; nvl = nvl->next)
    {
        nv = NV_STATE_PTR(nvl);
        down(&nvl->ldata_lock);
        dev_uuid = nv_get_cached_uuid(nv);
        if (dev_uuid && memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
            goto out;
        up(&nvl->ldata_lock);
    }

out:
    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

/*
 * Search the global list of nv devices. The search logic is:
 *
 * 1) If any device has the given UUID, return it
 *
 * 2) If no device has the given UUID but at least one non-GVI device is missing
 *    its UUID (for example because rm_init_adapter has not run on it yet),
 *    return that device.
 *
 * 3) If no device has the given UUID and all UUIDs are present, return NULL.
 *
 * In cases 1 and 2, nvl is returned with nvl->ldata_lock taken.
 *
 * The reason for this weird logic is because UUIDs aren't always available. See
 * bug 1642200.
 */
static nv_linux_state_t *find_uuid_candidate(const NvU8 *uuid)
{
    nv_linux_state_t *nvl = NULL;
    nv_state_t *nv;
    const NvU8 *dev_uuid;
    int use_missing;
    int has_missing = 0;

    LOCK_NV_LINUX_DEVICES();

    /*
     * Take two passes through the list. The first pass just looks for the UUID.
     * The second looks for the target or missing UUIDs. It would be nice if
     * this could be done in a single pass by remembering which nvls are missing
     * UUIDs, but we have to hold the nvl lock after we check for the UUID.
     */
    for (use_missing = 0; use_missing <= 1; use_missing++)
    {
        for (nvl = nv_linux_devices; nvl; nvl = nvl->next)
        {
            nv = NV_STATE_PTR(nvl);
            down(&nvl->ldata_lock);
            dev_uuid = nv_get_cached_uuid(nv);
            if (dev_uuid)
            {
                /* Case 1: If a device has the given UUID, return it */
                if (memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
                    goto out;
            }
            else if (!NV_IS_GVI_DEVICE(nv))
            {
                /* Case 2: If no device has the given UUID but at least one non-
                 * GVI device is missing its UUID, return that device. */
                if (use_missing)
                    goto out;
                has_missing = 1;
            }
            up(&nvl->ldata_lock);
        }

        /* Case 3: If no device has the given UUID and all UUIDs are present,
         * return NULL. */
        if (!has_missing)
            break;
    }

out:
    UNLOCK_NV_LINUX_DEVICES();
    return nvl;
}

static void nv_dev_free_stacks(nv_linux_state_t *nvl)
{
    NvU32 i;
    for (i = 0; i < NV_DEV_STACK_COUNT; i++)
    {
        if (nvl->sp[i])
        {
            nv_kmem_cache_free_stack(nvl->sp[i]);
            nvl->sp[i] = NULL;
        }
    }
}

static int nv_dev_alloc_stacks(nv_linux_state_t *nvl)
{
    NvU32 i;
    int rc;

    for (i = 0; i < NV_DEV_STACK_COUNT; i++)
    {
        rc = nv_kmem_cache_alloc_stack(&nvl->sp[i]);
        if (rc != 0)
        {
            nv_dev_free_stacks(nvl);
            return rc;
        }
    }

    return 0;
}

static int validate_numa_start_state(nv_linux_state_t *nvl)
{
    int rc = 0;
    int numa_status = nv_get_numa_status(nvl);

    if (numa_status != NV_IOCTL_NUMA_STATUS_DISABLED)
    {
        if (nv_ctl_device.numa_memblock_size == 0)
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: numa memblock size of zero "
                      "found during device start");
            rc = -EINVAL;
        }
        else
        {
            /* Keep the individual devices consistent with the control device */
            nvl->numa_memblock_size = nv_ctl_device.numa_memblock_size;
        }
    }

    return rc;
}

/*
 * Brings up the device on the first file open. Assumes nvl->ldata_lock is held.
 */
static int nv_start_device(nv_state_t *nv, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
    NvU32 msi_config = 0;
#endif
    int rc = 0;
    NvBool kthread_init = NV_FALSE;
    NvBool power_ref = NV_FALSE;

    rc = nv_get_rsync_info();
    if (rc != 0)
    {
        return rc;
    }

    rc = validate_numa_start_state(nvl);
    if (rc != 0)
    {
        goto failed;
    }

    if (nv->pci_info.device_id == 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: open of nonexistent "
                  "device bearing minor number %d\n", nvl->minor_num);
        rc = -ENXIO;
        goto failed;
    }

    if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
    {
        if (rm_ref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE) != NV_OK)
        {
            goto failed;
        }
        power_ref = NV_TRUE;
    }

    rc = nv_init_ibmnpu_devices(nv);
    if (rc != 0)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: failed to initialize ibmnpu devices attached to device bearing minor number %d\n",
            nvl->minor_num);
        goto failed;
    }

    if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
    {
        rc = nv_dev_alloc_stacks(nvl);
        if (rc != 0)
            goto failed;
    }

#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
    if (!NV_IS_GVI_DEVICE(nv))
    {
        if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
        {
            rm_read_registry_dword(sp, nv, "NVreg", NV_REG_ENABLE_MSI,
                                   &msi_config);
            if (msi_config == 1)
            {
                if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSIX))
                {
                    nv_init_msix(nv);
                }
                if (pci_find_capability(nvl->pci_dev, PCI_CAP_ID_MSI) &&
                    !(nv->flags & NV_FLAG_USES_MSIX))
                {
                    nv_init_msi(nv);
                }
            }
        }
    }
#endif

    if (((!(nv->flags & NV_FLAG_USES_MSI)) && (!(nv->flags & NV_FLAG_USES_MSIX)))
        && (nvl->pci_dev->irq == 0))
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                      "No interrupts of any type are available. Cannot use this GPU.\n");
        rc = -EIO;
        goto failed;
    }

    if (NV_IS_GVI_DEVICE(nv))
    {
        rc = request_irq(nv->interrupt_line, nv_gvi_kern_isr, nv_default_irq_flags(nv),
                         nv_device_name, (void *)nvl);
        if (rc == 0)
        {
            INIT_WORK(&nvl->work.task, nv_gvi_kern_bh);
            nvl->work.data = nvl;

            rm_init_gvi_device(sp, nv);
            goto done;
        }
    }
    else
    {
        rc = 0;
        if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
        {
            if (!(nv->flags & NV_FLAG_USES_MSIX))
            {
                rc = request_threaded_irq(nv->interrupt_line, nvidia_isr,
                                      nvidia_isr_kthread_bh, nv_default_irq_flags(nv),
                                      nv_device_name, (void *)nvl);
            }
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
            else
            {
                rc = nv_request_msix_irq(nvl);
            }
#endif
        }
    }
    if (rc != 0)
    {
        if ((nv->interrupt_line != 0) && (rc == -EBUSY))
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                "Tried to get IRQ %d, but another driver\n",
                (unsigned int) nv->interrupt_line);
            nv_printf(NV_DBG_ERRORS, "NVRM: has it and is not sharing it.\n");
            nv_printf(NV_DBG_ERRORS, "NVRM: You may want to verify that no audio driver");
            nv_printf(NV_DBG_ERRORS, " is using the IRQ.\n");
        }
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "request_irq() failed (%d)\n", rc);
        goto failed;
    }

    if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
    {
        rc = os_alloc_mutex(&nvl->isr_bh_unlocked_mutex);
        if (rc != 0)
            goto failed;
        nv_kthread_q_item_init(&nvl->bottom_half_q_item, nvidia_isr_bh_unlocked, (void *)nv);
        rc = nv_kthread_q_init(&nvl->bottom_half_q, nv_device_name);
        if (rc != 0)
            goto failed;
        kthread_init = NV_TRUE;
    }

    rc = nv_kthread_q_init(&nvl->queue.nvk, "nv_queue");
    if (rc)
        goto failed;
    nv->queue = &nvl->queue;

    if (!rm_init_adapter(sp, nv))
    {
        if (!(nv->flags & NV_FLAG_USES_MSIX))
        {
            free_irq(nv->interrupt_line, (void *) nvl);
        }
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
        else
        {
            nv_free_msix_irq(nvl);
        }
#endif
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                      "rm_init_adapter failed, device minor number %d\n",
                      nvl->minor_num);
        rc = -EIO;
        goto failed;
    }

    if (!NV_IS_GVI_DEVICE(nv))
    {
        NvU8 *uuid;
        if (rm_get_gpu_uuid_raw(sp, nv, &uuid, NULL) == NV_OK)
        {
#if defined(NV_UVM_ENABLE)
            nv_uvm_notify_start_device(uuid);
#endif
        }
    }

done:
    nv->flags |= NV_FLAG_OPEN;
    return 0;

failed:
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
    if (nv->flags & NV_FLAG_USES_MSI)
    {
        nv->flags &= ~NV_FLAG_USES_MSI;
        NV_PCI_DISABLE_MSI(nvl->pci_dev);
    }
    if (nv->flags & NV_FLAG_USES_MSIX)
    {
        nv->flags &= ~NV_FLAG_USES_MSIX;
        pci_disable_msix(nvl->pci_dev);
        NV_KFREE(nvl->msix_entries, nvl->num_intr*sizeof(struct msix_entry));
    }

    if (nvl->msix_bh_mutex)
    {
        os_free_mutex(nvl->msix_bh_mutex);
        nvl->msix_bh_mutex = NULL;
    }
#endif

    nv->queue = NULL;
    nv_kthread_q_stop(&nvl->queue.nvk);

    if (kthread_init && !(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
        nv_kthread_q_stop(&nvl->bottom_half_q);

    if (nvl->isr_bh_unlocked_mutex)
    {
        os_free_mutex(nvl->isr_bh_unlocked_mutex);
        nvl->isr_bh_unlocked_mutex = NULL;
    }

    nv_dev_free_stacks(nvl);

    nv_unregister_ibmnpu_devices(nv);

    if (power_ref)
    {
        rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
    }

    nv_put_rsync_info();

    return rc;
}

/*
 * Makes sure the device is ready for operations and increases nvl->usage_count.
 * Assumes nvl->ldata_lock is held.
 */
static int nv_open_device(nv_state_t *nv, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    int rc;
    NV_STATUS status;

    if (IS_VGX_HYPER())
    {
        /* fail open if GPU is being unbound */
        if (nv->flags & NV_FLAG_UNBIND_LOCK)
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                          "Open failed as GPU is locked for unbind operation\n");
            return -ENODEV;
        }
    }

    NV_DEV_PRINTF(NV_DBG_INFO, nv, "Opening device bearing minor number %d\n",
                  nvl->minor_num);

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        NV_DEV_PRINTF(NV_DBG_INFO, nv, "Device in removal process\n");
        return -ENODEV;
    }

    if ( ! (nv->flags & NV_FLAG_OPEN))
    {
        /* Sanity check: !NV_FLAG_OPEN requires usage_count == 0 */
        if (NV_ATOMIC_READ(nvl->usage_count) != 0)
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                          "Minor device %u is referenced without being open!\n",
                          nvl->minor_num);
            WARN_ON(1);
            return -EBUSY;
        }

        rc = nv_start_device(nv, sp);
        if (rc != 0)
            return rc;
    }
    else if (rm_is_device_sequestered(sp, nv))
    {
        /* Do not increment the usage count of sequestered devices. */
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Device is currently unavailable\n");
        return -EBUSY;
    }

    NV_ATOMIC_INC(nvl->usage_count);
    return 0;
}

static void nv_init_mapping_revocation(nv_linux_state_t *nvl,
                                       struct file *file,
                                       nv_file_private_t *nvfp,
                                       struct inode *inode)
{
    down(&nvl->mmap_lock);

    /* Set up struct address_space for use with unmap_mapping_range() */
    nv_address_space_init_once(&nvfp->mapping);
    nvfp->mapping.host = inode;
    nvfp->mapping.a_ops = inode->i_mapping->a_ops;
#if defined(NV_ADDRESS_SPACE_HAS_BACKING_DEV_INFO)
    nvfp->mapping.backing_dev_info = inode->i_mapping->backing_dev_info;
#endif
    file->f_mapping = &nvfp->mapping;

    /* Add nvfp to list of open files in nvl for mapping revocation */
    list_add(&nvfp->entry, &nvl->open_files);

    up(&nvl->mmap_lock);
}

/*
** nvidia_open
**
** nv driver open entry point.  Sessions are created here.
*/
int
nvidia_open(
    struct inode *inode,
    struct file *file
)
{
    nv_state_t *nv = NULL;
    nv_linux_state_t *nvl = NULL;
    NvU32 minor_num;
    int rc = 0;
    nv_file_private_t *nvfp = NULL;
    nvidia_stack_t *sp = NULL;
    unsigned int i;
    unsigned int k;

    nv_printf(NV_DBG_INFO, "NVRM: nvidia_open...\n");

    nvfp = nv_alloc_file_private();
    if (nvfp == NULL)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate file private!\n");
        return -ENOMEM;
    }

    rc = nv_kmem_cache_alloc_stack(&sp);
    if (rc != 0)
    {
        nv_free_file_private(nvfp);
        return rc;
    }

    for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
    {
        rc = nv_kmem_cache_alloc_stack(&nvfp->fops_sp[i]);
        if (rc != 0)
        {
            nv_kmem_cache_free_stack(sp);
            for (k = 0; k < i; ++k)
            {
                nv_kmem_cache_free_stack(nvfp->fops_sp[k]);
            }
            nv_free_file_private(nvfp);
            return rc;
        }
    }

    /* what device are we talking about? */
    minor_num = NV_DEVICE_MINOR_NUMBER(inode);

    nvfp->minor_num = minor_num;

    NV_SET_FILE_PRIVATE(file, nvfp);
    nvfp->sp = sp;

    /* for control device, just jump to its open routine */
    /* after setting up the private data */
    if (nv_is_control_device(inode))
    {
        rc = nvidia_ctl_open(inode, file);
        if (rc != 0)
            goto failed;
        return rc;
    }

    rc = NV_READ_LOCK_SYSTEM_PM_LOCK_INTERRUPTIBLE();
    if (rc < 0)
        goto failed;

    /* Takes nvl->ldata_lock */
    nvl = find_minor(minor_num);
    if (!nvl)
    {
        rc = -ENODEV;
        NV_READ_UNLOCK_SYSTEM_PM_LOCK();
        goto failed;
    }

    nvfp->nvptr = nvl;
    nv = NV_STATE_PTR(nvl);

    if ((nv->flags & NV_FLAG_BLACKLIST) != 0)
    {
        NV_STATUS rm_status;
        NvU8 *uuid;

        rm_status = rm_get_gpu_uuid(sp, nv, &uuid, NULL);
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                      "open() not permitted for blacklisted %s\n",
                      (rm_status == NV_OK) ? (char *)uuid : "GPU");
        if (rm_status == NV_OK)
            os_free_mem(uuid);
        rc = -EPERM;
        goto failed1;
    }

    rc = nv_open_device(nv, sp);
    /* Fall-through on error */

    nv_assert_not_in_gpu_blacklist(sp, nv);

failed1:
    up(&nvl->ldata_lock);

    NV_READ_UNLOCK_SYSTEM_PM_LOCK();

failed:
    if (rc != 0)
    {
        if (nvfp != NULL)
        {
            nv_kmem_cache_free_stack(sp);
            for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
            {
                nv_kmem_cache_free_stack(nvfp->fops_sp[i]);
            }
            nv_free_file_private(nvfp);
            NV_SET_FILE_PRIVATE(file, NULL);
        }
    }
    else
    {
        nv_init_mapping_revocation(nvl, file, nvfp, inode);
    }

    return rc;
}

static void validate_numa_shutdown_state(nv_linux_state_t *nvl)
{
    int numa_status = nv_get_numa_status(nvl);
    WARN_ON((numa_status != NV_IOCTL_NUMA_STATUS_OFFLINE) &&
            (numa_status != NV_IOCTL_NUMA_STATUS_DISABLED));
}

static void nv_shutdown_adapter(nvidia_stack_t *sp,
                                nv_state_t *nv,
                                nv_linux_state_t *nvl)
{
    validate_numa_shutdown_state(nvl);

    rm_disable_adapter(sp, nv);

    // It's safe to call nv_kthread_q_stop even if queue is not initialized
    nv_kthread_q_stop(&nvl->bottom_half_q);
    if (nvl->isr_bh_unlocked_mutex)
    {
        os_free_mutex(nvl->isr_bh_unlocked_mutex);
        nvl->isr_bh_unlocked_mutex = NULL;
    }

    if (!(nv->flags & NV_FLAG_USES_MSIX))
    {
        free_irq(nv->interrupt_line, (void *)nvl);
        if (nv->flags & NV_FLAG_USES_MSI)
            NV_PCI_DISABLE_MSI(nvl->pci_dev);
    }
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
    else
    {
        nv_free_msix_irq(nvl);
        pci_disable_msix(nvl->pci_dev);
        nv->flags &= ~NV_FLAG_USES_MSIX;
        NV_KFREE(nvl->msix_entries, nvl->num_intr*sizeof(struct msix_entry));
    }
#endif

    if (nvl->msix_bh_mutex)
    {
        os_free_mutex(nvl->msix_bh_mutex);
        nvl->msix_bh_mutex = NULL;
    }

    rm_shutdown_adapter(sp, nv);
}

/*
 * Tears down the device on the last file close. Assumes nvl->ldata_lock is
 * held.
 */
static void nv_stop_device(nv_state_t *nv, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    static int persistence_mode_notice_logged;

    if (NV_IS_GVI_DEVICE(nv))
    {
        rm_shutdown_gvi_device(sp, nv);
        flush_scheduled_work();
        free_irq(nv->interrupt_line, (void *)nvl);
    }
    else
    {
#if defined(NV_UVM_ENABLE)
        {
            const NvU8* uuid;
            // Inform UVM before disabling adapter. Use cached copy
            uuid = nv_get_cached_uuid(nv);
            if (uuid != NULL)
            {
                // this function cannot fail
                nv_uvm_notify_stop_device(uuid);
            }
        }
#endif
        /* Adapter is already shutdown as part of nvidia_remove */
        if (!(NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)))
        {
            if (nv->flags & NV_FLAG_PERSISTENT_SW_STATE)
            {
                rm_disable_adapter(sp, nv);
            }
            else
            {
                nv_shutdown_adapter(sp, nv, nvl);
            }
        }
    }

    if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
    {
        /*
         * For legacy persistence mode, keep the per-GPU queue active even
         * after shutdown.
         * Note: One day, when legacy persistence mode is removed from this
         * driver, just move these two lines outside of the if-statement.
        */
        nv->queue = NULL;
        nv_kthread_q_stop(&nvl->queue.nvk);

        nv_dev_free_stacks(nvl);
    }

    if ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) &&
        (!persistence_mode_notice_logged) && (!IS_VGX_HYPER()))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Persistence mode is deprecated and"
                  " will be removed in a future release. Please use"
                  " nvidia-persistenced instead.\n");
        persistence_mode_notice_logged  = 1;
    }

    /* leave INIT flag alone so we don't reinit every time */
    nv->flags &= ~NV_FLAG_OPEN;

    nv_unregister_ibmnpu_devices(nv);

    if (!(nv->flags & NV_FLAG_PERSISTENT_SW_STATE))
    {
        rm_unref_dynamic_power(sp, nv, NV_DYNAMIC_PM_COARSE);
    }

    nv_put_rsync_info();
}

/*
 * Decreases nvl->usage_count, stopping the device when it reaches 0. Assumes
 * nvl->ldata_lock is held.
 */
static void nv_close_device(nv_state_t *nv, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (NV_ATOMIC_READ(nvl->usage_count) == 0)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: Attempting to close unopened minor device %u!\n",
                  nvl->minor_num);
        WARN_ON(1);
        return;
    }

    if (NV_ATOMIC_DEC_AND_TEST(nvl->usage_count))
        nv_stop_device(nv, sp);
}

/*
** nvidia_close
**
** Master driver close entry point.
*/

static void
nvidia_close_callback(
   nv_file_private_t *nvfp
)
{
    nv_linux_state_t *nvl = nvfp->nvptr;
    nv_state_t *nv = NV_STATE_PTR(nvl);
    nvidia_stack_t *sp = nvfp->sp;
    unsigned int i;
    NvBool bRemove = NV_FALSE;

    rm_free_unused_clients(sp, nv, nvfp);

    down(&nvl->mmap_lock);
    list_del(&nvfp->entry);
    up(&nvl->mmap_lock);

    down(&nvl->ldata_lock);
    nv_close_device(nv, sp);

    bRemove = (!NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)) &&
              (NV_ATOMIC_READ(nvl->usage_count) == 0) &&
              rm_get_device_remove_flag(sp, nv->gpu_id);

    up(&nvl->ldata_lock);

    for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
    {
        nv_kmem_cache_free_stack(nvfp->fops_sp[i]);
    }

    nv_free_file_private(nvfp);

#if defined(NV_PCI_STOP_AND_REMOVE_BUS_DEVICE)
    if (bRemove)
    {
        NV_PCI_STOP_AND_REMOVE_BUS_DEVICE(nvl->pci_dev);
    }
#endif

    /*
     * In case of surprise removal of eGPU, TB3 driver will schedule the
     * device removal call i.e. nvidia_remove will get called.
     * RM doesn't clean up linux layer locks and nv linux state struct, as these
     * are required in nvidia_close call, once it comes from clients.
     * Once all the clients are closed, last nvidia_close will clean up linux
     * layer locks and nv linux state struct.
     */
    if (NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv) &&
        (NV_ATOMIC_READ(nvl->usage_count) == 0))
    {
        nvidia_frontend_remove_device((void *)&nv_fops, nvl);
        nv_lock_destroy_locks(sp, nv);
        NV_KFREE(nvl, sizeof(nv_linux_state_t));
    }

    nv_kmem_cache_free_stack(sp);
}

static void nvidia_close_deferred(void *data)
{
    nv_file_private_t *nvfp = data;

    NV_READ_LOCK_SYSTEM_PM_LOCK();

    nvidia_close_callback(nvfp);

    NV_READ_UNLOCK_SYSTEM_PM_LOCK();
}

int
nvidia_close(
    struct inode *inode,
    struct file *file
)
{
    int rc;
    nv_file_private_t *nvfp = NV_GET_FILE_PRIVATE(file);
    nv_linux_state_t *nvl = nvfp->nvptr;
    nv_state_t *nv = NV_STATE_PTR(nvl);

    NV_DEV_PRINTF(NV_DBG_INFO, nv, "nvidia_close on device "
                  "bearing minor number %d\n", NV_DEVICE_MINOR_NUMBER(inode));

    if (nv_is_control_device(inode))
    {
        return nvidia_ctl_close(inode, file);
    }

    NV_SET_FILE_PRIVATE(file, NULL);

    rc = NV_READ_LOCK_SYSTEM_PM_LOCK_INTERRUPTIBLE();
    if (rc == 0)
    {
        nvidia_close_callback(nvfp);
        NV_READ_UNLOCK_SYSTEM_PM_LOCK();
    }
    else
    {
        nv_kthread_q_item_init(&nvfp->deferred_close_q_item,
                               nvidia_close_deferred,
                               nvfp);
        rc = nv_kthread_q_schedule_q_item(&nv_deferred_close_kthread_q,
                                          &nvfp->deferred_close_q_item);
        WARN_ON(rc == 0);
    }

    return 0;
}

unsigned int
nvidia_poll(
    struct file *file,
    poll_table  *wait
)
{
    unsigned int mask = 0;
    nv_file_private_t *nvfp = NV_GET_FILE_PRIVATE(file);
    unsigned long eflags;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    NV_STATUS status;

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping nvidia_poll\n");
        return POLLHUP;
    }

    if ((file->f_flags & O_NONBLOCK) == 0)
        poll_wait(file, &nvfp->waitqueue, wait);

    NV_SPIN_LOCK_IRQSAVE(&nvfp->fp_lock, eflags);

    if ((nvfp->event_head != NULL) || nvfp->event_pending)
    {
        mask = (POLLPRI | POLLIN);
        nvfp->event_pending = FALSE;
    }

    NV_SPIN_UNLOCK_IRQRESTORE(&nvfp->fp_lock, eflags);

    return mask;
}

#define NV_CTL_DEVICE_ONLY(nv)                 \
{                                              \
    if (((nv)->flags & NV_FLAG_CONTROL) == 0)  \
    {                                          \
        status = -EINVAL;                      \
        goto done;                             \
    }                                          \
}

#define NV_ACTUAL_DEVICE_ONLY(nv)              \
{                                              \
    if (((nv)->flags & (NV_FLAG_CONTROL |      \
                        NV_FLAG_GVI)) != 0)    \
    {                                          \
        status = -EINVAL;                      \
        goto done;                             \
    }                                          \
}

/*
 * Fills the ci array with the state of num_entries devices. Returns -EINVAL if
 * num_entries isn't big enough to hold all available devices.
 */
static int nvidia_read_card_info(nv_ioctl_card_info_t *ci, size_t num_entries)
{
    nv_state_t *nv;
    nv_linux_state_t *nvl;
    size_t i = 0;
    int rc = 0;

    /* Clear each card's flags field the lazy way */
    memset(ci, 0, num_entries * sizeof(ci[0]));

    LOCK_NV_LINUX_DEVICES();

    if (num_entries < num_nv_devices)
    {
        rc = -EINVAL;
        goto out;
    }

    for (nvl = nv_linux_devices; nvl && i < num_entries; nvl = nvl->next)
    {
        nv = NV_STATE_PTR(nvl);
        if (nv->pci_info.device_id)
        {
            /* We do not include blacklisted GPUs in the list... */
            if ((nv->flags & NV_FLAG_BLACKLIST) != 0)
                continue;

            ci[i].flags              = NV_IOCTL_CARD_INFO_FLAG_PRESENT;
            ci[i].pci_info.domain    = nv->pci_info.domain;
            ci[i].pci_info.bus       = nv->pci_info.bus;
            ci[i].pci_info.slot      = nv->pci_info.slot;
            ci[i].pci_info.vendor_id = nv->pci_info.vendor_id;
            ci[i].pci_info.device_id = nv->pci_info.device_id;
            ci[i].gpu_id             = nv->gpu_id;
            ci[i].interrupt_line     = nv->interrupt_line;
            ci[i].reg_address        = nv->regs->cpu_address;
            ci[i].reg_size           = nv->regs->size;
            ci[i].fb_address         = nv->fb->cpu_address;
            ci[i].fb_size            = nv->fb->size;
            ci[i].minor_number       = nvl->minor_num;
            i++;
        }
    }

out:
    UNLOCK_NV_LINUX_DEVICES();
    return rc;
}

int
nvidia_ioctl(
    struct inode *inode,
    struct file *file,
    unsigned int cmd,
    unsigned long i_arg)
{
    NV_STATUS rmStatus;
    int status = 0;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    nv_file_private_t *nvfp = NV_GET_FILE_PRIVATE(file);
    nvidia_stack_t *sp = NULL;
    nv_ioctl_xfer_t ioc_xfer;
    void *arg_ptr = (void *) i_arg;
    void *arg_copy = NULL;
    size_t arg_size = 0;
    int arg_cmd;

    nv_printf(NV_DBG_INFO, "NVRM: ioctl(0x%x, 0x%x, 0x%x)\n",
        _IOC_NR(cmd), (unsigned int) i_arg, _IOC_SIZE(cmd));

    status = NV_READ_LOCK_SYSTEM_PM_LOCK_INTERRUPTIBLE();
    if (status < 0)
        return status;

    down(&nvfp->fops_sp_lock[NV_FOPS_STACK_INDEX_IOCTL]);
    sp = nvfp->fops_sp[NV_FOPS_STACK_INDEX_IOCTL];

    rmStatus = nv_check_gpu_state(nv);
    if (rmStatus == NV_ERR_GPU_IS_LOST)
    {
        nv_printf(NV_DBG_INFO, "NVRM: GPU is lost, skipping nvidia_ioctl\n");
        status = -EINVAL;
        goto done;
    }

    arg_size = _IOC_SIZE(cmd);
    arg_cmd  = _IOC_NR(cmd);

    if (arg_cmd == NV_ESC_IOCTL_XFER_CMD)
    {
        if (arg_size != sizeof(nv_ioctl_xfer_t))
        {
            nv_printf(NV_DBG_ERRORS,
                    "NVRM: invalid ioctl XFER structure size!\n");
            status = -EINVAL;
            goto done;
        }

        if (NV_COPY_FROM_USER(&ioc_xfer, arg_ptr, sizeof(ioc_xfer)))
        {
            nv_printf(NV_DBG_ERRORS,
                    "NVRM: failed to copy in ioctl XFER data!\n");
            status = -EFAULT;
            goto done;
        }

        arg_cmd  = ioc_xfer.cmd;
        arg_size = ioc_xfer.size;
        arg_ptr  = NvP64_VALUE(ioc_xfer.ptr);

        if (arg_size > NV_ABSOLUTE_MAX_IOCTL_SIZE)
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: invalid ioctl XFER size!\n");
            status = -EINVAL;
            goto done;
        }
    }

    NV_KMALLOC(arg_copy, arg_size);
    if (arg_copy == NULL)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate ioctl memory\n");
        status = -ENOMEM;
        goto done;
    }

    if (NV_COPY_FROM_USER(arg_copy, arg_ptr, arg_size))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to copy in ioctl data!\n");
        status = -EFAULT;
        goto done;
    }

    switch (arg_cmd)
    {
        case NV_ESC_QUERY_DEVICE_INTR:
        {
            nv_ioctl_query_device_intr *query_intr = arg_copy;

            NV_ACTUAL_DEVICE_ONLY(nv);

            if (!nv->regs->map)
            {
                status = -EINVAL;
                goto done;
            }

            query_intr->intrStatus =
                *(nv->regs->map + (NV_RM_DEVICE_INTR_ADDRESS >> 2));
            query_intr->status = NV_OK;
            break;
        }

        /* pass out info about the card */
        case NV_ESC_CARD_INFO:
        {
            nv_ioctl_rm_api_old_version_t *rm_api;
            size_t num_arg_devices = arg_size / sizeof(nv_ioctl_card_info_t);
            NvU32 major, minor, patch;

            NV_CTL_DEVICE_ONLY(nv);

            /* the first element of card info passed from the client will have
             * the rm_api_version_magic value to show that the client is new
             * enough to support versioning. If the client is too old to
             * support versioning, our mmap interfaces are probably different
             * enough to cause serious damage.
             * just copy in the one dword to check.
             */
            if (arg_size < sizeof(rm_api->magic))
            {
                status = -EINVAL;
                goto done;
            }

            rm_api = arg_copy;
            switch (rm_api->magic)
            {
                case NV_RM_API_OLD_VERSION_MAGIC_REQ:
                case NV_RM_API_OLD_VERSION_MAGIC_LAX_REQ:
                case NV_RM_API_OLD_VERSION_MAGIC_OVERRIDE_REQ:
                    /* the client is using the old major-minor-patch
                     * API version check; reject it.
                     */
                    if (arg_size < sizeof(*rm_api))
                    {
                        major = 0;
                        minor = 0;
                        patch = 0;
                    }
                    else
                    {
                        major = rm_api->major;
                        minor = rm_api->minor;
                        patch = rm_api->patch;
                    }
                    nv_printf(NV_DBG_ERRORS,
                              "NVRM: API mismatch: the client has the version %d.%d-%d, but\n"
                              "NVRM: this kernel module has the version %s.  Please\n"
                              "NVRM: make sure that this kernel module and all NVIDIA driver\n"
                              "NVRM: components have the same version.\n",
                              major, minor, patch, NV_VERSION_STRING);
                    status = -EINVAL;
                    goto done;

                case NV_RM_API_OLD_VERSION_MAGIC_IGNORE:
                    /* the client is telling us to ignore the old
                     * version scheme; it will do a version check via
                     * NV_ESC_CHECK_VERSION_STR
                     */
                    break;
                default:
                    nv_printf(NV_DBG_ERRORS,
                        "NVRM: client does not support versioning!!\n");
                    status = -EINVAL;
                    goto done;
            }

            status = nvidia_read_card_info(arg_copy, num_arg_devices);
            break;
        }

        case NV_ESC_ATTACH_GPUS_TO_FD:
        {
            size_t num_arg_gpus = arg_size / sizeof(NvU32);
            size_t i;

            NV_CTL_DEVICE_ONLY(nv);

            if (num_arg_gpus == 0 || nvfp->num_attached_gpus != 0)
            {
                status = -EINVAL;
                goto done;
            }

            NV_KMALLOC(nvfp->attached_gpus, sizeof(NvU32) * num_arg_gpus);
            if (nvfp->attached_gpus == NULL)
            {
                status = -ENOMEM;
                goto done;
            }
            memcpy(nvfp->attached_gpus, arg_copy, sizeof(NvU32) * num_arg_gpus);
            nvfp->num_attached_gpus = num_arg_gpus;

            for (i = 0; i < nvfp->num_attached_gpus; i++)
            {
                if (nvfp->attached_gpus[i] == 0)
                {
                    continue;
                }

                nvidia_dev_get(nvfp->attached_gpus[i], sp);
            }

            break;
        }

        case NV_ESC_CHECK_VERSION_STR:
        {
            NV_CTL_DEVICE_ONLY(nv);

            rmStatus = rm_perform_version_check(sp, arg_copy, arg_size);
            status = ((rmStatus == NV_OK) ? 0 : -EINVAL);
            break;
        }

        case NV_ESC_SYS_PARAMS:
        {
            nv_ioctl_sys_params_t *api = arg_copy;

            NV_CTL_DEVICE_ONLY(nv);

            if (arg_size != sizeof(nv_ioctl_sys_params_t))
            {
                status = -EINVAL;
                goto done;
            }

            /* numa_memblock_size should only be set once */
            if (nvl->numa_memblock_size == 0)
            {
                nvl->numa_memblock_size = api->memblock_size;
            }
            else
            {
                status = (nvl->numa_memblock_size == api->memblock_size) ?
                    0 : -EBUSY;
                goto done;
            }
            break;
        }

        case NV_ESC_NUMA_INFO:
        {
            nv_ioctl_numa_info_t *api = arg_copy;
            rmStatus = NV_OK;

            NV_ACTUAL_DEVICE_ONLY(nv);

            if (arg_size != sizeof(nv_ioctl_numa_info_t))
            {
                status = -EINVAL;
                goto done;
            }

            rmStatus = rm_get_gpu_numa_info(sp, nv,
                &(api->nid),
                &(api->numa_mem_addr),
                &(api->numa_mem_size),
                &(api->blacklist_addresses));
            if (rmStatus != NV_OK)
            {
                status = -EBUSY;
                goto done;
            }

            api->status = nv_get_numa_status(nvl);
            api->memblock_size = nv_ctl_device.numa_memblock_size;
            break;
        }

        case NV_ESC_SET_NUMA_STATUS:
        {
            nv_ioctl_set_numa_status_t *api = arg_copy;
            rmStatus = NV_OK;

            if (!NV_IS_SUSER())
            {
                status = -EACCES;
                goto done;
            }

            NV_ACTUAL_DEVICE_ONLY(nv);

            if (arg_size != sizeof(nv_ioctl_set_numa_status_t))
            {
                status = -EINVAL;
                goto done;
            }

            /*
             * The nv_linux_state_t for the device needs to be locked
             * in order to prevent additional open()/close() calls from
             * manipulating the usage count for the device while we
             * determine if NUMA state can be changed.
             */
            down(&nvl->ldata_lock);

            if (nv_get_numa_status(nvl) != api->status)
            {
                if (api->status == NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS)
                {
                    /*
                     * Only the current client should have an open file
                     * descriptor for the device, to allow safe offlining.
                     */
                    if (NV_ATOMIC_READ(nvl->usage_count) > 1)
                    {
                        status = -EBUSY;
                        goto unlock;
                    }
                    else
                    {
                        /*
                         * If this call fails, it indicates that RM
                         * is not ready to offline memory, and we should keep
                         * the current NUMA status of ONLINE.
                         */
                        rmStatus = rm_gpu_numa_offline(sp, nv);
                        if (rmStatus != NV_OK)
                        {
                            status = -EBUSY;
                            goto unlock;
                        }
                    }
                }

                status = nv_set_numa_status(nvl, api->status);
                if (status < 0)
                {
                    if (api->status == NV_IOCTL_NUMA_STATUS_OFFLINE_IN_PROGRESS)
                        (void) rm_gpu_numa_online(sp, nv);
                    goto unlock;
                }

                if (api->status == NV_IOCTL_NUMA_STATUS_ONLINE)
                {
                    rmStatus = rm_gpu_numa_online(sp, nv);
                    if (rmStatus != NV_OK)
                    {
                        status = -EBUSY;
                        goto unlock;
                    }
                }
            }

unlock:
            up(&nvl->ldata_lock);

            break;
        }

        default:
            rmStatus = rm_ioctl(sp, nv, nvfp, arg_cmd, arg_copy, arg_size);
            status = ((rmStatus == NV_OK) ? 0 : -EINVAL);
            break;
    }

done:
    up(&nvfp->fops_sp_lock[NV_FOPS_STACK_INDEX_IOCTL]);

    NV_READ_UNLOCK_SYSTEM_PM_LOCK();

    if (arg_copy != NULL)
    {
        if (status != -EFAULT)
        {
            if (NV_COPY_TO_USER(arg_ptr, arg_copy, arg_size))
            {
                nv_printf(NV_DBG_ERRORS, "NVRM: failed to copy out ioctl data\n");
                status = -EFAULT;
            }
        }
        NV_KFREE(arg_copy, arg_size);
    }

    return status;
}

static long
nvidia_unlocked_ioctl(
    struct file *file,
    unsigned int cmd,
    unsigned long i_arg
)
{
    return nvidia_ioctl(NV_FILE_INODE(file), file, cmd, i_arg);
}

irqreturn_t
nvidia_isr_msix(
    int   irq,
    void *arg
)
{
    irqreturn_t ret;
    nv_linux_state_t *nvl = (void *) arg;

#if defined(NV_VGPU_KVM_BUILD)
    nvidia_isr_msix_prologue(nvl);
#endif

    // nvidia_isr_msix() is called for each of the MSI-X vectors and they can
    // run in parallel on different CPUs (cores), but this is not currently
    // supported by nvidia_isr() and its children. As a big hammer fix just
    // spinlock around the nvidia_isr() call to serialize them.
    //
    // At this point interrupts are disabled on the CPU running our ISR (see
    // comments for nv_default_irq_flags()) so a plain spinlock is enough.
    NV_SPIN_LOCK(&nvl->msix_isr_lock);

#if defined(NV_VGPU_KVM_BUILD)
    // Disable the local CPU interrupts which was enabled earlier in nvidia_isr_msix_prologue(). 
    local_irq_disable();
#endif

    ret = nvidia_isr(irq, arg);

    NV_SPIN_UNLOCK(&nvl->msix_isr_lock);

#if defined(NV_VGPU_KVM_BUILD)
    nvidia_isr_msix_epilogue(nvl);
#endif

    return ret;
}

/*
 * driver receives an interrupt
 *    if someone waiting, then hand it off.
 */
irqreturn_t
nvidia_isr(
    int   irq,
    void *arg
)
{
    nv_linux_state_t *nvl = (void *) arg;
    nv_state_t *nv = NV_STATE_PTR(nvl);
    NvU32 need_to_run_bottom_half_gpu_lock_held = 0;
    BOOL rm_handled = FALSE,  uvm_handled = FALSE, rm_fault_handling_needed = FALSE;
    NvU32 rm_serviceable_fault_cnt = 0;

    rm_gpu_copy_mmu_faults_unlocked(nvl->sp[NV_DEV_STACK_ISR], nv, &rm_serviceable_fault_cnt);
    rm_fault_handling_needed = (rm_serviceable_fault_cnt != 0);

#if defined (NV_UVM_ENABLE)
    if (!NV_IS_GVI_DEVICE(nv))
    {
        //
        // Returns NV_OK if the UVM driver handled the interrupt
        //
        // Returns NV_ERR_NO_INTR_PENDING if the interrupt is not for
        // the UVM driver.
        //
        // Returns NV_WARN_MORE_PROCESSING_REQUIRED if the UVM top-half ISR was
        // unable to get its lock(s), due to other (UVM) threads holding them.
        //
        // RM can normally treat NV_WARN_MORE_PROCESSING_REQUIRED the same as
        // NV_ERR_NO_INTR_PENDING, but in some cases the extra information may
        // be helpful.
        //
        if (nv_uvm_event_interrupt(nv_get_cached_uuid(nv)) == NV_OK)
            uvm_handled = TRUE;
    }
#endif

    rm_handled = rm_isr(nvl->sp[NV_DEV_STACK_ISR], nv,
                        &need_to_run_bottom_half_gpu_lock_held);

    if (need_to_run_bottom_half_gpu_lock_held)
    {
        return IRQ_WAKE_THREAD;
    }
    else
    {
        //
        // If rm_isr does not need to run a bottom half and mmu_faults_copied
        // indicates that bottom half is needed, then we enqueue a kthread based
        // bottom half, as this specific bottom_half will acquire the GPU lock
        //
        if (rm_fault_handling_needed)
            nv_kthread_q_schedule_q_item(&nvl->bottom_half_q, &nvl->bottom_half_q_item);
    }

    return IRQ_RETVAL(rm_handled || uvm_handled || rm_fault_handling_needed);
}

irqreturn_t
nvidia_isr_kthread_bh(
    int irq,
    void *data
)
{
    return nvidia_isr_common_bh(data);
}

irqreturn_t
nvidia_isr_msix_kthread_bh(
    int irq,
    void *data
)
{
    NV_STATUS status;
    irqreturn_t ret;
    nv_state_t *nv = (nv_state_t *) data;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    //
    // Synchronize kthreads servicing bottom halves for different MSI-X vectors
    // as they share same pre-allocated alt-stack.
    //
    status = os_acquire_mutex(nvl->msix_bh_mutex);
    // os_acquire_mutex can only fail if we cannot sleep and we can
    WARN_ON(status != NV_OK);

    ret = nvidia_isr_common_bh(data);

    os_release_mutex(nvl->msix_bh_mutex);

    return ret;
}

static irqreturn_t
nvidia_isr_common_bh(
    void *data
)
{
    nv_state_t *nv = (nv_state_t *) data;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nvidia_stack_t *sp = nvl->sp[NV_DEV_STACK_ISR_BH];
    NV_STATUS status;

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        nv_printf(NV_DBG_INFO, "NVRM: GPU is lost, skipping ISR bottom half\n");
    }
    else
    {
        rm_isr_bh(sp, nv);
    }

    return IRQ_HANDLED;
}

static void
nvidia_isr_bh_unlocked(
    void * args
)
{
    nv_state_t *nv = (nv_state_t *) args;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nvidia_stack_t *sp;
    NV_STATUS status;

    //
    // Synchronize kthreads servicing unlocked bottom half as they
    // share same pre-allocated stack for alt-stack
    //
    status = os_acquire_mutex(nvl->isr_bh_unlocked_mutex);
    if (status != NV_OK)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: Unable to take bottom_half mutex!\n",
                  __FUNCTION__);
        WARN_ON(1);
    }

    sp = nvl->sp[NV_DEV_STACK_ISR_BH_UNLOCKED];

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        nv_printf(NV_DBG_INFO,
            "NVRM: GPU is lost, skipping unlocked ISR bottom half\n");
    }
    else
    {
        rm_isr_bh_unlocked(sp, nv);
    }

    os_release_mutex(nvl->isr_bh_unlocked_mutex);
}

static void
nvidia_rc_timer_callback(
    struct nv_timer *nv_timer
)
{
    nv_linux_state_t *nvl = container_of(nv_timer, nv_linux_state_t, rc_timer);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    nvidia_stack_t *sp = nvl->sp[NV_DEV_STACK_TIMER];
    NV_STATUS status;

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        nv_printf(NV_DBG_INFO,
            "NVRM: GPU is lost, skipping device timer callbacks\n");
        return;
    }

    if (rm_run_rc_callback(sp, nv) == NV_OK)
    {
        // set another timeout 1 sec in the future:
        mod_timer(&nvl->rc_timer.kernel_timer, jiffies + HZ);
    }
}

/*
** nvidia_ctl_open
**
** nv control driver open entry point.  Sessions are created here.
*/
static int
nvidia_ctl_open(
    struct inode *inode,
    struct file *file
)
{
    nv_linux_state_t *nvl = &nv_ctl_device;
    nv_state_t *nv = NV_STATE_PTR(nvl);
    nv_file_private_t *nvfp = NV_GET_FILE_PRIVATE(file);
    static int count = 0;

    nv_printf(NV_DBG_INFO, "NVRM: nvidia_ctl_open\n");

    down(&nvl->ldata_lock);

    /* save the nv away in file->private_data */
    nvfp->nvptr = nvl;

    if (NV_ATOMIC_READ(nvl->usage_count) == 0)
    {
        init_waitqueue_head(&nv_ctl_waitqueue);

        nv->flags |= (NV_FLAG_OPEN | NV_FLAG_CONTROL);

        if ((nv_acpi_init() < 0) &&
            (count++ < NV_MAX_RECURRING_WARNING_MESSAGES))
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: failed to register with the ACPI subsystem!\n");
        }
    }

    NV_ATOMIC_INC(nvl->usage_count);
    up(&nvl->ldata_lock);

    return 0;
}


/*
** nvidia_ctl_close
*/
static int
nvidia_ctl_close(
    struct inode *inode,
    struct file *file
)
{
    nv_alloc_t *at, *next;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_FILEP(file);
    nv_state_t *nv = NV_STATE_PTR(nvl);
    nv_file_private_t *nvfp = NV_GET_FILE_PRIVATE(file);
    nvidia_stack_t *sp = nvfp->sp;
    static int count = 0;
    unsigned int i;

    nv_printf(NV_DBG_INFO, "NVRM: nvidia_ctl_close\n");

    //
    // If a rm-object exported into this file descriptor the get it free.
    //
    if (nvfp->hExportedRmObject != 0)
    {
        rm_free_exported_object(sp, nvfp->hExportedRmObject);
    }

    down(&nvl->ldata_lock);
    if (NV_ATOMIC_DEC_AND_TEST(nvl->usage_count))
    {
        nv->flags &= ~NV_FLAG_OPEN;

        if ((nv_acpi_uninit() < 0) &&
            (count++ < NV_MAX_RECURRING_WARNING_MESSAGES))
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: failed to unregister from the ACPI subsystem!\n");
        }
    }
    up(&nvl->ldata_lock);

    rm_free_unused_clients(sp, nv, nvfp);

    if (nvfp->free_list != NULL)
    {
        at = nvfp->free_list;
        while (at != NULL)
        {
            next = at->next;
            if (at->pid == os_get_current_process())
                NV_PRINT_AT(NV_DBG_MEMINFO, at);
            nv_free_pages(nv, at->num_pages,
                          NV_ALLOC_MAPPING_CONTIG(at->flags),
                          NV_ALLOC_MAPPING(at->flags),
                          (void *)at);
            at = next;
        }
    }

    if (nvfp->num_attached_gpus != 0)
    {
        size_t i;

        for (i = 0; i < nvfp->num_attached_gpus; i++)
        {
            if (nvfp->attached_gpus[i] != 0)
                nvidia_dev_put(nvfp->attached_gpus[i], sp);
        }

        NV_KFREE(nvfp->attached_gpus, sizeof(NvU32) * nvfp->num_attached_gpus);
        nvfp->num_attached_gpus = 0;
    }

    for (i = 0; i < NV_FOPS_STACK_INDEX_COUNT; ++i)
    {
        nv_kmem_cache_free_stack(nvfp->fops_sp[i]);
    }

    nv_free_file_private(nvfp);
    NV_SET_FILE_PRIVATE(file, NULL);

    nv_kmem_cache_free_stack(sp);

    return 0;
}


void NV_API_CALL
nv_set_dma_address_size(
    nv_state_t  *nv,
    NvU32       phys_addr_bits
)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    /*
     * The only scenario in which we definitely should not update the DMA mask
     * is on POWER, when using TCE bypass mode (see nv_get_dma_start_address()
     * for details), since the meaning of the DMA mask is overloaded in that
     * case.
     */
    if (!nvl->tce_bypass_enabled)
    {
        NvU64 new_mask = (((NvU64)1) << phys_addr_bits) - 1;
        pci_set_dma_mask(nvl->pci_dev, new_mask);
    }
}

static NvUPtr
nv_map_guest_pages(nv_alloc_t *at,
                   NvU64 address,
                   NvU32 page_count,
                   NvU32 page_idx)
{
    struct page **pages;
    NvU32 j;
    NvUPtr virt_addr;

    NV_KMALLOC(pages, sizeof(struct page *) * page_count);
    if (pages == NULL)
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: failed to allocate vmap() page descriptor table!\n");
        return 0;
    }

    for (j = 0; j < page_count; j++)
    {
        pages[j] = NV_GET_PAGE_STRUCT(at->page_table[page_idx+j]->phys_addr);
    }

    virt_addr = nv_vm_map_pages(pages, page_count,
        NV_ALLOC_MAPPING_CACHED(at->flags));
    NV_KFREE(pages, sizeof(struct page *) * page_count);

    return virt_addr;
}

NV_STATUS NV_API_CALL
nv_alias_pages(
    nv_state_t *nv,
    NvU32 page_cnt,
    NvU32 contiguous,
    NvU32 cache_type,
    NvU64 guest_id,
    NvU64 *pte_array,
    void **priv_data
)
{
    nv_alloc_t *at;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU32 i=0;
    nvidia_pte_t *page_ptr = NULL;

    at = nvos_create_alloc(nvl->dev, page_cnt);

    if (at == NULL)
    {
        return NV_ERR_NO_MEMORY;
    }

    at->flags = nv_alloc_init_flags(cache_type, contiguous, 0);
    at->flags |= NV_ALLOC_TYPE_GUEST;

    at->order = get_order(at->num_pages * PAGE_SIZE);

    for (i=0; i < at->num_pages; ++i)
    {
        page_ptr = at->page_table[i];

        if (contiguous && i>0)
        {
            page_ptr->dma_addr = pte_array[0] + (i << PAGE_SHIFT);
        }
        else
        {
            page_ptr->dma_addr  = pte_array[i];
        }

        page_ptr->phys_addr = page_ptr->dma_addr;

        /* aliased pages will be mapped on demand. */
        page_ptr->virt_addr = 0x0;
    }

    at->guest_id = guest_id;
    *priv_data = at;
    NV_ATOMIC_INC(at->usage_count);

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    return NV_OK;
}

/*
 *   This creates a dummy nv_alloc_t for peer IO mem, so that it can
 *   be mapped using NvRmMapMemory.
 */
NV_STATUS NV_API_CALL nv_register_peer_io_mem(
    nv_state_t *nv,
    NvU64      *phys_addr,
    NvU64       page_count,
    void      **priv_data
)
{
    nv_alloc_t *at;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU64 i;
    NvU64 addr;

    at = nvos_create_alloc(nvl->dev, page_count);

    if (at == NULL)
        return NV_ERR_NO_MEMORY;

    // IO regions should be uncached and contiguous
    at->flags = nv_alloc_init_flags(1, 1, 0);
    at->flags |= NV_ALLOC_TYPE_PEER_IO;

    at->order = get_order(at->num_pages * PAGE_SIZE);

    addr = phys_addr[0];

    for (i = 0; i < page_count; i++)
    {
        at->page_table[i]->phys_addr = addr;
        addr += PAGE_SIZE;
    }

    // No struct page array exists for this memory.
    at->user_pages = NULL;

    *priv_data = at;

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    return NV_OK;
}

void NV_API_CALL nv_unregister_peer_io_mem(
    nv_state_t *nv,
    void       *priv_data
)
{
    nv_alloc_t *at = priv_data;

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    nvos_free_alloc(at);
}

/*
 * By registering user pages, we create a dummy nv_alloc_t for it, so that the
 * rest of the RM can treat it like any other alloc.
 *
 * This also converts the page array to an array of physical addresses.
 */
NV_STATUS NV_API_CALL nv_register_user_pages(
    nv_state_t *nv,
    NvU64       page_count,
    NvU64      *phys_addr,
    void      **priv_data
)
{
    nv_alloc_t *at;
    NvU64 i;
    struct page **user_pages;
    nv_linux_state_t *nvl;
    nvidia_pte_t *page_ptr;

    nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_register_user_pages: 0x%x\n", page_count);
    user_pages = *priv_data;
    nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    at = nvos_create_alloc(nvl->dev, page_count);

    if (at == NULL)
    {
        return NV_ERR_NO_MEMORY;
    }

    /*
     * Anonymous memory currently must be write-back cacheable, and we can't
     * enforce contiguity.
     */
    at->flags = nv_alloc_init_flags(1, 0, 0);
    at->flags |= NV_ALLOC_TYPE_USER;

    at->order = get_order(at->num_pages * PAGE_SIZE);

    for (i = 0; i < page_count; i++)
    {
        /*
         * We only assign the physical address and not the DMA address, since
         * this allocation hasn't been DMA-mapped yet.
         */
        page_ptr = at->page_table[i];
        page_ptr->phys_addr = page_to_phys(user_pages[i]);

        phys_addr[i] = page_ptr->phys_addr;
    }

    /* Save off the user pages array to be restored later */
    at->user_pages = user_pages;
    *priv_data = at;

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    return NV_OK;
}

NV_STATUS NV_API_CALL nv_unregister_user_pages(
    nv_state_t *nv,
    NvU64       page_count,
    void      **priv_data
)
{
    nv_alloc_t *at = *priv_data;

    nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_unregister_user_pages: 0x%x\n", page_count);

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    WARN_ON(!NV_ALLOC_MAPPING_USER(at->flags));

    /* Restore the user pages array for the caller to handle */
    *priv_data = at->user_pages;

    nvos_free_alloc(at);

    return NV_OK;
}

/*
 * This creates a dummy nv_alloc_t for existing physical allocations, so
 * that it can be mapped using NvRmMapMemory and BAR2 code path.
 */
NV_STATUS NV_API_CALL nv_register_phys_pages(
    nv_state_t *nv,
    NvU64      *phys_addr,
    NvU64       page_count,
    void      **priv_data
)
{
    nv_alloc_t *at;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU64 i;
    NvU64 addr;

    at = nvos_create_alloc(nvl->dev, page_count);

    if (at == NULL)
        return NV_ERR_NO_MEMORY;
    /*
     * Setting memory flags to cacheable and discontiguous.
     */
    at->flags = nv_alloc_init_flags(0, 0, 0);

    /*
     * Only physical address is available so we don't try to reuse existing
     * mappings
     */
    at->flags |= NV_ALLOC_TYPE_PHYSICAL;

    at->order = get_order(at->num_pages * PAGE_SIZE);

    for (i = 0, addr = phys_addr[0]; i < page_count; addr = phys_addr[++i])
    {
        at->page_table[i]->phys_addr = addr;
    }

    at->user_pages = NULL;
    *priv_data = at;

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    return NV_OK;
}

void NV_API_CALL nv_unregister_phys_pages(
    nv_state_t *nv,
    void       *priv_data
)
{
    nv_alloc_t *at = priv_data;
    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    nvos_free_alloc(at);
}

void* NV_API_CALL nv_alloc_kernel_mapping(
    nv_state_t *nv,
    void       *pAllocPrivate,
    NvU64       pageIndex,
    NvU32       pageOffset,
    NvU64       size,
    void      **pPrivate
)
{
    nv_alloc_t *at = pAllocPrivate;
    NvU32 j, page_count;
    NvUPtr virt_addr;
    struct page **pages;
    NvBool isUserAllocatedMem;

    //
    // For User allocated memory (like ErrorNotifier's) which is NOT allocated
    // nor owned by RM, the RM driver just stores the physical address
    // corresponding to that memory and does not map it until required.
    // In that case, in page tables the virt_addr == 0, so first we need to map
    // those pages to obtain virtual address.
    //
    isUserAllocatedMem = NV_ALLOC_MAPPING_USER(at->flags) &&
                        !at->page_table[pageIndex]->virt_addr &&
                         at->page_table[pageIndex]->phys_addr;

    //
    // User memory may NOT have kernel VA. So check this and fallback to else
    // case to create one.
    //
    if (((size + pageOffset) <= PAGE_SIZE) &&
         !NV_ALLOC_MAPPING_GUEST(at->flags) && !NV_ALLOC_MAPPING_ALIASED(at->flags) &&
         !isUserAllocatedMem && !NV_ALLOC_MAPPING_PHYSICAL(at->flags))
    {
        *pPrivate = NULL;
        return (void *)(at->page_table[pageIndex]->virt_addr + pageOffset);
    }
    else
    {
        size += pageOffset;
        page_count = (size >> PAGE_SHIFT) + ((size & ~NV_PAGE_MASK) ? 1 : 0);

        if (NV_ALLOC_MAPPING_GUEST(at->flags))
        {
            virt_addr = nv_map_guest_pages(at,
                                           nv->bars[NV_GPU_BAR_INDEX_REGS].cpu_address,
                                           page_count, pageIndex);
        }
        else
        {
            NV_KMALLOC(pages, sizeof(struct page *) * page_count);
            if (pages == NULL)
            {
                nv_printf(NV_DBG_ERRORS,
                          "NVRM: failed to allocate vmap() page descriptor table!\n");
                return NULL;
            }

            for (j = 0; j < page_count; j++)
                pages[j] = NV_GET_PAGE_STRUCT(at->page_table[pageIndex+j]->phys_addr);

            virt_addr = nv_vm_map_pages(pages, page_count,
                NV_ALLOC_MAPPING_CACHED(at->flags));
            NV_KFREE(pages, sizeof(struct page *) * page_count);
        }

        if (virt_addr == 0)
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: failed to map pages!\n");
            return NULL;
        }

        *pPrivate = (void *)(NvUPtr)page_count;
        return (void *)(virt_addr + pageOffset);
    }

    return NULL;
}

NV_STATUS NV_API_CALL nv_free_kernel_mapping(
    nv_state_t *nv,
    void       *pAllocPrivate,
    void       *address,
    void       *pPrivate
)
{
    nv_alloc_t *at = pAllocPrivate;
    NvUPtr virt_addr;
    NvU32 page_count;

    virt_addr = ((NvUPtr)address & NV_PAGE_MASK);
    page_count = (NvUPtr)pPrivate;

    if (NV_ALLOC_MAPPING_GUEST(at->flags))
    {
        nv_iounmap((void *)virt_addr, (page_count * PAGE_SIZE));
    }
    else if (pPrivate != NULL)
    {
        nv_vm_unmap_pages(virt_addr, page_count);
    }

    return NV_OK;
}

NV_STATUS NV_API_CALL nv_alloc_pages(
    nv_state_t *nv,
    NvU32       page_count,
    NvBool      contiguous,
    NvU32       cache_type,
    NvBool      zeroed,
    NvU64      *pte_array,
    void      **priv_data
)
{
    nv_alloc_t *at;
    NV_STATUS status = NV_ERR_NO_MEMORY;
    nv_linux_state_t *nvl = NULL;
    NvBool will_remap = NV_FALSE;
    NvU32 i, memory_type;
    struct device *dev = NULL;

    nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_alloc_pages: %d pages\n", page_count);
    nv_printf(NV_DBG_MEMINFO, "NVRM: VM:    contig %d  cache_type %d\n",
        contiguous, cache_type);

    memory_type = NV_MEMORY_TYPE_SYSTEM;
    //
    // system memory allocation can be associated with a client instead of a gpu
    // handle the case where per device state is NULL
    //
    if(nv)
    {
       nvl = NV_GET_NVL_FROM_NV_STATE(nv);
       will_remap = nv_requires_dma_remap(nv);
       dev = nvl->dev;
    }

    if (nv_encode_caching(NULL, cache_type, memory_type))
        return NV_ERR_NOT_SUPPORTED;

    at = nvos_create_alloc(dev, page_count);
    if (at == NULL)
        return NV_ERR_NO_MEMORY;

    at->flags = nv_alloc_init_flags(cache_type, contiguous, zeroed);

#if defined(NVCPU_PPC64LE)
    /*
     * Starting on Power9 systems, DMA addresses for NVLink are no longer the
     * same as used over PCIe. There is an address compression scheme required
     * for NVLink ONLY which impacts the upper address bits of the DMA address.
     *
     * This divergence between PCIe and NVLink DMA mappings breaks assumptions
     * in the driver where during initialization we allocate system memory
     * for the GPU to access over PCIe before NVLink is trained -- and some of
     * these mappings persist on the GPU. If these persistent mappings are not
     * equivalent they will cause invalid DMA accesses from the GPU once we
     * switch to NVLink.
     *
     * To work around this we limit all system memory allocations from the driver
     * during the period before NVLink is enabled to be from NUMA node 0 (CPU 0)
     * which has a CPU real address with the upper address bits (above bit 42)
     * set to 0. Effectively making the PCIe and NVLink DMA mappings equivalent
     * allowing persistent system memory mappings already programmed on the GPU
     * to remain valid after NVLink is enabled.
     *
     * See Bug 1920398 for more details.
     */
    if (nv && nvl->npu && !(nv->nvlink_sysmem_links_enabled))
        at->flags |= NV_ALLOC_TYPE_NODE0;
#endif

    if (NV_ALLOC_MAPPING_CONTIG(at->flags))
        status = nv_alloc_contig_pages(nv, at);
    else
        status = nv_alloc_system_pages(nv, at);

    if (status != NV_OK)
        goto failed;

    for (i = 0; i < ((contiguous) ? 1 : page_count); i++)
    {
        /*
         * The contents of the pte_array[] depend on whether or not this device
         * requires DMA-remapping. If it does, it should be the phys addresses
         * used by the DMA-remapping paths, otherwise it should be the actual
         * address that the device should use for DMA (which, confusingly, may
         * be different than the CPU physical address, due to a static DMA
         * offset).
         */
        if ((nv == NULL) || will_remap)
        {
            pte_array[i] = at->page_table[i]->phys_addr;
        }
        else
        {
            pte_array[i] = nv_phys_to_dma(dev,
                at->page_table[i]->phys_addr);
        }
    }

    *priv_data = at;
    NV_ATOMIC_INC(at->usage_count);

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    return NV_OK;

failed:
    nvos_free_alloc(at);

    return status;
}

NV_STATUS NV_API_CALL nv_free_pages(
    nv_state_t *nv,
    NvU32 page_count,
    NvBool contiguous,
    NvU32 cache_type,
    void *priv_data
)
{
    NV_STATUS rmStatus = NV_OK;
    nv_alloc_t *at = priv_data;

    nv_printf(NV_DBG_MEMINFO, "NVRM: VM: nv_free_pages: 0x%x\n", page_count);

    NV_PRINT_AT(NV_DBG_MEMINFO, at);

    /*
     * If the 'at' usage count doesn't drop to zero here, not all of
     * the user mappings have been torn down in time - we can't
     * safely free the memory. We report success back to the RM, but
     * defer the actual free operation until later.
     *
     * This is described in greater detail in the comments above the
     * nvidia_vma_(open|release)() callbacks in nv-mmap.c.
     */
    if (!NV_ATOMIC_DEC_AND_TEST(at->usage_count))
        return NV_OK;

    if (!NV_ALLOC_MAPPING_GUEST(at->flags))
    {
        if (NV_ALLOC_MAPPING_CONTIG(at->flags))
            nv_free_contig_pages(at);
        else
            nv_free_system_pages(at);
    }

    nvos_free_alloc(at);

    return rmStatus;
}

static NvBool nv_lock_init_locks
(
    nvidia_stack_t *sp,
    nv_state_t *nv
)
{
    nv_linux_state_t *nvl;
    nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    NV_INIT_MUTEX(&nvl->ldata_lock);
    NV_INIT_MUTEX(&nvl->mmap_lock);

    NV_ATOMIC_SET(nvl->usage_count, 0);

    if (!rm_init_event_locks(sp, nv))
        return NV_FALSE;

    return NV_TRUE;
}

static void nv_lock_destroy_locks
(
    nvidia_stack_t *sp,
    nv_state_t *nv
)
{
    rm_destroy_event_locks(sp, nv);
}

void NV_API_CALL nv_post_event(
    nv_state_t *nv,
    nv_event_t *event,
    NvHandle    handle,
    NvU32       index,
    NvBool      data_valid
)
{
    nv_file_private_t *nvfp = event->file;
    unsigned long eflags;
    nvidia_event_t *nvet;

    NV_SPIN_LOCK_IRQSAVE(&nvfp->fp_lock, eflags);

    if (data_valid)
    {
        NV_KMALLOC_ATOMIC(nvet, sizeof(nvidia_event_t));
        if (nvet == NULL)
        {
            NV_SPIN_UNLOCK_IRQRESTORE(&nvfp->fp_lock, eflags);
            return;
        }

        if (nvfp->event_tail != NULL)
            nvfp->event_tail->next = nvet;
        if (nvfp->event_head == NULL)
            nvfp->event_head = nvet;
        nvfp->event_tail = nvet;
        nvet->next = NULL;

        nvet->event = *event;
        nvet->event.hObject = handle;
        nvet->event.index = index;
    }

    nvfp->event_pending = TRUE;

    NV_SPIN_UNLOCK_IRQRESTORE(&nvfp->fp_lock, eflags);

    wake_up_interruptible(&nvfp->waitqueue);
}

NV_STATUS NV_API_CALL nv_export_rm_object_to_fd(
    NvHandle  hExportedRmObject,
    NvS32     fd
)
{
    struct file *filp = NULL;
    nv_file_private_t *nvfp = NULL;
    dev_t rdev = 0;
    NV_STATUS status = NV_OK;

    filp = fget(fd);

    if (filp == NULL || !NV_FILE_INODE(filp))
    {
        status = NV_ERR_INVALID_ARGUMENT;
        goto done;
    }

    rdev = (NV_FILE_INODE(filp))->i_rdev;

    if ((MAJOR(rdev) != NV_MAJOR_DEVICE_NUMBER) ||
        (MINOR(rdev) != NV_CONTROL_DEVICE_MINOR))
    {
        status = NV_ERR_INVALID_ARGUMENT;
        goto done;
    }

    nvfp = NV_GET_FILE_PRIVATE(filp);

    if (nvfp->hExportedRmObject != 0)
    {
        status = NV_ERR_STATE_IN_USE;
        goto done;
    }

    nvfp->hExportedRmObject = hExportedRmObject;

done:

    if (filp != NULL)
    {
        fput(filp);
    }

    return status;
}

NV_STATUS NV_API_CALL nv_import_rm_object_from_fd(
    NvS32     fd,
    NvHandle *pExportedObject
)
{
    struct file *filp = NULL;
    nv_file_private_t *nvfp = NULL;
    dev_t rdev = 0;
    NV_STATUS status = NV_OK;

    filp = fget(fd);

    if (filp == NULL || !NV_FILE_INODE(filp))
    {
        status = NV_ERR_INVALID_ARGUMENT;
        goto done;
    }

    rdev = (NV_FILE_INODE(filp))->i_rdev;

    if ((MAJOR(rdev) != NV_MAJOR_DEVICE_NUMBER) ||
        (MINOR(rdev) != NV_CONTROL_DEVICE_MINOR))
    {
        status = NV_ERR_INVALID_ARGUMENT;
        goto done;
    }

    nvfp = NV_GET_FILE_PRIVATE(filp);

    if (nvfp->hExportedRmObject == 0)
    {
        status = NV_ERR_INVALID_ARGUMENT;
        goto done;
    }

    *pExportedObject = nvfp->hExportedRmObject;

done:

    if (filp != NULL)
    {
        fput(filp);
    }

    return status;
}

int NV_API_CALL nv_get_event(
    nv_state_t *nv,
    void       *file,
    nv_event_t *event,
    NvU32      *pending
)
{
    nv_file_private_t *nvfp = file;
    nvidia_event_t *nvet;
    unsigned long eflags;

    NV_SPIN_LOCK_IRQSAVE(&nvfp->fp_lock, eflags);

    nvet = nvfp->event_head;
    if (nvet == NULL)
    {
        NV_SPIN_UNLOCK_IRQRESTORE(&nvfp->fp_lock, eflags);
        return NV_ERR_GENERIC;
    }

    *event = nvet->event;

    if (nvfp->event_tail == nvet)
        nvfp->event_tail = NULL;
    nvfp->event_head = nvet->next;

    *pending = (nvfp->event_head != NULL);

    NV_SPIN_UNLOCK_IRQRESTORE(&nvfp->fp_lock, eflags);

    NV_KFREE(nvet, sizeof(nvidia_event_t));

    return NV_OK;
}

int NV_API_CALL nv_start_rc_timer(
    nv_state_t *nv
)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nv->rc_timer_enabled)
        return -1;

    nv_printf(NV_DBG_INFO, "NVRM: initializing rc timer\n");

    nv_timer_setup(&nvl->rc_timer, nvidia_rc_timer_callback);

    nv->rc_timer_enabled = 1;

    // set the timeout for 1 second in the future:
    mod_timer(&nvl->rc_timer.kernel_timer, jiffies + HZ);

    nv_printf(NV_DBG_INFO, "NVRM: rc timer initialized\n");

    return 0;
}

int NV_API_CALL nv_stop_rc_timer(
    nv_state_t *nv
)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (!nv->rc_timer_enabled)
        return -1;

    nv_printf(NV_DBG_INFO, "NVRM: stopping rc timer\n");
    nv->rc_timer_enabled = 0;
    del_timer_sync(&nvl->rc_timer.kernel_timer);
    nv_printf(NV_DBG_INFO, "NVRM: rc timer stopped\n");

    return 0;
}

static void
nvos_validate_assigned_gpus(struct pci_dev *pci_dev)
{
    NvU32 i;

    if (NV_IS_ASSIGN_GPU_PCI_INFO_SPECIFIED())
    {
        for (i = 0; i < nv_assign_gpu_count; i++)
        {
            if ((nv_assign_gpu_pci_info[i].domain == NV_PCI_DOMAIN_NUMBER(pci_dev)) &&
                (nv_assign_gpu_pci_info[i].bus == NV_PCI_BUS_NUMBER(pci_dev)) &&
                (nv_assign_gpu_pci_info[i].slot == NV_PCI_SLOT_NUMBER(pci_dev)))
            {
                nv_assign_gpu_pci_info[i].valid = NV_TRUE;
                return;
            }
        }
    }
}

/* make sure the pci_driver called probe for all of our devices.
 * we've seen cases where rivafb claims the device first and our driver
 * doesn't get called.
 */
static int
nvos_count_devices(nvidia_stack_t *sp)
{
    struct pci_dev *pci_dev;
    int count = 0;

    pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_DISPLAY_VGA << 8, NULL);
    while (pci_dev)
    {
        if ((pci_dev->vendor == 0x10de) && (pci_dev->device >= 0x20) &&
            !rm_is_legacy_device(sp, pci_dev->device, pci_dev->subsystem_vendor,
                                 pci_dev->subsystem_device, TRUE))
        {
            count++;
            nvos_validate_assigned_gpus(pci_dev);
        }
        pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_DISPLAY_VGA << 8, pci_dev);
    }

    pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_DISPLAY_3D << 8, NULL);
    while (pci_dev)
    {
        if ((pci_dev->vendor == 0x10de) && (pci_dev->device >= 0x20) &&
            !rm_is_legacy_device(sp, pci_dev->device, pci_dev->subsystem_vendor,
                                 pci_dev->subsystem_device, TRUE))
        {
            count++;
            nvos_validate_assigned_gpus(pci_dev);
        }
        pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_DISPLAY_3D << 8, pci_dev);
    }

    pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_MULTIMEDIA_OTHER << 8, NULL);
    while (pci_dev)
    {
        if ((pci_dev->vendor == 0x10de) && (pci_dev->device == 0x0e00))
            count++;
        pci_dev = NV_PCI_GET_CLASS(PCI_CLASS_MULTIMEDIA_OTHER << 8, pci_dev);
    }

    if (NV_IS_ASSIGN_GPU_PCI_INFO_SPECIFIED())
    {
        NvU32 i;

        for (i = 0; i < nv_assign_gpu_count; i++)
        {
            if (nv_assign_gpu_pci_info[i].valid == NV_TRUE)
                count++;
        }
    }

    return count;
}

NvBool nvos_is_chipset_io_coherent(void)
{
    if (nv_chipset_is_io_coherent == NV_TRISTATE_INDETERMINATE)
    {
        nvidia_stack_t *sp = NULL;
        if (nv_kmem_cache_alloc_stack(&sp) != 0)
        {
            nv_printf(NV_DBG_ERRORS,
              "NVRM: cannot allocate stack for platform coherence check callback \n");
            WARN_ON(1);
            return NV_FALSE;
        }

        nv_chipset_is_io_coherent = rm_is_chipset_io_coherent(sp);

        nv_kmem_cache_free_stack(sp);
    }

    return nv_chipset_is_io_coherent;
}

static BOOL nv_treat_missing_irq_as_error(void)
{
#if defined(NV_LINUX_PCIE_MSI_SUPPORTED)
    return (nv_get_hypervisor_type() != OS_HYPERVISOR_HYPERV);
#else
    return TRUE;
#endif
}

/* find nvidia devices and set initial state */
int
nvidia_probe
(
    struct pci_dev *pci_dev,
    const struct pci_device_id *id_table
)
{
    nv_state_t *nv = NULL;
    nv_linux_state_t *nvl = NULL;
    unsigned int i, j;
    int flags = 0;
    nvidia_stack_t *sp = NULL;
    NvBool prev_nv_ats_supported = nv_ats_supported;
    NV_STATUS status;

    nv_printf(NV_DBG_SETUP, "NVRM: probing 0x%x 0x%x, class 0x%x\n",
        pci_dev->vendor, pci_dev->device, pci_dev->class);

















    if ((pci_dev->class == (PCI_CLASS_MULTIMEDIA_OTHER << 8)) &&
        (pci_dev->device == 0x0e00))
    {
        flags = NV_FLAG_GVI;
    }

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return -1;
    }

    if (!(flags & NV_FLAG_GVI))
    {
        if ((pci_dev->vendor != 0x10de) || (pci_dev->device < 0x20) ||
            ((pci_dev->class != (PCI_CLASS_DISPLAY_VGA << 8)) &&
             (pci_dev->class != (PCI_CLASS_DISPLAY_3D << 8))) ||
            rm_is_legacy_device(sp, pci_dev->device, pci_dev->subsystem_vendor,
                                pci_dev->subsystem_device, FALSE))
        {
            nv_printf(NV_DBG_ERRORS, "NVRM: ignoring the legacy GPU %04x:%02x:%02x.%x\n",
                      NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
                      NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
            goto failed;
        }
    }

    if (NV_IS_ASSIGN_GPU_PCI_INFO_SPECIFIED())
    {
        for (i = 0; i < nv_assign_gpu_count; i++)
        {
            if (((nv_assign_gpu_pci_info[i].domain == NV_PCI_DOMAIN_NUMBER(pci_dev)) &&
                 (nv_assign_gpu_pci_info[i].bus == NV_PCI_BUS_NUMBER(pci_dev)) &&
                 (nv_assign_gpu_pci_info[i].slot == NV_PCI_SLOT_NUMBER(pci_dev))) &&
                (nv_assign_gpu_pci_info[i].valid))
                break;
        }

        if (i == nv_assign_gpu_count)
        {
            goto failed;
        }
    }

    num_probed_nv_devices++;

    if (pci_enable_device(pci_dev) != 0)
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: pci_enable_device failed, aborting\n");
        goto failed;
    }

    if ((pci_dev->irq == 0 && !pci_find_capability(pci_dev, PCI_CAP_ID_MSIX))
        && nv_treat_missing_irq_as_error())
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: Can't find an IRQ for your NVIDIA card!\n");
        nv_printf(NV_DBG_ERRORS, "NVRM: Please check your BIOS settings.\n");
        nv_printf(NV_DBG_ERRORS, "NVRM: [Plug & Play OS] should be set to NO\n");
        nv_printf(NV_DBG_ERRORS, "NVRM: [Assign IRQ to VGA] should be set to YES \n");
        goto failed;
    }

    for (i = 0; i < (NV_GPU_NUM_BARS - 1); i++)
    {
        if (NV_PCI_RESOURCE_VALID(pci_dev, i))
        {
#if defined(NV_PCI_MAX_MMIO_BITS_SUPPORTED)
            if ((NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_TYPE_64) &&
                ((NV_PCI_RESOURCE_START(pci_dev, i) >> NV_PCI_MAX_MMIO_BITS_SUPPORTED)))
            {
                nv_printf(NV_DBG_ERRORS,
                    "NVRM: This is a 64-bit BAR mapped above %dGB by the system\n"
                    "NVRM: BIOS or the %s kernel. This PCI I/O region assigned\n"
                    "NVRM: to your NVIDIA device is not supported by the kernel.\n"
                    "NVRM: BAR%d is %dM @ 0x%llx (PCI:%04x:%02x:%02x.%x)\n",
                    (1 << (NV_PCI_MAX_MMIO_BITS_SUPPORTED - 30)),
                    NV_KERNEL_NAME, i,
                    (NV_PCI_RESOURCE_SIZE(pci_dev, i) >> 20),
                    (NvU64)NV_PCI_RESOURCE_START(pci_dev, i),
                    NV_PCI_DOMAIN_NUMBER(pci_dev),
                    NV_PCI_BUS_NUMBER(pci_dev), NV_PCI_SLOT_NUMBER(pci_dev),
                    PCI_FUNC(pci_dev->devfn));
                goto failed;
            }
#endif
            if ((NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_TYPE_64) &&
                (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_PREFETCH))
            {
                struct pci_dev *bridge = pci_dev->bus->self;
                NvU32 base_upper, limit_upper;

                if (bridge == NULL)
                    continue;

                pci_read_config_dword(pci_dev, NVRM_PCICFG_BAR_OFFSET(i) + 4,
                        &base_upper);
                if (base_upper == 0)
                    continue;

                pci_read_config_dword(bridge, PCI_PREF_BASE_UPPER32,
                        &base_upper);
                pci_read_config_dword(bridge, PCI_PREF_LIMIT_UPPER32,
                        &limit_upper);

                if ((base_upper != 0) && (limit_upper != 0))
                    continue;

                nv_printf(NV_DBG_ERRORS,
                    "NVRM: This is a 64-bit BAR mapped above 4GB by the system\n"
                    "NVRM: BIOS or the %s kernel, but the PCI bridge\n"
                    "NVRM: immediately upstream of this GPU does not define\n"
                    "NVRM: a matching prefetchable memory window.\n",
                    NV_KERNEL_NAME);
                nv_printf(NV_DBG_ERRORS,
                    "NVRM: This may be due to a known Linux kernel bug.  Please\n"
                    "NVRM: see the README section on 64-bit BARs for additional\n"
                    "NVRM: information.\n");
                goto failed;
            }
                continue;
        }
        nv_printf(NV_DBG_ERRORS,
            "NVRM: This PCI I/O region assigned to your NVIDIA device is invalid:\n"
            "NVRM: BAR%d is %dM @ 0x%llx (PCI:%04x:%02x:%02x.%x)\n", i,
            (NV_PCI_RESOURCE_SIZE(pci_dev, i) >> 20),
            (NvU64)NV_PCI_RESOURCE_START(pci_dev, i),
            NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
            NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
#if defined(NVCPU_X86)
        if ((NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_TYPE_64) &&
            ((NV_PCI_RESOURCE_START(pci_dev, i) >> PAGE_SHIFT) > 0xfffffULL))
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: This is a 64-bit BAR mapped above 4GB by the system\n"
                "NVRM: BIOS or the Linux kernel.  The NVIDIA Linux/x86\n"
                "NVRM: graphics driver and other system software components\n"
                "NVRM: do not support this configuration.\n");
        }
        else
#endif
        if (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_MEM_TYPE_64)
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: This is a 64-bit BAR, which some operating system\n"
                "NVRM: kernels and other system software components are known\n"
                "NVRM: to handle incorrectly.  Please see the README section\n"
                "NVRM: on 64-bit BARs for more information.\n");
        }
        else
        {
            nv_printf(NV_DBG_ERRORS,
                "NVRM: The system BIOS may have misconfigured your GPU.\n");
        }
        goto failed;
    }

    if (!request_mem_region(NV_PCI_RESOURCE_START(pci_dev, NV_GPU_BAR_INDEX_REGS),
                            NV_PCI_RESOURCE_SIZE(pci_dev, NV_GPU_BAR_INDEX_REGS),
                            nv_device_name))
    {
        nv_printf(NV_DBG_ERRORS,
            "NVRM: request_mem_region failed for %dM @ 0x%llx. This can\n"
            "NVRM: occur when a driver such as rivatv is loaded and claims\n"
            "NVRM: ownership of the device's registers.\n",
            (NV_PCI_RESOURCE_SIZE(pci_dev, NV_GPU_BAR_INDEX_REGS) >> 20),
            (NvU64)NV_PCI_RESOURCE_START(pci_dev, NV_GPU_BAR_INDEX_REGS));
        goto failed;
    }

    NV_KMALLOC(nvl, sizeof(nv_linux_state_t));
    if (nvl == NULL)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: failed to allocate memory\n");
        goto err_not_supported;
    }

    os_mem_set(nvl, 0, sizeof(nv_linux_state_t));

    nv  = NV_STATE_PTR(nvl);

    pci_set_drvdata(pci_dev, (void *)nvl);

    /* default to 32-bit PCI bus address space */
    pci_dev->dma_mask = 0xffffffffULL;

    nvl->dev               = &pci_dev->dev;
    nvl->pci_dev           = pci_dev;

    nv->pci_info.vendor_id = pci_dev->vendor;
    nv->pci_info.device_id = pci_dev->device;
    nv->subsystem_id       = pci_dev->subsystem_device;
    nv->subsystem_vendor   = pci_dev->subsystem_vendor;
    nv->os_state           = (void *) nvl;
    nv->pci_info.domain    = NV_PCI_DOMAIN_NUMBER(pci_dev);
    nv->pci_info.bus       = NV_PCI_BUS_NUMBER(pci_dev);
    nv->pci_info.slot      = NV_PCI_SLOT_NUMBER(pci_dev);
    nv->handle             = pci_dev;
    nv->flags             |= flags;

    if (!nv_lock_init_locks(sp, nv))
    {
        goto err_not_supported;
    }

    nvl->all_mappings_revoked = TRUE;
    nvl->safe_to_mmap = TRUE;
    nvl->gpu_wakeup_callback_needed = TRUE;
    INIT_LIST_HEAD(&nvl->open_files);

    for (i = 0, j = 0; i < NVRM_PCICFG_NUM_BARS && j < NV_GPU_NUM_BARS; i++)
    {
        if ((NV_PCI_RESOURCE_VALID(pci_dev, i)) &&
            (NV_PCI_RESOURCE_FLAGS(pci_dev, i) & PCI_BASE_ADDRESS_SPACE)
                == PCI_BASE_ADDRESS_SPACE_MEMORY)
        {
            nv->bars[j].offset = NVRM_PCICFG_BAR_OFFSET(i);
            nv->bars[j].cpu_address = NV_PCI_RESOURCE_START(pci_dev, i);
            nv->bars[j].strapped_size = NV_PCI_RESOURCE_SIZE(pci_dev, i);
            nv->bars[j].size = nv->bars[j].strapped_size;
            j++;
        }
    }
    nv->regs = &nv->bars[NV_GPU_BAR_INDEX_REGS];
    nv->fb   = &nv->bars[NV_GPU_BAR_INDEX_FB];

    nv->interrupt_line = pci_dev->irq;

    nv_init_ibmnpu_info(nv);

#if defined(NVCPU_PPC64LE)
    // Use HW NUMA support as a proxy for ATS support. This is true in the only
    // platform where ATS is currently supported (IBM P9).
    nv_ats_supported &= nv_numa_info_valid(nvl);
#endif

    pci_set_master(pci_dev);

#if defined(CONFIG_VGA_ARB) && !defined(NVCPU_PPC64LE)
#if defined(VGA_DEFAULT_DEVICE)
    vga_tryget(VGA_DEFAULT_DEVICE, VGA_RSRC_LEGACY_MASK);
#endif
    vga_set_legacy_decoding(pci_dev, VGA_RSRC_NONE);
#endif

    if (rm_get_cpu_type(sp, &nv_cpu_type) != NV_OK)
        nv_printf(NV_DBG_ERRORS, "NVRM: error retrieving cpu type\n");

    if (NV_IS_GVI_DEVICE(nv))
    {
        if (!rm_gvi_init_private_state(sp, nv))
        {
            nv_printf(NV_DBG_ERRORS, "NVGVI: rm_init_gvi_private_state() failed!\n");
            goto err_not_supported;
        }

        if (rm_gvi_attach_device(sp, nv) != NV_OK)
        {
            rm_gvi_free_private_state(sp, nv);
            goto err_not_supported;
        }
    }
    else
    {
        status = nv_check_gpu_state(nv);
        if (status == NV_ERR_GPU_IS_LOST)
        {
            NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping nvidia_probe\n");
            goto err_not_supported;
        }

        if ((rm_is_supported_device(sp, nv)) != NV_OK)
            goto err_not_supported;

        if (!rm_init_private_state(sp, nv))
        {
            NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "rm_init_private_state() failed!\n");
            goto err_zero_dev;
        }
    }

    nv_printf(NV_DBG_INFO,
              "NVRM: PCI:%04x:%02x:%02x.%x (%04x:%04x): BAR0 @ 0x%llx (%lluMB)\n",
              nv->pci_info.domain, nv->pci_info.bus, nv->pci_info.slot,
              PCI_FUNC(pci_dev->devfn), nv->pci_info.vendor_id, nv->pci_info.device_id,
              nv->regs->cpu_address, (nv->regs->size >> 20));
    nv_printf(NV_DBG_INFO,
              "NVRM: PCI:%04x:%02x:%02x.%x (%04x:%04x): BAR1 @ 0x%llx (%lluMB)\n",
              nv->pci_info.domain, nv->pci_info.bus, nv->pci_info.slot,
              PCI_FUNC(pci_dev->devfn), nv->pci_info.vendor_id, nv->pci_info.device_id,
              nv->fb->cpu_address, (nv->fb->size >> 20));

    num_nv_devices++;

    for (i = 0; i < NV_GPU_NUM_BARS; i++)
    {
        if (nv->bars[i].size != 0)
        {
            if (nv_user_map_register(nv->bars[i].cpu_address,
                nv->bars[i].strapped_size) != 0)
            {
                NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                              "Failed to register usermap for BAR %u!\n", i);
                for (j = 0; j < i; j++)
                {
                    nv_user_map_unregister(nv->bars[j].cpu_address,
                                           nv->bars[j].strapped_size);
                }
                goto err_zero_dev;
            }
        }
    }

    /*
     * The newly created nvl object is added to the nv_linux_devices global list
     * only after all the initialization operations for that nvl object are
     * completed, so as to protect against simultaneous lookup operations which
     * may discover a partially initialized nvl object in the list
     */
    LOCK_NV_LINUX_DEVICES();
    if (nv_linux_devices == NULL)
        nv_linux_devices = nvl;
    else
    {
        nv_linux_state_t *tnvl;
        for (tnvl = nv_linux_devices; tnvl->next != NULL;  tnvl = tnvl->next);
        tnvl->next = nvl;
    }
    UNLOCK_NV_LINUX_DEVICES();
    if (nvidia_frontend_add_device((void *)&nv_fops, nvl) != 0)
        goto err_zero_dev;

#if defined(NV_PM_VT_SWITCH_REQUIRED_PRESENT)
    pm_vt_switch_required(nvl->dev, NV_TRUE);
#endif

    nv_procfs_add_gpu(nvl);

#if defined(NV_VGPU_KVM_BUILD)
    if (nvidia_vgpu_vfio_probe(nvl->pci_dev) != NV_OK)
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "Failed to register device to vGPU VFIO module");
        goto err_zero_dev;
    }
#endif

    nv_check_and_blacklist_gpu(sp, nv);

    rm_init_dynamic_power_management(sp, nv);

    nv_kmem_cache_free_stack(sp);

    return 0;

err_zero_dev:
    rm_free_private_state(sp, nv);
err_not_supported:
    nv_ats_supported = prev_nv_ats_supported;
    nv_destroy_ibmnpu_info(nv);
    nv_lock_destroy_locks(sp, nv);
    if (nvl != NULL)
    {
        NV_KFREE(nvl, sizeof(nv_linux_state_t));
    }
    release_mem_region(NV_PCI_RESOURCE_START(pci_dev, NV_GPU_BAR_INDEX_REGS),
                       NV_PCI_RESOURCE_SIZE(pci_dev, NV_GPU_BAR_INDEX_REGS));
    NV_PCI_DISABLE_DEVICE(pci_dev);
    pci_set_drvdata(pci_dev, NULL);
failed:
    nv_kmem_cache_free_stack(sp);
    return -1;
}

void
nvidia_remove(struct pci_dev *pci_dev)
{
    nv_linux_state_t *nvl = NULL;
    nv_state_t *nv;
    nvidia_stack_t *sp = NULL;
    NvU32 i;

    nv_printf(NV_DBG_SETUP, "NVRM: removing GPU %04x:%02x:%02x.%x\n",
              NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
              NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));













    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return;
    }

    LOCK_NV_LINUX_DEVICES();
    nvl = pci_get_drvdata(pci_dev);
    if (!nvl || (nvl->pci_dev != pci_dev))
    {
        goto done;
    }

    nv = NV_STATE_PTR(nvl);

    rm_cleanup_dynamic_power_management(sp, nv);

    rm_check_for_gpu_surprise_removal(sp, nv);

    /*
     * Sanity check: A removed device shouldn't have a non-zero usage_count.
     * For eGPU, fall off the bus along with clients active is a valid scenario.
     * Hence skipping the sanity check for eGPU.
     */
    if ((NV_ATOMIC_READ(nvl->usage_count) != 0) &&
        (!(NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv))))
    {
        nv_printf(NV_DBG_ERRORS,
                  "NVRM: Attempting to remove minor device %u with non-zero usage count!\n",
                  nvl->minor_num);
        WARN_ON(1);

        /* We can't continue without corrupting state, so just hang to give the
         * user some chance to do something about this before reboot */
        while (1)
            os_schedule();
    }

    if (nvl == nv_linux_devices)
        nv_linux_devices = nvl->next;
    else
    {
        nv_linux_state_t *tnvl;
        for (tnvl = nv_linux_devices; tnvl->next != nvl;  tnvl = tnvl->next);
        tnvl->next = nvl->next;
    }

    /* Remove proc entry for this GPU */
    nv_procfs_remove_gpu(nvl);

    down(&nvl->ldata_lock);
    UNLOCK_NV_LINUX_DEVICES();

#if defined(NV_PM_VT_SWITCH_REQUIRED_PRESENT)
    pm_vt_switch_unregister(&pci_dev->dev);
#endif

#if defined(NV_VGPU_KVM_BUILD)
    nvidia_vgpu_vfio_remove(pci_dev);
#endif

    /* Update the frontend data structures */
    if (!(NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)))
    {
        nvidia_frontend_remove_device((void *)&nv_fops, nvl);
    }

    if ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) || (nv->flags & NV_FLAG_OPEN))
    {
        if (nv->flags & NV_FLAG_PERSISTENT_SW_STATE)
        {
            rm_disable_gpu_state_persistence(sp, nv);
        }
        nv_shutdown_adapter(sp, nv, nvl);
        nv_dev_free_stacks(nvl);
    }

    nv_unregister_ibmnpu_devices(nv);
    nv_destroy_ibmnpu_info(nv);

    if (!(NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)))
    {
        nv_lock_destroy_locks(sp, nv);
    }

    num_probed_nv_devices--;

    pci_set_drvdata(pci_dev, NULL);

    if (NV_IS_GVI_DEVICE(nv))
    {
        flush_scheduled_work();
        rm_gvi_detach_device(sp, nv);
        rm_gvi_free_private_state(sp, nv);
    }
    else
    {
        for (i = 0; i < NV_GPU_NUM_BARS; i++)
        {
            if (nv->bars[i].size != 0)
            {
                nv_user_map_unregister(nv->bars[i].cpu_address,
                                       nv->bars[i].strapped_size);
            }
        }
        rm_i2c_remove_adapters(sp, nv);
        rm_free_private_state(sp, nv);
    }
    release_mem_region(NV_PCI_RESOURCE_START(pci_dev, NV_GPU_BAR_INDEX_REGS),
                       NV_PCI_RESOURCE_SIZE(pci_dev, NV_GPU_BAR_INDEX_REGS));

    num_nv_devices--;

    if (!(NV_IS_DEVICE_IN_SURPRISE_REMOVAL(nv)))
    {
        NV_PCI_DISABLE_DEVICE(pci_dev);
        NV_KFREE(nvl, sizeof(nv_linux_state_t));
    }
    else
    {
        up(&nvl->ldata_lock);
    }

    nv_kmem_cache_free_stack(sp);
    return;

done:
    UNLOCK_NV_LINUX_DEVICES();
    nv_kmem_cache_free_stack(sp);
}


#if defined(CONFIG_PM)
static NV_STATUS
nv_power_management(
    nv_state_t *nv,
    nv_pm_action_t pm_action
)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    int status = NV_OK;
    nvidia_stack_t *sp = NULL;

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return NV_ERR_NO_MEMORY;
    }

    status = nv_check_gpu_state(nv);
    if (status == NV_ERR_GPU_IS_LOST)
    {
        NV_DEV_PRINTF(NV_DBG_INFO, nv, "GPU is lost, skipping PM event\n");
        goto failure;
    }

    switch (pm_action)
    {
        case NV_PM_ACTION_STANDBY:
            /* fall through */
        case NV_PM_ACTION_HIBERNATE:
        {
            status = rm_power_management(sp, nv, pm_action);

            nv_kthread_q_stop(&nvl->bottom_half_q);

            nv_disable_pat_support();

            pci_save_state(nvl->pci_dev);
            break;
        }
        case NV_PM_ACTION_RESUME:
        {
            pci_restore_state(nvl->pci_dev);

            nv_enable_pat_support();

            nv_kthread_q_item_init(&nvl->bottom_half_q_item,
                                   nvidia_isr_bh_unlocked, (void *)nv);

            status = nv_kthread_q_init(&nvl->bottom_half_q, nv_device_name);
            if (status != NV_OK)
                break;

            status = rm_power_management(sp, nv, pm_action);
            break;
        }
        default:
            status = NV_ERR_INVALID_ARGUMENT;
            break;
    }

failure:
    nv_kmem_cache_free_stack(sp);

    return status;
}

static NV_STATUS
nv_restore_user_channels(
    nv_state_t *nv
)
{
    NV_STATUS status = NV_OK;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nv_stack_t *sp = NULL;

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return NV_ERR_NO_MEMORY;
    }

    down(&nvl->ldata_lock);

    if ((nv->flags & NV_FLAG_OPEN) == 0)
    {
        goto done;
    }

    status = rm_restart_user_channels(sp, nv);
    WARN_ON(status != NV_OK);

    down(&nvl->mmap_lock);

    nv_set_safe_to_mmap_locked(nv, NV_TRUE);

    up(&nvl->mmap_lock);

    status = rm_gpu_gc6_blocker_refcount_dec(sp, nv);
    WARN_ON(status != NV_OK);

done:
    up(&nvl->ldata_lock);

    nv_kmem_cache_free_stack(sp);

    return status;
}

static NV_STATUS
nv_preempt_user_channels(
    nv_state_t *nv
)
{
    NV_STATUS status = NV_OK;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nv_stack_t *sp = NULL;

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        return NV_ERR_NO_MEMORY;
    }

    down(&nvl->ldata_lock);

    if ((nv->flags & NV_FLAG_OPEN) == 0)
    {
        goto done;
    }

    status = rm_gpu_gc6_blocker_refcount_inc(sp, nv);
    WARN_ON(status != NV_OK);

    down(&nvl->mmap_lock);

    nv_set_safe_to_mmap_locked(nv, NV_FALSE);
    nv_revoke_gpu_mappings_locked(nv);

    up(&nvl->mmap_lock);

    status = rm_stop_user_channels(sp, nv);
    WARN_ON(status != NV_OK);

done:
    up(&nvl->ldata_lock);

    nv_kmem_cache_free_stack(sp);

    return status;
}

static NV_STATUS
nvidia_suspend(
    struct device *dev,
    nv_pm_action_t pm_action
)
{
    NV_STATUS status = NV_OK;
    struct pci_dev *pci_dev = to_pci_dev(dev);
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
    nv_state_t *nv = NV_STATE_PTR(nvl);

    down(&nvl->ldata_lock);

    if (((nv->flags & NV_FLAG_OPEN) == 0) &&
        ((nv->flags & NV_FLAG_PERSISTENT_SW_STATE) == 0))
    {
        goto done;
    }

    if ((nv->flags & NV_FLAG_SUSPENDED) != 0)
    {
        nvl->suspend_count++;
        goto done;
    }

    if (NV_IS_GVI_DEVICE(nv))
    {
        status = nv_gvi_suspend(nv, pm_action);
    }
    else
    {
        nvidia_modeset_suspend(nv->gpu_id);

        status = nv_power_management(nv, pm_action);

        if (status != NV_OK)
        {
            nvidia_modeset_resume(nv->gpu_id);
        }
    }

    if (status == NV_OK)
    {
        nv->flags |= NV_FLAG_SUSPENDED;
    }

done:
    up(&nvl->ldata_lock);

    return status;
}

static NV_STATUS
nvidia_resume(
    struct device *dev,
    nv_pm_action_t pm_action
)
{
    NV_STATUS status = NV_OK;
    struct pci_dev *pci_dev = to_pci_dev(dev);
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
    nv_state_t *nv = NV_STATE_PTR(nvl);

    down(&nvl->ldata_lock);

    if ((nv->flags & NV_FLAG_SUSPENDED) == 0)
    {
        goto done;
    }

    if (nvl->suspend_count != 0)
    {
        nvl->suspend_count--;
    }
    else
    {
        if (NV_IS_GVI_DEVICE(nv))
        {
            status = nv_gvi_resume(nv, pm_action);
        }
        else
        {
            status = nv_power_management(nv, pm_action);

            if (status == NV_OK)
            {
                nvidia_modeset_resume(nv->gpu_id);
            }
        }

        if (status == NV_OK)
        {
            nv->flags &= ~NV_FLAG_SUSPENDED;
        }
    }

done:
    up(&nvl->ldata_lock);

    return status;
}

static NV_STATUS
nv_resume_devices(
    nv_pm_action_t pm_action,
    nv_pm_action_depth_t pm_action_depth
)
{
    nv_linux_state_t *nvl;
    NvBool resume_devices = NV_TRUE;
    NV_STATUS status;

    if (pm_action_depth == NV_PM_ACTION_DEPTH_MODESET)
    {
        goto resume_modeset;
    }

    if (pm_action_depth == NV_PM_ACTION_DEPTH_UVM)
    {
        resume_devices = NV_FALSE;
    }

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        if (resume_devices)
        {
            status = nvidia_resume(nvl->dev, pm_action);
            WARN_ON(status != NV_OK);
        }
    }

    UNLOCK_NV_LINUX_DEVICES();

    status = nv_uvm_resume();
    WARN_ON(status != NV_OK);

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        status = nv_restore_user_channels(NV_STATE_PTR(nvl));
        WARN_ON(status != NV_OK);
    }

    UNLOCK_NV_LINUX_DEVICES();

resume_modeset:
    nvidia_modeset_resume(0);

    return NV_OK;
}

static NV_STATUS
nv_suspend_devices(
    nv_pm_action_t pm_action,
    nv_pm_action_depth_t pm_action_depth
)
{
    nv_linux_state_t *nvl;
    NvBool resume_devices = NV_FALSE;
    NV_STATUS status = NV_OK;

    nvidia_modeset_suspend(0);

    if (pm_action_depth == NV_PM_ACTION_DEPTH_MODESET)
    {
        return NV_OK;
    }

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL && status == NV_OK; nvl = nvl->next)
    {
        status = nv_preempt_user_channels(NV_STATE_PTR(nvl));
        WARN_ON(status != NV_OK);
    }

    UNLOCK_NV_LINUX_DEVICES();

    if (status == NV_OK)
    {
        status = nv_uvm_suspend();
        WARN_ON(status != NV_OK);
    }
    if (status != NV_OK)
    {
        goto done;
    }

    if (pm_action_depth == NV_PM_ACTION_DEPTH_UVM)
    {
        return NV_OK;
    }

    LOCK_NV_LINUX_DEVICES();

    for (nvl = nv_linux_devices; nvl != NULL && status == NV_OK; nvl = nvl->next)
    {
        status = nvidia_suspend(nvl->dev, pm_action);
        WARN_ON(status != NV_OK);
    }
    if (status != NV_OK)
    {
        resume_devices = NV_TRUE;
    }

    UNLOCK_NV_LINUX_DEVICES();

done:
    if (status != NV_OK)
    {
        LOCK_NV_LINUX_DEVICES();

        for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
        {
            if (resume_devices)
            {
                nvidia_resume(nvl->dev, pm_action);
            }

            nv_restore_user_channels(NV_STATE_PTR(nvl));
        }

        UNLOCK_NV_LINUX_DEVICES();
    }

    return status;
}

NV_STATUS
nv_set_system_power_state(
    nv_power_state_t power_state,
    nv_pm_action_depth_t pm_action_depth
)
{
    NV_STATUS status;
    nv_pm_action_t pm_action;

    switch (power_state)
    {
        case NV_POWER_STATE_IN_HIBERNATE:
            pm_action = NV_PM_ACTION_HIBERNATE;
            break;
        case NV_POWER_STATE_IN_STANDBY:
            pm_action = NV_PM_ACTION_STANDBY;
            break;
        case NV_POWER_STATE_RUNNING:
            pm_action = NV_PM_ACTION_RESUME;
            break;
        default:
            return NV_ERR_INVALID_ARGUMENT;
    }

    down(&nv_system_power_state_lock);

    if (nv_system_power_state == power_state)
    {
        status = NV_OK;
        goto done;
    }

    if (power_state == NV_POWER_STATE_RUNNING)
    {
        status = nv_resume_devices(pm_action, nv_system_pm_action_depth);
        NV_WRITE_UNLOCK_SYSTEM_PM_LOCK();
    }
    else
    {
        if (nv_system_power_state != NV_POWER_STATE_RUNNING)
        {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }

        nv_system_pm_action_depth = pm_action_depth;

        NV_WRITE_LOCK_SYSTEM_PM_LOCK();
        status = nv_suspend_devices(pm_action, nv_system_pm_action_depth);
        if (status != NV_OK)
        {
            NV_WRITE_UNLOCK_SYSTEM_PM_LOCK();
            goto done;
        }
    }

    nv_system_power_state = power_state;

done:
    up(&nv_system_power_state_lock);

    return status;
}

int nv_pmops_suspend(
    struct device *dev
)
{
    NV_STATUS status;

    status = nvidia_suspend(dev, NV_PM_ACTION_STANDBY);
    return (status == NV_OK) ? 0 : -EIO;
}

int nv_pmops_resume(
    struct device *dev
)
{
    NV_STATUS status;

    status = nvidia_resume(dev, NV_PM_ACTION_RESUME);
    return (status == NV_OK) ? 0 : -EIO;
}

int nv_pmops_freeze(
    struct device *dev
)
{
    NV_STATUS status;

    status = nvidia_suspend(dev, NV_PM_ACTION_HIBERNATE);
    return (status == NV_OK) ? 0 : -EIO;
}

int nv_pmops_thaw(
    struct device *dev
)
{
    return 0;
}

int nv_pmops_restore(
    struct device *dev
)
{
    NV_STATUS status;

    status = nvidia_resume(dev, NV_PM_ACTION_RESUME);
    return (status == NV_OK) ? 0 : -EIO;
}

int nv_pmops_poweroff(
    struct device *dev
)
{
    return 0;
}

int nv_pmops_runtime_suspend(
    struct device *dev
)
{
    return 0;
}

int nv_pmops_runtime_resume(
    struct device *dev
)
{
    return 0;
}
#endif /* defined(CONFIG_PM) */

#if defined(NV_PCI_ERROR_RECOVERY)
static pci_ers_result_t
nvidia_pci_error_detected(
    struct pci_dev *pci_dev,
    enum pci_channel_state error
)
{
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);

    if ((nvl == NULL) || (nvl->pci_dev != pci_dev))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid device!\n", __FUNCTION__);
        return PCI_ERS_RESULT_NONE;
    }

    /*
     * Tell Linux to continue recovery of the device. The kernel will enable
     * MMIO for the GPU and call the mmio_enabled callback.
     */
    return PCI_ERS_RESULT_CAN_RECOVER;
}

static pci_ers_result_t
nvidia_pci_mmio_enabled(
    struct pci_dev *pci_dev
)
{
    NV_STATUS         status = NV_OK;
    nv_stack_t       *sp = NULL;
    nv_linux_state_t *nvl = pci_get_drvdata(pci_dev);
    nv_state_t       *nv = NULL;

    if ((nvl == NULL) || (nvl->pci_dev != pci_dev))
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: invalid device!\n", __FUNCTION__);
        goto done;
    }

    nv = NV_STATE_PTR(nvl);

    if (nv_kmem_cache_alloc_stack(&sp) != 0)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: failed to allocate stack!\n",
            __FUNCTION__);
        goto done;
    }

    NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "A fatal error was detected.\n");

    /*
     * MMIO should be re-enabled now. If we still get bad reads, there's
     * likely something wrong with the adapter itself that will require a
     * reset. This should let us know whether the GPU has completely fallen
     * off the bus or just did something the host didn't like.
     */
    status = rm_is_supported_device(sp, nv);
    if (status != NV_OK)
    {
        NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
            "The kernel has enabled MMIO for the device,\n"
            "NVRM: but it still appears unreachable. The device\n"
            "NVRM: will not function properly until it is reset.\n");
    }

    status = rm_log_gpu_crash(sp, nv);
    if (status != NV_OK)
    {
        NV_DEV_PRINTF_STATUS(NV_DBG_ERRORS, nv, status,
                      "Failed to log crash data\n");
        goto done;
    }

done:
    if (sp != NULL)
    {
        nv_kmem_cache_free_stack(sp);
    }

    /*
     * Tell Linux to abandon recovery of the device. The kernel might be able
     * to recover the device, but RM and clients don't yet support that.
     */
    return PCI_ERS_RESULT_DISCONNECT;
}
#endif

nv_state_t* NV_API_CALL nv_get_adapter_state(
    NvU32 domain,
    NvU8  bus,
    NvU8  slot
)
{
    nv_linux_state_t *nvl;

    LOCK_NV_LINUX_DEVICES();
    for (nvl = nv_linux_devices; nvl != NULL;  nvl = nvl->next)
    {
        nv_state_t *nv = NV_STATE_PTR(nvl);
        if (nv->pci_info.domain == domain && nv->pci_info.bus == bus
            && nv->pci_info.slot == slot)
        {
            UNLOCK_NV_LINUX_DEVICES();
            return nv;
        }
    }
    UNLOCK_NV_LINUX_DEVICES();

    return NULL;
}

nv_state_t* NV_API_CALL nv_get_ctl_state(void)
{
    return NV_STATE_PTR(&nv_ctl_device);
}

NV_STATUS NV_API_CALL nv_log_error(
    nv_state_t *nv,
    NvU32       error_number,
    const char *format,
    va_list    ap
)
{
    NV_STATUS status = NV_OK;
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    nv_report_error(nvl->pci_dev, error_number, format, ap);
#if defined(CONFIG_CRAY_XT)
    status = nvos_forward_error_to_cray(nvl->pci_dev, error_number,
                format, ap);
#endif

    return status;
}

NvU64 NV_API_CALL nv_get_dma_start_address(
    nv_state_t *nv
)
{
    NvU64 start = 0;
#if defined(NVCPU_PPC64LE)
    nv_linux_state_t      *nvl;
    struct pci_dev        *pci_dev;
    dma_addr_t             dma_addr;
    NvU64                  saved_dma_mask;

    /*
     * If TCE bypass is disabled via a module parameter, then just return
     * the default (which is 0).
     *
     * Otherwise, the DMA start address only needs to be set once, and it
     * won't change afterward. Just return the cached value if asked again,
     * to avoid the kernel printing redundant messages to the kernel
     * log when we call pci_set_dma_mask().
     */
    nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    if ((nv_tce_bypass_mode == NV_TCE_BYPASS_MODE_DISABLE) ||
        (nvl->tce_bypass_enabled))
    {
        return nv->dma_addressable_start;
    }

    pci_dev = nvl->pci_dev;

    /*
     * Linux on IBM POWER8 offers 2 different DMA set-ups, sometimes
     * referred to as "windows".
     *
     * The "default window" provides a 2GB region of PCI address space
     * located below the 32-bit line. The IOMMU is used to provide a
     * "rich" mapping--any page in system memory can be mapped at an
     * arbitrary address within this window. The mappings are dynamic
     * and pass in and out of being as pci_map*()/pci_unmap*() calls
     * are made.
     *
     * Dynamic DMA Windows (sometimes "Huge DDW") provides a linear
     * mapping of the system's entire physical address space at some
     * fixed offset above the 59-bit line. IOMMU is still used, and
     * pci_map*()/pci_unmap*() are still required, but mappings are
     * static. They're effectively set up in advance, and any given
     * system page will always map to the same PCI bus address. I.e.
     *   physical 0x00000000xxxxxxxx => PCI 0x08000000xxxxxxxx
     *
     * This driver does not support the 2G default window because
     * of its limited size, and for reasons having to do with UVM.
     *
     * Linux on POWER8 will only provide the DDW-style full linear
     * mapping when the driver claims support for 64-bit DMA addressing
     * (a pre-requisite because the PCI addresses used in this case will
     * be near the top of the 64-bit range). The linear mapping
     * is not available in all system configurations.
     *
     * Detect whether the linear mapping is present by claiming
     * 64-bit support and then mapping physical page 0. For historical
     * reasons, Linux on POWER8 will never map a page to PCI address 0x0.
     * In the "default window" case page 0 will be mapped to some
     * non-zero address below the 32-bit line.  In the
     * DDW/linear-mapping case, it will be mapped to address 0 plus
     * some high-order offset.
     *
     * If the linear mapping is present and sane then return the offset
     * as the starting address for all DMA mappings.
     */
    saved_dma_mask = pci_dev->dma_mask;
    if (pci_set_dma_mask(pci_dev, DMA_BIT_MASK(64)) != 0)
    {
        goto done;
    }

    dma_addr = pci_map_single(pci_dev, NULL, 1, DMA_BIDIRECTIONAL);
    if (pci_dma_mapping_error(pci_dev, dma_addr))
    {
        pci_set_dma_mask(pci_dev, saved_dma_mask);
        goto done;
    }

    pci_unmap_single(pci_dev, dma_addr, 1, DMA_BIDIRECTIONAL);

    /*
     * From IBM: "For IODA2, native DMA bypass or KVM TCE-based implementation
     * of full 64-bit DMA support will establish a window in address-space
     * with the high 14 bits being constant and the bottom up-to-50 bits
     * varying with the mapping."
     *
     * Unfortunately, we don't have any good interfaces or definitions from
     * the kernel to get information about the DMA offset assigned by OS.
     * However, we have been told that the offset will be defined by the top
     * 14 bits of the address, and bits 40-49 will not vary for any DMA
     * mappings until 1TB of system memory is surpassed; this limitation is
     * essential for us to function properly since our current GPUs only
     * support 40 physical address bits. We are in a fragile place where we
     * need to tell the OS that we're capable of 64-bit addressing, while
     * relying on the assumption that the top 24 bits will not vary in this
     * case.
     *
     * The way we try to compute the window, then, is mask the trial mapping
     * against the DMA capabilities of the device. That way, devices with
     * greater addressing capabilities will only take the bits it needs to
     * define the window.
     */
    if ((dma_addr & DMA_BIT_MASK(32)) != 0)
    {
        /*
         * Huge DDW not available - page 0 mapped to non-zero address below
         * the 32-bit line.
         */
        nv_printf(NV_DBG_WARNINGS,
            "NVRM: DMA window limited by platform\n");
        pci_set_dma_mask(pci_dev, saved_dma_mask);
        goto done;
    }
    else if ((dma_addr & saved_dma_mask) != 0)
    {
        NvU64 memory_size = os_get_num_phys_pages() * PAGE_SIZE;
        if ((dma_addr & ~saved_dma_mask) !=
            ((dma_addr + memory_size) & ~saved_dma_mask))
        {
            /*
             * The physical window straddles our addressing limit boundary,
             * e.g., for an adapter that can address up to 1TB, the window
             * crosses the 40-bit limit so that the lower end of the range
             * has different bits 63:40 than the higher end of the range.
             * We can only handle a single, static value for bits 63:40, so
             * we must fall back here.
             */
            nv_printf(NV_DBG_WARNINGS,
                "NVRM: DMA window limited by memory size\n");
            pci_set_dma_mask(pci_dev, saved_dma_mask);
            goto done;
        }
    }

    nvl->tce_bypass_enabled = NV_TRUE;
    start = dma_addr & ~(saved_dma_mask);

    /* Update the coherent mask to match */
    dma_set_coherent_mask(&pci_dev->dev, pci_dev->dma_mask);

done:
#endif
    return start;
}

NV_STATUS NV_API_CALL nv_set_primary_vga_status(
    nv_state_t *nv
)
{
    /* IORESOURCE_ROM_SHADOW wasn't added until 2.6.10 */
#if defined(IORESOURCE_ROM_SHADOW)
    nv_linux_state_t *nvl;
    struct pci_dev *pci_dev;

    nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    pci_dev = nvl->pci_dev;

    nv->primary_vga = ((NV_PCI_RESOURCE_FLAGS(pci_dev, PCI_ROM_RESOURCE) &
        IORESOURCE_ROM_SHADOW) == IORESOURCE_ROM_SHADOW);
    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}

NV_STATUS NV_API_CALL nv_pci_trigger_recovery(
     nv_state_t *nv
)
{
    NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NV_PCI_ERROR_RECOVERY)
    nv_linux_state_t *nvl       = NV_GET_NVL_FROM_NV_STATE(nv);

    /*
     * Calling readl() on PPC64LE will allow the kernel to check its state for
     * the device and update it accordingly. This needs to be done before
     * checking if the PCI channel is offline, so that we don't check stale
     * state.
     *
     * This will also kick off the recovery process for the device.
     */
    if (NV_PCI_ERROR_RECOVERY_ENABLED())
    {
        if (readl(nv->regs->map) == 0xFFFFFFFF)
        {
            if (pci_channel_offline(nvl->pci_dev))
            {
                NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
                              "PCI channel for the device is offline\n");
                status = NV_OK;
            }
        }
    }
#endif
    return status;
}

NvBool NV_API_CALL nv_requires_dma_remap(
    nv_state_t *nv
)
{
    NvBool dma_remap = NV_FALSE;
#if !defined(NVCPU_ARM)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    dma_remap = !nv_dma_maps_swiotlb(nvl->dev);
#endif
    return dma_remap;
}

/*
 * Intended for use by external kernel modules to list nvidia gpu ids.
 */
NvBool nvidia_get_gpuid_list(NvU32 *gpu_ids, NvU32 *gpu_count)
{
    nv_linux_state_t *nvl;
    unsigned int count;
    NvBool ret = NV_TRUE;

    LOCK_NV_LINUX_DEVICES();

    count = 0;
    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
        count++;

    if (*gpu_count == 0)
    {
        goto done;
    }
    else if ((*gpu_count) < count)
    {
        ret = NV_FALSE;
        goto done;
    }

    count = 0;
    for (nvl = nv_linux_devices; nvl != NULL; nvl = nvl->next)
    {
        nv_state_t *nv = NV_STATE_PTR(nvl);
        gpu_ids[count++] = nv->gpu_id;
    }


done:

    *gpu_count = count;

    UNLOCK_NV_LINUX_DEVICES();

    return ret;
}

/*
 * Kernel-level analog to nvidia_open, intended for use by external
 * kernel modules. This increments the ref count of the device with
 * the given gpu_id and makes sure the device has been initialized.
 *
 * Clients of this interface are counted by the RM reset path, to ensure a
 * GPU is not reset while the GPU is active.
 *
 * Returns -ENODEV if the given gpu_id does not exist.
 */
int nvidia_dev_get(NvU32 gpu_id, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl;
    int rc;

    /* Takes nvl->ldata_lock */
    nvl = find_gpu_id(gpu_id);
    if (!nvl)
        return -ENODEV;

    rc = nv_open_device(NV_STATE_PTR(nvl), sp);

    if (rc == 0)
        WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_TRUE) != NV_OK);

    up(&nvl->ldata_lock);
    return rc;
}

/*
 * Kernel-level analog to nvidia_close, intended for use by external
 * kernel modules. This decrements the ref count of the device with
 * the given gpu_id, potentially tearing it down.
 */
void nvidia_dev_put(NvU32 gpu_id, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl;

    /* Takes nvl->ldata_lock */
    nvl = find_gpu_id(gpu_id);
    if (!nvl)
        return;

    nv_close_device(NV_STATE_PTR(nvl), sp);

    WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_FALSE) != NV_OK);

    up(&nvl->ldata_lock);
}

/*
 * Like nvidia_dev_get but uses UUID instead of gpu_id. Note that this may
 * trigger initialization and teardown of unrelated devices to look up their
 * UUIDs.
 *
 * Clients of this interface are counted by the RM reset path, to ensure a
 * GPU is not reset while the GPU is active.
 */
int nvidia_dev_get_uuid(const NvU8 *uuid, nvidia_stack_t *sp)
{
    nv_state_t *nv = NULL;
    nv_linux_state_t *nvl = NULL;
    const NvU8 *dev_uuid;
    int rc = 0;

    /* Takes nvl->ldata_lock */
    nvl = find_uuid_candidate(uuid);
    while (nvl)
    {
        nv = NV_STATE_PTR(nvl);

        /*
         * If the device is missing its UUID, this call exists solely so
         * rm_get_gpu_uuid_raw will be called and we can inspect the UUID.
         */
        rc = nv_open_device(nv, sp);
        if (rc != 0)
            goto out;

        /* The UUID should always be present following nv_open_device */
        dev_uuid = nv_get_cached_uuid(nv);
        WARN_ON(!dev_uuid);
        if (dev_uuid && memcmp(dev_uuid, uuid, GPU_UUID_LEN) == 0)
            break;

        /* No match, try again. */
        nv_close_device(nv, sp);
        up(&nvl->ldata_lock);
        nvl = find_uuid_candidate(uuid);
    }

    if (nvl)
    {
        rc = 0;
        WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_TRUE) != NV_OK);
    }
    else
        rc = -ENODEV;

out:
    if (nvl)
        up(&nvl->ldata_lock);
    return rc;
}

/*
 * Like nvidia_dev_put but uses UUID instead of gpu_id.
 */
void nvidia_dev_put_uuid(const NvU8 *uuid, nvidia_stack_t *sp)
{
    nv_linux_state_t *nvl;

    /* Callers must already have called nvidia_dev_get_uuid() */

    /* Takes nvl->ldata_lock */
    nvl = find_uuid(uuid);
    if (!nvl)
        return;

    nv_close_device(NV_STATE_PTR(nvl), sp);

    WARN_ON(rm_set_external_kernel_client_count(sp, NV_STATE_PTR(nvl), NV_FALSE) != NV_OK);

    up(&nvl->ldata_lock);
}

int nvidia_dev_get_pci_info(const NvU8 *uuid, struct pci_dev **pci_dev_out, NvU64 *dma_start, NvU64 *dma_limit)
{
    nv_linux_state_t *nvl;

    /* Takes nvl->ldata_lock */
    nvl = find_uuid(uuid);
    if (!nvl)
        return -ENODEV;

    *pci_dev_out = nvl->pci_dev;
    *dma_start = nvl->nv_state.dma_addressable_start;
    *dma_limit = nvl->nv_state.dma_addressable_limit;

    up(&nvl->ldata_lock);

    return 0;
}

int nvidia_dev_block_gc6(const NvU8 *uuid, nvidia_stack_t *sp)

{
    nv_linux_state_t *nvl;

    /* Callers must already have called nvidia_dev_get_uuid() */

    /* Takes nvl->ldata_lock */
    nvl = find_uuid(uuid);
    if (!nvl)
        return -ENODEV;

    if (rm_gpu_gc6_blocker_refcount_inc(sp, NV_STATE_PTR(nvl)) != NV_OK)
    {
        up(&nvl->ldata_lock);
        return -EINVAL;
    }

    up(&nvl->ldata_lock);

    return 0;
}

int nvidia_dev_unblock_gc6(const NvU8 *uuid, nvidia_stack_t *sp)

{
    nv_linux_state_t *nvl;

    /* Callers must already have called nvidia_dev_get_uuid() */

    /* Takes nvl->ldata_lock */
    nvl = find_uuid(uuid);
    if (!nvl)
        return -ENODEV;

    if (rm_gpu_gc6_blocker_refcount_dec(sp, NV_STATE_PTR(nvl)) != NV_OK)
    {
        up(&nvl->ldata_lock);
        return -EINVAL;
    }

    up(&nvl->ldata_lock);

    return 0;
}

NV_STATUS NV_API_CALL nv_get_device_memory_config(
    nv_state_t *nv,
    NvU32 *compr_addr_sys_phys,
    NvU32 *addr_guest_phys,
    NvU32 *addr_width,
    NvU32 *granularity,
    NvS32 *node_id
)
{
    NV_STATUS status = NV_ERR_NOT_SUPPORTED;
#if defined(NVCPU_PPC64LE)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    nv_numa_info_t *numa_info;

    if (!nv_numa_info_valid(nvl))
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    numa_info = &nvl->npu->numa_info;

    if (node_id != NULL)
    {
        *node_id = numa_info->node_id;
    }

    if (compr_addr_sys_phys != NULL)
    {
        *compr_addr_sys_phys =
            numa_info->compr_sys_phys_addr >> nv_volta_addr_space_width;
    }

    if (addr_guest_phys != NULL)
    {
        *addr_guest_phys =
            numa_info->guest_phys_addr >> nv_volta_addr_space_width;
    }

    if (addr_width != NULL)
    {
        *addr_width = nv_volta_dma_addr_size - nv_volta_addr_space_width;
    }

    if (granularity != NULL)
    {
        *granularity = nv_volta_addr_space_width;
    }

    status = NV_OK;
#endif
    return status;
}

#if defined(NVCPU_PPC64LE)

NV_STATUS NV_API_CALL nv_get_nvlink_line_rate(
    nv_state_t *nvState,
    NvU32      *linerate
)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT) && defined(NV_OF_GET_PROPERTY_PRESENT)

    nv_linux_state_t *nvl;
    struct pci_dev   *npuDev;
    NvU32            *pSpeedPtr = NULL;
    NvU32            speed;
    int              len;

    if (nvState != NULL)
        nvl = NV_GET_NVL_FROM_NV_STATE(nvState);
    else
        return NV_ERR_INVALID_ARGUMENT;

    if (!nvl->npu)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    npuDev = nvl->npu->devs[0];
    if (!npuDev->dev.of_node)
    {
        nv_printf(NV_DBG_ERRORS, "NVRM: %s: OF Node not found in IBM-NPU device node\n",
                  __FUNCTION__);
        return NV_ERR_NOT_SUPPORTED;
    }

    pSpeedPtr = (NvU32 *) of_get_property(npuDev->dev.of_node, "ibm,nvlink-speed", &len);

    if (pSpeedPtr)
    {
        speed = (NvU32) be32_to_cpup(pSpeedPtr);
    }
    else
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    if (!speed)
    {
        return NV_ERR_NOT_SUPPORTED;
    }
    else
    {
        *linerate = speed;
    }

    return NV_OK;

#endif

    return NV_ERR_NOT_SUPPORTED;
}

#endif

#if defined(NV_BACKLIGHT_DEVICE_REGISTER_PRESENT)
static int nv_update_backlight_status(struct backlight_device *bd)
{
    nvidia_stack_t *sp;
    nv_state_t *nv = bl_get_data(bd);
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU32 displayId = nvl->backlight.displayId;
    NV_STATUS status;
    int rc;

    rc = nv_kmem_cache_alloc_stack(&sp);
    if (rc != 0)
    {
        return rc;
    }

    status = rm_set_backlight(sp, nv, displayId, bd->props.brightness);
    if (status != NV_OK)
    {
        rc = -EINVAL;
    }

    nv_kmem_cache_free_stack(sp);
    return rc;
}

static int nv_get_backlight_brightness(struct backlight_device *bd)
{
    nvidia_stack_t *sp;
    nv_state_t *nv = bl_get_data(bd);
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    const NvU32 displayId = nvl->backlight.displayId;
    NV_STATUS status;
    NvU32 brightness;
    int rc;

    rc = nv_kmem_cache_alloc_stack(&sp);
    if (rc != 0)
    {
        return -1;
    }

    status = rm_get_backlight(sp, nv, displayId, &brightness);

    nv_kmem_cache_free_stack(sp);

    if (status == NV_OK)
    {
        return brightness;
    }

    return -1;
}

static const struct backlight_ops nvidia_backlight_ops = {
    .update_status = nv_update_backlight_status,
    .get_brightness = nv_get_backlight_brightness,
};
#endif


void NV_API_CALL nv_register_backlight(
    nv_state_t *nv,
    NvU32 displayId,
    NvU32 currentBrightness
)
{
#if defined(NV_BACKLIGHT_DEVICE_REGISTER_PRESENT)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    char name[11];
    struct backlight_properties props = {
        .brightness = currentBrightness,
        .max_brightness = 100,
# if defined(NV_BACKLIGHT_PROPERTIES_TYPE_PRESENT)
        .type = BACKLIGHT_RAW,
# endif
    };

    snprintf(name, sizeof(name), "nvidia_%d", nvl->minor_num);
    name[sizeof(name) - 1] = '\0';

    nvl->backlight.dev = backlight_device_register(name, nvl->dev, nv,
                                                   &nvidia_backlight_ops,
                                                   &props);
    nvl->backlight.displayId = displayId;
#endif
}

void NV_API_CALL nv_unregister_backlight(
    nv_state_t *nv
)
{
#if defined(NV_BACKLIGHT_DEVICE_REGISTER_PRESENT)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->backlight.dev)
    {
        backlight_device_unregister(nvl->backlight.dev);
    }
#endif
}

NV_STATUS NV_API_CALL nv_indicate_idle(
    nv_state_t *nv
)
{
#if defined(NV_PM_RUNTIME_AVAILABLE)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    struct device *dev = nvl->dev;

    pm_runtime_put_noidle(dev);

    dev->power.timer_expires = jiffies;
    mod_timer(&dev->power.suspend_timer, jiffies);

    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}

NV_STATUS NV_API_CALL nv_indicate_not_idle(
    nv_state_t *nv
)
{
#if defined(NV_PM_RUNTIME_AVAILABLE)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    struct device *dev = nvl->dev;

    pm_runtime_get_noresume(dev);

    pci_bus_type.shutdown(dev);

    return NV_OK;
#else
    return NV_ERR_NOT_SUPPORTED;
#endif
}
