diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a36ae1612059..9ae336218ac1 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ccflags-y				+= -I$(src)
-
+ccflags-y				+= -DCONFIG_NVFS
 obj-$(CONFIG_NVME_CORE)			+= nvme-core.o
 obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
 obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
@@ -18,10 +18,12 @@ nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
 nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
 
 nvme-y					+= pci.o
+nvme-y					+= nvfs-dma.o
 
 nvme-fabrics-y				+= fabrics.o
 
 nvme-rdma-y				+= rdma.o
+nvme-rdma-y				+= nvfs-rdma.o
 
 nvme-fc-y				+= fc.o
diff --git a/drivers/nvme/host/nvfs-dma.c b/drivers/nvme/host/nvfs-dma.c
new file mode 100644
index 000000000000..08c405bd924c
--- /dev/null
+++ b/drivers/nvme/host/nvfs-dma.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifdef CONFIG_NVFS
+#define MODULE_PREFIX nvme_v1
+#include "nvfs.h"
+
+struct nvfs_dma_rw_ops *nvfs_ops;
+
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+
+DEFINE_PER_CPU(long, nvfs_n_ops);
+
+// must have for compatibility
+#define NVIDIA_FS_COMPAT_FT(ops) \
+	(NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))
+
+// protected via nvfs_module_mutex
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+	if (NVIDIA_FS_COMPAT_FT(ops)) {
+		nvfs_ops = ops;
+		atomic_set(&nvfs_shutdown, 0);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+// protected via nvfs_module_mutex
+void UNREGISTER_FUNC(void)
+{
+	(void) atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+	do {
+		msleep(NVFS_HOLD_TIME_MS);
+	} while (nvfs_count_ops());
+	nvfs_ops = NULL;
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+#endif
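With MODULE_PREFIX defined as nvme_v1, the REGISTER_FUNC/UNREGISTER_FUNC macros (see nvfs.h below) expand to nvme_v1_register_nvfs_dma_ops() and nvme_v1_unregister_nvfs_dma_ops(), the symbols the out-of-tree nvidia-fs module resolves at load time. A minimal sketch of the vendor side of that handshake, assuming a hypothetical module whose my_*() callbacks are illustrative stand-ins and not part of this patch:

/* Hypothetical vendor-module sketch; only struct nvfs_dma_rw_ops, the
 * feature bits, and the register/unregister symbols come from this patch. */
#include <linux/module.h>

#define MODULE_PREFIX nvme_v1
#include "nvfs.h"

static int my_map_sg(struct request_queue *q, struct request *req,
		     struct scatterlist *sgl)
{
	return 0;	/* 0 = no GPU pages in this bio; nvme takes the CPU path */
}

static int my_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
			       int nents, enum dma_data_direction dir,
			       unsigned long attrs)
{
	return NVFS_IO_ERR;	/* stub: report a mapping failure */
}

static int my_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
			   int nents, enum dma_data_direction dir)
{
	return 0;	/* 0 = this sg list was not ours to unmap */
}

static struct nvfs_dma_rw_ops my_ops = {
	/* both bits are mandatory, or REGISTER_FUNC() returns -EOPNOTSUPP */
	.ft_bmap		= nvfs_ft_prep_sglist | nvfs_ft_map_sglist,
	.nvfs_blk_rq_map_sg	= my_map_sg,
	.nvfs_dma_map_sg_attrs	= my_dma_map_sg_attrs,
	.nvfs_dma_unmap_sg	= my_dma_unmap_sg,
};

static int __init my_init(void)
{
	return nvme_v1_register_nvfs_dma_ops(&my_ops);
}

static void __exit my_exit(void)
{
	/* sleeps in NVFS_HOLD_TIME_MS steps until all in-flight ops drain */
	nvme_v1_unregister_nvfs_dma_ops();
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

Note the shutdown ordering: UNREGISTER_FUNC() first flips nvfs_shutdown so no new references can be taken, then polls the per-CPU reference count before clearing nvfs_ops, so in-flight I/O never sees a NULL ops table.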
diff --git a/drivers/nvme/host/nvfs-dma.h b/drivers/nvme/host/nvfs-dma.h
new file mode 100644
index 000000000000..4a0d9c998d4d
--- /dev/null
+++ b/drivers/nvme/host/nvfs-dma.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef NVFS_DMA_H
+#define NVFS_DMA_H
+
+static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmnd);
+
+static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmd, int entries);
+
+static bool nvme_nvfs_unmap_data(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	enum dma_data_direction dma_dir = rq_dma_dir(req);
+
+	if (!iod || !iod->nents)
+		return false;
+
+	if (iod->sg && !is_pci_p2pdma_page(sg_page(iod->sg)) &&
+	    !blk_integrity_rq(req) &&
+	    !iod->dma_len &&
+	    nvfs_ops != NULL) {
+		int count;
+
+		count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sg,
+				iod->nents, dma_dir);
+		if (!count)
+			return false;
+
+		nvfs_put_ops();
+		return true;
+	}
+	return false;
+}
+
+static blk_status_t nvme_nvfs_map_data(struct nvme_dev *dev, struct request *req,
+		struct nvme_command *cmnd, bool *is_nvfs_io)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct request_queue *q = req->q;
+	enum dma_data_direction dma_dir = rq_dma_dir(req);
+	blk_status_t ret = BLK_STS_RESOURCE;
+	int nr_mapped = 0;
+
+	*is_nvfs_io = false;
+
+	if (!blk_integrity_rq(req) && nvfs_get_ops()) {
+		iod->dma_len = 0;
+		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+		if (!iod->sg) {
+			nvfs_put_ops();
+			return BLK_STS_RESOURCE;
+		}
+
+		sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
+		// associates bio pages to scatterlist
+		iod->nents = nvfs_ops->nvfs_blk_rq_map_sg(q, req, iod->sg);
+		if (!iod->nents) {
+			mempool_free(iod->sg, dev->iod_mempool);
+			nvfs_put_ops();
+			return BLK_STS_IOERR; // reset to original ret
+		}
+		*is_nvfs_io = true;
+
+		if (unlikely(iod->nents == NVFS_IO_ERR)) {
+			pr_err("%s: failed to map sg_nents=%d\n", __func__,
+			       iod->nents);
+			mempool_free(iod->sg, dev->iod_mempool);
+			nvfs_put_ops();
+			return BLK_STS_IOERR;
+		}
+
+		nr_mapped = nvfs_ops->nvfs_dma_map_sg_attrs(dev->dev,
+				iod->sg, iod->nents, dma_dir,
+				DMA_ATTR_NO_WARN);
+
+		if (unlikely(nr_mapped == NVFS_IO_ERR)) {
+			mempool_free(iod->sg, dev->iod_mempool);
+			nvfs_put_ops();
+			pr_err("%s: failed to dma map sglist=%d\n", __func__,
+			       iod->nents);
+			return BLK_STS_IOERR;
+		}
+
+		if (unlikely(nr_mapped == NVFS_CPU_REQ)) {
+			mempool_free(iod->sg, dev->iod_mempool);
+			nvfs_put_ops();
+			WARN_ON(1);
+			// must not fall through with a freed sg list
+			return BLK_STS_IOERR;
+		}
+
+		iod->use_sgl = nvme_pci_use_sgls(dev, req);
+		if (iod->use_sgl) {
+			ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
+		} else {
+			// write the DMA addresses into the command's PRP entries
+			ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+		}
+
+		if (ret != BLK_STS_OK) {
+			nvme_nvfs_unmap_data(dev, req);
+			mempool_free(iod->sg, dev->iod_mempool);
+		}
+		return ret;
+	}
+	return ret;
+}
+
+#endif /* NVFS_DMA_H */
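The NVFS_IO_ERR/NVFS_CPU_REQ sentinels returned through iod->nents and nr_mapped drive every branch above. Their origin on the vendor side is worth one sketch; this hypothetical implementation of the nvfs_dma_map_sg_attrs callback is an illustrative assumption (the my_*() helpers do not exist in this patch):

/* Hypothetical vendor callback, showing where each sentinel comes from. */
static int my_dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
			       int nents, enum dma_data_direction dir,
			       unsigned long attrs)
{
	if (!my_sgl_is_gpu_backed(sgl, nents))	/* assumed helper */
		return NVFS_CPU_REQ;	/* late request for the CPU path;
					 * nvme_nvfs_map_data() above treats
					 * this as a WARN-worthy bug */
	if (my_gpu_dma_map(dev, sgl, nents, dir, attrs) < 0)
		return NVFS_IO_ERR;	/* nvme completes with BLK_STS_IOERR */
	return nents;			/* > 0: number of mapped entries */
}

Note also the reference discipline: every successful nvfs_get_ops() in nvme_nvfs_map_data() is balanced by exactly one nvfs_put_ops(), either on an error branch or later in nvme_nvfs_unmap_data() at completion. That invariant is what lets UNREGISTER_FUNC() drain nvfs_count_ops() to zero.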
diff --git a/drivers/nvme/host/nvfs-rdma.c b/drivers/nvme/host/nvfs-rdma.c
new file mode 100644
index 000000000000..cc2d00653060
--- /dev/null
+++ b/drivers/nvme/host/nvfs-rdma.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifdef CONFIG_NVFS
+#define MODULE_PREFIX nvme_rdma_v1
+#include "nvfs.h"
+
+struct nvfs_dma_rw_ops *nvfs_ops;
+
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+
+DEFINE_PER_CPU(long, nvfs_n_ops);
+
+// must have for compatibility
+#define NVIDIA_FS_COMPAT_FT(ops) \
+	(NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))
+
+// protected via nvfs_module_mutex
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+	if (NVIDIA_FS_COMPAT_FT(ops)) {
+		nvfs_ops = ops;
+		atomic_set(&nvfs_shutdown, 0);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+// protected via nvfs_module_mutex
+void UNREGISTER_FUNC(void)
+{
+	(void) atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+	do {
+		msleep(NVFS_HOLD_TIME_MS);
+	} while (nvfs_count_ops());
+	nvfs_ops = NULL;
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+#endif
diff --git a/drivers/nvme/host/nvfs-rdma.h b/drivers/nvme/host/nvfs-rdma.h
new file mode 100644
index 000000000000..020fc83f7360
--- /dev/null
+++ b/drivers/nvme/host/nvfs-rdma.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef NVFS_RDMA_H
+#define NVFS_RDMA_H
+
+static bool nvme_rdma_nvfs_unmap_data(struct ib_device *ibdev,
+		struct request *rq)
+{
+	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+	enum dma_data_direction dma_dir = rq_dma_dir(rq);
+	int count;
+
+	if (!blk_integrity_rq(rq) && nvfs_ops != NULL) {
+		count = nvfs_ops->nvfs_dma_unmap_sg(ibdev->dma_device,
+				req->data_sgl.sg_table.sgl,
+				req->data_sgl.nents, dma_dir);
+		if (count) {
+			nvfs_put_ops();
+			sg_free_table_chained(&req->data_sgl.sg_table,
+					NVME_INLINE_SG_CNT);
+			return true;
+		}
+	}
+	return false;
+}
+
+static int nvme_rdma_nvfs_map_data(struct ib_device *ibdev, struct request *rq,
+		bool *is_nvfs_io, int *count)
+{
+	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+	enum dma_data_direction dma_dir = rq_dma_dir(rq);
+
+	*is_nvfs_io = false;
+	*count = 0;
+	if (!blk_integrity_rq(rq) && nvfs_get_ops()) {
+		// associates bio pages to scatterlist
+		*count = nvfs_ops->nvfs_blk_rq_map_sg(rq->q, rq,
+				req->data_sgl.sg_table.sgl);
+		if (!*count) {
+			nvfs_put_ops();
+			return 0; // fall to cpu path
+		}
+
+		*is_nvfs_io = true;
+		if (unlikely(*count == NVFS_IO_ERR)) {
+			nvfs_put_ops();
+			pr_err("%s: failed to map sg_nents=%d\n", __func__,
+			       req->data_sgl.nents);
+			return -EIO;
+		}
+		req->data_sgl.nents = *count;
+
+		*count = nvfs_ops->nvfs_dma_map_sg_attrs(ibdev->dma_device,
+				req->data_sgl.sg_table.sgl,
+				req->data_sgl.nents,
+				dma_dir,
+				DMA_ATTR_NO_WARN);
+
+		if (unlikely(*count == NVFS_IO_ERR)) {
+			nvfs_put_ops();
+			return -EIO;
+		}
+
+		if (unlikely(*count == NVFS_CPU_REQ)) {
+			nvfs_put_ops();
+			BUG();
+			return -EIO;
+		}
+
+		return 0;
+	}
+
+	// fall to cpu path
+	return 0;
+}
+
+#endif /* NVFS_RDMA_H */
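The RDMA variant mirrors the PCIe one with two differences: errors are reported as errnos rather than blk_status_t values, and a successful unmap also frees the chained sg table. The caller contract, condensed from the rdma.c hunk at the end of this patch (a restatement of that hunk, not new code):

/* Inside nvme_rdma_dma_map_req(), after sg_alloc_table_chained(): */
bool is_nvfs_io = false;

ret = nvme_rdma_nvfs_map_data(ibdev, rq, &is_nvfs_io, count);
if (is_nvfs_io)
	return ret;	/* GPU path claimed the request: *count holds the
			 * mapped entry count, or ret is -EIO on failure */
/* is_nvfs_io == false: fall through to blk_rq_map_sg()/ib_dma_map_sg() */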
b/drivers/nvme/host/nvfs.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef NVFS_H
+#define NVFS_H
+
+#include <linux/atomic.h>
+#include <linux/blk-mq.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/dma-direction.h>
+#include <linux/percpu-defs.h>
+#include <linux/scatterlist.h>
+
+#define REGSTR2(x) x##_register_nvfs_dma_ops
+#define REGSTR(x)  REGSTR2(x)
+
+#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops
+#define UNREGSTR(x)  UNREGSTR2(x)
+
+#define REGISTER_FUNC REGSTR(MODULE_PREFIX)
+#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX)
+
+#define NVFS_IO_ERR	-1
+#define NVFS_CPU_REQ	-2
+
+#define NVFS_HOLD_TIME_MS 1000
+
+extern struct nvfs_dma_rw_ops *nvfs_ops;
+
+extern atomic_t nvfs_shutdown;
+
+DECLARE_PER_CPU(long, nvfs_n_ops);
+
+static inline long nvfs_count_ops(void)
+{
+	int i;
+	long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += per_cpu(nvfs_n_ops, i);
+	return sum;
+}
+
+static inline bool nvfs_get_ops(void)
+{
+	if (nvfs_ops && !atomic_read(&nvfs_shutdown)) {
+		this_cpu_inc(nvfs_n_ops);
+		return true;
+	}
+	return false;
+}
+
+static inline void nvfs_put_ops(void)
+{
+	this_cpu_dec(nvfs_n_ops);
+}
+
+struct nvfs_dma_rw_ops {
+	unsigned long long ft_bmap; // feature bitmap
+
+	int (*nvfs_blk_rq_map_sg)(struct request_queue *q,
+			struct request *req,
+			struct scatterlist *sglist);
+
+	int (*nvfs_dma_map_sg_attrs)(struct device *device,
+			struct scatterlist *sglist,
+			int nents,
+			enum dma_data_direction dma_dir,
+			unsigned long attrs);
+
+	int (*nvfs_dma_unmap_sg)(struct device *device,
+			struct scatterlist *sglist,
+			int nents,
+			enum dma_data_direction dma_dir);
+
+	bool (*nvfs_is_gpu_page)(struct page *page);
+
+	unsigned int (*nvfs_gpu_index)(struct page *page);
+
+	unsigned int (*nvfs_device_priority)(struct device *dev,
+			unsigned int gpu_index);
+};
+
+// feature list for dma_ops, values indicate bit pos
+enum ft_bits {
+	nvfs_ft_prep_sglist	= 1ULL << 0,
+	nvfs_ft_map_sglist	= 1ULL << 1,
+	nvfs_ft_is_gpu_page	= 1ULL << 2,
+	nvfs_ft_device_priority	= 1ULL << 3,
+};
+
+// check features for use in registration with vendor drivers
+#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops)	((ops)->ft_bmap & nvfs_ft_prep_sglist)
+#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)	((ops)->ft_bmap & nvfs_ft_map_sglist)
+#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops)	((ops)->ft_bmap & nvfs_ft_is_gpu_page)
+#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops)	((ops)->ft_bmap & nvfs_ft_device_priority)
+
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops);
+
+void UNREGISTER_FUNC(void);
+
+#endif /* NVFS_H */
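Two details in this header deserve a note. First, nvfs_get_ops() and nvfs_put_ops() for one request may run on different CPUs, so an individual nvfs_n_ops slot can legitimately go negative; only the sum nvfs_count_ops() takes over all possible CPUs is meaningful, which is why it iterates every slot. Second, the REGSTR/UNREGSTR pairs use the standard two-level expansion idiom so that MODULE_PREFIX is substituted before ## pastes tokens. A compilable illustration of why the extra indirection is needed:

#define MODULE_PREFIX nvme_v1
#define REGSTR2(x) x##_register_nvfs_dma_ops
#define REGSTR(x)  REGSTR2(x)

int REGSTR(MODULE_PREFIX)(void);   /* declares nvme_v1_register_nvfs_dma_ops() */
int REGSTR2(MODULE_PREFIX)(void);  /* declares MODULE_PREFIX_register_nvfs_dma_ops():
				    * ## pastes before expanding its argument */

Because each translation unit sets a different MODULE_PREFIX (nvme_v1 in nvfs-dma.c, nvme_rdma_v1 in nvfs-rdma.c), the PCIe and RDMA paths export distinct registration symbols from the same header.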
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09a58bb0b658..622ee39a91d4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -31,6 +31,9 @@
 
 #include "trace.h"
 #include "nvme.h"
+#ifdef CONFIG_NVFS
+#include "nvfs.h"
+#endif
 
 #define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
 #define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
@@ -577,10 +580,17 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
 	}
 }
 
+#ifdef CONFIG_NVFS
+#include "nvfs-dma.h"
+#endif
 static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
+#ifdef CONFIG_NVFS
+	if (nvme_nvfs_unmap_data(dev, req))
+		return;
+#endif
 	if (is_pci_p2pdma_page(sg_page(iod->sg)))
 		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
 				    rq_dma_dir(req));
@@ -845,6 +855,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret = BLK_STS_RESOURCE;
 	int nr_mapped;
+#ifdef CONFIG_NVFS
+	bool is_nvfs_io = false;
+
+	ret = nvme_nvfs_map_data(dev, req, cmnd, &is_nvfs_io);
+	if (is_nvfs_io)
+		return ret;
+#endif
 
 	if (blk_rq_nr_phys_segments(req) == 1) {
 		struct bio_vec bv = req_bvec(req);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 240024dd5d85..7f39890bf001 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -28,6 +28,9 @@
 
 #include "nvme.h"
 #include "fabrics.h"
+#ifdef CONFIG_NVFS
+#include "nvfs.h"
+#endif
 
 #define NVME_RDMA_CONNECT_TIMEOUT_MS	3000		/* 3 second */
 
@@ -1289,6 +1292,9 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
 	return ib_post_send(queue->qp, &wr, NULL);
 }
 
+#ifdef CONFIG_NVFS
+#include "nvfs-rdma.h"
+#endif
 static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
@@ -1300,6 +1306,11 @@ static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq)
 			NVME_INLINE_METADATA_SG_CNT);
 	}
 
+#ifdef CONFIG_NVFS
+	if (nvme_rdma_nvfs_unmap_data(ibdev, rq))
+		return;
+#endif
+
 	ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
 			rq_dma_dir(rq));
 	sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
@@ -1548,6 +1559,15 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq,
 	if (ret)
 		return -ENOMEM;
 
+#ifdef CONFIG_NVFS
+	{
+		bool is_nvfs_io = false;
+		ret = nvme_rdma_nvfs_map_data(ibdev, rq, &is_nvfs_io, count);
+		if (is_nvfs_io)
+			return ret;
+	}
+#endif
+
 	req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
 					    req->data_sgl.sg_table.sgl);
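Read together, the pci.c hunks order the completion side so the nvfs hook runs before any regular DMA unmapping: an sg list mapped by the vendor callback is opaque to the DMA API, and the hook is also where the ops reference taken at map time is dropped. A condensed restatement of nvme_unmap_sg() after this patch; the final else branch is the upstream code the hunk falls through to, not visible in the diff above, so treat it as an assumption about the patch's base tree:

static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if (nvme_nvfs_unmap_data(dev, req))	/* GPU path: vendor unmaps and
						 * the ops reference is dropped */
		return;
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				    rq_dma_dir(req));
	else
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
}

One build-system quirk worth flagging for reviewers: CONFIG_NVFS is not a Kconfig symbol anywhere in this patch; the Makefile hunk force-defines it with -DCONFIG_NVFS for every object in the directory, so the #ifdef CONFIG_NVFS guards are unconditionally taken and the feature cannot be switched off through the normal kernel configuration.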