diff options
-rw-r--r-- | fs/cifs/Kconfig | 8 | ||||
-rw-r--r-- | fs/cifs/Makefile | 2 | ||||
-rw-r--r-- | fs/cifs/cifs_debug.c | 199 | ||||
-rw-r--r-- | fs/cifs/cifsacl.c | 2 | ||||
-rw-r--r-- | fs/cifs/cifsencrypt.c | 3 | ||||
-rw-r--r-- | fs/cifs/cifsfs.c | 8 | ||||
-rw-r--r-- | fs/cifs/cifsfs.h | 2 | ||||
-rw-r--r-- | fs/cifs/cifsglob.h | 35 | ||||
-rw-r--r-- | fs/cifs/cifsproto.h | 4 | ||||
-rw-r--r-- | fs/cifs/cifssmb.c | 22 | ||||
-rw-r--r-- | fs/cifs/connect.c | 251 | ||||
-rw-r--r-- | fs/cifs/file.c | 43 | ||||
-rw-r--r-- | fs/cifs/inode.c | 2 | ||||
-rw-r--r-- | fs/cifs/misc.c | 14 | ||||
-rw-r--r-- | fs/cifs/smb1ops.c | 4 | ||||
-rw-r--r-- | fs/cifs/smb2file.c | 2 | ||||
-rw-r--r-- | fs/cifs/smb2misc.c | 2 | ||||
-rw-r--r-- | fs/cifs/smb2ops.c | 77 | ||||
-rw-r--r-- | fs/cifs/smb2pdu.c | 578 | ||||
-rw-r--r-- | fs/cifs/smb2pdu.h | 60 | ||||
-rw-r--r-- | fs/cifs/smb2proto.h | 3 | ||||
-rw-r--r-- | fs/cifs/smbdirect.c | 2610 | ||||
-rw-r--r-- | fs/cifs/smbdirect.h | 338 | ||||
-rw-r--r-- | fs/cifs/transport.c | 69 |
24 files changed, 3896 insertions, 442 deletions
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index d5b2e12b5d02..c71971c01c63 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -196,6 +196,14 @@ config CIFS_SMB311 This dialect includes improved security negotiation features. If unsure, say N +config CIFS_SMB_DIRECT + bool "SMB Direct support (Experimental)" + depends on CIFS=m && INFINIBAND || CIFS=y && INFINIBAND=y + help + Enables SMB Direct experimental support for SMB 3.0, 3.02 and 3.1.1. + SMB Direct allows transferring SMB packets over RDMA. If unsure, + say N. + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 7134f182720b..7e4a1e2f0696 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -19,3 +19,5 @@ cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o + +cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index cbb9534b89b4..c7a863219fa3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -30,6 +30,9 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#ifdef CONFIG_CIFS_SMB_DIRECT +#include "smbdirect.h" +#endif void cifs_dump_mem(char *label, void *data, int length) @@ -107,6 +110,32 @@ void cifs_dump_mids(struct TCP_Server_Info *server) } #ifdef CONFIG_PROC_FS +static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) +{ + __u32 dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); + + seq_printf(m, "%s Mounts: %d ", tcon->treeName, tcon->tc_count); + if (tcon->nativeFileSystem) + seq_printf(m, "Type: %s ", tcon->nativeFileSystem); + seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x\n\tPathComponentMax: %d Status: %d", + le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), + le32_to_cpu(tcon->fsAttrInfo.Attributes), + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), + tcon->tidStatus); + if (dev_type == FILE_DEVICE_DISK) + seq_puts(m, " type: DISK "); + else if (dev_type == FILE_DEVICE_CD_ROM) + seq_puts(m, " type: CDROM "); + else + seq_printf(m, " type: %d ", dev_type); + if (tcon->ses->server->ops->dump_share_caps) + tcon->ses->server->ops->dump_share_caps(m, tcon); + + if (tcon->need_reconnect) + seq_puts(m, "\tDISCONNECTED "); + seq_putc(m, '\n'); +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -115,7 +144,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; int i, j; - __u32 dev_type; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -152,6 +180,72 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (!server->rdma) + goto skip_rdma; + + seq_printf(m, "\nSMBDirect (in hex) protocol version: %x " + "transport status: %x", + server->smbd_conn->protocol, + server->smbd_conn->transport_status); + seq_printf(m, "\nConn receive_credit_max: %x " + "send_credit_target: %x max_send_size: %x", + server->smbd_conn->receive_credit_max, + server->smbd_conn->send_credit_target, + server->smbd_conn->max_send_size); + seq_printf(m, "\nConn max_fragmented_recv_size: %x " + "max_fragmented_send_size: %x max_receive_size:%x", + server->smbd_conn->max_fragmented_recv_size, + server->smbd_conn->max_fragmented_send_size, + server->smbd_conn->max_receive_size); + seq_printf(m, "\nConn keep_alive_interval: %x " + "max_readwrite_size: %x rdma_readwrite_threshold: %x", + server->smbd_conn->keep_alive_interval, + server->smbd_conn->max_readwrite_size, + server->smbd_conn->rdma_readwrite_threshold); + seq_printf(m, "\nDebug count_get_receive_buffer: %x " + "count_put_receive_buffer: %x count_send_empty: %x", + server->smbd_conn->count_get_receive_buffer, + server->smbd_conn->count_put_receive_buffer, + server->smbd_conn->count_send_empty); + seq_printf(m, "\nRead Queue count_reassembly_queue: %x " + "count_enqueue_reassembly_queue: %x " + "count_dequeue_reassembly_queue: %x " + "fragment_reassembly_remaining: %x " + "reassembly_data_length: %x " + "reassembly_queue_length: %x", + server->smbd_conn->count_reassembly_queue, + server->smbd_conn->count_enqueue_reassembly_queue, + server->smbd_conn->count_dequeue_reassembly_queue, + server->smbd_conn->fragment_reassembly_remaining, + server->smbd_conn->reassembly_data_length, + server->smbd_conn->reassembly_queue_length); + seq_printf(m, "\nCurrent Credits send_credits: %x " + "receive_credits: %x receive_credit_target: %x", + atomic_read(&server->smbd_conn->send_credits), + atomic_read(&server->smbd_conn->receive_credits), + server->smbd_conn->receive_credit_target); + seq_printf(m, "\nPending send_pending: %x send_payload_pending:" + " %x smbd_send_pending: %x smbd_recv_pending: %x", + atomic_read(&server->smbd_conn->send_pending), + atomic_read(&server->smbd_conn->send_payload_pending), + server->smbd_conn->smbd_send_pending, + server->smbd_conn->smbd_recv_pending); + seq_printf(m, "\nReceive buffers count_receive_queue: %x " + "count_empty_packet_queue: %x", + server->smbd_conn->count_receive_queue, + server->smbd_conn->count_empty_packet_queue); + seq_printf(m, "\nMR responder_resources: %x " + "max_frmr_depth: %x mr_type: %x", + server->smbd_conn->responder_resources, + server->smbd_conn->max_frmr_depth, + server->smbd_conn->mr_type); + seq_printf(m, "\nMR mr_ready_count: %x mr_used_count: %x", + atomic_read(&server->smbd_conn->mr_ready_count), + atomic_read(&server->smbd_conn->mr_used_count)); +skip_rdma: +#endif seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { @@ -176,6 +270,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } + if (server->rdma) + seq_printf(m, "RDMA\n\t"); seq_printf(m, "TCP status: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", server->tcpStatus, server->srv_count, @@ -189,35 +285,19 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_puts(m, "\n\tShares:"); j = 0; + + seq_printf(m, "\n\t%d) IPC: ", j); + if (ses->tcon_ipc) + cifs_debug_tcon(m, ses->tcon_ipc); + else + seq_puts(m, "none\n"); + list_for_each(tmp3, &ses->tcon_list) { tcon = list_entry(tmp3, struct cifs_tcon, tcon_list); ++j; - dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); - seq_printf(m, "\n\t%d) %s Mounts: %d ", j, - tcon->treeName, tcon->tc_count); - if (tcon->nativeFileSystem) { - seq_printf(m, "Type: %s ", - tcon->nativeFileSystem); - } - seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" - "\n\tPathComponentMax: %d Status: %d", - le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), - le32_to_cpu(tcon->fsAttrInfo.Attributes), - le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), - tcon->tidStatus); - if (dev_type == FILE_DEVICE_DISK) - seq_puts(m, " type: DISK "); - else if (dev_type == FILE_DEVICE_CD_ROM) - seq_puts(m, " type: CDROM "); - else - seq_printf(m, " type: %d ", dev_type); - if (server->ops->dump_share_caps) - server->ops->dump_share_caps(m, tcon); - - if (tcon->need_reconnect) - seq_puts(m, "\tDISCONNECTED "); - seq_putc(m, '\n'); + seq_printf(m, "\n\t%d) ", j); + cifs_debug_tcon(m, tcon); } seq_puts(m, "\n\tMIDs:\n"); @@ -374,6 +454,45 @@ static const struct file_operations cifs_stats_proc_fops = { }; #endif /* STATS */ +#ifdef CONFIG_CIFS_SMB_DIRECT +#define PROC_FILE_DEFINE(name) \ +static ssize_t name##_write(struct file *file, const char __user *buffer, \ + size_t count, loff_t *ppos) \ +{ \ + int rc; \ + rc = kstrtoint_from_user(buffer, count, 10, & name); \ + if (rc) \ + return rc; \ + return count; \ +} \ +static int name##_proc_show(struct seq_file *m, void *v) \ +{ \ + seq_printf(m, "%d\n", name ); \ + return 0; \ +} \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, name##_proc_show, NULL); \ +} \ +\ +static const struct file_operations cifs_##name##_proc_fops = { \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + .write = name##_write, \ +} + +PROC_FILE_DEFINE(rdma_readwrite_threshold); +PROC_FILE_DEFINE(smbd_max_frmr_depth); +PROC_FILE_DEFINE(smbd_keep_alive_interval); +PROC_FILE_DEFINE(smbd_max_receive_size); +PROC_FILE_DEFINE(smbd_max_fragmented_recv_size); +PROC_FILE_DEFINE(smbd_max_send_size); +PROC_FILE_DEFINE(smbd_send_credit_target); +PROC_FILE_DEFINE(smbd_receive_credit_max); +#endif + static struct proc_dir_entry *proc_fs_cifs; static const struct file_operations cifsFYI_proc_fops; static const struct file_operations cifs_lookup_cache_proc_fops; @@ -401,6 +520,24 @@ cifs_proc_init(void) &cifs_security_flags_proc_fops); proc_create("LookupCacheEnabled", 0, proc_fs_cifs, &cifs_lookup_cache_proc_fops); +#ifdef CONFIG_CIFS_SMB_DIRECT + proc_create("rdma_readwrite_threshold", 0, proc_fs_cifs, + &cifs_rdma_readwrite_threshold_proc_fops); + proc_create("smbd_max_frmr_depth", 0, proc_fs_cifs, + &cifs_smbd_max_frmr_depth_proc_fops); + proc_create("smbd_keep_alive_interval", 0, proc_fs_cifs, + &cifs_smbd_keep_alive_interval_proc_fops); + proc_create("smbd_max_receive_size", 0, proc_fs_cifs, + &cifs_smbd_max_receive_size_proc_fops); + proc_create("smbd_max_fragmented_recv_size", 0, proc_fs_cifs, + &cifs_smbd_max_fragmented_recv_size_proc_fops); + proc_create("smbd_max_send_size", 0, proc_fs_cifs, + &cifs_smbd_max_send_size_proc_fops); + proc_create("smbd_send_credit_target", 0, proc_fs_cifs, + &cifs_smbd_send_credit_target_proc_fops); + proc_create("smbd_receive_credit_max", 0, proc_fs_cifs, + &cifs_smbd_receive_credit_max_proc_fops); +#endif } void @@ -418,6 +555,16 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); +#ifdef CONFIG_CIFS_SMB_DIRECT + remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs); + remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs); + remove_proc_entry("smbd_keep_alive_interval", proc_fs_cifs); + remove_proc_entry("smbd_max_receive_size", proc_fs_cifs); + remove_proc_entry("smbd_max_fragmented_recv_size", proc_fs_cifs); + remove_proc_entry("smbd_max_send_size", proc_fs_cifs); + remove_proc_entry("smbd_send_credit_target", proc_fs_cifs); + remove_proc_entry("smbd_receive_credit_max", proc_fs_cifs); +#endif remove_proc_entry("fs/cifs", NULL); } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index b98436f5c7c7..13a8a77322c9 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1125,7 +1125,7 @@ out: return rc; } -/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ +/* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 68abbb0db608..f2b0a7f124da 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -325,9 +325,8 @@ int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, { int i; int rc; - char password_with_pad[CIFS_ENCPWD_SIZE]; + char password_with_pad[CIFS_ENCPWD_SIZE] = {0}; - memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); if (password) strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 31b7565b1617..a7be591d8e18 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -327,6 +327,8 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) default: seq_puts(s, "(unknown)"); } + if (server->rdma) + seq_puts(s, ",rdma"); } static void @@ -1068,6 +1070,7 @@ const struct file_operations cifs_file_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1086,6 +1089,7 @@ const struct file_operations cifs_file_strict_ops = { .flush = cifs_flush, .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1105,6 +1109,7 @@ const struct file_operations cifs_file_direct_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, @@ -1122,6 +1127,7 @@ const struct file_operations cifs_file_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1139,6 +1145,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_strict_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1157,6 +1164,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .flush = cifs_flush, .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, .clone_file_range = cifs_clone_file_range, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 5a10e566f0e6..013ba2aed8d9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,5 +149,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.10" +#define CIFS_VERSION "2.11" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b16583594d1a..48f7c197cd2d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -64,8 +64,8 @@ #define RFC1001_NAME_LEN 15 #define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1) -/* currently length of NIP6_FMT */ -#define SERVER_NAME_LENGTH 40 +/* maximum length of ip addr as a string (including ipv6 and sctp) */ +#define SERVER_NAME_LENGTH 80 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) /* echo interval in seconds */ @@ -230,8 +230,14 @@ struct smb_version_operations { __u64 (*get_next_mid)(struct TCP_Server_Info *); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); - /* data length from read response message */ - unsigned int (*read_data_length)(char *); + /* + * Data length from read response message + * When in_remaining is true, the returned data length is in + * message field DataRemaining for out-of-band data read (e.g through + * Memory Registration RDMA write in SMBD). + * Otherwise, the returned data length is in message field DataLength. + */ + unsigned int (*read_data_length)(char *, bool in_remaining); /* map smb to linux error */ int (*map_error)(char *, bool); /* find mid corresponding to the response message */ @@ -532,6 +538,7 @@ struct smb_vol { bool nopersistent:1; bool resilient:1; /* noresilient not required since not fored for CA */ bool domainauto:1; + bool rdma:1; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -648,6 +655,10 @@ struct TCP_Server_Info { bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ bool large_buf; /* is current buffer large? */ + /* use SMBD connection instead of socket */ + bool rdma; + /* point to the SMBD connection if RDMA is used instead of socket */ + struct smbd_connection *smbd_conn; struct delayed_work echo; /* echo ping workqueue job */ char *smallbuf; /* pointer to current "small" buffer */ char *bigbuf; /* pointer to current "big" buffer */ @@ -822,12 +833,12 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) struct cifs_ses { struct list_head smb_ses_list; struct list_head tcon_list; + struct cifs_tcon *tcon_ipc; struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ - __u32 ipc_tid; /* special tid for connection to IPC share */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ @@ -835,8 +846,7 @@ struct cifs_ses { kuid_t linux_uid; /* overriding owner of files on the mount */ kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; - char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for - TCP names - will ipv6 and sctp addresses fit? */ + char serverName[SERVER_NAME_LEN_WITH_NULL]; char *user_name; /* must not be null except during init of sess and after mount option parsing we fill it */ char *domainName; @@ -931,7 +941,9 @@ struct cifs_tcon { FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; - bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */ + bool ipc:1; /* set if connection to IPC$ share (always also pipe) */ + bool pipe:1; /* set if connection to pipe share */ + bool print:1; /* set if connection to printer share */ bool retry:1; bool nocase:1; bool seal:1; /* transport encryption for this mounted share */ @@ -944,7 +956,6 @@ struct cifs_tcon { bool need_reopen_files:1; /* need to reopen tcon file handles */ bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ - bool print:1; /* set if connection to printer share */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1147,6 +1158,9 @@ struct cifs_readdata { struct cifs_readdata *rdata, struct iov_iter *iter); struct kvec iov[2]; +#ifdef CONFIG_CIFS_SMB_DIRECT + struct smbd_mr *mr; +#endif unsigned int pagesz; unsigned int tailsz; unsigned int credits; @@ -1169,6 +1183,9 @@ struct cifs_writedata { pid_t pid; unsigned int bytes; int result; +#ifdef CONFIG_CIFS_SMB_DIRECT + struct smbd_mr *mr; +#endif unsigned int pagesz; unsigned int tailsz; unsigned int credits; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4143c9dec463..93d565186698 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -106,6 +106,10 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, struct kvec * /* resp vec */); +extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses, + struct kvec *pkvec, int nvec_to_send, + int *pbuftype, const int flags, + struct kvec *presp); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 35dc5bf01ee2..4e0922d24eb2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -43,6 +43,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "fscache.h" +#include "smbdirect.h" #ifdef CONFIG_CIFS_POSIX static struct { @@ -1454,6 +1455,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; char *buf = server->smallbuf; unsigned int buflen = get_rfc1002_length(buf) + 4; + bool use_rdma_mr = false; cifs_dbg(FYI, "%s: mid=%llu offset=%llu bytes=%u\n", __func__, mid->mid, rdata->offset, rdata->bytes); @@ -1542,8 +1544,11 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) rdata->iov[0].iov_base, server->total_read); /* how much data is in the response? */ - data_len = server->ops->read_data_length(buf); - if (data_offset + data_len > buflen) { +#ifdef CONFIG_CIFS_SMB_DIRECT + use_rdma_mr = rdata->mr; +#endif + data_len = server->ops->read_data_length(buf, use_rdma_mr); + if (!use_rdma_mr && (data_offset + data_len > buflen)) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; return cifs_readv_discard(server, mid); @@ -1923,6 +1928,12 @@ cifs_writedata_release(struct kref *refcount) { struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif if (wdata->cfile) cifsFileInfo_put(wdata->cfile); @@ -4822,10 +4833,11 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, *target_nodes = NULL; cifs_dbg(FYI, "In GetDFSRefer the path %s\n", search_name); - if (ses == NULL) + if (ses == NULL || ses->tcon_ipc == NULL) return -ENODEV; + getDFSRetry: - rc = smb_init(SMB_COM_TRANSACTION2, 15, NULL, (void **) &pSMB, + rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; @@ -4833,7 +4845,7 @@ getDFSRetry: /* server pointer checked in called function, but should never be null here anyway */ pSMB->hdr.Mid = get_next_mid(ses->server); - pSMB->hdr.Tid = ses->ipc_tid; + pSMB->hdr.Tid = ses->tcon_ipc->tid; pSMB->hdr.Uid = ses->Suid; if (ses->capabilities & CAP_STATUS32) pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0bfc2280436d..a726f524fb84 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -44,7 +44,6 @@ #include <net/ipv6.h> #include <linux/parser.h> #include <linux/bvec.h> - #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -56,6 +55,7 @@ #include "rfc1002pdu.h" #include "fscache.h" #include "smb2proto.h" +#include "smbdirect.h" #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -92,7 +92,7 @@ enum { Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, - Opt_domainauto, + Opt_domainauto, Opt_rdma, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -183,6 +183,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_resilient, "resilienthandles"}, { Opt_noresilient, "noresilienthandles"}, { Opt_domainauto, "domainauto"}, + { Opt_rdma, "rdma"}, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -353,11 +354,12 @@ cifs_reconnect(struct TCP_Server_Info *server) list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); ses->need_reconnect = true; - ses->ipc_tid = 0; list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); tcon->need_reconnect = true; } + if (ses->tcon_ipc) + ses->tcon_ipc->need_reconnect = true; } spin_unlock(&cifs_tcp_ses_lock); @@ -405,7 +407,10 @@ cifs_reconnect(struct TCP_Server_Info *server) /* we should try only the port we connected to before */ mutex_lock(&server->srv_mutex); - rc = generic_ip_connect(server); + if (cifs_rdma_enabled(server)) + rc = smbd_reconnect(server); + else + rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); mutex_unlock(&server->srv_mutex); @@ -538,8 +543,10 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (server_unresponsive(server)) return -ECONNABORTED; - - length = sock_recvmsg(server->ssocket, smb_msg, 0); + if (cifs_rdma_enabled(server) && server->smbd_conn) + length = smbd_recv(server->smbd_conn, smb_msg); + else + length = sock_recvmsg(server->ssocket, smb_msg, 0); if (server->tcpStatus == CifsExiting) return -ESHUTDOWN; @@ -700,7 +707,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) wake_up_all(&server->request_q); /* give those requests time to exit */ msleep(125); - + if (cifs_rdma_enabled(server) && server->smbd_conn) { + smbd_destroy(server->smbd_conn); + server->smbd_conn = NULL; + } if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; @@ -1550,6 +1560,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_domainauto: vol->domainauto = true; break; + case Opt_rdma: + vol->rdma = true; + break; /* Numeric Values */ case Opt_backupuid: @@ -1707,7 +1720,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, tmp_end++; if (!(tmp_end < end && tmp_end[1] == delim)) { /* No it is not. Set the password to NULL */ - kfree(vol->password); + kzfree(vol->password); vol->password = NULL; break; } @@ -1745,7 +1758,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = end; } - kfree(vol->password); + kzfree(vol->password); /* Now build new password string */ temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); @@ -1951,6 +1964,19 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } + if (vol->rdma && vol->vals->protocol_id < SMB30_PROT_ID) { + cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); + goto cifs_parse_mount_err; + } + +#ifdef CONFIG_CIFS_SMB_DIRECT + if (vol->rdma && vol->sign) { + cifs_dbg(VFS, "Currently SMB direct doesn't support signing." + " This is being fixed\n"); + goto cifs_parse_mount_err; + } +#endif + #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (vol->multiuser) { @@ -2162,6 +2188,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (server->echo_interval != vol->echo_interval * HZ) return 0; + if (server->rdma != vol->rdma) + return 0; + return 1; } @@ -2260,6 +2289,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->noblocksnd = volume_info->noblocksnd; tcp_ses->noautotune = volume_info->noautotune; tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay; + tcp_ses->rdma = volume_info->rdma; tcp_ses->in_flight = 0; tcp_ses->credits = 1; init_waitqueue_head(&tcp_ses->response_q); @@ -2297,13 +2327,29 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->echo_interval = volume_info->echo_interval * HZ; else tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; - + if (tcp_ses->rdma) { +#ifndef CONFIG_CIFS_SMB_DIRECT + cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); + rc = -ENOENT; + goto out_err_crypto_release; +#endif + tcp_ses->smbd_conn = smbd_get_connection( + tcp_ses, (struct sockaddr *)&volume_info->dstaddr); + if (tcp_ses->smbd_conn) { + cifs_dbg(VFS, "RDMA transport established\n"); + rc = 0; + goto smbd_connected; + } else { + rc = -ENOENT; + goto out_err_crypto_release; + } + } rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); goto out_err_crypto_release; } - +smbd_connected: /* * since we're in a cifs function already, we know that * this will succeed. No need for try_module_get(). @@ -2381,6 +2427,93 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) return 1; } +/** + * cifs_setup_ipc - helper to setup the IPC tcon for the session + * + * A new IPC connection is made and stored in the session + * tcon_ipc. The IPC tcon has the same lifetime as the session. + */ +static int +cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info) +{ + int rc = 0, xid; + struct cifs_tcon *tcon; + struct nls_table *nls_codepage; + char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; + bool seal = false; + + /* + * If the mount request that resulted in the creation of the + * session requires encryption, force IPC to be encrypted too. + */ + if (volume_info->seal) { + if (ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) + seal = true; + else { + cifs_dbg(VFS, + "IPC: server doesn't support encryption\n"); + return -EOPNOTSUPP; + } + } + + tcon = tconInfoAlloc(); + if (tcon == NULL) + return -ENOMEM; + + snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->serverName); + + /* cannot fail */ + nls_codepage = load_nls_default(); + + xid = get_xid(); + tcon->ses = ses; + tcon->ipc = true; + tcon->seal = seal; + rc = ses->server->ops->tree_connect(xid, ses, unc, tcon, nls_codepage); + free_xid(xid); + + if (rc) { + cifs_dbg(VFS, "failed to connect to IPC (rc=%d)\n", rc); + tconInfoFree(tcon); + goto out; + } + + cifs_dbg(FYI, "IPC tcon rc = %d ipc tid = %d\n", rc, tcon->tid); + + ses->tcon_ipc = tcon; +out: + unload_nls(nls_codepage); + return rc; +} + +/** + * cifs_free_ipc - helper to release the session IPC tcon + * + * Needs to be called everytime a session is destroyed + */ +static int +cifs_free_ipc(struct cifs_ses *ses) +{ + int rc = 0, xid; + struct cifs_tcon *tcon = ses->tcon_ipc; + + if (tcon == NULL) + return 0; + + if (ses->server->ops->tree_disconnect) { + xid = get_xid(); + rc = ses->server->ops->tree_disconnect(xid, tcon); + free_xid(xid); + } + + if (rc) + cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc); + + tconInfoFree(tcon); + ses->tcon_ipc = NULL; + return rc; +} + static struct cifs_ses * cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) { @@ -2421,6 +2554,8 @@ cifs_put_smb_ses(struct cifs_ses *ses) ses->status = CifsExiting; spin_unlock(&cifs_tcp_ses_lock); + cifs_free_ipc(ses); + if (ses->status == CifsExiting && server->ops->logoff) { xid = get_xid(); rc = server->ops->logoff(xid, ses); @@ -2569,6 +2704,13 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), } #endif /* CONFIG_KEYS */ +/** + * cifs_get_smb_ses - get a session matching @volume_info data from @server + * + * This function assumes it is being called from cifs_mount() where we + * already got a server reference (server refcount +1). See + * cifs_get_tcon() for refcount explanations. + */ static struct cifs_ses * cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) { @@ -2665,6 +2807,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) spin_unlock(&cifs_tcp_ses_lock); free_xid(xid); + + cifs_setup_ipc(ses, volume_info); + return ses; get_ses_fail: @@ -2709,8 +2854,16 @@ void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; - struct cifs_ses *ses = tcon->ses; + struct cifs_ses *ses; + + /* + * IPC tcon share the lifetime of their session and are + * destroyed in the session put function + */ + if (tcon == NULL || tcon->ipc) + return; + ses = tcon->ses; cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count); spin_lock(&cifs_tcp_ses_lock); if (--tcon->tc_count > 0) { @@ -2731,6 +2884,26 @@ cifs_put_tcon(struct cifs_tcon *tcon) cifs_put_smb_ses(ses); } +/** + * cifs_get_tcon - get a tcon matching @volume_info data from @ses + * + * - tcon refcount is the number of mount points using the tcon. + * - ses refcount is the number of tcon using the session. + * + * 1. This function assumes it is being called from cifs_mount() where + * we already got a session reference (ses refcount +1). + * + * 2. Since we're in the context of adding a mount point, the end + * result should be either: + * + * a) a new tcon already allocated with refcount=1 (1 mount point) and + * its session refcount incremented (1 new tcon). This +1 was + * already done in (1). + * + * b) an existing tcon with refcount+1 (add a mount point to it) and + * identical ses refcount (no new tcon). Because of (1) we need to + * decrement the ses refcount. + */ static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) { @@ -2739,8 +2912,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) tcon = cifs_find_tcon(ses, volume_info); if (tcon) { + /* + * tcon has refcount already incremented but we need to + * decrement extra ses reference gotten by caller (case b) + */ cifs_dbg(FYI, "Found match on UNC path\n"); - /* existing tcon already has a reference */ cifs_put_smb_ses(ses); return tcon; } @@ -2986,39 +3162,17 @@ get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, const struct nls_table *nls_codepage, unsigned int *num_referrals, struct dfs_info3_param **referrals, int remap) { - char *temp_unc; int rc = 0; - if (!ses->server->ops->tree_connect || !ses->server->ops->get_dfs_refer) + if (!ses->server->ops->get_dfs_refer) return -ENOSYS; *num_referrals = 0; *referrals = NULL; - if (ses->ipc_tid == 0) { - temp_unc = kmalloc(2 /* for slashes */ + - strnlen(ses->serverName, SERVER_NAME_LEN_WITH_NULL * 2) - + 1 + 4 /* slash IPC$ */ + 2, GFP_KERNEL); - if (temp_unc == NULL) - return -ENOMEM; - temp_unc[0] = '\\'; - temp_unc[1] = '\\'; - strcpy(temp_unc + 2, ses->serverName); - strcpy(temp_unc + 2 + strlen(ses->serverName), "\\IPC$"); - rc = ses->server->ops->tree_connect(xid, ses, temp_unc, NULL, - nls_codepage); - cifs_dbg(FYI, "Tcon rc = %d ipc_tid = %d\n", rc, ses->ipc_tid); - kfree(temp_unc); - } - if (rc == 0) - rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, - referrals, num_referrals, - nls_codepage, remap); - /* - * BB - map targetUNCs to dfs_info3 structures, here or in - * ses->server->ops->get_dfs_refer. - */ - + rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, + referrals, num_referrals, + nls_codepage, remap); return rc; } @@ -3783,7 +3937,7 @@ try_mount_again: tcon->unix_ext = 0; /* server does not support them */ /* do not care if a following call succeed - informational */ - if (!tcon->ipc && server->ops->qfs_tcon) + if (!tcon->pipe && server->ops->qfs_tcon) server->ops->qfs_tcon(xid, tcon); cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); @@ -3913,8 +4067,7 @@ out: } /* - * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon - * pointer may be NULL. + * Issue a TREE_CONNECT request. */ int CIFSTCon(const unsigned int xid, struct cifs_ses *ses, @@ -3950,7 +4103,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { + if (tcon->pipe || (ses->server->sec_mode & SECMODE_USER)) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ @@ -4022,7 +4175,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, 0); /* above now done in SendReceive */ - if ((rc == 0) && (tcon != NULL)) { + if (rc == 0) { bool is_unicode; tcon->tidStatus = CifsGood; @@ -4042,7 +4195,8 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') && (bcc_ptr[2] == 'C')) { cifs_dbg(FYI, "IPC connection\n"); - tcon->ipc = 1; + tcon->ipc = true; + tcon->pipe = true; } } else if (length == 2) { if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) { @@ -4069,9 +4223,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, else tcon->Flags = 0; cifs_dbg(FYI, "Tcon flags: 0x%x\n", tcon->Flags); - } else if ((rc == 0) && tcon == NULL) { - /* all we need to save for IPC$ connection */ - ses->ipc_tid = smb_buffer_response->Tid; } cifs_buf_release(smb_buffer); @@ -4235,7 +4386,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) reset_cifs_unix_caps(0, tcon, NULL, vol_info); out: kfree(vol_info->username); - kfree(vol_info->password); + kzfree(vol_info->password); kfree(vol_info); return tcon; @@ -4387,7 +4538,7 @@ cifs_prune_tlinks(struct work_struct *work) struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); struct rb_root *root = &cifs_sb->tlink_tree; - struct rb_node *node = rb_first(root); + struct rb_node *node; struct rb_node *tmp; struct tcon_link *tlink; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index df9f682708c6..7cee97b93a61 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -42,7 +42,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "fscache.h" - +#include "smbdirect.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -2902,7 +2902,12 @@ cifs_readdata_release(struct kref *refcount) { struct cifs_readdata *rdata = container_of(refcount, struct cifs_readdata, refcount); - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (rdata->mr) { + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif if (rdata->cfile) cifsFileInfo_put(rdata->cfile); @@ -3031,6 +3036,10 @@ uncached_fill_pages(struct TCP_Server_Info *server, } if (iter) result = copy_page_from_iter(page, 0, n, iter); +#ifdef CONFIG_CIFS_SMB_DIRECT + else if (rdata->mr) + result = n; +#endif else result = cifs_read_page_from_socket(server, page, n); if (result < 0) @@ -3471,20 +3480,18 @@ static const struct vm_operations_struct cifs_file_vm_ops = { int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) { - int rc, xid; + int xid, rc = 0; struct inode *inode = file_inode(file); xid = get_xid(); - if (!CIFS_CACHE_READ(CIFS_I(inode))) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) rc = cifs_zap_mapping(inode); - if (rc) - return rc; - } - - rc = generic_file_mmap(file, vma); - if (rc == 0) + if (!rc) + rc = generic_file_mmap(file, vma); + if (!rc) vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); return rc; } @@ -3494,16 +3501,16 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) int rc, xid; xid = get_xid(); + rc = cifs_revalidate_file(file); - if (rc) { + if (rc) cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n", rc); - free_xid(xid); - return rc; - } - rc = generic_file_mmap(file, vma); - if (rc == 0) + if (!rc) + rc = generic_file_mmap(file, vma); + if (!rc) vma->vm_ops = &cifs_file_vm_ops; + free_xid(xid); return rc; } @@ -3600,6 +3607,10 @@ readpages_fill_pages(struct TCP_Server_Info *server, if (iter) result = copy_page_from_iter(page, 0, n, iter); +#ifdef CONFIG_CIFS_SMB_DIRECT + else if (rdata->mr) + result = n; +#endif else result = cifs_read_page_from_socket(server, page, n); if (result < 0) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ecb99079363a..8f9a8cc7cc62 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1049,7 +1049,7 @@ iget_no_retry: tcon->resource_id = CIFS_I(inode)->uniqueid; #endif - if (rc && tcon->ipc) { + if (rc && tcon->pipe) { cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); inode->i_mode |= S_IFDIR; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index eea93ac15ef0..a0dbced4a45c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -98,14 +98,11 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->serverOS); kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); - if (buf_to_free->password) { - memset(buf_to_free->password, 0, strlen(buf_to_free->password)); - kfree(buf_to_free->password); - } + kzfree(buf_to_free->password); kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); - kfree(buf_to_free->auth_key.response); - kfree(buf_to_free); + kzfree(buf_to_free->auth_key.response); + kzfree(buf_to_free); } struct cifs_tcon * @@ -136,10 +133,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) } atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); - if (buf_to_free->password) { - memset(buf_to_free->password, 0, strlen(buf_to_free->password)); - kfree(buf_to_free->password); - } + kzfree(buf_to_free->password); kfree(buf_to_free); } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a723df3e0197..3d495e440c87 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -87,9 +87,11 @@ cifs_read_data_offset(char *buf) } static unsigned int -cifs_read_data_length(char *buf) +cifs_read_data_length(char *buf, bool in_remaining) { READ_RSP *rsp = (READ_RSP *)buf; + /* It's a bug reading remaining data for SMB1 packets */ + WARN_ON(in_remaining); return (le16_to_cpu(rsp->DataLengthHigh) << 16) + le16_to_cpu(rsp->DataLength); } diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index b4b1f0305f29..12af5dba742b 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -74,7 +74,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, nr_ioctl_req.Reserved = 0; rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid, fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&nr_ioctl_req, sizeof(nr_ioctl_req), NULL, NULL /* no return info */); if (rc == -EOPNOTSUPP) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b08a1446a7f..76d03abaa38c 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -578,7 +578,7 @@ smb2_is_valid_lease_break(char *buffer) bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { - struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; + struct smb2_oplock_break_rsp *rsp = (struct smb2_oplock_break_rsp *)buffer; struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ed88ab8a4774..eb68e2fcc500 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -32,6 +32,7 @@ #include "smb2status.h" #include "smb2glob.h" #include "cifs_ioctl.h" +#include "smbdirect.h" static int change_conf(struct TCP_Server_Info *server) @@ -250,7 +251,11 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_readwrite_size); +#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); @@ -266,6 +271,11 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_readwrite_size); +#endif if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); @@ -283,7 +293,6 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, - false /* use_ipc */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); if (rc != 0) @@ -782,7 +791,6 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */, - false /* use_ipc */, NULL, 0 /* no input */, (char **)&res_key, &ret_data_len); @@ -848,8 +856,7 @@ smb2_copychunk_range(const unsigned int xid, /* Request server copy to target from src identified by key */ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - true /* is_fsctl */, false /* use_ipc */, - (char *)pcchunk, + true /* is_fsctl */, (char *)pcchunk, sizeof(struct copychunk_ioctl), (char **)&retbuf, &ret_data_len); if (rc == 0) { @@ -947,9 +954,13 @@ smb2_read_data_offset(char *buf) } static unsigned int -smb2_read_data_length(char *buf) +smb2_read_data_length(char *buf, bool in_remaining) { struct smb2_read_rsp *rsp = (struct smb2_read_rsp *)buf; + + if (in_remaining) + return le32_to_cpu(rsp->DataRemaining); + return le32_to_cpu(rsp->DataLength); } @@ -1006,7 +1017,7 @@ static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_SPARSE, - true /* is_fctl */, false /* use_ipc */, + true /* is_fctl */, &setsparse, 1, NULL, NULL); if (rc) { tcon->broken_sparse_sup = true; @@ -1077,7 +1088,7 @@ smb2_duplicate_extents(const unsigned int xid, rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), NULL, @@ -1112,7 +1123,7 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_INTEGRITY_INFORMATION, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), NULL, @@ -1132,7 +1143,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SRV_ENUMERATE_SNAPSHOTS, - true /* is_fsctl */, false /* use_ipc */, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); @@ -1351,16 +1362,20 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "smb2_get_dfs_refer path <%s>\n", search_name); /* - * Use any tcon from the current session. Here, the first one. + * Try to use the IPC tcon, otherwise just use any */ - spin_lock(&cifs_tcp_ses_lock); - tcon = list_first_entry_or_null(&ses->tcon_list, struct cifs_tcon, - tcon_list); - if (tcon) - tcon->tc_count++; - spin_unlock(&cifs_tcp_ses_lock); + tcon = ses->tcon_ipc; + if (tcon == NULL) { + spin_lock(&cifs_tcp_ses_lock); + tcon = list_first_entry_or_null(&ses->tcon_list, + struct cifs_tcon, + tcon_list); + if (tcon) + tcon->tc_count++; + spin_unlock(&cifs_tcp_ses_lock); + } - if (!tcon) { + if (tcon == NULL) { cifs_dbg(VFS, "session %p has no tcon available for a dfs referral request\n", ses); rc = -ENOTCONN; @@ -1389,20 +1404,11 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, memcpy(dfs_req->RequestFileName, utf16_path, utf16_path_len); do { - /* try first with IPC */ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_DFS_GET_REFERRALS, - true /* is_fsctl */, true /* use_ipc */, + true /* is_fsctl */, (char *)dfs_req, dfs_req_size, (char **)&dfs_rsp, &dfs_rsp_size); - if (rc == -ENOTCONN) { - /* try with normal tcon */ - rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, - FSCTL_DFS_GET_REFERRALS, - true /* is_fsctl */, false /*use_ipc*/, - (char *)dfs_req, dfs_req_size, - (char **)&dfs_rsp, &dfs_rsp_size); - } } while (rc == -EAGAIN); if (rc) { @@ -1421,7 +1427,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } out: - if (tcon) { + if (tcon && !tcon->ipc) { + /* ipc tcons are not refcounted */ spin_lock(&cifs_tcp_ses_lock); tcon->tc_count--; spin_unlock(&cifs_tcp_ses_lock); @@ -1713,8 +1720,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, false /* use_ipc */, - (char *)&fsctl_buf, + true /* is_fctl */, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -1748,8 +1754,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, false /* use_ipc */, - (char *)&fsctl_buf, + true /* is_fctl */, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), NULL, NULL); free_xid(xid); return rc; @@ -2411,6 +2416,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, struct iov_iter iter; struct kvec iov; int length; + bool use_rdma_mr = false; if (shdr->Command != SMB2_READ) { cifs_dbg(VFS, "only big read responses are supported\n"); @@ -2437,7 +2443,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } data_offset = server->ops->read_data_offset(buf) + 4; - data_len = server->ops->read_data_length(buf); +#ifdef CONFIG_CIFS_SMB_DIRECT + use_rdma_mr = rdata->mr; +#endif + data_len = server->ops->read_data_length(buf, use_rdma_mr); if (data_offset < server->vals->read_rsp_size) { /* diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 01346b8b6edb..63778ac22fd9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -48,6 +48,7 @@ #include "smb2glob.h" #include "cifspdu.h" #include "cifs_spnego.h" +#include "smbdirect.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -319,54 +320,16 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, *total_len = parmsize + sizeof(struct smb2_sync_hdr); } -/* init request without RFC1001 length at the beginning */ -static int -smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf, unsigned int *total_len) -{ - int rc; - struct smb2_sync_hdr *shdr; - - rc = smb2_reconnect(smb2_command, tcon); - if (rc) - return rc; - - /* BB eventually switch this to SMB2 specific small buf size */ - *request_buf = cifs_small_buf_get(); - if (*request_buf == NULL) { - /* BB should we add a retry in here if not a writepage? */ - return -ENOMEM; - } - - shdr = (struct smb2_sync_hdr *)(*request_buf); - - fill_small_buf(smb2_command, tcon, shdr, total_len); - - if (tcon != NULL) { -#ifdef CONFIG_CIFS_STATS2 - uint16_t com_code = le16_to_cpu(smb2_command); - - cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]); -#endif - cifs_stats_inc(&tcon->num_smbs_sent); - } - - return rc; -} - /* * Allocate and return pointer to an SMB request hdr, and set basic * SMB information in the SMB header. If the return code is zero, this - * function must have filled in request_buf pointer. The returned buffer - * has RFC1001 length at the beginning. + * function must have filled in request_buf pointer. */ static int -small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, - void **request_buf) +smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, + void **request_buf, unsigned int *total_len) { int rc; - unsigned int total_len; - struct smb2_pdu *pdu; rc = smb2_reconnect(smb2_command, tcon); if (rc) @@ -379,12 +342,9 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return -ENOMEM; } - pdu = (struct smb2_pdu *)(*request_buf); - - fill_small_buf(smb2_command, tcon, get_sync_hdr(pdu), &total_len); - - /* Note this is only network field converted to big endian */ - pdu->hdr.smb2_buf_length = cpu_to_be32(total_len); + fill_small_buf(smb2_command, tcon, + (struct smb2_sync_hdr *)(*request_buf), + total_len); if (tcon != NULL) { #ifdef CONFIG_CIFS_STATS2 @@ -398,8 +358,8 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, } #ifdef CONFIG_CIFS_SMB311 -/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ -#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ +/* offset is sizeof smb2_negotiate_req but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) */ #define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) @@ -427,23 +387,25 @@ build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) } static void -assemble_neg_contexts(struct smb2_negotiate_req *req) +assemble_neg_contexts(struct smb2_negotiate_req *req, + unsigned int *total_len) { - - /* +4 is to account for the RFC1001 len field */ - char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT; build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) - + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ + + *total_len += 4 + sizeof(struct smb2_preauth_neg_context) + + sizeof(struct smb2_encryption_neg_context); } #else -static void assemble_neg_contexts(struct smb2_negotiate_req *req) +static void assemble_neg_contexts(struct smb2_negotiate_req *req, + unsigned int *total_len) { return; } @@ -477,6 +439,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) int blob_offset, blob_length; char *security_blob; int flags = CIFS_NEG_OP; + unsigned int total_len; cifs_dbg(FYI, "Negotiate protocol\n"); @@ -485,30 +448,30 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) return -EIO; } - rc = small_smb2_init(SMB2_NEGOTIATE, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_NEGOTIATE, NULL, (void **) &req, &total_len); if (rc) return rc; - req->hdr.sync_hdr.SessionId = 0; + req->sync_hdr.SessionId = 0; if (strcmp(ses->server->vals->version_string, SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); req->DialectCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4); + total_len += 4; } else if (strcmp(ses->server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); req->DialectCount = cpu_to_le16(3); - inc_rfc1001_len(req, 6); + total_len += 6; } else { /* otherwise send specific dialect */ req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id); req->DialectCount = cpu_to_le16(1); - inc_rfc1001_len(req, 2); + total_len += 2; } /* only one of SMB2 signing flags may be set in SMB2 request */ @@ -528,13 +491,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); if (ses->server->vals->protocol_id == SMB311_PROT_ID) - assemble_neg_contexts(req); + assemble_neg_contexts(req, &total_len); } iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -654,6 +616,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "validate negotiate\n"); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (tcon->ses->server->rdma) + return 0; +#endif + /* * validation ioctl must be signed, so no point sending this if we * can not sign it (ie are not known user). Even if signing is not @@ -713,7 +680,6 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, - false /* use_ipc */, (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req), (char **)&pneg_rsp, &rsplen); @@ -733,8 +699,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } /* check validate negotiate info response matches what we got earlier */ - if (pneg_rsp->Dialect != - cpu_to_le16(tcon->ses->server->vals->protocol_id)) + if (pneg_rsp->Dialect != cpu_to_le16(tcon->ses->server->dialect)) goto vneg_out; if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode)) @@ -806,20 +771,22 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; struct TCP_Server_Info *server = ses->server; + unsigned int total_len; - rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_SESSION_SETUP, NULL, (void **) &req, + &total_len); if (rc) return rc; /* First session, not a reauthenticate */ - req->hdr.sync_hdr.SessionId = 0; + req->sync_hdr.SessionId = 0; /* if reconnect, we need to send previous sess id, otherwise it is 0 */ req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(3); + req->sync_hdr.CreditRequest = cpu_to_le16(3); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -833,8 +800,8 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) req->Channel = 0; /* MBZ */ sess_data->iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for pad */ - sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for pad */ + sess_data->iov[0].iov_len = total_len - 1; /* * This variable will be used to clear the buffer * allocated above in case of any error in the calling function. @@ -860,18 +827,15 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); - /* BB add code to build os and lm fields */ - rc = SendReceive2(sess_data->xid, sess_data->ses, - sess_data->iov, 2, - &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + rc = smb2_send_recv(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); @@ -1092,7 +1056,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->hdr.sync_hdr.SessionId = ses->Suid; + req->sync_hdr.SessionId = ses->Suid; rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -1202,6 +1166,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) int rc = 0; struct TCP_Server_Info *server; int flags = 0; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "disconnect session %p\n", ses); @@ -1214,19 +1182,24 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) if (ses->need_reconnect) goto smb2_session_already_dead; - rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); + rc = smb2_plain_req_init(SMB2_LOGOFF, NULL, (void **) &req, &total_len); if (rc) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.sync_hdr.SessionId = ses->Suid; + req->sync_hdr.SessionId = ses->Suid; if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) flags |= CIFS_TRANSFORM_REQ; else if (server->sign) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; - rc = SendReceiveNoRsp(xid, ses, (char *) req, flags); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); /* * No tcon so can't do @@ -1265,6 +1238,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, int unc_path_len; __le16 *unc_path = NULL; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "TCON\n"); @@ -1283,40 +1257,30 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, } /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ - if (tcon) - tcon->tid = 0; + tcon->tid = 0; - rc = small_smb2_init(SMB2_TREE_CONNECT, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req, + &total_len); if (rc) { kfree(unc_path); return rc; } - if (tcon == NULL) { - if ((ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)) - flags |= CIFS_TRANSFORM_REQ; - - /* since no tcon, smb2_init can not do this, so do here */ - req->hdr.sync_hdr.SessionId = ses->Suid; - if (ses->server->sign) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - } else if (encryption_required(tcon)) + if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for pad */ + iov[0].iov_len = total_len - 1; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->PathOffset = cpu_to_le16(sizeof(struct smb2_tree_connect_req) - - 1 /* pad */ - 4 /* do not count rfc1001 len field */); + - 1 /* pad */); req->PathLength = cpu_to_le16(unc_path_len - 2); iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - inc_rfc1001_len(req, unc_path_len - 1 /* pad */); - - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1328,21 +1292,16 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_error_exit; } - if (tcon == NULL) { - ses->ipc_tid = rsp->hdr.sync_hdr.TreeId; - goto tcon_exit; - } - switch (rsp->ShareType) { case SMB2_SHARE_TYPE_DISK: cifs_dbg(FYI, "connection to disk share\n"); break; case SMB2_SHARE_TYPE_PIPE: - tcon->ipc = true; + tcon->pipe = true; cifs_dbg(FYI, "connection to pipe share\n"); break; case SMB2_SHARE_TYPE_PRINT: - tcon->ipc = true; + tcon->print = true; cifs_dbg(FYI, "connection to printer\n"); break; default: @@ -1389,6 +1348,10 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) int rc = 0; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "Tree Disconnect\n"); @@ -1398,14 +1361,20 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - rc = small_smb2_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, + &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceiveNoRsp(xid, ses, (char *)req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1505,11 +1474,10 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; if (!req->CreateContextsOffset) req->CreateContextsOffset = cpu_to_le32( - sizeof(struct smb2_create_req) - 4 + + sizeof(struct smb2_create_req) + iov[num - 1].iov_len); le32_add_cpu(&req->CreateContextsLength, server->vals->create_lease_size); - inc_rfc1001_len(&req->hdr, server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -1589,10 +1557,9 @@ add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable_v2); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2)); *num_iovec = num + 1; return 0; } @@ -1613,12 +1580,10 @@ add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_handle_reconnect_v2)); - inc_rfc1001_len(&req->hdr, - sizeof(struct create_durable_handle_reconnect_v2)); *num_iovec = num + 1; return 0; } @@ -1649,10 +1614,9 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, iov[num].iov_len = sizeof(struct create_durable); if (!req->CreateContextsOffset) req->CreateContextsOffset = - cpu_to_le32(sizeof(struct smb2_create_req) - 4 + + cpu_to_le32(sizeof(struct smb2_create_req) + iov[1].iov_len); le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; } @@ -1723,6 +1687,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u32 file_attributes = 0; char *dhc_buf = NULL, *lc_buf = NULL; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "create/open\n"); @@ -1731,7 +1696,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, else return -EIO; - rc = small_smb2_init(SMB2_CREATE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + if (rc) return rc; @@ -1752,12 +1718,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; /* -1 since last byte is buf[0] which is sent below (path) */ - iov[0].iov_len--; + iov[0].iov_len = total_len - 1; - req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req) - 4); + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); /* [MS-SMB2] 2.2.13 NameOffset: * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of @@ -1770,7 +1734,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); @@ -1797,8 +1761,6 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, iov[1].iov_len = uni_path_len; iov[1].iov_base = path; - /* -1 since last byte is buf[0] which was counted in smb2_buf_len */ - inc_rfc1001_len(req, uni_path_len - 1); if (!server->oplocks) *oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -1836,7 +1798,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, dhc_buf = iov[n_iov-1].iov_base; } - rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -1877,7 +1840,7 @@ creat_exit: */ int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, bool use_ipc, + u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */) { @@ -1891,6 +1854,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int n_iov; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "SMB2 IOCTL\n"); @@ -1909,20 +1873,10 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_IOCTL, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (use_ipc) { - if (ses->ipc_tid == 0) { - cifs_small_buf_release(req); - return -ENOTCONN; - } - - cifs_dbg(FYI, "replacing tid 0x%x with IPC tid 0x%x\n", - req->hdr.sync_hdr.TreeId, ses->ipc_tid); - req->hdr.sync_hdr.TreeId = ses->ipc_tid; - } if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1934,7 +1888,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = - cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4); + cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); iov[1].iov_base = in_data; iov[1].iov_len = indatalen; n_iov = 2; @@ -1969,21 +1923,20 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * but if input data passed to ioctl, we do not * want to double count this, so we do not send * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]). We also must add 4 bytes - * in first iovec to allow for rfc1002 length field. + * input data (in iovec[1]). */ if (indatalen) { - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; - inc_rfc1001_len(req, indatalen - 1); + iov[0].iov_len = total_len - 1; } else - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) - req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; @@ -2052,7 +2005,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, - false /* use_ipc */, (char *)&fsctl_input /* data input */, 2 /* in data len */, &ret_data /* out data */, NULL); @@ -2073,13 +2025,14 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Close\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_CLOSE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_CLOSE, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2090,10 +2043,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -2180,13 +2132,15 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, int resp_buftype; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Query Info\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -2203,15 +2157,14 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, * We do not use the input buffer (do not send extra byte) */ req->InputBufferOffset = 0; - inc_rfc1001_len(req, -1); req->OutputBufferLength = cpu_to_le32(output_len); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2338,6 +2291,10 @@ void smb2_reconnect_server(struct work_struct *work) tcon_exist = true; } } + if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { + list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); + tcon_exist = true; + } } /* * Get the reference to server struct to be sure that the last call of @@ -2376,6 +2333,8 @@ SMB2_echo(struct TCP_Server_Info *server) struct kvec iov[2]; struct smb_rqst rqst = { .rq_iov = iov, .rq_nvec = 2 }; + unsigned int total_len; + __be32 rfc1002_marker; cifs_dbg(FYI, "In echo request\n"); @@ -2385,17 +2344,17 @@ SMB2_echo(struct TCP_Server_Info *server) return rc; } - rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); + rc = smb2_plain_req_init(SMB2_ECHO, NULL, (void **)&req, &total_len); if (rc) return rc; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); - /* 4 for rfc1002 length field */ iov[0].iov_len = 4; - iov[0].iov_base = (char *)req; - iov[1].iov_len = get_rfc1002_length(req); - iov[1].iov_base = (char *)req + 4; + rfc1002_marker = cpu_to_be32(total_len); + iov[0].iov_base = &rfc1002_marker; + iov[1].iov_len = total_len; + iov[1].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, server, CIFS_ECHO_OP); @@ -2417,13 +2376,14 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, int resp_buftype; int rc = 0; int flags = 0; + unsigned int total_len; cifs_dbg(FYI, "Flush\n"); if (!ses || !(ses->server)) return -EIO; - rc = small_smb2_init(SMB2_FLUSH, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_FLUSH, tcon, (void **) &req, &total_len); if (rc) return rc; @@ -2434,10 +2394,9 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, req->VolatileFileId = volatile_fid; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + iov[0].iov_len = total_len; - rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); if (rc != 0) @@ -2453,18 +2412,21 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, */ static int smb2_new_read_req(void **buf, unsigned int *total_len, - struct cifs_io_parms *io_parms, unsigned int remaining_bytes, - int request_type) + struct cifs_io_parms *io_parms, struct cifs_readdata *rdata, + unsigned int remaining_bytes, int request_type) { int rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_sync_hdr *shdr; + struct TCP_Server_Info *server; rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req, total_len); if (rc) return rc; - if (io_parms->tcon->ses->server == NULL) + + server = io_parms->tcon->ses->server; + if (server == NULL) return -ECONNABORTED; shdr = &req->sync_hdr; @@ -2478,7 +2440,40 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->MinimumCount = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If we want to do a RDMA write, fill in and append + * smbd_buffer_descriptor_v1 to the end of read request + */ + if (server->rdma && rdata && + rdata->bytes >= server->smbd_conn->rdma_readwrite_threshold) { + + struct smbd_buffer_descriptor_v1 *v1; + bool need_invalidate = + io_parms->tcon->ses->server->dialect == SMB30_PROT_ID; + + rdata->mr = smbd_register_mr( + server->smbd_conn, rdata->pages, + rdata->nr_pages, rdata->tailsz, + true, need_invalidate); + if (!rdata->mr) + return -ENOBUFS; + + req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; + if (need_invalidate) + req->Channel = SMB2_CHANNEL_RDMA_V1; + req->ReadChannelInfoOffset = + cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer)); + req->ReadChannelInfoLength = + cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); + v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + v1->offset = cpu_to_le64(rdata->mr->mr->iova); + v1->token = cpu_to_le32(rdata->mr->mr->rkey); + v1->length = cpu_to_le32(rdata->mr->mr->length); + + *total_len += sizeof(*v1) - 1; + } +#endif if (request_type & CHAINED_REQUEST) { if (!(request_type & END_OF_CHAIN)) { /* next 8-byte aligned request */ @@ -2557,7 +2552,17 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->result != -ENODATA) rdata->result = -EIO; } - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If this rdata has a memmory registered, the MR can be freed + * MR needs to be freed as soon as I/O finishes to prevent deadlock + * because they have limited number and are used for future I/Os + */ + if (rdata->mr) { + smbd_deregister_mr(rdata->mr); + rdata->mr = NULL; + } +#endif if (rdata->result) cifs_stats_fail_inc(tcon, SMB2_READ_HE); @@ -2592,7 +2597,8 @@ smb2_async_readv(struct cifs_readdata *rdata) server = io_parms.tcon->ses->server; - rc = smb2_new_read_req((void **) &buf, &total_len, &io_parms, 0, 0); + rc = smb2_new_read_req( + (void **) &buf, &total_len, &io_parms, rdata, 0, 0); if (rc) { if (rc == -EAGAIN && rdata->credits) { /* credits was reset by reconnect */ @@ -2650,31 +2656,24 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct smb2_sync_hdr *shdr; - struct kvec iov[2]; + struct kvec iov[1]; struct kvec rsp_iov; unsigned int total_len; - __be32 req_len; - struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; int flags = CIFS_LOG_ERROR; struct cifs_ses *ses = io_parms->tcon->ses; *nbytes = 0; - rc = smb2_new_read_req((void **)&req, &total_len, io_parms, 0, 0); + rc = smb2_new_read_req((void **)&req, &total_len, io_parms, NULL, 0, 0); if (rc) return rc; if (encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req_len = cpu_to_be32(total_len); - - iov[0].iov_base = &req_len; - iov[0].iov_len = sizeof(__be32); - iov[1].iov_base = req; - iov[1].iov_len = total_len; + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; @@ -2755,7 +2754,19 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EIO; break; } - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If this wdata has a memory registered, the MR can be freed + * The number of MRs available is limited, it's important to recover + * used MR as soon as I/O is finished. Hold MR longer in the later + * I/O process can possibly result in I/O deadlock due to lack of MR + * to send request on I/O retry + */ + if (wdata->mr) { + smbd_deregister_mr(wdata->mr); + wdata->mr = NULL; + } +#endif if (wdata->result) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); @@ -2776,8 +2787,10 @@ smb2_async_writev(struct cifs_writedata *wdata, struct TCP_Server_Info *server = tcon->ses->server; struct kvec iov[2]; struct smb_rqst rqst = { }; + unsigned int total_len; + __be32 rfc1002_marker; - rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); if (rc) { if (rc == -EAGAIN && wdata->credits) { /* credits was reset by reconnect */ @@ -2793,7 +2806,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - shdr = get_sync_hdr(req); + shdr = (struct smb2_sync_hdr *)req; shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); req->PersistentFileId = wdata->cfile->fid.persistent_fid; @@ -2802,16 +2815,51 @@ smb2_async_writev(struct cifs_writedata *wdata, req->WriteChannelInfoLength = 0; req->Channel = 0; req->Offset = cpu_to_le64(wdata->offset); - /* 4 for rfc1002 length field */ req->DataOffset = cpu_to_le16( - offsetof(struct smb2_write_req, Buffer) - 4); + offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; - +#ifdef CONFIG_CIFS_SMB_DIRECT + /* + * If we want to do a server RDMA read, fill in and append + * smbd_buffer_descriptor_v1 to the end of write request + */ + if (server->rdma && wdata->bytes >= + server->smbd_conn->rdma_readwrite_threshold) { + + struct smbd_buffer_descriptor_v1 *v1; + bool need_invalidate = server->dialect == SMB30_PROT_ID; + + wdata->mr = smbd_register_mr( + server->smbd_conn, wdata->pages, + wdata->nr_pages, wdata->tailsz, + false, need_invalidate); + if (!wdata->mr) { + rc = -ENOBUFS; + goto async_writev_out; + } + req->Length = 0; + req->DataOffset = 0; + req->RemainingBytes = + cpu_to_le32((wdata->nr_pages-1)*PAGE_SIZE + wdata->tailsz); + req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; + if (need_invalidate) + req->Channel = SMB2_CHANNEL_RDMA_V1; + req->WriteChannelInfoOffset = + cpu_to_le16(offsetof(struct smb2_write_req, Buffer)); + req->WriteChannelInfoLength = + cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); + v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; + v1->offset = cpu_to_le64(wdata->mr->mr->iova); + v1->token = cpu_to_le32(wdata->mr->mr->rkey); + v1->length = cpu_to_le32(wdata->mr->mr->length); + } +#endif /* 4 for rfc1002 length field and 1 for Buffer */ iov[0].iov_len = 4; - iov[0].iov_base = req; - iov[1].iov_len = get_rfc1002_length(req) - 1; - iov[1].iov_base = (char *)req + 4; + rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes); + iov[0].iov_base = &rfc1002_marker; + iov[1].iov_len = total_len - 1; + iov[1].iov_base = (char *)req; rqst.rq_iov = iov; rqst.rq_nvec = 2; @@ -2819,13 +2867,22 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; - +#ifdef CONFIG_CIFS_SMB_DIRECT + if (wdata->mr) { + iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1); + rqst.rq_npages = 0; + } +#endif cifs_dbg(FYI, "async write at %llu %u bytes\n", wdata->offset, wdata->bytes); +#ifdef CONFIG_CIFS_SMB_DIRECT + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + if (!wdata->mr) + req->Length = cpu_to_le32(wdata->bytes); +#else req->Length = cpu_to_le32(wdata->bytes); - - inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); +#endif if (wdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, @@ -2869,13 +2926,15 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, int resp_buftype; struct kvec rsp_iov; int flags = 0; + unsigned int total_len; *nbytes = 0; if (n_vec < 1) return rc; - rc = small_smb2_init(SMB2_WRITE, io_parms->tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -2885,7 +2944,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); + req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); req->PersistentFileId = io_parms->persistent_fid; req->VolatileFileId = io_parms->volatile_fid; @@ -2894,20 +2953,16 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, req->Channel = 0; req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); - /* 4 for rfc1002 length field */ req->DataOffset = cpu_to_le16( - offsetof(struct smb2_write_req, Buffer) - 4); + offsetof(struct smb2_write_req, Buffer)); req->RemainingBytes = 0; iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and 1 for Buffer */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; - /* length of entire message including data to be written */ - inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */); - - rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1, - &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1, + &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; @@ -2984,13 +3039,15 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, unsigned int output_size = CIFSMaxBufSize; size_t info_buf_size; int flags = 0; + unsigned int total_len; if (ses && (ses->server)) server = ses->server; else return -EIO; - rc = small_smb2_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3022,7 +3079,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, memcpy(bufptr, &asteriks, len); req->FileNameOffset = - cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_query_directory_req) - 1); req->FileNameLength = cpu_to_le16(len); /* * BB could be 30 bytes or so longer if we used SMB2 specific @@ -3033,15 +3090,13 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, req->OutputBufferLength = cpu_to_le32(output_size); iov[0].iov_base = (char *)req; - /* 4 for RFC1001 length and 1 for Buffer */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - inc_rfc1001_len(req, len - 1 /* Buffer */); - - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3110,6 +3165,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, unsigned int i; struct cifs_ses *ses = tcon->ses; int flags = 0; + unsigned int total_len; if (!ses || !(ses->server)) return -EIO; @@ -3121,7 +3177,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, if (!iov) return -ENOMEM; - rc = small_smb2_init(SMB2_SET_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len); if (rc) { kfree(iov); return rc; @@ -3130,7 +3186,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); + req->sync_hdr.ProcessId = cpu_to_le32(pid); req->InfoType = info_type; req->FileInfoClass = info_class; @@ -3138,27 +3194,25 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - /* 4 for RFC1001 length and 1 for Buffer */ req->BufferOffset = - cpu_to_le16(sizeof(struct smb2_set_info_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_set_info_req) - 1); req->BufferLength = cpu_to_le32(*size); - inc_rfc1001_len(req, *size - 1 /* Buffer */); - memcpy(req->Buffer, *data, *size); + total_len += *size; iov[0].iov_base = (char *)req; - /* 4 for RFC1001 length */ - iov[0].iov_len = get_rfc1002_length(req) + 4; + /* 1 for Buffer */ + iov[0].iov_len = total_len - 1; for (i = 1; i < num; i++) { - inc_rfc1001_len(req, size[i]); le32_add_cpu(&req->BufferLength, size[i]); iov[i].iov_base = (char *)data[i]; iov[i].iov_len = size[i]; } - rc = SendReceive2(xid, ses, iov, num, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags, + &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; @@ -3310,11 +3364,17 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 oplock_level) { int rc; - struct smb2_oplock_break *req = NULL; + struct smb2_oplock_break_req *req = NULL; + struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "SMB2_oplock_break\n"); - rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3324,9 +3384,14 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3355,13 +3420,15 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, { int rc; struct smb2_query_info_req *req; + unsigned int total_len; cifs_dbg(FYI, "Query FSInfo level %d\n", level); if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; - rc = small_smb2_init(SMB2_QUERY_INFO, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, + &total_len); if (rc) return rc; @@ -3369,15 +3436,14 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, req->FileInfoClass = level; req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; - /* 4 for rfc1002 length field and 1 for pad */ + /* 1 for pad */ req->InputBufferOffset = - cpu_to_le16(sizeof(struct smb2_query_info_req) - 1 - 4); + cpu_to_le16(sizeof(struct smb2_query_info_req) - 1); req->OutputBufferLength = cpu_to_le32( outbuf_len + sizeof(struct smb2_query_info_rsp) - 1 - 4); iov->iov_base = (char *)req; - /* 4 for rfc1002 length field */ - iov->iov_len = get_rfc1002_length(req) + 4; + iov->iov_len = total_len; return 0; } @@ -3403,7 +3469,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3459,7 +3525,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3505,34 +3571,33 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, int resp_buf_type; unsigned int count; int flags = CIFS_NO_RESP; + unsigned int total_len; cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock); - rc = small_smb2_init(SMB2_LOCK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_LOCK, tcon, (void **) &req, &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid); + req->sync_hdr.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; req->VolatileFileId = volatile_fid; count = num_lock * sizeof(struct smb2_lock_element); - inc_rfc1001_len(req, count - sizeof(struct smb2_lock_element)); iov[0].iov_base = (char *)req; - /* 4 for rfc1002 length field and count for all locks */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - count; + iov[0].iov_len = total_len - sizeof(struct smb2_lock_element); iov[1].iov_base = (char *)buf; iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, flags, - &rsp_iov); + rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + &rsp_iov); cifs_small_buf_release(req); if (rc) { cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc); @@ -3565,24 +3630,35 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, { int rc; struct smb2_lease_ack *req = NULL; + struct cifs_ses *ses = tcon->ses; int flags = CIFS_OBREAK_OP; + unsigned int total_len; + struct kvec iov[1]; + struct kvec rsp_iov; + int resp_buf_type; cifs_dbg(FYI, "SMB2_lease_break\n"); - rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req); + rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, + &total_len); if (rc) return rc; if (encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1); + req->sync_hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); - inc_rfc1001_len(req, 12); + total_len += 12; memcpy(req->LeaseKey, lease_key, 16); req->LeaseState = lease_state; - rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags); + flags |= CIFS_NO_RESP; + + iov[0].iov_base = (char *)req; + iov[0].iov_len = total_len; + + rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index c2ec934be968..6eb9f9691ed4 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -195,7 +195,7 @@ struct smb2_symlink_err_rsp { #define SMB2_CLIENT_GUID_SIZE 16 struct smb2_negotiate_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 36 */ __le16 DialectCount; __le16 SecurityMode; @@ -282,7 +282,7 @@ struct smb2_negotiate_rsp { #define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 struct smb2_sess_setup_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 25 */ __u8 Flags; __u8 SecurityMode; @@ -308,7 +308,7 @@ struct smb2_sess_setup_rsp { } __packed; struct smb2_logoff_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -323,7 +323,7 @@ struct smb2_logoff_rsp { #define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 struct smb2_tree_connect_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 9 */ __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; @@ -375,7 +375,7 @@ struct smb2_tree_connect_rsp { #define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ struct smb2_tree_disconnect_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; @@ -496,7 +496,7 @@ struct smb2_tree_disconnect_rsp { #define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 struct smb2_create_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u8 SecurityFlags; __u8 RequestedOplockLevel; @@ -753,7 +753,7 @@ struct duplicate_extents_to_file { } __packed; struct smb2_ioctl_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -789,7 +789,7 @@ struct smb2_ioctl_rsp { /* Currently defined values for close flags */ #define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) struct smb2_close_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __le16 Flags; __le32 Reserved; @@ -812,7 +812,7 @@ struct smb2_close_rsp { } __packed; struct smb2_flush_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 24 */ __le16 Reserved1; __le32 Reserved2; @@ -830,9 +830,9 @@ struct smb2_flush_rsp { #define SMB2_READFLAG_READ_UNBUFFERED 0x01 /* Channel field for read and write: exactly one of following flags can be set*/ -#define SMB2_CHANNEL_NONE 0x00000000 -#define SMB2_CHANNEL_RDMA_V1 0x00000001 /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000002 /* SMB3.02 or later */ +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ /* SMB2 read request without RFC1001 length at the beginning */ struct smb2_read_plain_req { @@ -847,8 +847,8 @@ struct smb2_read_plain_req { __le32 MinimumCount; __le32 Channel; /* MBZ except for SMB3 or later */ __le32 RemainingBytes; - __le16 ReadChannelInfoOffset; /* Reserved MBZ */ - __le16 ReadChannelInfoLength; /* Reserved MBZ */ + __le16 ReadChannelInfoOffset; + __le16 ReadChannelInfoLength; __u8 Buffer[1]; } __packed; @@ -868,7 +868,7 @@ struct smb2_read_rsp { #define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ struct smb2_write_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 49 */ __le16 DataOffset; /* offset from start of SMB2 header to write data */ __le32 Length; @@ -877,8 +877,8 @@ struct smb2_write_req { __u64 VolatileFileId; /* opaque endianness */ __le32 Channel; /* Reserved MBZ */ __le32 RemainingBytes; - __le16 WriteChannelInfoOffset; /* Reserved MBZ */ - __le16 WriteChannelInfoLength; /* Reserved MBZ */ + __le16 WriteChannelInfoOffset; + __le16 WriteChannelInfoLength; __le32 Flags; __u8 Buffer[1]; } __packed; @@ -907,7 +907,7 @@ struct smb2_lock_element { } __packed; struct smb2_lock_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; __le32 Reserved; @@ -924,7 +924,7 @@ struct smb2_lock_rsp { } __packed; struct smb2_echo_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -942,7 +942,7 @@ struct smb2_echo_rsp { #define SMB2_REOPEN 0x10 struct smb2_query_directory_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ __u8 FileInformationClass; __u8 Flags; @@ -989,7 +989,7 @@ struct smb2_query_directory_rsp { #define SL_INDEX_SPECIFIED 0x00000004 struct smb2_query_info_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 41 */ __u8 InfoType; __u8 FileInfoClass; @@ -1013,7 +1013,7 @@ struct smb2_query_info_rsp { } __packed; struct smb2_set_info_req { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ __u8 InfoType; __u8 FileInfoClass; @@ -1031,7 +1031,19 @@ struct smb2_set_info_rsp { __le16 StructureSize; /* Must be 2 */ } __packed; -struct smb2_oplock_break { +/* oplock break without an rfc1002 header */ +struct smb2_oplock_break_req { + struct smb2_sync_hdr sync_hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __u64 PersistentFid; + __u64 VolatileFid; +} __packed; + +/* oplock break with an rfc1002 header */ +struct smb2_oplock_break_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; @@ -1057,7 +1069,7 @@ struct smb2_lease_break { } __packed; struct smb2_lease_ack { - struct smb2_hdr hdr; + struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 36 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index e9ab5227e7a8..05287b01f596 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -125,8 +125,7 @@ extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, struct smb2_err_rsp **err_buf); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, - bool is_fsctl, bool use_ipc, - char *in_data, u32 indatalen, + bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c new file mode 100644 index 000000000000..5130492847eb --- /dev/null +++ b/fs/cifs/smbdirect.c @@ -0,0 +1,2610 @@ +/* + * Copyright (C) 2017, Microsoft Corporation. + * + * Author(s): Long Li <longli@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/highmem.h> +#include "smbdirect.h" +#include "cifs_debug.h" + +static struct smbd_response *get_empty_queue_buffer( + struct smbd_connection *info); +static struct smbd_response *get_receive_buffer( + struct smbd_connection *info); +static void put_receive_buffer( + struct smbd_connection *info, + struct smbd_response *response); +static int allocate_receive_buffers(struct smbd_connection *info, int num_buf); +static void destroy_receive_buffers(struct smbd_connection *info); + +static void put_empty_packet( + struct smbd_connection *info, struct smbd_response *response); +static void enqueue_reassembly( + struct smbd_connection *info, + struct smbd_response *response, int data_length); +static struct smbd_response *_get_first_reassembly( + struct smbd_connection *info); + +static int smbd_post_recv( + struct smbd_connection *info, + struct smbd_response *response); + +static int smbd_post_send_empty(struct smbd_connection *info); +static int smbd_post_send_data( + struct smbd_connection *info, + struct kvec *iov, int n_vec, int remaining_data_length); +static int smbd_post_send_page(struct smbd_connection *info, + struct page *page, unsigned long offset, + size_t size, int remaining_data_length); + +static void destroy_mr_list(struct smbd_connection *info); +static int allocate_mr_list(struct smbd_connection *info); + +/* SMBD version number */ +#define SMBD_V1 0x0100 + +/* Port numbers for SMBD transport */ +#define SMB_PORT 445 +#define SMBD_PORT 5445 + +/* Address lookup and resolve timeout in ms */ +#define RDMA_RESOLVE_TIMEOUT 5000 + +/* SMBD negotiation timeout in seconds */ +#define SMBD_NEGOTIATE_TIMEOUT 120 + +/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ +#define SMBD_MIN_RECEIVE_SIZE 128 +#define SMBD_MIN_FRAGMENTED_SIZE 131072 + +/* + * Default maximum number of RDMA read/write outstanding on this connection + * This value is possibly decreased during QP creation on hardware limit + */ +#define SMBD_CM_RESPONDER_RESOURCES 32 + +/* Maximum number of retries on data transfer operations */ +#define SMBD_CM_RETRY 6 +/* No need to retry on Receiver Not Ready since SMBD manages credits */ +#define SMBD_CM_RNR_RETRY 0 + +/* + * User configurable initial values per SMBD transport connection + * as defined in [MS-SMBD] 3.1.1.1 + * Those may change after a SMBD negotiation + */ +/* The local peer's maximum number of credits to grant to the peer */ +int smbd_receive_credit_max = 255; + +/* The remote peer's credit request of local peer */ +int smbd_send_credit_target = 255; + +/* The maximum single message size can be sent to remote peer */ +int smbd_max_send_size = 1364; + +/* The maximum fragmented upper-layer payload receive size supported */ +int smbd_max_fragmented_recv_size = 1024 * 1024; + +/* The maximum single-message size which can be received */ +int smbd_max_receive_size = 8192; + +/* The timeout to initiate send of a keepalive message on idle */ +int smbd_keep_alive_interval = 120; + +/* + * User configurable initial values for RDMA transport + * The actual values used may be lower and are limited to hardware capabilities + */ +/* Default maximum number of SGEs in a RDMA write/read */ +int smbd_max_frmr_depth = 2048; + +/* If payload is less than this byte, use RDMA send/recv not read/write */ +int rdma_readwrite_threshold = 4096; + +/* Transport logging functions + * Logging are defined as classes. They can be OR'ed to define the actual + * logging level via module parameter smbd_logging_class + * e.g. cifs.smbd_logging_class=0xa0 will log all log_rdma_recv() and + * log_rdma_event() + */ +#define LOG_OUTGOING 0x1 +#define LOG_INCOMING 0x2 +#define LOG_READ 0x4 +#define LOG_WRITE 0x8 +#define LOG_RDMA_SEND 0x10 +#define LOG_RDMA_RECV 0x20 +#define LOG_KEEP_ALIVE 0x40 +#define LOG_RDMA_EVENT 0x80 +#define LOG_RDMA_MR 0x100 +static unsigned int smbd_logging_class; +module_param(smbd_logging_class, uint, 0644); +MODULE_PARM_DESC(smbd_logging_class, + "Logging class for SMBD transport 0x0 to 0x100"); + +#define ERR 0x0 +#define INFO 0x1 +static unsigned int smbd_logging_level = ERR; +module_param(smbd_logging_level, uint, 0644); +MODULE_PARM_DESC(smbd_logging_level, + "Logging level for SMBD transport, 0 (default): error, 1: info"); + +#define log_rdma(level, class, fmt, args...) \ +do { \ + if (level <= smbd_logging_level || class & smbd_logging_class) \ + cifs_dbg(VFS, "%s:%d " fmt, __func__, __LINE__, ##args);\ +} while (0) + +#define log_outgoing(level, fmt, args...) \ + log_rdma(level, LOG_OUTGOING, fmt, ##args) +#define log_incoming(level, fmt, args...) \ + log_rdma(level, LOG_INCOMING, fmt, ##args) +#define log_read(level, fmt, args...) log_rdma(level, LOG_READ, fmt, ##args) +#define log_write(level, fmt, args...) log_rdma(level, LOG_WRITE, fmt, ##args) +#define log_rdma_send(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_SEND, fmt, ##args) +#define log_rdma_recv(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_RECV, fmt, ##args) +#define log_keep_alive(level, fmt, args...) \ + log_rdma(level, LOG_KEEP_ALIVE, fmt, ##args) +#define log_rdma_event(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_EVENT, fmt, ##args) +#define log_rdma_mr(level, fmt, args...) \ + log_rdma(level, LOG_RDMA_MR, fmt, ##args) + +/* + * Destroy the transport and related RDMA and memory resources + * Need to go through all the pending counters and make sure on one is using + * the transport while it is destroyed + */ +static void smbd_destroy_rdma_work(struct work_struct *work) +{ + struct smbd_response *response; + struct smbd_connection *info = + container_of(work, struct smbd_connection, destroy_work); + unsigned long flags; + + log_rdma_event(INFO, "destroying qp\n"); + ib_drain_qp(info->id->qp); + rdma_destroy_qp(info->id); + + /* Unblock all I/O waiting on the send queue */ + wake_up_interruptible_all(&info->wait_send_queue); + + log_rdma_event(INFO, "cancelling idle timer\n"); + cancel_delayed_work_sync(&info->idle_timer_work); + log_rdma_event(INFO, "cancelling send immediate work\n"); + cancel_delayed_work_sync(&info->send_immediate_work); + + log_rdma_event(INFO, "wait for all send to finish\n"); + wait_event(info->wait_smbd_send_pending, + info->smbd_send_pending == 0); + + log_rdma_event(INFO, "wait for all recv to finish\n"); + wake_up_interruptible(&info->wait_reassembly_queue); + wait_event(info->wait_smbd_recv_pending, + info->smbd_recv_pending == 0); + + log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); + wait_event(info->wait_send_pending, + atomic_read(&info->send_pending) == 0); + wait_event(info->wait_send_payload_pending, + atomic_read(&info->send_payload_pending) == 0); + + log_rdma_event(INFO, "freeing mr list\n"); + wake_up_interruptible_all(&info->wait_mr); + wait_event(info->wait_for_mr_cleanup, + atomic_read(&info->mr_used_count) == 0); + destroy_mr_list(info); + + /* It's not posssible for upper layer to get to reassembly */ + log_rdma_event(INFO, "drain the reassembly queue\n"); + do { + spin_lock_irqsave(&info->reassembly_queue_lock, flags); + response = _get_first_reassembly(info); + if (response) { + list_del(&response->list); + spin_unlock_irqrestore( + &info->reassembly_queue_lock, flags); + put_receive_buffer(info, response); + } + } while (response); + spin_unlock_irqrestore(&info->reassembly_queue_lock, flags); + info->reassembly_data_length = 0; + + log_rdma_event(INFO, "free receive buffers\n"); + wait_event(info->wait_receive_queues, + info->count_receive_queue + info->count_empty_packet_queue + == info->receive_credit_max); + destroy_receive_buffers(info); + + ib_free_cq(info->send_cq); + ib_free_cq(info->recv_cq); + ib_dealloc_pd(info->pd); + rdma_destroy_id(info->id); + + /* free mempools */ + mempool_destroy(info->request_mempool); + kmem_cache_destroy(info->request_cache); + + mempool_destroy(info->response_mempool); + kmem_cache_destroy(info->response_cache); + + info->transport_status = SMBD_DESTROYED; + wake_up_all(&info->wait_destroy); +} + +static int smbd_process_disconnected(struct smbd_connection *info) +{ + schedule_work(&info->destroy_work); + return 0; +} + +static void smbd_disconnect_rdma_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, disconnect_work); + + if (info->transport_status == SMBD_CONNECTED) { + info->transport_status = SMBD_DISCONNECTING; + rdma_disconnect(info->id); + } +} + +static void smbd_disconnect_rdma_connection(struct smbd_connection *info) +{ + queue_work(info->workqueue, &info->disconnect_work); +} + +/* Upcall from RDMA CM */ +static int smbd_conn_upcall( + struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct smbd_connection *info = id->context; + + log_rdma_event(INFO, "event=%d status=%d\n", + event->event, event->status); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + info->ri_rc = 0; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + info->ri_rc = -EHOSTUNREACH; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + info->ri_rc = -ENETUNREACH; + complete(&info->ri_done); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + log_rdma_event(INFO, "connected event=%d\n", event->event); + info->transport_status = SMBD_CONNECTED; + wake_up_interruptible(&info->conn_wait); + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + log_rdma_event(INFO, "connecting failed event=%d\n", event->event); + info->transport_status = SMBD_DISCONNECTED; + wake_up_interruptible(&info->conn_wait); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_DISCONNECTED: + /* This happenes when we fail the negotiation */ + if (info->transport_status == SMBD_NEGOTIATE_FAILED) { + info->transport_status = SMBD_DISCONNECTED; + wake_up(&info->conn_wait); + break; + } + + info->transport_status = SMBD_DISCONNECTED; + smbd_process_disconnected(info); + break; + + default: + break; + } + + return 0; +} + +/* Upcall from RDMA QP */ +static void +smbd_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct smbd_connection *info = context; + + log_rdma_event(ERR, "%s on device %s info %p\n", + ib_event_msg(event->event), event->device->name, info); + + switch (event->event) { + case IB_EVENT_CQ_ERR: + case IB_EVENT_QP_FATAL: + smbd_disconnect_rdma_connection(info); + + default: + break; + } +} + +static inline void *smbd_request_payload(struct smbd_request *request) +{ + return (void *)request->packet; +} + +static inline void *smbd_response_payload(struct smbd_response *response) +{ + return (void *)response->packet; +} + +/* Called when a RDMA send is done */ +static void send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + int i; + struct smbd_request *request = + container_of(wc->wr_cqe, struct smbd_request, cqe); + + log_rdma_send(INFO, "smbd_request %p completed wc->status=%d\n", + request, wc->status); + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { + log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n", + wc->status, wc->opcode); + smbd_disconnect_rdma_connection(request->info); + } + + for (i = 0; i < request->num_sge; i++) + ib_dma_unmap_single(request->info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + + if (request->has_payload) { + if (atomic_dec_and_test(&request->info->send_payload_pending)) + wake_up(&request->info->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&request->info->send_pending)) + wake_up(&request->info->wait_send_pending); + } + + mempool_free(request, request->info->request_mempool); +} + +static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp) +{ + log_rdma_event(INFO, "resp message min_version %u max_version %u " + "negotiated_version %u credits_requested %u " + "credits_granted %u status %u max_readwrite_size %u " + "preferred_send_size %u max_receive_size %u " + "max_fragmented_size %u\n", + resp->min_version, resp->max_version, resp->negotiated_version, + resp->credits_requested, resp->credits_granted, resp->status, + resp->max_readwrite_size, resp->preferred_send_size, + resp->max_receive_size, resp->max_fragmented_size); +} + +/* + * Process a negotiation response message, according to [MS-SMBD]3.1.5.7 + * response, packet_length: the negotiation response message + * return value: true if negotiation is a success, false if failed + */ +static bool process_negotiation_response( + struct smbd_response *response, int packet_length) +{ + struct smbd_connection *info = response->info; + struct smbd_negotiate_resp *packet = smbd_response_payload(response); + + if (packet_length < sizeof(struct smbd_negotiate_resp)) { + log_rdma_event(ERR, + "error: packet_length=%d\n", packet_length); + return false; + } + + if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) { + log_rdma_event(ERR, "error: negotiated_version=%x\n", + le16_to_cpu(packet->negotiated_version)); + return false; + } + info->protocol = le16_to_cpu(packet->negotiated_version); + + if (packet->credits_requested == 0) { + log_rdma_event(ERR, "error: credits_requested==0\n"); + return false; + } + info->receive_credit_target = le16_to_cpu(packet->credits_requested); + + if (packet->credits_granted == 0) { + log_rdma_event(ERR, "error: credits_granted==0\n"); + return false; + } + atomic_set(&info->send_credits, le16_to_cpu(packet->credits_granted)); + + atomic_set(&info->receive_credits, 0); + + if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) { + log_rdma_event(ERR, "error: preferred_send_size=%d\n", + le32_to_cpu(packet->preferred_send_size)); + return false; + } + info->max_receive_size = le32_to_cpu(packet->preferred_send_size); + + if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) { + log_rdma_event(ERR, "error: max_receive_size=%d\n", + le32_to_cpu(packet->max_receive_size)); + return false; + } + info->max_send_size = min_t(int, info->max_send_size, + le32_to_cpu(packet->max_receive_size)); + + if (le32_to_cpu(packet->max_fragmented_size) < + SMBD_MIN_FRAGMENTED_SIZE) { + log_rdma_event(ERR, "error: max_fragmented_size=%d\n", + le32_to_cpu(packet->max_fragmented_size)); + return false; + } + info->max_fragmented_send_size = + le32_to_cpu(packet->max_fragmented_size); + info->rdma_readwrite_threshold = + rdma_readwrite_threshold > info->max_fragmented_send_size ? + info->max_fragmented_send_size : + rdma_readwrite_threshold; + + + info->max_readwrite_size = min_t(u32, + le32_to_cpu(packet->max_readwrite_size), + info->max_frmr_depth * PAGE_SIZE); + info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE; + + return true; +} + +/* + * Check and schedule to send an immediate packet + * This is used to extend credtis to remote peer to keep the transport busy + */ +static void check_and_send_immediate(struct smbd_connection *info) +{ + if (info->transport_status != SMBD_CONNECTED) + return; + + info->send_immediate = true; + + /* + * Promptly send a packet if our peer is running low on receive + * credits + */ + if (atomic_read(&info->receive_credits) < + info->receive_credit_target - 1) + queue_delayed_work( + info->workqueue, &info->send_immediate_work, 0); +} + +static void smbd_post_send_credits(struct work_struct *work) +{ + int ret = 0; + int use_receive_queue = 1; + int rc; + struct smbd_response *response; + struct smbd_connection *info = + container_of(work, struct smbd_connection, + post_send_credits_work); + + if (info->transport_status != SMBD_CONNECTED) { + wake_up(&info->wait_receive_queues); + return; + } + + if (info->receive_credit_target > + atomic_read(&info->receive_credits)) { + while (true) { + if (use_receive_queue) + response = get_receive_buffer(info); + else + response = get_empty_queue_buffer(info); + if (!response) { + /* now switch to emtpy packet queue */ + if (use_receive_queue) { + use_receive_queue = 0; + continue; + } else + break; + } + + response->type = SMBD_TRANSFER_DATA; + response->first_segment = false; + rc = smbd_post_recv(info, response); + if (rc) { + log_rdma_recv(ERR, + "post_recv failed rc=%d\n", rc); + put_receive_buffer(info, response); + break; + } + + ret++; + } + } + + spin_lock(&info->lock_new_credits_offered); + info->new_credits_offered += ret; + spin_unlock(&info->lock_new_credits_offered); + + atomic_add(ret, &info->receive_credits); + + /* Check if we can post new receive and grant credits to peer */ + check_and_send_immediate(info); +} + +static void smbd_recv_done_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, recv_done_work); + + /* + * We may have new send credits granted from remote peer + * If any sender is blcoked on lack of credets, unblock it + */ + if (atomic_read(&info->send_credits)) + wake_up_interruptible(&info->wait_send_queue); + + /* + * Check if we need to send something to remote peer to + * grant more credits or respond to KEEP_ALIVE packet + */ + check_and_send_immediate(info); +} + +/* Called from softirq, when recv is done */ +static void recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_data_transfer *data_transfer; + struct smbd_response *response = + container_of(wc->wr_cqe, struct smbd_response, cqe); + struct smbd_connection *info = response->info; + int data_length = 0; + + log_rdma_recv(INFO, "response=%p type=%d wc status=%d wc opcode %d " + "byte_len=%d pkey_index=%x\n", + response, response->type, wc->status, wc->opcode, + wc->byte_len, wc->pkey_index); + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { + log_rdma_recv(INFO, "wc->status=%d opcode=%d\n", + wc->status, wc->opcode); + smbd_disconnect_rdma_connection(info); + goto error; + } + + ib_dma_sync_single_for_cpu( + wc->qp->device, + response->sge.addr, + response->sge.length, + DMA_FROM_DEVICE); + + switch (response->type) { + /* SMBD negotiation response */ + case SMBD_NEGOTIATE_RESP: + dump_smbd_negotiate_resp(smbd_response_payload(response)); + info->full_packet_received = true; + info->negotiate_done = + process_negotiation_response(response, wc->byte_len); + complete(&info->negotiate_completion); + break; + + /* SMBD data transfer packet */ + case SMBD_TRANSFER_DATA: + data_transfer = smbd_response_payload(response); + data_length = le32_to_cpu(data_transfer->data_length); + + /* + * If this is a packet with data playload place the data in + * reassembly queue and wake up the reading thread + */ + if (data_length) { + if (info->full_packet_received) + response->first_segment = true; + + if (le32_to_cpu(data_transfer->remaining_data_length)) + info->full_packet_received = false; + else + info->full_packet_received = true; + + enqueue_reassembly( + info, + response, + data_length); + } else + put_empty_packet(info, response); + + if (data_length) + wake_up_interruptible(&info->wait_reassembly_queue); + + atomic_dec(&info->receive_credits); + info->receive_credit_target = + le16_to_cpu(data_transfer->credits_requested); + atomic_add(le16_to_cpu(data_transfer->credits_granted), + &info->send_credits); + + log_incoming(INFO, "data flags %d data_offset %d " + "data_length %d remaining_data_length %d\n", + le16_to_cpu(data_transfer->flags), + le32_to_cpu(data_transfer->data_offset), + le32_to_cpu(data_transfer->data_length), + le32_to_cpu(data_transfer->remaining_data_length)); + + /* Send a KEEP_ALIVE response right away if requested */ + info->keep_alive_requested = KEEP_ALIVE_NONE; + if (le16_to_cpu(data_transfer->flags) & + SMB_DIRECT_RESPONSE_REQUESTED) { + info->keep_alive_requested = KEEP_ALIVE_PENDING; + } + + queue_work(info->workqueue, &info->recv_done_work); + return; + + default: + log_rdma_recv(ERR, + "unexpected response type=%d\n", response->type); + } + +error: + put_receive_buffer(info, response); +} + +static struct rdma_cm_id *smbd_create_id( + struct smbd_connection *info, + struct sockaddr *dstaddr, int port) +{ + struct rdma_cm_id *id; + int rc; + __be16 *sport; + + id = rdma_create_id(&init_net, smbd_conn_upcall, info, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc); + return id; + } + + if (dstaddr->sa_family == AF_INET6) + sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; + else + sport = &((struct sockaddr_in *)dstaddr)->sin_port; + + *sport = htons(port); + + init_completion(&info->ri_done); + info->ri_rc = -ETIMEDOUT; + + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, + RDMA_RESOLVE_TIMEOUT); + if (rc) { + log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); + goto out; + } + wait_for_completion_interruptible_timeout( + &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = info->ri_rc; + if (rc) { + log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); + goto out; + } + + info->ri_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); + goto out; + } + wait_for_completion_interruptible_timeout( + &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + rc = info->ri_rc; + if (rc) { + log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); + goto out; + } + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Test if FRWR (Fast Registration Work Requests) is supported on the device + * This implementation requries FRWR on RDMA read/write + * return value: true if it is supported + */ +static bool frwr_is_supported(struct ib_device_attr *attrs) +{ + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + return false; + if (attrs->max_fast_reg_page_list_len == 0) + return false; + return true; +} + +static int smbd_ia_open( + struct smbd_connection *info, + struct sockaddr *dstaddr, int port) +{ + int rc; + + info->id = smbd_create_id(info, dstaddr, port); + if (IS_ERR(info->id)) { + rc = PTR_ERR(info->id); + goto out1; + } + + if (!frwr_is_supported(&info->id->device->attrs)) { + log_rdma_event(ERR, + "Fast Registration Work Requests " + "(FRWR) is not supported\n"); + log_rdma_event(ERR, + "Device capability flags = %llx " + "max_fast_reg_page_list_len = %u\n", + info->id->device->attrs.device_cap_flags, + info->id->device->attrs.max_fast_reg_page_list_len); + rc = -EPROTONOSUPPORT; + goto out2; + } + info->max_frmr_depth = min_t(int, + smbd_max_frmr_depth, + info->id->device->attrs.max_fast_reg_page_list_len); + info->mr_type = IB_MR_TYPE_MEM_REG; + if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + info->mr_type = IB_MR_TYPE_SG_GAPS; + + info->pd = ib_alloc_pd(info->id->device, 0); + if (IS_ERR(info->pd)) { + rc = PTR_ERR(info->pd); + log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); + goto out2; + } + + return 0; + +out2: + rdma_destroy_id(info->id); + info->id = NULL; + +out1: + return rc; +} + +/* + * Send a negotiation request message to the peer + * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3 + * After negotiation, the transport is connected and ready for + * carrying upper layer SMB payload + */ +static int smbd_post_send_negotiate_req(struct smbd_connection *info) +{ + struct ib_send_wr send_wr, *send_wr_fail; + int rc = -ENOMEM; + struct smbd_request *request; + struct smbd_negotiate_req *packet; + + request = mempool_alloc(info->request_mempool, GFP_KERNEL); + if (!request) + return rc; + + request->info = info; + + packet = smbd_request_payload(request); + packet->min_version = cpu_to_le16(SMBD_V1); + packet->max_version = cpu_to_le16(SMBD_V1); + packet->reserved = 0; + packet->credits_requested = cpu_to_le16(info->send_credit_target); + packet->preferred_send_size = cpu_to_le32(info->max_send_size); + packet->max_receive_size = cpu_to_le32(info->max_receive_size); + packet->max_fragmented_size = + cpu_to_le32(info->max_fragmented_recv_size); + + request->num_sge = 1; + request->sge[0].addr = ib_dma_map_single( + info->id->device, (void *)packet, + sizeof(*packet), DMA_TO_DEVICE); + if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + rc = -EIO; + goto dma_mapping_failed; + } + + request->sge[0].length = sizeof(*packet); + request->sge[0].lkey = info->pd->local_dma_lkey; + + ib_dma_sync_single_for_device( + info->id->device, request->sge[0].addr, + request->sge[0].length, DMA_TO_DEVICE); + + request->cqe.done = send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &request->cqe; + send_wr.sg_list = request->sge; + send_wr.num_sge = request->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + log_rdma_send(INFO, "sge addr=%llx length=%x lkey=%x\n", + request->sge[0].addr, + request->sge[0].length, request->sge[0].lkey); + + request->has_payload = false; + atomic_inc(&info->send_pending); + rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail); + if (!rc) + return 0; + + /* if we reach here, post send failed */ + log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); + atomic_dec(&info->send_pending); + ib_dma_unmap_single(info->id->device, request->sge[0].addr, + request->sge[0].length, DMA_TO_DEVICE); + +dma_mapping_failed: + mempool_free(request, info->request_mempool); + return rc; +} + +/* + * Extend the credits to remote peer + * This implements [MS-SMBD] 3.1.5.9 + * The idea is that we should extend credits to remote peer as quickly as + * it's allowed, to maintain data flow. We allocate as much receive + * buffer as possible, and extend the receive credits to remote peer + * return value: the new credtis being granted. + */ +static int manage_credits_prior_sending(struct smbd_connection *info) +{ + int new_credits; + + spin_lock(&info->lock_new_credits_offered); + new_credits = info->new_credits_offered; + info->new_credits_offered = 0; + spin_unlock(&info->lock_new_credits_offered); + + return new_credits; +} + +/* + * Check if we need to send a KEEP_ALIVE message + * The idle connection timer triggers a KEEP_ALIVE message when expires + * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send + * back a response. + * return value: + * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set + * 0: otherwise + */ +static int manage_keep_alive_before_sending(struct smbd_connection *info) +{ + if (info->keep_alive_requested == KEEP_ALIVE_PENDING) { + info->keep_alive_requested = KEEP_ALIVE_SENT; + return 1; + } + return 0; +} + +/* + * Build and prepare the SMBD packet header + * This function waits for avaialbe send credits and build a SMBD packet + * header. The caller then optional append payload to the packet after + * the header + * intput values + * size: the size of the payload + * remaining_data_length: remaining data to send if this is part of a + * fragmented packet + * output values + * request_out: the request allocated from this function + * return values: 0 on success, otherwise actual error code returned + */ +static int smbd_create_header(struct smbd_connection *info, + int size, int remaining_data_length, + struct smbd_request **request_out) +{ + struct smbd_request *request; + struct smbd_data_transfer *packet; + int header_length; + int rc; + + /* Wait for send credits. A SMBD packet needs one credit */ + rc = wait_event_interruptible(info->wait_send_queue, + atomic_read(&info->send_credits) > 0 || + info->transport_status != SMBD_CONNECTED); + if (rc) + return rc; + + if (info->transport_status != SMBD_CONNECTED) { + log_outgoing(ERR, "disconnected not sending\n"); + return -ENOENT; + } + atomic_dec(&info->send_credits); + + request = mempool_alloc(info->request_mempool, GFP_KERNEL); + if (!request) { + rc = -ENOMEM; + goto err; + } + + request->info = info; + + /* Fill in the packet header */ + packet = smbd_request_payload(request); + packet->credits_requested = cpu_to_le16(info->send_credit_target); + packet->credits_granted = + cpu_to_le16(manage_credits_prior_sending(info)); + info->send_immediate = false; + + packet->flags = 0; + if (manage_keep_alive_before_sending(info)) + packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED); + + packet->reserved = 0; + if (!size) + packet->data_offset = 0; + else + packet->data_offset = cpu_to_le32(24); + packet->data_length = cpu_to_le32(size); + packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->padding = 0; + + log_outgoing(INFO, "credits_requested=%d credits_granted=%d " + "data_offset=%d data_length=%d remaining_data_length=%d\n", + le16_to_cpu(packet->credits_requested), + le16_to_cpu(packet->credits_granted), + le32_to_cpu(packet->data_offset), + le32_to_cpu(packet->data_length), + le32_to_cpu(packet->remaining_data_length)); + + /* Map the packet to DMA */ + header_length = sizeof(struct smbd_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!size) + header_length = offsetof(struct smbd_data_transfer, padding); + + request->num_sge = 1; + request->sge[0].addr = ib_dma_map_single(info->id->device, + (void *)packet, + header_length, + DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) { + mempool_free(request, info->request_mempool); + rc = -EIO; + goto err; + } + + request->sge[0].length = header_length; + request->sge[0].lkey = info->pd->local_dma_lkey; + + *request_out = request; + return 0; + +err: + atomic_inc(&info->send_credits); + return rc; +} + +static void smbd_destroy_header(struct smbd_connection *info, + struct smbd_request *request) +{ + + ib_dma_unmap_single(info->id->device, + request->sge[0].addr, + request->sge[0].length, + DMA_TO_DEVICE); + mempool_free(request, info->request_mempool); + atomic_inc(&info->send_credits); +} + +/* Post the send request */ +static int smbd_post_send(struct smbd_connection *info, + struct smbd_request *request, bool has_payload) +{ + struct ib_send_wr send_wr, *send_wr_fail; + int rc, i; + + for (i = 0; i < request->num_sge; i++) { + log_rdma_send(INFO, + "rdma_request sge[%d] addr=%llu legnth=%u\n", + i, request->sge[0].addr, request->sge[0].length); + ib_dma_sync_single_for_device( + info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + } + + request->cqe.done = send_done; + + send_wr.next = NULL; + send_wr.wr_cqe = &request->cqe; + send_wr.sg_list = request->sge; + send_wr.num_sge = request->num_sge; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + if (has_payload) { + request->has_payload = true; + atomic_inc(&info->send_payload_pending); + } else { + request->has_payload = false; + atomic_inc(&info->send_pending); + } + + rc = ib_post_send(info->id->qp, &send_wr, &send_wr_fail); + if (rc) { + log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc); + if (has_payload) { + if (atomic_dec_and_test(&info->send_payload_pending)) + wake_up(&info->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&info->send_pending)) + wake_up(&info->wait_send_pending); + } + } else + /* Reset timer for idle connection after packet is sent */ + mod_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); + + return rc; +} + +static int smbd_post_send_sgl(struct smbd_connection *info, + struct scatterlist *sgl, int data_length, int remaining_data_length) +{ + int num_sgs; + int i, rc; + struct smbd_request *request; + struct scatterlist *sg; + + rc = smbd_create_header( + info, data_length, remaining_data_length, &request); + if (rc) + return rc; + + num_sgs = sgl ? sg_nents(sgl) : 0; + for_each_sg(sgl, sg, num_sgs, i) { + request->sge[i+1].addr = + ib_dma_map_page(info->id->device, sg_page(sg), + sg->offset, sg->length, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error( + info->id->device, request->sge[i+1].addr)) { + rc = -EIO; + request->sge[i+1].addr = 0; + goto dma_mapping_failure; + } + request->sge[i+1].length = sg->length; + request->sge[i+1].lkey = info->pd->local_dma_lkey; + request->num_sge++; + } + + rc = smbd_post_send(info, request, data_length); + if (!rc) + return 0; + +dma_mapping_failure: + for (i = 1; i < request->num_sge; i++) + if (request->sge[i].addr) + ib_dma_unmap_single(info->id->device, + request->sge[i].addr, + request->sge[i].length, + DMA_TO_DEVICE); + smbd_destroy_header(info, request); + return rc; +} + +/* + * Send a page + * page: the page to send + * offset: offset in the page to send + * size: length in the page to send + * remaining_data_length: remaining data to send in this payload + */ +static int smbd_post_send_page(struct smbd_connection *info, struct page *page, + unsigned long offset, size_t size, int remaining_data_length) +{ + struct scatterlist sgl; + + sg_init_table(&sgl, 1); + sg_set_page(&sgl, page, size, offset); + + return smbd_post_send_sgl(info, &sgl, size, remaining_data_length); +} + +/* + * Send an empty message + * Empty message is used to extend credits to peer to for keep live + * while there is no upper layer payload to send at the time + */ +static int smbd_post_send_empty(struct smbd_connection *info) +{ + info->count_send_empty++; + return smbd_post_send_sgl(info, NULL, 0, 0); +} + +/* + * Send a data buffer + * iov: the iov array describing the data buffers + * n_vec: number of iov array + * remaining_data_length: remaining data to send following this packet + * in segmented SMBD packet + */ +static int smbd_post_send_data( + struct smbd_connection *info, struct kvec *iov, int n_vec, + int remaining_data_length) +{ + int i; + u32 data_length = 0; + struct scatterlist sgl[SMBDIRECT_MAX_SGE]; + + if (n_vec > SMBDIRECT_MAX_SGE) { + cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec); + return -ENOMEM; + } + + sg_init_table(sgl, n_vec); + for (i = 0; i < n_vec; i++) { + data_length += iov[i].iov_len; + sg_set_buf(&sgl[i], iov[i].iov_base, iov[i].iov_len); + } + + return smbd_post_send_sgl(info, sgl, data_length, remaining_data_length); +} + +/* + * Post a receive request to the transport + * The remote peer can only send data when a receive request is posted + * The interaction is controlled by send/receive credit system + */ +static int smbd_post_recv( + struct smbd_connection *info, struct smbd_response *response) +{ + struct ib_recv_wr recv_wr, *recv_wr_fail = NULL; + int rc = -EIO; + + response->sge.addr = ib_dma_map_single( + info->id->device, response->packet, + info->max_receive_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(info->id->device, response->sge.addr)) + return rc; + + response->sge.length = info->max_receive_size; + response->sge.lkey = info->pd->local_dma_lkey; + + response->cqe.done = recv_done; + + recv_wr.wr_cqe = &response->cqe; + recv_wr.next = NULL; + recv_wr.sg_list = &response->sge; + recv_wr.num_sge = 1; + + rc = ib_post_recv(info->id->qp, &recv_wr, &recv_wr_fail); + if (rc) { + ib_dma_unmap_single(info->id->device, response->sge.addr, + response->sge.length, DMA_FROM_DEVICE); + + log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); + } + + return rc; +} + +/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ +static int smbd_negotiate(struct smbd_connection *info) +{ + int rc; + struct smbd_response *response = get_receive_buffer(info); + + response->type = SMBD_NEGOTIATE_RESP; + rc = smbd_post_recv(info, response); + log_rdma_event(INFO, + "smbd_post_recv rc=%d iov.addr=%llx iov.length=%x " + "iov.lkey=%x\n", + rc, response->sge.addr, + response->sge.length, response->sge.lkey); + if (rc) + return rc; + + init_completion(&info->negotiate_completion); + info->negotiate_done = false; + rc = smbd_post_send_negotiate_req(info); + if (rc) + return rc; + + rc = wait_for_completion_interruptible_timeout( + &info->negotiate_completion, SMBD_NEGOTIATE_TIMEOUT * HZ); + log_rdma_event(INFO, "wait_for_completion_timeout rc=%d\n", rc); + + if (info->negotiate_done) + return 0; + + if (rc == 0) + rc = -ETIMEDOUT; + else if (rc == -ERESTARTSYS) + rc = -EINTR; + else + rc = -ENOTCONN; + + return rc; +} + +static void put_empty_packet( + struct smbd_connection *info, struct smbd_response *response) +{ + spin_lock(&info->empty_packet_queue_lock); + list_add_tail(&response->list, &info->empty_packet_queue); + info->count_empty_packet_queue++; + spin_unlock(&info->empty_packet_queue_lock); + + queue_work(info->workqueue, &info->post_send_credits_work); +} + +/* + * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1 + * This is a queue for reassembling upper layer payload and present to upper + * layer. All the inncoming payload go to the reassembly queue, regardless of + * if reassembly is required. The uuper layer code reads from the queue for all + * incoming payloads. + * Put a received packet to the reassembly queue + * response: the packet received + * data_length: the size of payload in this packet + */ +static void enqueue_reassembly( + struct smbd_connection *info, + struct smbd_response *response, + int data_length) +{ + spin_lock(&info->reassembly_queue_lock); + list_add_tail(&response->list, &info->reassembly_queue); + info->reassembly_queue_length++; + /* + * Make sure reassembly_data_length is updated after list and + * reassembly_queue_length are updated. On the dequeue side + * reassembly_data_length is checked without a lock to determine + * if reassembly_queue_length and list is up to date + */ + virt_wmb(); + info->reassembly_data_length += data_length; + spin_unlock(&info->reassembly_queue_lock); + info->count_reassembly_queue++; + info->count_enqueue_reassembly_queue++; +} + +/* + * Get the first entry at the front of reassembly queue + * Caller is responsible for locking + * return value: the first entry if any, NULL if queue is empty + */ +static struct smbd_response *_get_first_reassembly(struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + + if (!list_empty(&info->reassembly_queue)) { + ret = list_first_entry( + &info->reassembly_queue, + struct smbd_response, list); + } + return ret; +} + +static struct smbd_response *get_empty_queue_buffer( + struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&info->empty_packet_queue_lock, flags); + if (!list_empty(&info->empty_packet_queue)) { + ret = list_first_entry( + &info->empty_packet_queue, + struct smbd_response, list); + list_del(&ret->list); + info->count_empty_packet_queue--; + } + spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags); + + return ret; +} + +/* + * Get a receive buffer + * For each remote send, we need to post a receive. The receive buffers are + * pre-allocated in advance. + * return value: the receive buffer, NULL if none is available + */ +static struct smbd_response *get_receive_buffer(struct smbd_connection *info) +{ + struct smbd_response *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&info->receive_queue_lock, flags); + if (!list_empty(&info->receive_queue)) { + ret = list_first_entry( + &info->receive_queue, + struct smbd_response, list); + list_del(&ret->list); + info->count_receive_queue--; + info->count_get_receive_buffer++; + } + spin_unlock_irqrestore(&info->receive_queue_lock, flags); + + return ret; +} + +/* + * Return a receive buffer + * Upon returning of a receive buffer, we can post new receive and extend + * more receive credits to remote peer. This is done immediately after a + * receive buffer is returned. + */ +static void put_receive_buffer( + struct smbd_connection *info, struct smbd_response *response) +{ + unsigned long flags; + + ib_dma_unmap_single(info->id->device, response->sge.addr, + response->sge.length, DMA_FROM_DEVICE); + + spin_lock_irqsave(&info->receive_queue_lock, flags); + list_add_tail(&response->list, &info->receive_queue); + info->count_receive_queue++; + info->count_put_receive_buffer++; + spin_unlock_irqrestore(&info->receive_queue_lock, flags); + + queue_work(info->workqueue, &info->post_send_credits_work); +} + +/* Preallocate all receive buffer on transport establishment */ +static int allocate_receive_buffers(struct smbd_connection *info, int num_buf) +{ + int i; + struct smbd_response *response; + + INIT_LIST_HEAD(&info->reassembly_queue); + spin_lock_init(&info->reassembly_queue_lock); + info->reassembly_data_length = 0; + info->reassembly_queue_length = 0; + + INIT_LIST_HEAD(&info->receive_queue); + spin_lock_init(&info->receive_queue_lock); + info->count_receive_queue = 0; + + INIT_LIST_HEAD(&info->empty_packet_queue); + spin_lock_init(&info->empty_packet_queue_lock); + info->count_empty_packet_queue = 0; + + init_waitqueue_head(&info->wait_receive_queues); + + for (i = 0; i < num_buf; i++) { + response = mempool_alloc(info->response_mempool, GFP_KERNEL); + if (!response) + goto allocate_failed; + + response->info = info; + list_add_tail(&response->list, &info->receive_queue); + info->count_receive_queue++; + } + + return 0; + +allocate_failed: + while (!list_empty(&info->receive_queue)) { + response = list_first_entry( + &info->receive_queue, + struct smbd_response, list); + list_del(&response->list); + info->count_receive_queue--; + + mempool_free(response, info->response_mempool); + } + return -ENOMEM; +} + +static void destroy_receive_buffers(struct smbd_connection *info) +{ + struct smbd_response *response; + + while ((response = get_receive_buffer(info))) + mempool_free(response, info->response_mempool); + + while ((response = get_empty_queue_buffer(info))) + mempool_free(response, info->response_mempool); +} + +/* + * Check and send an immediate or keep alive packet + * The condition to send those packets are defined in [MS-SMBD] 3.1.1.1 + * Connection.KeepaliveRequested and Connection.SendImmediate + * The idea is to extend credits to server as soon as it becomes available + */ +static void send_immediate_work(struct work_struct *work) +{ + struct smbd_connection *info = container_of( + work, struct smbd_connection, + send_immediate_work.work); + + if (info->keep_alive_requested == KEEP_ALIVE_PENDING || + info->send_immediate) { + log_keep_alive(INFO, "send an empty message\n"); + smbd_post_send_empty(info); + } +} + +/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ +static void idle_connection_timer(struct work_struct *work) +{ + struct smbd_connection *info = container_of( + work, struct smbd_connection, + idle_timer_work.work); + + if (info->keep_alive_requested != KEEP_ALIVE_NONE) { + log_keep_alive(ERR, + "error status info->keep_alive_requested=%d\n", + info->keep_alive_requested); + smbd_disconnect_rdma_connection(info); + return; + } + + log_keep_alive(INFO, "about to send an empty idle message\n"); + smbd_post_send_empty(info); + + /* Setup the next idle timeout work */ + queue_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); +} + +/* Destroy this SMBD connection, called from upper layer */ +void smbd_destroy(struct smbd_connection *info) +{ + log_rdma_event(INFO, "destroying rdma session\n"); + + /* Kick off the disconnection process */ + smbd_disconnect_rdma_connection(info); + + log_rdma_event(INFO, "wait for transport being destroyed\n"); + wait_event(info->wait_destroy, + info->transport_status == SMBD_DESTROYED); + + destroy_workqueue(info->workqueue); + kfree(info); +} + +/* + * Reconnect this SMBD connection, called from upper layer + * return value: 0 on success, or actual error code + */ +int smbd_reconnect(struct TCP_Server_Info *server) +{ + log_rdma_event(INFO, "reconnecting rdma session\n"); + + if (!server->smbd_conn) { + log_rdma_event(ERR, "rdma session already destroyed\n"); + return -EINVAL; + } + + /* + * This is possible if transport is disconnected and we haven't received + * notification from RDMA, but upper layer has detected timeout + */ + if (server->smbd_conn->transport_status == SMBD_CONNECTED) { + log_rdma_event(INFO, "disconnecting transport\n"); + smbd_disconnect_rdma_connection(server->smbd_conn); + } + + /* wait until the transport is destroyed */ + wait_event(server->smbd_conn->wait_destroy, + server->smbd_conn->transport_status == SMBD_DESTROYED); + + destroy_workqueue(server->smbd_conn->workqueue); + kfree(server->smbd_conn); + + log_rdma_event(INFO, "creating rdma session\n"); + server->smbd_conn = smbd_get_connection( + server, (struct sockaddr *) &server->dstaddr); + + return server->smbd_conn ? 0 : -ENOENT; +} + +static void destroy_caches_and_workqueue(struct smbd_connection *info) +{ + destroy_receive_buffers(info); + destroy_workqueue(info->workqueue); + mempool_destroy(info->response_mempool); + kmem_cache_destroy(info->response_cache); + mempool_destroy(info->request_mempool); + kmem_cache_destroy(info->request_cache); +} + +#define MAX_NAME_LEN 80 +static int allocate_caches_and_workqueue(struct smbd_connection *info) +{ + char name[MAX_NAME_LEN]; + int rc; + + snprintf(name, MAX_NAME_LEN, "smbd_request_%p", info); + info->request_cache = + kmem_cache_create( + name, + sizeof(struct smbd_request) + + sizeof(struct smbd_data_transfer), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!info->request_cache) + return -ENOMEM; + + info->request_mempool = + mempool_create(info->send_credit_target, mempool_alloc_slab, + mempool_free_slab, info->request_cache); + if (!info->request_mempool) + goto out1; + + snprintf(name, MAX_NAME_LEN, "smbd_response_%p", info); + info->response_cache = + kmem_cache_create( + name, + sizeof(struct smbd_response) + + info->max_receive_size, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!info->response_cache) + goto out2; + + info->response_mempool = + mempool_create(info->receive_credit_max, mempool_alloc_slab, + mempool_free_slab, info->response_cache); + if (!info->response_mempool) + goto out3; + + snprintf(name, MAX_NAME_LEN, "smbd_%p", info); + info->workqueue = create_workqueue(name); + if (!info->workqueue) + goto out4; + + rc = allocate_receive_buffers(info, info->receive_credit_max); + if (rc) { + log_rdma_event(ERR, "failed to allocate receive buffers\n"); + goto out5; + } + + return 0; + +out5: + destroy_workqueue(info->workqueue); +out4: + mempool_destroy(info->response_mempool); +out3: + kmem_cache_destroy(info->response_cache); +out2: + mempool_destroy(info->request_mempool); +out1: + kmem_cache_destroy(info->request_cache); + return -ENOMEM; +} + +/* Create a SMBD connection, called by upper layer */ +static struct smbd_connection *_smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port) +{ + int rc; + struct smbd_connection *info; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; + struct ib_port_immutable port_immutable; + u32 ird_ord_hdr[2]; + + info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL); + if (!info) + return NULL; + + info->transport_status = SMBD_CONNECTING; + rc = smbd_ia_open(info, dstaddr, port); + if (rc) { + log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); + goto create_id_failed; + } + + if (smbd_send_credit_target > info->id->device->attrs.max_cqe || + smbd_send_credit_target > info->id->device->attrs.max_qp_wr) { + log_rdma_event(ERR, + "consider lowering send_credit_target = %d. " + "Possible CQE overrun, device " + "reporting max_cpe %d max_qp_wr %d\n", + smbd_send_credit_target, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); + goto config_failed; + } + + if (smbd_receive_credit_max > info->id->device->attrs.max_cqe || + smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) { + log_rdma_event(ERR, + "consider lowering receive_credit_max = %d. " + "Possible CQE overrun, device " + "reporting max_cpe %d max_qp_wr %d\n", + smbd_receive_credit_max, + info->id->device->attrs.max_cqe, + info->id->device->attrs.max_qp_wr); + goto config_failed; + } + + info->receive_credit_max = smbd_receive_credit_max; + info->send_credit_target = smbd_send_credit_target; + info->max_send_size = smbd_max_send_size; + info->max_fragmented_recv_size = smbd_max_fragmented_recv_size; + info->max_receive_size = smbd_max_receive_size; + info->keep_alive_interval = smbd_keep_alive_interval; + + if (info->id->device->attrs.max_sge < SMBDIRECT_MAX_SGE) { + log_rdma_event(ERR, "warning: device max_sge = %d too small\n", + info->id->device->attrs.max_sge); + log_rdma_event(ERR, "Queue Pair creation may fail\n"); + } + + info->send_cq = NULL; + info->recv_cq = NULL; + info->send_cq = ib_alloc_cq(info->id->device, info, + info->send_credit_target, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(info->send_cq)) { + info->send_cq = NULL; + goto alloc_cq_failed; + } + + info->recv_cq = ib_alloc_cq(info->id->device, info, + info->receive_credit_max, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(info->recv_cq)) { + info->recv_cq = NULL; + goto alloc_cq_failed; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.event_handler = smbd_qp_async_error_upcall; + qp_attr.qp_context = info; + qp_attr.cap.max_send_wr = info->send_credit_target; + qp_attr.cap.max_recv_wr = info->receive_credit_max; + qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_SGE; + qp_attr.cap.max_inline_data = 0; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = info->send_cq; + qp_attr.recv_cq = info->recv_cq; + qp_attr.port_num = ~0; + + rc = rdma_create_qp(info->id, info->pd, &qp_attr); + if (rc) { + log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); + goto create_qp_failed; + } + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = 0; + + conn_param.responder_resources = + info->id->device->attrs.max_qp_rd_atom + < SMBD_CM_RESPONDER_RESOURCES ? + info->id->device->attrs.max_qp_rd_atom : + SMBD_CM_RESPONDER_RESOURCES; + info->responder_resources = conn_param.responder_resources; + log_rdma_mr(INFO, "responder_resources=%d\n", + info->responder_resources); + + /* Need to send IRD/ORD in private data for iWARP */ + info->id->device->get_port_immutable( + info->id->device, info->id->port_num, &port_immutable); + if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { + ird_ord_hdr[0] = info->responder_resources; + ird_ord_hdr[1] = 1; + conn_param.private_data = ird_ord_hdr; + conn_param.private_data_len = sizeof(ird_ord_hdr); + } else { + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + } + + conn_param.retry_count = SMBD_CM_RETRY; + conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; + conn_param.flow_control = 0; + init_waitqueue_head(&info->wait_destroy); + + log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", + &addr_in->sin_addr, port); + + init_waitqueue_head(&info->conn_wait); + rc = rdma_connect(info->id, &conn_param); + if (rc) { + log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); + goto rdma_connect_failed; + } + + wait_event_interruptible( + info->conn_wait, info->transport_status != SMBD_CONNECTING); + + if (info->transport_status != SMBD_CONNECTED) { + log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); + goto rdma_connect_failed; + } + + log_rdma_event(INFO, "rdma_connect connected\n"); + + rc = allocate_caches_and_workqueue(info); + if (rc) { + log_rdma_event(ERR, "cache allocation failed\n"); + goto allocate_cache_failed; + } + + init_waitqueue_head(&info->wait_send_queue); + init_waitqueue_head(&info->wait_reassembly_queue); + + INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer); + INIT_DELAYED_WORK(&info->send_immediate_work, send_immediate_work); + queue_delayed_work(info->workqueue, &info->idle_timer_work, + info->keep_alive_interval*HZ); + + init_waitqueue_head(&info->wait_smbd_send_pending); + info->smbd_send_pending = 0; + + init_waitqueue_head(&info->wait_smbd_recv_pending); + info->smbd_recv_pending = 0; + + init_waitqueue_head(&info->wait_send_pending); + atomic_set(&info->send_pending, 0); + + init_waitqueue_head(&info->wait_send_payload_pending); + atomic_set(&info->send_payload_pending, 0); + + INIT_WORK(&info->disconnect_work, smbd_disconnect_rdma_work); + INIT_WORK(&info->destroy_work, smbd_destroy_rdma_work); + INIT_WORK(&info->recv_done_work, smbd_recv_done_work); + INIT_WORK(&info->post_send_credits_work, smbd_post_send_credits); + info->new_credits_offered = 0; + spin_lock_init(&info->lock_new_credits_offered); + + rc = smbd_negotiate(info); + if (rc) { + log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); + goto negotiation_failed; + } + + rc = allocate_mr_list(info); + if (rc) { + log_rdma_mr(ERR, "memory registration allocation failed\n"); + goto allocate_mr_failed; + } + + return info; + +allocate_mr_failed: + /* At this point, need to a full transport shutdown */ + smbd_destroy(info); + return NULL; + +negotiation_failed: + cancel_delayed_work_sync(&info->idle_timer_work); + destroy_caches_and_workqueue(info); + info->transport_status = SMBD_NEGOTIATE_FAILED; + init_waitqueue_head(&info->conn_wait); + rdma_disconnect(info->id); + wait_event(info->conn_wait, + info->transport_status == SMBD_DISCONNECTED); + +allocate_cache_failed: +rdma_connect_failed: + rdma_destroy_qp(info->id); + +create_qp_failed: +alloc_cq_failed: + if (info->send_cq) + ib_free_cq(info->send_cq); + if (info->recv_cq) + ib_free_cq(info->recv_cq); + +config_failed: + ib_dealloc_pd(info->pd); + rdma_destroy_id(info->id); + +create_id_failed: + kfree(info); + return NULL; +} + +struct smbd_connection *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr) +{ + struct smbd_connection *ret; + int port = SMBD_PORT; + +try_again: + ret = _smbd_get_connection(server, dstaddr, port); + + /* Try SMB_PORT if SMBD_PORT doesn't work */ + if (!ret && port == SMBD_PORT) { + port = SMB_PORT; + goto try_again; + } + return ret; +} + +/* + * Receive data from receive reassembly queue + * All the incoming data packets are placed in reassembly queue + * buf: the buffer to read data into + * size: the length of data to read + * return value: actual data read + * Note: this implementation copies the data from reassebmly queue to receive + * buffers used by upper layer. This is not the optimal code path. A better way + * to do it is to not have upper layer allocate its receive buffers but rather + * borrow the buffer from reassembly queue, and return it after data is + * consumed. But this will require more changes to upper layer code, and also + * need to consider packet boundaries while they still being reassembled. + */ +static int smbd_recv_buf(struct smbd_connection *info, char *buf, + unsigned int size) +{ + struct smbd_response *response; + struct smbd_data_transfer *data_transfer; + int to_copy, to_read, data_read, offset; + u32 data_length, remaining_data_length, data_offset; + int rc; + +again: + if (info->transport_status != SMBD_CONNECTED) { + log_read(ERR, "disconnected\n"); + return -ENODEV; + } + + /* + * No need to hold the reassembly queue lock all the time as we are + * the only one reading from the front of the queue. The transport + * may add more entries to the back of the queue at the same time + */ + log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size, + info->reassembly_data_length); + if (info->reassembly_data_length >= size) { + int queue_length; + int queue_removed = 0; + + /* + * Need to make sure reassembly_data_length is read before + * reading reassembly_queue_length and calling + * _get_first_reassembly. This call is lock free + * as we never read at the end of the queue which are being + * updated in SOFTIRQ as more data is received + */ + virt_rmb(); + queue_length = info->reassembly_queue_length; + data_read = 0; + to_read = size; + offset = info->first_entry_offset; + while (data_read < size) { + response = _get_first_reassembly(info); + data_transfer = smbd_response_payload(response); + data_length = le32_to_cpu(data_transfer->data_length); + remaining_data_length = + le32_to_cpu( + data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); + + /* + * The upper layer expects RFC1002 length at the + * beginning of the payload. Return it to indicate + * the total length of the packet. This minimize the + * change to upper layer packet processing logic. This + * will be eventually remove when an intermediate + * transport layer is added + */ + if (response->first_segment && size == 4) { + unsigned int rfc1002_len = + data_length + remaining_data_length; + *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + data_read = 4; + response->first_segment = false; + log_read(INFO, "returning rfc1002 length %d\n", + rfc1002_len); + goto read_rfc1002_done; + } + + to_copy = min_t(int, data_length - offset, to_read); + memcpy( + buf + data_read, + (char *)data_transfer + data_offset + offset, + to_copy); + + /* move on to the next buffer? */ + if (to_copy == data_length - offset) { + queue_length--; + /* + * No need to lock if we are not at the + * end of the queue + */ + if (!queue_length) + spin_lock_irq( + &info->reassembly_queue_lock); + list_del(&response->list); + queue_removed++; + if (!queue_length) + spin_unlock_irq( + &info->reassembly_queue_lock); + + info->count_reassembly_queue--; + info->count_dequeue_reassembly_queue++; + put_receive_buffer(info, response); + offset = 0; + log_read(INFO, "put_receive_buffer offset=0\n"); + } else + offset += to_copy; + + to_read -= to_copy; + data_read += to_copy; + + log_read(INFO, "_get_first_reassembly memcpy %d bytes " + "data_transfer_length-offset=%d after that " + "to_read=%d data_read=%d offset=%d\n", + to_copy, data_length - offset, + to_read, data_read, offset); + } + + spin_lock_irq(&info->reassembly_queue_lock); + info->reassembly_data_length -= data_read; + info->reassembly_queue_length -= queue_removed; + spin_unlock_irq(&info->reassembly_queue_lock); + + info->first_entry_offset = offset; + log_read(INFO, "returning to thread data_read=%d " + "reassembly_data_length=%d first_entry_offset=%d\n", + data_read, info->reassembly_data_length, + info->first_entry_offset); +read_rfc1002_done: + return data_read; + } + + log_read(INFO, "wait_event on more data\n"); + rc = wait_event_interruptible( + info->wait_reassembly_queue, + info->reassembly_data_length >= size || + info->transport_status != SMBD_CONNECTED); + /* Don't return any data if interrupted */ + if (rc) + return -ENODEV; + + goto again; +} + +/* + * Receive a page from receive reassembly queue + * page: the page to read data into + * to_read: the length of data to read + * return value: actual data read + */ +static int smbd_recv_page(struct smbd_connection *info, + struct page *page, unsigned int to_read) +{ + int ret; + char *to_address; + + /* make sure we have the page ready for read */ + ret = wait_event_interruptible( + info->wait_reassembly_queue, + info->reassembly_data_length >= to_read || + info->transport_status != SMBD_CONNECTED); + if (ret) + return 0; + + /* now we can read from reassembly queue and not sleep */ + to_address = kmap_atomic(page); + + log_read(INFO, "reading from page=%p address=%p to_read=%d\n", + page, to_address, to_read); + + ret = smbd_recv_buf(info, to_address, to_read); + kunmap_atomic(to_address); + + return ret; +} + +/* + * Receive data from transport + * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC + * return: total bytes read, or 0. SMB Direct will not do partial read. + */ +int smbd_recv(struct smbd_connection *info, struct msghdr *msg) +{ + char *buf; + struct page *page; + unsigned int to_read; + int rc; + + info->smbd_recv_pending++; + + switch (msg->msg_iter.type) { + case READ | ITER_KVEC: + buf = msg->msg_iter.kvec->iov_base; + to_read = msg->msg_iter.kvec->iov_len; + rc = smbd_recv_buf(info, buf, to_read); + break; + + case READ | ITER_BVEC: + page = msg->msg_iter.bvec->bv_page; + to_read = msg->msg_iter.bvec->bv_len; + rc = smbd_recv_page(info, page, to_read); + break; + + default: + /* It's a bug in upper layer to get there */ + cifs_dbg(VFS, "CIFS: invalid msg type %d\n", + msg->msg_iter.type); + rc = -EIO; + } + + info->smbd_recv_pending--; + wake_up(&info->wait_smbd_recv_pending); + + /* SMBDirect will read it all or nothing */ + if (rc > 0) + msg->msg_iter.count = 0; + return rc; +} + +/* + * Send data to transport + * Each rqst is transported as a SMBDirect payload + * rqst: the data to write + * return value: 0 if successfully write, otherwise error code + */ +int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) +{ + struct kvec vec; + int nvecs; + int size; + int buflen = 0, remaining_data_length; + int start, i, j; + int max_iov_size = + info->max_send_size - sizeof(struct smbd_data_transfer); + struct kvec iov[SMBDIRECT_MAX_SGE]; + int rc; + + info->smbd_send_pending++; + if (info->transport_status != SMBD_CONNECTED) { + rc = -ENODEV; + goto done; + } + + /* + * This usually means a configuration error + * We use RDMA read/write for packet size > rdma_readwrite_threshold + * as long as it's properly configured we should never get into this + * situation + */ + if (rqst->rq_nvec + rqst->rq_npages > SMBDIRECT_MAX_SGE) { + log_write(ERR, "maximum send segment %x exceeding %x\n", + rqst->rq_nvec + rqst->rq_npages, SMBDIRECT_MAX_SGE); + rc = -EINVAL; + goto done; + } + + /* + * Remove the RFC1002 length defined in MS-SMB2 section 2.1 + * It is used only for TCP transport + * In future we may want to add a transport layer under protocol + * layer so this will only be issued to TCP transport + */ + iov[0].iov_base = (char *)rqst->rq_iov[0].iov_base + 4; + iov[0].iov_len = rqst->rq_iov[0].iov_len - 4; + buflen += iov[0].iov_len; + + /* total up iov array first */ + for (i = 1; i < rqst->rq_nvec; i++) { + iov[i].iov_base = rqst->rq_iov[i].iov_base; + iov[i].iov_len = rqst->rq_iov[i].iov_len; + buflen += iov[i].iov_len; + } + + /* add in the page array if there is one */ + if (rqst->rq_npages) { + buflen += rqst->rq_pagesz * (rqst->rq_npages - 1); + buflen += rqst->rq_tailsz; + } + + if (buflen + sizeof(struct smbd_data_transfer) > + info->max_fragmented_send_size) { + log_write(ERR, "payload size %d > max size %d\n", + buflen, info->max_fragmented_send_size); + rc = -EINVAL; + goto done; + } + + remaining_data_length = buflen; + + log_write(INFO, "rqst->rq_nvec=%d rqst->rq_npages=%d rq_pagesz=%d " + "rq_tailsz=%d buflen=%d\n", + rqst->rq_nvec, rqst->rq_npages, rqst->rq_pagesz, + rqst->rq_tailsz, buflen); + + start = i = iov[0].iov_len ? 0 : 1; + buflen = 0; + while (true) { + buflen += iov[i].iov_len; + if (buflen > max_iov_size) { + if (i > start) { + remaining_data_length -= + (buflen-iov[i].iov_len); + log_write(INFO, "sending iov[] from start=%d " + "i=%d nvecs=%d " + "remaining_data_length=%d\n", + start, i, i-start, + remaining_data_length); + rc = smbd_post_send_data( + info, &iov[start], i-start, + remaining_data_length); + if (rc) + goto done; + } else { + /* iov[start] is too big, break it */ + nvecs = (buflen+max_iov_size-1)/max_iov_size; + log_write(INFO, "iov[%d] iov_base=%p buflen=%d" + " break to %d vectors\n", + start, iov[start].iov_base, + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + vec.iov_base = + (char *)iov[start].iov_base + + j*max_iov_size; + vec.iov_len = max_iov_size; + if (j == nvecs-1) + vec.iov_len = + buflen - + max_iov_size*(nvecs-1); + remaining_data_length -= vec.iov_len; + log_write(INFO, + "sending vec j=%d iov_base=%p" + " iov_len=%zu " + "remaining_data_length=%d\n", + j, vec.iov_base, vec.iov_len, + remaining_data_length); + rc = smbd_post_send_data( + info, &vec, 1, + remaining_data_length); + if (rc) + goto done; + } + i++; + } + start = i; + buflen = 0; + } else { + i++; + if (i == rqst->rq_nvec) { + /* send out all remaining vecs */ + remaining_data_length -= buflen; + log_write(INFO, + "sending iov[] from start=%d i=%d " + "nvecs=%d remaining_data_length=%d\n", + start, i, i-start, + remaining_data_length); + rc = smbd_post_send_data(info, &iov[start], + i-start, remaining_data_length); + if (rc) + goto done; + break; + } + } + log_write(INFO, "looping i=%d buflen=%d\n", i, buflen); + } + + /* now sending pages if there are any */ + for (i = 0; i < rqst->rq_npages; i++) { + buflen = (i == rqst->rq_npages-1) ? + rqst->rq_tailsz : rqst->rq_pagesz; + nvecs = (buflen + max_iov_size - 1) / max_iov_size; + log_write(INFO, "sending pages buflen=%d nvecs=%d\n", + buflen, nvecs); + for (j = 0; j < nvecs; j++) { + size = max_iov_size; + if (j == nvecs-1) + size = buflen - j*max_iov_size; + remaining_data_length -= size; + log_write(INFO, "sending pages i=%d offset=%d size=%d" + " remaining_data_length=%d\n", + i, j*max_iov_size, size, remaining_data_length); + rc = smbd_post_send_page( + info, rqst->rq_pages[i], j*max_iov_size, + size, remaining_data_length); + if (rc) + goto done; + } + } + +done: + /* + * As an optimization, we don't wait for individual I/O to finish + * before sending the next one. + * Send them all and wait for pending send count to get to 0 + * that means all the I/Os have been out and we are good to return + */ + + wait_event(info->wait_send_payload_pending, + atomic_read(&info->send_payload_pending) == 0); + + info->smbd_send_pending--; + wake_up(&info->wait_smbd_send_pending); + + return rc; +} + +static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_mr *mr; + struct ib_cqe *cqe; + + if (wc->status) { + log_rdma_mr(ERR, "status=%d\n", wc->status); + cqe = wc->wr_cqe; + mr = container_of(cqe, struct smbd_mr, cqe); + smbd_disconnect_rdma_connection(mr->conn); + } +} + +/* + * The work queue function that recovers MRs + * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used + * again. Both calls are slow, so finish them in a workqueue. This will not + * block I/O path. + * There is one workqueue that recovers MRs, there is no need to lock as the + * I/O requests calling smbd_register_mr will never update the links in the + * mr_list. + */ +static void smbd_mr_recovery_work(struct work_struct *work) +{ + struct smbd_connection *info = + container_of(work, struct smbd_connection, mr_recovery_work); + struct smbd_mr *smbdirect_mr; + int rc; + + list_for_each_entry(smbdirect_mr, &info->mr_list, list) { + if (smbdirect_mr->state == MR_INVALIDATED || + smbdirect_mr->state == MR_ERROR) { + + if (smbdirect_mr->state == MR_INVALIDATED) { + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + smbdirect_mr->state = MR_READY; + } else if (smbdirect_mr->state == MR_ERROR) { + + /* recover this MR entry */ + rc = ib_dereg_mr(smbdirect_mr->mr); + if (rc) { + log_rdma_mr(ERR, + "ib_dereg_mr faield rc=%x\n", + rc); + smbd_disconnect_rdma_connection(info); + } + + smbdirect_mr->mr = ib_alloc_mr( + info->pd, info->mr_type, + info->max_frmr_depth); + if (IS_ERR(smbdirect_mr->mr)) { + log_rdma_mr(ERR, + "ib_alloc_mr failed mr_type=%x " + "max_frmr_depth=%x\n", + info->mr_type, + info->max_frmr_depth); + smbd_disconnect_rdma_connection(info); + } + + smbdirect_mr->state = MR_READY; + } + /* smbdirect_mr->state is updated by this function + * and is read and updated by I/O issuing CPUs trying + * to get a MR, the call to atomic_inc_return + * implicates a memory barrier and guarantees this + * value is updated before waking up any calls to + * get_mr() from the I/O issuing CPUs + */ + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); + } + } +} + +static void destroy_mr_list(struct smbd_connection *info) +{ + struct smbd_mr *mr, *tmp; + + cancel_work_sync(&info->mr_recovery_work); + list_for_each_entry_safe(mr, tmp, &info->mr_list, list) { + if (mr->state == MR_INVALIDATED) + ib_dma_unmap_sg(info->id->device, mr->sgl, + mr->sgl_count, mr->dir); + ib_dereg_mr(mr->mr); + kfree(mr->sgl); + kfree(mr); + } +} + +/* + * Allocate MRs used for RDMA read/write + * The number of MRs will not exceed hardware capability in responder_resources + * All MRs are kept in mr_list. The MR can be recovered after it's used + * Recovery is done in smbd_mr_recovery_work. The content of list entry changes + * as MRs are used and recovered for I/O, but the list links will not change + */ +static int allocate_mr_list(struct smbd_connection *info) +{ + int i; + struct smbd_mr *smbdirect_mr, *tmp; + + INIT_LIST_HEAD(&info->mr_list); + init_waitqueue_head(&info->wait_mr); + spin_lock_init(&info->mr_list_lock); + atomic_set(&info->mr_ready_count, 0); + atomic_set(&info->mr_used_count, 0); + init_waitqueue_head(&info->wait_for_mr_cleanup); + /* Allocate more MRs (2x) than hardware responder_resources */ + for (i = 0; i < info->responder_resources * 2; i++) { + smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); + if (!smbdirect_mr) + goto out; + smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type, + info->max_frmr_depth); + if (IS_ERR(smbdirect_mr->mr)) { + log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x " + "max_frmr_depth=%x\n", + info->mr_type, info->max_frmr_depth); + goto out; + } + smbdirect_mr->sgl = kcalloc( + info->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!smbdirect_mr->sgl) { + log_rdma_mr(ERR, "failed to allocate sgl\n"); + ib_dereg_mr(smbdirect_mr->mr); + goto out; + } + smbdirect_mr->state = MR_READY; + smbdirect_mr->conn = info; + + list_add_tail(&smbdirect_mr->list, &info->mr_list); + atomic_inc(&info->mr_ready_count); + } + INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work); + return 0; + +out: + kfree(smbdirect_mr); + + list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) { + ib_dereg_mr(smbdirect_mr->mr); + kfree(smbdirect_mr->sgl); + kfree(smbdirect_mr); + } + return -ENOMEM; +} + +/* + * Get a MR from mr_list. This function waits until there is at least one + * MR available in the list. It may access the list while the + * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock + * as they never modify the same places. However, there may be several CPUs + * issueing I/O trying to get MR at the same time, mr_list_lock is used to + * protect this situation. + */ +static struct smbd_mr *get_mr(struct smbd_connection *info) +{ + struct smbd_mr *ret; + int rc; +again: + rc = wait_event_interruptible(info->wait_mr, + atomic_read(&info->mr_ready_count) || + info->transport_status != SMBD_CONNECTED); + if (rc) { + log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); + return NULL; + } + + if (info->transport_status != SMBD_CONNECTED) { + log_rdma_mr(ERR, "info->transport_status=%x\n", + info->transport_status); + return NULL; + } + + spin_lock(&info->mr_list_lock); + list_for_each_entry(ret, &info->mr_list, list) { + if (ret->state == MR_READY) { + ret->state = MR_REGISTERED; + spin_unlock(&info->mr_list_lock); + atomic_dec(&info->mr_ready_count); + atomic_inc(&info->mr_used_count); + return ret; + } + } + + spin_unlock(&info->mr_list_lock); + /* + * It is possible that we could fail to get MR because other processes may + * try to acquire a MR at the same time. If this is the case, retry it. + */ + goto again; +} + +/* + * Register memory for RDMA read/write + * pages[]: the list of pages to register memory with + * num_pages: the number of pages to register + * tailsz: if non-zero, the bytes to register in the last page + * writing: true if this is a RDMA write (SMB read), false for RDMA read + * need_invalidate: true if this MR needs to be locally invalidated after I/O + * return value: the MR registered, NULL if failed. + */ +struct smbd_mr *smbd_register_mr( + struct smbd_connection *info, struct page *pages[], int num_pages, + int tailsz, bool writing, bool need_invalidate) +{ + struct smbd_mr *smbdirect_mr; + int rc, i; + enum dma_data_direction dir; + struct ib_reg_wr *reg_wr; + struct ib_send_wr *bad_wr; + + if (num_pages > info->max_frmr_depth) { + log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", + num_pages, info->max_frmr_depth); + return NULL; + } + + smbdirect_mr = get_mr(info); + if (!smbdirect_mr) { + log_rdma_mr(ERR, "get_mr returning NULL\n"); + return NULL; + } + smbdirect_mr->need_invalidate = need_invalidate; + smbdirect_mr->sgl_count = num_pages; + sg_init_table(smbdirect_mr->sgl, num_pages); + + for (i = 0; i < num_pages - 1; i++) + sg_set_page(&smbdirect_mr->sgl[i], pages[i], PAGE_SIZE, 0); + + sg_set_page(&smbdirect_mr->sgl[i], pages[i], + tailsz ? tailsz : PAGE_SIZE, 0); + + dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + smbdirect_mr->dir = dir; + rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgl, num_pages, dir); + if (!rc) { + log_rdma_mr(INFO, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", + num_pages, dir, rc); + goto dma_map_error; + } + + rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgl, num_pages, + NULL, PAGE_SIZE); + if (rc != num_pages) { + log_rdma_mr(INFO, + "ib_map_mr_sg failed rc = %x num_pages = %x\n", + rc, num_pages); + goto map_mr_error; + } + + ib_update_fast_reg_key(smbdirect_mr->mr, + ib_inc_rkey(smbdirect_mr->mr->rkey)); + reg_wr = &smbdirect_mr->wr; + reg_wr->wr.opcode = IB_WR_REG_MR; + smbdirect_mr->cqe.done = register_mr_done; + reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = IB_SEND_SIGNALED; + reg_wr->mr = smbdirect_mr->mr; + reg_wr->key = smbdirect_mr->mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + + /* + * There is no need for waiting for complemtion on ib_post_send + * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution + * on the next ib_post_send when we actaully send I/O to remote peer + */ + rc = ib_post_send(info->id->qp, ®_wr->wr, &bad_wr); + if (!rc) + return smbdirect_mr; + + log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", + rc, reg_wr->key); + + /* If all failed, attempt to recover this MR by setting it MR_ERROR*/ +map_mr_error: + ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, smbdirect_mr->dir); + +dma_map_error: + smbdirect_mr->state = MR_ERROR; + if (atomic_dec_and_test(&info->mr_used_count)) + wake_up(&info->wait_for_mr_cleanup); + + return NULL; +} + +static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbd_mr *smbdirect_mr; + struct ib_cqe *cqe; + + cqe = wc->wr_cqe; + smbdirect_mr = container_of(cqe, struct smbd_mr, cqe); + smbdirect_mr->state = MR_INVALIDATED; + if (wc->status != IB_WC_SUCCESS) { + log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); + smbdirect_mr->state = MR_ERROR; + } + complete(&smbdirect_mr->invalidate_done); +} + +/* + * Deregister a MR after I/O is done + * This function may wait if remote invalidation is not used + * and we have to locally invalidate the buffer to prevent data is being + * modified by remote peer after upper layer consumes it + */ +int smbd_deregister_mr(struct smbd_mr *smbdirect_mr) +{ + struct ib_send_wr *wr, *bad_wr; + struct smbd_connection *info = smbdirect_mr->conn; + int rc = 0; + + if (smbdirect_mr->need_invalidate) { + /* Need to finish local invalidation before returning */ + wr = &smbdirect_mr->inv_wr; + wr->opcode = IB_WR_LOCAL_INV; + smbdirect_mr->cqe.done = local_inv_done; + wr->wr_cqe = &smbdirect_mr->cqe; + wr->num_sge = 0; + wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; + wr->send_flags = IB_SEND_SIGNALED; + + init_completion(&smbdirect_mr->invalidate_done); + rc = ib_post_send(info->id->qp, wr, &bad_wr); + if (rc) { + log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); + smbd_disconnect_rdma_connection(info); + goto done; + } + wait_for_completion(&smbdirect_mr->invalidate_done); + smbdirect_mr->need_invalidate = false; + } else + /* + * For remote invalidation, just set it to MR_INVALIDATED + * and defer to mr_recovery_work to recover the MR for next use + */ + smbdirect_mr->state = MR_INVALIDATED; + + /* + * Schedule the work to do MR recovery for future I/Os + * MR recovery is slow and we don't want it to block the current I/O + */ + queue_work(info->workqueue, &info->mr_recovery_work); + +done: + if (atomic_dec_and_test(&info->mr_used_count)) + wake_up(&info->wait_for_mr_cleanup); + + return rc; +} diff --git a/fs/cifs/smbdirect.h b/fs/cifs/smbdirect.h new file mode 100644 index 000000000000..f9038daea194 --- /dev/null +++ b/fs/cifs/smbdirect.h @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2017, Microsoft Corporation. + * + * Author(s): Long Li <longli@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ +#ifndef _SMBDIRECT_H +#define _SMBDIRECT_H + +#ifdef CONFIG_CIFS_SMB_DIRECT +#define cifs_rdma_enabled(server) ((server)->rdma) + +#include "cifsglob.h" +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <linux/mempool.h> + +extern int rdma_readwrite_threshold; +extern int smbd_max_frmr_depth; +extern int smbd_keep_alive_interval; +extern int smbd_max_receive_size; +extern int smbd_max_fragmented_recv_size; +extern int smbd_max_send_size; +extern int smbd_send_credit_target; +extern int smbd_receive_credit_max; + +enum keep_alive_status { + KEEP_ALIVE_NONE, + KEEP_ALIVE_PENDING, + KEEP_ALIVE_SENT, +}; + +enum smbd_connection_status { + SMBD_CREATED, + SMBD_CONNECTING, + SMBD_CONNECTED, + SMBD_NEGOTIATE_FAILED, + SMBD_DISCONNECTING, + SMBD_DISCONNECTED, + SMBD_DESTROYED +}; + +/* + * The context for the SMBDirect transport + * Everything related to the transport is here. It has several logical parts + * 1. RDMA related structures + * 2. SMBDirect connection parameters + * 3. Memory registrations + * 4. Receive and reassembly queues for data receive path + * 5. mempools for allocating packets + */ +struct smbd_connection { + enum smbd_connection_status transport_status; + + /* RDMA related */ + struct rdma_cm_id *id; + struct ib_qp_init_attr qp_attr; + struct ib_pd *pd; + struct ib_cq *send_cq, *recv_cq; + struct ib_device_attr dev_attr; + int ri_rc; + struct completion ri_done; + wait_queue_head_t conn_wait; + wait_queue_head_t wait_destroy; + + struct completion negotiate_completion; + bool negotiate_done; + + struct work_struct destroy_work; + struct work_struct disconnect_work; + struct work_struct recv_done_work; + struct work_struct post_send_credits_work; + + spinlock_t lock_new_credits_offered; + int new_credits_offered; + + /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */ + int receive_credit_max; + int send_credit_target; + int max_send_size; + int max_fragmented_recv_size; + int max_fragmented_send_size; + int max_receive_size; + int keep_alive_interval; + int max_readwrite_size; + enum keep_alive_status keep_alive_requested; + int protocol; + atomic_t send_credits; + atomic_t receive_credits; + int receive_credit_target; + int fragment_reassembly_remaining; + + /* Memory registrations */ + /* Maximum number of RDMA read/write outstanding on this connection */ + int responder_resources; + /* Maximum number of SGEs in a RDMA write/read */ + int max_frmr_depth; + /* + * If payload is less than or equal to the threshold, + * use RDMA send/recv to send upper layer I/O. + * If payload is more than the threshold, + * use RDMA read/write through memory registration for I/O. + */ + int rdma_readwrite_threshold; + enum ib_mr_type mr_type; + struct list_head mr_list; + spinlock_t mr_list_lock; + /* The number of available MRs ready for memory registration */ + atomic_t mr_ready_count; + atomic_t mr_used_count; + wait_queue_head_t wait_mr; + struct work_struct mr_recovery_work; + /* Used by transport to wait until all MRs are returned */ + wait_queue_head_t wait_for_mr_cleanup; + + /* Activity accoutning */ + /* Pending reqeusts issued from upper layer */ + int smbd_send_pending; + wait_queue_head_t wait_smbd_send_pending; + + int smbd_recv_pending; + wait_queue_head_t wait_smbd_recv_pending; + + atomic_t send_pending; + wait_queue_head_t wait_send_pending; + atomic_t send_payload_pending; + wait_queue_head_t wait_send_payload_pending; + + /* Receive queue */ + struct list_head receive_queue; + int count_receive_queue; + spinlock_t receive_queue_lock; + + struct list_head empty_packet_queue; + int count_empty_packet_queue; + spinlock_t empty_packet_queue_lock; + + wait_queue_head_t wait_receive_queues; + + /* Reassembly queue */ + struct list_head reassembly_queue; + spinlock_t reassembly_queue_lock; + wait_queue_head_t wait_reassembly_queue; + + /* total data length of reassembly queue */ + int reassembly_data_length; + int reassembly_queue_length; + /* the offset to first buffer in reassembly queue */ + int first_entry_offset; + + bool send_immediate; + + wait_queue_head_t wait_send_queue; + + /* + * Indicate if we have received a full packet on the connection + * This is used to identify the first SMBD packet of a assembled + * payload (SMB packet) in reassembly queue so we can return a + * RFC1002 length to upper layer to indicate the length of the SMB + * packet received + */ + bool full_packet_received; + + struct workqueue_struct *workqueue; + struct delayed_work idle_timer_work; + struct delayed_work send_immediate_work; + + /* Memory pool for preallocating buffers */ + /* request pool for RDMA send */ + struct kmem_cache *request_cache; + mempool_t *request_mempool; + + /* response pool for RDMA receive */ + struct kmem_cache *response_cache; + mempool_t *response_mempool; + + /* for debug purposes */ + unsigned int count_get_receive_buffer; + unsigned int count_put_receive_buffer; + unsigned int count_reassembly_queue; + unsigned int count_enqueue_reassembly_queue; + unsigned int count_dequeue_reassembly_queue; + unsigned int count_send_empty; +}; + +enum smbd_message_type { + SMBD_NEGOTIATE_RESP, + SMBD_TRANSFER_DATA, +}; + +#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 + +/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */ +struct smbd_negotiate_req { + __le16 min_version; + __le16 max_version; + __le16 reserved; + __le16 credits_requested; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */ +struct smbd_negotiate_resp { + __le16 min_version; + __le16 max_version; + __le16 negotiated_version; + __le16 reserved; + __le16 credits_requested; + __le16 credits_granted; + __le32 status; + __le32 max_readwrite_size; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */ +struct smbd_data_transfer { + __le16 credits_requested; + __le16 credits_granted; + __le16 flags; + __le16 reserved; + __le32 remaining_data_length; + __le32 data_offset; + __le32 data_length; + __le32 padding; + __u8 buffer[]; +} __packed; + +/* The packet fields for a registered RDMA buffer */ +struct smbd_buffer_descriptor_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +/* Default maximum number of SGEs in a RDMA send/recv */ +#define SMBDIRECT_MAX_SGE 16 +/* The context for a SMBD request */ +struct smbd_request { + struct smbd_connection *info; + struct ib_cqe cqe; + + /* true if this request carries upper layer payload */ + bool has_payload; + + /* the SGE entries for this packet */ + struct ib_sge sge[SMBDIRECT_MAX_SGE]; + int num_sge; + + /* SMBD packet header follows this structure */ + u8 packet[]; +}; + +/* The context for a SMBD response */ +struct smbd_response { + struct smbd_connection *info; + struct ib_cqe cqe; + struct ib_sge sge; + + enum smbd_message_type type; + + /* Link to receive queue or reassembly queue */ + struct list_head list; + + /* Indicate if this is the 1st packet of a payload */ + bool first_segment; + + /* SMBD packet header and payload follows this structure */ + u8 packet[]; +}; + +/* Create a SMBDirect session */ +struct smbd_connection *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr); + +/* Reconnect SMBDirect session */ +int smbd_reconnect(struct TCP_Server_Info *server); +/* Destroy SMBDirect session */ +void smbd_destroy(struct smbd_connection *info); + +/* Interface for carrying upper layer I/O through send/recv */ +int smbd_recv(struct smbd_connection *info, struct msghdr *msg); +int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst); + +enum mr_state { + MR_READY, + MR_REGISTERED, + MR_INVALIDATED, + MR_ERROR +}; + +struct smbd_mr { + struct smbd_connection *conn; + struct list_head list; + enum mr_state state; + struct ib_mr *mr; + struct scatterlist *sgl; + int sgl_count; + enum dma_data_direction dir; + union { + struct ib_reg_wr wr; + struct ib_send_wr inv_wr; + }; + struct ib_cqe cqe; + bool need_invalidate; + struct completion invalidate_done; +}; + +/* Interfaces to register and deregister MR for RDMA read/write */ +struct smbd_mr *smbd_register_mr( + struct smbd_connection *info, struct page *pages[], int num_pages, + int tailsz, bool writing, bool need_invalidate); +int smbd_deregister_mr(struct smbd_mr *mr); + +#else +#define cifs_rdma_enabled(server) 0 +struct smbd_connection {}; +static inline void *smbd_get_connection( + struct TCP_Server_Info *server, struct sockaddr *dstaddr) {return NULL;} +static inline int smbd_reconnect(struct TCP_Server_Info *server) {return -1; } +static inline void smbd_destroy(struct smbd_connection *info) {} +static inline int smbd_recv(struct smbd_connection *info, struct msghdr *msg) {return -1; } +static inline int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) {return -1; } +#endif + +#endif diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7efbab013957..9779b3292d8e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -37,6 +37,10 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "smbdirect.h" + +/* Max number of iovectors we can use off the stack when sending requests. */ +#define CIFS_MAX_IOV_SIZE 8 void cifs_wake_up_task(struct mid_q_entry *mid) @@ -229,7 +233,10 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct socket *ssocket = server->ssocket; struct msghdr smb_msg; int val = 1; - + if (cifs_rdma_enabled(server) && server->smbd_conn) { + rc = smbd_send(server->smbd_conn, rqst); + goto smbd_done; + } if (ssocket == NULL) return -ENOTSOCK; @@ -298,7 +305,7 @@ uncork: */ server->tcpStatus = CifsNeedReconnect; } - +smbd_done: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); @@ -803,12 +810,16 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, const int flags, struct kvec *resp_iov) { struct smb_rqst rqst; - struct kvec *new_iov; + struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; int rc; - new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL); - if (!new_iov) - return -ENOMEM; + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { + new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), + GFP_KERNEL); + if (!new_iov) + return -ENOMEM; + } else + new_iov = s_iov; /* 1st iov is a RFC1001 length followed by the rest of the packet */ memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); @@ -823,7 +834,51 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, rqst.rq_nvec = n_vec + 1; rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - kfree(new_iov); + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) + kfree(new_iov); + return rc; +} + +/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */ +int +smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, + struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, + const int flags, struct kvec *resp_iov) +{ + struct smb_rqst rqst; + struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; + int rc; + int i; + __u32 count; + __be32 rfc1002_marker; + + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { + new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), + GFP_KERNEL); + if (!new_iov) + return -ENOMEM; + } else + new_iov = s_iov; + + /* 1st iov is an RFC1002 Session Message length */ + memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); + + count = 0; + for (i = 1; i < n_vec + 1; i++) + count += new_iov[i].iov_len; + + rfc1002_marker = cpu_to_be32(count); + + new_iov[0].iov_base = &rfc1002_marker; + new_iov[0].iov_len = 4; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = new_iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); + if (n_vec + 1 > CIFS_MAX_IOV_SIZE) + kfree(new_iov); return rc; } |