Diffstat (limited to 'src/kernel')
-rw-r--r--   src/kernel/.clang-format       552
-rw-r--r--   src/kernel/CMakeLists.txt       66
-rw-r--r--   src/kernel/Kbuild                5
-rw-r--r--   src/kernel/blk.c               740
-rw-r--r--   src/kernel/blk.h                18
-rw-r--r--   src/kernel/core.c               81
-rw-r--r--   src/kernel/dnbd3.h              84
-rw-r--r--   src/kernel/dnbd3_main.c        250
-rw-r--r--   src/kernel/dnbd3_main.h        148
-rw-r--r--   src/kernel/net.c              1929
-rw-r--r--   src/kernel/net.h                29
l---------   src/kernel/serialize.c           1
-rw-r--r--   src/kernel/serialize_kmod.c      5
-rw-r--r--   src/kernel/sysfs.c             177
-rw-r--r--   src/kernel/sysfs.h              20
-rw-r--r--   src/kernel/utils.c              41
-rw-r--r--   src/kernel/utils.h              29
17 files changed, 2572 insertions, 1603 deletions
diff --git a/src/kernel/.clang-format b/src/kernel/.clang-format
new file mode 100644
index 0000000..c1fe2c6
--- /dev/null
+++ b/src/kernel/.clang-format
@@ -0,0 +1,552 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# clang-format configuration file. Intended for clang-format >= 4.
+#
+# For more information, see:
+#
+# Documentation/process/clang-format.rst
+# https://clang.llvm.org/docs/ClangFormat.html
+# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
+#
+---
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+#AlignEscapedNewlines: Left # Unknown to clang-format-4.0
+AlignOperands: true
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: None
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: false
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ #AfterExternBlock: false # Unknown to clang-format-5.0
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ #SplitEmptyFunction: true # Unknown to clang-format-4.0
+ #SplitEmptyRecord: true # Unknown to clang-format-4.0
+ #SplitEmptyNamespace: true # Unknown to clang-format-4.0
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Custom
+#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
+BreakBeforeTernaryOperators: false
+BreakConstructorInitializersBeforeComma: false
+#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: false
+ColumnLimit: 120
+CommentPragmas: '^ IWYU pragma:'
+#CompactNamespaces: false # Unknown to clang-format-4.0
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 8
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+#FixNamespaceComments: false # Unknown to clang-format-4.0
+
+# Taken from:
+# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
+# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
+# | sort | uniq
+ForEachMacros:
+ - 'apei_estatus_for_each_section'
+ - 'ata_for_each_dev'
+ - 'ata_for_each_link'
+ - '__ata_qc_for_each'
+ - 'ata_qc_for_each'
+ - 'ata_qc_for_each_raw'
+ - 'ata_qc_for_each_with_internal'
+ - 'ax25_for_each'
+ - 'ax25_uid_for_each'
+ - '__bio_for_each_bvec'
+ - 'bio_for_each_bvec'
+ - 'bio_for_each_bvec_all'
+ - 'bio_for_each_integrity_vec'
+ - '__bio_for_each_segment'
+ - 'bio_for_each_segment'
+ - 'bio_for_each_segment_all'
+ - 'bio_list_for_each'
+ - 'bip_for_each_vec'
+ - 'bitmap_for_each_clear_region'
+ - 'bitmap_for_each_set_region'
+ - 'blkg_for_each_descendant_post'
+ - 'blkg_for_each_descendant_pre'
+ - 'blk_queue_for_each_rl'
+ - 'bond_for_each_slave'
+ - 'bond_for_each_slave_rcu'
+ - 'bpf_for_each_spilled_reg'
+ - 'btree_for_each_safe128'
+ - 'btree_for_each_safe32'
+ - 'btree_for_each_safe64'
+ - 'btree_for_each_safel'
+ - 'card_for_each_dev'
+ - 'cgroup_taskset_for_each'
+ - 'cgroup_taskset_for_each_leader'
+ - 'cpufreq_for_each_entry'
+ - 'cpufreq_for_each_entry_idx'
+ - 'cpufreq_for_each_valid_entry'
+ - 'cpufreq_for_each_valid_entry_idx'
+ - 'css_for_each_child'
+ - 'css_for_each_descendant_post'
+ - 'css_for_each_descendant_pre'
+ - 'cxl_for_each_cmd'
+ - 'device_for_each_child_node'
+ - 'dma_fence_chain_for_each'
+ - 'do_for_each_ftrace_op'
+ - 'drm_atomic_crtc_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane'
+ - 'drm_atomic_crtc_state_for_each_plane_state'
+ - 'drm_atomic_for_each_plane_damage'
+ - 'drm_client_for_each_connector_iter'
+ - 'drm_client_for_each_modeset'
+ - 'drm_connector_for_each_possible_encoder'
+ - 'drm_for_each_bridge_in_chain'
+ - 'drm_for_each_connector_iter'
+ - 'drm_for_each_crtc'
+ - 'drm_for_each_crtc_reverse'
+ - 'drm_for_each_encoder'
+ - 'drm_for_each_encoder_mask'
+ - 'drm_for_each_fb'
+ - 'drm_for_each_legacy_plane'
+ - 'drm_for_each_plane'
+ - 'drm_for_each_plane_mask'
+ - 'drm_for_each_privobj'
+ - 'drm_mm_for_each_hole'
+ - 'drm_mm_for_each_node'
+ - 'drm_mm_for_each_node_in_range'
+ - 'drm_mm_for_each_node_safe'
+ - 'flow_action_for_each'
+ - 'for_each_active_dev_scope'
+ - 'for_each_active_drhd_unit'
+ - 'for_each_active_iommu'
+ - 'for_each_aggr_pgid'
+ - 'for_each_available_child_of_node'
+ - 'for_each_bio'
+ - 'for_each_board_func_rsrc'
+ - 'for_each_bvec'
+ - 'for_each_card_auxs'
+ - 'for_each_card_auxs_safe'
+ - 'for_each_card_components'
+ - 'for_each_card_dapms'
+ - 'for_each_card_pre_auxs'
+ - 'for_each_card_prelinks'
+ - 'for_each_card_rtds'
+ - 'for_each_card_rtds_safe'
+ - 'for_each_card_widgets'
+ - 'for_each_card_widgets_safe'
+ - 'for_each_cgroup_storage_type'
+ - 'for_each_child_of_node'
+ - 'for_each_clear_bit'
+ - 'for_each_clear_bit_from'
+ - 'for_each_cmsghdr'
+ - 'for_each_compatible_node'
+ - 'for_each_component_dais'
+ - 'for_each_component_dais_safe'
+ - 'for_each_comp_order'
+ - 'for_each_console'
+ - 'for_each_cpu'
+ - 'for_each_cpu_and'
+ - 'for_each_cpu_not'
+ - 'for_each_cpu_wrap'
+ - 'for_each_dapm_widgets'
+ - 'for_each_dev_addr'
+ - 'for_each_dev_scope'
+ - 'for_each_displayid_db'
+ - 'for_each_dma_cap_mask'
+ - 'for_each_dpcm_be'
+ - 'for_each_dpcm_be_rollback'
+ - 'for_each_dpcm_be_safe'
+ - 'for_each_dpcm_fe'
+ - 'for_each_drhd_unit'
+ - 'for_each_dss_dev'
+ - 'for_each_efi_memory_desc'
+ - 'for_each_efi_memory_desc_in_map'
+ - 'for_each_element'
+ - 'for_each_element_extid'
+ - 'for_each_element_id'
+ - 'for_each_endpoint_of_node'
+ - 'for_each_evictable_lru'
+ - 'for_each_fib6_node_rt_rcu'
+ - 'for_each_fib6_walker_rt'
+ - 'for_each_free_mem_pfn_range_in_zone'
+ - 'for_each_free_mem_pfn_range_in_zone_from'
+ - 'for_each_free_mem_range'
+ - 'for_each_free_mem_range_reverse'
+ - 'for_each_func_rsrc'
+ - 'for_each_hstate'
+ - 'for_each_if'
+ - 'for_each_iommu'
+ - 'for_each_ip_tunnel_rcu'
+ - 'for_each_irq_nr'
+ - 'for_each_link_codecs'
+ - 'for_each_link_cpus'
+ - 'for_each_link_platforms'
+ - 'for_each_lru'
+ - 'for_each_matching_node'
+ - 'for_each_matching_node_and_match'
+ - 'for_each_member'
+ - 'for_each_memcg_cache_index'
+ - 'for_each_mem_pfn_range'
+ - '__for_each_mem_range'
+ - 'for_each_mem_range'
+ - '__for_each_mem_range_rev'
+ - 'for_each_mem_range_rev'
+ - 'for_each_mem_region'
+ - 'for_each_migratetype_order'
+ - 'for_each_msi_entry'
+ - 'for_each_msi_entry_safe'
+ - 'for_each_net'
+ - 'for_each_net_continue_reverse'
+ - 'for_each_netdev'
+ - 'for_each_netdev_continue'
+ - 'for_each_netdev_continue_rcu'
+ - 'for_each_netdev_continue_reverse'
+ - 'for_each_netdev_feature'
+ - 'for_each_netdev_in_bond_rcu'
+ - 'for_each_netdev_rcu'
+ - 'for_each_netdev_reverse'
+ - 'for_each_netdev_safe'
+ - 'for_each_net_rcu'
+ - 'for_each_new_connector_in_state'
+ - 'for_each_new_crtc_in_state'
+ - 'for_each_new_mst_mgr_in_state'
+ - 'for_each_new_plane_in_state'
+ - 'for_each_new_private_obj_in_state'
+ - 'for_each_node'
+ - 'for_each_node_by_name'
+ - 'for_each_node_by_type'
+ - 'for_each_node_mask'
+ - 'for_each_node_state'
+ - 'for_each_node_with_cpus'
+ - 'for_each_node_with_property'
+ - 'for_each_nonreserved_multicast_dest_pgid'
+ - 'for_each_of_allnodes'
+ - 'for_each_of_allnodes_from'
+ - 'for_each_of_cpu_node'
+ - 'for_each_of_pci_range'
+ - 'for_each_old_connector_in_state'
+ - 'for_each_old_crtc_in_state'
+ - 'for_each_old_mst_mgr_in_state'
+ - 'for_each_oldnew_connector_in_state'
+ - 'for_each_oldnew_crtc_in_state'
+ - 'for_each_oldnew_mst_mgr_in_state'
+ - 'for_each_oldnew_plane_in_state'
+ - 'for_each_oldnew_plane_in_state_reverse'
+ - 'for_each_oldnew_private_obj_in_state'
+ - 'for_each_old_plane_in_state'
+ - 'for_each_old_private_obj_in_state'
+ - 'for_each_online_cpu'
+ - 'for_each_online_node'
+ - 'for_each_online_pgdat'
+ - 'for_each_pci_bridge'
+ - 'for_each_pci_dev'
+ - 'for_each_pci_msi_entry'
+ - 'for_each_pcm_streams'
+ - 'for_each_physmem_range'
+ - 'for_each_populated_zone'
+ - 'for_each_possible_cpu'
+ - 'for_each_present_cpu'
+ - 'for_each_prime_number'
+ - 'for_each_prime_number_from'
+ - 'for_each_process'
+ - 'for_each_process_thread'
+ - 'for_each_property_of_node'
+ - 'for_each_registered_fb'
+ - 'for_each_requested_gpio'
+ - 'for_each_requested_gpio_in_range'
+ - 'for_each_reserved_mem_range'
+ - 'for_each_reserved_mem_region'
+ - 'for_each_rtd_codec_dais'
+ - 'for_each_rtd_components'
+ - 'for_each_rtd_cpu_dais'
+ - 'for_each_rtd_dais'
+ - 'for_each_set_bit'
+ - 'for_each_set_bit_from'
+ - 'for_each_set_clump8'
+ - 'for_each_sg'
+ - 'for_each_sg_dma_page'
+ - 'for_each_sg_page'
+ - 'for_each_sgtable_dma_page'
+ - 'for_each_sgtable_dma_sg'
+ - 'for_each_sgtable_page'
+ - 'for_each_sgtable_sg'
+ - 'for_each_sibling_event'
+ - 'for_each_subelement'
+ - 'for_each_subelement_extid'
+ - 'for_each_subelement_id'
+ - '__for_each_thread'
+ - 'for_each_thread'
+ - 'for_each_unicast_dest_pgid'
+ - 'for_each_vsi'
+ - 'for_each_wakeup_source'
+ - 'for_each_zone'
+ - 'for_each_zone_zonelist'
+ - 'for_each_zone_zonelist_nodemask'
+ - 'fwnode_for_each_available_child_node'
+ - 'fwnode_for_each_child_node'
+ - 'fwnode_graph_for_each_endpoint'
+ - 'gadget_for_each_ep'
+ - 'genradix_for_each'
+ - 'genradix_for_each_from'
+ - 'hash_for_each'
+ - 'hash_for_each_possible'
+ - 'hash_for_each_possible_rcu'
+ - 'hash_for_each_possible_rcu_notrace'
+ - 'hash_for_each_possible_safe'
+ - 'hash_for_each_rcu'
+ - 'hash_for_each_safe'
+ - 'hctx_for_each_ctx'
+ - 'hlist_bl_for_each_entry'
+ - 'hlist_bl_for_each_entry_rcu'
+ - 'hlist_bl_for_each_entry_safe'
+ - 'hlist_for_each'
+ - 'hlist_for_each_entry'
+ - 'hlist_for_each_entry_continue'
+ - 'hlist_for_each_entry_continue_rcu'
+ - 'hlist_for_each_entry_continue_rcu_bh'
+ - 'hlist_for_each_entry_from'
+ - 'hlist_for_each_entry_from_rcu'
+ - 'hlist_for_each_entry_rcu'
+ - 'hlist_for_each_entry_rcu_bh'
+ - 'hlist_for_each_entry_rcu_notrace'
+ - 'hlist_for_each_entry_safe'
+ - 'hlist_for_each_entry_srcu'
+ - '__hlist_for_each_rcu'
+ - 'hlist_for_each_safe'
+ - 'hlist_nulls_for_each_entry'
+ - 'hlist_nulls_for_each_entry_from'
+ - 'hlist_nulls_for_each_entry_rcu'
+ - 'hlist_nulls_for_each_entry_safe'
+ - 'i3c_bus_for_each_i2cdev'
+ - 'i3c_bus_for_each_i3cdev'
+ - 'ide_host_for_each_port'
+ - 'ide_port_for_each_dev'
+ - 'ide_port_for_each_present_dev'
+ - 'idr_for_each_entry'
+ - 'idr_for_each_entry_continue'
+ - 'idr_for_each_entry_continue_ul'
+ - 'idr_for_each_entry_ul'
+ - 'in_dev_for_each_ifa_rcu'
+ - 'in_dev_for_each_ifa_rtnl'
+ - 'inet_bind_bucket_for_each'
+ - 'inet_lhash2_for_each_icsk_rcu'
+ - 'key_for_each'
+ - 'key_for_each_safe'
+ - 'klp_for_each_func'
+ - 'klp_for_each_func_safe'
+ - 'klp_for_each_func_static'
+ - 'klp_for_each_object'
+ - 'klp_for_each_object_safe'
+ - 'klp_for_each_object_static'
+ - 'kunit_suite_for_each_test_case'
+ - 'kvm_for_each_memslot'
+ - 'kvm_for_each_vcpu'
+ - 'list_for_each'
+ - 'list_for_each_codec'
+ - 'list_for_each_codec_safe'
+ - 'list_for_each_continue'
+ - 'list_for_each_entry'
+ - 'list_for_each_entry_continue'
+ - 'list_for_each_entry_continue_rcu'
+ - 'list_for_each_entry_continue_reverse'
+ - 'list_for_each_entry_from'
+ - 'list_for_each_entry_from_rcu'
+ - 'list_for_each_entry_from_reverse'
+ - 'list_for_each_entry_lockless'
+ - 'list_for_each_entry_rcu'
+ - 'list_for_each_entry_reverse'
+ - 'list_for_each_entry_safe'
+ - 'list_for_each_entry_safe_continue'
+ - 'list_for_each_entry_safe_from'
+ - 'list_for_each_entry_safe_reverse'
+ - 'list_for_each_entry_srcu'
+ - 'list_for_each_prev'
+ - 'list_for_each_prev_safe'
+ - 'list_for_each_safe'
+ - 'llist_for_each'
+ - 'llist_for_each_entry'
+ - 'llist_for_each_entry_safe'
+ - 'llist_for_each_safe'
+ - 'mci_for_each_dimm'
+ - 'media_device_for_each_entity'
+ - 'media_device_for_each_intf'
+ - 'media_device_for_each_link'
+ - 'media_device_for_each_pad'
+ - 'nanddev_io_for_each_page'
+ - 'netdev_for_each_lower_dev'
+ - 'netdev_for_each_lower_private'
+ - 'netdev_for_each_lower_private_rcu'
+ - 'netdev_for_each_mc_addr'
+ - 'netdev_for_each_uc_addr'
+ - 'netdev_for_each_upper_dev_rcu'
+ - 'netdev_hw_addr_list_for_each'
+ - 'nft_rule_for_each_expr'
+ - 'nla_for_each_attr'
+ - 'nla_for_each_nested'
+ - 'nlmsg_for_each_attr'
+ - 'nlmsg_for_each_msg'
+ - 'nr_neigh_for_each'
+ - 'nr_neigh_for_each_safe'
+ - 'nr_node_for_each'
+ - 'nr_node_for_each_safe'
+ - 'of_for_each_phandle'
+ - 'of_property_for_each_string'
+ - 'of_property_for_each_u32'
+ - 'pci_bus_for_each_resource'
+ - 'pcl_for_each_chunk'
+ - 'pcl_for_each_segment'
+ - 'pcm_for_each_format'
+ - 'ping_portaddr_for_each_entry'
+ - 'plist_for_each'
+ - 'plist_for_each_continue'
+ - 'plist_for_each_entry'
+ - 'plist_for_each_entry_continue'
+ - 'plist_for_each_entry_safe'
+ - 'plist_for_each_safe'
+ - 'pnp_for_each_card'
+ - 'pnp_for_each_dev'
+ - 'protocol_for_each_card'
+ - 'protocol_for_each_dev'
+ - 'queue_for_each_hw_ctx'
+ - 'radix_tree_for_each_slot'
+ - 'radix_tree_for_each_tagged'
+ - 'rbtree_postorder_for_each_entry_safe'
+ - 'rdma_for_each_block'
+ - 'rdma_for_each_port'
+ - 'rdma_umem_for_each_dma_block'
+ - 'resource_list_for_each_entry'
+ - 'resource_list_for_each_entry_safe'
+ - 'rhl_for_each_entry_rcu'
+ - 'rhl_for_each_rcu'
+ - 'rht_for_each'
+ - 'rht_for_each_entry'
+ - 'rht_for_each_entry_from'
+ - 'rht_for_each_entry_rcu'
+ - 'rht_for_each_entry_rcu_from'
+ - 'rht_for_each_entry_safe'
+ - 'rht_for_each_from'
+ - 'rht_for_each_rcu'
+ - 'rht_for_each_rcu_from'
+ - '__rq_for_each_bio'
+ - 'rq_for_each_bvec'
+ - 'rq_for_each_segment'
+ - 'scsi_for_each_prot_sg'
+ - 'scsi_for_each_sg'
+ - 'sctp_for_each_hentry'
+ - 'sctp_skb_for_each'
+ - 'shdma_for_each_chan'
+ - '__shost_for_each_device'
+ - 'shost_for_each_device'
+ - 'sk_for_each'
+ - 'sk_for_each_bound'
+ - 'sk_for_each_entry_offset_rcu'
+ - 'sk_for_each_from'
+ - 'sk_for_each_rcu'
+ - 'sk_for_each_safe'
+ - 'sk_nulls_for_each'
+ - 'sk_nulls_for_each_from'
+ - 'sk_nulls_for_each_rcu'
+ - 'snd_array_for_each'
+ - 'snd_pcm_group_for_each_entry'
+ - 'snd_soc_dapm_widget_for_each_path'
+ - 'snd_soc_dapm_widget_for_each_path_safe'
+ - 'snd_soc_dapm_widget_for_each_sink_path'
+ - 'snd_soc_dapm_widget_for_each_source_path'
+ - 'tb_property_for_each'
+ - 'tcf_exts_for_each_action'
+ - 'udp_portaddr_for_each_entry'
+ - 'udp_portaddr_for_each_entry_rcu'
+ - 'usb_hub_for_each_child'
+ - 'v4l2_device_for_each_subdev'
+ - 'v4l2_m2m_for_each_dst_buf'
+ - 'v4l2_m2m_for_each_dst_buf_safe'
+ - 'v4l2_m2m_for_each_src_buf'
+ - 'v4l2_m2m_for_each_src_buf_safe'
+ - 'virtio_device_for_each_vq'
+ - 'while_for_each_ftrace_op'
+ - 'xa_for_each'
+ - 'xa_for_each_marked'
+ - 'xa_for_each_range'
+ - 'xa_for_each_start'
+ - 'xas_for_each'
+ - 'xas_for_each_conflict'
+ - 'xas_for_each_marked'
+ - 'xbc_array_for_each_value'
+ - 'xbc_for_each_key_value'
+ - 'xbc_node_for_each_array_value'
+ - 'xbc_node_for_each_child'
+ - 'xbc_node_for_each_key_value'
+ - 'zorro_for_each_dev'
+
+#IncludeBlocks: Preserve # Unknown to clang-format-5.0
+IncludeCategories:
+ - Regex: '.*'
+ Priority: 1
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+#IndentPPDirectives: None # Unknown to clang-format-5.0
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
+ObjCBlockIndentWidth: 8
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+
+# Taken from git's rules
+#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
+PenaltyBreakBeforeFirstCallParameter: 30
+PenaltyBreakComment: 10
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 10
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 60
+
+PointerAlignment: Right
+ReflowComments: false
+SortIncludes: false
+#SortUsingDeclarations: false # Unknown to clang-format-4.0
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
+#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
+SpaceBeforeParens: ControlStatements
+#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp03
+TabWidth: 8
+UseTab: Always
+...
diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt
new file mode 100644
index 0000000..6bc61ff
--- /dev/null
+++ b/src/kernel/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.10)
+
+# set the project name
+project(dnbd3-kernel
+ LANGUAGES C)
+
+# include macros to define Linux kernel build targets
+include(Kernel)
+
+# set C flags for a Linux kernel module
+set(KERNEL_C_FLAGS "-DDNBD3_KERNEL_MODULE -I ${PROJECT_INCLUDE_GEN_DIR}"
+ CACHE STRING "C flags to be used for building the dnbd3 kernel module")
+# set C flags for the debug mode of a Linux kernel module
+set(KERNEL_C_FLAGS_DEBUG "-g -DDEBUG"
+ CACHE STRING "Additional C flags to be used for building the dnbd3 kernel module in debug mode")
+
+# append include directories to the C flags
+get_property(KERNEL_INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES)
+foreach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} -I ${KERNEL_INCLUDE_DIR}")
+endforeach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS})
+
+# append debug C flags if debug mode is enabled
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} ${KERNEL_C_FLAGS_DEBUG}")
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+# dnbd3 Linux kernel module
+set(KERNEL_MODULE_DNBD3_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.c)
+set(KERNEL_MODULE_DNBD3_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/net.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.h)
+
+add_kernel_module(dnbd3 "${KERNEL_BUILD_DIR}"
+ "${KERNEL_INSTALL_DIR}"
+ "CONFIG_BLK_DEV_DNBD3=m"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}"
+ ${CMAKE_CURRENT_SOURCE_DIR}/Kbuild)
+
+# add dependency to generate project version header before dnbd3.ko is built
+add_dependencies(dnbd3 dnbd3-generate-version)
+
+set(CHECKPATCH_IGNORE_WARNINGS "NEW_TYPEDEFS"
+ "MSLEEP"
+ "CONSTANT_COMPARISON"
+ "DEEP_INDENTATION"
+ "PREFER_PR_LEVEL"
+ "LINUX_VERSION_CODE"
+ "JIFFIES_COMPARISON"
+ "KREALLOC_ARG_REUSE")
+
+add_kernel_linter(dnbd3-lint "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+add_kernel_linter_fix(dnbd3-lint-fix "${CHECKPATCH_IGNORE_WARNINGS}"
+ "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
+
+add_linter_fix(dnbd3-lint-fix-clang "${KERNEL_MODULE_DNBD3_SOURCE_FILES}"
+ "${KERNEL_MODULE_DNBD3_HEADER_FILES}")
diff --git a/src/kernel/Kbuild b/src/kernel/Kbuild
new file mode 100644
index 0000000..26afa98
--- /dev/null
+++ b/src/kernel/Kbuild
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Linux kernel module dnbd3
+obj-$(CONFIG_BLK_DEV_DNBD3) := dnbd3.o
+dnbd3-y += dnbd3_main.o blk.o net.o serialize.o sysfs.o
diff --git a/src/kernel/blk.c b/src/kernel/blk.c
index 889b988..69e4583 100644
--- a/src/kernel/blk.c
+++ b/src/kernel/blk.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,248 +19,259 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "blk.h"
#include "net.h"
#include "sysfs.h"
+#include "dnbd3_main.h"
#include <linux/pagemap.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-#define dnbd3_req_read(req) \
- req_op(req) == REQ_OP_READ
-#define dnbd3_req_fs(req) \
- dnbd3_req_read(req) || req_op(req) == REQ_OP_WRITE
-#define dnbd3_req_special(req) \
- blk_rq_is_private(req)
-#else
-#define dnbd3_req_read(req) \
- rq_data_dir(req) == READ
-#define dnbd3_req_fs(req) \
- req->cmd_type == REQ_TYPE_FS
-#define dnbd3_req_special(req) \
- req->cmd_type == REQ_TYPE_SPECIAL
-#endif
-
-int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+static int dnbd3_close_device(dnbd3_device_t *dev)
{
- struct gendisk *disk;
- struct request_queue *blk_queue;
-
- init_waitqueue_head(&dev->process_queue_send);
- init_waitqueue_head(&dev->process_queue_receive);
- init_waitqueue_head(&dev->process_queue_discover);
- INIT_LIST_HEAD(&dev->request_queue_send);
- INIT_LIST_HEAD(&dev->request_queue_receive);
+ int result;
- memset(&dev->cur_server, 0, sizeof(dev->cur_server));
- memset(&dev->initial_server, 0, sizeof(dev->initial_server));
- dev->better_sock = NULL;
+ if (dev->imgname)
+ dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");
+ dev->panic = false;
+ result = dnbd3_net_disconnect(dev);
+ kfree(dev->imgname);
dev->imgname = NULL;
- dev->rid = 0;
- dev->update_available = 0;
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- dev->thread_send = NULL;
- dev->thread_receive = NULL;
- dev->thread_discover = NULL;
- dev->discover = 0;
- dev->disconnecting = 0;
- dev->panic = 0;
- dev->panic_count = 0;
- dev->reported_size = 0;
-
- if (!(disk = alloc_disk(1)))
- {
- printk("ERROR: dnbd3 alloc_disk failed.\n");
- return -EIO;
- }
-
- disk->major = major;
- disk->first_minor = minor;
- sprintf(disk->disk_name, "dnbd%d", minor);
- set_capacity(disk, 0);
- set_disk_ro(disk, 1);
- disk->fops = &dnbd3_blk_ops;
-
- spin_lock_init(&dev->blk_lock);
- if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL)
- {
- printk("ERROR: dnbd3 blk_init_queue failed.\n");
- return -EIO;
- }
-
- blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
- blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE);
-
- disk->queue = blk_queue;
- disk->private_data = dev;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-#else
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
-#endif
-#define ONE_MEG (1048576)
- blk_queue_max_segment_size(disk->queue, ONE_MEG);
- blk_queue_max_segments(disk->queue, 0xffff);
- blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
- disk->queue->limits.max_sectors = 256;
- dev->disk = disk;
-#undef ONE_MEG
- add_disk(disk);
- dnbd3_sysfs_init(dev);
- return 0;
+ /* new requests might have been queued up, */
+ /* but now that imgname is NULL no new ones can show up */
+ blk_mq_freeze_queue(dev->queue);
+ set_capacity(dev->disk, 0);
+ blk_mq_unfreeze_queue(dev->queue);
+ return result;
}
-int dnbd3_blk_del_device(dnbd3_device_t *dev)
-{
- dnbd3_sysfs_exit(dev);
- dnbd3_net_disconnect(dev);
- del_gendisk(dev->disk);
- put_disk(dev->disk);
- blk_cleanup_queue(dev->disk->queue);
- return 0;
-}
-
-struct block_device_operations dnbd3_blk_ops =
- { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, };
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
int result = -100;
dnbd3_device_t *dev = bdev->bd_disk->private_data;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
struct request_queue *blk_queue = dev->disk->queue;
+#endif
char *imgname = NULL;
dnbd3_ioctl_t *msg = NULL;
- //unsigned long irqflags;
+ int i = 0, j;
+ u8 locked = 0;
- while (dev->disconnecting)
- {
- // do nothing
- }
-
- if (arg != 0)
- {
+ if (arg != 0) {
msg = kmalloc(sizeof(*msg), GFP_KERNEL);
- if (msg == NULL) return -ENOMEM;
- if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg))
- {
+ if (msg == NULL)
+ return -ENOMEM;
+ if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) {
result = -ENOEXEC;
goto cleanup_return;
}
- if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0)
- {
+ if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) {
result = -ENOENT;
goto cleanup_return;
}
- if (msg->imgname != NULL && msg->imgnamelen > 0)
- {
+ if (msg->imgname != NULL && msg->imgnamelen > 0) {
imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
- if (imgname == NULL)
- {
+ if (imgname == NULL) {
result = -ENOMEM;
goto cleanup_return;
}
- if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0)
- {
+ if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) {
result = -ENOENT;
goto cleanup_return;
}
imgname[msg->imgnamelen] = '\0';
- //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname);
}
}
-
- switch (cmd)
- {
+ switch (cmd) {
case IOCTL_OPEN:
- if (dev->imgname != NULL)
- {
+ if (!dnbd3_flag_get(dev->connection_lock)) {
result = -EBUSY;
+ break;
}
- else if (imgname == NULL)
- {
+ locked = 1;
+ if (dev->imgname != NULL) {
+ result = -EBUSY;
+ } else if (imgname == NULL) {
result = -EINVAL;
- }
- else if (msg == NULL)
- {
+ } else if (msg == NULL) {
result = -EINVAL;
- }
- else
- {
- if (sizeof(msg->host) != sizeof(dev->cur_server.host))
- printk("Odd size bug#1 triggered in IOCTL\n");
- memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host));
- dev->cur_server.failures = 0;
- memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server));
+ } else {
+ /* assert that at least one and not too many hosts are given */
+ if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) {
+ result = -EINVAL;
+ break;
+ }
+
dev->imgname = imgname;
dev->rid = msg->rid;
dev->use_server_provided_alts = msg->use_server_provided_alts;
- // Forget all alt servers on explicit connect, set first al server to initial server
- memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS);
- memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0]));
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
- if (blk_queue->backing_dev_info != NULL) {
+
+ dev_info(dnbd3_device_to_dev(dev), "opening device.\n");
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set optimal request size for the queue to half the read-ahead
+ blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ // set readahead from optimal request size of the queue
+ // ra_pages are calculated by the following formula: queue_io_opt() * 2 / PAGE_SIZE
+ blk_queue_update_readahead(dev->queue);
+#endif
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+ if (blk_queue->backing_dev_info != NULL)
blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
- }
#else
blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
#endif
- if (dnbd3_net_connect(dev) == 0)
- {
- result = 0;
- imgname = NULL; // Prevent kfree at the end
+
+ /* add specified servers to alt server list */
+ for (i = 0; i < NUMBER_SERVERS; i++)
+ dev->alt_servers[i].host.ss_family = 0;
+ for (i = 0; i < msg->hosts_num; i++) {
+ /* copy provided host into corresponding alt server slot */
+ if (dnbd3_add_server(dev, &msg->hosts[i]) == 0)
+ dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n",
+ &dev->alt_servers[i].host);
+ else
+ dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n",
+ &dev->alt_servers[i].host);
}
- else
- {
- result = -ENOENT;
+
+ /*
+ * probe added alt servers in specified order and
+ * choose first working server as initial server
+ */
+ result = -EPROTONOSUPPORT;
+ for (i = 0; i < NUMBER_SERVERS; i++) {
+ /* probe added alt server */
+ if (dev->alt_servers[i].host.ss_family == 0)
+ continue; // Empty slot
+
+ result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
+ if (result == 0) {
+ /* connection established, store index of server and exit loop */
+ result = i;
+ break;
+ }
+ }
+
+ if (result >= 0) {
+ /* connection was successful */
+ dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
+ &dev->cur_server.host);
+ imgname = NULL; // Prevent kfree at the end
+ } else {
+ /* probing failed */
dev->imgname = NULL;
}
}
break;
case IOCTL_CLOSE:
- dnbd3_blk_fail_all_requests(dev);
- result = dnbd3_net_disconnect(dev);
- dnbd3_blk_fail_all_requests(dev);
- set_capacity(dev->disk, 0);
- if (dev->imgname)
- {
- kfree(dev->imgname);
- dev->imgname = NULL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
}
+ locked = 1;
+ result = dnbd3_close_device(dev);
break;
case IOCTL_SWITCH:
- result = -EINVAL;
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ result = -EBUSY;
+ break;
+ }
+ locked = 1;
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ } else if (msg == NULL) {
+ result = -EINVAL;
+ } else {
+ dnbd3_alt_server_t *alt_server;
+ struct sockaddr_storage new_addr;
+
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(&msg->hosts[0], dev);
+ if (alt_server == NULL) {
+ mutex_unlock(&dev->alt_servers_lock);
+ /* specified server is not known, so do not switch */
+ result = -ENOENT;
+ } else {
+ /* specified server is known, so try to switch to it */
+ new_addr = alt_server->host;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &new_addr)) {
+ /* specified server is current server, so do not switch */
+ result = 0;
+ } else {
+ dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
+ &new_addr);
+ result = dnbd3_new_connection(dev, &new_addr, false);
+ if (result != 0) {
+ /* switching didn't work */
+ result = -EAGAIN;
+ }
+ }
+ if (result == 0) {
+ /* fake RTT so we don't switch away again soon */
+ mutex_lock(&dev->alt_servers_lock);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ alt_server = &dev->alt_servers[i];
+ if (is_same_server(&alt_server->host, &new_addr)) {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ alt_server->rtts[j] = 1;
+ alt_server->best_count = 100;
+ } else {
+ for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
+ if (alt_server->rtts[j] < 500000)
+ alt_server->rtts[j] = 500000;
+ alt_server->best_count = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ }
+ }
+ }
break;
case IOCTL_ADD_SRV:
- case IOCTL_REM_SRV:
- if (dev->imgname == NULL)
- {
- result = -ENOENT;
+ case IOCTL_REM_SRV: {
+ struct sockaddr_storage addr;
+ dnbd3_host_t *host;
+
+ if (dev->imgname == NULL) {
+ result = -ENOTCONN;
+ break;
}
- else if (dev->new_servers_num >= NUMBER_SERVERS)
- {
- result = -EAGAIN;
+ if (msg == NULL) {
+ result = -EINVAL;
+ break;
}
- else if (msg == NULL)
- {
+ host = &msg->hosts[0];
+ if (!dnbd3_host_to_sockaddr(host, &addr)) {
result = -EINVAL;
+ break;
}
- else
- {
- memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host));
- dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM
- ++dev->new_servers_num;
- result = 0;
+
+ if (cmd == IOCTL_ADD_SRV) {
+ result = dnbd3_add_server(dev, host);
+ if (result == -EEXIST)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
+ else if (result == -ENOSPC)
+ dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
+ } else { // IOCTL_REM_SRV
+ result = dnbd3_rem_server(dev, host);
+ if (result == -ENOENT)
+ dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
+ else
+ dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
}
break;
-
+ }
case BLKFLSBUF:
result = 0;
break;
@@ -270,113 +282,325 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
}
cleanup_return:
- if (msg) kfree(msg);
- if (imgname) kfree(imgname);
+ kfree(msg);
+ kfree(imgname);
+ if (locked)
+ dnbd3_flag_reset(dev->connection_lock);
return result;
}
-/**
- * dev->blk_lock and q->queue_lock are being held
- * when this is called!
+static const struct block_device_operations dnbd3_blk_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = dnbd3_blk_ioctl,
+};
+
+static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
+{
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add_tail(&rq->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+}
+
+/*
+ * Linux kernel blk-mq driver function (entry point) to handle block IO requests
*/
-void dnbd3_blk_request(struct request_queue *q)
+static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
- struct request *req;
- dnbd3_device_t *dev;
+ struct request *rq = bd->rq;
+ dnbd3_device_t *dev = rq->q->queuedata;
+ struct dnbd3_cmd *cmd;
- while ((req = blk_fetch_request(q)) != NULL)
- {
- dev = req->rq_disk->private_data;
+ if (dev->imgname == NULL || !device_active(dev))
+ return BLK_STS_IOERR;
- if (dev->imgname == NULL)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (req_op(rq) != REQ_OP_READ)
+ return BLK_STS_IOERR;
- if (!(dnbd3_req_fs(req)))
- {
- __blk_end_request_all(req, 0);
- continue;
- }
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
+ return BLK_STS_TIMEOUT;
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
- {
- __blk_end_request_all(req, -EIO);
- continue;
- }
+ if (rq_data_dir(rq) != READ)
+ return BLK_STS_NOTSUPP;
- if (!(dnbd3_req_read(req)))
- {
- __blk_end_request_all(req, -EACCES);
- continue;
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
+ blk_mq_start_request(rq);
+ dnbd3_add_queue(dev, rq);
+ return BLK_STS_OK;
+}
+
+static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ , bool reserved
+#endif
+ )
+{
+ unsigned long irqflags;
+ struct request *rq_iter;
+ bool found = false;
+ dnbd3_device_t *dev = req->q->queuedata;
+
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // If still in send queue, do nothing
+ if (found)
+ return BLK_EH_RESET_TIMER;
+
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ if (rq_iter == req) {
+ found = true;
+ list_del_init(&req->queuelist);
+ break;
}
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (!found) {
+ dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
+ // Assume it was finished concurrently
+ return BLK_EH_DONE;
+ }
+ // Add to send queue again and trigger work, reset timeout
+ dnbd3_add_queue(dev, req);
+ return BLK_EH_RESET_TIMER;
+}
+
+static
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
+const
+#endif
+struct blk_mq_ops dnbd3_mq_ops = {
+ .queue_rq = dnbd3_queue_rq,
+ .timeout = dnbd3_rq_timeout,
+};
+
+int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
+{
+ int ret;
+
+ memset(dev, 0, sizeof(*dev));
+ dev->index = minor;
+ // lock for imgname, cur_server etc.
+ spin_lock_init(&dev->blk_lock);
+ spin_lock_init(&dev->send_queue_lock);
+ spin_lock_init(&dev->recv_queue_lock);
+ INIT_LIST_HEAD(&dev->send_queue);
+ INIT_LIST_HEAD(&dev->recv_queue);
+ dnbd3_flag_reset(dev->connection_lock);
+ dnbd3_flag_reset(dev->discover_running);
+ mutex_init(&dev->alt_servers_lock);
+ dnbd3_net_work_init(dev);
+
+ // memset has done this already but I like initial values to be explicit
+ dev->imgname = NULL;
+ dev->rid = 0;
+ dev->update_available = false;
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->reported_size = 0;
+
+ // set up tag_set for blk-mq
+ dev->tag_set.ops = &dnbd3_mq_ops;
+ dev->tag_set.nr_hw_queues = 1;
+ dev->tag_set.queue_depth = 128;
+ dev->tag_set.numa_node = NUMA_NO_NODE;
+ dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
+ dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tag_set.driver_data = dev;
+ dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;
+
+ ret = blk_mq_alloc_tag_set(&dev->tag_set);
+ if (ret) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_tag_set failed\n");
+ goto out;
+ }
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
+ // set up blk-mq and disk
+ dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
+ if (IS_ERR(dev->disk)) {
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_disk failed\n");
+ ret = PTR_ERR(dev->disk);
+ goto out_cleanup_tags;
+ }
+ dev->queue = dev->disk->queue;
+#else
+ // set up blk-mq
+ dev->queue = blk_mq_init_queue(&dev->tag_set);
+ if (IS_ERR(dev->queue)) {
+ ret = PTR_ERR(dev->queue);
+ dev_err(dnbd3_device_to_dev(dev), "blk_mq_init_queue failed\n");
+ goto out_cleanup_tags;
+ }
+ dev->queue->queuedata = dev;
+#endif
+
+ blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+ blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
+#else
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
+#endif
+#define ONE_MEG (1048576)
+ blk_queue_max_segment_size(dev->queue, ONE_MEG);
+ blk_queue_max_segments(dev->queue, 0xffff);
+ blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
+ dev->queue->limits.max_sectors = 256;
+#undef ONE_MEG
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ // set up disk
+ dev->disk = alloc_disk(1);
+ if (!dev->disk) {
+ dev_err(dnbd3_device_to_dev(dev), "alloc_disk failed\n");
+ ret = -ENOMEM;
+ goto out_cleanup_queue;
+ }
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \
+ || (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ dev->disk->flags |= GENHD_FL_NO_PART;
+#else
+ dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ dev->disk->major = major;
+ dev->disk->first_minor = minor;
+ dev->disk->minors = 1;
+ dev->disk->fops = &dnbd3_blk_ops;
+ dev->disk->private_data = dev;
+ dev->disk->queue = dev->queue;
+ sprintf(dev->disk->disk_name, "dnbd%d", minor);
+ set_capacity(dev->disk, 0);
+ set_disk_ro(dev->disk, 1);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \
+ || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ ret = add_disk(dev->disk);
+ if (ret != 0)
+ goto out_cleanup_queue;
+#else
+ add_disk(dev->disk);
+#endif
+
+ // set up sysfs
+ dnbd3_sysfs_init(dev);
+
+ return 0;
+
+out_cleanup_queue:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+out_cleanup_tags:
+ blk_mq_free_tag_set(&dev->tag_set);
+out:
+ mutex_destroy(&dev->alt_servers_lock);
+ return ret;
+}
+
+int dnbd3_blk_del_device(dnbd3_device_t *dev)
+{
+ while (!dnbd3_flag_get(dev->connection_lock))
+ schedule();
+ dnbd3_close_device(dev);
+ dnbd3_sysfs_exit(dev);
+ del_gendisk(dev->disk);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
+ blk_cleanup_queue(dev->queue);
+ put_disk(dev->disk);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
+ && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
+ blk_cleanup_disk(dev->disk);
+#else
+ put_disk(dev->disk);
+#endif
+ blk_mq_free_tag_set(&dev->tag_set);
+ mutex_destroy(&dev->alt_servers_lock);
+ return 0;
+}
+
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
+{
+ struct request *blk_request;
+ unsigned long flags;
+ struct list_head local_copy;
+ int count = 0;
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- spin_unlock_irq(q->queue_lock);
- wake_up(&dev->process_queue_send);
- spin_lock_irq(q->queue_lock);
+ INIT_LIST_HEAD(&local_copy);
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
+ }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ list_add_tail(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
}
+ // Do this even if we didn't move anything from the recv list to the send
+ // list. It might have already contained something, which needs to be
+ // re-requested anyway if this was called because of a server switch.
+ spin_lock_irqsave(&dev->blk_lock, flags);
+ queue_work(dev->send_wq, &dev->send_work);
+ spin_unlock_irqrestore(&dev->blk_lock, flags);
}
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
- struct request *blk_request, *tmp_request;
- struct request *blk_request2, *tmp_request2;
+ struct request *blk_request;
unsigned long flags;
struct list_head local_copy;
- int dup;
+ int count = 0;
+
INIT_LIST_HEAD(&local_copy);
- spin_lock_irqsave(&dev->blk_lock, flags);
- while (!list_empty(&dev->request_queue_receive))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_lock_irqsave(&dev->recv_queue_lock, flags);
+ while (!list_empty(&dev->recv_queue)) {
+ blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- while (!list_empty(&dev->request_queue_send))
- {
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- dup = 0;
- list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist)
- {
- if (blk_request == blk_request2)
- {
- printk("WARNING: Request is in both lists!\n");
- dup = 1;
- break;
- }
- }
- if (!dup) list_add(&blk_request->queuelist, &local_copy);
- }
+ spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
+ spin_lock_irqsave(&dev->send_queue_lock, flags);
+ while (!list_empty(&dev->send_queue)) {
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ list_add(&blk_request->queuelist, &local_copy);
+ count++;
}
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist)
- {
+ spin_unlock_irqrestore(&dev->send_queue_lock, flags);
+ if (count)
+ dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
+ while (!list_empty(&local_copy)) {
+ blk_request = list_entry(local_copy.next, struct request, queuelist);
list_del_init(&blk_request->queuelist);
- if (dnbd3_req_fs(blk_request))
- {
- spin_lock_irqsave(&dev->blk_lock, flags);
- __blk_end_request_all(blk_request, -EIO);
- spin_unlock_irqrestore(&dev->blk_lock, flags);
- }
- else if (dnbd3_req_special(blk_request))
- {
- kfree(blk_request);
- }
+ blk_mq_end_request(blk_request, BLK_STS_IOERR);
}
}
diff --git a/src/kernel/blk.h b/src/kernel/blk.h
index 5091d19..c6dcb8d 100644
--- a/src/kernel/blk.h
+++ b/src/kernel/blk.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,22 +22,17 @@
#ifndef BLK_H_
#define BLK_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV
-#endif
-
-extern struct block_device_operations dnbd3_blk_ops;
-
-int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
-
-void dnbd3_blk_request(struct request_queue *q);
+// The device has been set up via IOCTL_OPEN and hasn't been closed yet
+#define device_active(dev) ((dev)->reported_size != 0)
int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor);
int dnbd3_blk_del_device(dnbd3_device_t *dev);
+void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev);
+
void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev);
#endif /* BLK_H_ */
diff --git a/src/kernel/core.c b/src/kernel/core.c
deleted file mode 100644
index 69a2540..0000000
--- a/src/kernel/core.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include "clientconfig.h"
-#include "dnbd3.h"
-#include "blk.h"
-
-int major;
-static unsigned int max_devs = NUMBER_DEVICES;
-static dnbd3_device_t *dnbd3_device;
-
-static int __init dnbd3_init(void)
-{
- int i;
-
- dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL);
- if (!dnbd3_device)
- return -ENOMEM;
-
- // initialize block device
- if ((major = register_blkdev(0, "dnbd3")) == 0)
- {
- printk("ERROR: dnbd3 register_blkdev failed.\n");
- return -EIO;
- }
-
- printk("DNBD3 kernel module loaded. Machine type: " ENDIAN_MODE "\n");
-
- // add MAX_NUMBER_DEVICES devices
- for (i = 0; i < max_devs; i++)
- {
- if (dnbd3_blk_add_device(&dnbd3_device[i], i) != 0)
- {
- printk("ERROR: adding device failed.\n");
- return -EIO; // TODO: delete all devices added so far. it could happen that it's not the first one that fails. also call unregister_blkdev and free memory
- }
- }
-
- printk("INFO: dnbd3 init successful (%i devices).\n", max_devs);
- return 0;
-}
-
-static void __exit dnbd3_exit(void)
-{
- int i;
-
- for (i = 0; i < max_devs; i++)
- {
- dnbd3_blk_del_device(&dnbd3_device[i]);
- }
-
- unregister_blkdev(major, "dnbd3");
- kfree(dnbd3_device);
- printk("INFO: dnbd3 exit.\n");
-}
-
-module_init( dnbd3_init);
-module_exit( dnbd3_exit);
-
-MODULE_DESCRIPTION("Distributed Network Block Device 3");
-MODULE_LICENSE("GPL");
-
-module_param(max_devs, int, 0444);
-MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h
deleted file mode 100644
index f8af69f..0000000
--- a/src/kernel/dnbd3.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef DNBD_H_
-#define DNBD_H_
-
-#include <linux/version.h>
-#include <linux/kthread.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <net/sock.h>
-
-#define KERNEL_MODULE
-#include "config.h"
-#include "types.h"
-#include "serialize.h"
-
-extern int major;
-
-typedef struct
-{
- dnbd3_host_t host;
- unsigned long rtts[4]; // Last four round trip time measurements in µs
- uint16_t protocol_version; // dnbd3 protocol version of this server
- uint8_t failures; // How many times the server was unreachable
-} dnbd3_server_t;
-
-typedef struct
-{
- // block
- struct gendisk *disk;
- spinlock_t blk_lock;
-
- // sysfs
- struct kobject kobj;
-
- // network
- char *imgname;
- struct socket *sock;
- dnbd3_server_t cur_server, initial_server;
- unsigned long cur_rtt;
- serialized_buffer_t payload_buffer;
- dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers
- int new_servers_num; // number of new alt servers that are waiting to be copied to above array
- dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers
- uint8_t discover, panic, disconnecting, update_available, panic_count;
- uint8_t use_server_provided_alts;
- uint16_t rid;
- uint32_t heartbeat_count;
- uint64_t reported_size;
- // server switch
- struct socket *better_sock;
-
- // process
- struct task_struct * thread_send;
- struct task_struct * thread_receive;
- struct task_struct *thread_discover;
- struct timer_list hb_timer;
- wait_queue_head_t process_queue_send;
- wait_queue_head_t process_queue_receive;
- wait_queue_head_t process_queue_discover;
- struct list_head request_queue_send;
- struct list_head request_queue_receive;
-
-} dnbd3_device_t;
-
-#endif /* DNBD_H_ */
diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c
new file mode 100644
index 0000000..cb42567
--- /dev/null
+++ b/src/kernel/dnbd3_main.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <dnbd3/config/client.h>
+#include <dnbd3/version.h>
+#include <net/ipv6.h>
+#include "dnbd3_main.h"
+#include "blk.h"
+
+int major;
+static unsigned int max_devs = NUMBER_DEVICES;
+static dnbd3_device_t *dnbd3_devices;
+
+struct device *dnbd3_device_to_dev(dnbd3_device_t *dev)
+{
+ return disk_to_dev(dev->disk);
+}
+
+int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest)
+{
+ struct sockaddr_in *sin4;
+ struct sockaddr_in6 *sin6;
+
+ memset(dest, 0, sizeof(*dest));
+ if (host->type == HOST_IP4) {
+ sin4 = (struct sockaddr_in *)dest;
+ sin4->sin_family = AF_INET;
+ memcpy(&(sin4->sin_addr), host->addr, 4);
+ sin4->sin_port = host->port;
+ } else if (host->type == HOST_IP6) {
+ sin6 = (struct sockaddr_in6 *)dest;
+ sin6->sin6_family = AF_INET6;
+ memcpy(&(sin6->sin6_addr), host->addr, 16);
+ sin6->sin6_port = host->port;
+ } else
+ return 0;
+ return 1;
+}
+
+int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y)
+{
+ if (x->ss_family != y->ss_family)
+ return 0;
+ switch (x->ss_family) {
+ case AF_INET: {
+ const struct sockaddr_in *sinx = (const struct sockaddr_in *)x;
+ const struct sockaddr_in *siny = (const struct sockaddr_in *)y;
+
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x;
+ const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y;
+
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * Get a free slot pointer from the alt_servers list. Tries to find an
+ * entirely empty slot first, then looks for a slot with a server that
+ * wasn't reachable recently, finally returns NULL if none of the
+ * conditions match.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+static dnbd3_alt_server_t *get_free_alt_server(dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
+ return &dev->alt_servers[i];
+ }
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].failures > 10)
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev)
+{
+ int i;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (is_same_server(addr, &dev->alt_servers[i].host))
+ return &dev->alt_servers[i];
+ }
+ return NULL;
+}
+
+/**
+ * Returns pointer to existing entry in alt_servers that matches the given
+ * alt server, or NULL if not found.
+ * The caller has to hold dev->alt_servers_lock.
+ */
+dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev)
+{
+ struct sockaddr_storage addr;
+
+ if (!dnbd3_host_to_sockaddr(host, &addr))
+ return NULL;
+ return get_existing_alt_from_addr(&addr, dev);
+}
+
+int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ int result;
+ dnbd3_alt_server_t *alt_server;
+
+ if (host->type != HOST_IP4 && host->type != HOST_IP6)
+ return -EINVAL;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // ADD
+ if (alt_server != NULL) {
+ // Exists
+ result = -EEXIST;
+ } else {
+ // OK add
+ alt_server = get_free_alt_server(dev);
+ if (alt_server == NULL) {
+ result = -ENOSPC;
+ } else {
+ dnbd3_host_to_sockaddr(host, &alt_server->host);
+ alt_server->protocol_version = 0;
+ alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2]
+ = alt_server->rtts[3] = RTT_UNREACHABLE;
+ alt_server->failures = 0;
+ alt_server->best_count = 0;
+ result = 0;
+ }
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host)
+{
+ dnbd3_alt_server_t *alt_server;
+ int result;
+
+ /* protect access to 'alt_servers' */
+ mutex_lock(&dev->alt_servers_lock);
+ alt_server = get_existing_alt_from_host(host, dev);
+ // REMOVE
+ if (alt_server == NULL) {
+ // Not found
+ result = -ENOENT;
+ } else {
+ // Remove
+ alt_server->host.ss_family = 0;
+ result = 0;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
+ return result;
+}
+
+static int __init dnbd3_init(void)
+{
+ int i;
+
+ dnbd3_devices = kcalloc(max_devs, sizeof(*dnbd3_devices), GFP_KERNEL);
+ if (!dnbd3_devices)
+ return -ENOMEM;
+
+ // initialize block device
+ major = register_blkdev(0, "dnbd3");
+	if (major <= 0) {
+ pr_err("register_blkdev failed\n");
+ return -EIO;
+ }
+
+ pr_info("kernel module in version %s loaded\n", DNBD3_VERSION);
+ pr_debug("machine type %s\n", DNBD3_ENDIAN_MODE);
+
+ // add MAX_NUMBER_DEVICES devices
+ for (i = 0; i < max_devs; i++) {
+ if (dnbd3_blk_add_device(&dnbd3_devices[i], i) != 0) {
+ pr_err("dnbd3_blk_add_device failed\n");
+ // TODO: delete all devices added so far.
+ // It could happen that it's not the first one that fails.
+ // Also call unregister_blkdev and free memory.
+ return -EIO;
+ }
+ }
+
+ pr_info("init successful (%i devices)\n", max_devs);
+
+ return 0;
+}
+
+static void __exit dnbd3_exit(void)
+{
+ int i;
+
+ pr_debug("exiting kernel module...\n");
+ for (i = 0; i < max_devs; i++)
+ dnbd3_blk_del_device(&dnbd3_devices[i]);
+
+ unregister_blkdev(major, "dnbd3");
+ kfree(dnbd3_devices);
+
+ pr_info("exit kernel module done\n");
+}
+
+module_init(dnbd3_init);
+module_exit(dnbd3_exit);
+
+MODULE_DESCRIPTION("Distributed Network Block Device 3");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DNBD3_VERSION);
+
+module_param(max_devs, int, 0444);
+MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");
diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h
new file mode 100644
index 0000000..a932ba2
--- /dev/null
+++ b/src/kernel/dnbd3_main.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is part of the Distributed Network Block Device 3
+ *
+ * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
+ *
+ * This file may be licensed under the terms of the
+ * GNU General Public License Version 2 (the ``GPL'').
+ *
+ * Software distributed under the License is distributed
+ * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied. See the GPL for the specific language
+ * governing rights and limitations.
+ *
+ * You should have received a copy of the GPL along with this
+ * program. If not, go to http://www.gnu.org/licenses/gpl.html
+ * or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ */
+
+#ifndef DNBD_H_
+#define DNBD_H_
+
+#include <dnbd3/config/client.h>
+
+#include <linux/version.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <net/sock.h>
+
+#include <dnbd3/config.h>
+#include <dnbd3/types.h>
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/blk-mq.h>
+
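+/*
+ * RHEL kernels backport features without bumping LINUX_VERSION_CODE, so
+ * version checks can additionally test RHEL_RELEASE_CODE via this macro;
+ * on non-RHEL kernels the condition simply evaluates to 0.
+ */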
+#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION)
+#define RHEL_CHECK_VERSION(CONDITION) (CONDITION)
+#else
+#define RHEL_CHECK_VERSION(CONDITION) (0)
+#endif
+
+extern int major;
+
+typedef struct {
+ unsigned long rtts[DISCOVER_HISTORY_SIZE]; // Last X round trip time measurements in µs
+ uint16_t protocol_version; // dnbd3 protocol version of this server
+ uint8_t failures; // How many times the server was unreachable
+ uint8_t best_count; // Number of times server measured best
+ struct sockaddr_storage host; // Address of server
+} dnbd3_alt_server_t;
+
+typedef struct {
+ // block
+ int index;
+ struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct request_queue *queue;
+ spinlock_t blk_lock;
+
+ // sysfs
+ struct kobject kobj;
+
+ char *imgname;
+ uint16_t rid;
+ struct socket *sock;
+ struct { // use blk_lock
+ unsigned long rtt;
+ struct sockaddr_storage host;
+ uint16_t protocol_version;
+ } cur_server;
+ serialized_buffer_t payload_buffer;
+ struct mutex alt_servers_lock;
+ dnbd3_alt_server_t alt_servers[NUMBER_SERVERS];
+ bool use_server_provided_alts;
+ bool panic;
+ u8 panic_count;
+ bool update_available;
+ atomic_t connection_lock;
+	// Size of image/device - this is 0 if the device is not in use,
+ // otherwise this is also the value we expect from alt servers.
+ uint64_t reported_size;
+ struct delayed_work keepalive_work;
+
+ // sending
+ struct workqueue_struct *send_wq;
+ spinlock_t send_queue_lock;
+ struct list_head send_queue;
+ struct mutex send_mutex;
+ struct work_struct send_work;
+ // receiving
+ struct workqueue_struct *recv_wq;
+ spinlock_t recv_queue_lock;
+ struct list_head recv_queue;
+ struct mutex recv_mutex;
+ struct work_struct recv_work;
+ // discover
+ atomic_t discover_running;
+ struct delayed_work discover_work;
+ u32 discover_interval;
+ u32 discover_count;
+
+} dnbd3_device_t;
+
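+/*
+ * Per-request private data (see blk_mq_rq_to_pdu()); the handle is used
+ * to match server replies to their originating block requests.
+ */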
+struct dnbd3_cmd {
+ u64 handle;
+};
+
+extern inline struct device *dnbd3_device_to_dev(dnbd3_device_t *dev);
+
+extern inline int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y);
+
+extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev);
+
+extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr,
+ dnbd3_device_t *const dev);
+
+extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
+extern int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host);
+
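+/*
+ * Minimal atomic flags: dnbd3_flag_get() returns true only for the caller
+ * that flips the flag from 0 to 1, dnbd3_flag_reset() clears it again,
+ * and dnbd3_flag_taken() merely checks whether it is currently set.
+ */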
+#define dnbd3_flag_get(x) (atomic_cmpxchg(&(x), 0, 1) == 0)
+#define dnbd3_flag_reset(x) atomic_set(&(x), 0)
+#define dnbd3_flag_taken(x) (atomic_read(&(x)) != 0)
+
+/*
+ * shims for making older kernels look like the current one, if possible, to avoid too
+ * much inline #ifdef which makes code harder to read.
+ */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
+#define BLK_EH_DONE BLK_EH_NOT_HANDLED
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0)
+#define blk_status_t int
+#define BLK_STS_OK 0
+#define BLK_STS_IOERR (-EIO)
+#define BLK_STS_TIMEOUT (-ETIME)
+#define BLK_STS_NOTSUPP (-ENOTSUPP)
+#endif
+
+#endif /* DNBD_H_ */
diff --git a/src/kernel/net.c b/src/kernel/net.c
index 9e48b86..5ef4016 100644
--- a/src/kernel/net.c
+++ b/src/kernel/net.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -18,1106 +19,1112 @@
*
*/
-#include "clientconfig.h"
+#include <dnbd3/config/client.h>
#include "net.h"
#include "blk.h"
-#include "utils.h"
+#include "dnbd3_main.h"
-#include "serialize.h"
+#include <dnbd3/shared/serialize.h>
+
+#include <linux/random.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
+#define get_random_u32 prandom_u32
+#endif
#include <linux/time.h>
-#include <linux/signal.h>
+#include <linux/ktime.h>
+#include <linux/tcp.h>
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
-#else
-#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern((af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock)
+#ifndef ktime_to_s
+#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC)
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
-// cmd_flags and cmd_type are merged into cmd_flags now
-#if REQ_FLAG_BITS > 24
-#error "Fix CMD bitshift"
-#endif
-// Pack into cmd_flags field by shifting CMD_* into unused bits of cmd_flags
-#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = REQ_OP_DRV_IN | ((cmd) << REQ_FLAG_BITS)
-#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS)
-#define dnbd3_req_op(req) req_op(req)
-#define DNBD3_DEV_READ REQ_OP_READ
-#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
+#ifdef DEBUG
+#define ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+ } while (0)
#else
-// Old way with type and flags separated
-#define dnbd3_cmd_to_priv(req, cmd) do { \
- (req)->cmd_type = REQ_TYPE_SPECIAL; \
- (req)->cmd_flags = (cmd); \
-} while (0)
-#define dnbd3_priv_to_cmd(req) (req)->cmd_flags
-#define dnbd3_req_op(req) (req)->cmd_type
-#define DNBD3_DEV_READ REQ_TYPE_FS
-#define DNBD3_REQ_OP_SPECIAL REQ_TYPE_SPECIAL
+#define ASSERT(x) \
+ do { \
+ } while (0)
#endif
-/**
- * Some macros for easier debug output. Location in source-code
- * as well as server IP:port info will be printed.
- * The error_* macros include a "goto error;" at the end
- */
-#if 1 // Change to 0 to disable debug messages
-#define debug_print_va_host(_host, _fmt, ...) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " _fmt " (%s, %pI4:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " _fmt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_va_host(_host, _fmt, ...) do { \
- debug_print_va_host(_host, _fmt, __VA_ARGS__); \
- goto error; \
-} while(0)
-#define debug_dev_va(_fmt, ...) debug_print_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define error_dev_va(_fmt, ...) debug_error_va_host(dev->cur_server.host, _fmt, __VA_ARGS__)
-#define debug_alt_va(_fmt, ...) debug_print_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-#define error_alt_va(_fmt, ...) debug_error_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__)
-
-#define debug_print_host(_host, txt) do { \
- if ((_host).type == HOST_IP4) \
- printk("%s:%d " txt " (%s, %pI4:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
- else \
- printk("%s:%d " txt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \
-} while(0)
-#define debug_error_host(_host, txt) do { \
- debug_print_host(_host, txt); \
- goto error; \
-} while(0)
-#define debug_dev(txt) debug_print_host(dev->cur_server.host, txt)
-#define error_dev(txt) debug_error_host(dev->cur_server.host, txt)
-#define debug_alt(txt) debug_print_host(dev->alt_servers[i].host, txt)
-#define error_alt(txt) debug_error_host(dev->alt_servers[i].host, txt)
-
-#else // Silent
-#define debug_dev(x) do { } while(0)
-#define error_dev(x) goto error
-#define debug_dev_va(x, ...) do { } while(0)
-#define error_dev_va(x, ...) goto error
-#define debug_alt(x) do { } while(0)
-#define error_alt(x) goto error
-#define debug_alt_va(x, ...) do { } while(0)
-#define error_alt_va(x, ...) goto error
-#endif
+#define dnbd3_dev_dbg_host(dev, host, fmt, ...) \
+ dev_dbg(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_info_host(dev, host, fmt, ...) \
+ dev_info(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
+#define dnbd3_dev_err_host(dev, host, fmt, ...) \
+ dev_err(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__)
-static inline int is_same_server(const dnbd3_server_t * const a, const dnbd3_server_t * const b)
-{
- return (a->host.type == b->host.type) && (a->host.port == b->host.port)
- && (0 == memcmp(a->host.addr, b->host.addr, (a->host.type == HOST_IP4 ? 4 : 16)));
-}
+#define dnbd3_dev_dbg_cur(dev, fmt, ...) \
+ dnbd3_dev_dbg_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_info_cur(dev, fmt, ...) \
+ dnbd3_dev_info_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
+#define dnbd3_dev_err_cur(dev, fmt, ...) \
+ dnbd3_dev_err_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__)
-static inline dnbd3_server_t *get_existing_server(const dnbd3_server_entry_t * const newserver,
- dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if ((newserver->host.type == dev->alt_servers[i].host.type)
- && (newserver->host.port == dev->alt_servers[i].host.port)
- && (0
- == memcmp(newserver->host.addr, dev->alt_servers[i].host.addr, (newserver->host.type == HOST_IP4 ? 4 : 16))))
- {
- return &dev->alt_servers[i];
- break;
- }
- }
- return NULL ;
-}
-
-static inline dnbd3_server_t *get_free_alt_server(dnbd3_device_t * const dev)
-{
- int i;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == 0)
- return &dev->alt_servers[i];
- }
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].failures > 10)
- return &dev->alt_servers[i];
- }
- return NULL ;
-}
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes);
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count);
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr);
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size);
-int dnbd3_net_connect(dnbd3_device_t *dev)
-{
- struct request *req1 = NULL;
- struct timeval timeout;
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, u16 protocol_version);
- if (dev->disconnecting) {
- debug_dev("CONNECT: Still disconnecting!!!\n");
- while (dev->disconnecting)
- schedule();
- }
- if (dev->thread_receive != NULL) {
- debug_dev("CONNECT: Still receiving!!!\n");
- while (dev->thread_receive != NULL)
- schedule();
- }
- if (dev->thread_send != NULL) {
- debug_dev("CONNECT: Still sending!!!\n");
- while (dev->thread_send != NULL)
- schedule();
- }
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket **sock_out);
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
- timeout.tv_usec = 0;
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info);
- // do some checks before connecting
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr,
+ struct socket *sock);
- req1 = kmalloc(sizeof(*req1), GFP_ATOMIC );
- if (!req1)
- error_dev("FATAL: Kmalloc(1) failed.");
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd);
- if (dev->cur_server.host.port == 0 || dev->cur_server.host.type == 0 || dev->imgname == NULL )
- error_dev("FATAL: Host, port or image name not set.");
- if (dev->sock)
- error_dev("ERROR: Already connected.");
-
- if (dev->cur_server.host.type != HOST_IP4 && dev->cur_server.host.type != HOST_IP6)
- error_dev_va("ERROR: Unknown address type %d", (int)dev->cur_server.host.type);
-
- debug_dev("INFO: Connecting...");
-
- if (dev->better_sock == NULL )
- {
- // no established connection yet from discovery thread, start new one
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov[2];
- uint16_t rid;
- char *name;
- int mlen;
- init_msghdr(msg);
-
- if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0)
- error_dev("ERROR: Couldn't create socket (v6).");
-
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- dev->sock->sk->sk_allocation = GFP_NOIO;
- if (dev->cur_server.host.type == HOST_IP4)
- {
- struct sockaddr_in sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- memcpy(&(sin.sin_addr), dev->cur_server.host.addr, 4);
- sin.sin_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v4)");
- }
- else
- {
- struct sockaddr_in6 sin;
- memset(&sin, 0, sizeof(sin));
- sin.sin6_family = AF_INET6;
- memcpy(&(sin.sin6_addr), dev->cur_server.host.addr, 16);
- sin.sin6_port = dev->cur_server.host.port;
- if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0)
- error_dev("FATAL: Connection to host failed. (v6)");
- }
- // Request filesize
- dnbd3_request.magic = dnbd3_packet_magic;
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(&dev->payload_buffer);
- serializer_put_uint16(&dev->payload_buffer, PROTOCOL_VERSION);
- serializer_put_string(&dev->payload_buffer, dev->imgname);
- serializer_put_uint16(&dev->payload_buffer, dev->rid);
- serializer_put_uint8(&dev->payload_buffer, 0); // is_server = false
- iov[1].iov_base = &dev->payload_buffer;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&dev->payload_buffer);
- fixup_request(dnbd3_request);
- mlen = sizeof(dnbd3_request) + iov[1].iov_len;
- if (kernel_sendmsg(dev->sock, &msg, iov, 2, mlen) != mlen)
- error_dev("ERROR: Couldn't send CMD_SIZE_REQUEST.");
- // receive reply header
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_dev("FATAL: Received corrupted reply header after CMD_SIZE_REQUEST.");
- // check reply header
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 3 || dnbd3_reply.size > MAX_PAYLOAD
- || dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("FATAL: Received invalid reply to CMD_SIZE_REQUEST, image doesn't exist on server.");
- // receive reply payload
- iov[0].iov_base = &dev->payload_buffer;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(dev->sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_dev("FATAL: Cold not read CMD_SELECT_IMAGE payload on handshake.");
- // handle/check reply payload
- serializer_reset_read(&dev->payload_buffer, dnbd3_reply.size);
- dev->cur_server.protocol_version = serializer_get_uint16(&dev->payload_buffer);
- if (dev->cur_server.protocol_version < MIN_SUPPORTED_SERVER)
- error_dev("FATAL: Server version is lower than min supported version.");
- name = serializer_get_string(&dev->payload_buffer);
- if (dev->rid != 0 && strcmp(name, dev->imgname) != 0)
- error_dev_va("FATAL: Server offers image '%s', requested '%s'", name, dev->imgname);
- if (strlen(dev->imgname) < strlen(name))
- {
- dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_ATOMIC );
- if (dev->imgname == NULL )
- error_dev("FATAL: Reallocating buffer for new image name failed");
- }
- strcpy(dev->imgname, name);
- rid = serializer_get_uint16(&dev->payload_buffer);
- if (dev->rid != 0 && dev->rid != rid)
- error_dev_va("FATAL: Server provides rid %d, requested was %d.", (int)rid, (int)dev->rid);
- dev->rid = rid;
- dev->reported_size = serializer_get_uint64(&dev->payload_buffer);
- if (dev->reported_size < 4096)
- error_dev("ERROR: Reported size by server is < 4096");
- // store image information
- set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
- debug_dev_va("INFO: Filesize: %llu.", dev->reported_size);
- dev->update_available = 0;
- }
- else // Switching server, connection is already established and size request was executed
- {
- debug_dev("INFO: On-the-fly server change.");
- dev->sock = dev->better_sock;
- dev->better_sock = NULL;
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- }
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic);
- dev->panic = 0;
- dev->panic_count = 0;
+static void dnbd3_discover(dnbd3_device_t *dev);
- // Enqueue request to request_queue_send for a fresh list of alt servers
- dnbd3_cmd_to_priv(req1, CMD_GET_SERVERS);
- list_add(&req1->queuelist, &dev->request_queue_send);
+static void dnbd3_internal_discover(dnbd3_device_t *dev);
- // create required threads
- dev->thread_send = kthread_create(dnbd3_net_send, dev, dev->disk->disk_name);
- dev->thread_receive = kthread_create(dnbd3_net_receive, dev, dev->disk->disk_name);
- dev->thread_discover = kthread_create(dnbd3_net_discover, dev, dev->disk->disk_name);
- // start them up
- wake_up_process(dev->thread_send);
- wake_up_process(dev->thread_receive);
- wake_up_process(dev->thread_discover);
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms);
- wake_up(&dev->process_queue_send);
+// Use as write-only dump, don't care about race conditions etc.
+static u8 __garbage_mem[PAGE_SIZE];
- // add heartbeat timer
- dev->heartbeat_count = 0;
+/**
+ * Delayed work that triggers sending of a keepalive packet.
+ */
+static void dnbd3_keepalive_workfn(struct work_struct *work)
+{
+ unsigned long irqflags;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, keepalive_work.work);
-// init_timer_key changed from kernel version 4.14 to 4.15, see and compare to 4.15:
-// https://elixir.bootlin.com/linux/v4.14.32/source/include/linux/timer.h#L98
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
- timer_setup(&dev->hb_timer, dnbd3_net_heartbeat, 0);
-#else
- // Old timer setup
- init_timer(&dev->hb_timer);
- dev->hb_timer.data = (unsigned long)dev;
- dev->hb_timer.function = dnbd3_net_heartbeat;
-#endif
- dev->hb_timer.expires = jiffies + HZ;
- add_timer(&dev->hb_timer);
- return 0;
- error: ;
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ dnbd3_send_empty_request(dev, CMD_KEEPALIVE);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->keepalive_work, KEEPALIVE_INTERVAL * HZ);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
- if (req1)
- kfree(req1);
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-int dnbd3_net_disconnect(dnbd3_device_t *dev)
+/**
+ * Delayed work triggering discovery (alt server check)
+ */
+static void dnbd3_discover_workfn(struct work_struct *work)
{
- if (dev->disconnecting)
- return 0;
-
- if (dev->cur_server.host.port)
- debug_dev("INFO: Disconnecting device.");
-
- dev->disconnecting = 1;
-
- // clear heartbeat timer
- del_timer(&dev->hb_timer);
-
- dev->discover = 0;
-
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
-
- // kill sending and receiving threads
- if (dev->thread_send)
- {
- kthread_stop(dev->thread_send);
- }
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, discover_work.work);
- if (dev->thread_receive)
- {
- kthread_stop(dev->thread_receive);
- }
+ dnbd3_discover(dev);
+}
- if (dev->thread_discover)
- {
- kthread_stop(dev->thread_discover);
- dev->thread_discover = NULL;
- }
+/**
+ * For manually triggering an immediate discovery
+ */
+static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic)
+{
+ unsigned long irqflags;
- // clear socket
- if (dev->sock)
- {
- sock_release(dev->sock);
- dev->sock = NULL;
+ if (!device_active(dev))
+ return;
+ if (panic && dnbd3_flag_get(dev->connection_lock)) {
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (!dev->panic) {
+ // Panic freshly turned on
+ dev->panic = true;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC;
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->connection_lock);
}
- dev->cur_server.host.type = 0;
- dev->cur_server.host.port = 0;
-
- dev->disconnecting = 0;
-
- return 0;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, 1);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg)
-{
- dnbd3_device_t *dev = (dnbd3_device_t *)container_of(arg, dnbd3_device_t, hb_timer);
-#else
-void dnbd3_net_heartbeat(unsigned long arg)
+/**
+ * Wrapper for the actual discover function below. Checks run conditions
+ * and re-schedules the delayed task.
+ */
+static void dnbd3_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = (dnbd3_device_t *)arg;
-#endif
- // Because different events need different intervals, the timer is called once a second.
- // Other intervals can be derived using dev->heartbeat_count.
-#define timeout_seconds(x) (dev->heartbeat_count % (x) == 0)
-
- if (!dev->panic)
- {
- if (timeout_seconds(TIMER_INTERVAL_KEEPALIVE_PACKET))
- {
- struct request *req = kmalloc(sizeof(struct request), GFP_ATOMIC );
- // send keepalive
- if (req)
- {
- dnbd3_cmd_to_priv(req, CMD_KEEPALIVE);
- list_add_tail(&req->queuelist, &dev->request_queue_send);
- wake_up(&dev->process_queue_send);
- }
- else
- {
- debug_dev("ERROR: Couldn't create keepalive request.");
- }
- }
- if ((dev->heartbeat_count > STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_NORMAL))
- || (dev->heartbeat_count <= STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_STARTUP)))
- {
- // Normal discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ unsigned long irqflags;
+
+ if (!device_active(dev) || dnbd3_flag_taken(dev->connection_lock))
+ return; // device not active anymore, or just about to switch
+ if (!dnbd3_flag_get(dev->discover_running))
+ return; // Already busy
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ cancel_delayed_work(&dev->discover_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_internal_discover(dev);
+ dev->discover_count++;
+ // Re-queueing logic
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (device_active(dev)) {
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &dev->discover_work, dev->discover_interval * HZ);
+ if (dev->discover_interval < TIMER_INTERVAL_PROBE_MAX
+ && dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT) {
+ dev->discover_interval += 2;
}
}
- else if (timeout_seconds(TIMER_INTERVAL_PROBE_PANIC))
- {
- // Panic discovery
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
-
- dev->hb_timer.expires = jiffies + HZ;
-
- ++dev->heartbeat_count;
- add_timer(&dev->hb_timer);
-#undef timeout_seconds
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_flag_reset(dev->discover_running);
}
-int dnbd3_net_discover(void *data)
+/**
+ * Discovery. Probe all (or some) known alt servers,
+ * and initiate a connection switch if appropriate.
+ */
+static void dnbd3_internal_discover(dnbd3_device_t *dev)
{
- dnbd3_device_t *dev = data;
- struct sockaddr_in sin4;
- struct sockaddr_in6 sin6;
struct socket *sock, *best_sock = NULL;
+ dnbd3_alt_server_t *alt;
+ struct sockaddr_storage host_compare, best_server;
+ uint16_t remote_version;
+ ktime_t start, end;
+ unsigned long rtt = 0, best_rtt = 0;
+ int i, j, k, isize, fails, rtt_threshold;
+ int do_change = 0;
+ u8 check_order[NUMBER_SERVERS];
+ const bool ready = dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT;
+ const u32 turn = dev->discover_count % DISCOVER_HISTORY_SIZE;
+
+ // Shuffle alt_servers
+ for (i = 0; i < NUMBER_SERVERS; ++i)
+ check_order[i] = i;
- dnbd3_request_t dnbd3_request;
- dnbd3_reply_t dnbd3_reply;
- dnbd3_server_t *alt_server;
- struct msghdr msg;
- struct kvec iov[2];
-
- char *buf, *name;
- serialized_buffer_t *payload;
- uint64_t filesize;
- uint16_t rid;
-
- struct timeval start, end;
- unsigned long rtt, best_rtt = 0;
- unsigned long irqflags;
- int i, j, isize, best_server, current_server;
- int turn = 0;
- int ready = 0, do_change = 0;
- char check_order[NUMBER_SERVERS];
- int mlen;
-
- struct request *last_request = (struct request *)123, *cur_request = (struct request *)456;
-
- struct timeval timeout;
- timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DISCOVERY;
- timeout.tv_usec = 0;
-
- memset(&sin4, 0, sizeof(sin4));
- memset(&sin6, 0, sizeof(sin6));
-
- init_msghdr(msg);
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ j = get_random_u32() % NUMBER_SERVERS;
+ if (j != i) {
+ int tmp = check_order[i];
- buf = kmalloc(4096, GFP_KERNEL);
- if (!buf)
- {
- debug_dev("FATAL: Kmalloc failed (discover)");
- return -1;
+ check_order[i] = check_order[j];
+ check_order[j] = tmp;
+ }
}
- payload = (serialized_buffer_t *)buf; // Reuse this buffer to save kernel mem
- dnbd3_request.magic = dnbd3_packet_magic;
+ best_server.ss_family = 0;
+ best_rtt = RTT_UNREACHABLE;
- for (i = 0; i < NUMBER_SERVERS; ++i) {
- check_order[i] = i;
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_discover,
- kthread_should_stop() || dev->discover || dev->thread_discover == NULL);
+ if (!ready || dev->panic)
+ isize = NUMBER_SERVERS;
+ else
+ isize = 3;
- if (kthread_should_stop() || dev->imgname == NULL || dev->thread_discover == NULL )
+ for (j = 0; j < NUMBER_SERVERS; ++j) {
+ if (!device_active(dev))
break;
+ i = check_order[j];
+ mutex_lock(&dev->alt_servers_lock);
+ host_compare = dev->alt_servers[i].host;
+ fails = dev->alt_servers[i].failures;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (host_compare.ss_family == 0)
+ continue; // Empty slot
+ // Reduced probability for hosts that have been unreachable
+ if (!dev->panic && fails > 50 && (get_random_u32() % 4) != 0)
+ continue; // If not in panic mode, skip server if it failed too many times
+ if (isize-- <= 0 && !is_same_server(&dev->cur_server.host, &host_compare))
+ continue; // Only test isize servers plus current server
+
+ // Initialize socket and connect
+ sock = NULL;
+ if (dnbd3_connect(dev, &host_compare, &sock) != 0)
+ goto error;
- if (!dev->discover)
- continue;
- dev->discover = 0;
-
- if (dev->reported_size < 4096)
- continue;
-
- // Check if the list of alt servers needs to be updated and do so if necessary
- if (dev->new_servers_num)
- {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- for (i = 0; i < dev->new_servers_num; ++i)
- {
- if (dev->new_servers[i].host.type != HOST_IP4 && dev->new_servers[i].host.type != HOST_IP6) // Invalid entry?
- continue;
- alt_server = get_existing_server(&dev->new_servers[i], dev);
- if (alt_server != NULL ) // Server already known
- {
- if (dev->new_servers[i].failures == 1)
- {
- // REMOVE request
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Removing alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Removing alt server %pI6", alt_server->host.addr);
- alt_server->host.type = 0;
- continue;
- }
- // ADD, so just reset fail counter
- alt_server->failures = 0;
- continue;
- }
- if (dev->new_servers[i].failures == 1) // REMOVE, but server is not in list anyways
- continue;
- alt_server = get_free_alt_server(dev);
- if (alt_server == NULL ) // All NUMBER_SERVERS slots are taken, ignore entry
- continue;
- // Add new server entry
- alt_server->host = dev->new_servers[i].host;
- if (alt_server->host.type == HOST_IP4)
- debug_dev_va("Adding alt server %pI4", alt_server->host.addr);
- else
- debug_dev_va("Adding alt server %pI6", alt_server->host.addr);
- alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE;
- alt_server->protocol_version = 0;
- alt_server->failures = 0;
- }
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
+ remote_version = 0;
+ if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false))
+ goto error;
- current_server = best_server = -1;
- best_rtt = 0xFFFFFFFul;
- if (dev->heartbeat_count < STARTUP_MODE_DURATION || dev->panic)
- {
- isize = NUMBER_SERVERS;
- }
- else
- {
- isize = 3;
- }
- if (NUMBER_SERVERS > isize) {
- for (i = 0; i < isize; ++i) {
- j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS;
- if (j != i) {
- mlen = check_order[i];
- check_order[i] = check_order[j];
- check_order[j] = mlen;
- }
+ // panic mode, take first responding server
+ if (dev->panic) {
+ dnbd3_dev_info_host(dev, &host_compare, "panic mode, changing to new server\n");
+ if (!dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_host(dev, &host_compare, "...raced, ignoring\n");
+ } else {
+ // Check global flag, a connect might have been in progress
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000);
+ if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0)
+ sock_release(sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
}
}
- for (j = 0; j < NUMBER_SERVERS; ++j)
- {
- i = check_order[j];
- if (dev->alt_servers[i].host.type == 0) // Empty slot
- continue;
- if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times
- continue;
- if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- continue;
-
- // Initialize socket and connect
- if (dnbd3_sock_create(dev->alt_servers[i].host.type, SOCK_STREAM, IPPROTO_TCP, &sock) < 0)
- {
- debug_alt("ERROR: Couldn't create socket (discover).");
- sock = NULL;
- continue;
- }
- kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
- kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
- sock->sk->sk_allocation = GFP_NOIO;
- if (dev->alt_servers[i].host.type == HOST_IP4)
- {
- sin4.sin_family = AF_INET;
- memcpy(&sin4.sin_addr, dev->alt_servers[i].host.addr, 4);
- sin4.sin_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin4, sizeof(sin4), 0) < 0)
- goto error;
- }
- else
- {
- sin6.sin6_family = AF_INET6;
- memcpy(&sin6.sin6_addr, dev->alt_servers[i].host.addr, 16);
- sin6.sin6_port = dev->alt_servers[i].host.port;
- if (kernel_connect(sock, (struct sockaddr *)&sin6, sizeof(sin6), 0) < 0)
- goto error;
- }
+		// actual rtt measurement covers just the first block request and its reply
+ start = ktime_get_real();
+ if (!dnbd3_request_test_block(dev, &host_compare, sock))
+ goto error;
+ end = ktime_get_real();
- // Request filesize
- dnbd3_request.cmd = CMD_SELECT_IMAGE;
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
- serializer_reset_write(payload);
- serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
- serializer_put_string(payload, dev->imgname); // image name
- serializer_put_uint16(payload, dev->rid); // revision id
- serializer_put_uint8(payload, 0); // are we a server? (no!)
- iov[1].iov_base = payload;
- dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(payload);
- fixup_request(dnbd3_request);
- mlen = iov[1].iov_len + sizeof(dnbd3_request);
- if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen)
- error_alt("ERROR: Requesting image size failed.");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving image size packet (header) failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4)
- error_alt("ERROR: Content of image size packet (header) mismatched (discover).");
-
- // receive data
- iov[0].iov_base = payload;
- iov[0].iov_len = dnbd3_reply.size;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size)
- error_alt("ERROR: Receiving image size packet (payload) failed (discover).");
- serializer_reset_read(payload, dnbd3_reply.size);
-
- dev->alt_servers[i].protocol_version = serializer_get_uint16(payload);
- if (dev->alt_servers[i].protocol_version < MIN_SUPPORTED_SERVER)
- error_alt_va("ERROR: Server version too old (client: %d, server: %d, min supported: %d).",
- (int)PROTOCOL_VERSION, (int)dev->alt_servers[i].protocol_version, (int)MIN_SUPPORTED_SERVER);
-
- name = serializer_get_string(payload);
- if (name == NULL )
- error_alt("ERROR: Server did not supply an image name (discover).");
-
- if (strcmp(name, dev->imgname) != 0)
- error_alt_va("ERROR: Image name does not match requested one (client: '%s', server: '%s') (discover).",
- dev->imgname, name);
-
- rid = serializer_get_uint16(payload);
- if (rid != dev->rid)
- error_alt_va("ERROR: Server supplied wrong rid (client: '%d', server: '%d') (discover).",
- (int)dev->rid, (int)rid);
-
- filesize = serializer_get_uint64(payload);
- if (filesize != dev->reported_size)
- error_alt_va("ERROR: Reported image size of %llu does not match expected value %llu.(discover).",
- (unsigned long long)filesize, (unsigned long long)dev->reported_size);
-
- // panic mode, take first responding server
- if (dev->panic)
- {
- dev->panic = 0;
- debug_alt("WARN: Panic mode, changing server:");
- if (best_sock != NULL )
- sock_release(best_sock);
- dev->better_sock = sock; // Pass over socket to take a shortcut in *_connect();
- kfree(buf);
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[i], sizeof(dev->cur_server));
- dnbd3_net_connect(dev);
- return 0;
- }
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ dev->alt_servers[i].protocol_version = remote_version;
+ dev->alt_servers[i].rtts[turn] =
+ (unsigned long)ktime_us_delta(end, start);
- // Request block
- dnbd3_request.cmd = CMD_GET_BLOCK;
- // Do *NOT* pick a random block as it has proven to cause severe
- // cache thrashing on the server
- dnbd3_request.offset = 0;
- dnbd3_request.size = RTT_BLOCK_SIZE;
- fixup_request(dnbd3_request);
- iov[0].iov_base = &dnbd3_request;
- iov[0].iov_len = sizeof(dnbd3_request);
-
- // start rtt measurement
- do_gettimeofday(&start);
-
- if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0)
- error_alt("ERROR: Requesting test block failed (discover).");
-
- // receive net reply
- iov[0].iov_base = &dnbd3_reply;
- iov[0].iov_len = sizeof(dnbd3_reply);
- if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply))
- error_alt("ERROR: Receiving test block header packet failed (discover).");
- fixup_reply(dnbd3_reply);
- if (dnbd3_reply.magic
- != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE)
- error_alt_va("ERROR: Unexpected reply to block request: cmd=%d, size=%d (discover).",
- (int)dnbd3_reply.cmd, (int)dnbd3_reply.size);
-
- // receive data
- iov[0].iov_base = buf;
- iov[0].iov_len = RTT_BLOCK_SIZE;
- if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE)
- error_alt("ERROR: Receiving test block payload failed (discover).");
-
- do_gettimeofday(&end); // end rtt measurement
-
- dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull
- + (end.tv_usec - start.tv_usec));
-
- rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2]
- + dev->alt_servers[i].rtts[3]) / 4;
-
- if (best_rtt > rtt)
- {
- // This one is better, keep socket open in case we switch
- best_rtt = rtt;
- best_server = i;
- if (best_sock != NULL )
- sock_release(best_sock);
- best_sock = sock;
- sock = NULL;
- }
- else
- {
- // Not better, discard connection
- sock_release(sock);
- sock = NULL;
- }
+ rtt = 0;
- // update cur servers rtt
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = rtt;
- current_server = i;
- }
+ for (k = 0; k < DISCOVER_HISTORY_SIZE; ++k)
+ rtt += dev->alt_servers[i].rtts[k];
+ rtt /= DISCOVER_HISTORY_SIZE;
dev->alt_servers[i].failures = 0;
+ if (dev->alt_servers[i].best_count > 1)
+ dev->alt_servers[i].best_count -= 2;
+ }
+ mutex_unlock(&dev->alt_servers_lock);
- continue;
-
- error: ;
- ++dev->alt_servers[i].failures;
+ if (best_rtt > rtt) {
+ // This one is better, keep socket open in case we switch
+ best_rtt = rtt;
+ best_server = host_compare;
+ if (best_sock != NULL)
+ sock_release(best_sock);
+ best_sock = sock;
+ sock = NULL;
+ } else {
+ // Not better, discard connection
sock_release(sock);
sock = NULL;
- dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
- if (is_same_server(&dev->cur_server, &dev->alt_servers[i]))
- {
- dev->cur_rtt = RTT_UNREACHABLE;
- current_server = i;
- }
- continue;
}
- if (dev->panic)
- {
- // After 21 retries, bail out by reporting errors to block layer
- if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count < 255 && ++dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
- dnbd3_blk_fail_all_requests(dev);
- }
+		// update current server's rtt
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = rtt;
- if (best_server == -1 || kthread_should_stop() || dev->thread_discover == NULL ) // No alt server could be reached at all or thread should stop
- {
- if (best_sock != NULL ) // Should never happen actually
- {
- sock_release(best_sock);
- best_sock = NULL;
- }
- continue;
- }
+ continue;
- do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0
- && RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500;
-
- if (ready && !do_change) {
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (!list_empty(&dev->request_queue_send))
- {
- cur_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- do_change = (cur_request == last_request);
- if (do_change)
- printk("WARNING: Hung request on %s\n", dev->disk->disk_name);
- }
- else
- {
- cur_request = (struct request *)123;
- }
- last_request = cur_request;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+error:
+ if (sock != NULL) {
+ sock_release(sock);
+ sock = NULL;
}
-
- // take server with lowest rtt
- if (do_change)
- {
- printk("INFO: Server %d on %s is faster (%lluµs vs. %lluµs)\n", best_server, dev->disk->disk_name,
- (unsigned long long)best_rtt, (unsigned long long)dev->cur_rtt);
- kfree(buf);
- dev->better_sock = best_sock; // Take shortcut by continuing to use open connection
- dev->thread_discover = NULL;
- dnbd3_net_disconnect(dev);
- memcpy(&dev->cur_server, &dev->alt_servers[best_server], sizeof(dev->cur_server));
- dev->cur_rtt = best_rtt;
- dnbd3_net_connect(dev);
- return 0;
+ mutex_lock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->alt_servers[i].host, &host_compare)) {
+ if (remote_version)
+ dev->alt_servers[i].protocol_version = remote_version;
+ ++dev->alt_servers[i].failures;
+ dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE;
+ if (dev->alt_servers[i].best_count > 2)
+ dev->alt_servers[i].best_count -= 3;
}
-
- // Clean up connection that was held open for quicker server switch
- if (best_sock != NULL )
- {
- sock_release(best_sock);
- best_sock = NULL;
+ mutex_unlock(&dev->alt_servers_lock);
+ if (is_same_server(&dev->cur_server.host, &host_compare))
+ dev->cur_server.rtt = RTT_UNREACHABLE;
+ } // END - for loop over alt_servers
+
+ if (best_server.ss_family == 0) {
+ // No alt server could be reached
+ ASSERT(!best_sock);
+ if (dev->panic) {
+ if (dev->panic_count < 255)
+ dev->panic_count++;
+ // If probe timeout is set, report error to block layer
+ if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count == PROBE_COUNT_TIMEOUT + 1)
+ dnbd3_blk_fail_all_requests(dev);
}
+ return;
+ }
- if (!ready || (start.tv_usec & 15) != 0)
- turn = (turn + 1) % 4;
- if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing
- ready = 1;
-
+ // If best server was repeatedly measured best, lower the switching threshold more
+ mutex_lock(&dev->alt_servers_lock);
+ alt = get_existing_alt_from_addr(&best_server, dev);
+ if (alt != NULL) {
+ if (alt->best_count < 178)
+ alt->best_count += 3;
+ rtt_threshold = 1800 - (alt->best_count * 10);
+ remote_version = alt->protocol_version;
+ } else {
+ rtt_threshold = 1800;
+ remote_version = 0;
}
- kfree(buf);
- return 0;
+ mutex_unlock(&dev->alt_servers_lock);
+
+ do_change = ready && !is_same_server(&best_server, &dev->cur_server.host)
+ && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold;
+
+ // take server with lowest rtt
+	// if a (dis)connect is already in progress, we do nothing; this is not panic mode
+ if (do_change && device_active(dev) && dnbd3_flag_get(dev->connection_lock)) {
+ dnbd3_dev_info_cur(dev, "server %pISpc is faster (%lluµs vs. %lluµs)\n",
+ &best_server,
+ (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt);
+ set_socket_timeout(best_sock, false, // recv
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_RECV * 1000) + 500);
+ set_socket_timeout(best_sock, true, // send
+ MAX(best_rtt / 1000, SOCKET_TIMEOUT_SEND * 1000) + 500);
+ if (dnbd3_set_primary_connection(dev, best_sock, &best_server, remote_version) != 0)
+ sock_release(best_sock);
+ dnbd3_flag_reset(dev->connection_lock);
+ return;
+ }
+
+ // Clean up connection that was held open for quicker server switch
+ if (best_sock != NULL)
+ sock_release(best_sock);
}
-int dnbd3_net_send(void *data)
+/**
+ * Worker for sending pending requests. This will be triggered whenever
+ * we get a new request from the block layer. The worker will then
+ * work through all the requests in the send queue, request them from
+ * the server, and then return.
+ */
+static void dnbd3_send_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request;
-
- dnbd3_request_t dnbd3_request;
- struct msghdr msg;
- struct kvec iov;
-
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, send_work);
+ struct request *blk_request;
+ struct dnbd3_cmd *cmd;
unsigned long irqflags;
- init_msghdr(msg);
-
- dnbd3_request.magic = dnbd3_packet_magic;
-
- set_user_nice(current, -20);
-
- // move already sent requests to request_queue_send again
- while (!list_empty(&dev->request_queue_receive))
- {
- printk("WARN: Request queue was not empty on %s\n", dev->disk->disk_name);
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- list_del_init(&blk_request->queuelist);
- list_add(&blk_request->queuelist, &dev->request_queue_send);
- }
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
-
- for (;;)
- {
- wait_event_interruptible(dev->process_queue_send, kthread_should_stop() || !list_empty(&dev->request_queue_send));
-
- if (kthread_should_stop())
+ mutex_lock(&dev->send_mutex);
+ while (dev->sock && device_active(dev)) {
+ // extract next block request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ if (list_empty(&dev->send_queue)) {
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
break;
-
- // extract block request
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- if (list_empty(&dev->request_queue_send))
- {
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
- blk_request = list_entry(dev->request_queue_send.next, struct request, queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- // what to do?
- switch (dnbd3_req_op(blk_request))
- {
- case DNBD3_DEV_READ:
- dnbd3_request.cmd = CMD_GET_BLOCK;
- dnbd3_request.offset = blk_rq_pos(blk_request) << 9; // *512
- dnbd3_request.size = blk_rq_bytes(blk_request); // bytes left to complete entire request
- // enqueue request to request_queue_receive
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- list_add_tail(&blk_request->queuelist, &dev->request_queue_receive);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- break;
- case DNBD3_REQ_OP_SPECIAL:
- dnbd3_request.cmd = dnbd3_priv_to_cmd(blk_request);
- dnbd3_request.size = 0;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
+ list_del_init(&blk_request->queuelist);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ // append to receive queue
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_add_tail(&blk_request->queuelist, &dev->recv_queue);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+
+ cmd = blk_mq_rq_to_pdu(blk_request);
+ if (!dnbd3_send_request(dev->sock, CMD_GET_BLOCK, cmd->handle,
+ blk_rq_pos(blk_request) << 9 /* sectors */, blk_rq_bytes(blk_request))) {
+ if (!dnbd3_flag_taken(dev->connection_lock)) {
+ dnbd3_dev_err_cur(dev, "connection to server lost (send)\n");
+ dnbd3_start_discover(dev, true);
+ }
break;
-
- default:
- printk("ERROR: Unknown command (send %u %u)\n", (int)blk_request->cmd_flags, (int)dnbd3_req_op(blk_request));
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
}
-
- // send net request
- dnbd3_request.handle = (uint64_t)(uintptr_t)blk_request; // Double cast to prevent warning on 32bit
- fixup_request(dnbd3_request);
- iov.iov_base = &dnbd3_request;
- iov.iov_len = sizeof(dnbd3_request);
- if (kernel_sendmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_request)) != sizeof(dnbd3_request))
- {
- debug_dev("ERROR: Connection to server lost (send)");
- goto error;
- }
- wake_up(&dev->process_queue_receive);
}
-
- dev->thread_send = NULL;
- return 0;
-
- error: ;
- if (dev->sock)
- kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
- }
- dev->thread_send = NULL;
- return -1;
+ mutex_unlock(&dev->send_mutex);
}
-int dnbd3_net_receive(void *data)
+/**
+ * The receive workfn stays active for as long as the connection to a server
+ * lasts, i.e. it only gets restarted when we switch to a new server.
+ */
+static void dnbd3_recv_workfn(struct work_struct *work)
{
- dnbd3_device_t *dev = data;
- struct request *blk_request, *tmp_request, *received_request;
-
- dnbd3_reply_t dnbd3_reply;
- struct msghdr msg;
- struct kvec iov;
+ dnbd3_device_t *dev = container_of(work, dnbd3_device_t, recv_work);
+ struct request *blk_request;
+ struct request *rq_iter;
+ struct dnbd3_cmd *cmd;
+ dnbd3_reply_t reply_hdr;
struct req_iterator iter;
struct bio_vec bvec_inst;
struct bio_vec *bvec = &bvec_inst;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov;
void *kaddr;
unsigned long irqflags;
- sigset_t blocked, oldset;
uint16_t rid;
- unsigned long int recv_timeout = jiffies;
-
- int count, remaining, ret;
-
- init_msghdr(msg);
- set_user_nice(current, -20);
+ int remaining;
+ int ret;
- while (!kthread_should_stop())
- {
+ mutex_lock(&dev->recv_mutex);
+ while (dev->sock) {
// receive net reply
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = sizeof(dnbd3_reply);
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags);
- if (ret == -EAGAIN)
- {
- if (jiffies < recv_timeout) recv_timeout = jiffies; // Handle overflow
- if ((jiffies - recv_timeout) / HZ > SOCKET_KEEPALIVE_TIMEOUT)
- error_dev_va("ERROR: Receive timeout reached (%d of %d secs).", (int)((jiffies - recv_timeout) / HZ), (int)SOCKET_KEEPALIVE_TIMEOUT);
- continue;
+ ret = dnbd3_recv_reply(dev->sock, &reply_hdr);
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(dev, "remote peer has performed an orderly shutdown\n");
+ goto out_unlock;
+ } else if (ret < 0) {
+ if (ret == -EAGAIN) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "receive timeout reached\n");
+ } else {
+ /* for all errors other than -EAGAIN, print errno */
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "connection to server lost (receive, errno=%d)\n", ret);
+ }
+ goto out_unlock;
}
- if (ret <= 0)
- error_dev("ERROR: Connection to server lost (receive)");
- if (ret != sizeof(dnbd3_reply))
- error_dev("ERROR: Recv msg header.");
- fixup_reply(dnbd3_reply);
- // check error
- if (dnbd3_reply.magic != dnbd3_packet_magic)
- error_dev("ERROR: Wrong packet magic (Receive).");
- if (dnbd3_reply.cmd == 0)
- error_dev("ERROR: Command was 0 (Receive).");
+ /* check if arrived data is valid */
+ if (ret != sizeof(reply_hdr)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv partial msg header (%d/%d bytes)\n",
+ ret, (int)sizeof(reply_hdr));
+ goto out_unlock;
+ }
- // Update timeout
- recv_timeout = jiffies;
+ // check error
+ if (reply_hdr.magic != dnbd3_packet_magic) {
+ dnbd3_dev_err_cur(dev, "wrong packet magic (receive)\n");
+ goto out_unlock;
+ }
// what to do?
- switch (dnbd3_reply.cmd)
- {
+ switch (reply_hdr.cmd) {
case CMD_GET_BLOCK:
// search for replied request in queue
blk_request = NULL;
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_for_each_entry_safe(received_request, tmp_request, &dev->request_queue_receive, queuelist)
- {
- if ((uint64_t)(uintptr_t)received_request == dnbd3_reply.handle) // Double cast to prevent warning on 32bit
- {
- blk_request = received_request;
+ spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
+ list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
+ cmd = blk_mq_rq_to_pdu(rq_iter);
+ if (cmd->handle == reply_hdr.handle) {
+ blk_request = rq_iter;
+ list_del_init(&blk_request->queuelist);
break;
}
}
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- if (blk_request == NULL )
- error_dev_va("ERROR: Received block data for unrequested handle (%llu: %llu).\n",
- (unsigned long long)dnbd3_reply.handle, (unsigned long long)dnbd3_reply.size);
+ spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
+ if (blk_request == NULL) {
+ dnbd3_dev_err_cur(dev, "received block data for unrequested handle (%llx: len=%llu)\n",
+ reply_hdr.handle,
+ (u64)reply_hdr.size);
+ goto out_unlock;
+ }
// receive data and answer to block layer
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0)
- rq_for_each_segment(bvec_inst, blk_request, iter)
+ rq_for_each_segment(bvec_inst, blk_request, iter) {
#else
- rq_for_each_segment(bvec, blk_request, iter)
+ rq_for_each_segment(bvec, blk_request, iter) {
#endif
- {
- siginitsetinv(&blocked, sigmask(SIGKILL));
- sigprocmask(SIG_SETMASK, &blocked, &oldset);
-
kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
iov.iov_base = kaddr;
iov.iov_len = bvec->bv_len;
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags) != bvec->bv_len)
- {
- kunmap(bvec->bv_page);
- sigprocmask(SIG_SETMASK, &oldset, NULL );
- error_dev("ERROR: Receiving from net to block layer.");
- }
+ ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
kunmap(bvec->bv_page);
-
- sigprocmask(SIG_SETMASK, &oldset, NULL );
+ if (ret != bvec->bv_len) {
+ if (ret == 0) {
+ /* have not received any data, but remote peer is shutdown properly */
+ dnbd3_dev_dbg_cur(
+ dev, "remote peer has performed an orderly shutdown\n");
+ } else if (ret < 0) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "disconnect: receiving from net to block layer\n");
+ } else {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev,
+ "receiving from net to block layer (%d bytes)\n", ret);
+ }
+ // Requeue request
+ spin_lock_irqsave(&dev->send_queue_lock, irqflags);
+ list_add(&blk_request->queuelist, &dev->send_queue);
+ spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
+ goto out_unlock;
+ }
}
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- list_del_init(&blk_request->queuelist);
- __blk_end_request_all(blk_request, 0);
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- continue;
+ blk_mq_end_request(blk_request, BLK_STS_OK);
+ break;
case CMD_GET_SERVERS:
- if (!dev->use_server_provided_alts)
- {
- remaining = dnbd3_reply.size;
- goto consume_payload;
- }
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = 0;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- count = MIN(NUMBER_SERVERS, dnbd3_reply.size / sizeof(dnbd3_server_entry_t));
-
- if (count != 0)
- {
- iov.iov_base = dev->new_servers;
- iov.iov_len = count * sizeof(dnbd3_server_entry_t);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags)
- != (count * sizeof(dnbd3_server_entry_t)))
- error_dev("ERROR: Recv CMD_GET_SERVERS payload.");
- spin_lock_irqsave(&dev->blk_lock, irqflags);
- dev->new_servers_num = count;
- spin_unlock_irqrestore(&dev->blk_lock, irqflags);
- }
- // If there were more servers than accepted, remove the remaining data from the socket buffer
- remaining = dnbd3_reply.size - (count * sizeof(dnbd3_server_entry_t));
- consume_payload: while (remaining > 0)
- {
- count = MIN(sizeof(dnbd3_reply), remaining); // Abuse the reply struct as the receive buffer
- iov.iov_base = &dnbd3_reply;
- iov.iov_len = count;
- ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
- if (ret <= 0)
- error_dev("ERROR: Recv additional payload from CMD_GET_SERVERS.");
- remaining -= ret;
+ remaining = reply_hdr.size;
+ if (dev->use_server_provided_alts) {
+ dnbd3_server_entry_t new_server;
+
+ while (remaining >= sizeof(dnbd3_server_entry_t)) {
+ if (dnbd3_recv_bytes(dev->sock, &new_server, sizeof(new_server))
+ != sizeof(new_server)) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dnbd3_dev_err_cur(dev, "recv CMD_GET_SERVERS payload\n");
+ goto out_unlock;
+ }
+ // TODO: Log
+ if (new_server.failures == 0) { // ADD
+ dnbd3_add_server(dev, &new_server.host);
+ } else { // REM
+ dnbd3_rem_server(dev, &new_server.host);
+ }
+ remaining -= sizeof(new_server);
+ }
}
- continue;
+ if (!dnbd3_drain_socket(dev, dev->sock, remaining))
+ goto out_unlock;
+ break;
case CMD_LATEST_RID:
- if (dnbd3_reply.size != 2)
- {
- printk("ERROR: CMD_LATEST_RID.size != 2.\n");
+ if (reply_hdr.size < 2) {
+ dev_err(dnbd3_device_to_dev(dev), "CMD_LATEST_RID.size < 2\n");
continue;
}
- iov.iov_base = &rid;
- iov.iov_len = sizeof(rid);
- if (kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags) <= 0)
- {
- printk("ERROR: Could not receive CMD_LATEST_RID payload.\n");
- }
- else
- {
+ if (dnbd3_recv_bytes(dev->sock, &rid, 2) != 2) {
+ if (!dnbd3_flag_taken(dev->connection_lock))
+ dev_err(dnbd3_device_to_dev(dev), "could not receive CMD_LATEST_RID payload\n");
+ } else {
rid = net_order_16(rid);
- printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)rid, (int)dev->rid);
+ dnbd3_dev_info_cur(dev, "latest rid of %s is %d (currently using %d)\n",
+ dev->imgname, (int)rid, (int)dev->rid);
dev->update_available = (rid > dev->rid ? 1 : 0);
}
+ if (reply_hdr.size > 2)
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size - 2);
continue;
case CMD_KEEPALIVE:
- if (dnbd3_reply.size != 0)
- printk("ERROR: keep alive packet with payload.\n");
+ if (reply_hdr.size != 0) {
+ dev_dbg(dnbd3_device_to_dev(dev), "keep alive packet with payload\n");
+ dnbd3_drain_socket(dev, dev->sock, reply_hdr.size);
+ }
continue;
default:
- printk("ERROR: Unknown command (Receive)\n");
- continue;
+ dev_err(dnbd3_device_to_dev(dev), "unknown command: %d (receive), aborting connection\n", (int)reply_hdr.cmd);
+ goto out_unlock;
+ }
+ }
+out_unlock:
+ // This will check if we actually still need a new connection
+ dnbd3_start_discover(dev, true);
+ mutex_unlock(&dev->recv_mutex);
+}
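/*
 * Overview of the dispatch above: the receive worker reads reply headers in a
 * loop and switches on reply_hdr.cmd. CMD_GET_BLOCK copies the payload into
 * the matching request's bio segments and completes it (a short read requeues
 * the request and aborts the connection); CMD_GET_SERVERS feeds
 * dnbd3_add_server()/dnbd3_rem_server(); CMD_LATEST_RID records whether a
 * newer revision of the image exists; CMD_KEEPALIVE merely drains any stray
 * payload; an unknown command aborts the connection via out_unlock.
 */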
+/**
+ * Set send or receive timeout of given socket
+ */
+static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0)
+ int opt = set_send ? SO_SNDTIMEO_NEW : SO_RCVTIMEO_NEW;
+ struct __kernel_sock_timeval timeout;
+#else
+ int opt = set_send ? SO_SNDTIMEO : SO_RCVTIMEO;
+ struct timeval timeout;
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
+ sockptr_t timeout_ptr = KERNEL_SOCKPTR(&timeout);
+#else
+ char *timeout_ptr = (char *)&timeout;
+#endif
+
+ timeout.tv_sec = timeout_ms / 1000;
+ timeout.tv_usec = (timeout_ms % 1000) * 1000;
+ sock_setsockopt(sock, SOL_SOCKET, opt, timeout_ptr, sizeof(timeout));
+}
+
+static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out)
+{
+ ktime_t start;
+ int ret, connect_time_ms;
+ struct socket *sock;
+ int retries = 4;
+ const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in)
+ : sizeof(struct sockaddr_in6);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0)
+ ret = sock_create_kern(&init_net, addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#else
+ ret = sock_create_kern(addr->ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+#endif
+ if (ret < 0) {
+ dev_err(dnbd3_device_to_dev(dev), "couldn't create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Only one retry, TCP no delay */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0)
+ tcp_sock_set_syncnt(sock->sk, 1);
+ tcp_sock_set_nodelay(sock->sk);
+	/* because of our aggressive timeouts, lingering is pointless */
+ sock_no_linger(sock->sk);
+#else
+	/* legacy path: only set TCP_SYNCNT; the other options aren't that important */
+ ret = 1;
+ kernel_setsockopt(sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&ret, sizeof(ret));
+#endif
+	/* allow this socket to use reserved memory (vm.min_free_kbytes) */
+ sk_set_memalloc(sock->sk);
+ sock->sk->sk_allocation = GFP_NOIO;
+
+ if (dev->panic && dev->panic_count > 1) {
+ /* in panic mode for some time, start increasing timeouts */
+ connect_time_ms = dev->panic_count * 1000;
+ } else {
+ /* otherwise, use 2*RTT of current server */
+ connect_time_ms = dev->cur_server.rtt * 2 / 1000;
+ }
+	/* but obey the configured minimum, and a maximum as a sanity check */
+ if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000)
+ connect_time_ms = SOCKET_TIMEOUT_SEND * 1000;
+ else if (connect_time_ms > 60000)
+ connect_time_ms = 60000;
+ set_socket_timeout(sock, false, connect_time_ms); // recv
+ set_socket_timeout(sock, true, connect_time_ms); // send
+ start = ktime_get_real();
+ while (--retries > 0) {
+ ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0);
+ connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start);
+ if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) {
+			/* There was at least one specific 5.x kernel build where
+			 * SO_RCVTIMEO did not affect the connect call above, so this
+			 * function would hang for over a minute for unreachable hosts.
+			 * Keep this debug check for twice the configured timeout.
+ */
+ dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n",
+ connect_time_ms);
}
+ if (ret != 0) {
+ if (ret == -EINTR)
+ dnbd3_dev_dbg_host(dev, addr, "connect: interrupted system call (blocked %dms)\n",
+ connect_time_ms);
+ else
+ dnbd3_dev_dbg_host(dev, addr, "connect: failed (%d, blocked %dms)\n",
+ ret, connect_time_ms);
+ goto error;
+ }
+ *sock_out = sock;
+ return 0;
}
+error:
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
- printk("dnbd3_net_receive terminated normally.\n");
- dev->thread_receive = NULL;
- return 0;
+#define dnbd3_err_dbg_host(...) do { \
+ if (dev->panic || dev->sock == NULL) \
+ dnbd3_dev_err_host(__VA_ARGS__); \
+ else \
+ dnbd3_dev_dbg_host(__VA_ARGS__); \
+} while (0)
+
+/**
+ * Execute protocol handshake on a newly connected socket.
+ * If this is the initial connection to any server, i.e. we're being called
+ * through the initial ioctl() to open a device, we'll store the rid, filesize
+ * etc. in the dev struct; otherwise, this is a potential switch to another
+ * server, so we validate the filesize, rid and name against what we expect.
+ * The server's protocol version is returned in 'remote_version'
+ */
+static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock,
+ struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_data)
+{
+ unsigned long irqflags;
+ const char *name;
+ uint64_t filesize;
+ int mlen;
+ uint16_t rid;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov[2];
+ serialized_buffer_t *payload;
+ dnbd3_reply_t reply_hdr;
+ dnbd3_request_t request_hdr = { .magic = dnbd3_packet_magic };
+
+ payload = kmalloc(sizeof(*payload), GFP_KERNEL);
+ if (payload == NULL)
+ goto error;
+
+ if (copy_data && device_active(dev))
+ dev_warn(dnbd3_device_to_dev(dev), "Called handshake function with copy_data enabled when reported_size is not zero\n");
+
+ // Request filesize
+ request_hdr.cmd = CMD_SELECT_IMAGE;
+ iov[0].iov_base = &request_hdr;
+ iov[0].iov_len = sizeof(request_hdr);
+ serializer_reset_write(payload);
+ serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version
+ serializer_put_string(payload, dev->imgname); // image name
+ serializer_put_uint16(payload, dev->rid); // revision id
+ serializer_put_uint8(payload, 0); // are we a server? (no!)
+ iov[1].iov_base = payload;
+ request_hdr.size = iov[1].iov_len = serializer_get_written_length(payload);
+ fixup_request(request_hdr);
+ mlen = iov[0].iov_len + iov[1].iov_len;
+ if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) {
+ dnbd3_err_dbg_host(dev, addr, "requesting image size failed\n");
+ goto error;
+ }
+
+ // receive net reply
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving image size packet (header) failed\n");
+ goto error;
+ }
+ if (reply_hdr.magic != dnbd3_packet_magic
+ || reply_hdr.cmd != CMD_SELECT_IMAGE || reply_hdr.size < 4
+ || reply_hdr.size > sizeof(*payload)) {
+ dnbd3_err_dbg_host(dev, addr,
+ "corrupt CMD_SELECT_IMAGE reply\n");
+ goto error;
+ }
+
+ // receive data
+ iov[0].iov_base = payload;
+ iov[0].iov_len = reply_hdr.size;
+ if (kernel_recvmsg(sock, &msg, iov, 1, reply_hdr.size, msg.msg_flags)
+ != reply_hdr.size) {
+ dnbd3_err_dbg_host(dev, addr,
+ "receiving payload of CMD_SELECT_IMAGE reply failed\n");
+ goto error;
+ }
+ serializer_reset_read(payload, reply_hdr.size);
+
+ *remote_version = serializer_get_uint16(payload);
+ name = serializer_get_string(payload);
+ rid = serializer_get_uint16(payload);
+ filesize = serializer_get_uint64(payload);
+
+ if (*remote_version < MIN_SUPPORTED_SERVER) {
+ dnbd3_err_dbg_host(dev, addr,
+ "server version too old (client: %d, server: %d, min supported: %d)\n",
+ (int)PROTOCOL_VERSION, (int)*remote_version,
+ (int)MIN_SUPPORTED_SERVER);
+ goto error;
+ }
+ if (name == NULL) {
+ dnbd3_err_dbg_host(dev, addr, "server did not supply an image name\n");
+ goto error;
+ }
+ if (rid == 0) {
+ dnbd3_err_dbg_host(dev, addr, "server did not supply a revision id\n");
+ goto error;
+ }
+
+ if (copy_data) {
+ if (filesize < DNBD3_BLOCK_SIZE) {
+ dnbd3_err_dbg_host(dev, addr, "reported size by server is < 4096\n");
+ goto error;
+ }
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (strlen(dev->imgname) < strlen(name)) {
+ dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_KERNEL);
+ if (dev->imgname == NULL) {
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ dnbd3_err_dbg_host(dev, addr, "reallocating buffer for new image name failed\n");
+ goto error;
+ }
+ }
+ strcpy(dev->imgname, name);
+ dev->rid = rid;
+ // store image information
+ dev->reported_size = filesize;
+ dev->update_available = 0;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
+ dnbd3_dev_dbg_host(dev, addr, "image size: %llu\n", dev->reported_size);
+ } else {
+ /* switching connection, sanity checks */
+ if (rid != dev->rid) {
+ dnbd3_err_dbg_host(dev, addr,
+ "server supplied wrong rid (client: '%d', server: '%d')\n",
+ (int)dev->rid, (int)rid);
+ goto error;
+ }
+
+ if (strcmp(name, dev->imgname) != 0) {
+ dnbd3_err_dbg_host(dev, addr, "server offers image '%s', requested '%s'\n", name, dev->imgname);
+ goto error;
+ }
+
+ if (filesize != dev->reported_size) {
+ dnbd3_err_dbg_host(dev, addr,
+ "reported image size of %llu does not match expected value %llu\n",
+ (unsigned long long)filesize, (unsigned long long)dev->reported_size);
+ goto error;
+ }
+ }
+ kfree(payload);
+ return true;
+
+error:
+ kfree(payload);
+ return false;
+}
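/*
 * For orientation, the handshake above is, at the message level:
 *
 *   client -> server: dnbd3_request_t { .cmd = CMD_SELECT_IMAGE, .size = N }
 *                     + N bytes of serialized payload:
 *                       uint16 protocol version, string image name,
 *                       uint16 rid, uint8 is_server (0 here)
 *   server -> client: dnbd3_reply_t { .cmd = CMD_SELECT_IMAGE, .size = M }
 *                     + M bytes of serialized payload:
 *                       uint16 server version, string image name,
 *                       uint16 rid, uint64 file size
 *
 * mirroring the serializer_put_*()/serializer_get_*() calls in
 * dnbd3_execute_handshake(); the actual byte encoding is defined by the
 * shared serializer (src/shared/serialize.c, symlinked below).
 */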
+
+static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ dnbd3_request_t request_hdr = {
+ .magic = dnbd3_packet_magic,
+ .cmd = cmd,
+ .size = size,
+ .offset = offset,
+ .handle = handle,
+ };
+ struct kvec iov = { .iov_base = &request_hdr, .iov_len = sizeof(request_hdr) };
+
+ fixup_request(request_hdr);
+ return kernel_sendmsg(sock, &msg, &iov, 1, sizeof(request_hdr)) == sizeof(request_hdr);
+}
+
+/**
+ * Send a request with given cmd type and empty payload.
+ */
+static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd)
+{
+ int ret;
+
+ mutex_lock(&dev->send_mutex);
+ ret = dev->sock
+ && dnbd3_send_request(dev->sock, cmd, 0, 0, 0);
+ mutex_unlock(&dev->send_mutex);
+ return ret;
+}
+
+static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL };
+ struct kvec iov = { .iov_base = buffer, .iov_len = count };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, count, msg.msg_flags);
+}
+
+static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr)
+{
+ int ret = dnbd3_recv_bytes(sock, reply_hdr, sizeof(*reply_hdr));
+
+ fixup_reply(*reply_hdr);
+ return ret;
+}
+
+static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes)
+{
+ int ret;
+ struct kvec iov;
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+
+ while (bytes > 0) {
+ iov.iov_base = __garbage_mem;
+ iov.iov_len = sizeof(__garbage_mem);
+ ret = kernel_recvmsg(sock, &msg, &iov, 1, MIN(bytes, iov.iov_len), msg.msg_flags);
+ if (ret <= 0) {
+ dnbd3_dev_err_cur(dev, "draining payload failed (ret=%d)\n", ret);
+ return false;
+ }
+ bytes -= ret;
+ }
+ return true;
+}
+
+static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock)
+{
+ dnbd3_reply_t reply_hdr;
+
+ // Request block
+ if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) {
+ dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n");
+ return false;
+ }
+
+ // receive net reply
+ if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) {
+ dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n");
+ return false;
+ }
+ if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK
+ || reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) {
+ dnbd3_err_dbg_host(dev, addr,
+ "unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n",
+ (int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle);
+ return false;
+ }
- error:
+ // receive data
+ return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE);
+}
+#undef dnbd3_err_dbg_host
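/*
 * Hypothetical sketch (not part of the patch): the discover path presumably
 * wraps dnbd3_request_test_block() in a timing measurement along these lines
 * to estimate a server's RTT. The function name and error convention are
 * illustrative only; everything else uses helpers defined above.
 */
static u64 example_measure_rtt_us(dnbd3_device_t *dev,
				  struct sockaddr_storage *addr, struct socket *sock)
{
	ktime_t start = ktime_get_real();

	if (!dnbd3_request_test_block(dev, addr, sock))
		return U64_MAX; /* treat a failed test block as unusable */
	return (u64)ktime_us_delta(ktime_get_real(), start);
}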
+
+static void replace_main_socket(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ mutex_lock(&dev->send_mutex);
+	// First, shut down the connection so the receive worker will leave its main loop
if (dev->sock)
kernel_sock_shutdown(dev->sock, SHUT_RDWR);
- if (!dev->disconnecting)
- {
- dev->panic = 1;
- dev->discover = 1;
- wake_up(&dev->process_queue_discover);
+ mutex_lock(&dev->recv_mutex);
+ // Receive worker is done, get rid of socket and replace
+ if (dev->sock)
+ sock_release(dev->sock);
+ dev->sock = sock;
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (addr == NULL) {
+ memset(&dev->cur_server, 0, sizeof(dev->cur_server));
+ } else {
+ dev->cur_server.host = *addr;
+ dev->cur_server.rtt = 0;
+ dev->cur_server.protocol_version = protocol_version;
}
- dev->thread_receive = NULL;
- return -1;
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ mutex_unlock(&dev->recv_mutex);
+ mutex_unlock(&dev->send_mutex);
}
+static void dnbd3_release_resources(dnbd3_device_t *dev)
+{
+ if (dev->send_wq)
+ destroy_workqueue(dev->send_wq);
+ dev->send_wq = NULL;
+ if (dev->recv_wq)
+ destroy_workqueue(dev->recv_wq);
+ dev->recv_wq = NULL;
+ mutex_destroy(&dev->send_mutex);
+ mutex_destroy(&dev->recv_mutex);
+}
+
+/**
+ * Establish new connection on a dnbd3 device.
+ * Return 0 on success, negative errno otherwise.
+ */
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init)
+{
+ unsigned long irqflags;
+ struct socket *sock = NULL;
+ uint16_t proto_version;
+ int ret;
+
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (init && device_active(dev)) {
+ dnbd3_dev_err_cur(dev, "device already configured/connected\n");
+ return -EBUSY;
+ }
+ if (!init && !device_active(dev)) {
+ dev_warn(dnbd3_device_to_dev(dev), "connection switch called on unconfigured device\n");
+ return -ENOTCONN;
+ }
+
+ dnbd3_dev_dbg_host(dev, addr, "connecting...\n");
+ ret = dnbd3_connect(dev, addr, &sock);
+ if (ret != 0 || sock == NULL)
+ goto error;
+
+ /* execute the "select image" handshake */
+ // if init is true, reported_size will be set
+ if (!dnbd3_execute_handshake(dev, sock, addr, &proto_version, init)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (init) {
+ // We're setting up the device for use - allocate resources
+ // Do not goto error before this
+ ASSERT(!dev->send_wq);
+ ASSERT(!dev->recv_wq);
+ mutex_init(&dev->send_mutex);
+ mutex_init(&dev->recv_mutex);
+		// a designated queue for sending that allows only one active task
+ dev->send_wq = alloc_workqueue("dnbd%d-send",
+ WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI,
+ 1, dev->index);
+ dev->recv_wq = alloc_workqueue("dnbd%d-recv",
+ WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ 1, dev->index);
+ if (!dev->send_wq || !dev->recv_wq) {
+ ret = -ENOMEM;
+ goto error_dealloc;
+ }
+ }
+
+ set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000); // recv
+ dnbd3_set_primary_connection(dev, sock, addr, proto_version);
+ sock = NULL; // In case we ever goto error* after this point
+
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ if (init) {
+ dev->discover_count = 0;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_STARTUP;
+		// discovery and keepalive are not critical, use the power-efficient queue
+ queue_delayed_work(system_power_efficient_wq, &dev->discover_work,
+ dev->discover_interval * HZ);
+ queue_delayed_work(system_power_efficient_wq, &dev->keepalive_work,
+ KEEPALIVE_INTERVAL * HZ);
+		// but the receiver is performance critical AND runs indefinitely, so use
+		// the cpu-intensive queue, as jobs submitted there will not count towards
+ // the concurrency limit of per-cpu worker threads. It still feels a little
+ // dirty to avoid managing our own thread, but nbd does it too.
+ }
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+ return 0;
+
+error_dealloc:
+ if (init) {
+ // If anything fails during initialization, free resources again
+ dnbd3_release_resources(dev);
+ }
+error:
+ if (init)
+ dev->reported_size = 0;
+ if (sock)
+ sock_release(sock);
+ return ret < 0 ? ret : -EIO;
+}
+
+void dnbd3_net_work_init(dnbd3_device_t *dev)
+{
+ INIT_WORK(&dev->send_work, dnbd3_send_workfn);
+ INIT_WORK(&dev->recv_work, dnbd3_recv_workfn);
+ INIT_DELAYED_WORK(&dev->discover_work, dnbd3_discover_workfn);
+ INIT_DELAYED_WORK(&dev->keepalive_work, dnbd3_keepalive_workfn);
+}
+
+static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version)
+{
+ unsigned long irqflags;
+
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (addr->ss_family == 0 || dev->imgname == NULL || sock == NULL) {
+ dnbd3_dev_err_cur(dev, "connect: host, image name or sock not set\n");
+ return -EINVAL;
+ }
+
+ replace_main_socket(dev, sock, addr, protocol_version);
+ spin_lock_irqsave(&dev->blk_lock, irqflags);
+ dev->panic = false;
+ dev->panic_count = 0;
+ dev->discover_interval = TIMER_INTERVAL_PROBE_SWITCH;
+ queue_work(dev->recv_wq, &dev->recv_work);
+ spin_unlock_irqrestore(&dev->blk_lock, irqflags);
+
+ if (dev->use_server_provided_alts)
+ dnbd3_send_empty_request(dev, CMD_GET_SERVERS);
+
+ dnbd3_dev_info_cur(dev, "connection switched\n");
+ dnbd3_blk_requeue_all_requests(dev);
+ return 0;
+}
+
+/**
+ * Disconnect the device, shutting it down.
+ */
+int dnbd3_net_disconnect(dnbd3_device_t *dev)
+{
+ ASSERT(dnbd3_flag_taken(dev->connection_lock));
+ if (!device_active(dev))
+ return -ENOTCONN;
+ dev_dbg(dnbd3_device_to_dev(dev), "disconnecting device ...\n");
+
+ dev->reported_size = 0;
+ /* quickly fail all requests */
+ dnbd3_blk_fail_all_requests(dev);
+ replace_main_socket(dev, NULL, NULL, 0);
+
+ cancel_delayed_work_sync(&dev->keepalive_work);
+ cancel_delayed_work_sync(&dev->discover_work);
+ cancel_work_sync(&dev->send_work);
+ cancel_work_sync(&dev->recv_work);
+
+ dnbd3_blk_fail_all_requests(dev);
+ dnbd3_release_resources(dev);
+ dev_dbg(dnbd3_device_to_dev(dev), "all workers shut down\n");
+ return 0;
+}
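/*
 * Hypothetical caller sketch (not part of the patch): both entry points assert
 * that dev->connection_lock is held, so a caller elsewhere in the module would
 * look roughly like this when switching to an alternative server; taking and
 * releasing connection_lock is left out, as that code is not shown here.
 */
static int example_switch_server(dnbd3_device_t *dev, struct sockaddr_storage *addr)
{
	/* caller must already hold dev->connection_lock (see the ASSERTs above) */
	return dnbd3_new_connection(dev, addr, false /* switch, not initial setup */);
}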
diff --git a/src/kernel/net.h b/src/kernel/net.h
index a06a20c..69fa523 100644
--- a/src/kernel/net.h
+++ b/src/kernel/net.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,30 +22,12 @@
#ifndef NET_H_
#define NET_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
-#define init_msghdr(h) do { \
- h.msg_name = NULL; \
- h.msg_namelen = 0; \
- h.msg_control = NULL; \
- h.msg_controllen = 0; \
- h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
- } while (0)
+void dnbd3_net_work_init(dnbd3_device_t *dev);
-int dnbd3_net_connect(dnbd3_device_t *lo);
+int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init);
-int dnbd3_net_disconnect(dnbd3_device_t *lo);
-
-int dnbd3_net_send(void *data);
-
-int dnbd3_net_receive(void *data);
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0)
-void dnbd3_net_heartbeat(struct timer_list *arg);
-#else
-void dnbd3_net_heartbeat(unsigned long arg);
-#endif
-
-int dnbd3_net_discover(void *data);
+int dnbd3_net_disconnect(dnbd3_device_t *dev);
#endif /* NET_H_ */
diff --git a/src/kernel/serialize.c b/src/kernel/serialize.c
new file mode 120000
index 0000000..5a4e4ac
--- /dev/null
+++ b/src/kernel/serialize.c
@@ -0,0 +1 @@
+../shared/serialize.c \ No newline at end of file
diff --git a/src/kernel/serialize_kmod.c b/src/kernel/serialize_kmod.c
deleted file mode 100644
index 50746df..0000000
--- a/src/kernel/serialize_kmod.c
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-#define KERNEL_MODULE
-#include "serialize.c"
diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c
index 4406072..9deba96 100644
--- a/src/kernel/sysfs.c
+++ b/src/kernel/sysfs.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,156 +22,138 @@
#include <linux/kobject.h>
#include "sysfs.h"
-#include "utils.h"
#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
+/**
+ * Print currently connected server IP:PORT
+ */
ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev)
{
- if (dev->cur_server.host.type == HOST_IP4)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- else if (dev->cur_server.host.type == HOST_IP6)
- return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE);
- *buf = '\0';
- return 0;
-}
-
-ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev)
-{
- return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE);
-}
+ ssize_t ret;
-ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev)
-{
- int i, num = 0;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type) ++num;
- }
- return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE);
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%pISpc\n", &dev->cur_server.host), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * List alt servers. One line per server, format is:
+ * IP:PORT RTT consecutive_failures best_count
+ */
ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev)
{
- int i, size = PAGE_SIZE, ret;
- for (i = 0; i < NUMBER_SERVERS; ++i)
- {
- if (dev->alt_servers[i].host.type == HOST_IP4)
- ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else if (dev->alt_servers[i].host.type == HOST_IP6)
- ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n",
- dev->alt_servers[i].host.addr,
- (int)ntohs(dev->alt_servers[i].host.port),
- (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4),
- (int)dev->alt_servers[i].failures)
- , size);
- else
+ int i, size = PAGE_SIZE;
+ ssize_t ret;
+
+ if (mutex_lock_interruptible(&dev->alt_servers_lock) != 0)
+ return 0;
+
+ for (i = 0; i < NUMBER_SERVERS; ++i) {
+ if (dev->alt_servers[i].host.ss_family == 0)
continue;
+
+ ret = MIN(snprintf(buf, size, "%pISpc %llu %d %d\n", &dev->alt_servers[i].host,
+ (unsigned long long)((dev->alt_servers[i].rtts[0] +
+ dev->alt_servers[i].rtts[1] +
+ dev->alt_servers[i].rtts[2] +
+ dev->alt_servers[i].rtts[3]) / 4),
+ (int)dev->alt_servers[i].failures,
+ (int)dev->alt_servers[i].best_count),
+ size);
size -= ret;
buf += ret;
- if (size <= 0)
- {
+ if (size <= 0) {
size = 0;
break;
}
}
+ mutex_unlock(&dev->alt_servers_lock);
return PAGE_SIZE - size;
}
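/*
 * Illustrative output only (values made up): with the "%pISpc %llu %d %d"
 * format above, reading the alt_servers attribute yields one line per known
 * server, e.g.
 *
 *   192.168.100.1:5003 1250 0 17
 *   [2001:db8::1]:5003 2980 2 3
 *
 * i.e. address:port, averaged RTT, consecutive failures, best_count.
 */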
+/**
+ * Show name of image in use
+ */
ssize_t show_image_name(char *buf, dnbd3_device_t *dev)
{
- if (dev->imgname == NULL) return sprintf(buf, "(null)");
- return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ ssize_t ret;
+
+ spin_lock(&dev->blk_lock);
+ ret = MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE);
+ spin_unlock(&dev->blk_lock);
+ return ret;
}
+/**
+ * Show rid of image in use
+ */
ssize_t show_rid(char *buf, dnbd3_device_t *dev)
{
+ // No locking here, primitive type, no pointer to allocated memory
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE);
}
ssize_t show_update_available(char *buf, dnbd3_device_t *dev)
{
+ // Same story
return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE);
}
-device_attr_t cur_server_addr =
-{
- .attr = {.name = "cur_server_addr", .mode = 0444 },
- .show = show_cur_server_addr,
- .store = NULL,
-};
-
-device_attr_t cur_server_rtt =
-{
- .attr = {.name = "cur_server_rtt", .mode = 0444 },
- .show = show_cur_server_rtt,
- .store = NULL,
-};
-
-device_attr_t alt_server_num =
-{
- .attr = {.name = "alt_server_num", .mode = 0444 },
- .show = show_alt_server_num,
- .store = NULL,
+device_attr_t cur_server_addr = {
+ .attr = { .name = "cur_server_addr", .mode = 0444 },
+ .show = show_cur_server_addr,
+ .store = NULL,
};
-device_attr_t alt_servers =
-{
- .attr = {.name = "alt_servers", .mode = 0444 },
- .show = show_alt_servers,
- .store = NULL,
+device_attr_t alt_servers = {
+ .attr = { .name = "alt_servers", .mode = 0444 },
+ .show = show_alt_servers,
+ .store = NULL,
};
-device_attr_t image_name =
-{
- .attr = {.name = "image_name", .mode = 0444 },
- .show = show_image_name,
- .store = NULL,
+device_attr_t image_name = {
+ .attr = { .name = "image_name", .mode = 0444 },
+ .show = show_image_name,
+ .store = NULL,
};
-device_attr_t rid =
-{
- .attr = {.name = "rid", .mode = 0444 },
- .show = show_rid,
- .store = NULL,
+device_attr_t rid = {
+ .attr = { .name = "rid", .mode = 0444 },
+ .show = show_rid,
+ .store = NULL,
};
-device_attr_t update_available =
-{
- .attr = {.name = "update_available", .mode = 0444 },
- .show = show_update_available,
- .store = NULL,
+device_attr_t update_available = {
+ .attr = { .name = "update_available", .mode = 0444 },
+ .show = show_update_available,
+ .store = NULL,
};
ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
device_attr_t *device_attr = container_of(attr, device_attr_t, attr);
dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj);
+
return device_attr->show(buf, dev);
}
-struct attribute *device_attrs[] =
-{
+struct attribute *device_attrs[] = {
&cur_server_addr.attr,
- &cur_server_rtt.attr,
- &alt_server_num.attr,
&alt_servers.attr,
- &image_name.attr,
- &rid.attr,
+ &image_name.attr, &rid.attr,
&update_available.attr,
NULL,
};
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0)
+ATTRIBUTE_GROUPS(device);
+#endif
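/*
 * For reference (not part of the patch): ATTRIBUTE_GROUPS(device) from
 * <linux/sysfs.h> expands roughly to
 *
 *	static const struct attribute_group device_group = {
 *		.attrs = device_attrs,
 *	};
 *	static const struct attribute_group *device_groups[] = {
 *		&device_group,
 *		NULL,
 *	};
 *
 * providing the device_groups array used for .default_groups below on newer
 * kernels.
 */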
-struct sysfs_ops device_ops =
-{
+const struct sysfs_ops device_ops = {
.show = device_show,
};
@@ -179,14 +162,16 @@ void release(struct kobject *kobj)
kobj->state_initialized = 0;
}
-struct kobj_type device_ktype =
-{
+struct kobj_type device_ktype = {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0)
.default_attrs = device_attrs,
+#else
+ .default_groups = device_groups,
+#endif
.sysfs_ops = &device_ops,
.release = release,
};
-
void dnbd3_sysfs_init(dnbd3_device_t *dev)
{
int error;
@@ -196,7 +181,7 @@ void dnbd3_sysfs_init(dnbd3_device_t *dev)
error = kobject_init_and_add(kobj, ktype, parent, "%s", "net");
if (error)
- printk("Error initializing dnbd3 device!\n");
+ dev_err(dnbd3_device_to_dev(dev), "initializing sysfs for device failed!\n");
}
void dnbd3_sysfs_exit(dnbd3_device_t *dev)
diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h
index 0a747a5..1db4a07 100644
--- a/src/kernel/sysfs.h
+++ b/src/kernel/sysfs.h
@@ -1,9 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* This file is part of the Distributed Network Block Device 3
*
* Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
*
- * This file may be licensed under the terms of of the
+ * This file may be licensed under the terms of the
* GNU General Public License Version 2 (the ``GPL'').
*
* Software distributed under the License is distributed
@@ -21,25 +22,16 @@
#ifndef SYSFS_H_
#define SYSFS_H_
-#include "dnbd3.h"
+#include "dnbd3_main.h"
void dnbd3_sysfs_init(dnbd3_device_t *dev);
void dnbd3_sysfs_exit(dnbd3_device_t *dev);
-typedef struct
-{
+typedef struct {
struct attribute attr;
- ssize_t (*show)(char *, dnbd3_device_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_device_t *);
+ ssize_t (*show)(char *buf, dnbd3_device_t *dev);
+ ssize_t (*store)(const char *buf, size_t len, dnbd3_device_t *dev);
} device_attr_t;
-typedef struct
-{
- struct attribute attr;
- ssize_t (*show)(char *, dnbd3_server_t *);
- ssize_t (*store)(const char *, size_t, dnbd3_server_t *);
-} server_attr_t;
-
-
#endif /* SYSFS_H_ */
diff --git a/src/kernel/utils.c b/src/kernel/utils.c
deleted file mode 100644
index 902025f..0000000
--- a/src/kernel/utils.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#include <linux/kernel.h>
-
-#include "utils.h"
-
-unsigned int inet_addr(char *str)
-{
- int a, b, c, d;
- char arr[4];
- sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d);
- arr[0] = a;
- arr[1] = b;
- arr[2] = c;
- arr[3] = d;
- return *(unsigned int *) arr;
-}
-
-void inet_ntoa(struct in_addr addr, char *str)
-{
- unsigned char *ptr = (unsigned char *) &addr;
- sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff);
-}
diff --git a/src/kernel/utils.h b/src/kernel/utils.h
deleted file mode 100644
index e54b3cf..0000000
--- a/src/kernel/utils.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * This file is part of the Distributed Network Block Device 3
- *
- * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de>
- *
- * This file may be licensed under the terms of of the
- * GNU General Public License Version 2 (the ``GPL'').
- *
- * Software distributed under the License is distributed
- * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
- * express or implied. See the GPL for the specific language
- * governing rights and limitations.
- *
- * You should have received a copy of the GPL along with this
- * program. If not, go to http://www.gnu.org/licenses/gpl.html
- * or write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- */
-
-#ifndef UTILS_H_
-#define UTILS_H_
-
-#include <linux/in.h>
-
-unsigned int inet_addr(char *str);
-void inet_ntoa(struct in_addr addr, char *str);
-
-#endif /* UTILS_H_ */