diff options
Diffstat (limited to 'src/kernel')
-rw-r--r-- | src/kernel/.clang-format | 552 | ||||
-rw-r--r-- | src/kernel/CMakeLists.txt | 66 | ||||
-rw-r--r-- | src/kernel/Kbuild | 5 | ||||
-rw-r--r-- | src/kernel/blk.c | 740 | ||||
-rw-r--r-- | src/kernel/blk.h | 18 | ||||
-rw-r--r-- | src/kernel/core.c | 81 | ||||
-rw-r--r-- | src/kernel/dnbd3.h | 84 | ||||
-rw-r--r-- | src/kernel/dnbd3_main.c | 250 | ||||
-rw-r--r-- | src/kernel/dnbd3_main.h | 148 | ||||
-rw-r--r-- | src/kernel/net.c | 1929 | ||||
-rw-r--r-- | src/kernel/net.h | 29 | ||||
l--------- | src/kernel/serialize.c | 1 | ||||
-rw-r--r-- | src/kernel/serialize_kmod.c | 5 | ||||
-rw-r--r-- | src/kernel/sysfs.c | 177 | ||||
-rw-r--r-- | src/kernel/sysfs.h | 20 | ||||
-rw-r--r-- | src/kernel/utils.c | 41 | ||||
-rw-r--r-- | src/kernel/utils.h | 29 |
17 files changed, 2572 insertions, 1603 deletions
diff --git a/src/kernel/.clang-format b/src/kernel/.clang-format new file mode 100644 index 0000000..c1fe2c6 --- /dev/null +++ b/src/kernel/.clang-format @@ -0,0 +1,552 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 4. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +#AlignEscapedNewlines: Left # Unknown to clang-format-4.0 +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + #AfterExternBlock: false # Unknown to clang-format-5.0 + BeforeCatch: false + BeforeElse: false + IndentBraces: false + #SplitEmptyFunction: true # Unknown to clang-format-4.0 + #SplitEmptyRecord: true # Unknown to clang-format-4.0 + #SplitEmptyNamespace: true # Unknown to clang-format-4.0 +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +#BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 120 +CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false # Unknown to clang-format-4.0 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: false # Unknown to clang-format-4.0 + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | sort | uniq +ForEachMacros: + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - '__ata_qc_for_each' + - 'ata_qc_for_each' + - 'ata_qc_for_each_raw' + - 'ata_qc_for_each_with_internal' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - '__bio_for_each_bvec' + - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' + - 'bio_for_each_integrity_vec' + - '__bio_for_each_segment' + - 'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'bitmap_for_each_clear_region' + - 'bitmap_for_each_set_region' + - 'blkg_for_each_descendant_post' + - 'blkg_for_each_descendant_pre' + - 'blk_queue_for_each_rl' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'bpf_for_each_spilled_reg' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'cxl_for_each_cmd' + - 'device_for_each_child_node' + - 'dma_fence_chain_for_each' + - 'do_for_each_ftrace_op' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_atomic_for_each_plane_damage' + - 'drm_client_for_each_connector_iter' + - 'drm_client_for_each_modeset' + - 'drm_connector_for_each_possible_encoder' + - 'drm_for_each_bridge_in_chain' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_crtc_reverse' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_for_each_privobj' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'flow_action_for_each' + - 'for_each_active_dev_scope' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_aggr_pgid' + - 'for_each_available_child_of_node' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_bvec' + - 'for_each_card_auxs' + - 'for_each_card_auxs_safe' + - 'for_each_card_components' + - 'for_each_card_dapms' + - 'for_each_card_pre_auxs' + - 'for_each_card_prelinks' + - 'for_each_card_rtds' + - 'for_each_card_rtds_safe' + - 'for_each_card_widgets' + - 'for_each_card_widgets_safe' + - 'for_each_cgroup_storage_type' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 'for_each_cmsghdr' + - 'for_each_compatible_node' + - 'for_each_component_dais' + - 'for_each_component_dais_safe' + - 'for_each_comp_order' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dapm_widgets' + - 'for_each_dev_addr' + - 'for_each_dev_scope' + - 'for_each_displayid_db' + - 'for_each_dma_cap_mask' + - 'for_each_dpcm_be' + - 'for_each_dpcm_be_rollback' + - 'for_each_dpcm_be_safe' + - 'for_each_dpcm_fe' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_element' + - 'for_each_element_extid' + - 'for_each_element_id' + - 'for_each_endpoint_of_node' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_pfn_range_in_zone' + - 'for_each_free_mem_pfn_range_in_zone_from' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_link_codecs' + - 'for_each_link_cpus' + - 'for_each_link_platforms' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_member' + - 'for_each_memcg_cache_index' + - 'for_each_mem_pfn_range' + - '__for_each_mem_range' + - 'for_each_mem_range' + - '__for_each_mem_range_rev' + - 'for_each_mem_range_rev' + - 'for_each_mem_region' + - 'for_each_migratetype_order' + - 'for_each_msi_entry' + - 'for_each_msi_entry_safe' + - 'for_each_net' + - 'for_each_net_continue_reverse' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_continue_reverse' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_net_rcu' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_mst_mgr_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_private_obj_in_state' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_nonreserved_multicast_dest_pgid' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_cpu_node' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_old_mst_mgr_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_mst_mgr_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_plane_in_state_reverse' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pci_msi_entry' + - 'for_each_pcm_streams' + - 'for_each_physmem_range' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_property_of_node' + - 'for_each_registered_fb' + - 'for_each_requested_gpio' + - 'for_each_requested_gpio_in_range' + - 'for_each_reserved_mem_range' + - 'for_each_reserved_mem_region' + - 'for_each_rtd_codec_dais' + - 'for_each_rtd_components' + - 'for_each_rtd_cpu_dais' + - 'for_each_rtd_dais' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_set_clump8' + - 'for_each_sg' + - 'for_each_sg_dma_page' + - 'for_each_sg_page' + - 'for_each_sgtable_dma_page' + - 'for_each_sgtable_dma_sg' + - 'for_each_sgtable_page' + - 'for_each_sgtable_sg' + - 'for_each_sibling_event' + - 'for_each_subelement' + - 'for_each_subelement_extid' + - 'for_each_subelement_id' + - '__for_each_thread' + - 'for_each_thread' + - 'for_each_unicast_dest_pgid' + - 'for_each_vsi' + - 'for_each_wakeup_source' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'genradix_for_each' + - 'genradix_for_each_from' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hctx_for_each_ctx' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - 'hlist_for_each_entry_srcu' + - '__hlist_for_each_rcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'i3c_bus_for_each_i2cdev' + - 'i3c_bus_for_each_i3cdev' + - 'ide_host_for_each_port' + - 'ide_port_for_each_dev' + - 'ide_port_for_each_present_dev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_continue_ul' + - 'idr_for_each_entry_ul' + - 'in_dev_for_each_ifa_rcu' + - 'in_dev_for_each_ifa_rtnl' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk_rcu' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_func_safe' + - 'klp_for_each_func_static' + - 'klp_for_each_object' + - 'klp_for_each_object_safe' + - 'klp_for_each_object_static' + - 'kunit_suite_for_each_test_case' + - 'kvm_for_each_memslot' + - 'kvm_for_each_vcpu' + - 'list_for_each' + - 'list_for_each_codec' + - 'list_for_each_codec_safe' + - 'list_for_each_continue' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_rcu' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_entry_srcu' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'mci_for_each_dimm' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'nanddev_io_for_each_page' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'pcl_for_each_chunk' + - 'pcl_for_each_segment' + - 'pcm_for_each_format' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_block' + - 'rdma_for_each_port' + - 'rdma_umem_for_each_dma_block' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_entry' + - 'rht_for_each_entry_from' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_from' + - 'rht_for_each_entry_safe' + - 'rht_for_each_from' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_from' + - '__rq_for_each_bio' + - 'rq_for_each_bvec' + - 'rq_for_each_segment' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'shdma_for_each_chan' + - '__shost_for_each_device' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_array_for_each' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'tb_property_for_each' + - 'tcf_exts_for_each_action' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'virtio_device_for_each_vq' + - 'while_for_each_ftrace_op' + - 'xa_for_each' + - 'xa_for_each_marked' + - 'xa_for_each_range' + - 'xa_for_each_start' + - 'xas_for_each' + - 'xas_for_each_conflict' + - 'xas_for_each_marked' + - 'xbc_array_for_each_value' + - 'xbc_for_each_key_value' + - 'xbc_node_for_each_array_value' + - 'xbc_node_for_each_child' + - 'xbc_node_for_each_key_value' + - 'zorro_for_each_dev' + +#IncludeBlocks: Preserve # Unknown to clang-format-5.0 +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +#IndentPPDirectives: None # Unknown to clang-format-5.0 +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +#ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +#PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +#SortUsingDeclarations: false # Unknown to clang-format-4.0 +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +#SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 +#SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 +SpaceBeforeParens: ControlStatements +#SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... diff --git a/src/kernel/CMakeLists.txt b/src/kernel/CMakeLists.txt new file mode 100644 index 0000000..6bc61ff --- /dev/null +++ b/src/kernel/CMakeLists.txt @@ -0,0 +1,66 @@ +cmake_minimum_required(VERSION 3.10) + +# set the project name +project(dnbd3-kernel + LANGUAGES C) + +# include macros to define Linux kernel build targets +include(Kernel) + +# set C flags for a Linux kernel module +set(KERNEL_C_FLAGS "-DDNBD3_KERNEL_MODULE -I ${PROJECT_INCLUDE_GEN_DIR}" + CACHE STRING "C flags to be used for building the dnbd3 kernel module") +# set C flags for the debug mode of a Linux kernel module +set(KERNEL_C_FLAGS_DEBUG "-g -DDEBUG" + CACHE STRING "Additional C flags to be used for building the dnbd3 kernel module in debug mode") + +# append include directories to the C flags +get_property(KERNEL_INCLUDE_DIRS DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) +foreach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS}) + set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} -I ${KERNEL_INCLUDE_DIR}") +endforeach(KERNEL_INCLUDE_DIR ${KERNEL_INCLUDE_DIRS}) + +# append debug C flags if debug mode is enabled +if(CMAKE_BUILD_TYPE MATCHES Debug) + set(KERNEL_C_FLAGS "${KERNEL_C_FLAGS} ${KERNEL_C_FLAGS_DEBUG}") +endif(CMAKE_BUILD_TYPE MATCHES Debug) + +# dnbd3 Linux kernel module +set(KERNEL_MODULE_DNBD3_SOURCE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.c + ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.c + ${CMAKE_CURRENT_SOURCE_DIR}/net.c + ${CMAKE_CURRENT_SOURCE_DIR}/serialize.c + ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.c) +set(KERNEL_MODULE_DNBD3_HEADER_FILES ${CMAKE_CURRENT_SOURCE_DIR}/blk.h + ${CMAKE_CURRENT_SOURCE_DIR}/dnbd3_main.h + ${CMAKE_CURRENT_SOURCE_DIR}/net.h + ${CMAKE_CURRENT_SOURCE_DIR}/sysfs.h) + +add_kernel_module(dnbd3 "${KERNEL_BUILD_DIR}" + "${KERNEL_INSTALL_DIR}" + "CONFIG_BLK_DEV_DNBD3=m" + "${KERNEL_MODULE_DNBD3_SOURCE_FILES}" + "${KERNEL_MODULE_DNBD3_HEADER_FILES}" + ${CMAKE_CURRENT_SOURCE_DIR}/Kbuild) + +# add dependency to generate project version header before dnbd3.ko is built +add_dependencies(dnbd3 dnbd3-generate-version) + +set(CHECKPATCH_IGNORE_WARNINGS "NEW_TYPEDEFS" + "MSLEEP" + "CONSTANT_COMPARISON" + "DEEP_INDENTATION" + "PREFER_PR_LEVEL" + "LINUX_VERSION_CODE" + "JIFFIES_COMPARISON" + "KREALLOC_ARG_REUSE") + +add_kernel_linter(dnbd3-lint "${CHECKPATCH_IGNORE_WARNINGS}" + "${KERNEL_MODULE_DNBD3_SOURCE_FILES}" + "${KERNEL_MODULE_DNBD3_HEADER_FILES}") +add_kernel_linter_fix(dnbd3-lint-fix "${CHECKPATCH_IGNORE_WARNINGS}" + "${KERNEL_MODULE_DNBD3_SOURCE_FILES}" + "${KERNEL_MODULE_DNBD3_HEADER_FILES}") + +add_linter_fix(dnbd3-lint-fix-clang "${KERNEL_MODULE_DNBD3_SOURCE_FILES}" + "${KERNEL_MODULE_DNBD3_HEADER_FILES}") diff --git a/src/kernel/Kbuild b/src/kernel/Kbuild new file mode 100644 index 0000000..26afa98 --- /dev/null +++ b/src/kernel/Kbuild @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +# Linux kernel module dnbd3 +obj-$(CONFIG_BLK_DEV_DNBD3) := dnbd3.o +dnbd3-y += dnbd3_main.o blk.o net.o serialize.o sysfs.o diff --git a/src/kernel/blk.c b/src/kernel/blk.c index 889b988..69e4583 100644 --- a/src/kernel/blk.c +++ b/src/kernel/blk.c @@ -1,9 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -18,248 +19,259 @@ * */ -#include "clientconfig.h" +#include <dnbd3/config/client.h> #include "blk.h" #include "net.h" #include "sysfs.h" +#include "dnbd3_main.h" #include <linux/pagemap.h> -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -#define dnbd3_req_read(req) \ - req_op(req) == REQ_OP_READ -#define dnbd3_req_fs(req) \ - dnbd3_req_read(req) || req_op(req) == REQ_OP_WRITE -#define dnbd3_req_special(req) \ - blk_rq_is_private(req) -#else -#define dnbd3_req_read(req) \ - rq_data_dir(req) == READ -#define dnbd3_req_fs(req) \ - req->cmd_type == REQ_TYPE_FS -#define dnbd3_req_special(req) \ - req->cmd_type == REQ_TYPE_SPECIAL -#endif - -int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor) +static int dnbd3_close_device(dnbd3_device_t *dev) { - struct gendisk *disk; - struct request_queue *blk_queue; - - init_waitqueue_head(&dev->process_queue_send); - init_waitqueue_head(&dev->process_queue_receive); - init_waitqueue_head(&dev->process_queue_discover); - INIT_LIST_HEAD(&dev->request_queue_send); - INIT_LIST_HEAD(&dev->request_queue_receive); + int result; - memset(&dev->cur_server, 0, sizeof(dev->cur_server)); - memset(&dev->initial_server, 0, sizeof(dev->initial_server)); - dev->better_sock = NULL; + if (dev->imgname) + dev_info(dnbd3_device_to_dev(dev), "closing down device.\n"); + dev->panic = false; + result = dnbd3_net_disconnect(dev); + kfree(dev->imgname); dev->imgname = NULL; - dev->rid = 0; - dev->update_available = 0; - memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS); - dev->thread_send = NULL; - dev->thread_receive = NULL; - dev->thread_discover = NULL; - dev->discover = 0; - dev->disconnecting = 0; - dev->panic = 0; - dev->panic_count = 0; - dev->reported_size = 0; - - if (!(disk = alloc_disk(1))) - { - printk("ERROR: dnbd3 alloc_disk failed.\n"); - return -EIO; - } - - disk->major = major; - disk->first_minor = minor; - sprintf(disk->disk_name, "dnbd%d", minor); - set_capacity(disk, 0); - set_disk_ro(disk, 1); - disk->fops = &dnbd3_blk_ops; - - spin_lock_init(&dev->blk_lock); - if ((blk_queue = blk_init_queue(&dnbd3_blk_request, &dev->blk_lock)) == NULL) - { - printk("ERROR: dnbd3 blk_init_queue failed.\n"); - return -EIO; - } - - blk_queue_logical_block_size(blk_queue, DNBD3_BLOCK_SIZE); - blk_queue_physical_block_size(blk_queue, DNBD3_BLOCK_SIZE); - - disk->queue = blk_queue; - disk->private_data = dev; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) - blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); -#else - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); -#endif -#define ONE_MEG (1048576) - blk_queue_max_segment_size(disk->queue, ONE_MEG); - blk_queue_max_segments(disk->queue, 0xffff); - blk_queue_max_hw_sectors(disk->queue, ONE_MEG / DNBD3_BLOCK_SIZE); - disk->queue->limits.max_sectors = 256; - dev->disk = disk; -#undef ONE_MEG - add_disk(disk); - dnbd3_sysfs_init(dev); - return 0; + /* new requests might have been queued up, */ + /* but now that imgname is NULL no new ones can show up */ + blk_mq_freeze_queue(dev->queue); + set_capacity(dev->disk, 0); + blk_mq_unfreeze_queue(dev->queue); + return result; } -int dnbd3_blk_del_device(dnbd3_device_t *dev) -{ - dnbd3_sysfs_exit(dev); - dnbd3_net_disconnect(dev); - del_gendisk(dev->disk); - put_disk(dev->disk); - blk_cleanup_queue(dev->disk->queue); - return 0; -} - -struct block_device_operations dnbd3_blk_ops = - { .owner = THIS_MODULE, .ioctl = dnbd3_blk_ioctl, }; - -int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { int result = -100; dnbd3_device_t *dev = bdev->bd_disk->private_data; +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0) struct request_queue *blk_queue = dev->disk->queue; +#endif char *imgname = NULL; dnbd3_ioctl_t *msg = NULL; - //unsigned long irqflags; + int i = 0, j; + u8 locked = 0; - while (dev->disconnecting) - { - // do nothing - } - - if (arg != 0) - { + if (arg != 0) { msg = kmalloc(sizeof(*msg), GFP_KERNEL); - if (msg == NULL) return -ENOMEM; - if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) - { + if (msg == NULL) + return -ENOMEM; + if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) { result = -ENOEXEC; goto cleanup_return; } - if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) - { + if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) { result = -ENOENT; goto cleanup_return; } - if (msg->imgname != NULL && msg->imgnamelen > 0) - { + if (msg->imgname != NULL && msg->imgnamelen > 0) { imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL); - if (imgname == NULL) - { + if (imgname == NULL) { result = -ENOMEM; goto cleanup_return; } - if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) - { + if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) { result = -ENOENT; goto cleanup_return; } imgname[msg->imgnamelen] = '\0'; - //printk("IOCTL Image name of len %d is %s\n", (int)msg->imgnamelen, imgname); } } - - switch (cmd) - { + switch (cmd) { case IOCTL_OPEN: - if (dev->imgname != NULL) - { + if (!dnbd3_flag_get(dev->connection_lock)) { result = -EBUSY; + break; } - else if (imgname == NULL) - { + locked = 1; + if (dev->imgname != NULL) { + result = -EBUSY; + } else if (imgname == NULL) { result = -EINVAL; - } - else if (msg == NULL) - { + } else if (msg == NULL) { result = -EINVAL; - } - else - { - if (sizeof(msg->host) != sizeof(dev->cur_server.host)) - printk("Odd size bug#1 triggered in IOCTL\n"); - memcpy(&dev->cur_server.host, &msg->host, sizeof(msg->host)); - dev->cur_server.failures = 0; - memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server)); + } else { + /* assert that at least one and not to many hosts are given */ + if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) { + result = -EINVAL; + break; + } + dev->imgname = imgname; dev->rid = msg->rid; dev->use_server_provided_alts = msg->use_server_provided_alts; - // Forget all alt servers on explicit connect, set first al server to initial server - memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS); - memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0])); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) - if (blk_queue->backing_dev_info != NULL) { + + dev_info(dnbd3_device_to_dev(dev), "opening device.\n"); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0) + // set optimal request size for the queue to half the read-ahead + blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512)); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \ + && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + // set readahead from optimal request size of the queue + // ra_pages are calculated by following formula: queue_io_opt() * 2 / PAGE_SIZE + blk_queue_update_readahead(dev->queue); +#endif +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) + if (blk_queue->backing_dev_info != NULL) blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE; - } #else blk_queue->backing_dev_info.ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE; #endif - if (dnbd3_net_connect(dev) == 0) - { - result = 0; - imgname = NULL; // Prevent kfree at the end + + /* add specified servers to alt server list */ + for (i = 0; i < NUMBER_SERVERS; i++) + dev->alt_servers[i].host.ss_family = 0; + for (i = 0; i < msg->hosts_num; i++) { + /* copy provided host into corresponding alt server slot */ + if (dnbd3_add_server(dev, &msg->hosts[i]) == 0) + dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n", + &dev->alt_servers[i].host); + else + dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n", + &dev->alt_servers[i].host); } - else - { - result = -ENOENT; + + /* + * probe added alt servers in specified order and + * choose first working server as initial server + */ + result = -EPROTONOSUPPORT; + for (i = 0; i < NUMBER_SERVERS; i++) { + /* probe added alt server */ + if (dev->alt_servers[i].host.ss_family == 0) + continue; // Empty slot + + result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true); + if (result == 0) { + /* connection established, store index of server and exit loop */ + result = i; + break; + } + } + + if (result >= 0) { + /* connection was successful */ + dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n", + &dev->cur_server.host); + imgname = NULL; // Prevent kfree at the end + } else { + /* probing failed */ dev->imgname = NULL; } } break; case IOCTL_CLOSE: - dnbd3_blk_fail_all_requests(dev); - result = dnbd3_net_disconnect(dev); - dnbd3_blk_fail_all_requests(dev); - set_capacity(dev->disk, 0); - if (dev->imgname) - { - kfree(dev->imgname); - dev->imgname = NULL; + if (!dnbd3_flag_get(dev->connection_lock)) { + result = -EBUSY; + break; } + locked = 1; + result = dnbd3_close_device(dev); break; case IOCTL_SWITCH: - result = -EINVAL; + if (!dnbd3_flag_get(dev->connection_lock)) { + result = -EBUSY; + break; + } + locked = 1; + if (dev->imgname == NULL) { + result = -ENOTCONN; + } else if (msg == NULL) { + result = -EINVAL; + } else { + dnbd3_alt_server_t *alt_server; + struct sockaddr_storage new_addr; + + mutex_lock(&dev->alt_servers_lock); + alt_server = get_existing_alt_from_host(&msg->hosts[0], dev); + if (alt_server == NULL) { + mutex_unlock(&dev->alt_servers_lock); + /* specified server is not known, so do not switch */ + result = -ENOENT; + } else { + /* specified server is known, so try to switch to it */ + new_addr = alt_server->host; + mutex_unlock(&dev->alt_servers_lock); + if (is_same_server(&dev->cur_server.host, &new_addr)) { + /* specified server is current server, so do not switch */ + result = 0; + } else { + dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n", + &new_addr); + result = dnbd3_new_connection(dev, &new_addr, false); + if (result != 0) { + /* switching didn't work */ + result = -EAGAIN; + } + } + if (result == 0) { + /* fake RTT so we don't switch away again soon */ + mutex_lock(&dev->alt_servers_lock); + for (i = 0; i < NUMBER_SERVERS; ++i) { + alt_server = &dev->alt_servers[i]; + if (is_same_server(&alt_server->host, &new_addr)) { + for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j) + alt_server->rtts[j] = 1; + alt_server->best_count = 100; + } else { + for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j) + if (alt_server->rtts[j] < 500000) + alt_server->rtts[j] = 500000; + alt_server->best_count = 0; + } + } + mutex_unlock(&dev->alt_servers_lock); + } + } + } break; case IOCTL_ADD_SRV: - case IOCTL_REM_SRV: - if (dev->imgname == NULL) - { - result = -ENOENT; + case IOCTL_REM_SRV: { + struct sockaddr_storage addr; + dnbd3_host_t *host; + + if (dev->imgname == NULL) { + result = -ENOTCONN; + break; } - else if (dev->new_servers_num >= NUMBER_SERVERS) - { - result = -EAGAIN; + if (msg == NULL) { + result = -EINVAL; + break; } - else if (msg == NULL) - { + host = &msg->hosts[0]; + if (!dnbd3_host_to_sockaddr(host, &addr)) { result = -EINVAL; + break; } - else - { - memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host)); - dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM - ++dev->new_servers_num; - result = 0; + + if (cmd == IOCTL_ADD_SRV) { + result = dnbd3_add_server(dev, host); + if (result == -EEXIST) + dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr); + else if (result == -ENOSPC) + dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr); + else + dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr); + } else { // IOCTL_REM_SRV + result = dnbd3_rem_server(dev, host); + if (result == -ENOENT) + dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr); + else + dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr); } break; - + } case BLKFLSBUF: result = 0; break; @@ -270,113 +282,325 @@ int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u } cleanup_return: - if (msg) kfree(msg); - if (imgname) kfree(imgname); + kfree(msg); + kfree(imgname); + if (locked) + dnbd3_flag_reset(dev->connection_lock); return result; } -/** - * dev->blk_lock and q->queue_lock are being held - * when this is called! +static const struct block_device_operations dnbd3_blk_ops = { + .owner = THIS_MODULE, + .ioctl = dnbd3_blk_ioctl, +}; + +static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq) +{ + unsigned long irqflags; + + spin_lock_irqsave(&dev->send_queue_lock, irqflags); + list_add_tail(&rq->queuelist, &dev->send_queue); + spin_unlock_irqrestore(&dev->send_queue_lock, irqflags); + spin_lock_irqsave(&dev->blk_lock, irqflags); + queue_work(dev->send_wq, &dev->send_work); + spin_unlock_irqrestore(&dev->blk_lock, irqflags); +} + +/* + * Linux kernel blk-mq driver function (entry point) to handle block IO requests */ -void dnbd3_blk_request(struct request_queue *q) +static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct request *req; - dnbd3_device_t *dev; + struct request *rq = bd->rq; + dnbd3_device_t *dev = rq->q->queuedata; + struct dnbd3_cmd *cmd; - while ((req = blk_fetch_request(q)) != NULL) - { - dev = req->rq_disk->private_data; + if (dev->imgname == NULL || !device_active(dev)) + return BLK_STS_IOERR; - if (dev->imgname == NULL) - { - __blk_end_request_all(req, -EIO); - continue; - } + if (req_op(rq) != REQ_OP_READ) + return BLK_STS_IOERR; - if (!(dnbd3_req_fs(req))) - { - __blk_end_request_all(req, 0); - continue; - } + if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT) + return BLK_STS_TIMEOUT; - if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT) - { - __blk_end_request_all(req, -EIO); - continue; - } + if (rq_data_dir(rq) != READ) + return BLK_STS_NOTSUPP; - if (!(dnbd3_req_read(req))) - { - __blk_end_request_all(req, -EACCES); - continue; + cmd = blk_mq_rq_to_pdu(rq); + cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32); + blk_mq_start_request(rq); + dnbd3_add_queue(dev, rq); + return BLK_STS_OK; +} + +static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \ + && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + , bool reserved +#endif + ) +{ + unsigned long irqflags; + struct request *rq_iter; + bool found = false; + dnbd3_device_t *dev = req->q->queuedata; + + spin_lock_irqsave(&dev->send_queue_lock, irqflags); + list_for_each_entry(rq_iter, &dev->send_queue, queuelist) { + if (rq_iter == req) { + found = true; + break; + } + } + spin_unlock_irqrestore(&dev->send_queue_lock, irqflags); + // If still in send queue, do nothing + if (found) + return BLK_EH_RESET_TIMER; + + spin_lock_irqsave(&dev->recv_queue_lock, irqflags); + list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) { + if (rq_iter == req) { + found = true; + list_del_init(&req->queuelist); + break; } + } + spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags); + if (!found) { + dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n"); + // Assume it was fnished concurrently + return BLK_EH_DONE; + } + // Add to send queue again and trigger work, reset timeout + dnbd3_add_queue(dev, req); + return BLK_EH_RESET_TIMER; +} + +static +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) +const +#endif +struct blk_mq_ops dnbd3_mq_ops = { + .queue_rq = dnbd3_queue_rq, + .timeout = dnbd3_rq_timeout, +}; + +int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor) +{ + int ret; + + memset(dev, 0, sizeof(*dev)); + dev->index = minor; + // lock for imgname, cur_server etc. + spin_lock_init(&dev->blk_lock); + spin_lock_init(&dev->send_queue_lock); + spin_lock_init(&dev->recv_queue_lock); + INIT_LIST_HEAD(&dev->send_queue); + INIT_LIST_HEAD(&dev->recv_queue); + dnbd3_flag_reset(dev->connection_lock); + dnbd3_flag_reset(dev->discover_running); + mutex_init(&dev->alt_servers_lock); + dnbd3_net_work_init(dev); + + // memset has done this already but I like initial values to be explicit + dev->imgname = NULL; + dev->rid = 0; + dev->update_available = false; + dev->panic = false; + dev->panic_count = 0; + dev->reported_size = 0; + + // set up tag_set for blk-mq + dev->tag_set.ops = &dnbd3_mq_ops; + dev->tag_set.nr_hw_queues = 1; + dev->tag_set.queue_depth = 128; + dev->tag_set.numa_node = NUMA_NO_NODE; + dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd); + dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tag_set.driver_data = dev; + dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ; + + ret = blk_mq_alloc_tag_set(&dev->tag_set); + if (ret) { + dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_tag_set failed\n"); + goto out; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0) + // set up blk-mq and disk + dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev); + if (IS_ERR(dev->disk)) { + dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_disk failed\n"); + ret = PTR_ERR(dev->disk); + goto out_cleanup_tags; + } + dev->queue = dev->disk->queue; +#else + // set up blk-mq + dev->queue = blk_mq_init_queue(&dev->tag_set); + if (IS_ERR(dev->queue)) { + ret = PTR_ERR(dev->queue); + dev_err(dnbd3_device_to_dev(dev), "blk_mq_init_queue failed\n"); + goto out_cleanup_tags; + } + dev->queue->queuedata = dev; +#endif + + blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE); + blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue); +#else + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue); +#endif +#define ONE_MEG (1048576) + blk_queue_max_segment_size(dev->queue, ONE_MEG); + blk_queue_max_segments(dev->queue, 0xffff); + blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE); + dev->queue->limits.max_sectors = 256; +#undef ONE_MEG + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0) + // set up disk + dev->disk = alloc_disk(1); + if (!dev->disk) { + dev_err(dnbd3_device_to_dev(dev), "alloc_disk failed\n"); + ret = -ENOMEM; + goto out_cleanup_queue; + } +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \ + || (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \ + || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + dev->disk->flags |= GENHD_FL_NO_PART; +#else + dev->disk->flags |= GENHD_FL_NO_PART_SCAN; +#endif + dev->disk->major = major; + dev->disk->first_minor = minor; + dev->disk->minors = 1; + dev->disk->fops = &dnbd3_blk_ops; + dev->disk->private_data = dev; + dev->disk->queue = dev->queue; + sprintf(dev->disk->disk_name, "dnbd%d", minor); + set_capacity(dev->disk, 0); + set_disk_ro(dev->disk, 1); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \ + || RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + ret = add_disk(dev->disk); + if (ret != 0) + goto out_cleanup_queue; +#else + add_disk(dev->disk); +#endif + + // set up sysfs + dnbd3_sysfs_init(dev); + + return 0; + +out_cleanup_queue: +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0) + blk_cleanup_queue(dev->queue); +#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \ + && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + blk_cleanup_disk(dev->disk); +#else + put_disk(dev->disk); +#endif +out_cleanup_tags: + blk_mq_free_tag_set(&dev->tag_set); +out: + mutex_destroy(&dev->alt_servers_lock); + return ret; +} + +int dnbd3_blk_del_device(dnbd3_device_t *dev) +{ + while (!dnbd3_flag_get(dev->connection_lock)) + schedule(); + dnbd3_close_device(dev); + dnbd3_sysfs_exit(dev); + del_gendisk(dev->disk); +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0) + blk_cleanup_queue(dev->queue); + put_disk(dev->disk); +#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \ + && !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0)) + blk_cleanup_disk(dev->disk); +#else + put_disk(dev->disk); +#endif + blk_mq_free_tag_set(&dev->tag_set); + mutex_destroy(&dev->alt_servers_lock); + return 0; +} + +void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev) +{ + struct request *blk_request; + unsigned long flags; + struct list_head local_copy; + int count = 0; - list_add_tail(&req->queuelist, &dev->request_queue_send); - spin_unlock_irq(q->queue_lock); - wake_up(&dev->process_queue_send); - spin_lock_irq(q->queue_lock); + INIT_LIST_HEAD(&local_copy); + spin_lock_irqsave(&dev->recv_queue_lock, flags); + while (!list_empty(&dev->recv_queue)) { + blk_request = list_entry(dev->recv_queue.next, struct request, queuelist); + list_del_init(&blk_request->queuelist); + list_add(&blk_request->queuelist, &local_copy); + count++; + } + spin_unlock_irqrestore(&dev->recv_queue_lock, flags); + if (count) + dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count); + while (!list_empty(&local_copy)) { + blk_request = list_entry(local_copy.next, struct request, queuelist); + list_del_init(&blk_request->queuelist); + spin_lock_irqsave(&dev->send_queue_lock, flags); + list_add_tail(&blk_request->queuelist, &dev->send_queue); + spin_unlock_irqrestore(&dev->send_queue_lock, flags); } + // Do this even if we didn't move anything from the recv list to the send + // list. It might have already contained something, which needs to be + // re-requested anyways if this was called because of a server switch. + spin_lock_irqsave(&dev->blk_lock, flags); + queue_work(dev->send_wq, &dev->send_work); + spin_unlock_irqrestore(&dev->blk_lock, flags); } void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev) { - struct request *blk_request, *tmp_request; - struct request *blk_request2, *tmp_request2; + struct request *blk_request; unsigned long flags; struct list_head local_copy; - int dup; + int count = 0; + INIT_LIST_HEAD(&local_copy); - spin_lock_irqsave(&dev->blk_lock, flags); - while (!list_empty(&dev->request_queue_receive)) - { - list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist) - { - list_del_init(&blk_request->queuelist); - dup = 0; - list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist) - { - if (blk_request == blk_request2) - { - printk("WARNING: Request is in both lists!\n"); - dup = 1; - break; - } - } - if (!dup) list_add(&blk_request->queuelist, &local_copy); - } + spin_lock_irqsave(&dev->recv_queue_lock, flags); + while (!list_empty(&dev->recv_queue)) { + blk_request = list_entry(dev->recv_queue.next, struct request, queuelist); + list_del_init(&blk_request->queuelist); + list_add(&blk_request->queuelist, &local_copy); + count++; } - while (!list_empty(&dev->request_queue_send)) - { - list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_send, queuelist) - { - list_del_init(&blk_request->queuelist); - dup = 0; - list_for_each_entry_safe(blk_request2, tmp_request2, &local_copy, queuelist) - { - if (blk_request == blk_request2) - { - printk("WARNING: Request is in both lists!\n"); - dup = 1; - break; - } - } - if (!dup) list_add(&blk_request->queuelist, &local_copy); - } + spin_unlock_irqrestore(&dev->recv_queue_lock, flags); + spin_lock_irqsave(&dev->send_queue_lock, flags); + while (!list_empty(&dev->send_queue)) { + blk_request = list_entry(dev->send_queue.next, struct request, queuelist); + list_del_init(&blk_request->queuelist); + list_add(&blk_request->queuelist, &local_copy); + count++; } - spin_unlock_irqrestore(&dev->blk_lock, flags); - list_for_each_entry_safe(blk_request, tmp_request, &local_copy, queuelist) - { + spin_unlock_irqrestore(&dev->send_queue_lock, flags); + if (count) + dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count); + while (!list_empty(&local_copy)) { + blk_request = list_entry(local_copy.next, struct request, queuelist); list_del_init(&blk_request->queuelist); - if (dnbd3_req_fs(blk_request)) - { - spin_lock_irqsave(&dev->blk_lock, flags); - __blk_end_request_all(blk_request, -EIO); - spin_unlock_irqrestore(&dev->blk_lock, flags); - } - else if (dnbd3_req_special(blk_request)) - { - kfree(blk_request); - } + blk_mq_end_request(blk_request, BLK_STS_IOERR); } } diff --git a/src/kernel/blk.h b/src/kernel/blk.h index 5091d19..c6dcb8d 100644 --- a/src/kernel/blk.h +++ b/src/kernel/blk.h @@ -1,9 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -21,22 +22,17 @@ #ifndef BLK_H_ #define BLK_H_ -#include "dnbd3.h" +#include "dnbd3_main.h" -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) -#define REQ_TYPE_SPECIAL REQ_TYPE_DRV_PRIV -#endif - -extern struct block_device_operations dnbd3_blk_ops; - -int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); - -void dnbd3_blk_request(struct request_queue *q); +// The device has been set up via IOCTL_OPEN and hasn't been closed yet +#define device_active(dev) ((dev)->reported_size != 0) int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor); int dnbd3_blk_del_device(dnbd3_device_t *dev); +void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev); + void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev); #endif /* BLK_H_ */ diff --git a/src/kernel/core.c b/src/kernel/core.c deleted file mode 100644 index 69a2540..0000000 --- a/src/kernel/core.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * This file is part of the Distributed Network Block Device 3 - * - * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> - * - * This file may be licensed under the terms of of the - * GNU General Public License Version 2 (the ``GPL''). - * - * Software distributed under the License is distributed - * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either - * express or implied. See the GPL for the specific language - * governing rights and limitations. - * - * You should have received a copy of the GPL along with this - * program. If not, go to http://www.gnu.org/licenses/gpl.html - * or write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - */ - -#include "clientconfig.h" -#include "dnbd3.h" -#include "blk.h" - -int major; -static unsigned int max_devs = NUMBER_DEVICES; -static dnbd3_device_t *dnbd3_device; - -static int __init dnbd3_init(void) -{ - int i; - - dnbd3_device = kcalloc(max_devs, sizeof(*dnbd3_device), GFP_KERNEL); - if (!dnbd3_device) - return -ENOMEM; - - // initialize block device - if ((major = register_blkdev(0, "dnbd3")) == 0) - { - printk("ERROR: dnbd3 register_blkdev failed.\n"); - return -EIO; - } - - printk("DNBD3 kernel module loaded. Machine type: " ENDIAN_MODE "\n"); - - // add MAX_NUMBER_DEVICES devices - for (i = 0; i < max_devs; i++) - { - if (dnbd3_blk_add_device(&dnbd3_device[i], i) != 0) - { - printk("ERROR: adding device failed.\n"); - return -EIO; // TODO: delete all devices added so far. it could happen that it's not the first one that fails. also call unregister_blkdev and free memory - } - } - - printk("INFO: dnbd3 init successful (%i devices).\n", max_devs); - return 0; -} - -static void __exit dnbd3_exit(void) -{ - int i; - - for (i = 0; i < max_devs; i++) - { - dnbd3_blk_del_device(&dnbd3_device[i]); - } - - unregister_blkdev(major, "dnbd3"); - kfree(dnbd3_device); - printk("INFO: dnbd3 exit.\n"); -} - -module_init( dnbd3_init); -module_exit( dnbd3_exit); - -MODULE_DESCRIPTION("Distributed Network Block Device 3"); -MODULE_LICENSE("GPL"); - -module_param(max_devs, int, 0444); -MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)"); diff --git a/src/kernel/dnbd3.h b/src/kernel/dnbd3.h deleted file mode 100644 index f8af69f..0000000 --- a/src/kernel/dnbd3.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * This file is part of the Distributed Network Block Device 3 - * - * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> - * - * This file may be licensed under the terms of of the - * GNU General Public License Version 2 (the ``GPL''). - * - * Software distributed under the License is distributed - * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either - * express or implied. See the GPL for the specific language - * governing rights and limitations. - * - * You should have received a copy of the GPL along with this - * program. If not, go to http://www.gnu.org/licenses/gpl.html - * or write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - */ - -#ifndef DNBD_H_ -#define DNBD_H_ - -#include <linux/version.h> -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/blkdev.h> -#include <net/sock.h> - -#define KERNEL_MODULE -#include "config.h" -#include "types.h" -#include "serialize.h" - -extern int major; - -typedef struct -{ - dnbd3_host_t host; - unsigned long rtts[4]; // Last four round trip time measurements in µs - uint16_t protocol_version; // dnbd3 protocol version of this server - uint8_t failures; // How many times the server was unreachable -} dnbd3_server_t; - -typedef struct -{ - // block - struct gendisk *disk; - spinlock_t blk_lock; - - // sysfs - struct kobject kobj; - - // network - char *imgname; - struct socket *sock; - dnbd3_server_t cur_server, initial_server; - unsigned long cur_rtt; - serialized_buffer_t payload_buffer; - dnbd3_server_t alt_servers[NUMBER_SERVERS]; // array of alt servers - int new_servers_num; // number of new alt servers that are waiting to be copied to above array - dnbd3_server_entry_t new_servers[NUMBER_SERVERS]; // pending new alt servers - uint8_t discover, panic, disconnecting, update_available, panic_count; - uint8_t use_server_provided_alts; - uint16_t rid; - uint32_t heartbeat_count; - uint64_t reported_size; - // server switch - struct socket *better_sock; - - // process - struct task_struct * thread_send; - struct task_struct * thread_receive; - struct task_struct *thread_discover; - struct timer_list hb_timer; - wait_queue_head_t process_queue_send; - wait_queue_head_t process_queue_receive; - wait_queue_head_t process_queue_discover; - struct list_head request_queue_send; - struct list_head request_queue_receive; - -} dnbd3_device_t; - -#endif /* DNBD_H_ */ diff --git a/src/kernel/dnbd3_main.c b/src/kernel/dnbd3_main.c new file mode 100644 index 0000000..cb42567 --- /dev/null +++ b/src/kernel/dnbd3_main.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> + * + * This file may be licensed under the terms of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <dnbd3/config/client.h> +#include <dnbd3/version.h> +#include <net/ipv6.h> +#include "dnbd3_main.h" +#include "blk.h" + +int major; +static unsigned int max_devs = NUMBER_DEVICES; +static dnbd3_device_t *dnbd3_devices; + +struct device *dnbd3_device_to_dev(dnbd3_device_t *dev) +{ + return disk_to_dev(dev->disk); +} + +int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest) +{ + struct sockaddr_in *sin4; + struct sockaddr_in6 *sin6; + + memset(dest, 0, sizeof(*dest)); + if (host->type == HOST_IP4) { + sin4 = (struct sockaddr_in *)dest; + sin4->sin_family = AF_INET; + memcpy(&(sin4->sin_addr), host->addr, 4); + sin4->sin_port = host->port; + } else if (host->type == HOST_IP6) { + sin6 = (struct sockaddr_in6 *)dest; + sin6->sin6_family = AF_INET6; + memcpy(&(sin6->sin6_addr), host->addr, 16); + sin6->sin6_port = host->port; + } else + return 0; + return 1; +} + +int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y) +{ + if (x->ss_family != y->ss_family) + return 0; + switch (x->ss_family) { + case AF_INET: { + const struct sockaddr_in *sinx = (const struct sockaddr_in *)x; + const struct sockaddr_in *siny = (const struct sockaddr_in *)y; + + if (sinx->sin_port != siny->sin_port) + return 0; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + break; + } + case AF_INET6: { + const struct sockaddr_in6 *sinx = (const struct sockaddr_in6 *)x; + const struct sockaddr_in6 *siny = (const struct sockaddr_in6 *)y; + + if (sinx->sin6_port != siny->sin6_port) + return 0; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + break; + } + default: + return 0; + } + return 1; +} + +/** + * Get a free slot pointer from the alt_servers list. Tries to find an + * entirely empty slot first, then looks for a slot with a server that + * wasn't reachable recently, finally returns NULL if none of the + * conditions match. + * The caller has to hold dev->alt_servers_lock. + */ +static dnbd3_alt_server_t *get_free_alt_server(dnbd3_device_t *const dev) +{ + int i; + + for (i = 0; i < NUMBER_SERVERS; ++i) { + if (dev->alt_servers[i].host.ss_family == 0) + return &dev->alt_servers[i]; + } + for (i = 0; i < NUMBER_SERVERS; ++i) { + if (dev->alt_servers[i].failures > 10) + return &dev->alt_servers[i]; + } + return NULL; +} + +dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr, + dnbd3_device_t *const dev) +{ + int i; + + for (i = 0; i < NUMBER_SERVERS; ++i) { + if (is_same_server(addr, &dev->alt_servers[i].host)) + return &dev->alt_servers[i]; + } + return NULL; +} + +/** + * Returns pointer to existing entry in alt_servers that matches the given + * alt server, or NULL if not found. + * The caller has to hold dev->alt_servers_lock. + */ +dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev) +{ + struct sockaddr_storage addr; + + if (!dnbd3_host_to_sockaddr(host, &addr)) + return NULL; + return get_existing_alt_from_addr(&addr, dev); +} + +int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host) +{ + int result; + dnbd3_alt_server_t *alt_server; + + if (host->type != HOST_IP4 && host->type != HOST_IP6) + return -EINVAL; + + /* protect access to 'alt_servers' */ + mutex_lock(&dev->alt_servers_lock); + alt_server = get_existing_alt_from_host(host, dev); + // ADD + if (alt_server != NULL) { + // Exists + result = -EEXIST; + } else { + // OK add + alt_server = get_free_alt_server(dev); + if (alt_server == NULL) { + result = -ENOSPC; + } else { + dnbd3_host_to_sockaddr(host, &alt_server->host); + alt_server->protocol_version = 0; + alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] + = alt_server->rtts[3] = RTT_UNREACHABLE; + alt_server->failures = 0; + alt_server->best_count = 0; + result = 0; + } + } + mutex_unlock(&dev->alt_servers_lock); + return result; +} + +int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host) +{ + dnbd3_alt_server_t *alt_server; + int result; + + /* protect access to 'alt_servers' */ + mutex_lock(&dev->alt_servers_lock); + alt_server = get_existing_alt_from_host(host, dev); + // REMOVE + if (alt_server == NULL) { + // Not found + result = -ENOENT; + } else { + // Remove + alt_server->host.ss_family = 0; + result = 0; + } + mutex_unlock(&dev->alt_servers_lock); + return result; +} + +static int __init dnbd3_init(void) +{ + int i; + + dnbd3_devices = kcalloc(max_devs, sizeof(*dnbd3_devices), GFP_KERNEL); + if (!dnbd3_devices) + return -ENOMEM; + + // initialize block device + major = register_blkdev(0, "dnbd3"); + if (major == 0) { + pr_err("register_blkdev failed\n"); + return -EIO; + } + + pr_info("kernel module in version %s loaded\n", DNBD3_VERSION); + pr_debug("machine type %s\n", DNBD3_ENDIAN_MODE); + + // add MAX_NUMBER_DEVICES devices + for (i = 0; i < max_devs; i++) { + if (dnbd3_blk_add_device(&dnbd3_devices[i], i) != 0) { + pr_err("dnbd3_blk_add_device failed\n"); + // TODO: delete all devices added so far. + // It could happen that it's not the first one that fails. + // Also call unregister_blkdev and free memory. + return -EIO; + } + } + + pr_info("init successful (%i devices)\n", max_devs); + + return 0; +} + +static void __exit dnbd3_exit(void) +{ + int i; + + pr_debug("exiting kernel module...\n"); + for (i = 0; i < max_devs; i++) + dnbd3_blk_del_device(&dnbd3_devices[i]); + + unregister_blkdev(major, "dnbd3"); + kfree(dnbd3_devices); + + pr_info("exit kernel module done\n"); +} + +module_init(dnbd3_init); +module_exit(dnbd3_exit); + +MODULE_DESCRIPTION("Distributed Network Block Device 3"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DNBD3_VERSION); + +module_param(max_devs, int, 0444); +MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)"); diff --git a/src/kernel/dnbd3_main.h b/src/kernel/dnbd3_main.h new file mode 100644 index 0000000..a932ba2 --- /dev/null +++ b/src/kernel/dnbd3_main.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file is part of the Distributed Network Block Device 3 + * + * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> + * + * This file may be licensed under the terms of the + * GNU General Public License Version 2 (the ``GPL''). + * + * Software distributed under the License is distributed + * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either + * express or implied. See the GPL for the specific language + * governing rights and limitations. + * + * You should have received a copy of the GPL along with this + * program. If not, go to http://www.gnu.org/licenses/gpl.html + * or write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + */ + +#ifndef DNBD_H_ +#define DNBD_H_ + +#include <dnbd3/config/client.h> + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include <net/sock.h> + +#include <dnbd3/config.h> +#include <dnbd3/types.h> +#include <dnbd3/shared/serialize.h> + +#include <linux/blk-mq.h> + +#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION) +#define RHEL_CHECK_VERSION(CONDITION) (CONDITION) +#else +#define RHEL_CHECK_VERSION(CONDITION) (0) +#endif + +extern int major; + +typedef struct { + unsigned long rtts[DISCOVER_HISTORY_SIZE]; // Last X round trip time measurements in µs + uint16_t protocol_version; // dnbd3 protocol version of this server + uint8_t failures; // How many times the server was unreachable + uint8_t best_count; // Number of times server measured best + struct sockaddr_storage host; // Address of server +} dnbd3_alt_server_t; + +typedef struct { + // block + int index; + struct gendisk *disk; + struct blk_mq_tag_set tag_set; + struct request_queue *queue; + spinlock_t blk_lock; + + // sysfs + struct kobject kobj; + + char *imgname; + uint16_t rid; + struct socket *sock; + struct { // use blk_lock + unsigned long rtt; + struct sockaddr_storage host; + uint16_t protocol_version; + } cur_server; + serialized_buffer_t payload_buffer; + struct mutex alt_servers_lock; + dnbd3_alt_server_t alt_servers[NUMBER_SERVERS]; + bool use_server_provided_alts; + bool panic; + u8 panic_count; + bool update_available; + atomic_t connection_lock; + // Size if image/device - this is 0 if the device is not in use, + // otherwise this is also the value we expect from alt servers. + uint64_t reported_size; + struct delayed_work keepalive_work; + + // sending + struct workqueue_struct *send_wq; + spinlock_t send_queue_lock; + struct list_head send_queue; + struct mutex send_mutex; + struct work_struct send_work; + // receiving + struct workqueue_struct *recv_wq; + spinlock_t recv_queue_lock; + struct list_head recv_queue; + struct mutex recv_mutex; + struct work_struct recv_work; + // discover + atomic_t discover_running; + struct delayed_work discover_work; + u32 discover_interval; + u32 discover_count; + +} dnbd3_device_t; + +struct dnbd3_cmd { + u64 handle; +}; + +extern inline struct device *dnbd3_device_to_dev(dnbd3_device_t *dev); + +extern inline int is_same_server(const struct sockaddr_storage *const x, const struct sockaddr_storage *const y); + +extern int dnbd3_host_to_sockaddr(const dnbd3_host_t *host, struct sockaddr_storage *dest); + +extern dnbd3_alt_server_t *get_existing_alt_from_host(const dnbd3_host_t *const host, dnbd3_device_t *const dev); + +extern dnbd3_alt_server_t *get_existing_alt_from_addr(const struct sockaddr_storage *const addr, + dnbd3_device_t *const dev); + +extern int dnbd3_add_server(dnbd3_device_t *dev, dnbd3_host_t *host); + +extern int dnbd3_rem_server(dnbd3_device_t *dev, dnbd3_host_t *host); + +#define dnbd3_flag_get(x) (atomic_cmpxchg(&(x), 0, 1) == 0) +#define dnbd3_flag_reset(x) atomic_set(&(x), 0) +#define dnbd3_flag_taken(x) (atomic_read(&(x)) != 0) + +/* + * shims for making older kernels look like the current one, if possible, to avoid too + * much inline #ifdef which makes code harder to read. + */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) +#define BLK_EH_DONE BLK_EH_NOT_HANDLED +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) +#define blk_status_t int +#define BLK_STS_OK 0 +#define BLK_STS_IOERR (-EIO) +#define BLK_STS_TIMEOUT (-ETIME) +#define BLK_STS_NOTSUPP (-ENOTSUPP) +#endif + +#endif /* DNBD_H_ */ diff --git a/src/kernel/net.c b/src/kernel/net.c index 9e48b86..5ef4016 100644 --- a/src/kernel/net.c +++ b/src/kernel/net.c @@ -1,9 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -18,1106 +19,1112 @@ * */ -#include "clientconfig.h" +#include <dnbd3/config/client.h> #include "net.h" #include "blk.h" -#include "utils.h" +#include "dnbd3_main.h" -#include "serialize.h" +#include <dnbd3/shared/serialize.h> + +#include <linux/random.h> +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) +#define get_random_u32 prandom_u32 +#endif #include <linux/time.h> -#include <linux/signal.h> +#include <linux/ktime.h> +#include <linux/tcp.h> #ifndef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) -#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock) -#else -#define dnbd3_sock_create(af,type,proto,sock) sock_create_kern((af) == HOST_IP4 ? AF_INET : AF_INET6, type, proto, sock) +#ifndef ktime_to_s +#define ktime_to_s(kt) ktime_divns(kt, NSEC_PER_SEC) #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -// cmd_flags and cmd_type are merged into cmd_flags now -#if REQ_FLAG_BITS > 24 -#error "Fix CMD bitshift" -#endif -// Pack into cmd_flags field by shifting CMD_* into unused bits of cmd_flags -#define dnbd3_cmd_to_priv(req, cmd) (req)->cmd_flags = REQ_OP_DRV_IN | ((cmd) << REQ_FLAG_BITS) -#define dnbd3_priv_to_cmd(req) ((req)->cmd_flags >> REQ_FLAG_BITS) -#define dnbd3_req_op(req) req_op(req) -#define DNBD3_DEV_READ REQ_OP_READ -#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN +#ifdef DEBUG +#define ASSERT(x) \ + do { \ + if (!(x)) { \ + printk(KERN_EMERG "assertion failed %s: %d: %s\n", __FILE__, __LINE__, #x); \ + BUG(); \ + } \ + } while (0) #else -// Old way with type and flags separated -#define dnbd3_cmd_to_priv(req, cmd) do { \ - (req)->cmd_type = REQ_TYPE_SPECIAL; \ - (req)->cmd_flags = (cmd); \ -} while (0) -#define dnbd3_priv_to_cmd(req) (req)->cmd_flags -#define dnbd3_req_op(req) (req)->cmd_type -#define DNBD3_DEV_READ REQ_TYPE_FS -#define DNBD3_REQ_OP_SPECIAL REQ_TYPE_SPECIAL +#define ASSERT(x) \ + do { \ + } while (0) #endif -/** - * Some macros for easier debug output. Location in source-code - * as well as server IP:port info will be printed. - * The error_* macros include a "goto error;" at the end - */ -#if 1 // Change to 0 to disable debug messages -#define debug_print_va_host(_host, _fmt, ...) do { \ - if ((_host).type == HOST_IP4) \ - printk("%s:%d " _fmt " (%s, %pI4:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \ - else \ - printk("%s:%d " _fmt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, __VA_ARGS__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \ -} while(0) -#define debug_error_va_host(_host, _fmt, ...) do { \ - debug_print_va_host(_host, _fmt, __VA_ARGS__); \ - goto error; \ -} while(0) -#define debug_dev_va(_fmt, ...) debug_print_va_host(dev->cur_server.host, _fmt, __VA_ARGS__) -#define error_dev_va(_fmt, ...) debug_error_va_host(dev->cur_server.host, _fmt, __VA_ARGS__) -#define debug_alt_va(_fmt, ...) debug_print_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__) -#define error_alt_va(_fmt, ...) debug_error_va_host(dev->alt_servers[i].host, _fmt, __VA_ARGS__) - -#define debug_print_host(_host, txt) do { \ - if ((_host).type == HOST_IP4) \ - printk("%s:%d " txt " (%s, %pI4:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \ - else \ - printk("%s:%d " txt " (%s, [%pI6]:%d)\n", __FILE__, __LINE__, dev->disk->disk_name, (_host).addr, (int)ntohs((_host).port)); \ -} while(0) -#define debug_error_host(_host, txt) do { \ - debug_print_host(_host, txt); \ - goto error; \ -} while(0) -#define debug_dev(txt) debug_print_host(dev->cur_server.host, txt) -#define error_dev(txt) debug_error_host(dev->cur_server.host, txt) -#define debug_alt(txt) debug_print_host(dev->alt_servers[i].host, txt) -#define error_alt(txt) debug_error_host(dev->alt_servers[i].host, txt) - -#else // Silent -#define debug_dev(x) do { } while(0) -#define error_dev(x) goto error -#define debug_dev_va(x, ...) do { } while(0) -#define error_dev_va(x, ...) goto error -#define debug_alt(x) do { } while(0) -#define error_alt(x) goto error -#define debug_alt_va(x, ...) do { } while(0) -#define error_alt_va(x, ...) goto error -#endif +#define dnbd3_dev_dbg_host(dev, host, fmt, ...) \ + dev_dbg(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__) +#define dnbd3_dev_info_host(dev, host, fmt, ...) \ + dev_info(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__) +#define dnbd3_dev_err_host(dev, host, fmt, ...) \ + dev_err(dnbd3_device_to_dev(dev), "(%pISpc): " fmt, (host), ##__VA_ARGS__) -static inline int is_same_server(const dnbd3_server_t * const a, const dnbd3_server_t * const b) -{ - return (a->host.type == b->host.type) && (a->host.port == b->host.port) - && (0 == memcmp(a->host.addr, b->host.addr, (a->host.type == HOST_IP4 ? 4 : 16))); -} +#define dnbd3_dev_dbg_cur(dev, fmt, ...) \ + dnbd3_dev_dbg_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__) +#define dnbd3_dev_info_cur(dev, fmt, ...) \ + dnbd3_dev_info_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__) +#define dnbd3_dev_err_cur(dev, fmt, ...) \ + dnbd3_dev_err_host(dev, &(dev)->cur_server.host, fmt, ##__VA_ARGS__) -static inline dnbd3_server_t *get_existing_server(const dnbd3_server_entry_t * const newserver, - dnbd3_device_t * const dev) -{ - int i; - for (i = 0; i < NUMBER_SERVERS; ++i) - { - if ((newserver->host.type == dev->alt_servers[i].host.type) - && (newserver->host.port == dev->alt_servers[i].host.port) - && (0 - == memcmp(newserver->host.addr, dev->alt_servers[i].host.addr, (newserver->host.type == HOST_IP4 ? 4 : 16)))) - { - return &dev->alt_servers[i]; - break; - } - } - return NULL ; -} - -static inline dnbd3_server_t *get_free_alt_server(dnbd3_device_t * const dev) -{ - int i; - for (i = 0; i < NUMBER_SERVERS; ++i) - { - if (dev->alt_servers[i].host.type == 0) - return &dev->alt_servers[i]; - } - for (i = 0; i < NUMBER_SERVERS; ++i) - { - if (dev->alt_servers[i].failures > 10) - return &dev->alt_servers[i]; - } - return NULL ; -} +static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes); +static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count); +static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr); +static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size); -int dnbd3_net_connect(dnbd3_device_t *dev) -{ - struct request *req1 = NULL; - struct timeval timeout; +static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, + struct sockaddr_storage *addr, u16 protocol_version); - if (dev->disconnecting) { - debug_dev("CONNECT: Still disconnecting!!!\n"); - while (dev->disconnecting) - schedule(); - } - if (dev->thread_receive != NULL) { - debug_dev("CONNECT: Still receiving!!!\n"); - while (dev->thread_receive != NULL) - schedule(); - } - if (dev->thread_send != NULL) { - debug_dev("CONNECT: Still sending!!!\n"); - while (dev->thread_send != NULL) - schedule(); - } +static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, + struct socket **sock_out); - timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA; - timeout.tv_usec = 0; +static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock, + struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_image_info); - // do some checks before connecting +static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, + struct socket *sock); - req1 = kmalloc(sizeof(*req1), GFP_ATOMIC ); - if (!req1) - error_dev("FATAL: Kmalloc(1) failed."); +static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd); - if (dev->cur_server.host.port == 0 || dev->cur_server.host.type == 0 || dev->imgname == NULL ) - error_dev("FATAL: Host, port or image name not set."); - if (dev->sock) - error_dev("ERROR: Already connected."); - - if (dev->cur_server.host.type != HOST_IP4 && dev->cur_server.host.type != HOST_IP6) - error_dev_va("ERROR: Unknown address type %d", (int)dev->cur_server.host.type); - - debug_dev("INFO: Connecting..."); - - if (dev->better_sock == NULL ) - { - // no established connection yet from discovery thread, start new one - dnbd3_request_t dnbd3_request; - dnbd3_reply_t dnbd3_reply; - struct msghdr msg; - struct kvec iov[2]; - uint16_t rid; - char *name; - int mlen; - init_msghdr(msg); - - if (dnbd3_sock_create(dev->cur_server.host.type, SOCK_STREAM, IPPROTO_TCP, &dev->sock) < 0) - error_dev("ERROR: Couldn't create socket (v6)."); - - kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); - kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); - dev->sock->sk->sk_allocation = GFP_NOIO; - if (dev->cur_server.host.type == HOST_IP4) - { - struct sockaddr_in sin; - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - memcpy(&(sin.sin_addr), dev->cur_server.host.addr, 4); - sin.sin_port = dev->cur_server.host.port; - if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0) - error_dev("FATAL: Connection to host failed. (v4)"); - } - else - { - struct sockaddr_in6 sin; - memset(&sin, 0, sizeof(sin)); - sin.sin6_family = AF_INET6; - memcpy(&(sin.sin6_addr), dev->cur_server.host.addr, 16); - sin.sin6_port = dev->cur_server.host.port; - if (kernel_connect(dev->sock, (struct sockaddr *)&sin, sizeof(sin), 0) != 0) - error_dev("FATAL: Connection to host failed. (v6)"); - } - // Request filesize - dnbd3_request.magic = dnbd3_packet_magic; - dnbd3_request.cmd = CMD_SELECT_IMAGE; - iov[0].iov_base = &dnbd3_request; - iov[0].iov_len = sizeof(dnbd3_request); - serializer_reset_write(&dev->payload_buffer); - serializer_put_uint16(&dev->payload_buffer, PROTOCOL_VERSION); - serializer_put_string(&dev->payload_buffer, dev->imgname); - serializer_put_uint16(&dev->payload_buffer, dev->rid); - serializer_put_uint8(&dev->payload_buffer, 0); // is_server = false - iov[1].iov_base = &dev->payload_buffer; - dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&dev->payload_buffer); - fixup_request(dnbd3_request); - mlen = sizeof(dnbd3_request) + iov[1].iov_len; - if (kernel_sendmsg(dev->sock, &msg, iov, 2, mlen) != mlen) - error_dev("ERROR: Couldn't send CMD_SIZE_REQUEST."); - // receive reply header - iov[0].iov_base = &dnbd3_reply; - iov[0].iov_len = sizeof(dnbd3_reply); - if (kernel_recvmsg(dev->sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply)) - error_dev("FATAL: Received corrupted reply header after CMD_SIZE_REQUEST."); - // check reply header - fixup_reply(dnbd3_reply); - if (dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 3 || dnbd3_reply.size > MAX_PAYLOAD - || dnbd3_reply.magic != dnbd3_packet_magic) - error_dev("FATAL: Received invalid reply to CMD_SIZE_REQUEST, image doesn't exist on server."); - // receive reply payload - iov[0].iov_base = &dev->payload_buffer; - iov[0].iov_len = dnbd3_reply.size; - if (kernel_recvmsg(dev->sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size) - error_dev("FATAL: Cold not read CMD_SELECT_IMAGE payload on handshake."); - // handle/check reply payload - serializer_reset_read(&dev->payload_buffer, dnbd3_reply.size); - dev->cur_server.protocol_version = serializer_get_uint16(&dev->payload_buffer); - if (dev->cur_server.protocol_version < MIN_SUPPORTED_SERVER) - error_dev("FATAL: Server version is lower than min supported version."); - name = serializer_get_string(&dev->payload_buffer); - if (dev->rid != 0 && strcmp(name, dev->imgname) != 0) - error_dev_va("FATAL: Server offers image '%s', requested '%s'", name, dev->imgname); - if (strlen(dev->imgname) < strlen(name)) - { - dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_ATOMIC ); - if (dev->imgname == NULL ) - error_dev("FATAL: Reallocating buffer for new image name failed"); - } - strcpy(dev->imgname, name); - rid = serializer_get_uint16(&dev->payload_buffer); - if (dev->rid != 0 && dev->rid != rid) - error_dev_va("FATAL: Server provides rid %d, requested was %d.", (int)rid, (int)dev->rid); - dev->rid = rid; - dev->reported_size = serializer_get_uint64(&dev->payload_buffer); - if (dev->reported_size < 4096) - error_dev("ERROR: Reported size by server is < 4096"); - // store image information - set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */ - debug_dev_va("INFO: Filesize: %llu.", dev->reported_size); - dev->update_available = 0; - } - else // Switching server, connection is already established and size request was executed - { - debug_dev("INFO: On-the-fly server change."); - dev->sock = dev->better_sock; - dev->better_sock = NULL; - kernel_setsockopt(dev->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); - kernel_setsockopt(dev->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); - } +static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic); - dev->panic = 0; - dev->panic_count = 0; +static void dnbd3_discover(dnbd3_device_t *dev); - // Enqueue request to request_queue_send for a fresh list of alt servers - dnbd3_cmd_to_priv(req1, CMD_GET_SERVERS); - list_add(&req1->queuelist, &dev->request_queue_send); +static void dnbd3_internal_discover(dnbd3_device_t *dev); - // create required threads - dev->thread_send = kthread_create(dnbd3_net_send, dev, dev->disk->disk_name); - dev->thread_receive = kthread_create(dnbd3_net_receive, dev, dev->disk->disk_name); - dev->thread_discover = kthread_create(dnbd3_net_discover, dev, dev->disk->disk_name); - // start them up - wake_up_process(dev->thread_send); - wake_up_process(dev->thread_receive); - wake_up_process(dev->thread_discover); +static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms); - wake_up(&dev->process_queue_send); +// Use as write-only dump, don't care about race conditions etc. +static u8 __garbage_mem[PAGE_SIZE]; - // add heartbeat timer - dev->heartbeat_count = 0; +/** + * Delayed work triggering sending of keepalive packet. + */ +static void dnbd3_keepalive_workfn(struct work_struct *work) +{ + unsigned long irqflags; + dnbd3_device_t *dev = container_of(work, dnbd3_device_t, keepalive_work.work); -// init_timer_key changed from kernel version 4.14 to 4.15, see and compare to 4.15: -// https://elixir.bootlin.com/linux/v4.14.32/source/include/linux/timer.h#L98 -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) - timer_setup(&dev->hb_timer, dnbd3_net_heartbeat, 0); -#else - // Old timer setup - init_timer(&dev->hb_timer); - dev->hb_timer.data = (unsigned long)dev; - dev->hb_timer.function = dnbd3_net_heartbeat; -#endif - dev->hb_timer.expires = jiffies + HZ; - add_timer(&dev->hb_timer); - return 0; - error: ; - if (dev->sock) - { - sock_release(dev->sock); - dev->sock = NULL; + dnbd3_send_empty_request(dev, CMD_KEEPALIVE); + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (device_active(dev)) { + mod_delayed_work(system_freezable_power_efficient_wq, + &dev->keepalive_work, KEEPALIVE_INTERVAL * HZ); } - dev->cur_server.host.type = 0; - dev->cur_server.host.port = 0; - if (req1) - kfree(req1); - return -1; + spin_unlock_irqrestore(&dev->blk_lock, irqflags); } -int dnbd3_net_disconnect(dnbd3_device_t *dev) +/** + * Delayed work triggering discovery (alt server check) + */ +static void dnbd3_discover_workfn(struct work_struct *work) { - if (dev->disconnecting) - return 0; - - if (dev->cur_server.host.port) - debug_dev("INFO: Disconnecting device."); - - dev->disconnecting = 1; - - // clear heartbeat timer - del_timer(&dev->hb_timer); - - dev->discover = 0; - - if (dev->sock) - kernel_sock_shutdown(dev->sock, SHUT_RDWR); - - // kill sending and receiving threads - if (dev->thread_send) - { - kthread_stop(dev->thread_send); - } + dnbd3_device_t *dev = container_of(work, dnbd3_device_t, discover_work.work); - if (dev->thread_receive) - { - kthread_stop(dev->thread_receive); - } + dnbd3_discover(dev); +} - if (dev->thread_discover) - { - kthread_stop(dev->thread_discover); - dev->thread_discover = NULL; - } +/** + * For manually triggering an immediate discovery + */ +static void dnbd3_start_discover(dnbd3_device_t *dev, bool panic) +{ + unsigned long irqflags; - // clear socket - if (dev->sock) - { - sock_release(dev->sock); - dev->sock = NULL; + if (!device_active(dev)) + return; + if (panic && dnbd3_flag_get(dev->connection_lock)) { + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (!dev->panic) { + // Panic freshly turned on + dev->panic = true; + dev->discover_interval = TIMER_INTERVAL_PROBE_PANIC; + } + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + dnbd3_flag_reset(dev->connection_lock); } - dev->cur_server.host.type = 0; - dev->cur_server.host.port = 0; - - dev->disconnecting = 0; - - return 0; + spin_lock_irqsave(&dev->blk_lock, irqflags); + mod_delayed_work(system_freezable_power_efficient_wq, + &dev->discover_work, 1); + spin_unlock_irqrestore(&dev->blk_lock, irqflags); } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) -void dnbd3_net_heartbeat(struct timer_list *arg) -{ - dnbd3_device_t *dev = (dnbd3_device_t *)container_of(arg, dnbd3_device_t, hb_timer); -#else -void dnbd3_net_heartbeat(unsigned long arg) +/** + * Wrapper for the actual discover function below. Check run conditions + * here and re-schedule delayed task here. + */ +static void dnbd3_discover(dnbd3_device_t *dev) { - dnbd3_device_t *dev = (dnbd3_device_t *)arg; -#endif - // Because different events need different intervals, the timer is called once a second. - // Other intervals can be derived using dev->heartbeat_count. -#define timeout_seconds(x) (dev->heartbeat_count % (x) == 0) - - if (!dev->panic) - { - if (timeout_seconds(TIMER_INTERVAL_KEEPALIVE_PACKET)) - { - struct request *req = kmalloc(sizeof(struct request), GFP_ATOMIC ); - // send keepalive - if (req) - { - dnbd3_cmd_to_priv(req, CMD_KEEPALIVE); - list_add_tail(&req->queuelist, &dev->request_queue_send); - wake_up(&dev->process_queue_send); - } - else - { - debug_dev("ERROR: Couldn't create keepalive request."); - } - } - if ((dev->heartbeat_count > STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_NORMAL)) - || (dev->heartbeat_count <= STARTUP_MODE_DURATION && timeout_seconds(TIMER_INTERVAL_PROBE_STARTUP))) - { - // Normal discovery - dev->discover = 1; - wake_up(&dev->process_queue_discover); + unsigned long irqflags; + + if (!device_active(dev) || dnbd3_flag_taken(dev->connection_lock)) + return; // device not active anymore, or just about to switch + if (!dnbd3_flag_get(dev->discover_running)) + return; // Already busy + spin_lock_irqsave(&dev->blk_lock, irqflags); + cancel_delayed_work(&dev->discover_work); + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + dnbd3_internal_discover(dev); + dev->discover_count++; + // Re-queueing logic + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (device_active(dev)) { + mod_delayed_work(system_freezable_power_efficient_wq, + &dev->discover_work, dev->discover_interval * HZ); + if (dev->discover_interval < TIMER_INTERVAL_PROBE_MAX + && dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT) { + dev->discover_interval += 2; } } - else if (timeout_seconds(TIMER_INTERVAL_PROBE_PANIC)) - { - // Panic discovery - dev->discover = 1; - wake_up(&dev->process_queue_discover); - } - - dev->hb_timer.expires = jiffies + HZ; - - ++dev->heartbeat_count; - add_timer(&dev->hb_timer); -#undef timeout_seconds + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + dnbd3_flag_reset(dev->discover_running); } -int dnbd3_net_discover(void *data) +/** + * Discovery. Probe all (or some) known alt servers, + * and initiate connection switch if appropriate + */ +static void dnbd3_internal_discover(dnbd3_device_t *dev) { - dnbd3_device_t *dev = data; - struct sockaddr_in sin4; - struct sockaddr_in6 sin6; struct socket *sock, *best_sock = NULL; + dnbd3_alt_server_t *alt; + struct sockaddr_storage host_compare, best_server; + uint16_t remote_version; + ktime_t start, end; + unsigned long rtt = 0, best_rtt = 0; + int i, j, k, isize, fails, rtt_threshold; + int do_change = 0; + u8 check_order[NUMBER_SERVERS]; + const bool ready = dev->discover_count > DISCOVER_STARTUP_PHASE_COUNT; + const u32 turn = dev->discover_count % DISCOVER_HISTORY_SIZE; + + // Shuffle alt_servers + for (i = 0; i < NUMBER_SERVERS; ++i) + check_order[i] = i; - dnbd3_request_t dnbd3_request; - dnbd3_reply_t dnbd3_reply; - dnbd3_server_t *alt_server; - struct msghdr msg; - struct kvec iov[2]; - - char *buf, *name; - serialized_buffer_t *payload; - uint64_t filesize; - uint16_t rid; - - struct timeval start, end; - unsigned long rtt, best_rtt = 0; - unsigned long irqflags; - int i, j, isize, best_server, current_server; - int turn = 0; - int ready = 0, do_change = 0; - char check_order[NUMBER_SERVERS]; - int mlen; - - struct request *last_request = (struct request *)123, *cur_request = (struct request *)456; - - struct timeval timeout; - timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DISCOVERY; - timeout.tv_usec = 0; - - memset(&sin4, 0, sizeof(sin4)); - memset(&sin6, 0, sizeof(sin6)); - - init_msghdr(msg); + for (i = 0; i < NUMBER_SERVERS; ++i) { + j = get_random_u32() % NUMBER_SERVERS; + if (j != i) { + int tmp = check_order[i]; - buf = kmalloc(4096, GFP_KERNEL); - if (!buf) - { - debug_dev("FATAL: Kmalloc failed (discover)"); - return -1; + check_order[i] = check_order[j]; + check_order[j] = tmp; + } } - payload = (serialized_buffer_t *)buf; // Reuse this buffer to save kernel mem - dnbd3_request.magic = dnbd3_packet_magic; + best_server.ss_family = 0; + best_rtt = RTT_UNREACHABLE; - for (i = 0; i < NUMBER_SERVERS; ++i) { - check_order[i] = i; - } - - for (;;) - { - wait_event_interruptible(dev->process_queue_discover, - kthread_should_stop() || dev->discover || dev->thread_discover == NULL); + if (!ready || dev->panic) + isize = NUMBER_SERVERS; + else + isize = 3; - if (kthread_should_stop() || dev->imgname == NULL || dev->thread_discover == NULL ) + for (j = 0; j < NUMBER_SERVERS; ++j) { + if (!device_active(dev)) break; + i = check_order[j]; + mutex_lock(&dev->alt_servers_lock); + host_compare = dev->alt_servers[i].host; + fails = dev->alt_servers[i].failures; + mutex_unlock(&dev->alt_servers_lock); + if (host_compare.ss_family == 0) + continue; // Empty slot + // Reduced probability for hosts that have been unreachable + if (!dev->panic && fails > 50 && (get_random_u32() % 4) != 0) + continue; // If not in panic mode, skip server if it failed too many times + if (isize-- <= 0 && !is_same_server(&dev->cur_server.host, &host_compare)) + continue; // Only test isize servers plus current server + + // Initialize socket and connect + sock = NULL; + if (dnbd3_connect(dev, &host_compare, &sock) != 0) + goto error; - if (!dev->discover) - continue; - dev->discover = 0; - - if (dev->reported_size < 4096) - continue; - - // Check if the list of alt servers needs to be updated and do so if necessary - if (dev->new_servers_num) - { - spin_lock_irqsave(&dev->blk_lock, irqflags); - for (i = 0; i < dev->new_servers_num; ++i) - { - if (dev->new_servers[i].host.type != HOST_IP4 && dev->new_servers[i].host.type != HOST_IP6) // Invalid entry? - continue; - alt_server = get_existing_server(&dev->new_servers[i], dev); - if (alt_server != NULL ) // Server already known - { - if (dev->new_servers[i].failures == 1) - { - // REMOVE request - if (alt_server->host.type == HOST_IP4) - debug_dev_va("Removing alt server %pI4", alt_server->host.addr); - else - debug_dev_va("Removing alt server %pI6", alt_server->host.addr); - alt_server->host.type = 0; - continue; - } - // ADD, so just reset fail counter - alt_server->failures = 0; - continue; - } - if (dev->new_servers[i].failures == 1) // REMOVE, but server is not in list anyways - continue; - alt_server = get_free_alt_server(dev); - if (alt_server == NULL ) // All NUMBER_SERVERS slots are taken, ignore entry - continue; - // Add new server entry - alt_server->host = dev->new_servers[i].host; - if (alt_server->host.type == HOST_IP4) - debug_dev_va("Adding alt server %pI4", alt_server->host.addr); - else - debug_dev_va("Adding alt server %pI6", alt_server->host.addr); - alt_server->rtts[0] = alt_server->rtts[1] = alt_server->rtts[2] = alt_server->rtts[3] = RTT_UNREACHABLE; - alt_server->protocol_version = 0; - alt_server->failures = 0; - } - dev->new_servers_num = 0; - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - } + remote_version = 0; + if (!dnbd3_execute_handshake(dev, sock, &host_compare, &remote_version, false)) + goto error; - current_server = best_server = -1; - best_rtt = 0xFFFFFFFul; - if (dev->heartbeat_count < STARTUP_MODE_DURATION || dev->panic) - { - isize = NUMBER_SERVERS; - } - else - { - isize = 3; - } - if (NUMBER_SERVERS > isize) { - for (i = 0; i < isize; ++i) { - j = ((start.tv_sec >> i) ^ (start.tv_usec >> j)) % NUMBER_SERVERS; - if (j != i) { - mlen = check_order[i]; - check_order[i] = check_order[j]; - check_order[j] = mlen; - } + // panic mode, take first responding server + if (dev->panic) { + dnbd3_dev_info_host(dev, &host_compare, "panic mode, changing to new server\n"); + if (!dnbd3_flag_get(dev->connection_lock)) { + dnbd3_dev_info_host(dev, &host_compare, "...raced, ignoring\n"); + } else { + // Check global flag, a connect might have been in progress + if (best_sock != NULL) + sock_release(best_sock); + set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000 + 1000); + if (dnbd3_set_primary_connection(dev, sock, &host_compare, remote_version) != 0) + sock_release(sock); + dnbd3_flag_reset(dev->connection_lock); + return; } } - for (j = 0; j < NUMBER_SERVERS; ++j) - { - i = check_order[j]; - if (dev->alt_servers[i].host.type == 0) // Empty slot - continue; - if (!dev->panic && dev->alt_servers[i].failures > 50 && (start.tv_usec & 7) != 0) // If not in panic mode, skip server if it failed too many times - continue; - if (isize-- <= 0 && !is_same_server(&dev->cur_server, &dev->alt_servers[i])) - continue; - - // Initialize socket and connect - if (dnbd3_sock_create(dev->alt_servers[i].host.type, SOCK_STREAM, IPPROTO_TCP, &sock) < 0) - { - debug_alt("ERROR: Couldn't create socket (discover)."); - sock = NULL; - continue; - } - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); - kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); - sock->sk->sk_allocation = GFP_NOIO; - if (dev->alt_servers[i].host.type == HOST_IP4) - { - sin4.sin_family = AF_INET; - memcpy(&sin4.sin_addr, dev->alt_servers[i].host.addr, 4); - sin4.sin_port = dev->alt_servers[i].host.port; - if (kernel_connect(sock, (struct sockaddr *)&sin4, sizeof(sin4), 0) < 0) - goto error; - } - else - { - sin6.sin6_family = AF_INET6; - memcpy(&sin6.sin6_addr, dev->alt_servers[i].host.addr, 16); - sin6.sin6_port = dev->alt_servers[i].host.port; - if (kernel_connect(sock, (struct sockaddr *)&sin6, sizeof(sin6), 0) < 0) - goto error; - } + // actual rtt measurement is just the first block requests and reply + start = ktime_get_real(); + if (!dnbd3_request_test_block(dev, &host_compare, sock)) + goto error; + end = ktime_get_real(); - // Request filesize - dnbd3_request.cmd = CMD_SELECT_IMAGE; - iov[0].iov_base = &dnbd3_request; - iov[0].iov_len = sizeof(dnbd3_request); - serializer_reset_write(payload); - serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version - serializer_put_string(payload, dev->imgname); // image name - serializer_put_uint16(payload, dev->rid); // revision id - serializer_put_uint8(payload, 0); // are we a server? (no!) - iov[1].iov_base = payload; - dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(payload); - fixup_request(dnbd3_request); - mlen = iov[1].iov_len + sizeof(dnbd3_request); - if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) - error_alt("ERROR: Requesting image size failed."); - - // receive net reply - iov[0].iov_base = &dnbd3_reply; - iov[0].iov_len = sizeof(dnbd3_reply); - if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply)) - error_alt("ERROR: Receiving image size packet (header) failed (discover)."); - fixup_reply(dnbd3_reply); - if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4) - error_alt("ERROR: Content of image size packet (header) mismatched (discover)."); - - // receive data - iov[0].iov_base = payload; - iov[0].iov_len = dnbd3_reply.size; - if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != dnbd3_reply.size) - error_alt("ERROR: Receiving image size packet (payload) failed (discover)."); - serializer_reset_read(payload, dnbd3_reply.size); - - dev->alt_servers[i].protocol_version = serializer_get_uint16(payload); - if (dev->alt_servers[i].protocol_version < MIN_SUPPORTED_SERVER) - error_alt_va("ERROR: Server version too old (client: %d, server: %d, min supported: %d).", - (int)PROTOCOL_VERSION, (int)dev->alt_servers[i].protocol_version, (int)MIN_SUPPORTED_SERVER); - - name = serializer_get_string(payload); - if (name == NULL ) - error_alt("ERROR: Server did not supply an image name (discover)."); - - if (strcmp(name, dev->imgname) != 0) - error_alt_va("ERROR: Image name does not match requested one (client: '%s', server: '%s') (discover).", - dev->imgname, name); - - rid = serializer_get_uint16(payload); - if (rid != dev->rid) - error_alt_va("ERROR: Server supplied wrong rid (client: '%d', server: '%d') (discover).", - (int)dev->rid, (int)rid); - - filesize = serializer_get_uint64(payload); - if (filesize != dev->reported_size) - error_alt_va("ERROR: Reported image size of %llu does not match expected value %llu.(discover).", - (unsigned long long)filesize, (unsigned long long)dev->reported_size); - - // panic mode, take first responding server - if (dev->panic) - { - dev->panic = 0; - debug_alt("WARN: Panic mode, changing server:"); - if (best_sock != NULL ) - sock_release(best_sock); - dev->better_sock = sock; // Pass over socket to take a shortcut in *_connect(); - kfree(buf); - dev->thread_discover = NULL; - dnbd3_net_disconnect(dev); - memcpy(&dev->cur_server, &dev->alt_servers[i], sizeof(dev->cur_server)); - dnbd3_net_connect(dev); - return 0; - } + mutex_lock(&dev->alt_servers_lock); + if (is_same_server(&dev->alt_servers[i].host, &host_compare)) { + dev->alt_servers[i].protocol_version = remote_version; + dev->alt_servers[i].rtts[turn] = + (unsigned long)ktime_us_delta(end, start); - // Request block - dnbd3_request.cmd = CMD_GET_BLOCK; - // Do *NOT* pick a random block as it has proven to cause severe - // cache thrashing on the server - dnbd3_request.offset = 0; - dnbd3_request.size = RTT_BLOCK_SIZE; - fixup_request(dnbd3_request); - iov[0].iov_base = &dnbd3_request; - iov[0].iov_len = sizeof(dnbd3_request); - - // start rtt measurement - do_gettimeofday(&start); - - if (kernel_sendmsg(sock, &msg, iov, 1, sizeof(dnbd3_request)) <= 0) - error_alt("ERROR: Requesting test block failed (discover)."); - - // receive net reply - iov[0].iov_base = &dnbd3_reply; - iov[0].iov_len = sizeof(dnbd3_reply); - if (kernel_recvmsg(sock, &msg, iov, 1, sizeof(dnbd3_reply), msg.msg_flags) != sizeof(dnbd3_reply)) - error_alt("ERROR: Receiving test block header packet failed (discover)."); - fixup_reply(dnbd3_reply); - if (dnbd3_reply.magic - != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE) - error_alt_va("ERROR: Unexpected reply to block request: cmd=%d, size=%d (discover).", - (int)dnbd3_reply.cmd, (int)dnbd3_reply.size); - - // receive data - iov[0].iov_base = buf; - iov[0].iov_len = RTT_BLOCK_SIZE; - if (kernel_recvmsg(sock, &msg, iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE) - error_alt("ERROR: Receiving test block payload failed (discover)."); - - do_gettimeofday(&end); // end rtt measurement - - dev->alt_servers[i].rtts[turn] = (unsigned long)((end.tv_sec - start.tv_sec) * 1000000ull - + (end.tv_usec - start.tv_usec)); - - rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] - + dev->alt_servers[i].rtts[3]) / 4; - - if (best_rtt > rtt) - { - // This one is better, keep socket open in case we switch - best_rtt = rtt; - best_server = i; - if (best_sock != NULL ) - sock_release(best_sock); - best_sock = sock; - sock = NULL; - } - else - { - // Not better, discard connection - sock_release(sock); - sock = NULL; - } + rtt = 0; - // update cur servers rtt - if (is_same_server(&dev->cur_server, &dev->alt_servers[i])) - { - dev->cur_rtt = rtt; - current_server = i; - } + for (k = 0; k < DISCOVER_HISTORY_SIZE; ++k) + rtt += dev->alt_servers[i].rtts[k]; + rtt /= DISCOVER_HISTORY_SIZE; dev->alt_servers[i].failures = 0; + if (dev->alt_servers[i].best_count > 1) + dev->alt_servers[i].best_count -= 2; + } + mutex_unlock(&dev->alt_servers_lock); - continue; - - error: ; - ++dev->alt_servers[i].failures; + if (best_rtt > rtt) { + // This one is better, keep socket open in case we switch + best_rtt = rtt; + best_server = host_compare; + if (best_sock != NULL) + sock_release(best_sock); + best_sock = sock; + sock = NULL; + } else { + // Not better, discard connection sock_release(sock); sock = NULL; - dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE; - if (is_same_server(&dev->cur_server, &dev->alt_servers[i])) - { - dev->cur_rtt = RTT_UNREACHABLE; - current_server = i; - } - continue; } - if (dev->panic) - { - // After 21 retries, bail out by reporting errors to block layer - if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count < 255 && ++dev->panic_count == PROBE_COUNT_TIMEOUT + 1) - dnbd3_blk_fail_all_requests(dev); - } + // update cur servers rtt + if (is_same_server(&dev->cur_server.host, &host_compare)) + dev->cur_server.rtt = rtt; - if (best_server == -1 || kthread_should_stop() || dev->thread_discover == NULL ) // No alt server could be reached at all or thread should stop - { - if (best_sock != NULL ) // Should never happen actually - { - sock_release(best_sock); - best_sock = NULL; - } - continue; - } + continue; - do_change = ready && best_server != current_server && (start.tv_usec & 3) != 0 - && RTT_THRESHOLD_FACTOR(dev->cur_rtt) > best_rtt + 1500; - - if (ready && !do_change) { - spin_lock_irqsave(&dev->blk_lock, irqflags); - if (!list_empty(&dev->request_queue_send)) - { - cur_request = list_entry(dev->request_queue_send.next, struct request, queuelist); - do_change = (cur_request == last_request); - if (do_change) - printk("WARNING: Hung request on %s\n", dev->disk->disk_name); - } - else - { - cur_request = (struct request *)123; - } - last_request = cur_request; - spin_unlock_irqrestore(&dev->blk_lock, irqflags); +error: + if (sock != NULL) { + sock_release(sock); + sock = NULL; } - - // take server with lowest rtt - if (do_change) - { - printk("INFO: Server %d on %s is faster (%lluµs vs. %lluµs)\n", best_server, dev->disk->disk_name, - (unsigned long long)best_rtt, (unsigned long long)dev->cur_rtt); - kfree(buf); - dev->better_sock = best_sock; // Take shortcut by continuing to use open connection - dev->thread_discover = NULL; - dnbd3_net_disconnect(dev); - memcpy(&dev->cur_server, &dev->alt_servers[best_server], sizeof(dev->cur_server)); - dev->cur_rtt = best_rtt; - dnbd3_net_connect(dev); - return 0; + mutex_lock(&dev->alt_servers_lock); + if (is_same_server(&dev->alt_servers[i].host, &host_compare)) { + if (remote_version) + dev->alt_servers[i].protocol_version = remote_version; + ++dev->alt_servers[i].failures; + dev->alt_servers[i].rtts[turn] = RTT_UNREACHABLE; + if (dev->alt_servers[i].best_count > 2) + dev->alt_servers[i].best_count -= 3; } - - // Clean up connection that was held open for quicker server switch - if (best_sock != NULL ) - { - sock_release(best_sock); - best_sock = NULL; + mutex_unlock(&dev->alt_servers_lock); + if (is_same_server(&dev->cur_server.host, &host_compare)) + dev->cur_server.rtt = RTT_UNREACHABLE; + } // END - for loop over alt_servers + + if (best_server.ss_family == 0) { + // No alt server could be reached + ASSERT(!best_sock); + if (dev->panic) { + if (dev->panic_count < 255) + dev->panic_count++; + // If probe timeout is set, report error to block layer + if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count == PROBE_COUNT_TIMEOUT + 1) + dnbd3_blk_fail_all_requests(dev); } + return; + } - if (!ready || (start.tv_usec & 15) != 0) - turn = (turn + 1) % 4; - if (turn == 2) // Set ready when we only have 2 of 4 measurements for quicker load balancing - ready = 1; - + // If best server was repeatedly measured best, lower the switching threshold more + mutex_lock(&dev->alt_servers_lock); + alt = get_existing_alt_from_addr(&best_server, dev); + if (alt != NULL) { + if (alt->best_count < 178) + alt->best_count += 3; + rtt_threshold = 1800 - (alt->best_count * 10); + remote_version = alt->protocol_version; + } else { + rtt_threshold = 1800; + remote_version = 0; } - kfree(buf); - return 0; + mutex_unlock(&dev->alt_servers_lock); + + do_change = ready && !is_same_server(&best_server, &dev->cur_server.host) + && RTT_THRESHOLD_FACTOR(dev->cur_server.rtt) > best_rtt + rtt_threshold; + + // take server with lowest rtt + // if a (dis)connect is already in progress, we do nothing, this is not panic mode + if (do_change && device_active(dev) && dnbd3_flag_get(dev->connection_lock)) { + dnbd3_dev_info_cur(dev, "server %pISpc is faster (%lluµs vs. %lluµs)\n", + &best_server, + (unsigned long long)best_rtt, (unsigned long long)dev->cur_server.rtt); + set_socket_timeout(best_sock, false, // recv + MAX(best_rtt / 1000, SOCKET_TIMEOUT_RECV * 1000) + 500); + set_socket_timeout(best_sock, true, // send + MAX(best_rtt / 1000, SOCKET_TIMEOUT_SEND * 1000) + 500); + if (dnbd3_set_primary_connection(dev, best_sock, &best_server, remote_version) != 0) + sock_release(best_sock); + dnbd3_flag_reset(dev->connection_lock); + return; + } + + // Clean up connection that was held open for quicker server switch + if (best_sock != NULL) + sock_release(best_sock); } -int dnbd3_net_send(void *data) +/** + * Worker for sending pending requests. This will be triggered whenever + * we get a new request from the block layer. The worker will then + * work through all the requests in the send queue, request them from + * the server, and return again. + */ +static void dnbd3_send_workfn(struct work_struct *work) { - dnbd3_device_t *dev = data; - struct request *blk_request, *tmp_request; - - dnbd3_request_t dnbd3_request; - struct msghdr msg; - struct kvec iov; - + dnbd3_device_t *dev = container_of(work, dnbd3_device_t, send_work); + struct request *blk_request; + struct dnbd3_cmd *cmd; unsigned long irqflags; - init_msghdr(msg); - - dnbd3_request.magic = dnbd3_packet_magic; - - set_user_nice(current, -20); - - // move already sent requests to request_queue_send again - while (!list_empty(&dev->request_queue_receive)) - { - printk("WARN: Request queue was not empty on %s\n", dev->disk->disk_name); - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_for_each_entry_safe(blk_request, tmp_request, &dev->request_queue_receive, queuelist) - { - list_del_init(&blk_request->queuelist); - list_add(&blk_request->queuelist, &dev->request_queue_send); - } - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - } - - for (;;) - { - wait_event_interruptible(dev->process_queue_send, kthread_should_stop() || !list_empty(&dev->request_queue_send)); - - if (kthread_should_stop()) + mutex_lock(&dev->send_mutex); + while (dev->sock && device_active(dev)) { + // extract next block request + spin_lock_irqsave(&dev->send_queue_lock, irqflags); + if (list_empty(&dev->send_queue)) { + spin_unlock_irqrestore(&dev->send_queue_lock, irqflags); break; - - // extract block request - spin_lock_irqsave(&dev->blk_lock, irqflags); - if (list_empty(&dev->request_queue_send)) - { - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - continue; } - blk_request = list_entry(dev->request_queue_send.next, struct request, queuelist); - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - // what to do? - switch (dnbd3_req_op(blk_request)) - { - case DNBD3_DEV_READ: - dnbd3_request.cmd = CMD_GET_BLOCK; - dnbd3_request.offset = blk_rq_pos(blk_request) << 9; // *512 - dnbd3_request.size = blk_rq_bytes(blk_request); // bytes left to complete entire request - // enqueue request to request_queue_receive - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_del_init(&blk_request->queuelist); - list_add_tail(&blk_request->queuelist, &dev->request_queue_receive); - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - break; - case DNBD3_REQ_OP_SPECIAL: - dnbd3_request.cmd = dnbd3_priv_to_cmd(blk_request); - dnbd3_request.size = 0; - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_del_init(&blk_request->queuelist); - spin_unlock_irqrestore(&dev->blk_lock, irqflags); + blk_request = list_entry(dev->send_queue.next, struct request, queuelist); + list_del_init(&blk_request->queuelist); + spin_unlock_irqrestore(&dev->send_queue_lock, irqflags); + // append to receive queue + spin_lock_irqsave(&dev->recv_queue_lock, irqflags); + list_add_tail(&blk_request->queuelist, &dev->recv_queue); + spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags); + + cmd = blk_mq_rq_to_pdu(blk_request); + if (!dnbd3_send_request(dev->sock, CMD_GET_BLOCK, cmd->handle, + blk_rq_pos(blk_request) << 9 /* sectors */, blk_rq_bytes(blk_request))) { + if (!dnbd3_flag_taken(dev->connection_lock)) { + dnbd3_dev_err_cur(dev, "connection to server lost (send)\n"); + dnbd3_start_discover(dev, true); + } break; - - default: - printk("ERROR: Unknown command (send %u %u)\n", (int)blk_request->cmd_flags, (int)dnbd3_req_op(blk_request)); - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_del_init(&blk_request->queuelist); - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - continue; } - - // send net request - dnbd3_request.handle = (uint64_t)(uintptr_t)blk_request; // Double cast to prevent warning on 32bit - fixup_request(dnbd3_request); - iov.iov_base = &dnbd3_request; - iov.iov_len = sizeof(dnbd3_request); - if (kernel_sendmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_request)) != sizeof(dnbd3_request)) - { - debug_dev("ERROR: Connection to server lost (send)"); - goto error; - } - wake_up(&dev->process_queue_receive); } - - dev->thread_send = NULL; - return 0; - - error: ; - if (dev->sock) - kernel_sock_shutdown(dev->sock, SHUT_RDWR); - if (!dev->disconnecting) - { - dev->panic = 1; - dev->discover = 1; - wake_up(&dev->process_queue_discover); - } - dev->thread_send = NULL; - return -1; + mutex_unlock(&dev->send_mutex); } -int dnbd3_net_receive(void *data) +/** + * The receive workfn stays active for as long as the connection to a server + * lasts, i.e. it only gets restarted when we switch to a new server. + */ +static void dnbd3_recv_workfn(struct work_struct *work) { - dnbd3_device_t *dev = data; - struct request *blk_request, *tmp_request, *received_request; - - dnbd3_reply_t dnbd3_reply; - struct msghdr msg; - struct kvec iov; + dnbd3_device_t *dev = container_of(work, dnbd3_device_t, recv_work); + struct request *blk_request; + struct request *rq_iter; + struct dnbd3_cmd *cmd; + dnbd3_reply_t reply_hdr; struct req_iterator iter; struct bio_vec bvec_inst; struct bio_vec *bvec = &bvec_inst; + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL }; + struct kvec iov; void *kaddr; unsigned long irqflags; - sigset_t blocked, oldset; uint16_t rid; - unsigned long int recv_timeout = jiffies; - - int count, remaining, ret; - - init_msghdr(msg); - set_user_nice(current, -20); + int remaining; + int ret; - while (!kthread_should_stop()) - { + mutex_lock(&dev->recv_mutex); + while (dev->sock) { // receive net reply - iov.iov_base = &dnbd3_reply; - iov.iov_len = sizeof(dnbd3_reply); - ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags); - if (ret == -EAGAIN) - { - if (jiffies < recv_timeout) recv_timeout = jiffies; // Handle overflow - if ((jiffies - recv_timeout) / HZ > SOCKET_KEEPALIVE_TIMEOUT) - error_dev_va("ERROR: Receive timeout reached (%d of %d secs).", (int)((jiffies - recv_timeout) / HZ), (int)SOCKET_KEEPALIVE_TIMEOUT); - continue; + ret = dnbd3_recv_reply(dev->sock, &reply_hdr); + if (ret == 0) { + /* have not received any data, but remote peer is shutdown properly */ + dnbd3_dev_dbg_cur(dev, "remote peer has performed an orderly shutdown\n"); + goto out_unlock; + } else if (ret < 0) { + if (ret == -EAGAIN) { + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, "receive timeout reached\n"); + } else { + /* for all errors other than -EAGAIN, print errno */ + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, "connection to server lost (receive, errno=%d)\n", ret); + } + goto out_unlock; } - if (ret <= 0) - error_dev("ERROR: Connection to server lost (receive)"); - if (ret != sizeof(dnbd3_reply)) - error_dev("ERROR: Recv msg header."); - fixup_reply(dnbd3_reply); - // check error - if (dnbd3_reply.magic != dnbd3_packet_magic) - error_dev("ERROR: Wrong packet magic (Receive)."); - if (dnbd3_reply.cmd == 0) - error_dev("ERROR: Command was 0 (Receive)."); + /* check if arrived data is valid */ + if (ret != sizeof(reply_hdr)) { + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, "recv partial msg header (%d/%d bytes)\n", + ret, (int)sizeof(reply_hdr)); + goto out_unlock; + } - // Update timeout - recv_timeout = jiffies; + // check error + if (reply_hdr.magic != dnbd3_packet_magic) { + dnbd3_dev_err_cur(dev, "wrong packet magic (receive)\n"); + goto out_unlock; + } // what to do? - switch (dnbd3_reply.cmd) - { + switch (reply_hdr.cmd) { case CMD_GET_BLOCK: // search for replied request in queue blk_request = NULL; - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_for_each_entry_safe(received_request, tmp_request, &dev->request_queue_receive, queuelist) - { - if ((uint64_t)(uintptr_t)received_request == dnbd3_reply.handle) // Double cast to prevent warning on 32bit - { - blk_request = received_request; + spin_lock_irqsave(&dev->recv_queue_lock, irqflags); + list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) { + cmd = blk_mq_rq_to_pdu(rq_iter); + if (cmd->handle == reply_hdr.handle) { + blk_request = rq_iter; + list_del_init(&blk_request->queuelist); break; } } - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - if (blk_request == NULL ) - error_dev_va("ERROR: Received block data for unrequested handle (%llu: %llu).\n", - (unsigned long long)dnbd3_reply.handle, (unsigned long long)dnbd3_reply.size); + spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags); + if (blk_request == NULL) { + dnbd3_dev_err_cur(dev, "received block data for unrequested handle (%llx: len=%llu)\n", + reply_hdr.handle, + (u64)reply_hdr.size); + goto out_unlock; + } // receive data and answer to block layer #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 14, 0) - rq_for_each_segment(bvec_inst, blk_request, iter) + rq_for_each_segment(bvec_inst, blk_request, iter) { #else - rq_for_each_segment(bvec, blk_request, iter) + rq_for_each_segment(bvec, blk_request, iter) { #endif - { - siginitsetinv(&blocked, sigmask(SIGKILL)); - sigprocmask(SIG_SETMASK, &blocked, &oldset); - kaddr = kmap(bvec->bv_page) + bvec->bv_offset; iov.iov_base = kaddr; iov.iov_len = bvec->bv_len; - if (kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags) != bvec->bv_len) - { - kunmap(bvec->bv_page); - sigprocmask(SIG_SETMASK, &oldset, NULL ); - error_dev("ERROR: Receiving from net to block layer."); - } + ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags); kunmap(bvec->bv_page); - - sigprocmask(SIG_SETMASK, &oldset, NULL ); + if (ret != bvec->bv_len) { + if (ret == 0) { + /* have not received any data, but remote peer is shutdown properly */ + dnbd3_dev_dbg_cur( + dev, "remote peer has performed an orderly shutdown\n"); + } else if (ret < 0) { + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, + "disconnect: receiving from net to block layer\n"); + } else { + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, + "receiving from net to block layer (%d bytes)\n", ret); + } + // Requeue request + spin_lock_irqsave(&dev->send_queue_lock, irqflags); + list_add(&blk_request->queuelist, &dev->send_queue); + spin_unlock_irqrestore(&dev->send_queue_lock, irqflags); + goto out_unlock; + } } - spin_lock_irqsave(&dev->blk_lock, irqflags); - list_del_init(&blk_request->queuelist); - __blk_end_request_all(blk_request, 0); - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - continue; + blk_mq_end_request(blk_request, BLK_STS_OK); + break; case CMD_GET_SERVERS: - if (!dev->use_server_provided_alts) - { - remaining = dnbd3_reply.size; - goto consume_payload; - } - spin_lock_irqsave(&dev->blk_lock, irqflags); - dev->new_servers_num = 0; - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - count = MIN(NUMBER_SERVERS, dnbd3_reply.size / sizeof(dnbd3_server_entry_t)); - - if (count != 0) - { - iov.iov_base = dev->new_servers; - iov.iov_len = count * sizeof(dnbd3_server_entry_t); - if (kernel_recvmsg(dev->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags) - != (count * sizeof(dnbd3_server_entry_t))) - error_dev("ERROR: Recv CMD_GET_SERVERS payload."); - spin_lock_irqsave(&dev->blk_lock, irqflags); - dev->new_servers_num = count; - spin_unlock_irqrestore(&dev->blk_lock, irqflags); - } - // If there were more servers than accepted, remove the remaining data from the socket buffer - remaining = dnbd3_reply.size - (count * sizeof(dnbd3_server_entry_t)); - consume_payload: while (remaining > 0) - { - count = MIN(sizeof(dnbd3_reply), remaining); // Abuse the reply struct as the receive buffer - iov.iov_base = &dnbd3_reply; - iov.iov_len = count; - ret = kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret <= 0) - error_dev("ERROR: Recv additional payload from CMD_GET_SERVERS."); - remaining -= ret; + remaining = reply_hdr.size; + if (dev->use_server_provided_alts) { + dnbd3_server_entry_t new_server; + + while (remaining >= sizeof(dnbd3_server_entry_t)) { + if (dnbd3_recv_bytes(dev->sock, &new_server, sizeof(new_server)) + != sizeof(new_server)) { + if (!dnbd3_flag_taken(dev->connection_lock)) + dnbd3_dev_err_cur(dev, "recv CMD_GET_SERVERS payload\n"); + goto out_unlock; + } + // TODO: Log + if (new_server.failures == 0) { // ADD + dnbd3_add_server(dev, &new_server.host); + } else { // REM + dnbd3_rem_server(dev, &new_server.host); + } + remaining -= sizeof(new_server); + } } - continue; + if (!dnbd3_drain_socket(dev, dev->sock, remaining)) + goto out_unlock; + break; case CMD_LATEST_RID: - if (dnbd3_reply.size != 2) - { - printk("ERROR: CMD_LATEST_RID.size != 2.\n"); + if (reply_hdr.size < 2) { + dev_err(dnbd3_device_to_dev(dev), "CMD_LATEST_RID.size < 2\n"); continue; } - iov.iov_base = &rid; - iov.iov_len = sizeof(rid); - if (kernel_recvmsg(dev->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags) <= 0) - { - printk("ERROR: Could not receive CMD_LATEST_RID payload.\n"); - } - else - { + if (dnbd3_recv_bytes(dev->sock, &rid, 2) != 2) { + if (!dnbd3_flag_taken(dev->connection_lock)) + dev_err(dnbd3_device_to_dev(dev), "could not receive CMD_LATEST_RID payload\n"); + } else { rid = net_order_16(rid); - printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)rid, (int)dev->rid); + dnbd3_dev_info_cur(dev, "latest rid of %s is %d (currently using %d)\n", + dev->imgname, (int)rid, (int)dev->rid); dev->update_available = (rid > dev->rid ? 1 : 0); } + if (reply_hdr.size > 2) + dnbd3_drain_socket(dev, dev->sock, reply_hdr.size - 2); continue; case CMD_KEEPALIVE: - if (dnbd3_reply.size != 0) - printk("ERROR: keep alive packet with payload.\n"); + if (reply_hdr.size != 0) { + dev_dbg(dnbd3_device_to_dev(dev), "keep alive packet with payload\n"); + dnbd3_drain_socket(dev, dev->sock, reply_hdr.size); + } continue; default: - printk("ERROR: Unknown command (Receive)\n"); - continue; + dev_err(dnbd3_device_to_dev(dev), "unknown command: %d (receive), aborting connection\n", (int)reply_hdr.cmd); + goto out_unlock; + } + } +out_unlock: + // This will check if we actually still need a new connection + dnbd3_start_discover(dev, true); + mutex_unlock(&dev->recv_mutex); +} +/** + * Set send or receive timeout of given socket + */ +static void set_socket_timeout(struct socket *sock, bool set_send, int timeout_ms) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) + int opt = set_send ? SO_SNDTIMEO_NEW : SO_RCVTIMEO_NEW; + struct __kernel_sock_timeval timeout; +#else + int opt = set_send ? SO_SNDTIMEO : SO_RCVTIMEO; + struct timeval timeout; +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) + sockptr_t timeout_ptr = KERNEL_SOCKPTR(&timeout); +#else + char *timeout_ptr = (char *)&timeout; +#endif + + timeout.tv_sec = timeout_ms / 1000; + timeout.tv_usec = (timeout_ms % 1000) * 1000; + sock_setsockopt(sock, SOL_SOCKET, opt, timeout_ptr, sizeof(timeout)); +} + +static int dnbd3_connect(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket **sock_out) +{ + ktime_t start; + int ret, connect_time_ms; + struct socket *sock; + int retries = 4; + const int addrlen = addr->ss_family == AF_INET ? sizeof(struct sockaddr_in) + : sizeof(struct sockaddr_in6); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 2, 0) + ret = sock_create_kern(&init_net, addr->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); +#else + ret = sock_create_kern(addr->ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); +#endif + if (ret < 0) { + dev_err(dnbd3_device_to_dev(dev), "couldn't create socket: %d\n", ret); + return ret; + } + + /* Only one retry, TCP no delay */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) + tcp_sock_set_syncnt(sock->sk, 1); + tcp_sock_set_nodelay(sock->sk); + /* because of our aggressive timeouts, this is pointless */ + sock_no_linger(sock->sk); +#else + /* add legacy version of this, but ignore others as they're not that important */ + ret = 1; + kernel_setsockopt(sock, IPPROTO_TCP, TCP_SYNCNT, + (char *)&ret, sizeof(ret)); +#endif + /* allow this socket to use reserved mem (vm.mem_free_kbytes) */ + sk_set_memalloc(sock->sk); + sock->sk->sk_allocation = GFP_NOIO; + + if (dev->panic && dev->panic_count > 1) { + /* in panic mode for some time, start increasing timeouts */ + connect_time_ms = dev->panic_count * 1000; + } else { + /* otherwise, use 2*RTT of current server */ + connect_time_ms = dev->cur_server.rtt * 2 / 1000; + } + /* but obey a minimal configurable value, and maximum sanity check */ + if (connect_time_ms < SOCKET_TIMEOUT_SEND * 1000) + connect_time_ms = SOCKET_TIMEOUT_SEND * 1000; + else if (connect_time_ms > 60000) + connect_time_ms = 60000; + set_socket_timeout(sock, false, connect_time_ms); // recv + set_socket_timeout(sock, true, connect_time_ms); // send + start = ktime_get_real(); + while (--retries > 0) { + ret = kernel_connect(sock, (struct sockaddr *)addr, addrlen, 0); + connect_time_ms = (int)ktime_ms_delta(ktime_get_real(), start); + if (connect_time_ms > 2 * SOCKET_TIMEOUT_SEND * 1000) { + /* Either I'm losing my mind or there was a specific build of kernel + * 5.x where SO_RCVTIMEO didn't affect the connect call above, so + * this function would hang for over a minute for unreachable hosts. + * Leave in this debug check for twice the configured timeout + */ + dnbd3_dev_dbg_host(dev, addr, "connect: call took %dms\n", + connect_time_ms); } + if (ret != 0) { + if (ret == -EINTR) + dnbd3_dev_dbg_host(dev, addr, "connect: interrupted system call (blocked %dms)\n", + connect_time_ms); + else + dnbd3_dev_dbg_host(dev, addr, "connect: failed (%d, blocked %dms)\n", + ret, connect_time_ms); + goto error; + } + *sock_out = sock; + return 0; } +error: + sock_release(sock); + return ret < 0 ? ret : -EIO; +} - printk("dnbd3_net_receive terminated normally.\n"); - dev->thread_receive = NULL; - return 0; +#define dnbd3_err_dbg_host(...) do { \ + if (dev->panic || dev->sock == NULL) \ + dnbd3_dev_err_host(__VA_ARGS__); \ + else \ + dnbd3_dev_dbg_host(__VA_ARGS__); \ +} while (0) + +/** + * Execute protocol handshake on a newly connected socket. + * If this is the initial connection to any server, ie. we're being called + * through the initial ioctl() to open a device, we'll store the rid, filesize + * etc. in the dev struct., otherwise, this is a potential switch to another + * server, so we validate the filesize, rid, name against what we expect. + * The server's protocol version is returned in 'remote_version' + */ +static bool dnbd3_execute_handshake(dnbd3_device_t *dev, struct socket *sock, + struct sockaddr_storage *addr, uint16_t *remote_version, bool copy_data) +{ + unsigned long irqflags; + const char *name; + uint64_t filesize; + int mlen; + uint16_t rid; + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL }; + struct kvec iov[2]; + serialized_buffer_t *payload; + dnbd3_reply_t reply_hdr; + dnbd3_request_t request_hdr = { .magic = dnbd3_packet_magic }; + + payload = kmalloc(sizeof(*payload), GFP_KERNEL); + if (payload == NULL) + goto error; + + if (copy_data && device_active(dev)) + dev_warn(dnbd3_device_to_dev(dev), "Called handshake function with copy_data enabled when reported_size is not zero\n"); + + // Request filesize + request_hdr.cmd = CMD_SELECT_IMAGE; + iov[0].iov_base = &request_hdr; + iov[0].iov_len = sizeof(request_hdr); + serializer_reset_write(payload); + serializer_put_uint16(payload, PROTOCOL_VERSION); // DNBD3 protocol version + serializer_put_string(payload, dev->imgname); // image name + serializer_put_uint16(payload, dev->rid); // revision id + serializer_put_uint8(payload, 0); // are we a server? (no!) + iov[1].iov_base = payload; + request_hdr.size = iov[1].iov_len = serializer_get_written_length(payload); + fixup_request(request_hdr); + mlen = iov[0].iov_len + iov[1].iov_len; + if (kernel_sendmsg(sock, &msg, iov, 2, mlen) != mlen) { + dnbd3_err_dbg_host(dev, addr, "requesting image size failed\n"); + goto error; + } + + // receive net reply + if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) { + dnbd3_err_dbg_host(dev, addr, "receiving image size packet (header) failed\n"); + goto error; + } + if (reply_hdr.magic != dnbd3_packet_magic + || reply_hdr.cmd != CMD_SELECT_IMAGE || reply_hdr.size < 4 + || reply_hdr.size > sizeof(*payload)) { + dnbd3_err_dbg_host(dev, addr, + "corrupt CMD_SELECT_IMAGE reply\n"); + goto error; + } + + // receive data + iov[0].iov_base = payload; + iov[0].iov_len = reply_hdr.size; + if (kernel_recvmsg(sock, &msg, iov, 1, reply_hdr.size, msg.msg_flags) + != reply_hdr.size) { + dnbd3_err_dbg_host(dev, addr, + "receiving payload of CMD_SELECT_IMAGE reply failed\n"); + goto error; + } + serializer_reset_read(payload, reply_hdr.size); + + *remote_version = serializer_get_uint16(payload); + name = serializer_get_string(payload); + rid = serializer_get_uint16(payload); + filesize = serializer_get_uint64(payload); + + if (*remote_version < MIN_SUPPORTED_SERVER) { + dnbd3_err_dbg_host(dev, addr, + "server version too old (client: %d, server: %d, min supported: %d)\n", + (int)PROTOCOL_VERSION, (int)*remote_version, + (int)MIN_SUPPORTED_SERVER); + goto error; + } + if (name == NULL) { + dnbd3_err_dbg_host(dev, addr, "server did not supply an image name\n"); + goto error; + } + if (rid == 0) { + dnbd3_err_dbg_host(dev, addr, "server did not supply a revision id\n"); + goto error; + } + + if (copy_data) { + if (filesize < DNBD3_BLOCK_SIZE) { + dnbd3_err_dbg_host(dev, addr, "reported size by server is < 4096\n"); + goto error; + } + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (strlen(dev->imgname) < strlen(name)) { + dev->imgname = krealloc(dev->imgname, strlen(name) + 1, GFP_KERNEL); + if (dev->imgname == NULL) { + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + dnbd3_err_dbg_host(dev, addr, "reallocating buffer for new image name failed\n"); + goto error; + } + } + strcpy(dev->imgname, name); + dev->rid = rid; + // store image information + dev->reported_size = filesize; + dev->update_available = 0; + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */ + dnbd3_dev_dbg_host(dev, addr, "image size: %llu\n", dev->reported_size); + } else { + /* switching connection, sanity checks */ + if (rid != dev->rid) { + dnbd3_err_dbg_host(dev, addr, + "server supplied wrong rid (client: '%d', server: '%d')\n", + (int)dev->rid, (int)rid); + goto error; + } + + if (strcmp(name, dev->imgname) != 0) { + dnbd3_err_dbg_host(dev, addr, "server offers image '%s', requested '%s'\n", name, dev->imgname); + goto error; + } + + if (filesize != dev->reported_size) { + dnbd3_err_dbg_host(dev, addr, + "reported image size of %llu does not match expected value %llu\n", + (unsigned long long)filesize, (unsigned long long)dev->reported_size); + goto error; + } + } + kfree(payload); + return true; + +error: + kfree(payload); + return false; +} + +static bool dnbd3_send_request(struct socket *sock, u16 cmd, u64 handle, u64 offset, u32 size) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + dnbd3_request_t request_hdr = { + .magic = dnbd3_packet_magic, + .cmd = cmd, + .size = size, + .offset = offset, + .handle = handle, + }; + struct kvec iov = { .iov_base = &request_hdr, .iov_len = sizeof(request_hdr) }; + + fixup_request(request_hdr); + return kernel_sendmsg(sock, &msg, &iov, 1, sizeof(request_hdr)) == sizeof(request_hdr); +} + +/** + * Send a request with given cmd type and empty payload. + */ +static bool dnbd3_send_empty_request(dnbd3_device_t *dev, u16 cmd) +{ + int ret; + + mutex_lock(&dev->send_mutex); + ret = dev->sock + && dnbd3_send_request(dev->sock, cmd, 0, 0, 0); + mutex_unlock(&dev->send_mutex); + return ret; +} + +static int dnbd3_recv_bytes(struct socket *sock, void *buffer, size_t count) +{ + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL | MSG_WAITALL }; + struct kvec iov = { .iov_base = buffer, .iov_len = count }; + + return kernel_recvmsg(sock, &msg, &iov, 1, count, msg.msg_flags); +} + +static int dnbd3_recv_reply(struct socket *sock, dnbd3_reply_t *reply_hdr) +{ + int ret = dnbd3_recv_bytes(sock, reply_hdr, sizeof(*reply_hdr)); + + fixup_reply(*reply_hdr); + return ret; +} + +static bool dnbd3_drain_socket(dnbd3_device_t *dev, struct socket *sock, int bytes) +{ + int ret; + struct kvec iov; + struct msghdr msg = { .msg_flags = MSG_NOSIGNAL }; + + while (bytes > 0) { + iov.iov_base = __garbage_mem; + iov.iov_len = sizeof(__garbage_mem); + ret = kernel_recvmsg(sock, &msg, &iov, 1, MIN(bytes, iov.iov_len), msg.msg_flags); + if (ret <= 0) { + dnbd3_dev_err_cur(dev, "draining payload failed (ret=%d)\n", ret); + return false; + } + bytes -= ret; + } + return true; +} + +static bool dnbd3_request_test_block(dnbd3_device_t *dev, struct sockaddr_storage *addr, struct socket *sock) +{ + dnbd3_reply_t reply_hdr; + + // Request block + if (!dnbd3_send_request(sock, CMD_GET_BLOCK, 0, 0, RTT_BLOCK_SIZE)) { + dnbd3_err_dbg_host(dev, addr, "requesting test block failed\n"); + return false; + } + + // receive net reply + if (dnbd3_recv_reply(sock, &reply_hdr) != sizeof(reply_hdr)) { + dnbd3_err_dbg_host(dev, addr, "receiving test block header packet failed\n"); + return false; + } + if (reply_hdr.magic != dnbd3_packet_magic || reply_hdr.cmd != CMD_GET_BLOCK + || reply_hdr.size != RTT_BLOCK_SIZE || reply_hdr.handle != 0) { + dnbd3_err_dbg_host(dev, addr, + "unexpected reply to block request: cmd=%d, size=%d, handle=%llu (discover)\n", + (int)reply_hdr.cmd, (int)reply_hdr.size, reply_hdr.handle); + return false; + } - error: + // receive data + return dnbd3_drain_socket(dev, sock, RTT_BLOCK_SIZE); +} +#undef dnbd3_err_dbg_host + +static void replace_main_socket(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version) +{ + unsigned long irqflags; + + mutex_lock(&dev->send_mutex); + // First, shutdown connection, so receive worker will leave its mainloop if (dev->sock) kernel_sock_shutdown(dev->sock, SHUT_RDWR); - if (!dev->disconnecting) - { - dev->panic = 1; - dev->discover = 1; - wake_up(&dev->process_queue_discover); + mutex_lock(&dev->recv_mutex); + // Receive worker is done, get rid of socket and replace + if (dev->sock) + sock_release(dev->sock); + dev->sock = sock; + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (addr == NULL) { + memset(&dev->cur_server, 0, sizeof(dev->cur_server)); + } else { + dev->cur_server.host = *addr; + dev->cur_server.rtt = 0; + dev->cur_server.protocol_version = protocol_version; } - dev->thread_receive = NULL; - return -1; + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + mutex_unlock(&dev->recv_mutex); + mutex_unlock(&dev->send_mutex); } +static void dnbd3_release_resources(dnbd3_device_t *dev) +{ + if (dev->send_wq) + destroy_workqueue(dev->send_wq); + dev->send_wq = NULL; + if (dev->recv_wq) + destroy_workqueue(dev->recv_wq); + dev->recv_wq = NULL; + mutex_destroy(&dev->send_mutex); + mutex_destroy(&dev->recv_mutex); +} + +/** + * Establish new connection on a dnbd3 device. + * Return 0 on success, errno otherwise + */ +int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init) +{ + unsigned long irqflags; + struct socket *sock = NULL; + uint16_t proto_version; + int ret; + + ASSERT(dnbd3_flag_taken(dev->connection_lock)); + if (init && device_active(dev)) { + dnbd3_dev_err_cur(dev, "device already configured/connected\n"); + return -EBUSY; + } + if (!init && !device_active(dev)) { + dev_warn(dnbd3_device_to_dev(dev), "connection switch called on unconfigured device\n"); + return -ENOTCONN; + } + + dnbd3_dev_dbg_host(dev, addr, "connecting...\n"); + ret = dnbd3_connect(dev, addr, &sock); + if (ret != 0 || sock == NULL) + goto error; + + /* execute the "select image" handshake */ + // if init is true, reported_size will be set + if (!dnbd3_execute_handshake(dev, sock, addr, &proto_version, init)) { + ret = -EINVAL; + goto error; + } + + if (init) { + // We're setting up the device for use - allocate resources + // Do not goto error before this + ASSERT(!dev->send_wq); + ASSERT(!dev->recv_wq); + mutex_init(&dev->send_mutex); + mutex_init(&dev->recv_mutex); + // a designated queue for sending, that allows one active task only + dev->send_wq = alloc_workqueue("dnbd%d-send", + WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI, + 1, dev->index); + dev->recv_wq = alloc_workqueue("dnbd%d-recv", + WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + 1, dev->index); + if (!dev->send_wq || !dev->recv_wq) { + ret = -ENOMEM; + goto error_dealloc; + } + } + + set_socket_timeout(sock, false, SOCKET_TIMEOUT_RECV * 1000); // recv + dnbd3_set_primary_connection(dev, sock, addr, proto_version); + sock = NULL; // In case we ever goto error* after this point + + spin_lock_irqsave(&dev->blk_lock, irqflags); + if (init) { + dev->discover_count = 0; + dev->discover_interval = TIMER_INTERVAL_PROBE_STARTUP; + // discovery and keepalive are not critical, use the power efficient queue + queue_delayed_work(system_power_efficient_wq, &dev->discover_work, + dev->discover_interval * HZ); + queue_delayed_work(system_power_efficient_wq, &dev->keepalive_work, + KEEPALIVE_INTERVAL * HZ); + // but the receiver is performance critical AND runs indefinitely, use the + // the cpu intensive queue, as jobs submitted there will not cound towards + // the concurrency limit of per-cpu worker threads. It still feels a little + // dirty to avoid managing our own thread, but nbd does it too. + } + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + return 0; + +error_dealloc: + if (init) { + // If anything fails during initialization, free resources again + dnbd3_release_resources(dev); + } +error: + if (init) + dev->reported_size = 0; + if (sock) + sock_release(sock); + return ret < 0 ? ret : -EIO; +} + +void dnbd3_net_work_init(dnbd3_device_t *dev) +{ + INIT_WORK(&dev->send_work, dnbd3_send_workfn); + INIT_WORK(&dev->recv_work, dnbd3_recv_workfn); + INIT_DELAYED_WORK(&dev->discover_work, dnbd3_discover_workfn); + INIT_DELAYED_WORK(&dev->keepalive_work, dnbd3_keepalive_workfn); +} + +static int dnbd3_set_primary_connection(dnbd3_device_t *dev, struct socket *sock, struct sockaddr_storage *addr, u16 protocol_version) +{ + unsigned long irqflags; + + ASSERT(dnbd3_flag_taken(dev->connection_lock)); + if (addr->ss_family == 0 || dev->imgname == NULL || sock == NULL) { + dnbd3_dev_err_cur(dev, "connect: host, image name or sock not set\n"); + return -EINVAL; + } + + replace_main_socket(dev, sock, addr, protocol_version); + spin_lock_irqsave(&dev->blk_lock, irqflags); + dev->panic = false; + dev->panic_count = 0; + dev->discover_interval = TIMER_INTERVAL_PROBE_SWITCH; + queue_work(dev->recv_wq, &dev->recv_work); + spin_unlock_irqrestore(&dev->blk_lock, irqflags); + + if (dev->use_server_provided_alts) + dnbd3_send_empty_request(dev, CMD_GET_SERVERS); + + dnbd3_dev_info_cur(dev, "connection switched\n"); + dnbd3_blk_requeue_all_requests(dev); + return 0; +} + +/** + * Disconnect the device, shutting it down. + */ +int dnbd3_net_disconnect(dnbd3_device_t *dev) +{ + ASSERT(dnbd3_flag_taken(dev->connection_lock)); + if (!device_active(dev)) + return -ENOTCONN; + dev_dbg(dnbd3_device_to_dev(dev), "disconnecting device ...\n"); + + dev->reported_size = 0; + /* quickly fail all requests */ + dnbd3_blk_fail_all_requests(dev); + replace_main_socket(dev, NULL, NULL, 0); + + cancel_delayed_work_sync(&dev->keepalive_work); + cancel_delayed_work_sync(&dev->discover_work); + cancel_work_sync(&dev->send_work); + cancel_work_sync(&dev->recv_work); + + dnbd3_blk_fail_all_requests(dev); + dnbd3_release_resources(dev); + dev_dbg(dnbd3_device_to_dev(dev), "all workers shut down\n"); + return 0; +} diff --git a/src/kernel/net.h b/src/kernel/net.h index a06a20c..69fa523 100644 --- a/src/kernel/net.h +++ b/src/kernel/net.h @@ -1,9 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -21,30 +22,12 @@ #ifndef NET_H_ #define NET_H_ -#include "dnbd3.h" +#include "dnbd3_main.h" -#define init_msghdr(h) do { \ - h.msg_name = NULL; \ - h.msg_namelen = 0; \ - h.msg_control = NULL; \ - h.msg_controllen = 0; \ - h.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \ - } while (0) +void dnbd3_net_work_init(dnbd3_device_t *dev); -int dnbd3_net_connect(dnbd3_device_t *lo); +int dnbd3_new_connection(dnbd3_device_t *dev, struct sockaddr_storage *addr, bool init); -int dnbd3_net_disconnect(dnbd3_device_t *lo); - -int dnbd3_net_send(void *data); - -int dnbd3_net_receive(void *data); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) -void dnbd3_net_heartbeat(struct timer_list *arg); -#else -void dnbd3_net_heartbeat(unsigned long arg); -#endif - -int dnbd3_net_discover(void *data); +int dnbd3_net_disconnect(dnbd3_device_t *dev); #endif /* NET_H_ */ diff --git a/src/kernel/serialize.c b/src/kernel/serialize.c new file mode 120000 index 0000000..5a4e4ac --- /dev/null +++ b/src/kernel/serialize.c @@ -0,0 +1 @@ +../shared/serialize.c
\ No newline at end of file diff --git a/src/kernel/serialize_kmod.c b/src/kernel/serialize_kmod.c deleted file mode 100644 index 50746df..0000000 --- a/src/kernel/serialize_kmod.c +++ /dev/null @@ -1,5 +0,0 @@ -#include <linux/kernel.h> -#include <linux/string.h> - -#define KERNEL_MODULE -#include "serialize.c" diff --git a/src/kernel/sysfs.c b/src/kernel/sysfs.c index 4406072..9deba96 100644 --- a/src/kernel/sysfs.c +++ b/src/kernel/sysfs.c @@ -1,9 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -21,156 +22,138 @@ #include <linux/kobject.h> #include "sysfs.h" -#include "utils.h" #ifndef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif +/** + * Print currently connected server IP:PORT + */ ssize_t show_cur_server_addr(char *buf, dnbd3_device_t *dev) { - if (dev->cur_server.host.type == HOST_IP4) - return MIN(snprintf(buf, PAGE_SIZE, "%pI4,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE); - else if (dev->cur_server.host.type == HOST_IP6) - return MIN(snprintf(buf, PAGE_SIZE, "%pI6,%d\n", dev->cur_server.host.addr, (int)ntohs(dev->cur_server.host.port)), PAGE_SIZE); - *buf = '\0'; - return 0; -} - -ssize_t show_cur_server_rtt(char *buf, dnbd3_device_t *dev) -{ - return MIN(snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)dev->cur_rtt), PAGE_SIZE); -} + ssize_t ret; -ssize_t show_alt_server_num(char *buf, dnbd3_device_t *dev) -{ - int i, num = 0; - for (i = 0; i < NUMBER_SERVERS; ++i) - { - if (dev->alt_servers[i].host.type) ++num; - } - return MIN(snprintf(buf, PAGE_SIZE, "%d\n", num), PAGE_SIZE); + spin_lock(&dev->blk_lock); + ret = MIN(snprintf(buf, PAGE_SIZE, "%pISpc\n", &dev->cur_server.host), PAGE_SIZE); + spin_unlock(&dev->blk_lock); + return ret; } +/** + * List alt servers. One line per server, format is: + * IP:PORT RTT consecutive_failures best_count + */ ssize_t show_alt_servers(char *buf, dnbd3_device_t *dev) { - int i, size = PAGE_SIZE, ret; - for (i = 0; i < NUMBER_SERVERS; ++i) - { - if (dev->alt_servers[i].host.type == HOST_IP4) - ret = MIN(snprintf(buf, size, "%pI4,%d,%llu,%d\n", - dev->alt_servers[i].host.addr, - (int)ntohs(dev->alt_servers[i].host.port), - (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4), - (int)dev->alt_servers[i].failures) - , size); - else if (dev->alt_servers[i].host.type == HOST_IP6) - ret = MIN(snprintf(buf, size, "%pI6,%d,%llu,%d\n", - dev->alt_servers[i].host.addr, - (int)ntohs(dev->alt_servers[i].host.port), - (unsigned long long)((dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1] + dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4), - (int)dev->alt_servers[i].failures) - , size); - else + int i, size = PAGE_SIZE; + ssize_t ret; + + if (mutex_lock_interruptible(&dev->alt_servers_lock) != 0) + return 0; + + for (i = 0; i < NUMBER_SERVERS; ++i) { + if (dev->alt_servers[i].host.ss_family == 0) continue; + + ret = MIN(snprintf(buf, size, "%pISpc %llu %d %d\n", &dev->alt_servers[i].host, + (unsigned long long)((dev->alt_servers[i].rtts[0] + + dev->alt_servers[i].rtts[1] + + dev->alt_servers[i].rtts[2] + + dev->alt_servers[i].rtts[3]) / 4), + (int)dev->alt_servers[i].failures, + (int)dev->alt_servers[i].best_count), + size); size -= ret; buf += ret; - if (size <= 0) - { + if (size <= 0) { size = 0; break; } } + mutex_unlock(&dev->alt_servers_lock); return PAGE_SIZE - size; } +/** + * Show name of image in use + */ ssize_t show_image_name(char *buf, dnbd3_device_t *dev) { - if (dev->imgname == NULL) return sprintf(buf, "(null)"); - return MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE); + ssize_t ret; + + spin_lock(&dev->blk_lock); + ret = MIN(snprintf(buf, PAGE_SIZE, "%s\n", dev->imgname), PAGE_SIZE); + spin_unlock(&dev->blk_lock); + return ret; } +/** + * Show rid of image in use + */ ssize_t show_rid(char *buf, dnbd3_device_t *dev) { + // No locking here, primitive type, no pointer to allocated memory return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->rid), PAGE_SIZE); } ssize_t show_update_available(char *buf, dnbd3_device_t *dev) { + // Same story return MIN(snprintf(buf, PAGE_SIZE, "%d\n", dev->update_available), PAGE_SIZE); } -device_attr_t cur_server_addr = -{ - .attr = {.name = "cur_server_addr", .mode = 0444 }, - .show = show_cur_server_addr, - .store = NULL, -}; - -device_attr_t cur_server_rtt = -{ - .attr = {.name = "cur_server_rtt", .mode = 0444 }, - .show = show_cur_server_rtt, - .store = NULL, -}; - -device_attr_t alt_server_num = -{ - .attr = {.name = "alt_server_num", .mode = 0444 }, - .show = show_alt_server_num, - .store = NULL, +device_attr_t cur_server_addr = { + .attr = { .name = "cur_server_addr", .mode = 0444 }, + .show = show_cur_server_addr, + .store = NULL, }; -device_attr_t alt_servers = -{ - .attr = {.name = "alt_servers", .mode = 0444 }, - .show = show_alt_servers, - .store = NULL, +device_attr_t alt_servers = { + .attr = { .name = "alt_servers", .mode = 0444 }, + .show = show_alt_servers, + .store = NULL, }; -device_attr_t image_name = -{ - .attr = {.name = "image_name", .mode = 0444 }, - .show = show_image_name, - .store = NULL, +device_attr_t image_name = { + .attr = { .name = "image_name", .mode = 0444 }, + .show = show_image_name, + .store = NULL, }; -device_attr_t rid = -{ - .attr = {.name = "rid", .mode = 0444 }, - .show = show_rid, - .store = NULL, +device_attr_t rid = { + .attr = { .name = "rid", .mode = 0444 }, + .show = show_rid, + .store = NULL, }; -device_attr_t update_available = -{ - .attr = {.name = "update_available", .mode = 0444 }, - .show = show_update_available, - .store = NULL, +device_attr_t update_available = { + .attr = { .name = "update_available", .mode = 0444 }, + .show = show_update_available, + .store = NULL, }; ssize_t device_show(struct kobject *kobj, struct attribute *attr, char *buf) { device_attr_t *device_attr = container_of(attr, device_attr_t, attr); dnbd3_device_t *dev = container_of(kobj, dnbd3_device_t, kobj); + return device_attr->show(buf, dev); } -struct attribute *device_attrs[] = -{ +struct attribute *device_attrs[] = { &cur_server_addr.attr, - &cur_server_rtt.attr, - &alt_server_num.attr, &alt_servers.attr, - &image_name.attr, - &rid.attr, + &image_name.attr, &rid.attr, &update_available.attr, NULL, }; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) +ATTRIBUTE_GROUPS(device); +#endif -struct sysfs_ops device_ops = -{ +const struct sysfs_ops device_ops = { .show = device_show, }; @@ -179,14 +162,16 @@ void release(struct kobject *kobj) kobj->state_initialized = 0; } -struct kobj_type device_ktype = -{ +struct kobj_type device_ktype = { +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) .default_attrs = device_attrs, +#else + .default_groups = device_groups, +#endif .sysfs_ops = &device_ops, .release = release, }; - void dnbd3_sysfs_init(dnbd3_device_t *dev) { int error; @@ -196,7 +181,7 @@ void dnbd3_sysfs_init(dnbd3_device_t *dev) error = kobject_init_and_add(kobj, ktype, parent, "%s", "net"); if (error) - printk("Error initializing dnbd3 device!\n"); + dev_err(dnbd3_device_to_dev(dev), "initializing sysfs for device failed!\n"); } void dnbd3_sysfs_exit(dnbd3_device_t *dev) diff --git a/src/kernel/sysfs.h b/src/kernel/sysfs.h index 0a747a5..1db4a07 100644 --- a/src/kernel/sysfs.h +++ b/src/kernel/sysfs.h @@ -1,9 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> * - * This file may be licensed under the terms of of the + * This file may be licensed under the terms of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed @@ -21,25 +22,16 @@ #ifndef SYSFS_H_ #define SYSFS_H_ -#include "dnbd3.h" +#include "dnbd3_main.h" void dnbd3_sysfs_init(dnbd3_device_t *dev); void dnbd3_sysfs_exit(dnbd3_device_t *dev); -typedef struct -{ +typedef struct { struct attribute attr; - ssize_t (*show)(char *, dnbd3_device_t *); - ssize_t (*store)(const char *, size_t, dnbd3_device_t *); + ssize_t (*show)(char *buf, dnbd3_device_t *dev); + ssize_t (*store)(const char *buf, size_t len, dnbd3_device_t *dev); } device_attr_t; -typedef struct -{ - struct attribute attr; - ssize_t (*show)(char *, dnbd3_server_t *); - ssize_t (*store)(const char *, size_t, dnbd3_server_t *); -} server_attr_t; - - #endif /* SYSFS_H_ */ diff --git a/src/kernel/utils.c b/src/kernel/utils.c deleted file mode 100644 index 902025f..0000000 --- a/src/kernel/utils.c +++ /dev/null @@ -1,41 +0,0 @@ -/* - * This file is part of the Distributed Network Block Device 3 - * - * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> - * - * This file may be licensed under the terms of of the - * GNU General Public License Version 2 (the ``GPL''). - * - * Software distributed under the License is distributed - * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either - * express or implied. See the GPL for the specific language - * governing rights and limitations. - * - * You should have received a copy of the GPL along with this - * program. If not, go to http://www.gnu.org/licenses/gpl.html - * or write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - */ - -#include <linux/kernel.h> - -#include "utils.h" - -unsigned int inet_addr(char *str) -{ - int a, b, c, d; - char arr[4]; - sscanf(str, "%d.%d.%d.%d", &a, &b, &c, &d); - arr[0] = a; - arr[1] = b; - arr[2] = c; - arr[3] = d; - return *(unsigned int *) arr; -} - -void inet_ntoa(struct in_addr addr, char *str) -{ - unsigned char *ptr = (unsigned char *) &addr; - sprintf(str, "%d.%d.%d.%d", ptr[0] & 0xff, ptr[1] & 0xff, ptr[2] & 0xff, ptr[3] & 0xff); -} diff --git a/src/kernel/utils.h b/src/kernel/utils.h deleted file mode 100644 index e54b3cf..0000000 --- a/src/kernel/utils.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * This file is part of the Distributed Network Block Device 3 - * - * Copyright(c) 2011-2012 Johann Latocha <johann@latocha.de> - * - * This file may be licensed under the terms of of the - * GNU General Public License Version 2 (the ``GPL''). - * - * Software distributed under the License is distributed - * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either - * express or implied. See the GPL for the specific language - * governing rights and limitations. - * - * You should have received a copy of the GPL along with this - * program. If not, go to http://www.gnu.org/licenses/gpl.html - * or write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - * - */ - -#ifndef UTILS_H_ -#define UTILS_H_ - -#include <linux/in.h> - -unsigned int inet_addr(char *str); -void inet_ntoa(struct in_addr addr, char *str); - -#endif /* UTILS_H_ */ |