summaryrefslogblamecommitdiffstats
path: root/builder/modules.d/slx-dmsetup/scripts/dmsetup-slx-device
blob: 1756865e354e5993cd5a1446f2e717529220d9c0 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12











                                                                
                                        





                                                          


                        



























































                                                                                      



                                                                                            
                                                                   
                                                                      


















                                                                
                                                                               












                                                                                          
                                                                                     

























                                                                                                            
                                                        

 



                                                                                        
                

                                                                                                 

                      



                                                                         

                                    
                                                  
                         
                                              


              






















                                                                                                       








                                                                        
                                                                                                        








                                                                                            






                                                            
                                                






















                                                                                      
                                                                     
















                                                                                                       

                                                        

                                                                                         
                                                                                              

                                
                                                                          
 

                                           
                                                                                    
                                                




















                                                                                                                        

                                                          



















                                                                                         
                                                                                    
                                          
                           















                                                                                       


                                                                                  
                                                               
                                                                         














































                                                                                                                            
                                                                       









                                                                          
                                                                                    









                                                                            











                                                                                          


                                                                                          
                                                              






                                    
                                                         



                                                                                                       
                                                        




                   
#!/usr/bin/env bash
#
# Script to back given read-only device using the block device
# specified by SLX_WRITABLE_DEVICE_IDENTIFIER in the SLX config.
# If SLX_WRITABLE_DEVICE_PARTITION_TABLE is specified, it will
# further create device mapper devices accordingly.
#
# Example partition config:
#	<type>         <name> <size> <crypt>
#	thin-snapshot   root    10G    1
#	thin-volume     tmp     20G    0
#	linear          data0   5-10G  1
#	linear          data1   1-50%  1
#
# NOTE: Encrypting thin-snapshot will actually encrypt the
# entire pool data device used for the pool.
# TODO: Support external keys

# Source dracut's helper library if `type` cannot find emergency_shell.
type -p emergency_shell || . /lib/dracut-lib.sh

# for debugging purposes: trace every command and redirect all further
# stdout/stderr of this script to the log file below
set -x
exec &> /run/openslx/dmsetup.log

# read-only device to prepare for CoW
# NOTE(review): nothing stops the script here if emergency_shell returns;
# the following lines would then run with an empty $1 - confirm intended.
[ -z "$1" ] && emergency_shell "Read-only device was not given!"
declare -rg read_only_device="$1"
# size as reported by `blockdev --getsz` (512 byte sectors)
declare -rg read_only_device_size="$(blockdev --getsz $1)"

# Global arrays storing the parsed partition configuration, one array per
# device type. Every element has the form "<name> <crypt> <min> <max>",
# where <min> and <max> are sizes in 512 byte sectors.
declare -ag linear snapshot thin_snapshot thin_volume

# parse_config <config>
# Parses a (possibly multi-line) partition table, one entry per line:
#	<type> <name> <size> <crypt>
# and appends each valid entry to the global array matching its type.
# <size> is either a single value or a <min>-<max> range followed by a unit:
# B, K, M, G (powers of 1024 bytes) or % (of the global writable_device_size).
# Invalid lines are reported on stdout and skipped; an invalid <crypt> value
# is replaced by 0. Returns 1 if <config> is empty.
parse_config() {
	[ -z "$1" ] && return 1
	local line type name range crypt ignore unit sizes min max
	local -i potency
	while IFS= read -r line; do
		[ -z "$line" ] && continue
		read -r type name range crypt ignore <<< "$line"
		type=${type//-/_} # to use the type as array name below
		if ! [[ "$type" =~ ^(linear|snapshot|thin_snapshot|thin_volume)$ ]]; then
			echo "$0: Ignoring invalid type: $line"
			continue
		fi
		if [[ -z "$name" ]]; then
			echo "$0: Ignoring nameless entry: $line"
			continue
		fi
		# ranges can be like: 40G, 40-80G, 10-20%
		# (exactly one optional "<min>-" prefix is accepted)
		if ! [[ "$range" =~ ^([0-9]+-)?[0-9]+[GgMmKkBb%]$ ]]; then
			echo "$0: Ignoring invalid range: $line"
			continue
		fi
		# process ranges: convert percentages (of the writable device!)
		# to actual sizes (in sectors!) before saving them
		unit=${range: -1}
		sizes=${range%?}
		# 10# forces base 10, so that leading zeros are not read as octal
		min=$(( 10#${sizes%-*} ))
		max=$(( 10#${sizes#*-} ))
		if (( min > max )); then
			echo "$0: Ignoring invalid range: $line"
			continue
		fi
		# default for bytes
		potency=0
		case "$unit" in
			%)
				if (( max > 100 )); then
					echo "Ignoring invalid percentages: $min/$max"
					continue
				fi
				min=$(( writable_device_size * min / 100 ))
				max=$(( writable_device_size * max / 100 ))
				;;
			*)
				case "$unit" in
					[Kk]) potency=1 ;;
					[Mm]) potency=2 ;;
					[Gg]) potency=3 ;;
				esac
				# => 1024 ** potency for G, M, K, etc results in bytes
				# => bytes / 512 = sectors
				min=$(( min * (1024 ** potency) / 512 ))
				max=$(( max * (1024 ** potency) / 512 ))
				;;
		esac
		if ! [[ "$crypt" =~ ^[01]$ ]]; then
			echo "$0: Disabling encryption due to invalid crypt argument: $line"
			crypt=0
		fi
		# Finally save it to the global array for this type. Only the
		# validated $type is expanded before eval runs; the values are
		# expanded by eval itself inside double quotes, so whatever the
		# configuration contains is stored literally and never executed.
		eval "$type"'+=("$name $crypt $min $max")'
	done <<< "$1"
}

# Helper to call 'dmsetup create' without syncing with udev
# and then actively create the device nodes with the mknodes command.
# dmsetup_create_noudevsync <name> <table>
# Returns 0 if both steps succeeded. Otherwise the device <name> is removed
# again and the exit status of the failing dmsetup call is returned.
dmsetup_create_noudevsync() {
	local ret=0
	# Chain the steps explicitly instead of relying on 'set -o errexit':
	# this function is always called as 'if ! dmsetup_create_noudevsync ...',
	# a context in which bash ignores errexit (even within a subshell), so a
	# failed 'create' would otherwise be hidden by a succeeding 'mknodes'.
	{
		dmsetup create "$1" --noudevsync --table "$2" &&
			dmsetup mknodes --noudevsync "$1"
	} || ret=$?
	[ $ret -ne 0 ] && dmsetup remove --noudevsync "$1"
	return $ret
}

# encrypt_device <dev_path> <encrypted_name> [<size>]
# Creates the dm-crypt device <encrypted_name> (aes-xts-plain64, discards
# allowed) on top of the block device <dev_path>, using a freshly generated
# key of 32 bytes read from /dev/random.
# <size> is the length of the mapping in 512 byte sectors; if omitted, the
# size reported by 'blockdev --getsz' for <dev_path> is used.
# Returns 0 on success, 1 otherwise.
encrypt_device() {
	modprobe dm-crypt || echo "$0: dm-crypt loading failed, maybe builtin?"
	[ -b "$1" ] || return 1
	[ -n "$2" ] || return 1
	local size="$3"
	[ -z "$size" ] && size="$(blockdev --getsz "$1")"
	# declared separately so a failing pipeline is not masked by 'local'
	local key
	key="$(head -c32 /dev/random | xxd -p | tr -d '\n')"
	# 32 bytes => 64 hex digits; never set up a mapping with a short key
	if [ "${#key}" -ne 64 ]; then
		echo "$0: Failed to encrypt $1."
		return 1
	fi
	if ! dmsetup_create_noudevsync "$2" \
		"0 $size crypt aes-xts-plain64 $key 0 $1 0 1 allow_discards"; then
		echo "$0: Failed to encrypt $1."
		return 1
	fi
	return 0
}
# create_snapshot "<name> <persist>"
# Sets up a dm-snapshot device <name> of the global read_only_device, using
# the global writable_device as CoW store with a chunk size of 8 sectors.
# <persist> is passed through as the persistence flag and defaults to N.
# Note that name, persist and ignore are set as global variables.
# Returns 0 on success, 1 otherwise.
create_snapshot() {
	if ! modprobe dm-snapshot; then
		echo "$0: dm-snapshot loading failed, maybe builtin?"
	fi
	read -r name persist ignore <<< "$1"
	local table="0 $read_only_device_size snapshot $read_only_device $writable_device ${persist:-N} 8"
	dmsetup_create_noudevsync "$name" "$table" && return 0
	echo "$0: Failed to create snapshot on '$writable_device' for '$read_only_device'."
	return 1
}

# Call this to fall back to a RAMdisk stored under /run/openslx.
# A sparse file of SLX_RAMDISK_SIZE_IN_MB megabytes (default: half of
# MemTotal) is attached to a loop device, which then becomes the global
# writable_device backing a regular dm-snapshot.
# If successful, this terminates the whole script by calling finish_setup,
# otherwise it drops to the emergency shell and exits with status 1.
ramdisk_fallback() {
	echo "$0: Falling back to regular dm-snapshot on a RAMdisk."
	# let mktemp create the file itself, dd only extends it below
	local file="$(mktemp -p /run/openslx dnbd_cow.XXX)"
	local size="$SLX_RAMDISK_SIZE_IN_MB"
	[ -z "$size" ] && size="$(awk '/MemTotal/ {printf("%d\n", $2 / 2 / 1024 )}' /proc/meminfo)"
	# count=0 copies nothing, seek only sets the (sparse) file size
	dd of="$file" seek="$size" bs=1M count=0 &> /dev/null
	writable_device="$(losetup --show --find "$file")"
	# pick a device name which is not in use yet
	cow_device_candidate="root"
	while [ -b "/dev/mapper/$cow_device_candidate" ]; do
		cow_device_candidate="root.$RANDOM"
	done
	if [ -z "$writable_device" ] || ! create_snapshot "$cow_device_candidate N"; then
		emergency_shell "CRITICAL: failed to setup RAMdisk fallback."
		exit 1
	fi
	# finish_setup expects the size in 512 byte sectors: 1 MB = 2048 sectors
	finish_setup "$cow_device_candidate" "0" "$(( size * 2048 ))"
}

# finish_setup <device> <type> [<size>]
# Final step of every successful code path: announces the CoW device in
# /etc/openslx, records it via save_partition_info with mount point "/" and
# terminates the script with exit status 0.
# <device> is the device name only, /dev/mapper will be prepended automatically.
# <type> denotes if the created device lies in a RAMdisk (0) or is backed by a disk (1).
# <size> is given in sectors.
# If <device> is missing or not a block device, the emergency shell is
# invoked and the script exits with status 1.
finish_setup() {
	local cow_dev="/dev/mapper/$1"
	if ! [[ -n "$1" && -b "$cow_dev" ]]; then
		emergency_shell "'/dev/mapper/$1' not a block device. Failed to setup CoW layer."
		exit 1
	fi
	# any single digit is accepted here; an invalid type is only reported
	[[ "$2" =~ ^[0-9]$ ]] \
		|| emergency_shell "'$2' not a valid type, 0 or 1 expected."
	# <size> optional?
	printf '%s\n' \
		"# Generated by '$0'." \
		"SLX_DNBD3_DEVICE_COW=$cow_dev" >> /etc/openslx
	save_partition_info "$1" "/" "$2" "$3"
	exit 0
}

# path to save the achieved setup to
declare -rg partitions_config="/run/openslx/dmsetup.state"
# (Re)create the state file containing only a header describing its format;
# the actual entries are appended by save_partition_info.
# NOTE(review): the header mentions a "size" option, whereas
# save_partition_info writes the keys physical_size, shared_physical_size
# and virtual_size - confirm which naming consumers expect.
cat <<-EOF > "$partitions_config"
# Generated by '$0'.
# Format: <device_mapper_dev> <mount_point> <options>
# Options can be:
# * type -> CoW layer type: 0 is RAMdisk, 1 is disk, 2 is network
# * size -> in 512 byte sectors
EOF

# save_partition_info <dm_dev> <mount_point> <type> [<size>]
# Appends one line "/dev/mapper/<dm_dev> <mount_point> <options>" to the
# state file given by the global partitions_config.
# <options> always contains type=<type>. Depending on the form of <size>:
#   <n>      adds physical_size=<n>
#   <n>-<m>  adds shared_physical_size=<n>,virtual_size=<m>
# Returns 1 without writing anything if <dm_dev> is not a block device
# below /dev/mapper, <mount_point> is empty or <type> is not a single digit.
save_partition_info() {
	local dm_path="/dev/mapper/$1"
	if [[ ! -b "$dm_path" || -z "$2" ]]; then
		return 1
	fi
	if ! [[ "$3" =~ ^[0-9]$ ]]; then
		return 1
	fi
	local opts="type=$3"
	if [[ "$4" =~ ^[0-9]+$ ]]; then
		# plain size given
		opts+=",physical_size=$4"
	elif [[ "$4" =~ ^([0-9]+)-([0-9]+)$ ]]; then
		# <physical_backing_dev_size>-<virtual_size>
		opts+=",shared_physical_size=${BASH_REMATCH[1]},virtual_size=${BASH_REMATCH[2]}"
	fi
	printf '%s\n' "$dm_path $2 $opts" >> "$partitions_config"
}

###
##	MAIN
###

# Load the SLX configuration; the SLX_* variables used below are expected
# to be defined there.
. /etc/openslx
# This is the main variable driving this script
declare -g writable_device=
if [ -n "$SLX_WRITABLE_DEVICE_IDENTIFIER" ]; then
	# only first one for now TODO create linear devices of all ID44s
	# NOTE(review): presumably dev_find_partitions prints one matching
	# device path per line - verify against slx-tools.
	writable_device="$(slx-tools dev_find_partitions "$SLX_WRITABLE_DEVICE_IDENTIFIER" | head -n 1)"
fi
if [ -z "$writable_device" ]; then
	echo "$0: Could not find writable device with id '$SLX_WRITABLE_DEVICE_IDENTIFIER'."
	# does not return: ramdisk_fallback ends the script in either case
	ramdisk_fallback
fi

# NOTE: from here on out, every value related to size is in 512 bytes sectors!
declare -g writable_device_size="$(blockdev --getsz $writable_device)"

# If SLX_WRITABLE_DEVICE_PARTITION_TABLE is not set, just do
# regular thin-snapshot for the CoW layer, else parse it.
if [ -n "$SLX_WRITABLE_DEVICE_PARTITION_TABLE" ]; then
	parse_config "$SLX_WRITABLE_DEVICE_PARTITION_TABLE"
fi
# Default to thin-snapshot, if none were configured
# ("$snapshot" without index expands to the first array element)
if [ -z "$snapshot" ] && [ -z "$thin_snapshot" ]; then
	parse_config "thin-snapshot root 100% 0"
fi

# Sanity checks for weird configurations
# NOTE: assigning to the bare array name only sets element 0, so the
# additional elements stay in the array; they are simply never read, since
# the code below only uses "$snapshot" / "$thin_snapshot".
if [ "${#snapshot[@]}" -gt 1 ]; then
	echo "Multiple snapshots specified, using first one: ${snapshot[0]}"
	snapshot="${snapshot[0]}"
fi
if [ "${#thin_snapshot[@]}" -gt 1 ]; then
	echo "Multiple thin-snapshots specified, using first one: ${thin_snapshot[0]}"
	thin_snapshot="${thin_snapshot[0]}"
fi
# only one CoW layer for the read-only device can be set up
if [ -n "$snapshot" ] && [ -n "$thin_snapshot" ]; then
	echo "$0: Both snapshot and thin-snapshot specified, prefering thin-snapshot."
	snapshot=
fi

###
##	LINEAR SLICES
###

# start allocating spaces to the configured devices
# (offset in sectors of the next free sector on the writable device)
declare -g writable_device_allocated=0
# reserve the space for the snapshot (of either type)...
read -r name crypt min max ignore <<< "${thin_snapshot:-${snapshot}}"

# Size of the scratch space: start with the configured maximum and shrink it
# in 1G steps until it fits on the writable device, but never below the
# configured minimum.
declare -g scratch_device_size=0
if (( $min <= $writable_device_size )); then
	scratch_device_size=$max
	while (( $scratch_device_size >= 0 )) && (( $scratch_device_size > $writable_device_size )); do
		(( scratch_device_size -= 2097152 )) # 1G steps => 2097152 sectors
	done
	(( $scratch_device_size < $min )) && scratch_device_size="$min"
else
	# minimum snapshot size is bigger than physical device size
	echo "$0: Minimum snapshot size is too big for the scratch partition."
	echo "$0: You probably need to use a more conservative value."
	echo "$0: Using this client maximum scratch space ($writable_device_size sectors)."
	scratch_device_size="$writable_device_size"
fi

# ... and slice it from the start of the writable device (for performance).
declare -g scratch_device="/dev/mapper/scratch"
if ! dmsetup_create_noudevsync "${scratch_device##*/}" \
	"0 $scratch_device_size linear $writable_device $writable_device_allocated"; then
		echo "$0: Failed to create scratch space for the CoW layer."
		# TODO do not bail directly, but try to create the linear devices at least?
		ramdisk_fallback
fi
save_partition_info "${scratch_device##*/}" "*" "1" "$scratch_device_size"

# encrypt the scratch device, if configured
# (if encryption fails, the unencrypted scratch device keeps being used)
if [ "$crypt" -ne 0 ] && encrypt_device \
	"$scratch_device" "${scratch_device##*/}-crypt" "$scratch_device_size"; then
	scratch_device="${scratch_device}-crypt"
fi

writable_device_allocated="$scratch_device_size"

# first setup linear slices of the writable device
for i in ${!linear[@]}; do
	[ -z "${linear[$i]}" ] && continue
	read -r name crypt min max ignore <<< "${linear[$i]}"
	free_space=$(( $writable_device_size - $writable_device_allocated ))
	# stop at the first slice whose minimum does not fit anymore; the
	# remaining configured slices are not created either
	if [ "$min" -gt "$free_space" ]; then
		echo "$0: Not enough space left for linear devices: ${linear[$i]}"
		break
	fi
	# allocate its max if it fits within the free space, otherwise use the space left.
	to_allocate="$max"
	[ "$to_allocate" -gt "$free_space" ] && to_allocate="$free_space"

	if ! dmsetup_create_noudevsync "$name" "0 $to_allocate linear $writable_device $writable_device_allocated"; then
		echo "$0: Failed to create linear device: ${linear[$i]}"
		continue
	fi
	# TODO sane?
	# (the recorded device is the plain slice, even if it gets encrypted below)
	save_partition_info "$name" "*" "1" "$to_allocate"
	if [ "$crypt" -ne 0 ] && \
		! encrypt_device "/dev/mapper/$name" "${name}-crypt" "$to_allocate"; then
		echo "$0: Failed to encrypt '$name'."
	fi
	writable_device_allocated=$(( $to_allocate + $writable_device_allocated ))
done

# we are done with the physical device, use the scratch space from now on
writable_device="$scratch_device"
writable_device_size="$scratch_device_size"

### 
##	THIN-PROVISIONING
###
declare -rg pool_metadata_dev="/dev/mapper/pool-metadata"
declare -rg pool_data_dev="/dev/mapper/pool-data"
declare -rg pool_dev="/dev/mapper/pool"

# create_pool
# Sets up the thin-pool $pool_dev: the pool data device is a linear mapping
# of the whole global writable_device (writable_device_size sectors), the
# pool metadata device is a linear mapping of a loop device backed by a
# sparse file in /run/openslx.
# Returns 0 on success, 1 as soon as one of the devices cannot be created.
create_pool() {
	local data_block_size metadata_dev_size metadata_dev low_water_mark
	modprobe dm-thin-pool || echo "$0: dm-thin-pool load failed, maybe builtin?"
	# Smallest allocation unit of the pool in sectors. The thin-pool target
	# only accepts multiples of 128 sectors (64KB) here.
	data_block_size=256
	# calculate number of sectors needed and check boundaries:
	metadata_dev_size="$(( 48 * writable_device_size / data_block_size / 512 ))"
	# Min 2MB -> 4096 sectors, max 16GB -> 33554432 sectors
	[ "$metadata_dev_size" -lt 4096 ] && metadata_dev_size="4096"
	# TODO handle the exotic case of a too large metadata device to fit within RAM.
	[ "$metadata_dev_size" -gt 33554432 ] && metadata_dev_size="33554432"
	# TODO handle persistent metadata device on disk
	# create RAMdisk in /run for metadata device
	metadata_dev="$(mktemp -p /run/openslx .pool-metadata.XXX)"
	# count=0: only set the size of the sparse file, do not copy from stdin
	dd of="$metadata_dev" bs=512 seek="$metadata_dev_size" count=0 &> /dev/null
	metadata_dev="$(losetup --show --find "$metadata_dev")"
	if [ -z "$metadata_dev" ]; then
		echo "$0: Failed to setup loop device for the pool metadata."
		return 1
	fi
	if ! dmsetup_create_noudevsync "${pool_metadata_dev##*/}" \
		"0 $metadata_dev_size linear $metadata_dev 0"; then
		echo "$0: Failed to create pool metadata device on '$writable_device'."
		return 1
	fi
	# For persistent metadata device we will need to cut that space off first:
	# writable_device_size=$(( $writable_device_size - $metadata_dev_size ))

	if ! dmsetup_create_noudevsync "${pool_data_dev##*/}" \
		"0 $writable_device_size linear $writable_device 0"; then
		echo "$0: Failed to create pool data device on '$writable_device'."
		return 1
	fi
	low_water_mark=32
	if ! dmsetup_create_noudevsync "${pool_dev##*/}" \
		"0 $writable_device_size thin-pool $pool_metadata_dev $pool_data_dev $data_block_size $low_water_mark"; then
		echo "$0: Failed to create thin-pool device on '$writable_device'."
		return 1
	fi
	return 0
}

# create_volume "<name> <id> <size> <backing_dev>"
# Creates the thin volume <id> in the global pool $pool_dev and maps it as
# device <name> with <size> sectors. If <backing_dev> is given, it is passed
# to the thin target as external origin.
# All values have to be passed as ONE space separated argument.
# Returns 0 on success, 1 otherwise.
create_volume() {
	if [[ -z "$pool_dev" || ! -b "$pool_dev" ]]; then
		echo "$0: Global pool device not set or present."
		return 1
	fi
	if (( $# != 1 )) || [[ -z "$1" ]]; then
		echo "$0: create_volume requires one non-empty argument."
		return 1
	fi
	local name id size backing_dev ignore
	read -r name id size backing_dev ignore <<< "$1"

	# a failure here is not fatal, the mapping below is attempted anyway
	dmsetup message "$pool_dev" 0 "create_thin $id" || {
		echo "$0: Failed to create thin volume with id '$id' in pool '$pool_dev'."
		echo "$0: It might already exists, trying anyway..."
	}
	local table="0 $size thin $pool_dev $id $backing_dev"
	if dmsetup_create_noudevsync "$name" "$table"; then
		return 0
	fi
	echo "$0: Failed to create external snapshot named '$name':"
	echo "  Size:           $size"
	echo "  Backing device: $backing_dev"
	echo "  Thin volume id: $id"
	return 1
}
# Set up the thin pool and its volumes if any thin device was configured.
if [ -n "$thin_snapshot" ] || [ -n "$thin_volume" ]; then
	if ! create_pool ; then
		echo "Failed to create thin pool. Will ignore:"
		echo -e "\tThin snapshot: $(declare -p thin_snapshot)"
		echo -e "\tThin volumes: $(declare -p thin_volume)"
		# does not return: ramdisk_fallback ends the script in either case
		ramdisk_fallback
	fi
	# the order in which pool devices are created does not matter
	# so start with thin volumes starting with id 2 and end with
	# the thin-snapshot with id 1 which needs to call finish_setup.
	volume_id=2
	# go over thin-volumes
	for i in "${!thin_volume[@]}"; do
		[ -z "${thin_volume[$i]}" ] && continue
		read -r name crypt min max ignore <<< "${thin_volume[$i]}"
		# thin-volume can be safely created with max size,
		# since they are overprovisioned anyway.
		if ! create_volume "$name $(( volume_id++ )) $max"; then
			echo "Failed to create thin volume '$name'."
			# Neither register nor encrypt a volume which was not created:
			# a device of that name, if present, is not ours.
			continue
		fi
		save_partition_info "$name" "*" "1" "${writable_device_size}-${max}"
		if [ "$crypt" -ne 0 ] && ! encrypt_device \
			"/dev/mapper/$name" "$name-crypt" "$max"; then
			echo "Failed to encrypt thin volume '$name'."
		fi
	done

	if [ -n "$thin_snapshot" ]; then
		# create thin-snapshot, use first one
		read -r name crypt min max ignore <<< "$thin_snapshot"
		# min/max was used for the pool data device, ignore it here!
		# NOTE: the filesystem will most likely malfunction if the size of the
		# thin-snapshot is smaller than what it was upon creation.
		# As such, the size of the thin-snapshot can only be $writable_device_size
		# if it is larger than $read_only_device_size, otherwise we should only
		# use $read_only_device_size. While live-shrinking the filesystem might be
		# an option, it is not supported throughout all filesystems (xfs can't).
		if (( writable_device_size >= read_only_device_size )); then
			thin_snapshot_size="$writable_device_size"
		else
			thin_snapshot_size="$read_only_device_size"
		fi
		if ! create_volume "$name 1 $thin_snapshot_size $read_only_device"; then
			echo "Failed to create external snapshot for '$read_only_device'."
			ramdisk_fallback
		fi
		# terminates the script with exit status 0
		finish_setup "$name" "1" "$thin_snapshot_size"
	fi
fi

###
##	SNAPSHOT (OLD FUNCTIONALITY)
###
# Only reached if no thin-snapshot was configured, as finish_setup would
# have terminated the script otherwise.
if [ -n "$snapshot" ]; then
	read -r name crypt min max ignore <<< "$snapshot"
	# NOTE(review): $persist is not set by the read above; unless it was left
	# behind by an earlier create_snapshot call it is empty here, in which
	# case create_snapshot falls back to 'N'.
	if ! create_snapshot "$name $persist"; then
		echo "Failed to create regular snapshot for '$read_only_device' on '$writable_device'."
		ramdisk_fallback
	fi
	# terminates the script with exit status 0
	finish_setup "$name" "1" "$writable_device_size"
fi

# ultimate fallback
# (reached when neither a thin-snapshot nor a snapshot was set up above)
ramdisk_fallback
exit 1