summaryrefslogblamecommitdiffstats
path: root/modules.d/slx-dmsetup/scripts/dmsetup-slx-device
blob: 195ab3ad73887c1d4a083d1f253f605e50a383b3 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12











                                                                
                                        



                                                          
                                       

                                               

                        
                                              
      


                                                                
                                


                                                               
 

                                                          









                                                                    
                                                                    













                                                                              
                
                                                       



                               

                                                                    
                    
                               
                              
                                                     











                                                                              


                                                                                               


                                                                



                                                                       





                                                                                      


















                                                                                                     
                  



                                                                                            
                                                                   






                                                                                           

                                                                                 
                                                                      
                  




                                                                

                                                                      

                             
                                




                                                                             


                                                 
                                         





                                                          
                                                  
                                                                               

                               
                                                              





                                                                                        








                                                                                          
                                                                                     

                                                
                                                                                                          





                                                                                                   




                                                                              
                                 

                                                                    

                                                         
                                                                  
                                                                                            
 


                                                                            
                      
                                                           
                                                                    
                                                                          
                                                              
         

                                 
                                                   
                                                                             






                                                                               
                                        

                                                   
          

                                                                                                                   

          
                                  
                                                              
                                                                                                   
                                                                    
          

                                                                      




                                                                                         
                      
          
                                                                       

 



                                                                                        
                                 
                
                                                         
                                                                                                 

                      



                                                                         

                                   
                                                  
                         
                                              


              














                                                                 
                            






                                                                                                       
                                                                          
                                                                           


                                                                               



                                                                                   
                                                    
                                               



                                                                                       
                                                                                                      



                                                                                    
                                                                  

                
 
               
                                                                                     
                                                                                    

                                                       
                                                                                    
                                          
                                                                  
                                                                                        
                                                                             



                                                                                        
                                                                                  









                                                                                                                    
                                                                                                         




                                                                                
                                                               
                                                                 
                                                                                       
                                                                         
                                     
                           

                                              
                                                                                      
                                                                
                                                                                                
                                                                                           
                    
                                                                    

                                                                                    

                                                                                                    


                                                                                                       
                  



                                                                           
                                                     
                                                                                              
                                                                                    
          

                                                                                               

                        









                                                                                             
                                                                                                  



                                                                                          

                              
                                                                  
                                                          
                                                                                                                                       
                                                                                                        




                        
                                                
                 
                                                           


                                                                 

                                                               

                        



                                                            













                                                                                              
 















                                                                                     
                             






                                                                                                 
                                                

                                      
                     
 




              





                                                         
                                               
                        








                                                                                                     
                                   
                                                                                            




                                                                                                                      






























                                                                                                                       



                                                                                                                  






















                                                                                                    
                                  








                                                                                                                         






                                                                              

                                                                       
  


































                                                                                                            























































                                                                                               






                                                                                          



                                            
                                            


                                                     
                                         


























                                                                                                                      

                                              




                                                                      










                                                                               
                                                         




                                                                                    
                                                                                         








                                                                              
                                                         
                                                                                    















                                                                                                        
                                                          
                                                                                            
                          
                                                                                                  


                                                                                                  
                                                                    
                  

                                                                              





                                    
                                                         
                                                         
                                         
                                                   

                                                                                                      
                                                             
          




                   
#!/usr/bin/env bash
#
# Script to back given read-only device using the block device
# specified by SLX_WRITABLE_DEVICE_IDENTIFIER in the SLX config.
# If SLX_WRITABLE_DEVICE_PARTITION_TABLE is specified, it will
# further create device mapper devices accordingly.
#
# Example partition config:
#	<type>         <name> <size> <crypt>
#	thin-snapshot   root    10G    1
#	thin-volume     tmp     20G    0
#	linear          data0   5-10G  1
#	linear          data1   1-50%  1
#
# NOTE: Encrypting thin-snapshot will actually encrypt the
# entire pool data device used for the pool.
# TODO: Support external keys
# TODO: Put table in file in config.tgz

# Source the dracut helper library unless the emergency_shell lookup succeeds.
type -p emergency_shell || . /lib/dracut-lib.sh

# for debugging purposes:
# open the log file on a freshly allocated file descriptor, store its number
# in BASH_XTRACEFD and enable xtrace, so the command trace ends up in the log.
exec {BASH_XTRACEFD}> /run/openslx/dmsetup.log
set -x

# read-only device to prepare for CoW (first positional argument, mandatory)
[ -z "$1" ] && emergency_shell "Read-only device was not given!"
declare -g read_only_device="$1"
# size of the read-only device as reported by 'blockdev --getsz'
declare -g read_only_device_sz="$( blockdev --getsz "$1" )"
# Naming convention: the _sz suffix marks sizes expressed in number of
# 512-byte sectors, _size is used for sizes in any other unit.

# File that find_ntfs_partitions fills with usable NTFS partitions
declare -rg ntfs_list="/run/openslx/.thin-ntfs-candidates"

# handle_unit <value> <unit>
# Convert a size specification into a number of 512-byte sectors.
#   <value>  non-negative decimal integer; leading zeros are accepted
#   <unit>   %          percentage of $remaining_device_sz, which is read
#                       from the calling scope (values > 100 are allowed)
#            K, M, G, T binary multiples of bytes, case-insensitive
#            other/none plain bytes
# Prints the resulting sector count on stdout.
handle_unit() {
	# default to bytes
	local -i potency=0
	local -i val=0
	if [[ "$1" =~ ^[0-9]+$ ]]; then
		# Force base 10. Without the prefix, bash arithmetic treats a
		# leading zero as octal: "010" would become 8, "08" is an error.
		val="10#$1"
	else
		val="$1"
	fi
	case "$2" in
		[%]) # These are relative to the writable CoW device
			# Allow > 100% for over-provisioning
			val="$(( remaining_device_sz * val / 100 ))"
			;;
		# ';;&' keeps testing the following patterns, so every size unit
		# ends up in the catch-all branch which does the conversion.
		[Kk])	potency=1 ;;&
		[Mm])	potency=2 ;;&
		[Gg])	potency=3 ;;&
		[Tt])	potency=4 ;;&
		*)
			# => 1024 ** potency for G, M, K, etc results in bytes
			# => bytes / 512 = sectors
			val=$(( val * ( 1024 ** potency) / 512 ))
			;;
	esac
	echo "$val"
}

# parse_config <table>
# Runs parse_config_int over <table> twice: first with 0, then with 1 as
# second argument. $remaining_device_sz starts out as $writable_device_sz
# and is local to this function, so both passes share it via dynamic scope.
# Returns the status of the second pass.
parse_config() {
	local remaining_device_sz="$writable_device_sz"
	local rel_pass
	for rel_pass in 0 1; do
		parse_config_int "$1" "$rel_pass"
	done
}

# global array variables storing the configuration of the partitions
declare -ag linear snapshot thin_snapshot thin_volume

# parse_config_int <table> <rel_only>
# Parse the partition table given as multi-line string <table>. Each line
# has the form "<type> <name> <range> <crypt>".
#   <rel_only> = 0: only handle entries whose range uses absolute sizes
#   <rel_only> = 1: only handle entries whose range contains a percentage
# Every accepted entry is appended as "<name> <crypt> <min> <max>" (sizes in
# 512-byte sectors) to the global array named after its type.
# In the absolute pass, $remaining_device_sz (owned by the caller) is
# reduced by the mean of min and max of every accepted entry.
# Returns 1 if <table> is empty.
parse_config_int() {
	[ -z "$1" ] && return 1
	local -i rel_only="$2"
	# keep all per-line scratch variables out of the global scope
	local line type name range crypt ignore
	local min max min_unit max_unit
	while IFS= read -r line || [ -n "$line" ]; do
		[ -z "$line" ] && continue
		read -r type name range crypt ignore <<< "$line"
		type=${type//-/_} # normalize to the spelling of the array names
		if ! [[ "$type" =~ \
			^(linear|snapshot|thin_snapshot|thin_volume)$ ]]; then
			echo "$0: Ignoring invalid type: $line"
			continue
		fi
		if [[ -z "$name" ]]; then
			echo "$0: Ignoring nameless entry: $line"
			continue
		fi
		min= max= min_unit= max_unit=
		# ranges can be like: 40G, 40-80G, 10G-20%, 1-2T
		# The accepted units match what handle_unit can convert.
		if ! [[ "$range" =~ ^([0-9]+)([GgMmKkTtBb%]?)(-([0-9]+)([GgMmKkTtBb%]?))?$ ]]; then
			echo "$0: Ignoring invalid range: $line"
			continue
		fi
		# A missing max defaults to min; a missing unit on one side of
		# the range is taken from the other side.
		min="${BASH_REMATCH[1]}"
		max="${BASH_REMATCH[4]:-${BASH_REMATCH[1]}}"
		min_unit="${BASH_REMATCH[2]:-${BASH_REMATCH[5]}}"
		max_unit="${BASH_REMATCH[5]:-${BASH_REMATCH[2]}}"
		# first pass we handle absolute values only, second pass relative ones
		if [[ "$min_unit" = "%" || "$max_unit" = "%" ]]; then
			[ "$rel_only" != 1 ] && continue
		else
			[ "$rel_only" = 1 ] && continue
		fi
		if [ -z "$min_unit" ]; then
			echo "$0: WARNING: No unit given in range, assuming BYTES: $line"
		fi
		min="$( handle_unit "$min" "$min_unit" )"
		max="$( handle_unit "$max" "$max_unit" )"
		if (( min > max )); then
			# So, we might end up with something like 30G-100%, but the writable device
			# is only 20GB. In that case we most likely want to continue, and not consider
			# this an error. So let's try to come up with some logic on what is an error
			# and what isn't. Probably anything involving a mix of percentage and
			# non-percentage should not be an error.
			if [[ "$min_unit" = "%" && "$max_unit" != "%" ]] \
					|| [[ "$min_unit" != "%" && "$max_unit" = "%" ]]; then
				# Let's hope for the best
				max="$min"
			else
				echo "$0: Ignoring invalid range (min > max): $line"
				continue
			fi
		fi
		if ! [[ "$crypt"  =~ ^[01]$ ]]; then
			echo "$0: Disabling encryption due to invalid crypt argument: $line"
			crypt=0
		fi
		# finally save it to the global array for this type
		case "$type" in
			linear) linear+=("${name} ${crypt} ${min} ${max}") ;;
			snapshot) snapshot+=("${name} ${crypt} ${min} ${max}") ;;
			thin_snapshot) thin_snapshot+=("${name} ${crypt} ${min} ${max}") ;;
			thin_volume) thin_volume+=("${name} ${crypt} ${min} ${max}") ;;
			*) echo "$0: SOMETHING NOT GOOT CHECK SOURCE CODE" ;;
		esac
		# Decrease for upcoming calculations if we used fixed values here
		if [ "$rel_only" != 1 ]; then
			(( remaining_device_sz -= ( min + max ) / 2 ))
		fi
	done <<< "$1"
}

# dmsetup_create_noudevsync <name> [table]
# Create the device mapper target <name> via 'dmsetup create --noudevsync'
# and then create its device node with 'dmsetup mknodes'.
# The table is taken from the second argument if non-empty; otherwise
# dmsetup is left to read it from this function's stdin.
# If /dev/mapper/<name> is not a block device afterwards, the status is
# forced to 99. On any non-zero status the target is removed again.
# Returns 0 on success, the failing status or 99 otherwise.
dmsetup_create_noudevsync() {
	local status
	# The subshell must stay a stand-alone command: if it were part of an
	# '&&'/'||' list or an 'if' condition, 'set -e' would be ignored in it.
	(
		set -eo pipefail
		if [ -z "$2" ]; then
			dmsetup create "$1" --noudevsync
		else
			printf "%s\n" "$2" | dmsetup create "$1" --noudevsync
		fi
		dmsetup mknodes --noudevsync "$1"
	)
	status=$?
	if ! [ -b "/dev/mapper/$1" ]; then
		status=99
	fi
	if [ "$status" -ne 0 ]; then
		dmsetup remove --noudevsync "$1"
	fi
	return "$status"
}

# encrypt_device <dev_path> <encrypted_name> [<size>]
# Create a dm-crypt (aes-xts-plain64) target named <encrypted_name> on top
# of the block device <dev_path>, using a random throw-away key.
#   <size>  length of the mapping in 512-byte sectors; defaults to what
#           'blockdev --getsz <dev_path>' reports
# Returns 0 on success, 1 if the arguments are invalid, no key could be
# generated or the device mapper target could not be created.
# NOTE(review): the script runs with 'set -x', so the generated key shows up
# in the xtrace log -- confirm this is acceptable.
encrypt_device() {
	# TODO: Send key back to us, demand ransom
	modprobe dm-crypt || echo "$0: dm-crypt loading failed, maybe builtin?"
	[ -b "$1" ] || return 1
	[ -n "$2" ] || return 1
	local size="$3"
	[ -z "$size" ] && size="$( blockdev --getsz "$1" )"
	# The key is passed as 64 hex digits (256 bit); XTS needs a double-length
	# key, so anything shorter is rejected by dm-crypt. Try several ways to
	# get one, as not every tool might be available.
	local key
	key="$( < /dev/urandom xxd -c32 -p -l32 )"
	[ -z "$key" ] && key="$( < /dev/urandom  tr -c -d 'a-f0-9' | dd count=1 bs=64 )"
	[ -z "$key" ] && key="$( < /dev/urandom  head -c32 | xxd -c32 -p )"
	[ -z "$key" ] && key="$( < /dev/urandom  xxd -c32 -p | head -n 1 )"
	if ! [[ "$key" =~ ^[0-9a-f]{64}$ ]]; then
		# Covers an empty key as well as a short read in one of the pipes
		echo "$0: ERROR: Could not generate encryption key"
		return 1
	fi
	if ! dmsetup_create_noudevsync "$2" \
		"0 ${size} crypt aes-xts-plain64 $key 0 $1 0 1 allow_discards"; then
		echo "$0: Failed to encrypt $1."
		return 1
	fi
	return 0
}
# create_snapshot "<name> <persist>"
# Set up a dm-snapshot target called <name> that uses $read_only_device
# (length $read_only_device_sz) as origin and $writable_device as CoW store,
# with a chunk size of 8. <persist> is handed to the snapshot target as its
# persistence flag and defaults to N if omitted.
# The variables name, persist and ignore are not declared local, so they
# remain set in the calling scope afterwards.
# Returns 0 on success, 1 if the target could not be created.
create_snapshot() {
	modprobe dm-snapshot || echo "$0: dm-snapshot loading failed, maybe builtin?"
	read -r name persist ignore <<< "$1"
	local snapshot_table="0 $read_only_device_sz snapshot $read_only_device $writable_device ${persist:-N} 8"
	if dmsetup_create_noudevsync "$name" "$snapshot_table"; then
		return 0
	fi
	echo "$0: Failed to create snapshot on '$writable_device' for '$read_only_device'."
	return 1
}

# This function is called if no ID44 partition could be found or another kind
# of critical error occurs during the CoW layer setup. It will combine the
# read-only device with a DM zero device to increase its virtual size
# by half the RAM size. A sparse file of that size will then be created and
# placed on a dedicated tmpfs, attached to a loop device and used as CoW
# store for a regular dm-snapshot of the (extended) read-only device.
# Takes no arguments; reads and updates the globals read_only_device and
# read_only_device_sz and defines the read-only global writable_device.
# THIS FUNCTION MUST NEVER RETURN
ramdisk_fallback() {
	echo "$0: Falling back to regular dm-snapshot on a RAMdisk."

	# RAM size in kb, note that this is equal to half
	# of the entire RAM when interpreted as 512-bytes sectors.
	local ram_cow_sz="$(awk '/^MemTotal:/ { printf("%d\n", $2 ); exit }' /proc/meminfo)"

	#	try to prepare the zero extension device
	local extended_device="/dev/mapper/${read_only_device##*/}-extended"
	(
		set -e
		# lsmod lists module names with underscores, so look for dm_zero
		lsmod | grep -q '^dm_zero ' || modprobe dm-zero
		dmsetup_create_noudevsync "${extended_device##*/}" \
			"0 $read_only_device_sz linear $read_only_device 0
			$read_only_device_sz $ram_cow_sz zero"
	)
	local ret="$?"
	if [ "$ret" -eq 0 ]; then
		read_only_device="$extended_device"
		read_only_device_sz="$(( read_only_device_sz + ram_cow_sz ))"
	else
		echo "$0: Failed to setup the fake larger '$read_only_device'."
		echo "$0: Continuing with its original size."
	fi

	# prepare dedicated tmpfs mount point
	local cow_tmpfs="/run/openslx/cow"
	if ! mkdir -p "$cow_tmpfs"; then
		cow_tmpfs="${cow_tmpfs}.$$.$RANDOM"
		mkdir -p "$cow_tmpfs"
	fi
	# sectors / 2 = KiB; plus 100KiB on top
	if ! mount -t tmpfs cow-tmpfs -o size="$(( read_only_device_sz / 2 + 100 ))k" "$cow_tmpfs"; then
		echo "$0: Failed to mount tmpfs in '$cow_tmpfs' of size '$(( read_only_device_sz / 2 + 100 ))KiB'."
	fi

	# create sparse file there: copying nothing while seeking to the desired
	# end makes dd extend the file without writing any data
	local file="$(mktemp -u -p "$cow_tmpfs" dnbd_cow.XXX)"
	if ! dd if=/dev/null of="$file" seek="$(( read_only_device_sz ))" bs=512 2> /dev/null; then
		emergency_shell "Failed to allocate CoW file $file."
	fi
	declare -rg writable_device="$(losetup --show --find "$file")"
	# pick a device mapper name that is not in use yet, preferably "root"
	local cow_device_candidate="root"
	while [ -b "/dev/mapper/$cow_device_candidate" ]; do
		cow_device_candidate="root.$RANDOM"
	done
	if [ -z "$writable_device" ] || ! create_snapshot "$cow_device_candidate N"; then
		emergency_shell "CRITICAL: failed to setup RAMdisk fallback."
		exit 1
	fi
	finish_setup "$cow_device_candidate" "0" "$read_only_device_sz"
}

# finish_setup <device> <type> [<size>]
# Final step of the CoW setup: records the resulting device and exits.
#   <device>  device mapper name only, /dev/mapper/ is prepended here
#   <type>    single digit; 0 = created device lies in a RAMdisk,
#             1 = backed by a disk
#   <size>    passed on unchanged to save_partition_info
# Appends SLX_DNBD3_DEVICE_COW to /etc/openslx, registers the device with
# mount point "/" and terminates the script with exit code 0. If <device>
# is missing or not a block device, emergency_shell is invoked and the
# script exits with code 1.
# THIS FUNCTION MUST NEVER RETURN
finish_setup() {
	local cow_path="/dev/mapper/$1"
	if ! { [ -n "$1" ] && [ -b "$cow_path" ]; }; then
		emergency_shell "'$cow_path' not a block device. Failed to setup CoW layer."
		exit 1
	fi
	# An invalid type only triggers the emergency shell; there is no exit here
	[[ "$2" =~ ^[0-9]$ ]] || emergency_shell "'$2' not a valid type, 0 or 1 expected."
	# <size> optional?
	printf '%s\n' \
		"# Generated by '$0'." \
		"SLX_DNBD3_DEVICE_COW=$cow_path" >> /etc/openslx
	save_partition_info "$1" "/" "$2" "$3"
	exit 0
}

# path to save the achieved setup to
# The file is (re)created here with a descriptive header only; the entries
# are appended by save_partition_info.
# NOTE(review): the header below names the options "type" and "size", while
# save_partition_info writes type, physical_size, shared_physical_size and
# virtual_size -- confirm which naming consumers expect.
declare -rg partitions_config="/run/openslx/dmsetup.state"
cat <<-EOF > "$partitions_config"
# Generated by '$0'.
# Format: <device_mapper_dev> <mount_point> <options>
# Options can be:
# * type -> CoW layer type: 0 is RAMdisk, 1 is disk, 2 is network
# * size -> in 512 byte sectors
EOF

# save_partition_info <dm_dev> <mount_point> <type> [<size>]
# Append one line "/dev/mapper/<dm_dev> <mount_point> <options>" to the
# state file $partitions_config.
#   <dm_dev>       device mapper name; /dev/mapper/<dm_dev> must be a
#                  block device
#   <mount_point>  must not be empty
#   <type>         single digit, stored as type=<type>
#   <size>         either "<n>", stored as physical_size=<n>, or
#                  "<physical>-<virtual>", stored as
#                  shared_physical_size=<physical>,virtual_size=<virtual>;
#                  anything else is silently left out
# Returns 1 without writing anything if one of the first three arguments
# is invalid.
save_partition_info() {
	local dm_path="/dev/mapper/$1"
	if ! [ -b "$dm_path" ] || [ -z "$2" ] || ! [[ "$3" =~ ^[0-9]$ ]]; then
		return 1
	fi
	local opts="type=$3"
	if [[ "$4" =~ ^[0-9]+$ ]]; then
		# plain size given
		opts+=",physical_size=$4"
	elif [[ "$4" =~ ^[0-9]+-[0-9]+$ ]]; then
		# <physical_backing_dev_size>-<virtual_size>
		opts+=",shared_physical_size=${4%-*},virtual_size=${4#*-}"
	fi
	echo "$dm_path $2 ${opts}" >> "$partitions_config"
}

# Make sure $scratch_device is exactly $scratch_device_sz sectors large.
# If the device is larger than requested, a dm-linear target named
# "scratch" of the requested size is put on top of it, $scratch_device is
# switched over to that target and it is registered via
# save_partition_info. This is useful for setups where you cannot
# explicitly configure how much space to use from the underlying device,
# and the partition table says not to use the entire $writable_device for
# cow. If the device turns out to be smaller than requested,
# $scratch_device_sz is lowered to the real size and a warning is printed.
# Returns 1 only if the dm-linear target could not be created.
require_exact_scratch_size() {
	local actual_sz
	actual_sz="$( blockdev --getsz "$scratch_device" )"
	if (( actual_sz == scratch_device_sz )); then
		# Everything fine
		return 0
	fi
	if (( actual_sz < scratch_device_sz )); then
		echo "$0: WARNING: scratch_device_sz is larger than actual device."
		echo "$0: This should never happen."
		scratch_device_sz="$actual_sz"
		return 0
	fi
	# We could check if $scratch_device already is a dm target, and just adjust its
	# size, but I think that scenario isn't possible, currently.
	if dmsetup_create_noudevsync "scratch" "0 $scratch_device_sz linear $scratch_device 0"; then
		scratch_device="/dev/mapper/scratch"
		save_partition_info "scratch" "*" "1" "$scratch_device_sz"
		return 0
	fi
	echo "$0: Failed to create scratch space for the CoW layer."
	return 1
}

# create_pool
# Set up the dm thin-pool named after the global $pool_dev on top of
# $scratch_device (length $scratch_device_sz sectors). The pool metadata is
# placed on a loop device backed by a sparse file in /run/openslx, unless the
# persistent metadata branch below is taken.
# Reads: scratch_device, scratch_device_sz, pool_dev, SLX_NTFSFREE, ntfs_list
# Writes (globals): metadata_dev_sz, root_ntfs_extra, possibly
#   scratch_device_sz, and the file /run/openslx/.thin-ntfs-growsize
# Returns 0 on success, 1 if the metadata device, the pool data device or
# the thin-pool target could not be set up.
create_pool() {
	declare -r data_block_sz=256 # Desired Block size (number of 512byte sectors)
	declare -r wanted_low_mb=100 # Free space below this will trigger a dm event
	# create external snapshot for read-only device
	# create remaining thin volumes
	modprobe dm-thin-pool || echo "$0: dm-thin-pool load failed, maybe builtin?"
	# create temporary metadata device
	# calculate number of sectors needed and check boundaries:
	# XXX Formula from thin-pool.txt calculates size in *bytes*, we want 512b blocks
	metadata_dev_sz="$(( 48 * scratch_device_sz / data_block_sz / 512 ))"
	# If we want NTFS as a backup plan to extend the pool, check if the current size
	# is less than 100GB, and only then consider this feature.
	# Maybe make that threshold configurable one day, but for the desktop client
	# use case this is sensible for now.
	# NOTE(review): metadata_persistent is only declared (as an empty local)
	# further down, so this test reads whatever an outer scope provides.
	if [ "$SLX_NTFSFREE" = "backup" ] && (( scratch_device_sz < 209715200 )) \
			&& [ -z "$metadata_persistent" ]; then
		find_ntfs_partitions
		if [ -s "$ntfs_list" ]; then
			# Look what size we end up if we want at least 50GB:
			# add up the candidates (first column) until 104857600 sectors are reached
			local sum="$( awk -v sum=0 \
				'{sum+=$1; if (sum >= 104857600) exit}END{printf "%.0f", sum}' \
				"$ntfs_list" )"
			if (( sum > 0 )); then
				(( sum > 209715200 )) && sum=209715200 # Max 100GB
				# Account for this potential growth in the metadata device size for future expansion
				metadata_dev_sz="$(( metadata_dev_sz + 48 * sum / data_block_sz / 512 ))"
				echo "$sum" > "/run/openslx/.thin-ntfs-growsize"
				root_ntfs_extra="$sum"
			fi
		fi
	fi
	# Min 2MB -> 4096 sectors, max 16GB -> 33554432 sectors
	[ "$metadata_dev_sz" -lt 4096 ] && metadata_dev_sz="4096"
	# TODO handle the exotic case of a too large metadata device to fit within RAM.
	[ "$metadata_dev_sz" -gt 33554432 ] && metadata_dev_sz="33554432"
	local scratch_device_offset=0
	local metadata_dev=
	# NOTE(review): this local is always empty at this point, so the
	# persistent metadata branch right below cannot be entered as written.
	local metadata_persistent=
	if [ -n "$metadata_persistent" ]; then
		# create persistent slice of the writable device for the pool metadata
		if ! dmsetup_create_noudevsync "pool-metadata" \
			"0 $metadata_dev_sz linear $scratch_device $scratch_device_offset"; then
			echo "$0: Failed to create linear device for pool metadata device."
		else
			# Adjust size for pool-data down accordingly
			scratch_device_offset="$metadata_dev_sz"
			scratch_device_sz=$(( scratch_device_sz - metadata_dev_sz ))
			declare -r metadata_dev="/dev/mapper/pool-metadata"
			# TODO configurable wipe: dd if=/dev/zero of="$metadata_dev" count=1 bs=4096
			# TODO: If we fail later on in this function, we would actually have to destroy
			# this target again, and re-adjust the offset and size back, so that the
			# snapshot fallback would work properly. Or maybe just don't support fallback.
		fi
	fi
	if [ -z "$metadata_dev" ]; then
		# create RAMdisk in /run for metadata device
		metadata_dev="$(mktemp -p /run/openslx .pool-metadata.XXX)"
		# Create sparse file of required size
		dd if=/dev/null of="$metadata_dev" bs=512 seek="$metadata_dev_sz" 2> /dev/null
		# Attach the file to a free loop device; metadata_dev now holds
		# whatever losetup printed (empty on failure)
		declare -r metadata_dev="$( losetup --show --find "$metadata_dev" )"
	fi
	if [ -z "$metadata_dev" ]; then
		echo "$0: Could not set up persistent or tmpfs-loop metadata device. Aborting."
		return 1
	fi

	local pool_data_dev
	if (( root_ntfs_extra == 0 )) && (( scratch_device_offset == 0 )); then
		# No offset, no potential expansion, don't create another linear target
		pool_data_dev="$scratch_device"
	else
		pool_data_dev="/dev/mapper/pool-data"
		# Create linear device of the writable device, in case we have an offset from
		# the on-disk meta data. Also this way we can easily extend it later.
		if ! dmsetup_create_noudevsync "${pool_data_dev##*/}" \
			"0 $scratch_device_sz linear $scratch_device $scratch_device_offset"; then
			echo "$0: Failed to create pool data device on '$scratch_device'."
			return 1
		fi
	fi
	local low_water_mark
	# Convert MB to blocks (2048 sectors per MB, data_block_sz sectors per block)
	low_water_mark=$(( wanted_low_mb * 2048 / data_block_sz ))
	if ! dmsetup_create_noudevsync "${pool_dev##*/}" \
		"0 $scratch_device_sz thin-pool $metadata_dev $pool_data_dev $data_block_sz $low_water_mark 1 skip_block_zeroing"; then
		echo "$0: Failed to create thin-pool device (meta: $metadata_dev, data: $pool_data_dev)"
		return 1
	fi
	return 0
}

# create_volume <name> <id> <size> [backing_dev]
# Create thin volume <id> in the pool $pool_dev and map it as device
# mapper target <name> with a length of <size> sectors.
#   [backing_dev]  optional external origin device for the thin target;
#                  left out of the table if empty
# A failing 'create_thin' message is only reported, since the volume might
# exist already; mapping the target is attempted regardless.
# Returns 0 on success, 1 on missing pool, missing arguments or if the
# target could not be created.
create_volume() {
	if ! { [ -n "$pool_dev" ] && [ -b "$pool_dev" ]; }; then
		echo "$0: Global pool device not set or present."
		return 1
	fi
	if (( $# < 3 )) || [ -z "$1" ]; then
		echo "$0: create_volume: not enough arguments."
		return 1
	fi
	local vol_name="$1" thin_id="$2" vol_sz="$3"
	local origin_dev="$4" # Optional, internal if empty

	dmsetup message "$pool_dev" 0 "create_thin $thin_id" || {
		echo "$0: Failed to create thin volume with id '$thin_id' in pool '$pool_dev'."
		echo "$0: It might already exists, trying anyway..."
	}
	if dmsetup_create_noudevsync "$vol_name" "0 $vol_sz thin $pool_dev $thin_id $origin_dev"; then
		return 0
	fi
	echo "$0: Failed to create external snapshot named '$vol_name':"
	echo "  Size:           $vol_sz"
	echo "  Backing device: $origin_dev"
	echo "  Thin volume id: $thin_id"
	return 1
}

# Find NTFS partitions with decently sized ranges of
# free space. We can use these as our writable layer
# for our thin-pool, if configured.
# Does nothing if SLX_NTFSFREE is empty or "never", if $ntfs_list exists
# already, or if the ntfsfree tool is not available.
# Otherwise this will create the file $ntfs_list with
# one line per suitable partition, format
# total_size_blocks devpath
# Results are sorted by size, descending order.
# The global ntfs_extra_space_sz is set to the sum of all listed sizes
# (in 512-byte blocks).
find_ntfs_partitions() {
	[ -z "$SLX_NTFSFREE" ] && return
	[ "$SLX_NTFSFREE" = "never" ] && return
	[ -e "$ntfs_list" ] && return
	if ! command -v ntfsfree &> /dev/null; then
		echo "$0: ntfsfree not found, cannot use NTFS partitions as RW layer"
		return
	fi
	local part sum
	ntfs_extra_space_sz=0
	for part in /dev/disk/by-partuuid/*; do
		# Only count ranges >= 256MB, sum will be in number of 512b blocks
		sum="$( ntfsfree --block-size 512 --min-size "$(( 256 * 1024 * 1024 ))" "$part" \
			| awk -v sum=0 '{if ($1 == "Range") sum += $4}END{printf "%.0f", sum}' )"
		# Only consider volume if sum of these ranges > 1GB (this is BLOCKS, not bytes)
		(( "$sum" > 2 * 1024 * 1024 )) || continue
		echo "$sum $part" # only thing in loop going to stdout
	done | sort -nr > "$ntfs_list"
	# The loop above is part of a pipeline and thus runs in a subshell, so
	# the total has to be computed here for it to reach the global variable.
	local part_sz ignore
	while read -r part_sz ignore; do
		(( ntfs_extra_space_sz += part_sz ))
	done < "$ntfs_list"
	return 0
}
ntfs_extra_space_sz=0

###
##	MAIN
###

# Load the client configuration. The SLX_* settings consulted below are
# expected to come from this file; this script also appends/rewrites
# entries in it further down.
. /etc/openslx

# Helper library, looked up via $PATH.
# NOTE(review): presumably the source of dev_find_partitions and
# dev_swap_version used below -- confirm.
. slx-tools
# "Preload" functions by executing them NOT in a subshell
# (later on they are called inside $( ... ), i.e. in subshells).
# NOTE(review): presumably slx-tools defines these lazily on first use, so
# a first call from a subshell would not persist -- confirm.
dev_find_partitions &> /dev/null
dev_swap_version &> /dev/null

# This is the main variable driving this script:
# writable_device ends up holding the block device used as writable backing
# store, or stays empty if none could be determined.
# id44_crypted is set to 1 later if that whole device got encrypted.
declare -g id44_crypted=
declare -g writable_device=
if [ -z "$SLX_WRITABLE_DEVICE_IDENTIFIER" ]; then
	# Nothing configured: fall back to the built-in pair of identifiers
	# (presumably an MBR type id and a GPT type GUID -- confirm) and persist
	# them in /etc/openslx: the first element under the singular name, the
	# whole space-joined list under the plural name.
	SLX_WRITABLE_DEVICE_IDENTIFIER=("44" "87f86132-ff94-4987-b250-444444444444")
	# TODO make scripts reading this variable compatible with list of IDs
	echo "SLX_WRITABLE_DEVICE_IDENTIFIER='${SLX_WRITABLE_DEVICE_IDENTIFIER[0]}'" >> /etc/openslx
	echo "SLX_WRITABLE_DEVICE_IDENTIFIERS='${SLX_WRITABLE_DEVICE_IDENTIFIER[*]}'" >> /etc/openslx
fi
# XXX SLX_WRITABLE_DEVICE_IDENTIFIER is an array if it was defaulted above,
# but whatever /etc/openslx defined otherwise. The scalar-style tests
# ("$SLX_WRITABLE_DEVICE_IDENTIFIER") only look at element 0. This
# inconsistency is fragile and should be cleaned up.
if [ -n "$SLX_WRITABLE_DEVICE_IDENTIFIER" ]; then
	declare -a writable_devices
	# Intentionally unquoted: the output is word-split into one array
	# element per device path.
	writable_devices=( $( dev_find_partitions "${SLX_WRITABLE_DEVICE_IDENTIFIER[@]}" ) )
	# Look for NTFS free space if nothing else was found (unless NTFS usage
	# is "never"), or unconditionally if it is set to "always".
	if [[ "${#writable_devices[@]}" -eq 0 && "$SLX_NTFSFREE" != "never" ]] || [ "$SLX_NTFSFREE" = "always" ]; then
		find_ntfs_partitions
	fi
	if [ -s "$ntfs_list" ] || [[ "${#writable_devices[@]}" -gt 1 ]]; then
		# More than one device, and/or NTFS space, need linear
		# Build a device-mapper table file: one "linear" row per device/slice.
		tbl="/run/openslx/dmsetup-linear-id44"
		# pos: start sector of the next row == total sectors mapped so far
		pos=0
		# Upper bound for the combined size, in 512 byte sectors
		grow_max_sz=9999999999
		for dev in "${writable_devices[@]}"; do
			# Remaining room below the cap; stop once it is used up
			max="$(( grow_max_sz - pos ))"
			(( max <= 0 )) && break
			sz="$( blockdev --getsz "$dev" )"
			# Skip devices whose size is zero or could not be determined
			(( sz > 0 )) || continue
			(( sz > max )) && sz="$max"
			echo "$pos $sz linear $dev 0"
			(( pos += sz ))
		done > "$tbl"
		if [ -s "$ntfs_list" ]; then
			sum=
			# The "|| [ -n ... ]" part handles a last line without trailing newline
			while read -r sum dev _ || [ -n "$sum" ]; do # each dev
				word=
				# Only lines starting with "Range" are used; field 2 is taken as
				# start offset and field 4 as length of the free range.
				# NOTE(review): presumably both in 512 byte blocks because of
				# --block-size 512 -- confirm against ntfsfree.
				while read -r word range_start_b _ range_sz _ || [ -n "$word" ]; do # each slice of dev
					[ "$word" = "Range" ] || continue
					(( range_sz > 0 )) || continue
					# Clamp the slice to what is left below the cap
					slice_sz="$(( grow_max_sz - pos ))"
					(( slice_sz <= 0 )) && break
					(( slice_sz > range_sz )) && slice_sz="$range_sz"
					# Append line
					if echo "$pos $slice_sz linear $dev $range_start_b" >> "$tbl"; then
						# Update counter
						(( pos += slice_sz ))
					else
						echo "$0: Could not write new table row into $tbl"
					fi
				done < <( ntfsfree --block-size 512 --min-size "$(( 256 * 1024 * 1024 ))" "$dev" )
			done < "$ntfs_list"
			# Don't try to add NTFS space again later
			SLX_NTFSFREE="never"
			# Comment out any existing SLX_NTFSFREE line in the config and add
			# a 'never' entry right after it.
			sed -i "s/^SLX_NTFSFREE.*$/# & # disabled in stage3\nSLX_NTFSFREE='never'/" "/etc/openslx"
			rm -f -- "$ntfs_list"
		fi
		# See if we need a linear target at all
		if ! [ -s "$tbl" ]; then
			# writable_device stays empty, which triggers the RAM fallback below
			echo "$0: Empty tmp/id44 table, fallback to RAM"
		elif [ "$( wc -l < "$tbl" )" -eq 1 ] && [[ "${#writable_devices[@]}" -ge 1 ]]; then
			# Only one line, have writable device -> use directly
			# NOTE(review): this assumes the single row describes
			# writable_devices[0]; if that device was skipped above (size 0),
			# the row may belong to another device or an NTFS slice -- confirm.
			writable_device="${writable_devices[0]}"
		else
			# set up linear device
			if ! dmsetup_create_noudevsync "id44-group" < "$tbl"; then
				echo "$0: Error creating group of id44 devices. Fallback to RAM :-("
			else
				writable_device="/dev/mapper/id44-group"
			fi
		fi
	else
		# Single device
		# (or none at all, in which case this expands to an empty string)
		writable_device="${writable_devices[0]}"
	fi
fi
# No writable device at all -> RAM disk. Otherwise optionally encrypt the
# complete device up front.
if [[ -z "$writable_device" ]]; then
	echo "$0: Could not find writable device with id '$SLX_WRITABLE_DEVICE_IDENTIFIER'."
	ramdisk_fallback
elif is_on "$SLX_ID44_CRYPT"; then
	# Config option crypts the entire ID44 device(s), before any slices are taken from it.
	if ! encrypt_device "$writable_device" "id44-crypt"; then
		# Not fatal: carry on with the plain device
		echo "$0: Error encrypting ID44 partition"
	else
		echo "$0: ID44 encrypted"
		writable_device="/dev/mapper/id44-crypt"
		# Remember the whole device is already encrypted, and ignore the crypt flag for the partition table later
		id44_crypted=1
	fi
fi

# NOTE: from here on out, every value related to size is in 512 bytes sectors!
# (blockdev --getsz reports the device size in 512 byte sectors.)
# Declared read-only, so the value cannot be changed by later stages.
declare -rg writable_device_sz="$( blockdev --getsz "$writable_device" )"

# If SLX_WRITABLE_DEVICE_PARTITION_TABLE is not set, just do
# regular thin-snapshot for the CoW layer, else parse it.
if [ -z "$SLX_WRITABLE_DEVICE_PARTITION_TABLE" ]; then
	SLX_WRITABLE_DEVICE_PARTITION_TABLE="thin-snapshot root 100% 0"
fi

# extra swap?
# Only considered if the word "slx.swap" appears on the kernel command line.
# All thresholds below are 512 byte sectors:
#   80078125 = 41e9 bytes, 70312500 = 36e9 bytes,
#   11718750 =  6e9 bytes,  7812500 =  4e9 bytes
if grep -qFw 'slx.swap' "/proc/cmdline"; then
	# Only if our basic writable_device is large enough, or we have ntfs backup
	do_swap_sz=0
	if (( writable_device_sz > 80078125 )); then
		# more than ~41GB, go ahead: use half of whatever exceeds 36GB
		do_swap_sz="$(( ( writable_device_sz - 70312500 ) / 2 ))"
		# cap to 6GB
		(( do_swap_sz > 11718750 )) && do_swap_sz=11718750
	elif [ "$SLX_NTFSFREE" = "backup" ] \
			&& (( ntfs_extra_space_sz > 70312500 )) && (( writable_device_sz > 11718750 )); then
		# more than 36GB NTFS backup space, more than 6GB ID44, make 4GB swap
		do_swap_sz=7812500
	fi
	# Check how many we have and if they're regular, unencrypted ones.
	# If it's plenty, don't cut out swap from our backing device
	swap_sz=0
	# Intentionally unquoted $( ): one loop iteration per partition path.
	# NOTE(review): "82" and the GUID look like the MBR/GPT type ids for
	# Linux swap -- confirm.
	for part in $( dev_find_partitions "82" "0657fd6d-a4ab-43c4-84e5-0933c84b4f4f" ); do
		# Partitions for which dev_swap_version fails are not counted
		dev_swap_version "$part" &> /dev/null || continue
		this_sz="$( blockdev --getsz "$part" )"
		(( this_sz > 0 )) && (( swap_sz += this_sz ))
	done
	echo "Have existing swap of $swap_sz blocks"
	# Go ahead with swap? Only if existing swap < 4GB. If so, add line to table.
	if (( do_swap_sz > 0 )) && (( swap_sz < 7812500 )); then
		echo "Adding $do_swap_sz blocks of additional swap on backing dev"
		# Two 512 byte sectors make one KiB
		skb="$(( do_swap_sz / 2 ))"
		# Prepend a "linear slx-swap" entry as first line of the partition table
		# NOTE(review): assumes parse_config understands the "K" size suffix
		# -- confirm.
		SLX_WRITABLE_DEVICE_PARTITION_TABLE="$( printf "%s\n%s" "linear slx-swap ${skb}K 0" \
			"$SLX_WRITABLE_DEVICE_PARTITION_TABLE" )"
	fi
fi

# Turn the configured partition table into the per-type lists
# (snapshot, thin_snapshot, ... as filled by parse_config).
parse_config "$SLX_WRITABLE_DEVICE_PARTITION_TABLE"

# Default to thin-snapshot, if none were configured
if (( ${#snapshot[@]} == 0 && ${#thin_snapshot[@]} == 0 )); then
	parse_config "thin-snapshot root 100% 0"
fi

# Sanity checks for weird configurations
# XXX From here on, snapshot and thin_snapshot are only used as plain
# strings holding their (former) first entry.
(( ${#snapshot[@]} > 1 )) \
	&& echo "Multiple snapshots specified, using first one: ${snapshot[0]}"
snapshot="${snapshot[0]}"

(( ${#thin_snapshot[@]} > 1 )) \
	&& echo "Multiple thin-snapshots specified, using first one: ${thin_snapshot[0]}"
thin_snapshot="${thin_snapshot[0]}"

# Only one CoW flavour can be used for the root fs; thin-snapshot wins
if [[ -n "$snapshot" && -n "$thin_snapshot" ]]; then
	echo "$0: Both snapshot and thin-snapshot specified, prefering thin-snapshot."
	snapshot=
fi

###
##	LINEAR SLICES
###

# start allocating spaces to the configured devices
# writable_device_used_sz counts the sectors of $writable_device handed out
# so far; it is used as start offset for the next linear mapping.
declare -g writable_device_used_sz=0

# first, reserve the space for the rootfs cow snapshot (of either type)...
# The entry is split into the fields name, crypt, min and max (anything
# after that is ignored); thin_snapshot takes precedence over snapshot.
# NOTE(review): min/max are compared against sector counts below, so they
# are presumably already converted to sectors by parse_config -- confirm.
read -r name crypt min max ignore <<< "${thin_snapshot:-${snapshot}}"

declare -g scratch_device="/dev/mapper/scratch"
# Integer attribute: assigned values are evaluated arithmetically
declare -gi scratch_device_sz=0
if (( min <= writable_device_sz )); then
	# Minimum fits: take the requested maximum, clamped to the device size
	scratch_device_sz="$max"
	(( scratch_device_sz > writable_device_sz )) && scratch_device_sz="$writable_device_sz"
else
	# minimum snapshot size is bigger than physical device size
	echo "$0: Minimum snapshot size is too big for the scratch partition."
	echo "$0: You probably need to use a more conservative value."
	echo "$0: Using this client maximum scratch space ($writable_device_sz sectors)."
	scratch_device_sz="$writable_device_sz"
fi

# Create a linear target for the scratch device. This might seem superfluous,
# but it works around problems when using NVMe as pool data device directly.
# It maps the first scratch_device_sz sectors of $writable_device
# (writable_device_used_sz is still 0 at this point).
if ! dmsetup_create_noudevsync "${scratch_device##*/}" \
		"0 $scratch_device_sz linear $writable_device $writable_device_used_sz"; then
	echo "$0: Failed to create scratch space for the CoW layer."
	# this should never fail, but if it does, we would likely not be able to use
	# $writable_device for any dmsetup stuff, so just fallback to ramdisk
	# until we have a better idea on what to do :)
	# NOTE(review): the code below assumes ramdisk_fallback does not return
	# -- confirm.
	ramdisk_fallback
fi
save_partition_info "${scratch_device##*/}" "*" "1" "$scratch_device_sz"

# encrypt the scratch device, if configured
# Skipped entirely if the whole writable device is already encrypted.
# Note: the else branch (and its message) is also taken when the crypt flag
# is 0, i.e. when no encryption was requested.
if [ -z "$id44_crypted" ]; then
	if [ "$crypt" -ne 0 ] && encrypt_device \
		"$scratch_device" "${scratch_device##*/}-crypt" "$scratch_device_sz"; then
		scratch_device="/dev/mapper/${scratch_device##*/}-crypt"
	else
		echo "$0: Continuing with unencrypted scratch"
	fi
fi

# The scratch space occupies the start of the writable device; everything
# allocated from now on has to start behind it.
writable_device_used_sz="$scratch_device_sz"

# setup linear slices of the writable device
# Each entry of the linear array is split into name, crypt, min and max.
# Slices are placed one after another, starting behind the scratch space.
for line in "${linear[@]}"; do
	[ -z "$line" ] && continue
	read -r name crypt min max ignore <<< "$line"
	# Whole device already encrypted -> never encrypt individual slices
	[ -n "$id44_crypted" ] && crypt=0
	free_space="$(( writable_device_sz - writable_device_used_sz ))"
	# Stop at the first entry whose minimum does not fit; later entries
	# are not tried either.
	if [ "$min" -gt "$free_space" ]; then
		echo "$0: Not enough space left for linear devices: '$line'"
		break
	fi
	# allocate its max if it fits within the free space, otherwise use the space left.
	to_allocate="$max"
	[ "$to_allocate" -gt "$free_space" ] && to_allocate="$free_space"

	if ! dmsetup_create_noudevsync "$name" "0 $to_allocate linear $writable_device $writable_device_used_sz"; then
		# Nothing was allocated, so the used-space counter stays untouched
		echo "$0: Failed to create linear device: $line"
		continue
	fi
	# TODO sane?
	save_partition_info "$name" "*" "1" "$to_allocate"
	# Failing encryption is only reported; the space stays allocated
	if [ "$crypt" -ne 0 ] && \
		! encrypt_device "/dev/mapper/$name" "${name}-crypt" "$to_allocate"; then
		echo "$0: Failed to encrypt '$name'."
	fi
	writable_device_used_sz=$(( to_allocate + writable_device_used_sz ))
done

###
##	THIN-PROVISIONING
###
# Path of the thin-pool device; read-only from here on.
declare -rg pool_dev="/dev/mapper/pool"
declare -gi root_ntfs_extra=0 # Extra blocks to provision to root fs for later expansion
# Now decide what to do for the writable layer

# A pool is needed as soon as a thin-snapshot or any thin-volume is configured
if [ -n "$thin_snapshot" ] || [ -n "$thin_volume" ]; then
	if ! create_pool ; then
		echo "Failed to create thin pool. Will ignore:"
		echo -e "\tThin snapshot: $(declare -p thin_snapshot)"
		echo -e "\tThin volumes: $(declare -p thin_volume)"
		echo "Trying snapshot fallback..."
		# Hand the thin-snapshot entry over to the regular snapshot code below.
		# NOTE(review): this overwrites snapshot even if thin_snapshot is
		# empty (only thin volumes configured) -- confirm this is intended.
		snapshot="$thin_snapshot"
	else
		# the order in which pool devices are created does not matter
		# so start with thin volumes starting with id 2 and end with
		# the thin-snapshot with id 1 which needs to call finish_setup.
		volume_id=2
		# go over thin-volumes
		for line in "${thin_volume[@]}"; do
			[ -z "$line" ] && continue
			read -r name crypt min max ignore <<< "$line"
			# Whole device already encrypted -> no per-volume encryption
			[ -n "$id44_crypted" ] && crypt=0
			# thin-volume can be created with max size,
			# since they are overprovisioned anyway.
			# (post-increment: the current id is used, then volume_id is bumped)
			if ! create_volume "$name" "$(( volume_id++ ))" "$max"; then
				echo "Failed to create thin volume '$name'."
			fi
			# NOTE(review): partition info and encryption are attempted even if
			# create_volume failed above -- confirm this is intended.
			save_partition_info "$name" "*" "1" "${scratch_device_sz}-${max}"
			if [ "$crypt" -ne 0 ] && ! encrypt_device \
				"/dev/mapper/$name" "$name-crypt" "$max"; then
				echo "Failed to encrypt thin volume '$name'."
			fi
		done

		if [ -n "$thin_snapshot" ]; then
			# create thin-snapshot, use first one
			read -r name crypt min max ignore <<< "$thin_snapshot"
			[ -n "$id44_crypted" ] && crypt=0
			# min/max was used for the pool data device, ignore it here!
			# Calculate how much of the CoW space we reserve for changes in the base
			# system. Usually all the files in the base system should be static, but
			# if someone decided to run apt dist-upgrade, this would change a lot of
			# existing blocks, which is bad.
			# Use MIN( readonly_size / 2, scratch_size / 10 )
			# until we come up with anything better.
			# Given an RO image of 10GB, this gives us:
			# 40GB scratch -> 46GB, so initially 36GB free space
			# 5GB scratch -> 14.5GB, initially 4.5GB free space
			declare -r max_reserved_sz="$(( scratch_device_sz / 10 ))"
			reserved_sz="$(( read_only_device_sz / 2 ))"
			(( reserved_sz > max_reserved_sz )) && reserved_sz="$max_reserved_sz"
			# Advertised size of the root volume: scratch + read-only image - reserve
			thin_snapshot_sz="$(( scratch_device_sz + read_only_device_sz - reserved_sz ))"
			# For later on-demand growing, overprovision by free space we found on
			# clean NTFS volumes. This requires a user-space helper to listen for
			# dm events in stage4, which should in turn add that free space to the pool-data
			if (( root_ntfs_extra > 0 )); then
				thin_snapshot_sz="$(( thin_snapshot_sz + root_ntfs_extra ))"
			fi
			# Thin id 1, with the read-only device as backing device
			if ! create_volume "$name" 1 "$thin_snapshot_sz" "$read_only_device"; then
				echo "Failed to create external snapshot for '$read_only_device'."
				ramdisk_fallback
			fi
			finish_setup "$name" "1" "$thin_snapshot_sz"
		fi
		# NOTE(review): the two lines below are not in an else branch. The
		# message is only accurate if finish_setup/ramdisk_fallback above
		# never return, so that this point is reached only when no
		# thin-snapshot is configured -- confirm.
		echo "$0: Thin volumes defined, but no snapshot. Using tmpfs."
		ramdisk_fallback
	fi
fi

###
##	SNAPSHOT (OLD FUNCTIONALITY)
###
# Regular dm-snapshot: used when a snapshot entry is active (configured, or
# handed over after the thin pool failed) and require_exact_scratch_size
# succeeds.
if [[ -n "$snapshot" ]] && require_exact_scratch_size; then
	read -r name crypt min max ignore <<< "$snapshot"
	if [[ -n "$id44_crypted" ]]; then
		# Whole device is already encrypted
		crypt=0
	fi
	if create_snapshot "$name $persist"; then
		finish_setup "$name" "1" "$scratch_device_sz"
	else
		echo "Failed to create regular snapshot for '$read_only_device' on '$scratch_device'."
	fi
fi

# ultimate fallback
# Reached if none of the setups above completed
ramdisk_fallback
exit 1