summaryrefslogtreecommitdiffstats
path: root/core/modules/hardware-stats
diff options
context:
space:
mode:
authorJonathan Bauer2019-10-31 17:00:23 +0100
committerJonathan Bauer2019-10-31 17:00:23 +0100
commitba6753cfabc149cc312f33a3c507b1cc0dbcee3b (patch)
treebf12c077ce4c1722a20319fc82b3eb2143b65a2c /core/modules/hardware-stats
parent[vmware-common] Fix launching usbarbitrator with 15.5.x (diff)
downloadmltk-ba6753cfabc149cc312f33a3c507b1cc0dbcee3b.tar.gz
mltk-ba6753cfabc149cc312f33a3c507b1cc0dbcee3b.tar.xz
mltk-ba6753cfabc149cc312f33a3c507b1cc0dbcee3b.zip
[*] introduce system-check
* hardware stats now only reports hardware info data in /run/hwinfo and /run/hwreport * system-check hooks will generate lightdm warnings and curl the report to the satellite * run-virt now handles the ID44 warnings to cope for network shares on /tmp/virt WARNING PROFI111!
Diffstat (limited to 'core/modules/hardware-stats')
-rw-r--r--core/modules/hardware-stats/data/etc/systemd/system/hardware-stats.service3
-rwxr-xr-xcore/modules/hardware-stats/data/opt/openslx/scripts/systemd-hardware_stats201
-rwxr-xr-xcore/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-report82
-rwxr-xr-xcore/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-warnings14
-rw-r--r--core/modules/hardware-stats/data/opt/openslx/system-check/lang/de/slx-gfx-nouveau1
-rw-r--r--core/modules/hardware-stats/data/opt/openslx/system-check/tags/slx-gfx-nouveau2
6 files changed, 169 insertions, 134 deletions
diff --git a/core/modules/hardware-stats/data/etc/systemd/system/hardware-stats.service b/core/modules/hardware-stats/data/etc/systemd/system/hardware-stats.service
index 36d47986..7309d0bb 100644
--- a/core/modules/hardware-stats/data/etc/systemd/system/hardware-stats.service
+++ b/core/modules/hardware-stats/data/etc/systemd/system/hardware-stats.service
@@ -1,7 +1,8 @@
[Unit]
Description=Gather statistics about this machine and send to boot server
After=tmp.target mount-vm-store.service network.target
-Wants=tmp.target
+Wants=tmp.target system-check.service
+Before=system-check.service
[Service]
Type=oneshot
diff --git a/core/modules/hardware-stats/data/opt/openslx/scripts/systemd-hardware_stats b/core/modules/hardware-stats/data/opt/openslx/scripts/systemd-hardware_stats
index 694c0707..6b81d7be 100755
--- a/core/modules/hardware-stats/data/opt/openslx/scripts/systemd-hardware_stats
+++ b/core/modules/hardware-stats/data/opt/openslx/scripts/systemd-hardware_stats
@@ -63,36 +63,33 @@ slxfdisk() {
return 127
}
-if [ -z "$SLX_REMOTE_LOG" ]; then
- echo "No remote log url given, will not report"
- exit 1
-fi
-
+################################################################################
# 1) Get MAC Address used for booting
-eval $(grep -Eo BOOTIF=\\S+ /proc/cmdline)
-if [ "${#BOOTIF}" -ne "20" ]; then
+#
+MAC="${SLX_PXE_MAC}"
+if [ -z "$MAC" ]; then
+ # get MAC from sysfs
+ MAC="$(cat /sys/class/net/${SLX_PXE_NETIF:-br0}/address)"
+fi
+if [ -z "$MAC" ]; then
+ BOOTIF="$(grep -Po '(?<=BOOTIF=)[0-9a-f\-:]+' /proc/cmdline)"
+ [ "${#BOOTIF}" -eq "20" ] && MAC="${BOOTIF:3}"
+fi
+if [ -z "$MAC" ]; then
echo "Getting MAC from /proc/cmdline failed, using 'ip a'..."
- BOOTIF=01-$(ip a | grep -A 1 ': br0' | grep -o 'ether ..:..:..:..:..:..' | cut -d' ' -f2 | sed s/:/-/g)
- if [ "${#BOOTIF}" -ne "20" ]; then
- echo "FAIL FAIL FAIL"
- BOOTIF="99-88-77-66-55-44-33"
- fi
+ _mac="$(ip a | grep -A 1 ': br0' | grep -o 'ether ..:..:..:..:..:..' | cut -d' ' -f2)"
+ [ "${#_mac}" -eq 17 ] && MAC="$_mac" # compare the string LENGTH: a full aa:bb:cc:dd:ee:ff is 17 chars
fi
-MAC=${BOOTIF:3}
-echo "Determined MAC=$MAC"
-
-# 2) Get machine UUID written in stage3.1
-UUID=$(cat /etc/system-uuid)
-if [ -z "$UUID" ] || [ "${#UUID}" -ne "36" ]; then
- echo "No/malformed UUID, aborting" >&2
- exit 1
+if [ -z "$MAC" ]; then
+ MAC="88-77-66-55-44-33"
fi
-echo "UUID=$UUID"
-
-# 3) Uptime in seconds
-UPTIME=$(grep -o -E '^[0-9]+' /proc/uptime)
+# always uppercase and dash-separated
+MAC="${MAC^^}"
+MAC="${MAC//:/-}"
+echo "Determined MAC=$MAC"
-# 4) Number of real and virtual CPU cores
+################################################################################
+# 2) Number of real and virtual CPU cores
# Virtual, cheap way
VCORES=$(grep '^processor\s' /proc/cpuinfo | sort -u | wc -l)
# Real cores
@@ -109,11 +106,15 @@ if [ -z "$CPUCORES" ] || [ "$CPUCORES" = "0" ]; then
fi
echo "$CPUCORES real cores, $VCORES with HT"
-# 5) CPU model name
+################################################################################
+# 3) CPU model name
+#
CPUMODEL=$(grep -m1 '^model name\s*:' /proc/cpuinfo | sed 's/^model name\s*:\s*//;s/\s\s*/ /g;s/^ //;s/ $//')
echo "$CPUMODEL"
-# 6) RAM
+################################################################################
+# 4) RAM
+#
RAM=$(grep -m1 '^MemTotal:' /proc/meminfo | awk '{print $2}')
RAM=$(( $RAM / 1024 ))
if [ -z "$RAM" ] || [ "$RAM" -lt 500 ]; then
@@ -125,7 +126,9 @@ if [ -z "$RAM" ] || [ "$RAM" -lt 500 ]; then
fi
echo "$RAM MB RAM"
-# 7) 64bit virtualization support
+################################################################################
+# 5) 64bit virtualization support
+#
VT="UNSUPPORTED"
VIRTTYPE=$(grep -m1 '^flags\s*:' /proc/cpuinfo | grep -wo -e svm -e vmx)
[ -n "$VIRTTYPE" ] && modprobe msr
@@ -153,7 +156,9 @@ elif [ "$VIRTTYPE" = "svm" ]; then # amd
fi
echo "$VIRTTYPE is $VT"
-# 8) ID44 partition size
+################################################################################
+# 6) ID44 partition size
+#
ID44=0
if ! slx-tools fs_path_isvolatile "/tmp/virt" ; then
ID44_SPACE=($(slx-tools fs_path_space "/tmp/virt"))
@@ -181,7 +186,9 @@ if ! slx-tools fs_path_isvolatile "/tmp/virt" ; then
fi
echo "Scratch space: $ID44 MB"
-# 9) check smart values
+################################################################################
+# 7) check smart values
+#
FDISK=$(mktemp)
declare -a DISKS
shopt -s extglob
@@ -192,6 +199,7 @@ for disk in /dev/disk/by-path/!(*-part*|*-usb-*); do
slxfdisk -l "$disk"
done > "$FDISK"
shopt -u extglob
+[ -z "$SLX_SMARTCTL_MIN_REALLOC" ] && SLX_SMARTCTL_MIN_REALLOC=0
BADSECTORS=0
if which smartctl; then
ALLSMART=$(mktemp)
@@ -226,7 +234,9 @@ if which smartctl; then
fi
echo "SMART: $OVERALL - $REALLOC reallocated, $PENDING pending"
-# A) Read system model and manufacturer
+################################################################################
+# 8) Read system model and manufacturer
+#
dmidec() {
local RETVAL=$(dmidecode "$@" 2>/dev/null | grep -v '^#' | grep -v '^Invalid' | sed 's/\s\s*/ /g;s/^ //;s/ $//')
case "$RETVAL" in
@@ -237,6 +247,10 @@ dmidec() {
echo "$RETVAL"
}
+bashesc () { # print the arguments with every ' replaced by '"'"', so the result can be embedded in '...' in /run/hwinfo
+ sed s/\'/\'\"\'\"\'/g <<< "$*"
+}
+
HW_MODEL=$(dmidec -q -s system-product-name)
HW_MANUF=$(dmidec -q -s system-manufacturer)
# Try fallback to baseboard
@@ -245,138 +259,59 @@ if [ "$HW_MODEL" = "Unknown" ]; then
HW_MANUF=$(dmidec -q -s baseboard-manufacturer)
fi
-MODEL="$HW_MODEL"
-if [ "$HW_MANUF" != "Unknown" ]; then
- MODEL="$MODEL ($HW_MANUF)"
-fi
-echo "System model: $MODEL"
+HW_MANUF=$(bashesc "$HW_MANUF")
+HW_MODEL=$(bashesc "$HW_MODEL")
-# n) Dump raw data to a file
-DATAFILE=$(mktemp)
-cat > "$DATAFILE" <<-EOF
+################################################################################
+# Save raw data to report file
+#
+REPORTFILE="/run/hwreport"
+cat > "$REPORTFILE" <<-EOF
############################### CPU #####################################
Sockets: $(grep '^physical id' /proc/cpuinfo | sort -u | wc -l)
Real cores: $CPUCORES
Virtual cores: $VCORES
######################## Partition tables ###############################
EOF
-cat "$FDISK" >> "$DATAFILE"
-cat >> "$DATAFILE" <<-EOF
+cat "$FDISK" >> "$REPORTFILE"
+cat >> "$REPORTFILE" <<-EOF
############################ PCI ID #####################################
EOF
-lspci -n -m >> "$DATAFILE"
-cat >> "$DATAFILE" <<-EOF
+lspci -n -m >> "$REPORTFILE"
+cat >> "$REPORTFILE" <<-EOF
########################## dmidecode ####################################
EOF
-dmidecode >> "$DATAFILE"
+dmidecode >> "$REPORTFILE"
if [ -n "$ALLSMART" ] && [ -s "$ALLSMART" ]; then
- cat >> "$DATAFILE" <<-EOF
+ cat >> "$REPORTFILE" <<-EOF
########################### smartctl ####################################
EOF
- cat "$ALLSMART" >> "$DATAFILE"
+ cat "$ALLSMART" >> "$REPORTFILE"
fi
-cat >> "$DATAFILE" <<-EOF
+cat >> "$REPORTFILE" <<-EOF
#########################
EOF
echo "Created report file"
[ -n "$ALLSMART" ] && rm -f -- "$ALLSMART"
-# Put some info in local file for later use
-HDDCOUNT="${#DISKS[@]}"
-
-bashesc () {
- sed s/\'/\'\"\'\"\'/g <<<$*
-}
-HW_MANUF=$(bashesc "$HW_MANUF")
-HW_MODEL=$(bashesc "$HW_MODEL")
+################################################################################
+# Save information to local file for later use
+#
cat > "/run/hwinfo" <<HORST
HW_KVM='${VT}'
HW_ID44='${ID44}'
+HW_MAC='${MAC}'
HW_MBRAM='${RAM}'
-HW_HDDCOUNT='${HDDCOUNT}'
+HW_HDDCOUNT='${#DISKS[@]}'
+HW_BADSECTORS='${BADSECTORS}'
HW_MANUF='${HW_MANUF}'
HW_MODEL='${HW_MODEL}'
+HW_CPUMODEL='${CPUMODEL}'
HW_CORES='${CPUCORES}'
HW_THREADS='${VCORES}'
HORST
-# Build warning logfile (for lightdm)
-buildlogfile() {
- . /run/hwinfo
- exec 4> /run/hw-warnings.log
- CONTACT_RZ=
- if [ "$HW_KVM" = "DISABLED" ]; then
- echo "ff0000" "* 64Bit-Gast-Support (VT-x oder AMD-V) ist im BIOS deaktiviert. 64Bit VMs können nicht gestartet werden." >&4
- CONTACT_RZ=jau
- elif [ "$HW_KVM" = "UNSUPPORTED" ]; then
- echo "000000" "* CPU hat keinen 64Bit-Gast-Support (VT-x oder AMD-V). 64Bit VMs können nicht gestartet werden." >&4
- fi
- if [ -n "$HW_MBRAM" ] && [ "$HW_MBRAM" -lt 3400 ]; then
- local GB=$(( ( HW_MBRAM + 300 ) / 1024 ))
- echo "000000" "* Dieser PC hat wenig RAM (${GB}GB). Die Leistung von VM-Sitzungen wird nicht optimal sein." >&4
- fi
- if [ "$HW_ID44" = "0" ]; then
- echo "000000" "* Keine ID44-Partition gefunden. VMs bekommen wenig RAM zugewiesen." >&4
- if [ "$HW_HDDCOUNT" = "0" ]; then
- echo "000000" " Keine Festplatte erkannt; eine Festplatte wird empfohlen, wenn Sie VMs nutzen wollen." >&4
- elif [ -n "$HW_HDDCOUNT" ]; then
- CONTACT_RZ=klar
- fi
- if [ -n "$HW_MBRAM" ] && [ "$HW_MBRAM" -lt 4500 ]; then
- echo "ff0000" " Da der PC wenig RAM hat, ist die Einrichtung einer ID44-Partition dringend zu empfehlen." >&4
- fi
- elif [ -n "$HW_ID44" ] && [ "$HW_ID44" -lt 10000 ]; then
- echo "000000" "* Die ID44-Partition ist sehr klein. VM-Sitzungen könnten nach einiger Zeit aus Speichermangel abstürzen." >&4
- CONTACT_RZ=fjeden
- fi
- if [ -n "$SLX_VM_NFS" ] && ! systemctl status mount-vm-store >/dev/null; then
- echo "ff0000" "* Der VM-Store konnte nicht eingehängt werden. VMs können nicht gestartet werden." >&4
- echo "ff0000" " Versuchen Sie das Problem zu lösen, indem Sie den Computer neu starten." >&4
- fi
- if grep -q '^nouveau ' "/proc/modules"; then
- echo "ff5500" "* Die nVidia-Karte in diesem Rechner wird nur von den quelloffenen Treibern (nouveau) unterstützt, und daher mit verminderter Leistung laufen." >&4
- fi
- if [ -n "$CONTACT_RZ" ]; then
- echo "000000" " -- " >&4
- echo "000000" " -- Wenden Sie sich ggf. an den bwLehrpool-Support Ihres Rechenzentrums -- " >&4
- fi
-}
-
-buildlogfile &
-
-# Fire away
-echo "Submitting to $SLX_REMOTE_LOG"
-if curl --retry 4 --retry-connrefused --max-time 5 --retry-max-time 15 \
- --data-urlencode "type=~poweron" --data-urlencode "uuid=$UUID" --data-urlencode "macaddr=$MAC" \
- --data-urlencode "uptime=$UPTIME" --data-urlencode "realcores=$CPUCORES" --data-urlencode "mbram=$RAM" \
- --data-urlencode "kvmstate=$VT" --data-urlencode "cpumodel=$CPUMODEL" --data-urlencode "id44mb=$ID44" \
- --data-urlencode "badsectors=$BADSECTORS" --data-urlencode "systemmodel=$MODEL" \
- --data-urlencode "data@$DATAFILE" "$SLX_REMOTE_LOG" | grep -q "RESULT=0"; then
- echo "Success"
- rm -f -- "$DATAFILE"
- START=$(( $RANDOM % 5 ))
- DELAY=$(( $RANDOM % 20 ))
- cat > "/etc/cron.d/usage_stats" <<-EOF
- # Update usage statistics on server
-
- SHELL=/bin/sh
- PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/openslx/sbin:/opt/openslx/bin
-
- ${START}-59/5 * * * * root sleep ${DELAY}; /opt/openslx/scripts/cron-system_usage_update
- EOF
- touch "/etc/cron.d" # Sometimes, aufs doesn't update the mtime of dirs when creating files,
- # so cron would not rescan the cron directory
- cleanup
- # Trigger right now so resource usage gets updated
- /opt/openslx/scripts/cron-system_usage_update
- exit 0
-else
- echo "Failed..."
-fi
-
-echo "Server doesn't seem to support hardware/usage stats - disabling logging"
-rm -f -- "/etc/cron.d/usage_stats"
cleanup
-exit 1
+exit 0
diff --git a/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-report b/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-report
new file mode 100755
index 00000000..4510bfe4
--- /dev/null
+++ b/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-report
@@ -0,0 +1,82 @@
+#!/bin/ash
+
+export PATH=$PATH:/opt/openslx/sbin:/opt/openslx/bin
+
+. /opt/openslx/config
+
+disable_remote_logging() { # used when no log URL is set or the upload did not answer RESULT=0
+ echo "Server doesn't seem to support hardware/usage stats - disabling logging"
+ rm -f -- "/etc/cron.d/usage_stats" # remove the periodic usage-update cron job, if present
+}
+
+# Sends the hardware information of this machine, generated by
+# systemd-hardware_stats in /run/hwreport and /run/hwinfo, to $SLX_REMOTE_LOG.
+report_hardware_info() {
+ if [ -z "$SLX_REMOTE_LOG" ]; then
+ disable_remote_logging
+ return 1
+ fi
+ local hwreport="/run/hwreport"
+ if [ ! -s "$hwreport" ]; then
+ echo "Missing hwreport file: $hwreport"
+ # TODO send data without?
+ return 1
+ fi
+
+ # Read generated data and current uptime and send it
+ local hwinfo="/run/hwinfo"
+ if [ ! -s "$hwinfo" ]; then
+ echo "Missing hwinfo file: $hwinfo"
+ return 1
+ fi
+ . "$hwinfo"
+
+ # got everything, get the last infos
+ local uptime=$(grep -oE '^[0-9]+' /proc/uptime)
+
+ local uuid=$(cat /etc/system-uuid)
+ if [ -z "$uuid" ] || [ "${#uuid}" -ne "36" ]; then
+ echo "No/malformed UUID, aborting" >&2
+ return 1
+ fi
+
+ # Combine manufacturer and model name (for displaying purposes)
+ local model="$HW_MODEL"
+ if [ "$HW_MANUF" != "Unknown" ]; then
+ model="$model ($HW_MANUF)"
+ fi
+ # just assume the uuid/mac dumped are valid here (it's checked often enough :))
+ echo -n "Submitting to '$SLX_REMOTE_LOG' ... "
+ curl --retry 4 --retry-connrefused --max-time 5 --retry-max-time 15 \
+ --data-urlencode "type=~poweron" --data-urlencode "uuid=$uuid" --data-urlencode "macaddr=$HW_MAC" \
+ --data-urlencode "uptime=$uptime" --data-urlencode "realcores=$HW_CORES" --data-urlencode "mbram=$HW_MBRAM" \
+ --data-urlencode "kvmstate=$HW_KVM" --data-urlencode "cpumodel=$HW_CPUMODEL" --data-urlencode "id44mb=$HW_ID44" \
+ --data-urlencode "badsectors=$HW_BADSECTORS" --data-urlencode "systemmodel=$model" \
+ --data-urlencode "data@$hwreport" "$SLX_REMOTE_LOG" | grep -q "RESULT=0"
+ local ret=$?
+ if [ "$ret" -ne 0 ]; then
+ echo "failed."
+ disable_remote_logging
+ return 1
+ fi
+ echo "succeeded."
+ rm -f -- "$hwreport"
+ local START=$(( $RANDOM % 5 ))
+ local DELAY=$(( $RANDOM % 20 ))
+ cat > "/etc/cron.d/usage_stats" <<-EOF
+ # Update usage statistics on server
+ SHELL=/bin/sh
+ PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/openslx/sbin:/opt/openslx/bin
+
+ ${START}-59/5 * * * * root sleep ${DELAY}; /opt/openslx/scripts/cron-system_usage_update
+ EOF
+ # TODO remove this hack one day: Sometimes, aufs doesn't update the mtime of dirs
+ # when creating files, so cron would not rescan the cron directory.
+ touch "/etc/cron.d"
+ # Trigger right now so resource usage gets updated
+ /opt/openslx/scripts/cron-system_usage_update
+ return 0
+}
+
+report_hardware_info
+
diff --git a/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-warnings b/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-warnings
new file mode 100755
index 00000000..54894119
--- /dev/null
+++ b/core/modules/hardware-stats/data/opt/openslx/system-check/hooks.d/50-hardware-warnings
@@ -0,0 +1,14 @@
+#!/bin/ash
+# This file is executed in /opt/openslx/scripts/systemd-generate_warnings
+# If a first parameter is given, output is appended to that file instead of stdout.
+
+. /opt/openslx/config
+. /run/hwinfo # HW_* values written by systemd-hardware_stats; none of them is read below
+
+if [ -n "$1" ]; then
+ exec >> "$1" # from here on, everything echoed is appended to the given file
+fi
+
+if grep -q '^nouveau ' "/proc/modules"; then
+ echo 'slx-gfx-nouveau' # warning tag; its text and colour are in lang/de/ and tags/ of this module
+fi
diff --git a/core/modules/hardware-stats/data/opt/openslx/system-check/lang/de/slx-gfx-nouveau b/core/modules/hardware-stats/data/opt/openslx/system-check/lang/de/slx-gfx-nouveau
new file mode 100644
index 00000000..a85f6c36
--- /dev/null
+++ b/core/modules/hardware-stats/data/opt/openslx/system-check/lang/de/slx-gfx-nouveau
@@ -0,0 +1 @@
+* Die nVidia-Karte in diesem Rechner wird nur von den quelloffenen Treibern (nouveau) unterstützt, und daher mit verminderter Leistung laufen.
diff --git a/core/modules/hardware-stats/data/opt/openslx/system-check/tags/slx-gfx-nouveau b/core/modules/hardware-stats/data/opt/openslx/system-check/tags/slx-gfx-nouveau
new file mode 100644
index 00000000..3459a3a4
--- /dev/null
+++ b/core/modules/hardware-stats/data/opt/openslx/system-check/tags/slx-gfx-nouveau
@@ -0,0 +1,2 @@
+color="ff5500"
+contact=