summaryrefslogtreecommitdiff
path: root/debian
diff options
context:
space:
mode:
authorGuilherme G. Piccoli <gpiccoli@canonical.com>2019-07-04 17:38:58 -0300
committerThadeu Lima de Souza Cascardo <cascardo@debian.org>2020-03-02 04:48:05 -0300
commite44e68159ce007bfca8f9ddfa095afaea02a7788 (patch)
treedd927e904b27c077e4ec838cbb2a6697dab79d8e /debian
parente4d8ca17f5cb681bbeff0e357400053c530b4d4e (diff)
Add kdump retry/delay mechanism when dumping over network
Kdump currently tries to mount NFS (or to do the SSH dump) only once, and if that fails, it just gives up. Since kdump may be essential for debugging hard-to-reproduce bugs, we should improve its resilience by retrying a few times, delaying after each failed attempt. This patch introduces a retry/delay mechanism for both NFS and SSH dumps; the delay time is the same but the number of retries is different (since an NFS mount takes a long time between failures and is inherently more resilient), with both retry counts being configurable parameters from /etc. The original trigger of this issue is a long-standing (bad) behavior of some NICs, which present a "Link Up" status _before_ being ready to transmit packets; hence network kdump will try and fail without this patch. Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@debian.org>
Diffstat (limited to 'debian')
-rwxr-xr-xdebian/kdump-config.in42
1 files changed, 33 insertions, 9 deletions
diff --git a/debian/kdump-config.in b/debian/kdump-config.in
index a1c3b4e..d619468 100755
--- a/debian/kdump-config.in
+++ b/debian/kdump-config.in
@@ -50,6 +50,8 @@ KDUMP_DIR="/var/lib/kdump"
KDUMP_NUM_DUMPS=${KDUMP_NUM_DUMPS:=0}
NFS_TIMEO=${NFS_TIMEO:=600}
NFS_RETRANS=${NFS_RETRANS:=3}
+NFS_MOUNT_RETRY=${NFS_MOUNT_RETRY:=4}
+SSH_KDUMP_RETRY=${SSH_KDUMP_RETRY:=16}
MAKEDUMP_ARGS=${MAKEDUMP_ARGS:="-c -d 31"}
KDUMP_CMDLINE_APPEND=${KDUMP_CMDLINE_APPEND:="@KDUMP_CMDLINE_APPEND@"}
KDUMP_KERNEL_HOOK="/etc/kernel/postinst.d/kdump-tools"
@@ -647,8 +649,21 @@ function kdump_save_core()
#
if [ -n "$NFS" ];then
log_action_msg "Mounting NFS mountpoint $NFS ..."
- mount -t nfs -o nolock -o tcp -o soft -o timeo=${NFS_TIMEO} -o retrans=${NFS_RETRANS} $NFS $KDUMP_COREDIR
- ERROR=$?
+ MOUNTOPTS="-o nolock -o tcp -o soft -o timeo=${NFS_TIMEO} -o retrans=${NFS_RETRANS}"
+
+ CNT=${NFS_MOUNT_RETRY}
+ while [ $CNT -ne 0 ];do
+ mount -t nfs $MOUNTOPTS $NFS $KDUMP_COREDIR
+ ERROR=$?
+ if [ $ERROR -eq 0 ];then
+ CNT=0
+ else
+ ((CNT--))
+ log_action_msg "Network not reachable; will try $CNT more times"
+ sleep 3
+ fi
+ done
+
if [ $ERROR -ne 0 ];then
log_failure_msg "$NAME: Unable to mount remote NFS directory $NFS. Cannot save core"
logger -t $NAME "Unable to mount remote NFS directory $NFS. Cannot save core"
@@ -759,14 +774,23 @@ function kdump_save_core_to_ssh()
KDUMP_COREFILE="$KDUMP_STAMPDIR/dump.$KDUMP_STAMP"
KDUMP_TMPDMESG="/tmp/dmesg.$KDUMP_STAMP"
KDUMP_DMESGFILE="$KDUMP_STAMPDIR/dmesg.$KDUMP_STAMP"
- ERROR=0
- ssh -i $KDUMP_SSH_KEY $KDUMP_REMOTE_HOST mkdir -p $KDUMP_STAMPDIR
- ERROR=$?
- # If remote connections fails, no need to continue
- if [ $ERROR -ne 0 ] ; then
- log_failure_msg "$NAME: Unable to reach remote server $KDUMP_REMOTE_HOST. No reason to continue"
- logger -t $NAME "Unable to reach remote server $KDUMP_REMOTE_HOST. No reason to continue"
+ CNT=${SSH_KDUMP_RETRY}
+ while [ $CNT -ne 0 ];do
+ ssh -i $KDUMP_SSH_KEY $KDUMP_REMOTE_HOST mkdir -p $KDUMP_STAMPDIR
+ ERROR=$?
+ if [ $ERROR -eq 0 ];then
+ CNT=0
+ else
+ ((CNT--))
+ log_action_msg "Network not reachable; will try $CNT more times"
+ sleep 3
+ fi
+ done
+
+ if [ $ERROR -ne 0 ]; then
+ log_failure_msg "$NAME: Unable to reach remote server $KDUMP_REMOTE_HOST; can't continue"
+ logger -t $NAME "Unable to reach remote server $KDUMP_REMOTE_HOST; can't continue"
return 1
fi