diff options
author | Guilherme G. Piccoli <gpiccoli@canonical.com> | 2019-07-04 17:38:58 -0300 |
---|---|---|
committer | Thadeu Lima de Souza Cascardo <cascardo@debian.org> | 2020-03-02 04:48:05 -0300 |
commit | e44e68159ce007bfca8f9ddfa095afaea02a7788 (patch) | |
tree | dd927e904b27c077e4ec838cbb2a6697dab79d8e /debian | |
parent | e4d8ca17f5cb681bbeff0e357400053c530b4d4e (diff) |
Add kdump retry/delay mechanism when dumping over network
Kdump currently tries mounting NFS (or doing the SSH dump) only once, and
if it fails, it just gives up. Since kdump may be essential to debug
hard-to-reproduce bugs, we should improve the resilience and retry a bit,
delaying at each attempt.
This patch introduces a retry/delay mechanism for both NFS and SSH dumps;
the delay time is the same but the number of retries is different (since NFS
mounts take a long time between failures and are inherently more resilient),
both retry counts being configurable parameters from /etc.
The original trigger of this issue is a long-term (bad) behavior of some
NICs, which present a "Link Up" status _before_ being ready to transmit
packets; hence network kdump will try and fail without this patch.
Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@debian.org>
Diffstat (limited to 'debian')
-rwxr-xr-x | debian/kdump-config.in | 42 |
1 files changed, 33 insertions, 9 deletions
diff --git a/debian/kdump-config.in b/debian/kdump-config.in index a1c3b4e..d619468 100755 --- a/debian/kdump-config.in +++ b/debian/kdump-config.in @@ -50,6 +50,8 @@ KDUMP_DIR="/var/lib/kdump" KDUMP_NUM_DUMPS=${KDUMP_NUM_DUMPS:=0} NFS_TIMEO=${NFS_TIMEO:=600} NFS_RETRANS=${NFS_RETRANS:=3} +NFS_MOUNT_RETRY=${NFS_MOUNT_RETRY:=4} +SSH_KDUMP_RETRY=${SSH_KDUMP_RETRY:=16} MAKEDUMP_ARGS=${MAKEDUMP_ARGS:="-c -d 31"} KDUMP_CMDLINE_APPEND=${KDUMP_CMDLINE_APPEND:="@KDUMP_CMDLINE_APPEND@"} KDUMP_KERNEL_HOOK="/etc/kernel/postinst.d/kdump-tools" @@ -647,8 +649,21 @@ function kdump_save_core() # if [ -n "$NFS" ];then log_action_msg "Mounting NFS mountpoint $NFS ..." - mount -t nfs -o nolock -o tcp -o soft -o timeo=${NFS_TIMEO} -o retrans=${NFS_RETRANS} $NFS $KDUMP_COREDIR - ERROR=$? + MOUNTOPTS="-o nolock -o tcp -o soft -o timeo=${NFS_TIMEO} -o retrans=${NFS_RETRANS}" + + CNT=${NFS_MOUNT_RETRY} + while [ $CNT -ne 0 ];do + mount -t nfs $MOUNTOPTS $NFS $KDUMP_COREDIR + ERROR=$? + if [ $ERROR -eq 0 ];then + CNT=0 + else + ((CNT--)) + log_action_msg "Network not reachable; will try $CNT more times" + sleep 3 + fi + done + if [ $ERROR -ne 0 ];then log_failure_msg "$NAME: Unable to mount remote NFS directory $NFS. Cannot save core" logger -t $NAME "Unable to mount remote NFS directory $NFS. Cannot save core" @@ -759,14 +774,23 @@ function kdump_save_core_to_ssh() KDUMP_COREFILE="$KDUMP_STAMPDIR/dump.$KDUMP_STAMP" KDUMP_TMPDMESG="/tmp/dmesg.$KDUMP_STAMP" KDUMP_DMESGFILE="$KDUMP_STAMPDIR/dmesg.$KDUMP_STAMP" - ERROR=0 - ssh -i $KDUMP_SSH_KEY $KDUMP_REMOTE_HOST mkdir -p $KDUMP_STAMPDIR - ERROR=$? - # If remote connections fails, no need to continue - if [ $ERROR -ne 0 ] ; then - log_failure_msg "$NAME: Unable to reach remote server $KDUMP_REMOTE_HOST. No reason to continue" - logger -t $NAME "Unable to reach remote server $KDUMP_REMOTE_HOST. 
No reason to continue" + CNT=${SSH_KDUMP_RETRY} + while [ $CNT -ne 0 ];do + ssh -i $KDUMP_SSH_KEY $KDUMP_REMOTE_HOST mkdir -p $KDUMP_STAMPDIR + ERROR=$? + if [ $ERROR -eq 0 ];then + CNT=0 + else + ((CNT--)) + log_action_msg "Network not reachable; will try $CNT more times" + sleep 3 + fi + done + + if [ $ERROR -ne 0 ]; then + log_failure_msg "$NAME: Unable to reach remote server $KDUMP_REMOTE_HOST; can't continue" + logger -t $NAME "Unable to reach remote server $KDUMP_REMOTE_HOST; can't continue" return 1 fi |