Bug #8594

zfs panic: Pool 'tstpool3' has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.

Added by Igor Kozhukhov over 3 years ago. Updated over 3 years ago.

Status: Rejected
Priority: Normal
Assignee: -
Category: -
Start date: 2017-08-30
Due date: -
% Done: 0%
Estimated time: -
Difficulty: Medium
Tags: needs-triage
Gerrit CR: -

Description

iSCSI COMSTAR setup with 1 initiator + 3 targets.
An additional property was set on the zpool:
zpool set failmode=panic <pool>

fio random-write tests with 10 GB of data.
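
The COMSTAR/iSCSI plumbing itself is not shown in this report. For readers reproducing the setup, a typical shelf (target) and initiator configuration looks roughly like the sketch below; the service FMRIs and commands are the stock illumos ones, while the zvol path and discovery address are placeholders:

# on each shelf (target side)
svcadm enable -r svc:/system/stmf:default
svcadm enable -r svc:/network/iscsi/target:default
stmfadm create-lu /dev/zvol/rdsk/<pool>/<vol>
stmfadm add-view <LU-name>
itadm create-target

# on con1 (initiator side)
iscsiadm add discovery-address <shelf-ip>
iscsiadm modify discovery --sendtargets enable
devfsadm -i iscsi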

root@con1:/var/crash/myhost# mdb *.0
Loading modules: [ unix genunix specfs dtrace mac cpu.generic uppc apix scsi_vhci zfs mpt_sas sd ip hook neti sockfs arp usba xhci stmf stmf_sbd mm sata idm random crypto ptm ufs logindmux smbsrv nfs nsmb ipc ]
> $C
ffffd001ec545a70 vpanic()
ffffd001ec545ab0 zio_suspend+0xa9(ffffd064b1739000, 0)
ffffd001ec545b80 spa_sync+0x62e(ffffd064b1739000, 1db8)
ffffd001ec545c20 txg_sync_thread+0x1f7(ffffd064b03281c0)
ffffd001ec545c30 thread_start+8()
> 
> ::status
debugging crash dump vmcore.0 (64-bit) from con1
operating system: 5.11 1.3.7.200 (i86pc)
image uuid: f9ceee5b-eed3-6428-8d74-946c7345e466
panic message: Pool 'tstpool3' has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.
dump content: kernel pages only
  (curproc requested, but a kernel thread panicked)
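
For deeper triage, the suspended pool and the ZFS thread state can be pulled from the same mdb session. The dcmds below are part of the stock illumos mdb modules; output is omitted here and varies by build:

> ::spa -v
> ::stacks -m zfs
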
root@con1:/var/crash/myhost# cat 02.txt 
MESSAGE                                                               
sd13 at scsi_vhci0: unit-address g600144f07adeb1180000599164480002: f_tpgs
sd13 is /scsi_vhci/disk@g600144f07adeb1180000599164480002
/scsi_vhci/disk@g600144f07adede1e000059916c5f0003 (sd14) online
/scsi_vhci/disk@g600144f07adede1e000059916c5f0003 (sd14) multipath status: degraded: path 15 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,2 is online
sd14 at scsi_vhci0: unit-address g600144f07adede1e000059916c5f0003: f_tpgs
sd14 is /scsi_vhci/disk@g600144f07adede1e000059916c5f0003
/scsi_vhci/disk@g600144f07adeb11800005991644d0003 (sd15) online
/scsi_vhci/disk@g600144f07adeb11800005991644d0003 (sd15) multipath status: degraded: path 16 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec0001,2 is online
sd15 at scsi_vhci0: unit-address g600144f07adeb11800005991644d0003: f_tpgs
sd15 is /scsi_vhci/disk@g600144f07adeb11800005991644d0003
/scsi_vhci/disk@g600144f07adedf00000059916fe40003 (sd16) online
/scsi_vhci/disk@g600144f07adedf00000059916fe40003 (sd16) multipath status: degraded: path 17 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5da3fc0-05b2-4f4b-ed65-b401fbe691980001,2 is online
sd16 at scsi_vhci0: unit-address g600144f07adedf00000059916fe40003: f_tpgs
sd16 is /scsi_vhci/disk@g600144f07adedf00000059916fe40003
/scsi_vhci/disk@g600144f07adede1e000059916c630004 (sd17) online
/scsi_vhci/disk@g600144f07adede1e000059916c630004 (sd17) multipath status: degraded: path 18 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,3 is online
sd17 at scsi_vhci0: unit-address g600144f07adede1e000059916c630004: f_tpgs
sd17 is /scsi_vhci/disk@g600144f07adede1e000059916c630004
/scsi_vhci/disk@g600144f07adeb1180000599164520004 (sd18) online
/scsi_vhci/disk@g600144f07adeb1180000599164520004 (sd18) multipath status: degraded: path 19 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec0001,3 is online
sd18 at scsi_vhci0: unit-address g600144f07adeb1180000599164520004: f_tpgs
sd18 is /scsi_vhci/disk@g600144f07adeb1180000599164520004
/scsi_vhci/disk@g600144f07adede1e000059916c670005 (sd19) online
/scsi_vhci/disk@g600144f07adede1e000059916c670005 (sd19) multipath status: degraded: path 20 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,4 is online
sd19 at scsi_vhci0: unit-address g600144f07adede1e000059916c670005: f_tpgs
sd19 is /scsi_vhci/disk@g600144f07adede1e000059916c670005
/scsi_vhci/disk@g600144f07adedf00000059916fe70004 (sd20) online
/scsi_vhci/disk@g600144f07adedf00000059916fe70004 (sd20) multipath status: degraded: path 21 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5da3fc0-05b2-4f4b-ed65-b401fbe691980001,3 is online
sd20 at scsi_vhci0: unit-address g600144f07adedf00000059916fe70004: f_tpgs
sd20 is /scsi_vhci/disk@g600144f07adedf00000059916fe70004
/scsi_vhci/disk@g600144f07adeb1180000599164550005 (sd21) online
/scsi_vhci/disk@g600144f07adeb1180000599164550005 (sd21) multipath status: degraded: path 22 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec0001,4 is online
sd21 at scsi_vhci0: unit-address g600144f07adeb1180000599164550005: f_tpgs
sd21 is /scsi_vhci/disk@g600144f07adeb1180000599164550005
/scsi_vhci/disk@g600144f07adedf00000059916fea0005 (sd22) online
/scsi_vhci/disk@g600144f07adedf00000059916fea0005 (sd22) multipath status: degraded: path 23 iscsi0/disk@0000iqn.2010-08.org.illumos:02:f5da3fc0-05b2-4f4b-ed65-b401fbe691980001,4 is online
sd22 at scsi_vhci0: unit-address g600144f07adedf00000059916fea0005: f_tpgs
sd22 is /scsi_vhci/disk@g600144f07adedf00000059916fea0005
ISA-device: asy0
asy0 is /pci@0,0/isa@1f/asy@1,3f8
ISA-device: asy1
asy1 is /pci@0,0/isa@1f/asy@1,2f8
pseudo-device: fcp0
fcp0 is /pseudo/fcp@0
pseudo-device: fct0
fct0 is /pseudo/fct@0
pseudo-device: fbt0
fbt0 is /pseudo/fbt@0
pseudo-device: lockstat0
lockstat0 is /pseudo/lockstat@0
pseudo-device: profile0
profile0 is /pseudo/profile@0
pseudo-device: sdt0
sdt0 is /pseudo/sdt@0
smp0 at mpt_sas1: target-port w500304801d31447f
smp0 is /pci@0,0/pci8086,6f02@1/pci15d9,808@0/iport@ff/smp@w500304801d31447f
pseudo-device: systrace0
systrace0 is /pseudo/systrace@0
pseudo-device: fcsm0
fcsm0 is /pseudo/fcsm@0
pseudo-device: llc10
llc10 is /pseudo/llc1@0
pseudo-device: lofi0
lofi0 is /pseudo/lofi@0
pseudo-device: ramdisk1024
ramdisk1024 is /pseudo/ramdisk@1024
pseudo-device: ucode0
ucode0 is /pseudo/ucode@0
pseudo-device: bpf0
bpf0 is /pseudo/bpf@0
pseudo-device: eventfd0
eventfd0 is /pseudo/eventfd@0
pseudo-device: fssnap0
fssnap0 is /pseudo/fssnap@0
NOTICE: igb0 registered
pseudo-device: inotify0
inotify0 is /pseudo/inotify@0
pseudo-device: signalfd0
signalfd0 is /pseudo/signalfd@0
pseudo-device: timerfd0
timerfd0 is /pseudo/timerfd@0
pseudo-device: pm0
pm0 is /pseudo/pm@0
pseudo-device: nsmb0
nsmb0 is /pseudo/nsmb@0
NOTICE: ixgbe4 registered
NOTICE: ixgbe4: Intel 10Gb Ethernet
NOTICE: ixgbe1 registered
NOTICE: ixgbe1: Intel 10Gb Ethernet
NOTICE: ixgbe3 registered
NOTICE: ixgbe3: Intel 10Gb Ethernet
NOTICE: ixgbe5 registered
NOTICE: ixgbe5: Intel 10Gb Ethernet
NOTICE: ixgbe5 link up, 10000 Mbps, full duplex
NOTICE: ixgbe4 link up, 10000 Mbps, full duplex
NOTICE: ixgbe1 link up, 10000 Mbps, full duplex
NOTICE: ixgbe3 link up, 1000 Mbps, full duplex
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
NOTICE: iscsi connection(15) unable to connect to target iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
NOTICE: iscsi connection(15) unable to connect to target iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
NOTICE: iscsi connection(15) unable to connect to target iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
NOTICE: iscsi connection(15) unable to connect to target iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
NOTICE: iscsi connection(15) unable to connect to target iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49
NOTICE: iscsi connection(21) unable to connect to target iqn.2010-08.org.illumos:02:f5a77de5-60c0-69ea-90a7-833c1ca9aeec
WARNING: /scsi_vhci (scsi_vhci0):
        /scsi_vhci/disk@g600144f07adede1e000059916c630004 (sd17): Command Timeout on path iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,3
WARNING: /scsi_vhci (scsi_vhci0):
        /scsi_vhci/disk@g600144f07adede1e000059916c570001 (sd10): Command Timeout on path iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,0
WARNING: iscsi connection(15/3f) closing connection - target requested reason:0x7
WARNING: vhci_scsi_reset 0x1
NOTICE: iscsi session(14) iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a49 offline

/scsi_vhci/disk@g600144f07adede1e000059916c670005 (sd19) offline
/scsi_vhci/disk@g600144f07adede1e000059916c670005 (sd19) multipath status: failed: path 20 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,4 is offline
WARNING: /scsi_vhci (scsi_vhci0):
        sd17: path iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,3, reset 1 failed
WARNING: vhci_scsi_reset 0x0
WARNING: /scsi_vhci/disk@g600144f07adede1e000059916c630004 (sd17):
        SYNCHRONIZE CACHE command failed (5)

/scsi_vhci/disk@g600144f07adede1e000059916c5f0003 (sd14) offline
/scsi_vhci/disk@g600144f07adede1e000059916c5f0003 (sd14) multipath status: failed: path 15 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,2 is offline
/scsi_vhci/disk@g600144f07adede1e000059916c5b0002 (sd12) offline
/scsi_vhci/disk@g600144f07adede1e000059916c5b0002 (sd12) multipath status: failed: path 13 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,1 is offline
WARNING: /scsi_vhci/disk@g600144f07adede1e000059916c570001 (sd10):
        SYNCHRONIZE CACHE command failed (5)

/scsi_vhci/disk@g600144f07adede1e000059916c570001 (sd10) offline
/scsi_vhci/disk@g600144f07adede1e000059916c570001 (sd10) multipath status: failed: path 11 iscsi0/disk@0000iqn.2010-08.org.illumos:02:5fd770c8-8b41-441c-a48f-91d6a2013a490001,0 is offline
NOTICE: SUNW-MSG-ID: SUNOS-8000-0G, TYPE: Error, VER: 1, SEVERITY: Major

panic[cpu6]/thread=ffffd001ec545c40: 
Pool 'tstpool3' has encountered an uncorrectable I/O failure and the failure mode property for this pool is set to panic.

ffffd001ec545ab0 zfs:zio_suspend+a9 ()
ffffd001ec545b80 zfs:spa_sync+62e ()
ffffd001ec545c20 zfs:txg_sync_thread+1f7 ()
ffffd001ec545c30 unix:thread_start+8 ()

dumping to /dev/zvol/dsk/rpool/dump, offset 65536, content: kernel + curproc

Details of the environment follow.

Precondition:
shelves config: atime=off and sync=disabled on the targets
1) CON1: create a zpool
tstpool3 - 4 disks from 3 shelves (lbsize 512 only) - (#6: SHELF8) (#11: SHELF7) (#12: SHELF7) (#16: SHELF4)

root@con1:~# zpool create -f tstpool3 c0t600144F07ADEB1180000599164520004d0 c0t600144F07ADEDE1E000059916C570001d0 c0t600144F07ADEDE1E000059916C630004d0 c0t600144F07ADEDF00000059916FE40003d0

root@con1:~# zpool list -v tstpool3
NAME  SIZE  ALLOC  FREE  EXPANDSZ  FRAG  CAP  DEDUP  HEALTH  ALTROOT
tstpool3  65T  89K  65.0T  -  0%  0%  1.00x  ONLINE  -
 c0t600144F07ADEB1180000599164520004d0  16.2T  33.5K  16.2T  -  0%  0%
 c0t600144F07ADEDE1E000059916C570001d0  16.2T  22K  16.2T  -  0%  0%
 c0t600144F07ADEDE1E000059916C630004d0  16.2T  0  16.2T  -  0%  0%
 c0t600144F07ADEDF00000059916FE40003d0  16.2T  33.5K  16.2T  -  0%  0%

root@con1:~# zdb tstpool3 | grep ashift
 ashift: 9
 ashift: 9
 ashift: 9
 ashift: 9
 ashift: 9
 ashift: 9
 ashift: 9
 ashift: 9
loading space map for vdev 3 of 4, metaslab 0 of 130 ...
root@con1:~#
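
ashift: 9 means each vdev allocates in 2^9 = 512-byte units, consistent with the 512-byte logical block size noted in the precondition. The same figures can be read from the cached pool configuration, which avoids the space-map loading seen above (standard zdb usage):

root@con1:~# zdb -C tstpool3 | grep ashift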

2) CON1: create dataset
zfs create tstpool3/space1

3) CON1: set the failmode property on the zpool
root@con1:~# zpool set failmode=panic tstpool3
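
A quick sanity check that the property took effect (standard zpool syntax; output along these lines):

root@con1:~# zpool get failmode tstpool3
NAME      PROPERTY  VALUE   SOURCE
tstpool3  failmode  panic   local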

Steps:
Run the fio write test on dataset tstpool3/space1.

1) run the following script:
./init_write_rand_1G_10G.sh tstpool3 space1

$ cat init_write_rand_1G_10G.sh
#!/bin/bash

if [ $# -ne 2 ]; then
    echo "Usage: $0 <tstpool> <space>"
    exit 1
fi

POOL=$1
SPACE=$2

for file in conf/write_rand/*; do
    if [ -f "$file" ]; then
        . "$file"
        echo -e "\n"
        echo "RUNPATH ${RUNPATH}"
        echo "RESPATH ${RESPATH}"

        mkdir -p "${RESPATH}"

        FILENAME=$(echo "$RESPATH" | cut -d'/' -f5-)
        echo "FILENAME ${FILENAME}"

        echo -e "\n"
        echo "Started for ${RUNPATH} 1job"
        for ((i=1; i<=10; i++)); do
            rm -rf "/${POOL}/${SPACE}"/*
            echo "Randwrite-J1 run $i" 2>&1 | tee "${RESPATH}/Wrand_1j_r$i.txt"
            # append (-a) so fio output does not clobber the header written above
            fio "${RUNPATH}/write_rand_1j.fio" --filename="/${POOL}/${SPACE}/W${FILENAME}1j$i" 2>&1 | tee -a "${RESPATH}/Wrand_1j_r$i.txt"
            grep "iops" "${RESPATH}/Wrand_1j_r$i.txt" >> "${RESPATH}/res_Wrand_1j.txt"
            grep "clat (" "${RESPATH}/Wrand_1j_r$i.txt" >> "${RESPATH}/res_Wrand_1j.txt"
            zfs list -H "${POOL}/${SPACE}" >> "${RESPATH}/zfs_res_Wrand_1j.txt"
            time sync; sync
        done
        echo "Completed for ${RUNPATH} 1job"

        echo -e "\n"
        echo "Started for ${RUNPATH} 10jobs"
        for ((i=1; i<=10; i++)); do
            rm -rf "/${POOL}/${SPACE}"/*
            echo "Randwrite-J10 run $i" 2>&1 | tee "${RESPATH}/Wrand_10j_r$i.txt"
            fio "${RUNPATH}/write_rand_10j.fio" --filename="/${POOL}/${SPACE}/W${FILENAME}10j$i" 2>&1 | tee -a "${RESPATH}/Wrand_10j_r$i.txt"
            grep "iops" "${RESPATH}/Wrand_10j_r$i.txt" >> "${RESPATH}/res_Wrand_10j.txt"
            grep "clat (" "${RESPATH}/Wrand_10j_r$i.txt" >> "${RESPATH}/res_Wrand_10j.txt"
            zfs list -H "${POOL}/${SPACE}" >> "${RESPATH}/zfs_res_Wrand_10j.txt"
            time sync; sync
        done
        echo "Completed for ${RUNPATH} 10jobs"
    fi
done

$

2) path config files

$ cat conf/write_rand/randwrite_128k1G.sh
#!/bin/bash
RUNPATH=/opt/ksenia/New_fio_v2/tests/write_rand/128k/1G
RESPATH=/opt/ksenia/New_fio_v2/res/${POOL}/${SPACE}/write_rand/128k1G

$ cat conf/write_rand/randwrite_128k10G.sh
#!/bin/bash
RUNPATH=/opt/ksenia/New_fio_v2/tests/write_rand/128k/10G
RESPATH=/opt/ksenia/New_fio_v2/res/${POOL}/${SPACE}/write_rand/128k10G
$
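
The runner sources each of these files (`. $file`), so a config only needs to define RUNPATH and RESPATH; another test size can be added by dropping in one more file of the same shape (hypothetical example, paths illustrative):

$ cat conf/write_rand/randwrite_1m10G.sh
#!/bin/bash
RUNPATH=/opt/ksenia/New_fio_v2/tests/write_rand/1m/10G
RESPATH=/opt/ksenia/New_fio_v2/res/${POOL}/${SPACE}/write_rand/1m10G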

3) fio job files

$ cat tests/write_rand/128k/1G/write_rand_1j.fio
[writetest1]
bs=128k
size=1G
name=randwrite
rw=randwrite
ioengine=sync
iodepth=1
numjobs=1
runtime=20
group_reporting
fallocate=none
clocksource=clock_gettime
time_based

$ cat tests/write_rand/128k/1G/write_rand_10j.fio
[writetest1]
bs=128k
size=1G
name=randwrite
rw=randwrite
ioengine=sync
iodepth=1
numjobs=10
runtime=30
group_reporting
fallocate=none
clocksource=clock_gettime
time_based

$ cat tests/write_rand/128k/10G/write_rand_1j.fio
[writetest]
bs=128k
size=10G
name=randwrite
rw=randwrite
ioengine=sync
iodepth=1
numjobs=1
runtime=160
group_reporting
fallocate=none
clocksource=clock_gettime
time_based

$ cat tests/write_rand/128k/10G/write_rand_10j.fio
[writetest1]
bs=128k
size=10G
name=randwrite
rw=randwrite
ioengine=sync
iodepth=1
numjobs=10
runtime=300
group_reporting
fallocate=none
clocksource=clock_gettime
time_based
$
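
For a reproduction without the harness, the failing 10-job 10G workload can be expressed directly on the fio command line. The options below mirror write_rand_10j.fio; the filename is illustrative:

fio --name=randwrite --rw=randwrite --bs=128k --size=10G \
    --ioengine=sync --iodepth=1 --numjobs=10 --runtime=300 \
    --time_based --group_reporting --fallocate=none \
    --filename=/tstpool3/space1/Wtest10j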

Result:
The writes passed until the panic, which occurred during the 10G run with 10 jobs:

# ls -l res/tstpool3/space1/write_rand/128k10G/
total 1
-rw-r--r-- 1 root root 74 Aug 30 09:40 Wrand_1j_r1.txt
# ls -l res/tstpool3/space1/write_rand/128k1G/
ls: cannot access 'res/tstpool3/space1/write_rand/128k1G/': No such file or directory
#

#1

Updated by Igor Kozhukhov over 3 years ago

  • Description updated (diff)
#2

Updated by Igor Kozhukhov over 3 years ago

  • Status changed from New to Rejected

Not a bug: the panic on an uncorrectable I/O failure is the documented behavior when the failmode=panic property is set.
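
For setups that should ride out an iSCSI outage instead of panicking, the property can be returned to one of the non-panic modes (standard zpool usage):

zpool set failmode=wait tstpool3        # default: block I/O until the devices return
zpool set failmode=continue tstpool3    # return EIO to new synchronous I/O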
