Bug #5065
ZFS assertion failure on 20140723 nightly il-gate
0%
Description
System running oi151_a9 with il-gate 20140723, purely serving storage, no interactive logins from users.
Was running two zfs sends, came back after the weekend to find it had panicked on an assertion failure:
panicstr = assertion failed: delta > 0 ? dd->dd_phys->dd_used_breakdown[oldtype] >= delta : dd->dd_phys->dd_used_breakdown[newtype] >= -delta, file: ../../common/fs/zfs/dsl_dir.c, line: 1395 panicstack = genunix:process_type+17bbe0 () | zfs:dsl_dir_transfer_space+dd () | zfs:dsl_dir_diduse_space+1b4 () | zfs:dsl_dataset_block_born+196 () | zfs:dbuf_write_done+429 () | zfs:arc_write_done+20f () | zfs:zio_done+422 () | zfs:zio_execute+d5 () | zfs:zio_notify_parent+113 () | zfs:zio_done+480 () | zfs:zio_execute+d5 () | zfs:zio_notify_parent+113 () | zfs:zio_done+480 () | zfs:zio_execute+d5 () | genunix:taskq_thread+318 () | unix:thread_start+8 () | crashtime = 1407060720 panic-time = August 3, 2014 06:12:00 AM EDT EDT
Don't have a dump, system panicked on ZFS deadman while trying to serialize the dump out after restarting, resulting in it scribbling over that dump with the dump from the deadman firing...sigh.
Files
Updated by Rich Ercolani over 6 years ago
- File msgbuf.txt msgbuf.txt added
- File stack.txt stack.txt added
- File stacks.txt stacks.txt added
Hurray, I got a core.
Boo, that means it's reproducible.
Updated by Igor Kozhukhov over 6 years ago
could you try reproduce it on DEBUG build?
DEBUG build will have more info
also - will be interest in additional info about configuration:
- zpool: one drive or raidz or mirrored
- enabled compressions: YES|NO, dedup YES|NO
- size of zpool
- is it real HW or VM?
- what is controller ?
Updated by Rich Ercolani over 6 years ago
Igor Kozhukhov wrote:
could you try reproduce it on DEBUG build?
DEBUG build will have more info
This is a debug build; otherwise, AFAIK, the ASSERT wouldn't have done anything at all.
also - will be interest in additional info about configuration:
- zpool: one drive or raidz or mirrored
11x8-disk RAIDZ-2
- enabled compressions: YES|NO, dedup YES|NO
compression=lz4, dedup=off and was never on
- size of zpool
160T
- is it real HW or VM?
real
- what is controller ?
2x LSI 9201-16e
Updated by Rich Ercolani over 6 years ago
Huh, so it's not the spacemap_histogram pool at all, it seems...
> ::stack vpanic() 0xfffffffffbe0b968() dsl_dir_transfer_space+0xdd(ffffff2334dc1b80, 48c0, 3, 2, ffffff3b8969ea00) dsl_dir_diduse_space+0x1b4(ffffff24d9ef6040, 0, 8880, 2000, 2000, ffffff3b8969ea00) dsl_dataset_block_born+0x196(ffffff2464b93180, ffffff24e2cace80, ffffff3b8969ea00) dbuf_write_done+0x429(ffffff2dffc8c8d8, ffffffaeec503418, ffffffc03981ceb0) arc_write_done+0x20f(ffffff2dffc8c8d8) zio_done+0x422(ffffff2dffc8c8d8) zio_execute+0xd5(ffffff2dffc8c8d8) zio_notify_parent+0x113(ffffff2dffc8c8d8, ffffffb17ce41c88, 1) zio_done+0x480(ffffffb17ce41c88) zio_execute+0xd5(ffffffb17ce41c88) zio_notify_parent+0x113(ffffffb17ce41c88, ffffffb7e28f35a8, 1) zio_done+0x480(ffffffb7e28f35a8) zio_execute+0xd5(ffffffb7e28f35a8) zio_notify_parent+0x113(ffffffb7e28f35a8, ffffffc98afa64c8, 1) zio_done+0x480(ffffffc98afa64c8) zio_execute+0xd5(ffffffc98afa64c8) taskq_thread+0x318(ffffff2332012510) thread_start+8() > ffffff2334dc1b80::print -t dsl_dir_t dsl_dir_t { uint64_t dd_object = 0x2 dsl_dir_phys_t *dd_phys = 0xffffff23cc6bec00 dmu_buf_t *dd_dbuf = 0xffffff23d7647618 dsl_pool_t *dd_pool = 0xffffff23cc2954c0 txg_node_t dd_dirty_link = { struct txg_node *[4] tn_next = [ 0, 0, 0, 0xffffff24d9ef6060 ] uint8_t [4] tn_member = [ 0, 0, 0, 0x1 ] } dsl_dir_t *dd_parent = 0 kmutex_t dd_lock = { void *[1] _opaque = [ 0xffffff00f64ffc40 ] } list_t dd_prop_cbs = { size_t list_size = 0x30 size_t list_offset = 0 struct list_node list_head = { struct list_node *list_next = 0xffffff24e25627d8 struct list_node *list_prev = 0xffffff24e25628b0 } } timestruc_t dd_snap_cmtime = { time_t tv_sec = 2014 Sep 22 01:22:53 long tv_nsec = 0x1f013ac0 } uint64_t dd_origin_txg = 0x1 uint64_t [4] dd_tempreserved = [ 0, 0, 0, 0 ] int64_t [4] dd_space_towrite = [ 0, 0, 0, 0x2520000 ] char [256] dd_myname = [ "dss003-1" ] } # zpool get all dss003-1 NAME PROPERTY VALUE SOURCE dss003-1 size 160T - dss003-1 capacity 94% - dss003-1 altroot - default dss003-1 health DEGRADED - dss003-1 guid 1016107226285437055 default dss003-1 version - default dss003-1 bootfs - default dss003-1 delegation on default dss003-1 autoreplace off default dss003-1 cachefile - default dss003-1 failmode wait default dss003-1 listsnapshots off default dss003-1 autoexpand off default dss003-1 dedupditto 0 default dss003-1 dedupratio 1.00x - dss003-1 free 8.69T - dss003-1 allocated 151T - dss003-1 readonly off - dss003-1 comment - default dss003-1 expandsize - - dss003-1 freeing 0 default dss003-1 fragmentation - - dss003-1 leaked 0 default dss003-1 feature@async_destroy enabled local dss003-1 feature@empty_bpobj active local dss003-1 feature@lz4_compress disabled local dss003-1 feature@multi_vdev_crash_dump disabled local dss003-1 feature@spacemap_histogram disabled local dss003-1 feature@enabled_txg disabled local dss003-1 feature@hole_birth disabled local dss003-1 feature@extensible_dataset disabled local dss003-1 feature@embedded_data disabled local dss003-1 feature@bookmarks disabled local dss003-1 feature@filesystem_limits disabled local
"Oh, good."
Updated by Rich Ercolani over 6 years ago
Well, here's how we get our assertion failure, now for why...
> ::stack vpanic() 0xfffffffffbe0b968() dsl_dir_transfer_space+0xdd(ffffff2334dc1b80, 48c0, 3, 2, ffffff3b8969ea00) dsl_dir_diduse_space+0x1b4(ffffff24d9ef6040, 0, 8880, 2000, 2000, ffffff3b8969ea00) dsl_dataset_block_born+0x196(ffffff2464b93180, ffffff24e2cace80, ffffff3b8969ea00) dbuf_write_done+0x429(ffffff2dffc8c8d8, ffffffaeec503418, ffffffc03981ceb0) arc_write_done+0x20f(ffffff2dffc8c8d8) zio_done+0x422(ffffff2dffc8c8d8) zio_execute+0xd5(ffffff2dffc8c8d8) zio_notify_parent+0x113(ffffff2dffc8c8d8, ffffffb17ce41c88, 1) zio_done+0x480(ffffffb17ce41c88) zio_execute+0xd5(ffffffb17ce41c88) zio_notify_parent+0x113(ffffffb17ce41c88, ffffffb7e28f35a8, 1) zio_done+0x480(ffffffb7e28f35a8) zio_execute+0xd5(ffffffb7e28f35a8) zio_notify_parent+0x113(ffffffb7e28f35a8, ffffffc98afa64c8, 1) zio_done+0x480(ffffffc98afa64c8) zio_execute+0xd5(ffffffc98afa64c8) taskq_thread+0x318(ffffff2332012510) thread_start+8() > ffffff2334dc1b80::print -t dsl_dir_t dsl_dir_t { uint64_t dd_object = 0x2 dsl_dir_phys_t *dd_phys = 0xffffff23cc6bec00 dmu_buf_t *dd_dbuf = 0xffffff23d7647618 dsl_pool_t *dd_pool = 0xffffff23cc2954c0 txg_node_t dd_dirty_link = { struct txg_node *[4] tn_next = [ 0, 0, 0, 0xffffff24d9ef6060 ] uint8_t [4] tn_member = [ 0, 0, 0, 0x1 ] } dsl_dir_t *dd_parent = 0 kmutex_t dd_lock = { void *[1] _opaque = [ 0xffffff00f64ffc40 ] } list_t dd_prop_cbs = { size_t list_size = 0x30 size_t list_offset = 0 struct list_node list_head = { struct list_node *list_next = 0xffffff24e25627d8 struct list_node *list_prev = 0xffffff24e25628b0 } } timestruc_t dd_snap_cmtime = { time_t tv_sec = 2014 Sep 22 01:22:53 long tv_nsec = 0x1f013ac0 } uint64_t dd_origin_txg = 0x1 uint64_t [4] dd_tempreserved = [ 0, 0, 0, 0 ] int64_t [4] dd_space_towrite = [ 0, 0, 0, 0x2520000 ] char [256] dd_myname = [ "dss003-1" ] } > 0xffffff23cc6bec00::print -t dsl_dir_phys_t dsl_dir_phys_t { uint64_t dd_creation_time = 0x5127cdac uint64_t dd_head_dataset_obj = 0x15 uint64_t dd_parent_obj = 0 uint64_t dd_origin_obj = 0x12 uint64_t dd_child_dir_zapobj = 0x4 uint64_t dd_used_bytes = 0x6ae18f3157e0 uint64_t dd_compressed_bytes = 0x6963f001d200 uint64_t dd_uncompressed_bytes = 0x7737a1d90000 uint64_t dd_quota = 0 uint64_t dd_reserved = 0 uint64_t dd_props_zapobj = 0x3 uint64_t dd_deleg_zapobj = 0 uint64_t dd_flags = 0x1 uint64_t [5] dd_used_breakdown = [ 0x59940, 0, 0x6ae18f2bbea0, 0, 0 ] uint64_t dd_clones = 0 uint64_t [13] dd_pad = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] }
The assertion failure is:
1393 ASSERT(delta > 0 ? 1394 dd->dd_phys->dd_used_breakdown[oldtype] >= delta : 1395 dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
from the call in:
dsl_dir_transfer_space+0xdd(ffffff2334dc1b80, 48c0, 3, 2, ffffff3b8969ea00)
So delta is arg1 (0x48c0), and oldtype/newtype are args23 (3 and 2 respectively).
uint64_t [5] dd_used_breakdown = [ 0x59940, 0, 0x6ae18f2bbea0, 0, 0 ]
We then can evaluate the assertion failure, with substitution:
1393 ASSERT([0x48c0] > 0 ? 1394 [0] >= [0x48c0] : 1395 [0x6ae18f2bbea0] >= -[0x48c0]);
Now, if only I understood this codepath enough to know where the logical flaw is...