Bug #14131

zfs crypto calls kmem_alloc while holding ARC locks needed by pageout

Added by Alex Wilson over 2 years ago. Updated 4 months ago.

Status:
New
Priority:
Normal
Category:
-
Start date:
Due date:
% Done:

0%

Estimated time:
Difficulty:
Medium
Tags:
Gerrit CR:
External Bug:

Description

Had a machine with encrypted ZFS datasets hit the pageout deadman:

pageout_deadman: stuck pushing the same page for 90 seconds (freemem is 196557)

The thread running pageout is stuck under zil_commit_waiter, and the txg_sync_thread is stuck waiting for an ARC buf hash lock:

> fffffbe361bd3c20::findstack -v
stack pointer for thread fffffbe361bd3c20 (pageout/1): fffffbe361bd3570
[ fffffbe361bd3570 _resume_from_idle+0x12b() ]
  fffffbe361bd35a0 swtch+0x133()
  fffffbe361bd35e0 cv_wait+0x68(fffffdf790ea6f48, fffffdf790ea6f50)
  fffffbe361bd3620 zil_commit_waiter+0x86(fffffdf4bed943c0, fffffdf790ea6f48)
  fffffbe361bd3650 zil_commit_impl+0x3c(fffffdf4bed943c0, 1)
  fffffbe361bd3690 zil_commit+0x3e(fffffdf4bed943c0, 1)
  fffffbe361bd3720 zvol_strategy+0x381(fffffe3a1f0a8e80)
  fffffbe361bd3750 bdev_strategy+0x64(fffffe3a1f0a8e80)
  fffffbe361bd37b0 spec_startio+0x8e(fffffdf4c2b22680, fffffbe228234f60, 1032075000, e000, 8500)
  fffffbe361bd3810 spec_pageio+0x3a(fffffdf4c2b22680, fffffbe228234f60, 1032075000, e000, 8500, fffffdf4496e7db0, 0)
  fffffbe361bd38a0 fop_pageio+0x5e(fffffdf4c2b22680, fffffbe228234f60, 1032075000, e000, 8500, fffffdf4496e7db0, 0)
  fffffbe361bd3980 swap_putapage+0x1f4(fffffdf4c231ee80, fffffbe228234f60, fffffbe361bd39b8, fffffbe361bd39c0, 8400, fffffdf4496e7db0)
  fffffbe361bd3a30 swap_putpage+0x2d1(fffffdf4c231ee80, 1fffffc457ff8000, 1000, 8400, fffffdf4496e7db0, 0)
  fffffbe361bd3ab0 fop_putpage+0x56(fffffdf4c231ee80, 1fffffc457ff8000, 1000, 8400, fffffdf4496e7db0, 0)
  fffffbe361bd3b10 pageout+0x27d()
  fffffbe361bd3b20 thread_start+0xb()

> fffffbe36252fc20::findstack -v
stack pointer for thread fffffbe36252fc20 (txg_sync_thread()): fffffbe36252f170
[ fffffbe36252f170 _resume_from_idle+0x12b() ]
  fffffbe36252f1a0 swtch+0x133()
  fffffbe36252f240 turnstile_block+0x25b(fffffdf794015520, 0, fffffffffbd8d670, fffffffffbc1b600, 0, 0)
  fffffbe36252f2b0 mutex_vector_enter+0x358(fffffffffbd8d670)
  fffffbe36252f350 arc_release+0x6b(fffffe22e4b5ea10, fffffe2040471c68)
  fffffbe36252f370 dbuf_release_bp+0x15(fffffe2040471c68)
  fffffbe36252f4c0 dbuf_write+0xab(fffffe257ac20480, fffffe22e4b5ea10, fffffeffc8742780)
  fffffbe36252f5a0 dbuf_sync_leaf+0x23b(fffffe257ac20480, fffffeffc8742780)
  fffffbe36252f5f0 dbuf_sync_list+0xb5(fffffe2566f70108, 0, fffffeffc8742780)
  fffffbe36252f640 dbuf_sync_indirect+0xb7(fffffe2566f70040, fffffeffc8742780)
  fffffbe36252f690 dbuf_sync_list+0x90(fffffeffadc3fe48, 1, fffffeffc8742780)
  fffffbe36252f6e0 dbuf_sync_indirect+0xb7(fffffeffadc3fd80, fffffeffc8742780)
  fffffbe36252f730 dbuf_sync_list+0x90(ffffff4959063408, 2, fffffeffc8742780)
  fffffbe36252f780 dbuf_sync_indirect+0xb7(ffffff4959063340, fffffeffc8742780)
  fffffbe36252f7d0 dbuf_sync_list+0x90(fffffdf4ec6126c0, 3, fffffeffc8742780)
  fffffbe36252f850 dnode_sync+0x327(fffffdf4ec612558, fffffeffc8742780)
  fffffbe36252f990 dmu_objset_sync+0x16f(fffffdf4bea82080, fffffdf7b669f110, fffffeffc8742780)
  fffffbe36252f9d0 dsl_pool_sync_mos+0x42(fffffdf4bedc8280, fffffeffc8742780)
  fffffbe36252fa80 dsl_pool_sync+0x3cb(fffffdf4bedc8280, 31c8373)
  fffffbe36252fb00 spa_sync_iterate_to_convergence+0xd0(fffffdf4bd143000, fffffe2c30954a40)
  fffffbe36252fb60 spa_sync+0x2f6(fffffdf4bd143000, 31c8373)
  fffffbe36252fc00 txg_sync_thread+0x1fd(fffffdf4bedc8280)
  fffffbe36252fc10 thread_start+0xb()

Another ZIO worker is stuck waiting on this same lock (fffffffffbd8d670):

> fffffbe360e54c20::findstack -v
stack pointer for thread fffffbe360e54c20 (zpool-rpool/123): fffffbe360e547b0
[ fffffbe360e547b0 _resume_from_idle+0x12b() ]
  fffffbe360e547e0 swtch+0x133()
  fffffbe360e54880 turnstile_block+0x25b(fffffdf794015520, 0, fffffffffbd8d670, fffffffffbc1b600, 0, 0)
  fffffbe360e548f0 mutex_vector_enter+0x358(fffffffffbd8d670)
  fffffbe360e54930 buf_hash_insert+0x4b(fffffe00dd16d3c8, fffffbe360e54950)
  fffffbe360e54990 arc_write_done+0xec(fffffea504aa8f20)
  fffffbe360e54a30 zio_done+0x402(fffffea504aa8f20)
  fffffbe360e54a60 zio_execute+0xa7(fffffea504aa8f20)
  fffffbe360e54b10 taskq_thread+0x2cd(fffffdf4bd2ce4d8)
  fffffbe360e54b20 thread_start+0xb()

Who's holding it?

> fffffffffbd8d670::mutex
            ADDR  TYPE             HELD MINSPL OLDSPL WAITERS
fffffffffbd8d670 adapt fffffbe364cc4c20      -      -     yes

> fffffbe364cc4c20::findstack -v
stack pointer for thread fffffbe364cc4c20 (receive_writer_thread()): fffffbe364cc3750
[ fffffbe364cc3750 _resume_from_idle+0x12b() ]
  fffffbe364cc3780 swtch+0x133()
  fffffbe364cc37c0 cv_wait+0x68(fffffffffbc4161c, fffffffffbc41610)
  fffffbe364cc3810 page_create_throttle+0x17c(1, 3)
  fffffbe364cc38f0 page_create_va+0x598(fffffffffbd08920, ffffffced511d000, 1000, 13, fffffbe364cc3900, ffffffced511d000)
  fffffbe364cc3980 segkmem_page_create+0x97(ffffffced511d000, 1000, 0, fffffffffbd08920)
  fffffbe364cc3a20 segkmem_xalloc+0x13f(fffffdf400e1f000, 0, 1000, 0, 0, fffffffffb8a84e0, fffffffffbd08920)
  fffffbe364cc3a90 segkmem_alloc_vn+0x3b(fffffdf400e1f000, 1000, 0, fffffffffbd08920)
  fffffbe364cc3ac0 segkmem_alloc+0x17(fffffdf400e1f000, 1000, 0)
  fffffbe364cc3bd0 vmem_xalloc+0x629(fffffdf400e20000, 1000, 1000, 0, 0, 0, 0, 0)
  fffffbe364cc3c40 vmem_alloc+0x190(fffffdf400e20000, 1000, 0)
  fffffbe364cc3cd0 kmem_slab_create+0x7c(fffffdf400f0a008, 0)
  fffffbe364cc3d30 kmem_slab_alloc+0x10b(fffffdf400f0a008, 0)
  fffffbe364cc3d90 kmem_cache_alloc+0x15b(fffffdf400f0a008, 0)
  fffffbe364cc3dd0 kmem_alloc+0x4b(dc0, 0)
  fffffbe364cc3e40 ccm_init_ctx+0x131(fffffbe364cc3ef0, fffffbe364cc4240, 0, 0, 10, fffffffff84fad30, fffffffff84fb050)
  fffffbe364cc3ed0 aes_common_init_ctx+0x2a0(fffffbe364cc3ef0, fffffe045f86fe00, fffffbe364cc4080, fffffe094ef54e88, 0, 0)
  fffffbe364cc4020 aes_decrypt_atomic+0x104(0, 0, fffffbe364cc4080, fffffe094ef54e88, fffffbe364cc4210, fffffbe364cc41e0, fffffe045f86fe00, 0)
  fffffbe364cc4190 crypto_decrypt+0x29b(fffffbe364cc41c8, fffffbe364cc4210, fffffe094ef54e88, fffffdf83e2f5b88, fffffbe364cc41e0, 0)
  fffffbe364cc4340 zio_do_crypt_uio+0x1ff(0, 5, fffffe094ef54e88, fffffdf83e2f5b88, fffffdf4f9447258, dc0, fffffbe364cc43f8, fffffbe364cc4428, fffffdf6e58d9000, fffffbe300002e20)
  fffffbe364cc44e0 zio_do_crypt_data+0x200(0, fffffe094ef54de0, a, 0, fffffdf4f9447250, fffffdf4f9447258, fffffdf4f9447264, ffffffff00004000, fffffdf59371a000, fffffdf5943f2000, fffffbe364cc4644)
  fffffbe364cc45e0 spa_do_crypt_abd+0x250(0, fffffdf4bd143000, fffffbe364cc4808, a, 0, 0, fffffdf4f9447250, fffffdf4f9447258, fffffdf4f9447264, fffffe0100004000, fffffe294e969b00, fffffe0ce4be7d40, fffffbe364cc4644)
  fffffbe364cc46a0 arc_hdr_decrypt+0xc6(fffffdf4f9447188, fffffdf4bd143000, fffffbe364cc4808)
  fffffbe364cc4710 arc_fill_hdr_crypt+0xb4(fffffdf4f9447188, fffffffffbd8d670, fffffdf4bd143000, fffffbe364cc4808, 0)
  fffffbe364cc4790 arc_buf_fill+0x1f5(fffffe05bf202cd8, fffffdf4bd143000, fffffbe364cc4808, 10)
  fffffbe364cc47e0 arc_untransform+0x36(fffffe05bf202cd8, fffffdf4bd143000, fffffbe364cc4808, 1)
  fffffbe364cc4870 dbuf_read_verify_dnode_crypt+0x105(fffffdf6bc403c48, 9)
  fffffbe364cc4930 dbuf_read_impl+0x41d(fffffdf6bc403c48, 0, 9)
  fffffbe364cc49d0 dbuf_read+0xc9(fffffdf6bc403c48, 0, 9)
  fffffbe364cc4a40 dmu_bonus_hold_by_dnode+0x8d(ffffffced84692b8, fffffffff7a6ad50, fffffbe364cc4a98, 1)
  fffffbe364cc4b80 receive_object+0x590(fffffbe3661925c0, ffffff497f9f2388, ffffff1be9a7ed00)
  fffffbe364cc4bc0 receive_process_record+0x86(fffffbe3661925c0, ffffff497f9f2380)
  fffffbe364cc4c00 receive_writer_thread+0x7d(fffffbe3661925c0)
  fffffbe364cc4c10 thread_start+0xb()

It seems that arc_fill_hdr_crypt() takes an ARC hash_lock but then calls into code that will eventually do a kmem_alloc(). The thread isn't marked T_PUSHPAGE, so in low-memory conditions it can end up waiting on pageout, but pageout needs this same lock.

Either this logic needs to mark its thread T_PUSHPAGE, or it should allocate all of this state before it takes the lock. If we go for the former approach, should we be marking every path where the ARC takes a hash_lock?
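
Schematically, the problem and the preallocation option look like this (a minimal sketch with hypothetical function names and simplified locking, not the actual OpenZFS code):

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/mutex.h>

/*
 * Hedged sketch of the failure shape -- hypothetical function names and
 * simplified locking, not the actual OpenZFS code.
 */
static void
crypt_fill_locked_sketch(kmutex_t *hash_lock, size_t ctx_size)
{
        void *ctx;

        mutex_enter(hash_lock);         /* pageout needs this lock too */

        /*
         * Deep in the decrypt path (ccm_init_ctx() in the stack above),
         * a KM_SLEEP allocation can sleep in page_create_throttle()
         * under memory pressure -- while pageout is blocked behind
         * hash_lock, and the system is deadlocked.
         */
        ctx = kmem_alloc(ctx_size, KM_SLEEP);
        /* ... decrypt using ctx ... */
        kmem_free(ctx, ctx_size);

        mutex_exit(hash_lock);
}

/*
 * The second option suggested above, also as a sketch: allocate the
 * state before taking the lock.
 */
static void
crypt_fill_prealloc_sketch(kmutex_t *hash_lock, size_t ctx_size)
{
        void *ctx = kmem_alloc(ctx_size, KM_SLEEP);     /* no locks held */

        mutex_enter(hash_lock);
        /* ... decrypt using the preallocated ctx ... */
        mutex_exit(hash_lock);

        kmem_free(ctx, ctx_size);
}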

Actions #1

Updated by Jason King over 1 year ago

Part of this is that, IIUC, one of the relevant specs calls for AEAD mechanisms to produce no output until all of the input has been validated. I'm not sure what the rationale behind the behaviour was, aside from forcing users to only be able to use plaintext that has been validated, by not letting them 'see' it.

As a result, what's happening here is that the crypto code allocates memory to hold the entire plaintext before returning any of it to the caller.

If we can relax that behaviour (which would be nice, since currently encrypting or decrypting, say, a 1GB chunk of data with aes-{ccm,gcm} means allocating another 1GB of kernel memory for the output), that should allow us to avoid the offending allocation. Given that this is crypto-related, we obviously need to be very sure of the implications before doing so, but it would be good to get a conclusion on the record either way.
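
A minimal sketch of that shape (hypothetical names and layout, not the actual kcf CCM/GCM code):

#include <sys/types.h>
#include <sys/kmem.h>

/*
 * Hedged sketch of the "no output until validated" behaviour; names
 * and layout are hypothetical, not the actual kcf CCM/GCM code.
 */
typedef struct aead_ctx_sketch {
        uint8_t *acs_ptbuf;     /* plaintext staged until the tag verifies */
        size_t  acs_ptlen;
} aead_ctx_sketch_t;

static void
aead_decrypt_init_sketch(aead_ctx_sketch_t *ctx, size_t msglen)
{
        /*
         * The entire plaintext is buffered before any of it is released
         * to the caller: decrypting a 1GB message means allocating
         * another 1GB here.  This is the kmem_alloc() that lands in
         * page_create_throttle() in the receive_writer_thread stack.
         */
        ctx->acs_ptbuf = kmem_alloc(msglen, KM_SLEEP);
        ctx->acs_ptlen = msglen;
}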

Actions #2

Updated by Bill Sommerfeld over 1 year ago

> I'm not sure what the rationale behind the behaviour was, aside from forcing users to only be able to use plaintext that has been validated, by not letting them 'see' it.

It's also conservative crypto design for other reasons -- sloppiness about revealing information derived from failed validation has enabled attacks on the underlying key management, algorithms, and/or modes in use. For instance, revealing plaintext from a botched decryption might enable adaptive chosen-ciphertext attacks (see https://en.wikipedia.org/wiki/Adaptive_chosen-ciphertext_attack), and there is also the padding oracle attack (https://en.wikipedia.org/wiki/Padding_oracle_attack).

Actions #3

Updated by Thirteen Oxide 11 months ago

Another alternative, much more invasive but also breathtakingly general, would be to make T_PUSHPAGE heritable through mutexes. Just as thread priority is heritable for scheduling, so too would PUSHPAGE be heritable for allocation, either in page_create_throttle or some other suitable manner. If we want exotic swap devices (personally I don't think it's worth it), this is probably the only general solution. It should at least get us out of these deadlocks and to either a failed operation or a panic. Note that it's not as simple as priority inheritance because the thread that needs to get memory may already be asleep; an implicit assumption here is that it's sleeping on one of some very small number of CVs (maybe just freemem_cv, but I'm sure it won't be this simple).
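
A very rough sketch of what such inheritance might look like, with entirely hypothetical functions and with the locking and wakeup details hand-waved:

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/mutex.h>

/*
 * Entirely hypothetical sketch: when a T_PUSHPAGE thread blocks on a
 * mutex, lend the flag to the owner, much as turnstiles lend
 * scheduling priority today.  Synchronization around t_flag and the
 * wakeup of an owner already asleep (e.g., on freemem_cv) are
 * hand-waved -- that wakeup is exactly the hard part noted above.
 */
static void
pushpage_inherit_sketch(kmutex_t *mp, kthread_t *waiter)
{
        kthread_t *owner = mutex_owner(mp);

        if (owner != NULL && (waiter->t_flag & T_PUSHPAGE) &&
            !(owner->t_flag & T_PUSHPAGE)) {
                owner->t_flag |= T_PUSHPAGE;    /* a loan, not a grant */
                /*
                 * If the owner is already blocked in
                 * page_create_throttle(), it must also be woken so it
                 * can retry against the reserve; the flag would be
                 * returned when the mutex is released.
                 */
        }
}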

The options noted in the original ticket have their own problems. Marking every thread that wants to do an arc_read(), or even an arc_fill_hdr_crypt(), as T_PUSHPAGE would seem to make the flag largely meaningless (if everything is top priority, nothing is). Preallocating might be possible, but we're a long way from the actual allocation and may not have the necessary context; moreover, isn't it likely that there are or will be other places where this is a problem? Put another way, how can a programmer know whether a KM_SLEEP is really allowed? It seems very easy to write code almost anywhere that might run in user or kernel context, is not itself part of pageout or providing a swap device, yet could end up in some long blocking chain holding up something that does. This is a relatively modest example, but it's hard to believe there aren't others that are even more difficult to anticipate. That's exactly why we have priority inheritance.

Another real example, FWIW:

...
  fffff78825ef0ce0 kmem_alloc+0x4a(20000, 0)
  fffff78825ef0d50 gcm_mode_decrypt_contiguous_blocks+0x7d(fffff78825ef0e40, fffffd00c71c7000, 20000, fffff78825ef1130, 10, fffffffff7dbaf10, fffffffff7dbb110, fffffffff7dbb260)
  fffff78825ef0d90 aes_decrypt_contiguous_blocks+0xe3(fffff78825ef0e40, fffffd00c71c7000, 20000, fffff78825ef1130)
  fffff78825ef0e20 crypto_update_uio+0x114(fffff78825ef0e40, fffff78825ef1160, fffff78825ef1130, fffffffff7dbb3f0, fffffffff7dbb1c0)
  fffff78825ef0f70 aes_decrypt_atomic+0x2b0(0, 0, fffff78825ef0fd0, fffffcfaa2bb15c8, fffff78825ef1160, fffff78825ef1130, fffffcfae99ab600, 0)
  fffff78825ef10e0 crypto_decrypt+0x2a0(fffff78825ef1118, fffff78825ef1160, fffffcfaa2bb15c8, fffffcfa896c3de0, fffff78825ef1130, 0)
  fffff78825ef1290 zio_do_crypt_uio+0x205(0, 8, fffffcfaa2bb15c8, fffffcfa896c3de0, fffffcfad60160f8, 20000, fffff78825ef1338, fffff78825ef1368, 0, fffff78800000000)
  fffff78825ef1420 zio_do_crypt_data+0x1da(0, fffffcfaa2bb1520, 13, 0, fffffcfad60160f0, fffffcfad60160f8, fffffcfad6016104, 20000, fffffd30495f8000, fffffd00c71c7000, fffff78825ef1584)
  fffff78825ef1520 spa_do_crypt_abd+0x241(0, fffffcfa52fe5000, fffff78825ef18e8, 13, 0, 0, fffffcfad60160f0, fffffcfad60160f8, fffffcfad6016104, fffffcf900020000, fffffcfbb4f7dbc0, fffffcfad0f7aa00, fffff78825ef1584)
  fffff78825ef15e0 arc_hdr_decrypt+0xcc(fffffcfad6016020, fffffcfa52fe5000, fffff78825ef18e8)
  fffff78825ef1660 arc_fill_hdr_crypt+0xb2(fffffcfad6016020, 0, fffffcfa52fe5000, fffff78825ef18e8, 0)
  fffff78825ef16e0 arc_buf_fill+0x1a1(fffffcfb59830df8, fffffcfa52fe5000, fffff78825ef18e8, 1)
  fffff78825ef1760 arc_buf_alloc_impl+0x1ab(fffffcfad6016020, fffffcfa52fe5000, fffff78825ef18e8, fffffcfaf2a8f7d8, 0, 0, fffffcfa00000000, fffffcff00000001, fffff78825ef1820)
  fffff78825ef1890 arc_read+0xe8e(fffffd16810b3250, fffffcfa52fe5000, fffff78825ef1908, fffffffff7e5ba40, fffffcfaf2a8f7d8, 0, 80, fffff78825ef18e4, fffff78825ef18e8)
...

Thoughts?

Actions #4

Updated by Joshua M. Clulow 11 months ago

I think this problem is similar to priority inversion, but indeed not quite the same. For example, by making all arc_read() threads T_PUSHPAGE, as we already do for all ZIO pipeline executor threads, you're not actually increasing their priority as such -- you're just allowing them to dip into the pageout reserve pool for allocations. That pool is really just a bang-bang threshold, though: if freemem is under it, we block, and if not, we don't.
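
Roughly, as a sketch of just the threshold check (the real page_create_throttle() logic is more involved):

#include <sys/types.h>

/*
 * Hedged sketch of the "bang-bang" behaviour described above; the real
 * logic lives in page_create_throttle() and is more involved.
 */
extern pgcnt_t freemem;         /* system-wide free page count */
extern pgcnt_t throttlefree;    /* threshold guarding the reserve pool */

static boolean_t
allocation_may_proceed_sketch(boolean_t is_pushpage)
{
        /*
         * T_PUSHPAGE threads may dip into the reserve so that pageout
         * can make progress; everyone else blocks until freemem rises
         * back above the threshold.
         */
        if (is_pushpage)
                return (B_TRUE);
        return (freemem > throttlefree ? B_TRUE : B_FALSE);
}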

If the system is not already under catastrophic pressure it's not like you are imbued with an increased priority in any other sense: nothing is particularly different. If you do hit catastrophic pressure, as currently laid out that allocation does need to succeed.

It would be better to preallocate before taking the locks, but I can't really imagine how the situation would be worse than it is now if we grant pushpage powers to all holders of ARC hash locks: either we weren't under pressure and it isn't a material difference, or we were, and they should have been allowed to move forward probably.

Actions #5

Updated by Joshua M. Clulow 11 months ago

  • Assignee set to Joshua M. Clulow

Actions #6

Updated by Thirteen Oxide 8 months ago

Joshua M. Clulow wrote in #note-4:

> I think this problem is similar to priority inversion, but indeed not quite the same. For example, by making all arc_read() threads T_PUSHPAGE, as we already do for all ZIO pipeline executor threads, you're not actually increasing their priority as such -- you're just allowing them to dip into the pageout reserve pool for allocations. That pool is really just a bang-bang threshold, though: if freemem is under it, we block, and if not, we don't.

Yes, perhaps I wrote sloppily; I should have been clearer that I understand that this has almost nothing to do with scheduling or priority in the usual sense. You're correct that what I'm talking about is access to this reserved pool of pages. That is a kind of privilege, in the same way that getting onto CPU ahead of other threads is a kind of privilege, but this kind really has nothing to do with getting onto CPU other than that a particular allocation path might fail (or succeed) instead of blocking. That's very important, but since no amount of priority is going to get a blocked thread to run, it's not really a matter of scheduling priority as such. I was merely attempting to reason about the problem by analogy with another spectacularly successful technology that solves a somewhat similar class of problems. It might have been better to say that if everyone has access to the reserved pool, the pool isn't reserved at all.

> If the system is not already under catastrophic pressure it's not like you are imbued with an increased priority in any other sense: nothing is particularly different. If you do hit catastrophic pressure, as currently laid out that allocation does need to succeed.

> It would be better to preallocate before taking the locks, but I can't really imagine how the situation would be worse than it is now if we grant pushpage powers to all holders of ARC hash locks: either we weren't under pressure and it isn't a material difference, or we were, and they should have been allowed to move forward probably.

The problem is twofold:

  1. T_PUSHPAGE is a gigantic hammer. Every allocation made by a thread with this flag set can potentially come out of the reserved pool, even those that aren't required by pageout. This increases the likelihood that when an allocation that really is required by pageout is attempted, there won't be anything left in the pool.
  2. There is no obvious bound here; it looks an awful lot like whack-a-mole. Thread A might be a ZIO executor with T_PUSHPAGE set, blocked on thread B, which has nothing to do with pageout or even with ZIO, doesn't get T_PUSHPAGE, and is now blocked on allocation. So we file another bug to set T_PUSHPAGE when we create thread B, because we happen to know about this particular dependency chain. Enter thread C; lather, rinse, and repeat -- and each time we set T_PUSHPAGE somewhere else, see problem (1).

If the assertion you're making here is that (1) isn't a real problem and we should just do away with the reserved pool altogether, that would certainly present a simpler solution than anything else under consideration. Perhaps it would be interesting to try that, but it addresses only the case in this bug, not the cases where we simply run out of memory trying to push a page and trigger the deadman. Otherwise, I don't really see how this isn't just another game of whack-a-mole or how we're supposed to win at it. What's especially frustrating is that this problem was very well understood many years ago when swap-to-zvol was introduced, and a comprehensive and complete solution to it should have been a precondition to the introduction of that feature.

While I appreciate the intent behind T_PUSHPAGE to ameliorate some of the worst manifestations of the problem, it's very clear that it is not that comprehensive solution. While I've suggested an approach that might be, I haven't implemented it (it appears from here to be a career-defining project, with a doctoral-level paper to go with it), and I've seen no alternative proposal with the necessary attributes.

In the absence of such a solution, I believe we should deprecate swap-to-zvol: add a new -Z flag to swap, make swap -a fail when given a zvol without -Z, and print a warning when -Z is used that this configuration is likely to result in deadlock. If a zvol is listed as a swap device in /etc/vfstab, the entry should be ignored (with an error message generated by /sbin/swapadd) unless a new mount option (suggest deadlocks_considered_desirable) is specified, in which case it emits only the same deprecation and deadlock warning and adds the swap device.

After 10 years, if a comprehensive solution has not been implemented, swap-to-zvol should be removed. That provides ample time to develop and test such a solution if desired, at which point the additional options can be removed from the documentation and ignored by these utilities, and the error and warning messages deleted.

This is harsh. It will probably make no one happy. But what we have right now is behaviour consistently worse than whatever problems swap-to-zvol was intended to solve. It could be a wonderful feature, totally eliminating (at least on non-PCs, which don't need an EFI system partition) the need for slices on block devices and for any storage architecture other than a single ZFS pool. What we have isn't that, and our documentation leads operators to think it is.

Band-Aids haven't worked; it's time to amputate and cauterise to save the patient.

Actions #7

Updated by Joshua M. Clulow 8 months ago

I think the situation is substantially less black-and-white, and indeed less dire, than you're suggesting. From my perspective, the rough narrative of what has transpired in the last decade is:

  • swapping to ZFS volumes did actually work pretty well once upon a time
  • we started importing some changes from ZFS-on-Linux/OpenZFS, where KM_PUSHPAGE is not something that really exists, because swap to a ZFS volume is not something that works on Linux for whatever reason; this has included a variety of things like encryption which are, in some paths, a bit too keen to allocate, especially under locks that pageout needs in order to make progress
  • around 2019 I started using more classical install-to-disk distributions (e.g., OmniOS) in smaller virtual machines, and I found that certain software builds (e.g., the Python test suite) would routinely hang up the machine; this led to my introducing the pageout deadman in #13082 -- but I believe most or all of this instability is the result of new work ported from ZOL, not ancient lingering defects
  • armed with the tools to get crash dumps from pageout deadlock, I was able to chase down a number of issues, e.g., #13094 and #13096, that were contributing to exhaustion and deadlocks
  • beyond that there was some tuning of the VM thresholds in #13097 and then a substantial improvement from T_PUSHPAGE in #13092
  • I just put back, finally, #13095 that was also in that series of work originally, which I expect to help under pressure as well

There is more outstanding work to do, obviously, but I don't think it's whack-a-mole any more than other kernel work is. The most critical issue I see is that we imported a lot of work from a foreign platform that doesn't swap to ZFS, when really we should have thought more about that at the time. There are obviously more areas we've missed, which is what this defect represents; we should fix it and continue on.

There is also other outstanding follow-up that needs more thought but is somewhat promising, in the form of #13093, where we could avoid allocations that aren't strictly needed under memory pressure, to improve resilience.

I think we should fix the specific defect here, and we should likely perform some inspection or analysis to find other similar lingering issues. Having put back #13095, this is next on my list of ZFS things to poke at.

Actions #8

Updated by Thirteen Oxide 4 months ago

Another instance, similar to the others yet different:

> $C
fffff8673046ea50 vpanic()
fffff8673046ea60 pageout_deadman+0x62()
fffff8673046ead0 clock+0x7b3()
fffff8673046eb60 cyclic_softint+0xe1(fffffffffbe38000, 1)
fffff8673046eb80 cbe_softclock+0x23(0, 0)
fffff8673046ebd0 av_dispatch_softvect+0x72(a)
fffff8673046ec00 apix_dispatch_softint+0x35(0, 0)
fffff867363ed840 switch_sp_and_call+0x15()
fffff867363ed890 apix_do_softint+0x5a(fffff867363ed900)
fffff867363ed8f0 apix_do_interrupt+0x2bf(fffff867363ed900, 2)
fffff867363ed900 _interrupt+0xc3()
fffff867363eda40 checkpage+0x2d(fffff8627d60bec8, 2)
fffff867363edb10 pageout_scanner+0x203(d)
fffff867363edb20 thread_start+0xb()
> ::stacks -c pageout
THREAD           STATE    SOBJ                COUNT
fffff867324eac20 SLEEP    CV                      1
                 swtch+0x139
                 cv_wait+0x70
                 txg_wait_synced_impl+0x9c
                 txg_wait_synced+0x17
                 dmu_tx_wait+0x14c
                 dmu_tx_assign+0x4b
                 zvol_strategy+0x1f6
                 bdev_strategy+0x5c
                 spec_startio+0x8e
                 spec_pageio+0x20
                 fop_pageio+0x43
                 swap_putapage+0x1e6
                 swap_putpage+0x26d
                 fop_putpage+0x52
                 pageout+0x266
                 thread_start+0xb
> ::zio_state ! grep -v OPEN
ADDRESS                 TYPE  STAGE            WAITER           TIME_ELAPSED
fffffd3aa09efdc8        NULL  CHECKSUM_VERIFY  fffff8673b9c4c20 -
> fffff8673b9c4c20::findstack -v
stack pointer for thread fffff8673b9c4c20 (txg_sync_thread()): fffff8673b9c4930
[ fffff8673b9c4930 _resume_from_idle+0x12b() ]
  fffff8673b9c4960 swtch+0x139()
  fffff8673b9c4990 cv_wait+0x70(fffffd3aa09f0190, fffffd3aa09f0188)
  fffff8673b9c49d0 zio_wait+0x6b(fffffd3aa09efdc8)
  fffff8673b9c4a80 dsl_pool_sync+0xc5(fffffd39c56af040, 38002c)
  fffff8673b9c4b00 spa_sync_iterate_to_convergence+0xe0(fffffd39c6b6d000, fffffd39c8ecce40)
  fffff8673b9c4b60 spa_sync+0x286(fffffd39c6b6d000, 38002c)
  fffff8673b9c4c00 txg_sync_thread+0x1e7(fffffd39c56af040)
  fffff8673b9c4c10 thread_start+0xb()
> ::stacks -m zfs
THREAD           STATE    SOBJ                COUNT
fffffd39cbda63c0 SLEEP    CV                     50
                 swtch+0x139
                 cv_wait+0x70
                 page_create_throttle+0x17a
                 page_create_va+0xc7
                 pvn_read_kluster+0xc8
                 zfs_fillpage+0x1c7
                 zfs_getpage+0x1ec
                 fop_getpage+0x66
                 segvn_fault+0xff8
                 as_fault+0x1b2
                 pagefault+0x9d
                 trap+0x100a

fffff86730ab2c20 SLEEP    CV                     27
                 swtch+0x139
                 cv_wait+0x70
                 zthr_procedure+0x57
                 thread_start+0xb

fffff8673f30fc20 SLEEP    MUTEX                  25
                 swtch+0x139
                 turnstile_block+0x279
                 mutex_vector_enter+0x378
                 buf_hash_insert+0x4b
                 arc_write_done+0x104
                 zio_done+0x414
                 zio_execute+0xa7
                 taskq_thread+0x2a6
                 thread_start+0xb
...
fffffd39e04ee3e0 SLEEP    CV                      4
                 swtch+0x139
                 cv_wait+0x70
                 page_create_throttle+0x17a
                 page_create_va+0xc7
                 zfs_fillpage+0x86
                 zfs_getpage+0x1ec
                 fop_getpage+0x66
                 segvn_fault+0xff8
                 as_fault+0x1b2
                 pagefault+0x9d
                 trap+0x100a

fffff8673f2fdc20 SLEEP    MUTEX                   3
                 swtch+0x139
                 turnstile_block+0x279
                 mutex_vector_enter+0x378
> fffff8673f30fc20::findstack -v
stack pointer for thread fffff8673f30fc20 (zpool-oxp_6bfb41/136): fffff8673f30f7b0
[ fffff8673f30f7b0 _resume_from_idle+0x12b() ]
  fffff8673f30f7e0 swtch+0x139()
  fffff8673f30f880 turnstile_block+0x279(fffffd3a041a1888, 0, ffffffffc00425d0, fffffffffbe175c0, 0, 0)
  fffff8673f30f8f0 mutex_vector_enter+0x378(ffffffffc00425d0)
  fffff8673f30f930 buf_hash_insert+0x4b(fffffd3a382755f0, fffff8673f30f940)
  fffff8673f30f990 arc_write_done+0x104(fffffd481047e4e0)
  fffff8673f30fa30 zio_done+0x414(fffffd481047e4e0)
  fffff8673f30fa60 zio_execute+0xa7(fffffd481047e4e0)
  fffff8673f30fb10 taskq_thread+0x2a6(fffffd39df91f680)
  fffff8673f30fb20 thread_start+0xb()
> ffffffffc00425d0::mutex
            ADDR  TYPE             HELD MINSPL OLDSPL WAITERS
ffffffffc00425d0 adapt fffffd3b54218000      -      -     yes
> fffffd3b54218000::findstack -v
stack pointer for thread fffffd3b54218000 (crucible-downsta/126 [tokio-runtime-worker]): fffff86768fd8f60
[ fffff86768fd8f60 _resume_from_idle+0x12b() ]
  fffff86768fd8f90 swtch+0x139()
  fffff86768fd8fc0 cv_wait+0x70(fffffffffbe96cd4, fffffffffbe96cc8)
  fffff86768fd9000 page_create_throttle+0x17a(1, 3)
  fffff86768fd90e0 page_create_va+0xc7(fffffffffc265300, fffffd39e4f3d000, 1000, 13, fffff86768fd90f0, fffffd39e4f3d000)
  fffff86768fd9170 segkmem_page_create+0xa3(fffffd39e4f3d000, 1000, 0, fffffffffc265300)
  fffff86768fd9210 segkmem_xalloc+0x150(fffffd37fbc1f000, 0, 1000, 0, 0, fffffffffbcabb60, fffffffffc265300)
  fffff86768fd9280 segkmem_alloc_vn+0x3c(fffffd37fbc1f000, 1000, 0, fffffffffc265300)
  fffff86768fd92b0 segkmem_alloc+0x20(fffffd37fbc1f000, 1000, 0)
  fffff86768fd93d0 vmem_xalloc+0x4e9(fffffd37fbc20000, 1000, 1000, 0, 0, 0, 0, 0)
  fffff86768fd9440 vmem_alloc+0x139(fffffd37fbc20000, 1000, 0)
  fffff86768fd94d0 kmem_slab_create+0x78(fffffd37fbc7c008, 0)
  fffff86768fd9520 kmem_slab_alloc+0x10b(fffffd37fbc7c008, 0)
  fffff86768fd9570 kmem_cache_alloc+0x15b(fffffd37fbc7c008, 0)
  fffff86768fd95b0 kmem_zalloc+0x4a(60, 0)
  fffff86768fd96e0 arc_read+0x359(fffffd3ac0eae9e8, fffffd39c4baa000, fffff86768fd97d8, fffffffff7e613f0, fffffd3ae7ff2038, 2, ffffffff00000180, fffff86768fd9714, fffffd3ae7ff2040)
  fffff86768fd9730 dbuf_issue_final_prefetch+0xa2(fffffd3ae7ff2038, fffff86768fd97d8)
  fffff86768fd98c0 dbuf_prefetch_impl+0x4ee(fffffd39ce283aa0, 0, 40, 2, 20, fffffffff7e9fad0, fffffd3a27b87aa8)
  fffff86768fd9980 dmu_zfetch+0x28c(fffffd39ce283d18, 6, 2, 1, 1)
  fffff86768fd9a30 dmu_buf_hold_array_by_dnode+0x2e6(fffffd39ce283aa0, c2000, 20000, 1, fffffffff7f7f020, fffff86768fd9a6c, fffff86768fd9a70, fffffd3900000000)
  fffff86768fd9ad0 dmu_read_uio_dnode+0x54(fffffd39ce283aa0, fffff86768fd9c88, 20000)
  fffff86768fd9b20 dmu_read_uio_dbuf+0x51(fffffd3a56615df8, fffff86768fd9c88, 20000)
  fffff86768fd9bc0 zfs_read+0x178(fffffd3a16f7e240, fffff86768fd9c88, 0, fffffd3a51672688, 0)
  fffff86768fd9c40 fop_read+0x5d(fffffd3a16f7e240, fffff86768fd9c88, 0, fffffd3a51672688, 0)
  fffff86768fd9f00 preadv+0x261(41b, ff6b11d0, 100, c2000, 0)
  fffff86768fd9f10 sys_syscall+0x17d()

There's not much more to be said, as this is simply yet another instance/variant of the same thing, one which illustrates the whack-a-mole concept nicely.
