OS-4112: stack overflow from promisc callbacks

Details

Issue Type:Bug
Priority:3 - Elevated
Status:Resolved
Created at:2015-03-26T23:59:03.000Z
Updated at:2016-05-02T16:04:54.000Z

People

Created by:Former user
Reported by:Former user
Assigned to:Former user

Resolution

Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2015-04-02T23:31:18.000Z)

Fix Versions

2015-04-16 Mars Climate Orbiter (Release Date: 2015-04-16)

Description

We had a system with a deep stack that led to an overflow:

> $C
fffff001e8de2010 plcnt_inc_dec+0xf()
fffff001e8de20b0 page_ctr_sub_internal+0x65(1, 4, fffff001d54537a8, 1)
fffff001e8de2110 page_ctr_sub+0x76(1, 4, fffff001d54537a8, 1)
fffff001e8de2230 page_get_mnode_freelist+0x32c(1, 5a, 4, 0, 2020009)
fffff001e8de2300 page_get_freelist+0x16d(fffffffffbcf14c0, fffff04398818000, fffff001e8de23f0, fffff04398818000, 1000, 9, fffffffffbc11708)
fffff001e8de23e0 page_create_va+0x2ad(fffffffffbcf14c0, fffff04398818000, 3000, 11, fffff001e8de23f0, fffff04398818000)
fffff001e8de2470 segkmem_page_create+0x97(fffff04398818000, 3000, 1, fffffffffbcf14c0)
fffff001e8de2510 segkmem_xalloc+0x8b(fffff042a5a1f000, 0, 3000, 1, 0, fffffffffb886650, fffffffffbcf14c0)
fffff001e8de2570 segkmem_alloc_vn+0x4a(fffff042a5a1f000, 3000, 1, fffffffffbcf14c0)
fffff001e8de25a0 segkmem_alloc+0x20(fffff042a5a1f000, 3000, 1)
fffff001e8de26d0 vmem_xalloc+0x5b1(fffff042a5a20000, 3000, 1000, 0, 0, 0, 0, baddcafe00000001)
fffff001e8de2740 vmem_alloc+0x135(fffff042a5a20000, 3000, 1)
fffff001e8de27d0 kmem_slab_create+0x8d(fffff042a5fc8008, 1)
fffff001e8de2830 kmem_slab_alloc+0x11e(fffff042a5fc8008, 1)
fffff001e8de2890 kmem_cache_alloc+0x233(fffff042a5fc8008, 1)
fffff001e8de28d0 allocb+0x9e(2f10, 0)
fffff001e8de2920 allocb_tmpl+0x2a(2f10, fffff0431c994e60)
fffff001e8de2980 copyb+0x64(fffff0431c994e60)
fffff001e8de29c0 copymsg+0x6d(fffff04330aa8160)
fffff001e8de2a10 mac_promisc_dispatch_one+0x43(fffff043978b4c48, fffff04330aa8160, 1)
fffff001e8de2a90 mac_promisc_dispatch+0xa7(fffff0431cfe4008, fffff04330aa8160, fffff0433fa22448)
fffff001e8de2b70 mac_tx_send+0x32a(fffff0433fa22448, fffff0436ea2f1b0, fffff04330aa8160, fffff001e8de2b90)
fffff001e8de2c10 mac_tx_soft_ring_process+0x79(fffff0438ca6cd80, fffff04330aa8160, 0, 0)
fffff001e8de2c80 mac_tx_aggr_mode+0x7c(fffff0436ec57340, fffff04330aa8160, 865df9b, 0, 0)
fffff001e8de2d30 mac_tx+0xda(fffff0433fa22448, fffff04330aa8160, 865df9b, 0, 0)
fffff001e8de2d80 str_mdata_fastpath_put+0x53(fffff0436f03fbe8, fffff04330aa8160, 865df9b, 0)
fffff001e8de2e90 ip_xmit+0x94f(fffff04330aa8160, fffff042dfd7ad20, 80006080, 8a, 865df9b, 0, 0, fffff0432ef46ca8)
fffff001e8de30e0 ire_send_wire_v4+0x3e9(fffff0439eacf740, fffff04330aa8160, fffff0432ec025e4, fffff0432ef46b40, fffff042ea654ef0)
fffff001e8de3160 conn_ip_output+0x190(fffff04330aa8160, fffff0432ef46b40)
fffff001e8de3250 udp_output_lastdst+0x106(fffff0432efcdac0, fffff043461440e0, fffff042dfd05db0, 0, fffff0432ef46b40)
fffff001e8de3300 udp_send+0x56b(fffff0432efcdac0, fffff043461440e0, fffff001e8de3670, fffff042dfd05db0)
fffff001e8de33a0 so_sendmblk_impl+0x192(fffff043a69465a0, fffff001e8de3670, 0, fffff042dfd05db0, fffff001e8de3538, 0, fffff00100000000)
fffff001e8de3430 so_sendmblk+0xb3(fffff043a69465a0, fffff001e8de3670, 0, fffff042dfd05db0, fffff001e8de3538)
fffff001e8de3480 socket_sendmblk+0x23(fffff043a69465a0, fffff001e8de3670, 0, fffff042dfd05db0, fffff001e8de3538)
fffff001e8de3520 ksocket_sendmblk+0xed(fffff043a69465a0, fffff001e8de3670, 0, fffff001e8de3538, fffff042dfd05db0)
fffff001e8de3560 overlay_mux_tx+0x2e(fffff043a0253400, fffff001e8de3670, fffff043461440e0)
fffff001e8de3700 overlay_m_tx+0x123(fffff043a5e209c0, fffff0431c994e60)
fffff001e8de37b0 mac_tx+0x594(fffff043aa778530, fffff0431c994e60, 0, 0, 0)
fffff001e8de3800 str_mdata_fastpath_put+0x53(fffff043a04d04d0, fffff0431c994e60, 0, 0)
fffff001e8de3910 ip_xmit+0x94f(fffff0431c994e60, fffff042dfd7ab68, 80006840, 54, 0, 0, fffff00100000000, fffff001e8de3e18)
fffff001e8de3b60 ire_send_wire_v4+0x3e9(fffff0439eacf8a0, fffff0431c994e60, fffff043acc340fa, fffff001e8de3cb0, fffff042ea654608)
fffff001e8de3c20 ip_output_simple_v4+0x366(fffff0431c994e60, fffff001e8de3cb0)
fffff001e8de3c90 ip_output_simple+0x15c(fffff0431c994e60, fffff001e8de3cb0)
fffff001e8de3e90 icmp_send_reply_v4+0x15c(fffff0431c994e60, fffff043acc340fa, fffff043acc3410e, fffff001e8de4230)
fffff001e8de3f10 icmp_inbound_v4+0x5fb(fffff0431c994e60, fffff001e8de4230)
fffff001e8de3ff0 ip_fanout_v4+0xe88(fffff0431c994e60, fffff043acc340fa, fffff001e8de4230)
fffff001e8de4060 ip_input_local_v4+0x16e(fffff043a1c045d0, fffff0431c994e60, fffff043acc340fa, fffff001e8de4230)
fffff001e8de40d0 ire_recv_local_v4+0x132(fffff043a1c045d0, fffff0431c994e60, fffff043acc340fa, fffff001e8de4230)
fffff001e8de41f0 ill_input_short_v4+0x4d6(fffff0431c994e60, fffff043acc340fa, fffff043acc3410a, fffff001e8de4230, fffff001e8de43c0)
fffff001e8de4440 ip_input_common_v4+0x372(fffff043b85048a8, 0, fffff0431c994e60, fffff001e8de44d0, 0, 0, 0)
fffff001e8de4480 ip_input+0x2b(fffff043b85048a8, 0, fffff0431c994e60, fffff001e8de44d0)
fffff001e8de4590 i_dls_link_rx+0x1cd(fffff043aa760d80, 0, fffff0431c994e60, 0)
fffff001e8de45e0 mac_rx_deliver+0x37(fffff043aa778530, 0, fffff0431c994e60, 0)
fffff001e8de4670 mac_rx_soft_ring_process+0x19a(fffff043aa778530, fffff043aa763780, fffff0431c994e60, fffff0431c994e60, 1, 0)
fffff001e8de4f20 mac_rx_srs_fanout+0x3b2(fffff0436ec50000, fffff0431c994e60)
fffff001e8de4fa0 mac_rx_srs_drain+0x256(fffff0436ec50000, 800)
fffff001e8de5030 mac_rx_srs_process+0x3ce(fffff0438caeabb8, fffff0436ec50000, fffff0431c994e60, 0)
fffff001e8de5090 mac_rx_common+0x143(fffff0438caeabb8, 0, fffff0431c994e60)
fffff001e8de50f0 mac_rx+0xb6(fffff0438caeabb8, 0, fffff0431c994e60)
fffff001e8de5310 overlay_mux_recv+0x2c8(fffff043a69465a0, fffff04330aa8160, 6e, 0, fffff043a0253400)
fffff001e8de53b0 so_queue_msg_impl+0x127(fffff043a69465a0, fffff04330aa8160, 6e, 0, fffff001e8de541c, 0, 0)
fffff001e8de5400 so_queue_msg+0x30(fffff043a69465a0, fffff04330aa8160, 6e, 0, fffff001e8de541c, 0)
fffff001e8de5470 udp_ulp_recv+0xa3(fffff0432efcdac0, fffff04330aa8160, 6e, fffff001e8de58e0)
fffff001e8de55c0 udp_input+0x3a2(fffff0432efcdac0, fffff0431c994e60, 0, fffff001e8de58e0)
fffff001e8de56a0 ip_fanout_v4+0xd29(fffff0431c994e60, fffff043acc340c4, fffff001e8de58e0)
fffff001e8de5710 ip_input_local_v4+0x16e(fffff0438ca70430, fffff0431c994e60, fffff043acc340c4, fffff001e8de58e0)
fffff001e8de5780 ire_recv_local_v4+0x132(fffff0438ca70430, fffff0431c994e60, fffff043acc340c4, fffff001e8de58e0)
fffff001e8de58a0 ill_input_short_v4+0x4d6(fffff0431c994e60, fffff043acc340c4, fffff043acc340d4, fffff001e8de58e0, fffff001e8de5a70)
fffff001e8de5af0 ip_input_common_v4+0x372(fffff0431eff28e8, 0, fffff0431c994e60, fffff001e8de5b80, 0, 0, 0)
fffff001e8de5b30 ip_input+0x2b(fffff0431eff28e8, 0, fffff0431c994e60, fffff001e8de5b80)
fffff001e8de5c40 i_dls_link_rx+0x1cd(fffff0431cfc62b8, 0, fffff0431c994e60, 0)
fffff001e8de5c90 mac_rx_deliver+0x37(fffff0433fa22448, 0, fffff0431c994e60, 0)
fffff001e8de5d20 mac_rx_soft_ring_process+0x19a(fffff0433fa22448, fffff0438cad9c80, fffff0431c994e60, fffff0431c994e60, 1, 0)
fffff001e8de65d0 mac_rx_srs_fanout+0x3b2(fffff0436ec56680, fffff0431c994e60)
fffff001e8de6650 mac_rx_srs_drain+0x256(fffff0436ec56680, 800)
fffff001e8de66e0 mac_rx_srs_process+0x3ce(fffff0431cfe4008, fffff0436ec56680, fffff0431c994e60, 0)
fffff001e8de6730 mac_rx_classify+0x129(fffff0431cfe4008, 0, fffff0431c994e60)
fffff001e8de67a0 mac_rx_flow+0x63(fffff0431cfe4008, 0, fffff0431c994e60)
fffff001e8de6800 mac_rx_common+0x196(fffff0431cfe4008, 0, fffff0431c994e60)
fffff001e8de6860 mac_rx+0xb6(fffff0431cfe4008, 0, fffff0431c994e60)
fffff001e8de6890 aggr_mac_rx+0x2d(fffff0431cfe4008, 0, fffff0431c994e60)
fffff001e8de6910 aggr_recv_path_cb+0x1bf(fffff0433e4f33e8, 0, fffff0431c994e60, 0, 1)
fffff001e8de6940 aggr_recv_promisc_cb+0x23(fffff0433e4f33e8, 0, fffff0431c994e60, 0)
fffff001e8de6990 mac_promisc_dispatch_one+0x69(fffff043978b4d28, fffff043570e0180, 0)
fffff001e8de6a10 mac_promisc_dispatch+0xa7(fffff0431cff4bb0, fffff043570e0180, 0)
fffff001e8de6a70 mac_rx_common+0x45(fffff0431cff4bb0, fffff0431cfce608, fffff043570e0180)
fffff001e8de6ad0 mac_rx+0xb6(fffff0431cff4bb0, fffff0431cfce608, fffff043570e0180)
fffff001e8de6b10 mac_rx_ring+0x2b(fffff0431cff4bb0, fffff0431cfce608, fffff043570e0180, 1)
fffff001e8de6b50 ixgbe_intr_rx_work+0x5c(fffff0431c47c280)
fffff001e8de6b90 ixgbe_intr_msix+0x57(fffff0431c8bd6b0, 0)
fffff001e8de6be0 apix_dispatch_by_vector+0x8c(20)
fffff001e8de6c20 apix_dispatch_lowlevel+0x25(20, 0)
fffff001e8db0a60 switch_sp_and_call+0x13()
fffff001e8db0ac0 apix_do_interrupt+0x387(fffff001e8db0ad0, 0)
fffff001e8db0ad0 _interrupt+0xba()
fffff001e8db0bc0 i86_mwait+0xd()
fffff001e8db0c00 cpu_idle_mwait+0x109()
fffff001e8db0c20 idle+0xa7()
fffff001e8db0c30 thread_start+8()

That's 99 frames. If we look at what's heavy, we'll see that the biggest frames are the mac_rx_srs_fanout frames, at about 2k each. It's not immediately obvious what the best solution here will be, but perhaps a stack check and pulling a split stack will be the best answer.

Comments

Comment by Former user
Created at 2015-04-02T23:09:51.000Z

illumos-joyent commit 9052673 (branch master, by Robert Mustacchi)

OS-4112 stack overflow from promisc callbacks
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>


Comment by Former user
Created at 2015-04-02T23:31:07.000Z

We're caught in a bit of a bind. We've basically hit a case where we exceed our current stack size due to the combination of an aggr in promisc mode and an attempt to refill a slab by doing some allocations, and bam. Well, that's bad. So, we have a few options, which are weighed in turn below.

If we look at our stacks, the stack usage is fairly legitimate. Unfortunately, the mac_rx_srs_fanout() calls use a fair bit of stack space, around 2200 bytes each. This is because they declare the maximum fanout bits on the stack rather than allocating them. Keeping them on the stack is fairly important, as it reduces latency and jitter. Because we can cause it to be re-entrant, it doesn't make sense to try and associate some storage with an interrupt thread or whatever. Instead, we're kind of forced to say that this is pretty much all we get.

Forcing async isn't appealing because, much like the kmem allocation, we're now changing things around and playing games with not just the scheduler but also how we set up the datapath. This isn't great either. It also adds a lot of latency and jitter.

Another option, and the one we're opting for, is to increase the stack size. Importantly, we're not increasing the stack size of all threads, but merely that of the fixed set that we have per CPU. There's one each for low-level interrupts. We're increasing this to 32k, which on a typical amd64 system means an increase of about 100k of kernel memory per schedulable hardware thread. This feels like a fairly reasonable trade-off. It'll be slightly less on SPARC and slightly more on a 32-bit system.

With this, snoop has served over 3 billion packets and we're still not in trouble.