Issue Type: | Bug |
---|---|
Priority: | 2 - Critical |
Status: | Resolved |
Created at: | 2015-05-01T16:20:00.000Z |
Updated at: | 2015-05-19T00:11:02.000Z |
Created by: | Former user |
---|---|
Reported by: | Former user |
Assigned to: | Former user |
Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2015-05-19T00:11:02.000Z)
2015-05-14 Orkut (Release Date: 2015-05-14)
We had a compute node panic with the following stack:
> ::stack mac_hcksum_get+0xc() ixgbe_get_context+0x32(fffff04601e75c00, fffff001f5ff80e0) ixgbe_ring_tx+0x41b(fffff0431b94af00, fffff04601e75c00) mac_hwring_tx+0x1d(fffff0431d2e6850, fffff04601e75c00) mac_hwring_send_priv+0x90(fffff0435aa5fcb8, fffff0431d2e6850, fffff04601e75c00) aggr_ring_tx+0x24(fffff0435dab4968, fffff04601e75c00) mac_hwring_tx+0x1d(fffff0438aedec38, fffff04601e75c00) mac_tx_send+0x5dc(fffff0435aa5d448, fffff0438aedec38, fffff04601e75c00, fffff001f5ff8300) mac_tx_soft_ring_process+0x79(fffff0438b230680, fffff04601e75c00, 0, 0) mac_tx_aggr_mode+0x7c(fffff0435aa4b000, fffff04601e75c00, 87779d9, 0, 0) mac_tx+0xda(fffff0435aa5d448, fffff04601e75c00, 87779d9, 0, 0) str_mdata_fastpath_put+0x53(fffff0438b234be8, fffff04601e75c00, 87779d9, 0) ip_xmit+0x94f(fffff04601e75c00, fffff046349391e8, 80006080, 6a, 87779d9, 0) ire_send_wire_v4+0x3e9(fffff08b3b1afcd8, fffff04601e75c00, fffff0efc43688a4, fffff0f08f883b80, fffff042ea653ef0) conn_ip_output+0x190(fffff04601e75c00, fffff0f08f883b80) udp_output_lastdst+0x106(fffff043bbceca40, fffff045fcc41ac0, fffff042dfd05e18, ae1d, fffff0f08f883b80) udp_send+0x56b(fffff043bbceca40, fffff045fcc41ac0, fffff001f5ff8de0, fffff042dfd05e18) so_sendmblk_impl+0x192(fffff043befc9060, fffff001f5ff8de0, 0, fffff042dfd05e18, fffff001f5ff8ca8, 0) so_sendmblk+0xb3(fffff043befc9060, fffff001f5ff8de0, 0, fffff042dfd05e18, fffff001f5ff8ca8) socket_sendmblk+0x23(fffff043befc9060, fffff001f5ff8de0, 0, fffff042dfd05e18, fffff001f5ff8ca8) ksocket_sendmblk+0xed(fffff043befc9060, fffff001f5ff8de0, 0, fffff001f5ff8ca8, fffff042dfd05e18) overlay_mux_tx+0x2e(fffff043e4d22400, fffff001f5ff8de0, fffff045fcc41ac0) overlay_m_tx+0x123(fffff045ccc74ac0, fffff0ef2eba7ae0) mac_tx_send+0x726(fffff044e0d200d8, 0, fffff0ef2eba7ae0, fffff001f5ff8f70) mac_tx_single_ring_mode+0x6e(fffff04641081680, fffff0ef2eba7ae0, 8b87f83, 1, 0) mac_tx+0xda(fffff044e0d200d8, fffff0ef2eba7ae0, 8b87f83, 1, 0) 
str_mdata_fastpath_put+0x53(fffff0461d730158, fffff0ef2eba7ae0, 8b87f83, 1) ip_xmit+0x82d(fffff0ef2eba7ae0, fffff08c905b3328, 180036060, 34, 8b87f83, 0) ire_send_wire_v4+0x3e9(fffff0c3173d4098, fffff0ef2eba7ae0, fffff0ac89271190, fffff045cd4b7b00, fffff046257f18a0) conn_ip_output+0x190(fffff0ef2eba7ae0, fffff045cd4b7b00) tcp_send_data+0x59(fffff045c3fc1b40, fffff0ef2eba7ae0) tcp_input_data+0x1dfb(fffff045c3fc1840, fffff0efefeaa6c0, fffff042ea3611c0, fffff001f5ff9aa0) squeue_enter+0x41c(fffff042ea3611c0, fffff0efefeaa6c0, fffff0efefeaa6c0, 1, fffff001f5ff9aa0, 4) ip_fanout_v4+0xc7c(fffff0efefeaa6c0, fffff0436be1504a, fffff001f5ff9aa0) ip_input_local_v4+0x16e(fffff04556dad048, fffff0efefeaa6c0, fffff0436be1504a, fffff001f5ff9aa0) ire_recv_local_v4+0x132(fffff04556dad048, fffff0efefeaa6c0, fffff0436be1504a, fffff001f5ff9aa0) ill_input_short_v4+0x4d6(fffff0efefeaa6c0, fffff0436be1504a, fffff0436be1505a, fffff001f5ff9aa0, fffff001f5ff9c30) ip_input_common_v4+0x372(fffff044ab307928, 0, fffff0efefeaa6c0, fffff001f5ff9d40, 0, 0) ip_input+0x2b(fffff044ab307928, 0, fffff0efefeaa6c0, fffff001f5ff9d40) i_dls_link_rx+0x1cd(fffff044d1f61338, 0, fffff0efefeaa6c0, 0) mac_rx_deliver+0x37(fffff044e0d200d8, 0, fffff0efefeaa6c0, 0) mac_rx_soft_ring_process+0x19a(fffff044e0d200d8, fffff044811b9e00, fffff0efefeaa6c0, fffff0efefeaa6c0, 1, 0) mac_rx_srs_fanout+0x3b2(fffff04641082340, fffff0efefeaa6c0) mac_rx_srs_drain+0x256(fffff04641082340, 800) mac_rx_srs_process+0x3ce(fffff044d1f830b8, fffff04641082340, fffff0effe1ccaa0, 0) mac_tx_send+0x431(fffff0530631cd88, 0, fffff0effe1ccaa0, fffff001f5ffa9a0) mac_tx_single_ring_mode+0x6e(fffff053062cd000, fffff0effe1ccaa0, 8c44faf, 1, 0) mac_tx+0xda(fffff0530631cd88, fffff0effe1ccaa0, 8c44faf, 1, 0) str_mdata_fastpath_put+0x53(fffff043db647750, fffff0effe1ccaa0, 8c44faf, 1) ip_xmit+0x82d(fffff0effe1ccaa0, fffff08c905b3488, 180036060, 34, 8c44faf, 0) ire_send_wire_v4+0x3e9(fffff04622269dc0, fffff0effe1ccaa0, fffff0ac89275c10, fffff045cc7be880, 
fffff08d20f0f800) conn_ip_output+0x190(fffff0effe1ccaa0, fffff045cc7be880) tcp_send_data+0x59(fffff046227d7bc0, fffff0effe1ccaa0) tcp_input_data+0x1dfb(fffff046227d78c0, fffff0435dc4d3c0, fffff0430afbdc00, fffff001f5ffb4d0) squeue_enter+0x41c(fffff0430afbdc00, fffff0435dc4d3c0, fffff0435dc4d3c0, 1, fffff001f5ffb4d0, 4) ip_fanout_v4+0xc7c(fffff0435dc4d3c0, fffff045cf5bd8ac, fffff001f5ffb4d0) ip_input_local_v4+0x16e(fffff0531eef8460, fffff0435dc4d3c0, fffff045cf5bd8ac, fffff001f5ffb4d0) ire_recv_local_v4+0x132(fffff0531eef8460, fffff0435dc4d3c0, fffff045cf5bd8ac, fffff001f5ffb4d0) ill_input_short_v4+0x4d6(fffff0435dc4d3c0, fffff045cf5bd8ac, fffff045cf5bd8bc, fffff001f5ffb4d0, fffff001f5ffb660) ip_input_common_v4+0x372(fffff04640b63228, 0, fffff0435dc4d3c0, fffff001f5ffb770, 0, 0) ip_input+0x2b(fffff04640b63228, 0, fffff0435dc4d3c0, fffff001f5ffb770) i_dls_link_rx+0x1cd(fffff046413b3ca8, 0, fffff0435dc4d3c0, 0) mac_rx_deliver+0x37(fffff0530631cd88, 0, fffff0437071a720, 0) mac_rx_soft_ring_process+0x19a(fffff0530631cd88, fffff046413b4980, fffff0437071a720, fffff0437071a720, 1, 0) mac_rx_srs_fanout+0x3b2(fffff053062cdcc0, fffff0437071a720) mac_rx_srs_drain+0x256(fffff053062cdcc0, 800) mac_rx_srs_process+0x3ce(fffff044d1f830b8, fffff053062cdcc0, fffff0437071a720, 0) mac_tx_send+0x431(fffff044e0d200d8, 0, fffff0437071a720, fffff001f5ffc3d0) mac_tx_single_ring_mode+0x6e(fffff04641081680, fffff0437071a720, 88123ea, 1, 0) mac_tx+0xda(fffff044e0d200d8, fffff0437071a720, 88123ea, 1, 0) str_mdata_fastpath_put+0x53(fffff0461d730158, fffff0437071a720, 88123ea, 1) ip_xmit+0x82d(fffff0437071a720, fffff045c397bc70, 180036060, 5ac, 88123ea, 0) ire_send_wire_v4+0x3e9(fffff04cef489b10, fffff0437071a720, fffff04645d620ac, fffff045c30b8100, fffff04629842a28) conn_ip_output+0x190(fffff0437071a720, fffff045c30b8100) tcp_send_data+0x59(fffff044091f5400, fffff0437071a720) tcp_output+0x58c(fffff044091f5100, fffff04371b58080, fffff042ea361280, 0) squeue_enter+0x41c(fffff042ea361280, 
fffff04371b58080, fffff04371b58080, 1, 0, 4) tcp_sendmsg+0x14f(fffff044091f5100, fffff04371b58080, fffff001f5ffce80, fffff04634d33a30) so_sendmsg+0x26b(fffff045c412d7f8, fffff001f5ffce80, fffff001f5ffce20, fffff04634d33a30) socket_sendmsg+0x48(fffff045c412d7f8, fffff001f5ffce80, fffff001f5ffce20, fffff04634d33a30) sendit+0x162(103, fffff001f5ffce80, fffff001f5ffce20, 8000) sendmsg+0x15b(103, 7ffffe5feb20, 8000) sys_syscall+0x196()
This was a double fault caused by a stack overflow: as the stack above shows, the thread had already switched back and forth between the rx and tx paths several times. We should raise the stack depth check in mac_rx_srs_process and add some logic that lets us observe the value actually reached, so that in the future we can see how close we are getting to the limit.
Based on looking at various dumps in thoth, the closest we have come to this limit was a value of 12,336. To cover that value with a bit of headroom, we're increasing the check to 13k. We don't believe we'll need to go much further than that, as we've also mitigated this problem with the changes to interrupt stack sizing.
illumos-joyent commit 336afb6 (branch master, by Robert Mustacchi)
OS-4245 mac_rx_srs_process stack depth needs to account for harder usage