OS-5989: OS-3506 introduced deadlock in dls_devnet_prop_task

Details

Issue Type:Bug
Priority:4 - Normal
Status:Resolved
Created at:2017-03-01T08:03:25.000Z
Updated at:2017-08-17T01:15:54.000Z

People

Created by:Ryan Zezeski [X]
Reported by:Ryan Zezeski [X]
Assigned to:Ryan Zezeski [X]

Resolution

Fixed: A fix for this issue is checked into the tree and tested.
(Resolution Date: 2017-05-19T19:53:38.000Z)

Fix Versions

2017-05-25 ROMANIAN STRIKE (Release Date: 2017-05-25)

Related Issues

Related Links

Description

This deadlock was reported by the user kshannon in the #smartos IRC channel. You can work around this problem by using the -t flag for setting link properties.

When setting certain link properties the dls_devnet_prop_task() routine can end up waiting on itself because of a change I made to the lock acqusition in drv_ioc_prop_common() for OS-3506.

The user presented us with multiple stuck dladm processes waiting on dls_devnet_prop_task_wait().

> ::pgrep dladm
S    PID   PPID   PGID    SID    UID      FLAGS             ADDR NAME
R  14276  13012  14276  13012      0 0x4a004000 fffffea6fdd59008 dladm
R  11158  11012    201    201      0 0x4a004000 fffffea6fdd4e020 dladm

> fffffea6fdd59008::walk thread | ::findstack -v
stack pointer for thread fffffea3b9a38c00: fffffe426f6fc8b0
[ fffffe426f6fc8b0 _resume_from_idle+0x112() ]
  fffffe426f6fc8e0 swtch+0x141()
  fffffe426f6fc920 cv_wait+0x70(fffffea70f289720, fffffea70f289718)
  fffffe426f6fc960 dls_devnet_prop_task_wait+0x4b(fffffea70f2895d8)
  fffffe426f6fc9b0 stubs_common_code+0x59()
  fffffe426f6fca00 mac_open_by_linkid+0x2d(2, fffffe426f6fca30)
  fffffe426f6fcbd0 drv_ioc_macaddrget+0x9b(fffffea7020b3d68, 8109008, 100003, fffffea74ca5bee0, fffffe426f6fce58)
  fffffe426f6fcc70 drv_ioctl+0x1e4(1200000000, d1d0015, 8109008, 100003, fffffea74ca5bee0, fffffe426f6fce58)
  fffffe426f6fccb0 cdev_ioctl+0x39(1200000000, d1d0015, 8109008, 100003, fffffea74ca5bee0, fffffe426f6fce58)
  fffffe426f6fcd00 spec_ioctl+0x60(fffffea36bfd6480, d1d0015, 8109008, 100003, fffffea74ca5bee0, fffffe426f6fce58, 0)
  fffffe426f6fcd90 fop_ioctl+0x55(fffffea36bfd6480, d1d0015, 8109008, 100003, fffffea74ca5bee0, fffffe426f6fce58, 0)
  fffffe426f6fceb0 ioctl+0x9b(3, d1d0015, 8109008)
  fffffe426f6fcf10 _sys_sysenter_post_swapgs+0x153()

The taskq entry shows it's a task for the ixgbe0 interface and that the kernel thread responsible for completing the task is running dls_devnet_prop_task() which turns into an upcall into dlmgmtd to perform the real work.

> fffffea70f2895d8::print -T struct dls_devnet_s
struct dls_devnet_s {
    datalink_id_t <<unsigned int>> dd_linkid = 0x2
    char [32] dd_linkname = [ "ixgbe0" ]
    char [256] dd_mac = [ "ixgbe0" ]
    kstat_t *dd_ksp = 0xfffffea39db5fa30
    kstat_t *dd_zone_ksp = 0
    uint32_t <<unsigned int>> dd_ref = 0x1
    kmutex_t <<struct mutex>> dd_mutex = {
        void *[1] _opaque = [ 0 ]
    }
    kcondvar_t <<struct _kcondvar>> dd_cv = {
        ushort_t <<unsigned short>> _opaque = 0x3
    }
    uint32_t <<unsigned int>> dd_tref = 0x3
    uint_t <<unsigned int>> dd_flags = 0
    zoneid_t <<int>> dd_owner_zid = 0
    zoneid_t <<int>> dd_zid = 0
    boolean_t dd_prop_loaded = 0 (0)
    taskqid_t <<unsigned long>> dd_prop_taskid = 0xfffffea39eb178c0
    boolean_t dd_transient = 0 (0)
}

> fffffea70f2895d8::print struct dls_devnet_s dd_prop_taskid | ::print -T taskq_ent_t tqent_thread | ::findstack -v
stack pointer for thread fffffe4269b69c40: fffffe4269b69870
[ fffffe4269b69870 _resume_from_idle+0x112() ]
  fffffe4269b698b0 swtch_to+0xb6(fffffea39d6adc60)
  fffffe4269b69900 shuttle_resume+0x2af(fffffea39d6adc60, ffffffffc0022f90)
  fffffe4269b699b0 door_upcall+0x212(fffffea39cd8c980, fffffe4269b69aa0, fffffea361780e18, ffffffffffffffff, 0)
  fffffe4269b69a30 door_ki_upcall_limited+0x67(fffffea39cd76b18, fffffe4269b69aa0, fffffea361780e18, ffffffffffffffff, 0)
  fffffe4269b69a80 stubs_common_code+0x59()
  fffffe4269b69b20 i_dls_mgmt_upcall+0xbf(fffffe4269b69b30, 8, fffffe4269b69b40, 4)
  fffffe4269b69b60 dls_mgmt_linkprop_init+0x2d(2)
  fffffe4269b69b90 dls_devnet_prop_task+0x25(fffffea70f2895d8)
  fffffe4269b69c20 taskq_d_thread+0xb7(fffffea39eb178c0)
  fffffe4269b69c30 thread_start+8()

But if you look at the dlmgmtd threads you'll see two of them also waiting on the prop task (and it's the same taskq entry). That makes no damn sense!

> ::pgrep dlmgmtd | ::walk thread | ::stacks -a
THREAD           STATE    SOBJ                COUNT
fffffea39ce48060 SLEEP    SHUTTLE                 7
fffffea708019120 SLEEP    SHUTTLE                 -
fffffea6f8fafba0 SLEEP    SHUTTLE                 -
fffffea6f8daa000 SLEEP    SHUTTLE                 -
fffffea6e3dc88c0 SLEEP    SHUTTLE                 -
fffffea39fb4e780 SLEEP    SHUTTLE                 -
fffffea39d7cfba0 SLEEP    SHUTTLE                 -
                 swtch_to+0xb6
                 shuttle_resume+0x2af
                 door_return+0x3e0
                 doorfs32+0x180
                 sys_syscall32+0x12f

THREAD           STATE    SOBJ                COUNT
fffffea39d6adc60 SLEEP    CV                      2
fffffea3a017dc60 SLEEP    CV                      -
                 swtch+0x141
                 cv_wait+0x70
                 dls_devnet_prop_task_wait+0x4b
                 stubs_common_code+0x59
                 mac_open_by_linkid+0x2d
                 mac_perim_enter_by_linkid+0x20
                 drv_ioc_prop_common+0x177
                 drv_ioc_setprop+0x29
                 drv_ioctl+0x1e4
                 cdev_ioctl+0x39
                 spec_ioctl+0x60
                 fop_ioctl+0x55
                 ioctl+0x9b
                 sys_syscall32+0x12f

THREAD           STATE    SOBJ                COUNT
fffffea39ce48400 SLEEP    CV                      1
                 swtch+0x141
                 cv_wait_sig_swap_core+0x1b9
                 cv_wait_sig_swap+0x17
                 pause+0x45
                 sys_syscall32+0x12f

THREAD           STATE    SOBJ                COUNT
fffffea39fb4eb20 SLEEP    SHUTTLE                 1
                 swtch+0x141
                 shuttle_swtch+0x203
                 door_return+0x214
                 doorfs32+0x180
                 sys_syscall32+0x12f

> fffffea39d6adc60::findstack -v ! grep prop_task_wait
  fffffe4269f3b9d0 dls_devnet_prop_task_wait+0x4b(fffffea70f2895d8)

> fffffea70f2895d8::print struct dls_devnet_s
{
    dd_linkid = 0x2
    dd_linkname = [ "ixgbe0" ]
    dd_mac = [ "ixgbe0" ]
    dd_ksp = 0xfffffea39db5fa30
    dd_zone_ksp = 0
    dd_ref = 0x1
    dd_mutex = {
        _opaque = [ 0 ]
    }
    dd_cv = {
        _opaque = 0x3
    }
    dd_tref = 0x3
    dd_flags = 0
    dd_owner_zid = 0
    dd_zid = 0
    dd_prop_loaded = 0 (0)
    dd_prop_taskid = 0xfffffea39eb178c0
    dd_transient = 0 (0)
}

The epihany comes when you look at the struct door_data for the dlmgmtd thread. Each door_data has two sides: client and server. When a door call is made the client thread's door_data.d_client is populated and the server thread's door_data.d_server is populated (this way a thread can be both a client and server). If we look at the server thread (dlmgmtd thread handling the dls_devnet_prop_task() upcall) we notice that the caller thread (aka the client) is 0xfffffe4269b69c40 -- the thread responsible for running the prop task!

> fffffea39d6adc60::print kthread_t t_door | ::print -T struct door_data d_server
door_server_t <<struct door_server>> d_server = {
    struct _kthread *d_server.d_caller = 0xfffffe4269b69c40
    struct _kthread *d_server.d_servers = 0
    struct door_node *d_server.d_active = 0xfffffea39ce2c910
    struct door_node *d_server.d_pool = 0
    door_layout_t <<struct door_layout>> d_server.d_layout = {
        caddr_t <<char *>> dl_descp = 0
        caddr_t <<char *>> dl_datap = 0xfec2edf8
        caddr_t <<char *>> dl_infop = 0
        caddr_t <<char *>> dl_resultsp = 0xfec2edd8
        caddr_t <<char *>> dl_sp = 0xfec2edd8
    }
    caddr_t <<char *>> d_server.d_sp = 0xfec2ee00
    size_t <<unsigned long>> d_server.d_ssize = 0xf5f00
    kcondvar_t <<struct _kcondvar>> d_server.d_cv = {
        ushort_t <<unsigned short>> _opaque = 0
    }
    uchar_t <<unsigned char>> d_server.d_hold = 0
    uchar_t <<unsigned char>> d_server.d_invbound = 0
    uchar_t <<unsigned char>> d_server.d_layout_done = 0x1
}

> 0xfffffe4269b69c40::findstack -v
stack pointer for thread fffffe4269b69c40: fffffe4269b69870
[ fffffe4269b69870 _resume_from_idle+0x112() ]
  fffffe4269b698b0 swtch_to+0xb6(fffffea39d6adc60)
  fffffe4269b69900 shuttle_resume+0x2af(fffffea39d6adc60, ffffffffc0022f90)
  fffffe4269b699b0 door_upcall+0x212(fffffea39cd8c980, fffffe4269b69aa0, fffffea361780e18, ffffffffffffffff, 0)
  fffffe4269b69a30 door_ki_upcall_limited+0x67(fffffea39cd76b18, fffffe4269b69aa0, fffffea361780e18, ffffffffffffffff, 0)
  fffffe4269b69a80 stubs_common_code+0x59()
  fffffe4269b69b20 i_dls_mgmt_upcall+0xbf(fffffe4269b69b30, 8, fffffe4269b69b40, 4)
  fffffe4269b69b60 dls_mgmt_linkprop_init+0x2d(2)
  fffffe4269b69b90 dls_devnet_prop_task+0x25(fffffea70f2895d8)
  fffffe4269b69c20 taskq_d_thread+0xb7(fffffea39eb178c0)
  fffffe4269b69c30 thread_start+8()

So Thread A (0xfffffe4269b69c40) performs an upcall into dlmgmtd which runs the the upcall on Thread B (fffffea39d6adc60). But then Thread B decides to make a call into the kernel to set a link property. But part of setting the property is calling mac_open_by_linkid() which must wait for any pending prop task. So Thread A is waiting on Thread B to do the upcall, but Thread B is now stuck waiting on Thread A. Boom, deadlock.

> fffffea39d6adc60::findstack -v
stack pointer for thread fffffea39d6adc60: fffffe4269f3b920
[ fffffe4269f3b920 _resume_from_idle+0x112() ]
  fffffe4269f3b950 swtch+0x141()
  fffffe4269f3b990 cv_wait+0x70(fffffea70f289720, fffffea70f289718)
  fffffe4269f3b9d0 dls_devnet_prop_task_wait+0x4b(fffffea70f2895d8)
  fffffe4269f3ba20 stubs_common_code+0x59()
  fffffe4269f3ba70 mac_open_by_linkid+0x2d(2, fffffe4269f3ba98)
  fffffe4269f3bac0 mac_perim_enter_by_linkid+0x20(2, fffffe4269f3bb10)
  fffffe4269f3bb90 drv_ioc_prop_common+0x177(fffffea7035bd8c0, 806f578, 1, fffffea70f25f328, 100003)
  fffffe4269f3bbd0 drv_ioc_setprop+0x29(fffffea7035bd8c0, 806f578, 100003, fffffea70f25f328, fffffe4269f3be58)
  fffffe4269f3bc70 drv_ioctl+0x1e4(1200000000, d1d001b, 806f578, 100003, fffffea70f25f328, fffffe4269f3be58)
  fffffe4269f3bcb0 cdev_ioctl+0x39(1200000000, d1d001b, 806f578, 100003, fffffea70f25f328, fffffe4269f3be58)
  fffffe4269f3bd00 spec_ioctl+0x60(fffffea36bfd6480, d1d001b, 806f578, 100003, fffffea70f25f328, fffffe4269f3be58, 0)
  fffffe4269f3bd90 fop_ioctl+0x55(fffffea36bfd6480, d1d001b, 806f578, 100003, fffffea70f25f328, fffffe4269f3be58, 0)
  fffffe4269f3beb0 ioctl+0x9b(0, d1d001b, 806f578)
  fffffe4269f3bf10 sys_syscall32+0x12f()

So why is this happening? Well, Thread B is making a call into the kernel to set the en_100fdx_cap property.

> fffffea39d6adc60::findstack -v ! grep drv_ioc_prop_common
  fffffe4269f3bb90 drv_ioc_prop_common+0x177(fffffea7035bd8c0, 806f578, 1, fffffea70f25f328, 100003)
> fffffea7035bd8c0::print -T dld_ioc_macprop_t
dld_ioc_macprop_t <<struct dld_ioc_macprop_s>> {
    uint_t <<unsigned int>> pr_flags = 0
    datalink_id_t <<unsigned int>> pr_linkid = 0x2
    mac_prop_id_t pr_num = 0t15 (MAC_PROP_EN_100FDX_CAP)
    uint_t <<unsigned int>> pr_perm_flags = 0
    char [256] pr_name = [ "en_100fdx_cap" ]
    uint_t <<unsigned int>> pr_valsize = 0x1
    char [1] pr_val = [ '\0' ]
}

At this point the crash dump doesn't help and you must read dlmgmtd userland code. It turns out that if dladm finds certain properties, like en_100fdx_cap, in the persistent file (/etc/dladm/datalink.conf) it will get their value in userland and then perform a SETMACPROP ioctl to set it in the kernel too (see i_dladm_init_one_prop(). This is happening to the user because they have a custom service deployed that sets the en_100fdx_cap property at boot. This causes the prop task to fire up and the following userland stack to occur.

ioctl(DLDIOC_SETMACPROP)
i_dladm_macprop()
set_public_prop() (for en_100fdx_cap)
  pdp->pd_set()
i_dladm_set_single_prop()
i_dladm_set_linkprop()
dladm_set_linkprop()
i_dladm_init_one_prop()
dladm_init_linkprop()
dlmgmt_upcall_linkprop_init()

This deadlock was introduced by a change I made in drv_ioc_prop_common() for OS-3506. I changed it to enter the mac perim via linkid instead of mac name. But since the canonical store for linkid to name mappings is in dlmgmtd we now have to call dls_devnet_prop_task_wait() to make sure all kernel data is up-to-date. I didn't make this change on a whim so I have to revisit why I made this change and how I can avoid both the original deadlock and this one.

@@ -671,11 +662,10 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set,
                        goto done;
        }

-       if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
-               goto done;
-       if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0)
+       if ((err = mac_perim_enter_by_linkid(linkid, &mph)) != 0)
                goto done;
-       if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0)
+
+       if ((err = dls_devnet_hold_link(linkid, &dlh, &dlp)) != 0)
                goto done;

Comments

Comment by Ryan Zezeski [X]
Created at 2017-05-04T06:41:07.000Z

I tested this by setting up a vmware instance with two NICs and creating an aggr over those links.

Then I setup a custom SMF service to run this script at boot:

#!/bin/bash

set -o pipefail
set -o xtrace

admin_aggr=aggr0

. /lib/svc/share/smf_include.sh

for port in $(/usr/sbin/dladm show-aggr -p -o port -L $admin_aggr)
do
        /usr/sbin/dladm set-linkprop -p en_100fdx_cap=0 $port
done

exit $SMF_EXIT_OK

Then I would ssh into the host and run dladm show-link.

On a PI including my OS-3506 patch this dladm command would never complete because of the deadlock. On my patched PI there is no deadlock and the command completes immediately.


Comment by Bot Bot [X]
Created at 2017-05-19T19:53:27.000Z

illumos-joyent commit 87b3eba (branch master, by Ryan Zezeski)

OS-6056 dls_devnet_unset() and DLS temp holds can deadlock
OS-5989 OS-3506 introduced deadlock in dls_devnet_prop_task
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>