NetBSD Problem Report #52347

From martin@duskware.de  Wed Jun 28 15:57:01 2017
Return-Path: <martin@duskware.de>
Received: from mail.netbsd.org (mail.netbsd.org [199.233.217.200])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(Client CN "mail.netbsd.org", Issuer "Postmaster NetBSD.org" (verified OK))
	by mollari.NetBSD.org (Postfix) with ESMTPS id 05BE47A2BA
	for <gnats-bugs@gnats.NetBSD.org>; Wed, 28 Jun 2017 15:57:01 +0000 (UTC)
Message-Id: <20170628155649.08FDA5CC761@emmas.aprisoft.de>
Date: Wed, 28 Jun 2017 17:56:49 +0200 (CEST)
From: martin@NetBSD.org
Reply-To: martin@NetBSD.org
To: gnats-bugs@NetBSD.org
Subject: ww mutex class mismatch
X-Send-Pr-Version: 3.95

>Number:         52347
>Category:       kern
>Synopsis:       ww mutex class mismatch
>Confidential:   no
>Severity:       critical
>Priority:       high
>Responsible:    kern-bug-people
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Wed Jun 28 16:00:00 +0000 2017
>Last-Modified:  Thu Jul 06 08:50:01 +0000 2017
>Originator:     Martin Husemann
>Release:        NetBSD 8.99.1
>Organization:
The NetBSD Foundation, Inc.
>Environment:
System: NetBSD night-owl.duskware.de 8.99.1 NetBSD 8.99.1 (NIGHT-OWL) #516: Mon Jun 26 15:40:28 CEST 2017 martin@night-owl.duskware.de:/usr/src/sys/arch/amd64/compile/NIGHT-OWL amd64
Architecture: x86_64
Machine: amd64
>Description:

While using something that uses gtk3, I got this crash (seen it before, but
last time no crash dump):

 > fgrep "savecore:  reboot" /var/log/messages
Jun 28 17:42:32 night-owl savecore: reboot after panic: panic: kernel diagnostic assertion "(ctx->wwx_class == mutex->wwm_u.ctx->wwx_class)" failed: file "../../../../external/bsd/drm2/linux/linux_ww_mutex.c", line 304 ww mutex class mismatch: 0xffffffff812b6bc0 != 0x0
 > gdb netbsd.gdb 
[..]
Reading symbols from netbsd.gdb...done.
warning: ../../../../gdbscripts/kdump: No such file or directory
(gdb) target kvm /tmp/netbsd.core
0xffffffff80229485 in cpu_reboot (howto=howto@entry=256, 
    bootstr=bootstr@entry=0x0) at ../../../../arch/amd64/amd64/machdep.c:674
674			dumpsys();
(gdb) bt
#0  0xffffffff80229485 in cpu_reboot (howto=howto@entry=256, 
    bootstr=bootstr@entry=0x0) at ../../../../arch/amd64/amd64/machdep.c:674
#1  0xffffffff80632474 in db_sync_cmd (addr=<optimized out>, 
    have_addr=<optimized out>, count=<optimized out>, modif=<optimized out>)
    at ../../../../ddb/db_command.c:1380
#2  0xffffffff80632c3e in db_command (
    last_cmdp=last_cmdp@entry=0xffffffff812657a0 <db_last_command>)
    at ../../../../ddb/db_command.c:914
#3  0xffffffff80632fa5 in db_command_loop ()
    at ../../../../ddb/db_command.c:572
#4  0xffffffff806367be in db_trap (type=type@entry=1, code=code@entry=0)
    at ../../../../ddb/db_trap.c:90
#5  0xffffffff80225f32 in kdb_trap (type=type@entry=1, code=code@entry=0, 
    regs=regs@entry=0xffffe40041379670)
    at ../../../../arch/amd64/amd64/db_interface.c:234
#6  0xffffffff8022a80e in trap (frame=0xffffe40041379670)
    at ../../../../arch/amd64/amd64/trap.c:293
#7  0xffffffff8020108e in alltraps ()
#8  0xffffffff80224945 in breakpoint ()
#9  0xffffffff80814bc3 in vpanic (
    fmt=0xffffffff80f58218 "kernel %sassertion \"%s\" failed: file \"%s\", line %d ww mutex class mismatch: %p != %p", ap=ap@entry=0xffffe400413797a8)
    at ../../../../kern/subr_prf.c:340
#10 0xffffffff80ad9485 in kern_assert (
    fmt=fmt@entry=0xffffffff80f58218 "kernel %sassertion \"%s\" failed: file \"%s\", line %d ww mutex class mismatch: %p != %p")
    at ../../../../../../lib/libkern/kern_assert.c:51
#11 0xffffffff80aaaadc in ww_mutex_lock_wait_sig (
    mutex=mutex@entry=0xffffe4012d67b620, ctx=ctx@entry=0xffffe40041379af8)
    at ../../../../external/bsd/drm2/linux/linux_ww_mutex.c:302
#12 0xffffffff80aab9aa in linux_ww_mutex_lock_slow_interruptible (
    mutex=0xffffe4012d67b620, ctx=ctx@entry=0xffffe40041379af8)
    at ../../../../external/bsd/drm2/linux/linux_ww_mutex.c:732
#13 0xffffffff80ab2d86 in ttm_eu_reserve_buffers (
    ticket=ticket@entry=0xffffe40041379af8, list=list@entry=0xffffe40041379a20)
    at ../../../../external/bsd/drm2/dist/drm/ttm/ttm_execbuf_util.c:157
#14 0xffffffff8099ac53 in radeon_bo_list_validate (rdev=0xffff800007067000, 
    ticket=ticket@entry=0xffffe40041379af8, 
    head=head@entry=0xffffe40041379a20, ring=0)
    at ../../../../external/bsd/drm2/dist/drm/radeon/radeon_object.c:453
#15 0xffffffff80986231 in radeon_cs_parser_relocs (p=0xffffe400413799d0)
    at ../../../../external/bsd/drm2/dist/drm/radeon/radeon_cs.c:180
#16 radeon_cs_ioctl (dev=<optimized out>, data=<optimized out>, 
    filp=<optimized out>)
    at ../../../../external/bsd/drm2/dist/drm/radeon/radeon_cs.c:631
#17 0xffffffff806d8c13 in drm_ioctl (fp=<optimized out>, cmd=<optimized out>, 
    data=0xffffe40041379de0) at ../../../../external/bsd/drm2/drm/drm_drv.c:676
#18 0xffffffff80820311 in sys_ioctl (l=<optimized out>, 
    uap=0xffffe40041379f00, retval=<optimized out>)
    at ../../../../kern/sys_generic.c:671
#19 0xffffffff8024b2ac in sy_call (rval=0xffffe40041379eb0, 
    uap=0xffffe40041379f00, l=0xffffe40118907180, 
    sy=0xffffffff81272bd0 <sysent+1296>) at ../../../../sys/syscallvar.h:65
#20 sy_invoke (code=54, rval=0xffffe40041379eb0, uap=0xffffe40041379f00, 
    l=0xffffe40118907180, sy=0xffffffff81272bd0 <sysent+1296>)
    at ../../../../sys/syscallvar.h:94
#21 syscall (frame=0xffffe40041379f00)
    at ../../../../arch/x86/x86/syscall.c:144
#22 0xffffffff80200771 in Xsyscall ()
(gdb) up 11
#11 0xffffffff80aaaadc in ww_mutex_lock_wait_sig (
    mutex=mutex@entry=0xffffe4012d67b620, ctx=ctx@entry=0xffffe40041379af8)
    at ../../../../external/bsd/drm2/linux/linux_ww_mutex.c:302
302		KASSERTMSG((ctx->wwx_class == mutex->wwm_u.ctx->wwx_class),
(gdb) p *ctx
$1 = {wwx_class = 0xffffffff812b6bc0 <reservation_ww_class>, 
  wwx_owner = 0xffffe40118907180, wwx_ticket = 50953, wwx_acquired = 0, 
  wwx_acquire_done = false, wwx_rb_node = {rb_nodes = {0x0, 0x0}, rb_info = 0}}
(gdb) p *mutex
$2 = {wwm_state = WW_CTX, wwm_u = {owner = 0xffffe4004134faf8, 
    ctx = 0xffffe4004134faf8}, wwm_lock = {u = {mtxa_owner = 67073}}, 
  wwm_class = 0xffffffff812b6bc0 <reservation_ww_class>, wwm_waiters = {
    rbt_root = 0x0, rbt_ops = 0xffffffff80e91c00 <ww_acquire_ctx_rb_ops>, 
    rbt_minmax = {0xffffe4012d67b640, 0xffffe4012d67b640}}, wwm_cv = {
    cv_opaque = {0x0, 0xffffe4012d67b660, 0xffffffff80f587d5}}}
(gdb) p *mutex->wwm_u.ctx
$3 = {wwx_class = 0xffffe4004134fd60, wwx_owner = 0xffffe40107170940, 
  wwx_ticket = 18446713288477965536, wwx_acquired = 2149602382, 
  wwx_acquire_done = 255, wwx_rb_node = {rb_nodes = {0x0, 0xffffe400bcd5f284}, 
    rb_info = 88}}
(gdb) p *mutex->wwm_u.ctx->wwx_class
$4 = {wwc_ticket = 18446713288477949952}
(gdb) quit


>How-To-Repeat:
Just use gtk3 base stuff with a radeon on -current?

>Fix:
n/a

>Audit-Trail:
From: coypu@sdf.org
To: gnats-bugs@NetBSD.org
Cc: 
Subject: Re: kern/52347: ww mutex class mismatch
Date: Wed, 28 Jun 2017 17:26:50 +0000

 One thing I see in Linux is that it has the following (and we don't)

 in radeon_cs_ioctl:
         if (rdev->in_reset) {
                 up_read(&rdev->exclusive_lock);
                 r = radeon_gpu_reset(rdev);
                 if (!r)
                         r = -EAGAIN;
                 return r;
         }

 And in radeon_gpu_reset (which we do have)

 	rdev->in_reset = true;

 	... precarious things, I assume ...
 	drm_helper_resume_force_mode(rdev->ddev);

 	rdev->in_reset = false;

 We do drm_helper_resume_force_mode etc. without in_reset, i.e. without
 anything blocking entry to radeon_cs_ioctl.

 It's possible I misunderstand rwlocks (hence the question to tech-kern)
 and the lock prevents this too.

From: coypu@sdf.org
To: gnats-bugs@NetBSD.org
Cc: 
Subject: Re: kern/52347: ww mutex class mismatch
Date: Wed, 28 Jun 2017 22:16:21 +0000

 Never mind the previous message: they reduced the lock and added another
 confusing mechanism, but we shouldn't be able to enter
 drm_ioctl.

From: Martin Husemann <martin@duskware.de>
To: gnats-bugs@NetBSD.org
Cc: 
Subject: Re: kern/52347: ww mutex class mismatch
Date: Thu, 6 Jul 2017 10:06:27 +0200

 Taylor asked me to run a lockdebug kernel and indeed that fires ~immediately.
 No crash dump, so manual transcription:

 LOCKDEBUG: Wait/wound mutex error: linux_ww_mutex_unlock,826: not locked
 ..
 linux_ww_mutex_unlock() at netbsd:linux_ww_mutex_unlock+0x64
 ttm_eu_fence_buffer_objects() at netbsd:radeon_cs_parser_fini+0x1d5
 radeon_cs_ioctl() at netbsd:radeon_cs_ioctl+0x6d2
 drm_ioctl() at netbsd:drm_ioctl+0x11e
 sys_ioctl() at netbsd:sys_ioctl+0x101

 and the source lines:

 (gdb) list *(radeon_cs_parser_fini+0x1d5)
 0xffffffff809887af is in radeon_cs_parser_fini (../../../../external/bsd/drm2/dist/drm/radeon/radeon_cs.c:411).
 406             } else if (backoff) {
 407                     ttm_eu_backoff_reservation(&parser->ticket,
 408                                                &parser->validated);
 409             }
 410
 411             if (parser->relocs != NULL) {
 412                     for (i = 0; i < parser->nrelocs; i++) {
 413                             if (parser->relocs[i].gobj)
 414                                     drm_gem_object_unreference_unlocked(parser->relocs[i].gobj);
 415                     }
 (gdb) list *(radeon_cs_ioctl+0x6d2)      
 0xffffffff80989345 is in radeon_cs_ioctl (../../../../external/bsd/drm2/include/linux/rwsem.h:84).
 79
 80      static inline void
 81      up_read(struct rw_semaphore *rwsem)
 82      {
 83
 84              rw_exit(&rwsem->rws_lock);
 85      }
 86
 87      static inline void
 88      up_write(struct rw_semaphore *rwsem)
 (gdb) list *(drm_ioctl+0x11e)      
 0xffffffff806d8a03 is in drm_ioctl (../../../../external/bsd/drm2/drm/drm_drv.c:676).
 671
 672             if (!ISSET(ioctl->flags, DRM_UNLOCKED))
 673                     mutex_lock(&drm_global_mutex);
 674
 675             /* XXX errno Linux->NetBSD */
 676             error = -(*ioctl->func)(dev, data, file);
 677
 678             if (!ISSET(ioctl->flags, DRM_UNLOCKED))
 679                     mutex_unlock(&drm_global_mutex);
 680


 Martin

From: Martin Husemann <martin@duskware.de>
To: gnats-bugs@NetBSD.org
Cc: 
Subject: Re: kern/52347: ww mutex class mismatch
Date: Thu, 6 Jul 2017 10:46:01 +0200

 Additional gdb output:

    0xffffffff80989339 <radeon_cs_ioctl+1734>:   lea    -0x3a0(%rbp),%rdi
    0xffffffff80989340 <radeon_cs_ioctl+1741>:   
     callq  0xffffffff809885da <radeon_cs_parser_fini>
    0xffffffff80989345 <radeon_cs_ioctl+1746>:   mov    -0x3a8(%rbp),%rdi

 (gdb) list *(0xffffffff80989340)
 0xffffffff80989340 is in radeon_cs_ioctl (../../../../external/bsd/drm2/dist/drm/radeon/radeon_cs.c:654).
 649             r = radeon_cs_ib_vm_chunk(rdev, &parser);
 650             if (r) {
 651                     goto out;
 652             }
 653     out:
 654             radeon_cs_parser_fini(&parser, r, true);
 655             up_read(&rdev->exclusive_lock);
 656             r = radeon_cs_handle_lockup(rdev, r);
 657             return r;
 658     }


 Martin

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2014 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.