NetBSD Problem Report #42772

From www@NetBSD.org  Tue Feb  9 01:18:54 2010
Return-Path: <www@NetBSD.org>
Received: from mail.netbsd.org (mail.netbsd.org [204.152.190.11])
	by www.NetBSD.org (Postfix) with ESMTP id 6315263C445
	for <gnats-bugs@gnats.NetBSD.org>; Tue,  9 Feb 2010 01:18:54 +0000 (UTC)
Message-Id: <20100209011854.2884163BBAC@www.NetBSD.org>
Date: Tue,  9 Feb 2010 01:18:54 +0000 (UTC)
From: naruse@airemix.jp
Reply-To: naruse@airemix.jp
To: gnats-bugs@NetBSD.org
Subject: fork from other than the main thread causes wrong pthread condition
X-Send-Pr-Version: www-1.0

>Number:         42772
>Category:       kern
>Synopsis:       fork from other than the main thread causes wrong pthread condition
>Confidential:   no
>Severity:       non-critical
>Priority:       medium
>Responsible:    kern-bug-people
>State:          closed
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Tue Feb 09 01:20:00 +0000 2010
>Closed-Date:    Wed Nov 03 12:30:51 +0000 2010
>Last-Modified:  Wed Nov 03 12:30:51 +0000 2010
>Originator:     NARUSE, Yui
>Release:        5.0.1
>Organization:
>Environment:
NetBSD kelvena 5.0_STABLE NetBSD 5.0_STABLE (GENERIC) #0: Sat Jan 16 22:36:33 JST 2010  naruse@:/usr/obj/sys/arch/i386/compile/GENERIC i386
>Description:
1. start initial thread
2. start second thread
3. start third thread
4. stop second thread
5. fork from third thread
Then check ((struct __pthread_st *)pthread_self())->pt_lid in the child process.
It should equal _lwp_self(), which is 1 in the child, but it is 3, i.e. the pt_lid of the third thread in the parent.

Run the following code:

/*-
 * Copyright (c)2010 Takehiko NOZAKI,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <errno.h>
#include <lwp.h>
#include <pthread.h>
#include <pthread_queue.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/tree.h>
#include <unistd.h>
#define PTHREAD_KEYS_MAX 	256
#define	PTHREAD__UNPARK_MAX	32

/*
 * One entry on a thread's pt_cleanup_stack (see struct __pthread_st):
 * a cancellation cleanup handler together with its argument.
 *
 * The size of this structure needs to be no larger than struct
 * __pthread_cleanup_store, defined in pthread.h.
 */
struct pt_clean_t {
	/* Queue linkage to the neighbouring entry. */
	PTQ_ENTRY(pt_clean_t)	ptc_next;
	/* Cleanup handler; takes a single void * argument. */
	void	(*ptc_cleanup)(void *);
	/* Presumably the value handed to ptc_cleanup -- TODO confirm. */
	void	*ptc_arg;
};

/*
 * Table of function pointers that operate on a __cpu_simple_lock_t.
 * An instance is embedded in struct __pthread_st as pt_lockops.
 * NOTE(review): the exact semantics of each operation are defined by
 * libpthread and are not visible in this file; the names suggest
 * initialise / try-lock / unlock / lock.
 */
struct pthread_lock_ops {
	void	(*plo_init)(__cpu_simple_lock_t *);
	/* Returns int; presumably a success indication -- TODO confirm. */
	int	(*plo_try)(__cpu_simple_lock_t *);
	void	(*plo_unlock)(__cpu_simple_lock_t *);
	void	(*plo_lock)(__cpu_simple_lock_t *);
};

/*
 * Thread descriptor layout, declared locally so that this test program
 * can cast the value returned by pthread_self() to
 * (struct __pthread_st *) and read the pt_lid member.
 *
 * NOTE(review): presumably copied from libpthread's private header; the
 * member order and sizes must match the installed libpthread for the
 * pt_lid read to be meaningful -- verify against the library sources.
 */
struct	__pthread_st {
	pthread_t	pt_self;	/* Must be first. */
	unsigned int	pt_magic;	/* Magic number */
	int		pt_state;	/* running, blocked, etc. */
	pthread_mutex_t	pt_lock;	/* lock on state */
	int		pt_flags;	/* see PT_FLAG_* below */
	int		pt_cancel;	/* Deferred cancellation */
	int		pt_errno;	/* Thread-specific errno. */
	stack_t		pt_stack;	/* Our stack */
	void		*pt_exitval;	/* Read by pthread_join() */
	char		*pt_name;	/* Thread's name, set by the app. */
	int		pt_willpark;	/* About to park */
	lwpid_t		pt_unpark;	/* Unpark this when parking */
	struct pthread_lock_ops pt_lockops;/* Cached to avoid PIC overhead */
	pthread_mutex_t	*pt_droplock;	/* Drop this lock if cancelled */
	pthread_cond_t	pt_joiners;	/* Threads waiting to join. */

	/* Threads to defer waking, usually until pthread_mutex_unlock(). */
	lwpid_t		pt_waiters[PTHREAD__UNPARK_MAX];
	size_t		pt_nwaiters;

	/* Stack of cancellation cleanup handlers and their arguments */
	PTQ_HEAD(, pt_clean_t)	pt_cleanup_stack;

	/*
	 * LWP ID and entry on the list of all threads.
	 * pt_lid is the member this test program compares against
	 * _lwp_self() after fork().
	 */
	lwpid_t		pt_lid;
	RB_ENTRY(__pthread_st) pt_alltree;
	PTQ_ENTRY(__pthread_st) pt_allq;
	PTQ_ENTRY(__pthread_st)	pt_deadq;

	/*
	 * General synchronization data.  We try to align, as threads
	 * on other CPUs will access this data frequently.
	 * (pt_dummy1 and pt_dummy2 carry the 128-byte alignment that
	 * brackets this group.)
	 */
	int		pt_dummy1 __aligned(128);
	struct lwpctl 	*pt_lwpctl;	/* Kernel/user comms area */
	volatile int	pt_blocking;	/* Blocking in userspace */
	volatile int	pt_rwlocked;	/* Handed rwlock successfully */
	volatile int	pt_signalled;	/* Received pthread_cond_signal() */
	volatile int	pt_mutexwait;	/* Waiting to acquire mutex */
	void * volatile pt_mutexnext;	/* Next thread in chain */
	void * volatile	pt_sleepobj;	/* Object slept on */
	PTQ_ENTRY(__pthread_st) pt_sleep;
	void		(*pt_early)(void *);
	int		pt_dummy2 __aligned(128);

	/* Thread-specific data.  Large so it sits close to the end. */
	int		pt_havespecific;
	void		*pt_specific[PTHREAD_KEYS_MAX];

	/*
	 * Context for thread creation.  At the end as it's cached
	 * and then only ever passed to _lwp_create().
	 */
	ucontext_t	pt_uc;
};

/* Non-zero while the timer thread should keep looping; 0 asks it to stop. */
static int running;

/* Handle of the timer thread, and of the thread that performs the fork. */
static pthread_t timer_thread;
static pthread_t thread;

/*
 * timer_mutex/timer_cond coordinate start_timer()/stop_timer() with
 * timer(); mutex/cond are the pair that main() and my_fork() wait on.
 */
static pthread_mutex_t timer_mutex;
static pthread_mutex_t mutex;
static pthread_cond_t timer_cond;
static pthread_cond_t cond;

/* Forward declarations of the timer helpers defined further down. */
static void *timer(void *arg);
static void init_timer(void);
static void start_timer(void);
static void stop_timer(void);

/*
 * Print one line consisting of this process's pid followed by the
 * return value of _lwp_kill(lid, 0) for each LWP id from 1 to 4.
 * In the sample output of this report a 0 is shown for an LWP that is
 * still present and -1 for one that is not.
 */
static void
show_lwps(void)
{
	int lid = 1;

	printf("%6d: lwps: ", getpid());
	while (lid < 5) {
		printf("%2d ", _lwp_kill(lid, 0));
		lid++;
	}
	printf("\n");
}

/*
 * Body of the timer thread.
 *
 * Takes timer_mutex, signals timer_cond once (start_timer() waits on it
 * to learn that the thread is running), then repeatedly prints "zzz..."
 * and waits on timer_cond with a one second timeout for as long as
 * `running` is non-zero.  Any result from pthread_cond_timedwait() other
 * than 0 or ETIMEDOUT is reported on stdout and the process aborts; this
 * is where the ESRCH described in this report shows up.
 *
 * arg is unused.  Always returns NULL.
 */
static void *
timer(void *arg)
{
	struct timeval now;
	struct timespec timeout;
	int err;

	(void)arg;

	pthread_mutex_lock(&timer_mutex);
	pthread_cond_signal(&timer_cond);
	do {
		printf("%6d: zzz...\n",getpid());

		/* Absolute deadline: current wall-clock time plus one second. */
		gettimeofday(&now, NULL);
		timeout.tv_sec  = now.tv_sec + 1;
		timeout.tv_nsec = now.tv_usec * 1000;

		err = pthread_cond_timedwait(&timer_cond, &timer_mutex, &timeout);
		switch (err) {
		case 0:
		case ETIMEDOUT:
			break;
		case ESRCH:
			/* strerror() replaces the deprecated sys_errlist[] lookup. */
			printf("%6d: err: ESRCH %s\n", getpid(), strerror(err));
			abort();
			/* NOTREACHED */
		default:
			printf("%6d: err: %d = %s\n", getpid(), err, strerror(err));
			abort();
			/* NOTREACHED */
		}
	} while (running);
	pthread_mutex_unlock(&timer_mutex);
	return NULL;
}

/*
 * Reset the timer state: clear the `running` flag and (re)initialise
 * timer_mutex and timer_cond with default attributes.  The return
 * values of the init calls are deliberately not examined.
 */
static void
init_timer(void)
{
	running = 0;
	(void)pthread_mutex_init(&timer_mutex, NULL);
	(void)pthread_cond_init(&timer_cond, NULL);
}

/*
 * Announce the start, set `running` and create the timer thread while
 * holding timer_mutex.  If the thread was created, block on timer_cond
 * until it is signalled (timer() signals it right after taking the
 * mutex).  If creation failed, no wait is performed.
 */
static void
start_timer(void)
{
	int rc;

	printf("%6d: starting timer\n", getpid());

	pthread_mutex_lock(&timer_mutex);
	running = 1;
	rc = pthread_create(&timer_thread, NULL, timer, NULL);
	if (rc == 0)
		pthread_cond_wait(&timer_cond, &timer_mutex);
	pthread_mutex_unlock(&timer_mutex);
}

/*
 * Ask the timer thread to finish and wait for it.
 *
 * Clears `running` and signals timer_cond under timer_mutex so that
 * timer() leaves its loop, then joins timer_thread.  The thread's
 * return value is discarded.
 */
static void
stop_timer(void)
{
	printf("%6d: stopping timer\n", getpid());
	pthread_mutex_lock(&timer_mutex);
	running = 0;
	pthread_cond_signal(&timer_cond);
	pthread_mutex_unlock(&timer_mutex);
	/* Join only after dropping the mutex so timer() can unlock and exit. */
	pthread_join(timer_thread, NULL);
}

/*
 * Core of the reproduction: stop the timer thread, fork() from the
 * calling (non-main) thread, and in both parent and child print
 * _lwp_self() next to the pt_lid stored in the pthread descriptor.
 *
 * In the child a mismatch between the two values is reported, the
 * synchronisation objects are re-initialised and a new timer thread is
 * started.  In the parent the timer is simply restarted.
 *
 * Aborts the process if fork() fails.  Ends by waiting on `cond`, which
 * nothing in this program signals.
 *
 * NOTE(review): `mutex` is not locked by this thread when
 * pthread_cond_wait() is called, which POSIX leaves undefined.  It is
 * left untouched here so the program keeps behaving as reported.
 */
static void
my_fork(void)
{
	pid_t pid;
	struct __pthread_st *ptst;

	show_lwps();
	stop_timer();
	show_lwps();

	printf("%6d: fork() in thirdthread(_lwp_self()=%d)\n", getpid(), _lwp_self());
	pid = fork();
	if (pid < 0)
		abort();

	/* View of this thread's descriptor, used to read pt_lid below. */
	ptst = (struct __pthread_st *)pthread_self();

	if (pid == 0) {
		/* child process */
		printf("%6d: CHILD _lwp_self()=%d pthread_self()->pt_lid=%d\n",
				getpid(),_lwp_self(),
				ptst->pt_lid);
		if (_lwp_self() != ptst->pt_lid) {
			printf("%6d: CHILD's self->pt_lid is wrong!\n", getpid());
		}
		/* ((struct __pthread_st *)pthread_self())->pt_lid = _lwp_self(); */
		show_lwps();

		pthread_mutex_init(&mutex, NULL);
		pthread_cond_init(&cond, NULL);
		init_timer();
		start_timer();
	} else {
		/* parent process */
		printf("%6d: PARENT _lwp_self()=%d pthread_self()->pt_lid=%d\n",
				getpid(),_lwp_self(),
				ptst->pt_lid);
		show_lwps();
		start_timer();
		show_lwps();
	}
	pthread_cond_wait(&cond, &mutex);
}

/*
 * Entry point of the thread created by main(): announces itself, runs
 * my_fork() and then terminates the whole process with status 0.
 * arg is not used.
 */
static void *
thirdthread(void *arg)
{
	(void)arg;

	printf("%6d: starting thirdthread\n", getpid());
	my_fork();
	exit(EXIT_SUCCESS);
	/* NOTREACHED */
	return NULL;
}

/*
 * Program entry: start the timer thread, initialise mutex/cond, create
 * the thread running thirdthread() and, if that succeeded, wait on
 * `cond`.  Afterwards sleep for one second and return 0.
 *
 * NOTE(review): `mutex` is not locked before pthread_cond_wait() is
 * called here; this is how the program was reported.
 */
int
main(void)
{
	int rc;

	init_timer();
	start_timer();

	pthread_mutex_init(&mutex, NULL);
	pthread_cond_init(&cond, NULL);

	rc = pthread_create(&thread, NULL, thirdthread, NULL);
	if (rc == 0) {
		pthread_cond_wait(&cond, &mutex);
	}

	sleep(1);
	return 0;
}


It then produces the following result:
 16558: starting timer
 16558: zzz...
 16558: starting thirdthread
 16558: lwps:  0  0  0 -1 
 16558: stopping timer
 16558: lwps:  0 -1  0 -1 
 16558: fork() in thirdthread(_lwp_self()=3)
 16558: PARENT _lwp_self()=3 pthread_self()->pt_lid=3
  5661: CHILD _lwp_self()=1 pthread_self()->pt_lid=3
 16558: lwps:  0 -1  0 -1 
  5661: CHILD's self->pt_lid is wrong!
 16558: starting timer
  5661: lwps:  0 -1 -1 -1 
  5661: starting timer
 16558: zzz...
  5661: zzz...
 16558: lwps:  0 -1  0  0 
  5661: err: ESRCH No such process

The last error message appears because _lwp_park is called with the wrong argument:
  2570      2 a.out    CALL  _lwp_park(0xbb7ffd94,3,0x804a348,0x804a348)
  2570      2 a.out    RET   _lwp_park -1 errno 3 No such process

A workaround for the above program is to uncomment this line:
/* ((struct __pthread_st *)pthread_self())->pt_lid = _lwp_self(); */
That line resets pt_lid to the child's actual LWP ID, after which the program works.

A fix for NetBSD may be the following, although it may need more initialization code.

Index: lib/libpthread/pthread.c
===================================================================
RCS file: /cvsroot/src/lib/libpthread/pthread.c,v
retrieving revision 1.106.2.2
diff -u -p -r1.106.2.2 pthread.c
--- lib/libpthread/pthread.c	11 Jan 2010 00:47:29 -0000	1.106.2.2
+++ lib/libpthread/pthread.c	8 Feb 2010 22:59:19 -0000
@@ -261,6 +261,8 @@ pthread__child_callback(void)
 	 * much. Anything that permits some pthread_* calls to work is
 	 * merely being polite.
 	 */
+	struct __pthread_st *self = pthread_self();
+	self->pt_lid = _lwp_self();
 	pthread__started = 0;
 }

>How-To-Repeat:

>Fix:

>Release-Note:

>Audit-Trail:

State-Changed-From-To: open->feedback
State-Changed-By: pooka@NetBSD.org
State-Changed-When: Wed, 03 Nov 2010 00:28:33 +0300
State-Changed-Why:
this should be fixed in -current and in 5.1.  ok to close?


From: "NARUSE, Yui" <naruse@airemix.jp>
To: gnats-bugs@NetBSD.org
Cc: pooka@NetBSD.org, kern-bug-people@netbsd.org, netbsd-bugs@netbsd.org,
        gnats-admin@netbsd.org
Subject: Re: kern/42772 (fork from other than the main thread causes wrong
 pthread condition)
Date: Wed, 03 Nov 2010 17:30:04 +0900

 (2010/11/03 6:28), pooka@NetBSD.org wrote:
 > Synopsis: fork from other than the main thread causes wrong pthread condition
 > 
 > State-Changed-From-To: open->feedback
 > State-Changed-By: pooka@NetBSD.org
 > State-Changed-When: Wed, 03 Nov 2010 00:28:33 +0300
 > State-Changed-Why:
 > this should be fixed in -current and in 5.1.  ok to close?

 5.0 is also affected by this.
 Once 5.0 is fixed, or if it is decided not to backport the fix to 5.0, I'm OK with closing.

 Thanks,

 -- 
 NARUSE, Yui  <naruse@airemix.jp>

State-Changed-From-To: feedback->closed
State-Changed-By: pooka@NetBSD.org
State-Changed-When: Wed, 03 Nov 2010 15:30:51 +0300
State-Changed-Why:
release engineering has stated 5.0.x is only for critical security
fixes and this fix will not be available in 5.0.


>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.39 2013/11/01 18:47:49 spz Exp $
$NetBSD: gnats_config.sh,v 1.8 2006/05/07 09:23:38 tsutsui Exp $
Copyright © 1994-2007 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.