NetBSD Problem Report #45426

From woods@once.weird.com  Tue Oct  4 20:16:50 2011
Return-Path: <woods@once.weird.com>
Received: from mail.netbsd.org (mail.netbsd.org [204.152.190.11])
	by www.NetBSD.org (Postfix) with ESMTP id 406DE63B85F
	for <gnats-bugs@gnats.NetBSD.org>; Tue,  4 Oct 2011 20:16:50 +0000 (UTC)
Message-Id: <m1RBBPx-001EBgC@once.weird.com>
Date: Tue, 4 Oct 2011 13:16:45 -0700 (PDT)
From: "Greg A. Woods" <woods@planix.com>
Sender: "Greg A. Woods" <woods@once.weird.com>
Reply-To: "Greg A. Woods" <woods@planix.com>
To: gnats-bugs@gnats.NetBSD.org
Subject: better logging for core dumps and process termination, including full core pathname
X-Send-Pr-Version: 3.95

>Number:         45426
>Category:       kern
>Synopsis:       better logging for core dumps and process termination, including full core pathname
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    kern-bug-people
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Tue Oct 04 20:20:00 +0000 2011
>Last-Modified:  Fri Nov 20 10:28:26 +0000 2020
>Originator:     Greg A. Woods
>Release:        NetBSD-current 2011/10/04
>Organization:
Planix, Inc.; Kelowna, BC; Canada
>Environment:
System: NetBSD
Architecture: all
Machine: all
>Description:

	I had a very strong desire for the kernel to log where and when
	it left core droppings from aborted processes, and also for it
	to be quite a bit more intelligent and intelligible about why a
	core dump might have failed.

>How-To-Repeat:

>Fix:

	(untested, except in netbsd-5 and prior where it is well tested)

Index: sys/kern/sys_process.c
===================================================================
RCS file: /cvs/master/m-NetBSD/main/src/sys/kern/sys_process.c,v
retrieving revision 1.162
diff -u -r1.162 sys_process.c
--- sys/kern/sys_process.c	6 Sep 2011 11:22:42 -0000	1.162
+++ sys/kern/sys_process.c	4 Oct 2011 18:06:42 -0000
@@ -130,6 +130,7 @@
 #include <sys/ptrace.h>
 #include <sys/uio.h>
 #include <sys/ras.h>
+#include <sys/syslog.h>
 #include <sys/kmem.h>
 #include <sys/kauth.h>
 #include <sys/mount.h>
@@ -563,6 +564,10 @@
 		break;

 	case  PT_DUMPCORE:
+	{
+		const char     *cdwhy;
+		int		cdlp = LOG_DEBUG;
+
 		if ((path = SCARG(uap, addr)) != NULL) {
 			char *dst;
 			len = SCARG(uap, data);
@@ -578,11 +583,62 @@
 			}
 			path = dst;
 			path[len] = '\0';
+		} else {
+			error = EINVAL;
+			break;
 		}
-		error = (*coredump_vec)(lt, path);
+		error = (*coredump_vec)(lt, path, NULL);
+		/*
+		 * XXX this logging code is much the same as the logging code
+		 * in kern_sig.c:sigexit() -- perhaps it should be in coredump()
+		 */
+		switch (error) {
+		case 0:
+			cdwhy = "dumped core";
+			cdlp = LOG_DEBUG;
+			break;
+		case EAUTH:
+			cdwhy = "was set-id, core dump not permitted";
+			cdlp = LOG_ERR;
+			break;
+		case EFBIG:
+			cdwhy = "core dump would exceed rlimit";
+			cdlp = LOG_NOTICE;
+			break;
+		case EINVAL:
+			cdwhy = "core dump not permitted over non-regular file";
+			cdlp = LOG_WARNING;
+			break;
+		case ENXIO:
+			cdwhy = "filesystem mount flag prevented core dump";
+			cdlp = LOG_NOTICE;
+			break;
+		case EPERM:
+			cdwhy = "core dump not permitted";
+			cdlp = LOG_WARNING;
+			break;
+		default:
+			/* XXX too bad strerror() isn't in the kernel.... */
+			cdwhy = "core dump failed";
+			cdlp = LOG_NOTICE;
+			break;
+		}
+		log(cdlp, "%s: pid %d [eid %d:%d, rid %d:%d, svid %d:%d] PT_DUMPCORE: %s [in %s]\n",
+		    p->p_comm,
+		    p->p_pid,
+		    kauth_cred_geteuid(l->l_cred),
+		    kauth_cred_getegid(l->l_cred),
+		    kauth_cred_getuid(l->l_cred),
+		    kauth_cred_getgid(l->l_cred),
+		    kauth_cred_getsvuid(l->l_cred),
+		    kauth_cred_getsvgid(l->l_cred),
+		    cdwhy,
+		    path);
+		free(path, M_TEMP);
 		if (path)
 			kmem_free(path, len + 1);
 		break;
+	}

 #ifdef PT_STEP
 	case  PT_STEP:
Index: sys/kern/kern_sig.c
===================================================================
RCS file: /cvs/master/m-NetBSD/main/src/sys/kern/kern_sig.c,v
retrieving revision 1.316
diff -u -r1.316 kern_sig.c
--- sys/kern/kern_sig.c	16 Sep 2011 22:07:17 -0000	1.316
+++ sys/kern/kern_sig.c	4 Oct 2011 18:03:41 -0000
@@ -95,6 +95,8 @@
 #include <sys/savar.h>
 #include <sys/exec.h>
 #include <sys/kauth.h>
+#include <sys/mount.h>
+#include <sys/syscallargs.h>
 #include <sys/acct.h>
 #include <sys/callout.h>
 #include <sys/atomic.h>
@@ -170,11 +172,6 @@
 int	kern_logsigexit = 0;
 #endif

-static const char logcoredump[] =
-    "pid %d (%s), uid %d: exited on signal %d (core dumped)\n";
-static const char lognocoredump[] =
-    "pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n";
-
 static kauth_listener_t signal_listener;

 static int
@@ -1901,14 +1898,11 @@
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
-#ifdef DIAGNOSTIC
 				/*
-				 * Are you sure you want to ignore SIGSEGV
-				 * in init? XXX
+				 * warn of SIGSEGV in init!
 				 */
-				printf_nolog("Process (pid %d) got sig %d\n",
+				printf("Process (pid %d) got sig %d\n",
 				    p->p_pid, signo);
-#endif
 				continue;
 			}

@@ -2120,8 +2114,21 @@

 	KASSERT(mutex_owned(proc_lock));

-	log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
-	uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why);
+	/* XXX could log cwd here too, but maybe not much use... */
+	/* XXX should use lesser priority for "orphaned, traced, process." and maybe others */
+	/* XXX should use PRIdUID if it is ever provided */
+	log(LOG_ERR, "%s: pid %d [eid %d:%d, rid %d:%d, svid %d:%d] was killed: %s\n",
+	    p->p_comm,
+	    p->p_pid,
+	    kauth_cred_geteuid(p->p_cred),
+	    kauth_cred_getegid(p->p_cred),
+	    kauth_cred_getuid(p->p_cred),
+	    kauth_cred_getgid(p->p_cred),
+	    kauth_cred_getsvuid(p->p_cred),
+	    kauth_cred_getsvgid(p->p_cred),
+	    why);
+	uprintf_locked("sorry, pid %d, command %s, was killed: %s\n", p->p_pid, p->p_comm, why);
+
 	psignal(p, SIGKILL);
 }

@@ -2206,20 +2213,107 @@
 	p->p_sigctx.ps_signo = signo;

 	if (docore) {
+		/* XXX should probably use PNBUF_GET() for these! */
+		char		cdcwdbuf[MAXPATHLEN]; /* XXX too bad we don't have asprintf() in the kernel */
+		char		corename[MAXPATHLEN];
+		char           *cdcwdbp;
+		const char     *cdwhy;
+		int		cdlp = LOG_DEBUG;
+
 		mutex_exit(p->p_lock);
-		if ((error = (*coredump_vec)(l, NULL)) == 0)
+		if ((error = (*coredump_vec)(l, NULL, corename)) == 0)
 			exitsig |= WCOREFLAG;

+		switch (error) {
+		case 0:
+			cdwhy = "dumped core";
+			cdlp = LOG_DEBUG;
+			break;
+		case EAUTH:
+			cdwhy = "was set-id, core dump not permitted";
+			cdlp = LOG_ERR;
+			break;
+		case EFBIG:
+			cdwhy = "core dump would exceed rlimit";
+			cdlp = LOG_NOTICE;
+			break;
+		case EINVAL:
+			cdwhy = "core dump not permitted over non-regular file";
+			cdlp = LOG_WARNING;
+			break;
+		case ENXIO:
+			cdwhy = "filesystem mount flag prevented core dump";
+			cdlp = LOG_NOTICE;
+			break;
+		case EPERM:
+			cdwhy = "core dump not permitted";
+			cdlp = LOG_WARNING;
+			break;
+		default:
+			/* XXX too bad strerror() isn't in the kernel.... */
+			cdwhy = "core dump failed";
+			cdlp = LOG_NOTICE;
+			break;
+		}
+		cdcwdbp = &cdcwdbuf[MAXPATHLEN];
+		*(--cdcwdbp) = '\0';
+		/*
+		 * getcwd_common() is a bit of a hack -- but it's better than
+		 * nothing!  It comes from vfs_getcwd.c.  (put in getcwd_common(9))
+		 *
+		 * The first arg is a pointer to the process' cwd vnode.
+		 *
+		 * The second arg is always a NULL vnode pointer.
+		 *
+		 * The name is placed in the *end* of the buffer which starts
+		 * at the pointer passed in the 4th arg.  The pointer passed in
+		 * by reference in the 3rd arg has to point at the end of the
+		 * buffer and it is is moved back to point at the beginning of
+		 * the resulting pathname.  The buffer has to be NUL
+		 * terminated(?).
+		 *
+		 * The 5th argument here is "max number of vnodes to traverse".
+		 * Since each entry takes up at least 2 bytes in the output
+		 * buffer, limit it to N/2 vnodes for an N byte buffer.
+		 * 
+		 * The 6th arg is for flags (always 0 here).
+		 *
+		 * The 7th arg is a pointer to the lwp structure.
+		 */
+		if (getcwd_common(p->p_cwdi->cwdi_cdir,
+				  (struct vnode *) NULL, &cdcwdbp,
+				  cdcwdbuf, sizeof(cdcwdbuf)/2, 0, l))
+			cdcwdbp = __UNCONST("(error getting process cwd)");
+
 		if (kern_logsigexit) {
-			int uid = l->l_cred ?
-			    (int)kauth_cred_geteuid(l->l_cred) : -1;
+			kauth_cred_t	cred;

-			if (error)
-				log(LOG_INFO, lognocoredump, p->p_pid,
-				    p->p_comm, uid, signo, error);
-			else
-				log(LOG_INFO, logcoredump, p->p_pid,
-				    p->p_comm, uid, signo);
+			cred = l->l_cred;
+			/* XXX should use PRIdUID if it is ever provided */
+			log(cdlp, "%s: pid %d [eid %d:%d, rid %d:%d, svid %d:%d] exited on signal %d: %s [in %s%s%s]\n",
+			    p->p_comm,
+			    p->p_pid,
+			    cred ? kauth_cred_geteuid(cred) : -1,
+			    cred ? kauth_cred_getegid(cred) : -1,
+			    cred ? kauth_cred_getuid(cred) : -1,
+			    cred ? kauth_cred_getgid(cred) : -1,
+			    cred ? kauth_cred_getsvuid(cred) : -1,
+			    cred ? kauth_cred_getsvgid(cred) : -1,
+			    signo, /* XXX too bad strsignal() isn't in the kernel.... */
+			    cdwhy,
+			    (error != 0 || (corename && *corename && *corename != '/')) ? cdcwdbp : "",
+			    (error == 0 && *corename != '/') ? "/" : "",
+			    (error == 0) ? corename : "");
+		}
+		if (cdlp != LOG_DEBUG) { /* tell the user of really abnormal events */
+			uprintf("%s: pid %d exited on signal %d: %s [in %s%s%s]\n",
+				p->p_comm,
+				p->p_pid,
+				signo, /* XXX strsignal() */
+				cdwhy,
+				(error != 0 || (corename && *corename && *corename != '/')) ? cdcwdbp : "",
+				(error == 0 && *corename != '/') ? "/" : "",
+				(error == 0) ? corename : "");
 		}

 #ifdef PAX_SEGVGUARD
Index: sys/kern/kern_core.c
===================================================================
RCS file: /cvs/master/m-NetBSD/main/src/sys/kern/kern_core.c,v
retrieving revision 1.20
diff -u -r1.20 kern_core.c
--- sys/kern/kern_core.c	24 Sep 2011 22:53:50 -0000	1.20
+++ sys/kern/kern_core.c	4 Oct 2011 17:53:46 -0000
@@ -60,7 +60,7 @@
 	off_t io_offset;
 };

-static int	coredump(struct lwp *, const char *);
+static int	coredump(struct lwp *, const char *, char *);
 static int	coredump_buildname(struct proc *, char *, const char *, size_t);

 static int
@@ -90,7 +90,7 @@
  * value of shortcorename), unless the process was setuid/setgid.
  */
 static int
-coredump(struct lwp *l, const char *pattern)
+coredump(struct lwp *l, const char *pattern, char *corename)
 {
 	struct vnode		*vp;
 	struct proc		*p;
@@ -105,6 +105,10 @@
 	char			*name, *lastslash;

 	name = PNBUF_GET();
+	if (name == NULL) {
+		error = ENOMEM;
+		goto done;
+	}

 	p = l->l_proc;
 	vm = p->p_vmspace;
@@ -119,7 +123,7 @@
 	 */
 	if (USPACE + ctob(vm->vm_dsize + vm->vm_ssize) >=
 	    p->p_rlimit[RLIMIT_CORE].rlim_cur) {
-		error = EFBIG;		/* better error code? */
+		error = EFBIG;
 		mutex_exit(p->p_lock);
 		mutex_exit(proc_lock);
 		goto done;
@@ -138,12 +142,12 @@
 	 */
 	if (p->p_flag & PK_SUGID) {
 		if (!security_setidcore_dump) {
-			error = EPERM;
+			error = EAUTH;
 			mutex_exit(p->p_lock);
 			mutex_exit(proc_lock);
 			goto done;
 		}
-		pattern = security_setidcore_path;
+		pattern = security_setidcore_path; /* is permitted to be NULL */
 	}

 	/* Lock, as p_limit and pl_corename might change. */
@@ -152,6 +156,7 @@
 	if (pattern == NULL) {
 		pattern = lim->pl_corename;
 	}
+	KASSERT(pattern); /* otherwise name is "", what will vn_open() do? */
 	error = coredump_buildname(p, name, pattern, MAXPATHLEN);
 	mutex_exit(&lim->pl_lock);

@@ -254,8 +259,11 @@
 	if (error == 0)
 		error = error1;
 done:
-	if (name != NULL)
+	if (name != NULL) {
+		if (corename)
+			strcpy(corename, name);
 		PNBUF_PUT(name);
+	}
 	return error;
 }


>Release-Note:

>Audit-Trail:

Responsible-Changed-From-To: kern-bug-people->kamil
Responsible-Changed-By: kamil@NetBSD.org
Responsible-Changed-When: Mon, 06 Nov 2017 05:27:49 +0100
Responsible-Changed-Why:
Take.


Responsible-Changed-From-To: kamil->kern-bug-people
Responsible-Changed-By: kamil@NetBSD.org
Responsible-Changed-When: Fri, 20 Nov 2020 11:28:26 +0100
Responsible-Changed-Why:
Reassign to kern-bug-people.


>Unformatted:

NetBSD Home
NetBSD PR Database Search

(Contact us) $NetBSD: query-full-pr,v 1.46 2020/01/03 16:35:01 leot Exp $
$NetBSD: gnats_config.sh,v 1.9 2014/08/02 14:16:04 spz Exp $
Copyright © 1994-2020 The NetBSD Foundation, Inc. ALL RIGHTS RESERVED.