From 5d8de293c224896a4da99763fce4f9794308caf4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Apr 2022 14:37:59 -0700
Subject: vmcore: convert copy_oldmem_page() to take an iov_iter

Patch series "Convert vmcore to use an iov_iter", v5.

For some reason several people have been sending bad patches to fix
compiler warnings in vmcore recently.  Here's how it should be done.
Compile-tested only on x86.  As noted in the first patch, s390 should take
this conversion a bit further, but I'm not inclined to do that work
myself.


This patch (of 3):

Instead of passing in a 'buf' and 'userbuf' argument, pass in an iov_iter.
s390 needs more work to pass the iov_iter down further, or refactor, but
I'd be more comfortable if someone who can test on s390 did that work.

It's more convenient to convert the whole of read_from_oldmem() to take an
iov_iter at the same time, so rename it to read_from_oldmem_iter() and add
a temporary read_from_oldmem() wrapper that creates an iov_iter.

Link: https://lkml.kernel.org/r/20220408090636.560886-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20220408090636.560886-2-bhe@redhat.com
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_dump.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 620821549b23..a1cf7d5c03c7 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -24,11 +24,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
 				  unsigned long from, unsigned long pfn,
 				  unsigned long size, pgprot_t prot);
 
-extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
-						unsigned long, int);
-extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
-					  size_t csize, unsigned long offset,
-					  int userbuf);
+ssize_t copy_oldmem_page(struct iov_iter *i, unsigned long pfn, size_t csize,
+		unsigned long offset);
+ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn,
+				   size_t csize, unsigned long offset);
 
 void vmcore_cleanup(void);
 
-- 
cgit v1.2.3


From e0690479917cbce740eef51fa3de92c69647a5ad Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 29 Apr 2022 14:37:59 -0700
Subject: vmcore: convert read_from_oldmem() to take an iov_iter

Remove the read_from_oldmem() wrapper introduced earlier and convert all
the remaining callers to pass an iov_iter.

Link: https://lkml.kernel.org/r/20220408090636.560886-4-bhe@redhat.com
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Amit Daniel Kachhap <amit.kachhap@arm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/crash_dump_64.c |  7 ++++++-
 fs/proc/vmcore.c                | 40 +++++++++++++++-------------------------
 include/linux/crash_dump.h      | 10 ++++------
 3 files changed, 25 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 94fe4aff9694..e75bc2f217ff 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -54,6 +54,11 @@ ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn,
 
 ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0,
+	struct kvec kvec = { .iov_base = buf, .iov_len = count };
+	struct iov_iter iter;
+
+	iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+	return read_from_oldmem(&iter, count, ppos,
 				cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
 }
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 4a721865b5cd..4eaeb645e759 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -129,7 +129,7 @@ static int open_vmcore(struct inode *inode, struct file *file)
 }
 
 /* Reads a page from the oldmem device from given offset. */
-static ssize_t read_from_oldmem_iter(struct iov_iter *iter, size_t count,
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
 			 u64 *ppos, bool encrypted)
 {
 	unsigned long pfn, offset;
@@ -178,27 +178,6 @@ static ssize_t read_from_oldmem_iter(struct iov_iter *iter, size_t count,
 	return read;
 }
 
-ssize_t read_from_oldmem(char *buf, size_t count,
-			 u64 *ppos, int userbuf,
-			 bool encrypted)
-{
-	struct iov_iter iter;
-	struct iovec iov;
-	struct kvec kvec;
-
-	if (userbuf) {
-		iov.iov_base = (__force void __user *)buf;
-		iov.iov_len = count;
-		iov_iter_init(&iter, READ, &iov, 1, count);
-	} else {
-		kvec.iov_base = buf;
-		kvec.iov_len = count;
-		iov_iter_kvec(&iter, READ, &kvec, 1, count);
-	}
-
-	return read_from_oldmem_iter(&iter, count, ppos, encrypted);
-}
-
 /*
  * Architectures may override this function to allocate ELF header in 2nd kernel
  */
@@ -218,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0, false);
+	struct kvec kvec = { .iov_base = buf, .iov_len = count };
+	struct iov_iter iter;
+
+	iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+	return read_from_oldmem(&iter, count, ppos, false);
 }
 
 /*
@@ -226,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
  */
 ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT));
+	struct kvec kvec = { .iov_base = buf, .iov_len = count };
+	struct iov_iter iter;
+
+	iov_iter_kvec(&iter, READ, &kvec, 1, count);
+
+	return read_from_oldmem(&iter, count, ppos,
+			cc_platform_has(CC_ATTR_MEM_ENCRYPT));
 }
 
 /*
@@ -402,7 +392,7 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos)
 					    m->offset + m->size - *fpos,
 					    iov_iter_count(iter));
 			start = m->paddr + *fpos - m->offset;
-			tmp = read_from_oldmem_iter(iter, tsz, &start,
+			tmp = read_from_oldmem(iter, tsz, &start,
 					cc_platform_has(CC_ATTR_MEM_ENCRYPT));
 			if (tmp < 0)
 				return tmp;
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index a1cf7d5c03c7..0f3a656293b0 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -134,13 +134,11 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
 #ifdef CONFIG_PROC_VMCORE
-ssize_t read_from_oldmem(char *buf, size_t count,
-			 u64 *ppos, int userbuf,
-			 bool encrypted);
+ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+			 u64 *ppos, bool encrypted);
 #else
-static inline ssize_t read_from_oldmem(char *buf, size_t count,
-				       u64 *ppos, int userbuf,
-				       bool encrypted)
+static inline ssize_t read_from_oldmem(struct iov_iter *iter, size_t count,
+				       u64 *ppos, bool encrypted)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From f485922d8fe4e44f6d52a5bb95a603b7c65554bb Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Fri, 29 Apr 2022 14:38:01 -0700
Subject: pipe: make poll_usage boolean and annotate its access

Patch series "Fix data-races around epoll reported by KCSAN."

This series suppresses a false-positive KCSAN report and fixes a real
data-race.


This patch (of 2):

pipe_poll() runs locklessly and assigns 1 to poll_usage.  Once poll_usage
is set to 1, it never changes in other places.  However, concurrent writes
of a value trigger KCSAN, so let's make KCSAN happy.

BUG: KCSAN: data-race in pipe_poll / pipe_poll

write to 0xffff8880042f6678 of 4 bytes by task 174 on cpu 3:
 pipe_poll (fs/pipe.c:656)
 ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853)
 do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234)
 __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241)
 do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
 entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

write to 0xffff8880042f6678 of 4 bytes by task 177 on cpu 1:
 pipe_poll (fs/pipe.c:656)
 ep_item_poll.isra.0 (./include/linux/poll.h:88 fs/eventpoll.c:853)
 do_epoll_wait (fs/eventpoll.c:1692 fs/eventpoll.c:1806 fs/eventpoll.c:2234)
 __x64_sys_epoll_wait (fs/eventpoll.c:2246 fs/eventpoll.c:2241 fs/eventpoll.c:2241)
 do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
 entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 177 Comm: epoll_race Not tainted 5.17.0-58927-gf443e374ae13 #6
Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014

Link: https://lkml.kernel.org/r/20220322002653.33865-1-kuniyu@amazon.co.jp
Link: https://lkml.kernel.org/r/20220322002653.33865-2-kuniyu@amazon.co.jp
Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Kuniyuki Iwashima <kuni1840@gmail.com>
Cc: "Soheil Hassas Yeganeh" <soheil@google.com>
Cc: "Sridhar Samudrala" <sridhar.samudrala@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/pipe.c                 | 2 +-
 include/linux/pipe_fs_i.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index e140ea150bbb..d04c3fce28a6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait)
 	unsigned int head, tail;
 
 	/* Epoll has some historical nasty semantics, this enables them */
-	pipe->poll_usage = 1;
+	WRITE_ONCE(pipe->poll_usage, true);
 
 	/*
 	 * Reading pipe state only -- no need for acquiring the semaphore.
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index c00c618ef290..cb0fd633a610 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -71,7 +71,7 @@ struct pipe_inode_info {
 	unsigned int files;
 	unsigned int r_counter;
 	unsigned int w_counter;
-	unsigned int poll_usage;
+	bool poll_usage;
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
-- 
cgit v1.2.3


From d679ae94fdd5d3ab00c35078f5af5f37e068b03d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Date: Fri, 29 Apr 2022 14:38:01 -0700
Subject: list: fix a data-race around ep->rdllist

ep_poll() first calls ep_events_available() with no lock held and checks
if ep->rdllist is empty by list_empty_careful(), which reads
rdllist->prev.  Thus all accesses to it need some protection to avoid
store/load-tearing.

Note INIT_LIST_HEAD_RCU() already has the annotation for both prev
and next.

Commit bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket
fds.") added the first lockless ep_events_available(), and commit
c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")
made some ep_events_available() calls lockless and added a single call
under a lock; finally, commit e59d3c64cba6 ("epoll: eliminate unnecessary
lock for zero timeout") made the last ep_events_available() lockless.

BUG: KCSAN: data-race in do_epoll_wait / do_epoll_wait

write to 0xffff88810480c7d8 of 8 bytes by task 1802 on cpu 0:
 INIT_LIST_HEAD include/linux/list.h:38 [inline]
 list_splice_init include/linux/list.h:492 [inline]
 ep_start_scan fs/eventpoll.c:622 [inline]
 ep_send_events fs/eventpoll.c:1656 [inline]
 ep_poll fs/eventpoll.c:1806 [inline]
 do_epoll_wait+0x4eb/0xf40 fs/eventpoll.c:2234
 do_epoll_pwait fs/eventpoll.c:2268 [inline]
 __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline]
 __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275
 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88810480c7d8 of 8 bytes by task 1799 on cpu 1:
 list_empty_careful include/linux/list.h:329 [inline]
 ep_events_available fs/eventpoll.c:381 [inline]
 ep_poll fs/eventpoll.c:1797 [inline]
 do_epoll_wait+0x279/0xf40 fs/eventpoll.c:2234
 do_epoll_pwait fs/eventpoll.c:2268 [inline]
 __do_sys_epoll_pwait fs/eventpoll.c:2281 [inline]
 __se_sys_epoll_pwait+0x12b/0x240 fs/eventpoll.c:2275
 __x64_sys_epoll_pwait+0x74/0x80 fs/eventpoll.c:2275
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0xffff88810480c7d0 -> 0xffff888103c15098

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 1799 Comm: syz-fuzzer Tainted: G        W         5.17.0-rc7-syzkaller-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Link: https://lkml.kernel.org/r/20220322002653.33865-3-kuniyu@amazon.co.jp
Fixes: e59d3c64cba6 ("epoll: eliminate unnecessary lock for zero timeout")
Fixes: c5a282e9635e ("fs/epoll: reduce the scope of wq lock in epoll_wait()")
Fixes: bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Reported-by: syzbot+bdd6e38a1ed5ee58d8bd@syzkaller.appspotmail.com
Cc: Al Viro <viro@zeniv.linux.org.uk>, Andrew Morton <akpm@linux-foundation.org>
Cc: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Cc: Kuniyuki Iwashima <kuni1840@gmail.com>
Cc: "Soheil Hassas Yeganeh" <soheil@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Sridhar Samudrala" <sridhar.samudrala@intel.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/list.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index dd6c2041d09c..d7d2bfa1a365 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -35,7 +35,7 @@
 static inline void INIT_LIST_HEAD(struct list_head *list)
 {
 	WRITE_ONCE(list->next, list);
-	list->prev = list;
+	WRITE_ONCE(list->prev, list);
 }
 
 #ifdef CONFIG_DEBUG_LIST
@@ -306,7 +306,7 @@ static inline int list_empty(const struct list_head *head)
 static inline void list_del_init_careful(struct list_head *entry)
 {
 	__list_del_entry(entry);
-	entry->prev = entry;
+	WRITE_ONCE(entry->prev, entry);
 	smp_store_release(&entry->next, entry);
 }
 
@@ -326,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry)
 static inline int list_empty_careful(const struct list_head *head)
 {
 	struct list_head *next = smp_load_acquire(&head->next);
-	return list_is_head(next, head) && (next == head->prev);
+	return list_is_head(next, head) && (next == READ_ONCE(head->prev));
 }
 
 /**
-- 
cgit v1.2.3


From a9866bef5171c859cfabc1155c594d28f194aa23 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 29 Apr 2022 14:38:02 -0700
Subject: ptrace: fix wrong comment of PT_DTRACE

PT_DTRACE is only used on um now, fix the wrong comment.

Link: https://lkml.kernel.org/r/1649240981-11024-3-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ptrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 15b3d176b6b4..db4509587d2c 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -30,7 +30,7 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
 
 #define PT_SEIZED	0x00010000	/* SEIZE used, enable new behavior */
 #define PT_PTRACED	0x00000001
-#define PT_DTRACE	0x00000002	/* delayed trace (used on m68k, i386) */
+#define PT_DTRACE	0x00000002	/* delayed trace (used on um) */
 
 #define PT_OPT_FLAG_SHIFT	3
 /* PT_TRACE_* event enable flags */
-- 
cgit v1.2.3


From 0e0af57e0e91b304f36b7d1dba859e3c04094273 Mon Sep 17 00:00:00 2001
From: "Dr. Thomas Orgis" <thomas.orgis@uni-hamburg.de>
Date: Fri, 29 Apr 2022 14:38:03 -0700
Subject: taskstats: version 12 with thread group and exe info

The task exit struct needs some crucial information to be able to provide
an enhanced version of process and thread accounting.  This change
provides:

1. ac_tgid in addition to ac_pid
2. thread group execution walltime in ac_tgetime
3. flag AGROUP in ac_flag to indicate the last task
   in a thread group / process
4. device ID and inode of task's /proc/self/exe in
   ac_exe_dev and ac_exe_inode
5. tools/accounting/procacct as demonstrator

When a task exits, taskstats are reported to userspace including the
task's pid and ppid, but without the id of the thread group this task is
part of.  Without the tgid, the stats of single tasks cannot be correlated
to each other as a thread group (process).

The taskstats documentation suggests that on process exit a data set
consisting of accumulated stats for the whole group is produced.  But such
an additional set of stats is only produced for actually multithreaded
processes, not groups that had only one thread, and also those stats only
contain data about delay accounting and not the more basic information
about CPU and memory resource usage.  Adding the AGROUP flag to be set
when the last task of a group exited enables determination of process end
also for single-threaded processes.

My application basically does enhanced process accounting with summed
cputime, biggest maxrss, tasks per process.  The data is not available
with the traditional BSD process accounting (which is not designed to be
extensible) and the taskstats interface allows more efficient on-the-fly
grouping and summing of the stats, anyway, without intermediate disk
writes.

Furthermore, I do carry statistics on which exact program binary is used
how often with associated resources, getting a picture on how important
which parts of a collection of installed scientific software in different
versions are, and how well they put load on the machine.  This is enabled
by providing information on /proc/self/exe for each task.  I assume the
two 64-bit fields for device ID and inode are more appropriate than the
possibly large resolved path to keep the data volume down.

Add the tgid to the stats to complete task identification, the flag AGROUP
to mark the last task of a group, the group wallclock time, and
inode-based identification of the associated executable file.

Add tools/accounting/procacct.c as a simplified fork of getdelays.c to
demonstrate process and thread accounting.

[thomas.orgis@uni-hamburg.de: fix version number in comment]
  Link: https://lkml.kernel.org/r/20220405003601.7a5f6008@plasteblaster
Link: https://lkml.kernel.org/r/20220331004106.64e5616b@plasteblaster
Signed-off-by: Dr. Thomas Orgis <thomas.orgis@uni-hamburg.de>
Reviewed-by: Ismael Luceno <ismael@iodev.co.uk>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/acct.h      |   3 +-
 include/uapi/linux/taskstats.h |  24 ++-
 kernel/taskstats.c             |  23 +++
 kernel/tsacct.c                |  10 +-
 tools/accounting/.gitignore    |   1 +
 tools/accounting/Makefile      |   2 +-
 tools/accounting/procacct.c    | 417 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 473 insertions(+), 7 deletions(-)
 create mode 100644 tools/accounting/procacct.c

(limited to 'include')

diff --git a/include/uapi/linux/acct.h b/include/uapi/linux/acct.h
index 985b89068591..0e591152aa8a 100644
--- a/include/uapi/linux/acct.h
+++ b/include/uapi/linux/acct.h
@@ -103,12 +103,13 @@ struct acct_v3
 /*
  *  accounting flags
  */
-				/* bit set when the process ... */
+				/* bit set when the process/task ... */
 #define AFORK		0x01	/* ... executed fork, but did not exec */
 #define ASU		0x02	/* ... used super-user privileges */
 #define ACOMPAT		0x04	/* ... used compatibility mode (VAX only not used) */
 #define ACORE		0x08	/* ... dumped core */
 #define AXSIG		0x10	/* ... was killed by a signal */
+#define AGROUP		0x20	/* ... was the last task of the process (task group) */
 
 #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN)
 #define ACCT_BYTEORDER	0x80	/* accounting file is big endian */
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 12327d32378f..736154171489 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
  */
 
 
-#define TASKSTATS_VERSION	11
+#define TASKSTATS_VERSION	12
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -48,7 +48,8 @@ struct taskstats {
 	__u32	ac_exitcode;		/* Exit status */
 
 	/* The accounting flags of a task as defined in <linux/acct.h>
-	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+	 * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP.
+	 * (AGROUP since version 12).
 	 */
 	__u8	ac_flag;		/* Record flags */
 	__u8	ac_nice;		/* task_nice */
@@ -173,9 +174,26 @@ struct taskstats {
 	/* v10: 64-bit btime to avoid overflow */
 	__u64	ac_btime64;		/* 64-bit begin time */
 
-	/* Delay waiting for memory compact */
+	/* v11: Delay waiting for memory compact */
 	__u64	compact_count;
 	__u64	compact_delay_total;
+
+	/* v12 begin */
+	__u32   ac_tgid;	/* thread group ID */
+	/* Thread group walltime up to now. This is total process walltime if
+	 * AGROUP flag is set.
+	 */
+	__u64	ac_tgetime __attribute__((aligned(8)));
+	/* Lightweight information to identify process binary files.
+	 * This leaves userspace to match this to a file system path, using
+	 * MAJOR() and MINOR() macros to identify a device and mount point,
+	 * the inode to identify the executable file. This is /proc/self/exe
+	 * at the end, so matching the most recent exec(). Values are zero
+	 * for kernel threads.
+	 */
+	__u64   ac_exe_dev;     /* program binary device ID */
+	__u64   ac_exe_inode;   /* program binary inode number */
+	/* v12 end */
 };
 
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bcac5a9043aa..72415e22342b 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/tsacct_kern.h>
+#include <linux/acct.h>
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
@@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb,
 	up_write(&listeners->sem);
 }
 
+static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk)
+{
+	/* No idea if I'm allowed to access that here, now. */
+	struct file *exe_file = get_task_exe_file(tsk);
+
+	if (exe_file) {
+		/* Following cp_new_stat64() in stat.c . */
+		stats->ac_exe_dev =
+			huge_encode_dev(exe_file->f_inode->i_sb->s_dev);
+		stats->ac_exe_inode = exe_file->f_inode->i_ino;
+		fput(exe_file);
+	} else {
+		stats->ac_exe_dev = 0;
+		stats->ac_exe_inode = 0;
+	}
+}
+
 static void fill_stats(struct user_namespace *user_ns,
 		       struct pid_namespace *pid_ns,
 		       struct task_struct *tsk, struct taskstats *stats)
@@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns,
 
 	/* fill in extended acct fields */
 	xacct_add_tsk(stats, tsk);
+
+	/* add executable info */
+	exe_add_tsk(stats, tsk);
 }
 
 static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
@@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 		goto err;
 
 	fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);
+	if (group_dead)
+		stats->ac_flag |= AGROUP;
 
 	/*
 	 * Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 1d261fbe367b..4252f0645b9e 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 {
 	const struct cred *tcred;
 	u64 utime, stime, utimescaled, stimescaled;
-	u64 delta;
+	u64 now_ns, delta;
 	time64_t btime;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
 
 	/* calculate task elapsed time in nsec */
-	delta = ktime_get_ns() - tsk->start_time;
+	now_ns = ktime_get_ns();
+	/* store whole group time first */
+	delta = now_ns - tsk->group_leader->start_time;
 	/* Convert to micro seconds */
 	do_div(delta, NSEC_PER_USEC);
+	stats->ac_tgetime = delta;
+	delta = now_ns - tsk->start_time;
+	do_div(delta, NSEC_PER_USEC);
 	stats->ac_etime = delta;
 	/* Convert to seconds for btime (note y2106 limit) */
 	btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC);
@@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 	stats->ac_nice	 = task_nice(tsk);
 	stats->ac_sched	 = tsk->policy;
 	stats->ac_pid	 = task_pid_nr_ns(tsk, pid_ns);
+	stats->ac_tgid   = task_tgid_nr_ns(tsk, pid_ns);
 	rcu_read_lock();
 	tcred = __task_cred(tsk);
 	stats->ac_uid	 = from_kuid_munged(user_ns, tcred->uid);
diff --git a/tools/accounting/.gitignore b/tools/accounting/.gitignore
index c45fb4ed4309..522a690aaf3d 100644
--- a/tools/accounting/.gitignore
+++ b/tools/accounting/.gitignore
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 getdelays
+procacct
diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile
index 03687f19cbb1..11def1ad046c 100644
--- a/tools/accounting/Makefile
+++ b/tools/accounting/Makefile
@@ -2,7 +2,7 @@
 CC := $(CROSS_COMPILE)gcc
 CFLAGS := -I../../usr/include
 
-PROGS := getdelays
+PROGS := getdelays procacct
 
 all: $(PROGS)
 
diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c
new file mode 100644
index 000000000000..8353d3237e50
--- /dev/null
+++ b/tools/accounting/procacct.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0
+/* procacct.c
+ *
+ * Demonstrator of fetching resource data on task exit, as a way
+ * to accumulate accurate program resource usage statistics, without
+ * prior identification of the programs. For that, the fields for
+ * device and inode of the program executable binary file are also
+ * extracted in addition to the command string.
+ *
+ * The TGID together with the PID and the AGROUP flag allow
+ * identification of threads in a process and single-threaded processes.
+ * The ac_tgetime field gives proper whole-process walltime.
+ *
+ * Written (changed) by Thomas Orgis, University of Hamburg in 2022
+ *
+ * This is a cheap derivation (inheriting the style) of getdelays.c:
+ *
+ * Utility to get per-pid and per-tgid delay accounting statistics
+ * Also illustrates usage of the taskstats interface
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2005
+ * Copyright (C) Balbir Singh, IBM Corp. 2006
+ * Copyright (c) Jay Lan, SGI. 2006
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <poll.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#include <linux/genetlink.h>
+#include <linux/acct.h>
+#include <linux/taskstats.h>
+#include <linux/kdev_t.h>
+
+/*
+ * Generic macros for dealing with netlink sockets. Might be duplicated
+ * elsewhere. It is recommended that commercial grade applications use
+ * libnl or libnetlink and use the interfaces provided by the library
+ */
+#define GENLMSG_DATA(glh)	((void *)(NLMSG_DATA(glh) + GENL_HDRLEN))
+#define GENLMSG_PAYLOAD(glh)	(NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN)
+#define NLA_DATA(na)		((void *)((char *)(na) + NLA_HDRLEN))
+#define NLA_PAYLOAD(len)	(len - NLA_HDRLEN)
+
+#define err(code, fmt, arg...)			\
+	do {					\
+		fprintf(stderr, fmt, ##arg);	\
+		exit(code);			\
+	} while (0)
+
+int rcvbufsz;
+char name[100];
+int dbg;
+int print_delays;
+int print_io_accounting;
+int print_task_context_switch_counts;
+
+#define PRINTF(fmt, arg...) {			\
+		if (dbg) {			\
+			printf(fmt, ##arg);	\
+		}				\
+	}
+
+/* Maximum size of response requested or message sent */
+#define MAX_MSG_SIZE	1024
+/* Maximum number of cpus expected to be specified in a cpumask */
+#define MAX_CPUS	32
+
+struct msgtemplate {
+	struct nlmsghdr n;
+	struct genlmsghdr g;
+	char buf[MAX_MSG_SIZE];
+};
+
+char cpumask[100+6*MAX_CPUS];
+
+static void usage(void)
+{
+	fprintf(stderr, "procacct [-v] [-w logfile] [-r bufsize] [-m cpumask]\n");
+	fprintf(stderr, "  -v: debug on\n");
+}
+
+/*
+ * Create a raw netlink socket and bind
+ */
+static int create_nl_socket(int protocol)
+{
+	int fd;
+	struct sockaddr_nl local;
+
+	fd = socket(AF_NETLINK, SOCK_RAW, protocol);
+	if (fd < 0)
+		return -1;
+
+	if (rcvbufsz)
+		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
+				&rcvbufsz, sizeof(rcvbufsz)) < 0) {
+			fprintf(stderr, "Unable to set socket rcv buf size to %d\n",
+				rcvbufsz);
+			goto error;
+		}
+
+	memset(&local, 0, sizeof(local));
+	local.nl_family = AF_NETLINK;
+
+	if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0)
+		goto error;
+
+	return fd;
+error:
+	close(fd);
+	return -1;
+}
+
+
+static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid,
+	     __u8 genl_cmd, __u16 nla_type,
+	     void *nla_data, int nla_len)
+{
+	struct nlattr *na;
+	struct sockaddr_nl nladdr;
+	int r, buflen;
+	char *buf;
+
+	struct msgtemplate msg;
+
+	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+	msg.n.nlmsg_type = nlmsg_type;
+	msg.n.nlmsg_flags = NLM_F_REQUEST;
+	msg.n.nlmsg_seq = 0;
+	msg.n.nlmsg_pid = nlmsg_pid;
+	msg.g.cmd = genl_cmd;
+	msg.g.version = 0x1;
+	na = (struct nlattr *) GENLMSG_DATA(&msg);
+	na->nla_type = nla_type;
+	na->nla_len = nla_len + 1 + NLA_HDRLEN;
+	memcpy(NLA_DATA(na), nla_data, nla_len);
+	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);
+
+	buf = (char *) &msg;
+	buflen = msg.n.nlmsg_len;
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+	while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr,
+			   sizeof(nladdr))) < buflen) {
+		if (r > 0) {
+			buf += r;
+			buflen -= r;
+		} else if (errno != EAGAIN)
+			return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * Probe the controller in genetlink to find the family id
+ * for the TASKSTATS family
+ */
+static int get_family_id(int sd)
+{
+	struct {
+		struct nlmsghdr n;
+		struct genlmsghdr g;
+		char buf[256];
+	} ans;
+
+	int id = 0, rc;
+	struct nlattr *na;
+	int rep_len;
+
+	strcpy(name, TASKSTATS_GENL_NAME);
+	rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY,
+			CTRL_ATTR_FAMILY_NAME, (void *)name,
+			strlen(TASKSTATS_GENL_NAME)+1);
+	if (rc < 0)
+		return 0;	/* sendto() failure? */
+
+	rep_len = recv(sd, &ans, sizeof(ans), 0);
+	if (ans.n.nlmsg_type == NLMSG_ERROR ||
+	    (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len))
+		return 0;
+
+	na = (struct nlattr *) GENLMSG_DATA(&ans);
+	na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len));
+	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
+		id = *(__u16 *) NLA_DATA(na);
+
+	return id;
+}
+
+#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1))
+
+static void print_procacct(struct taskstats *t)
+{
+	/* First letter: T is a mere thread, P the last task of a process, ? unknown. */
+	printf(
+		"%c pid=%lu tgid=%lu uid=%lu wall=%llu gwall=%llu cpu=%llu vmpeak=%llu rsspeak=%llu dev=%lu:%lu inode=%llu comm=%s\n"
+	,	t->version >= 12 ? (t->ac_flag & AGROUP ? 'P' : 'T') : '?'
+	,	(unsigned long)t->ac_pid
+	,	(unsigned long)(t->version >= 12 ? t->ac_tgid : 0)
+	,	(unsigned long)t->ac_uid
+	,	(unsigned long long)t->ac_etime
+	,	(unsigned long long)(t->version >= 12 ? t->ac_tgetime : 0)
+	,	(unsigned long long)(t->ac_utime+t->ac_stime)
+	,	(unsigned long long)t->hiwater_vm
+	,	(unsigned long long)t->hiwater_rss
+	,	(unsigned long)(t->version >= 12 ? MAJOR(t->ac_exe_dev) : 0)
+	,	(unsigned long)(t->version >= 12 ? MINOR(t->ac_exe_dev) : 0)
+	,	(unsigned long long)(t->version >= 12 ? t->ac_exe_inode : 0)
+	,	t->ac_comm
+	);
+}
+
+/*
+ * Walk the attributes nested in AGGR attribute @na: print each STATS payload
+ * when @mother is TASKSTATS_TYPE_AGGR_PID and, if @fd != 0, write it to @fd.
+ */
+void handle_aggr(int mother, struct nlattr *na, int fd)
+{
+	int aggr_len = NLA_PAYLOAD(na->nla_len);
+	int len2 = 0;
+
+	na = (struct nlattr *) NLA_DATA(na);
+	while (len2 < aggr_len) {
+		switch (na->nla_type) {
+		case TASKSTATS_TYPE_PID:
+			PRINTF("PID\t%d\n", *(int *) NLA_DATA(na));
+			break;
+		case TASKSTATS_TYPE_TGID:
+			PRINTF("TGID\t%d\n", *(int *) NLA_DATA(na));
+			break;
+		case TASKSTATS_TYPE_STATS:
+			if (mother == TASKSTATS_TYPE_AGGR_PID)
+				print_procacct((struct taskstats *) NLA_DATA(na));
+			/* Write the payload only: nla_len also counts the header */
+			if (fd && write(fd, NLA_DATA(na),
+					NLA_PAYLOAD(na->nla_len)) < 0)
+				err(1, "write error\n");
+			break;
+		case TASKSTATS_TYPE_NULL:
+			break;
+		default:
+			fprintf(stderr, "Unknown nested nla_type %d\n",
+				na->nla_type);
+			break;
+		}
+		len2 += NLA_ALIGN(na->nla_len);
+		na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
+	}
+}
+
+/*
+ * Register a cpumask with the TASKSTATS genetlink family, then receive
+ * messages in a loop, passing every AGGR_PID/AGGR_TGID attribute to
+ * handle_aggr().  Options: -m <cpumask> (default "1"), -r <rcvbufsz>,
+ * -w <logfile> (its fd is handed to handle_aggr()), -v (debug output).
+ */
+int main(int argc, char *argv[])
+{
+	int c, rc, rep_len;
+	__u16 id;
+	__u32 mypid;
+
+	struct nlattr *na;
+	int nl_sd = -1;
+	int len = 0;
+
+	int fd = 0;
+	int write_file = 0;
+	int maskset = 0;
+	char *logfile = NULL;
+
+	struct msgtemplate msg;
+
+	for (;;) {
+		/* "w:" must be listed, else case 'w' below is unreachable */
+		c = getopt(argc, argv, "m:vr:w:");
+		if (c < 0)
+			break;
+
+		switch (c) {
+		case 'w':
+			logfile = strdup(optarg);
+			if (!logfile)
+				err(1, "strdup");
+			printf("write to file %s\n", logfile);
+			write_file = 1;
+			break;
+		case 'r':
+			rcvbufsz = atoi(optarg);
+			printf("receive buf size %d\n", rcvbufsz);
+			if (rcvbufsz < 0)
+				err(1, "Invalid rcv buf size\n");
+			break;
+		case 'm':
+			strncpy(cpumask, optarg, sizeof(cpumask));
+			cpumask[sizeof(cpumask) - 1] = '\0';
+			maskset = 1;
+			break;
+		case 'v':
+			printf("debug on\n");
+			dbg = 1;
+			break;
+		default:
+			usage();
+			exit(-1);
+		}
+	}
+	if (!maskset) {
+		maskset = 1;
+		strncpy(cpumask, "1", sizeof(cpumask));
+		cpumask[sizeof(cpumask) - 1] = '\0';
+	}
+	printf("cpumask %s maskset %d\n", cpumask, maskset);
+
+	if (write_file) {
+		fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+		if (fd == -1) {
+			perror("Cannot open output file\n");
+			exit(1);
+		}
+	}
+
+	nl_sd = create_nl_socket(NETLINK_GENERIC);
+	if (nl_sd < 0)
+		err(1, "error creating Netlink socket\n");
+
+	mypid = getpid();
+	id = get_family_id(nl_sd);
+	if (!id) {
+		fprintf(stderr, "Error getting family id, errno %d\n", errno);
+		goto err;
+	}
+	PRINTF("family id %d\n", id);
+
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_REGISTER_CPUMASK,
+			      &cpumask, strlen(cpumask) + 1);
+		PRINTF("Sent register cpumask, retval %d\n", rc);
+		if (rc < 0) {
+			fprintf(stderr, "error sending register cpumask\n");
+			goto err;
+		}
+	}
+
+	do {
+		rep_len = recv(nl_sd, &msg, sizeof(msg), 0);
+		PRINTF("received %d bytes\n", rep_len);
+
+		if (rep_len < 0) {
+			fprintf(stderr, "nonfatal reply error: errno %d\n",
+				errno);
+			continue;
+		}
+		if (msg.n.nlmsg_type == NLMSG_ERROR ||
+		    !NLMSG_OK((&msg.n), rep_len)) {
+			struct nlmsgerr *err = NLMSG_DATA(&msg);
+
+			fprintf(stderr, "fatal reply error,  errno %d\n",
+				err->error);
+			goto done;
+		}
+
+		PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n",
+		       sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len);
+
+
+		rep_len = GENLMSG_PAYLOAD(&msg.n);
+
+		na = (struct nlattr *) GENLMSG_DATA(&msg);
+		len = 0;
+		while (len < rep_len) {
+			len += NLA_ALIGN(na->nla_len);
+			int mother = na->nla_type;
+
+			PRINTF("mother=%i\n", mother);
+			switch (na->nla_type) {
+			case TASKSTATS_TYPE_AGGR_PID:
+			case TASKSTATS_TYPE_AGGR_TGID:
+				/* For nested attributes, na follows */
+				handle_aggr(mother, na, fd);
+				break;
+			default:
+				fprintf(stderr, "Unexpected nla_type %d\n",
+					na->nla_type);	/* fall through */
+			case TASKSTATS_TYPE_NULL:
+				break;
+			}
+			na = (struct nlattr *) (GENLMSG_DATA(&msg) + len);
+		}
+	} while (1);
+done:
+	if (maskset) {
+		rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET,
+			      TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK,
+			      &cpumask, strlen(cpumask) + 1);
+		printf("Sent deregister mask, retval %d\n", rc);
+		if (rc < 0)
+			err(rc, "error sending deregister cpumask\n");
+	}
+err:
+	close(nl_sd);
+	if (fd)
+		close(fd);
+	return 0;
+}
-- 
cgit v1.2.3


From c9b516f16be5896a3d798f8efb03acbd2ceec715 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 12 May 2022 20:38:36 -0700
Subject: ELF, uapi: fixup ELF_ST_TYPE definition

This is a very theoretical compile failure:

	ELF_ST_TYPE(st_info = A)

The cast will bind first and st_info will stop being an lvalue:

	error: lvalue required as left operand of assignment

Given that the only use of this macro is

	ELF_ST_TYPE(sym->st_info)

where st_info is "unsigned char", I've decided to remove the cast, especially
given that the companion macro ELF_ST_BIND doesn't use a cast.

Link: https://lkml.kernel.org/r/Ymv7G1BeX4kt3obz@localhost.localdomain
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/uapi/linux/elf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 787c657bfae8..237f21a5e0f6 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -134,7 +134,7 @@ typedef __s64	Elf64_Sxword;
 #define STT_TLS     6
 
 #define ELF_ST_BIND(x)		((x) >> 4)
-#define ELF_ST_TYPE(x)		(((unsigned int) x) & 0xf)
+#define ELF_ST_TYPE(x)		((x) & 0xf)
 #define ELF32_ST_BIND(x)	ELF_ST_BIND(x)
 #define ELF32_ST_TYPE(x)	ELF_ST_TYPE(x)
 #define ELF64_ST_BIND(x)	ELF_ST_BIND(x)
-- 
cgit v1.2.3