From c269cba35b061181bc23c470809c00e8f71e535a Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Mon, 22 Feb 2016 15:02:07 +0100
Subject: memremap: add arch specific hook for MEMREMAP_WB mappings

Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.

So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.

Acked-by: Dan Williams
Signed-off-by: Ard Biesheuvel
---
 kernel/memremap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a6d382312e6f..017532193fb1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 }
 #endif
 
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+	return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
 	unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
 	/* In the simple case just return the existing linear address */
 	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
 		return __va(offset);
-	return NULL; /* fallback to ioremap_cache */
+	return NULL; /* fallback to arch_memremap_wb */
 }
 
 /**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		if (is_ram == REGION_INTERSECTS)
 			addr = try_ram_remap(offset, size);
 		if (!addr)
-			addr = ioremap_cache(offset, size);
+			addr = arch_memremap_wb(offset, size);
 	}
 
 	/*
--
cgit v1.2.3

From 0bf676d1fd5144e66ac06939fa3c7c12086c3b18 Mon Sep 17 00:00:00 2001
From: Jiri Slaby
Date: Thu, 31 Mar 2016 10:49:28 +0200
Subject: audit: cleanup prune_tree_thread

We can use kthread_run instead of kthread_create+wake_up_process for
creating the thread.

We do not need to set the task state to TASK_RUNNING after schedule(),
the process is in that state already.

And we do not need to set the state to TASK_INTERRUPTIBLE when not doing
schedule() as we set the state to TASK_RUNNING immediately afterwards.
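For illustration only (this sketch is not part of the patch; the thread body
and the example_have_work flag are made up), the two points above amount to
the following pattern: kthread_run() is kthread_create() plus
wake_up_process(), and TASK_INTERRUPTIBLE only needs to be set immediately
before an actual schedule(), since the task is back in TASK_RUNNING once
schedule() returns:

  #include <linux/kthread.h>
  #include <linux/sched.h>
  #include <linux/err.h>

  static bool example_have_work;	/* set by a producer; illustrative only */

  static int example_thread(void *unused)
  {
  	while (!kthread_should_stop()) {
  		if (!example_have_work) {
  			/* Set the state only when we really go to sleep... */
  			set_current_state(TASK_INTERRUPTIBLE);
  			schedule();
  			/* ...and we are TASK_RUNNING again on return. */
  		}
  		example_have_work = false;	/* "consume" the work */
  	}
  	return 0;
  }

  static int example_launch(void)
  {
  	/* kthread_run() == kthread_create() + wake_up_process() */
  	struct task_struct *t = kthread_run(example_thread, NULL, "example");

  	return IS_ERR(t) ? PTR_ERR(t) : 0;
  }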
Signed-off-by: Jiri Slaby Cc: Paul Moore Cc: Eric Paris Cc: Signed-off-by: Paul Moore --- kernel/audit_tree.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 5efe9b299a12..25772476fa4a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg) static int prune_tree_thread(void *unused) { for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (list_empty(&prune_list)) + if (list_empty(&prune_list)) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); + } mutex_lock(&audit_cmd_mutex); mutex_lock(&audit_filter_mutex); @@ -693,16 +693,14 @@ static int audit_launch_prune(void) { if (prune_thread) return 0; - prune_thread = kthread_create(prune_tree_thread, NULL, + prune_thread = kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); if (IS_ERR(prune_thread)) { pr_err("cannot start thread audit_prune_tree"); prune_thread = NULL; return -ENOMEM; - } else { - wake_up_process(prune_thread); - return 0; } + return 0; } /* called with audit_filter_mutex */ -- cgit v1.2.3 From 7ffb8e317bae03b8ee5bdcec93dc3723be945e9b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 4 Apr 2016 16:44:02 -0400 Subject: audit: we don't need to __set_current_state(TASK_RUNNING) Remove the calls to __set_current_state() to mark the task as running and do some related cleanup in wait_for_auditd() to limit the amount of work we do when we aren't going to reschedule the current task. Signed-off-by: Paul Moore --- kernel/audit.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3a3e5deeda8d..f52fbefede09 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -430,7 +430,6 @@ restart: attempts, audit_pid); set_current_state(TASK_INTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); goto restart; } } @@ -1324,15 +1323,14 @@ static inline void audit_get_stamp(struct audit_context *ctx, static long wait_for_auditd(long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue_exclusive(&audit_backlog_wait, &wait); if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); sleep_time = schedule_timeout(sleep_time); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&audit_backlog_wait, &wait); + remove_wait_queue(&audit_backlog_wait, &wait); + } return sleep_time; } -- cgit v1.2.3 From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: KEYS: Generalise system_verify_data() to provide access to internal content Generalise system_verify_data() to provide access to internal content through a callback. This allows all the PKCS#7 stuff to be hidden inside this function and removed from the PE file parser and the PKCS#7 test key. If external content is not required, NULL should be passed as data to the function. If the callback is not required, that can be set to NULL. The function is now called verify_pkcs7_signature() to contrast with verify_pefile_signature() and the definitions of both have been moved into linux/verification.h along with the key_being_used_for enum. 
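As an illustration of the resulting interface (only the
verify_pkcs7_signature() prototype comes from this patch; the wrapper
function below and its name are hypothetical), a caller that has detached
data and no use for the content callback passes the data pointer, NULL to
select the system trusted keyring, and NULL/NULL for the callback:

  #include <linux/verification.h>
  #include <linux/types.h>
  #include <linux/errno.h>

  static int example_verify_blob(const void *blob, size_t blob_len,
  			       const void *pkcs7, size_t pkcs7_len)
  {
  	return verify_pkcs7_signature(blob, blob_len, pkcs7, pkcs7_len,
  				      NULL,		/* system trusted keyring */
  				      -ENOKEY,		/* error if untrusted */
  				      VERIFYING_UNSPECIFIED_SIGNATURE,
  				      NULL, NULL);	/* no content callback */
  }

The kernel/module_signing.c conversion at the end of this patch uses the
same shape of call.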
Signed-off-by: David Howells --- arch/x86/kernel/kexec-bzimage64.c | 18 ++------- certs/system_keyring.c | 45 ++++++++++++++++----- crypto/asymmetric_keys/Kconfig | 4 +- crypto/asymmetric_keys/mscode_parser.c | 21 ++++------ crypto/asymmetric_keys/pkcs7_key_type.c | 72 +++++++++++++-------------------- crypto/asymmetric_keys/pkcs7_parser.c | 21 +++++----- crypto/asymmetric_keys/verify_pefile.c | 40 +++++------------- crypto/asymmetric_keys/verify_pefile.h | 5 +-- include/crypto/pkcs7.h | 3 +- include/crypto/public_key.h | 14 ------- include/keys/asymmetric-type.h | 1 + include/keys/system_keyring.h | 7 +--- include/linux/verification.h | 50 +++++++++++++++++++++++ include/linux/verify_pefile.h | 22 ---------- kernel/module_signing.c | 5 ++- 15 files changed, 155 insertions(+), 173 deletions(-) create mode 100644 include/linux/verification.h delete mode 100644 include/linux/verify_pefile.h (limited to 'kernel') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 2af478e3fd4e..f2356bda2b05 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,8 +19,7 @@ #include #include #include -#include -#include +#include #include #include @@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data) #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) { - bool trusted; - int ret; - - ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, - VERIFYING_KEXEC_PE_SIGNATURE, - &trusted); - if (ret < 0) - return ret; - if (!trusted) - return -EKEYREJECTED; - return 0; + return verify_pefile_signature(kernel, kernel_len, + NULL, + VERIFYING_KEXEC_PE_SIGNATURE); } #endif diff --git a/certs/system_keyring.c b/certs/system_keyring.c index f4180326c2e1..a83bffedc0aa 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -108,16 +108,25 @@ late_initcall(load_system_certificate_list); #ifdef CONFIG_SYSTEM_DATA_VERIFICATION /** - * Verify a PKCS#7-based signature on system data. - * @data: The data to be verified. + * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data. + * @data: The data to be verified (NULL if expecting internal data). * @len: Size of @data. * @raw_pkcs7: The PKCS#7 message that is the signature. * @pkcs7_len: The size of @raw_pkcs7. + * @trusted_keys: Trusted keys to use (NULL for system_trusted_keyring). * @usage: The use to which the key is being put. + * @view_content: Callback to gain access to content. + * @ctx: Context for callback. */ -int system_verify_data(const void *data, unsigned long len, - const void *raw_pkcs7, size_t pkcs7_len, - enum key_being_used_for usage) +int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + int untrusted_error, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx) { struct pkcs7_message *pkcs7; bool trusted; @@ -128,7 +137,7 @@ int system_verify_data(const void *data, unsigned long len, return PTR_ERR(pkcs7); /* The data should be detached - so we need to supply it. 
*/ - if (pkcs7_supply_detached_data(pkcs7, data, len) < 0) { + if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) { pr_err("PKCS#7 signature with non-detached data\n"); ret = -EBADMSG; goto error; @@ -138,13 +147,29 @@ int system_verify_data(const void *data, unsigned long len, if (ret < 0) goto error; - ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted); + if (!trusted_keys) + trusted_keys = system_trusted_keyring; + ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted); if (ret < 0) goto error; - if (!trusted) { + if (!trusted && untrusted_error) { pr_err("PKCS#7 signature not signed with a trusted key\n"); - ret = -ENOKEY; + ret = untrusted_error; + goto error; + } + + if (view_content) { + size_t asn1hdrlen; + + ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen); + if (ret < 0) { + if (ret == -ENODATA) + pr_devel("PKCS#7 message does not contain data\n"); + goto error; + } + + ret = view_content(ctx, data, len, asn1hdrlen); } error: @@ -152,6 +177,6 @@ error: pr_devel("<==%s() = %d\n", __func__, ret); return ret; } -EXPORT_SYMBOL_GPL(system_verify_data); +EXPORT_SYMBOL_GPL(verify_pkcs7_signature); #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig index 91a7e047a765..f7d2ef9789d8 100644 --- a/crypto/asymmetric_keys/Kconfig +++ b/crypto/asymmetric_keys/Kconfig @@ -40,8 +40,7 @@ config PKCS7_MESSAGE_PARSER config PKCS7_TEST_KEY tristate "PKCS#7 testing key type" - depends on PKCS7_MESSAGE_PARSER - select SYSTEM_TRUSTED_KEYRING + depends on SYSTEM_DATA_VERIFICATION help This option provides a type of key that can be loaded up from a PKCS#7 message - provided the message is signed by a trusted key. If @@ -54,6 +53,7 @@ config PKCS7_TEST_KEY config SIGNED_PE_FILE_VERIFICATION bool "Support for PE file signature verification" depends on PKCS7_MESSAGE_PARSER=y + depends on SYSTEM_DATA_VERIFICATION select ASN1 select OID_REGISTRY help diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c index 3242cbfaeaa2..6a76d5c70ef6 100644 --- a/crypto/asymmetric_keys/mscode_parser.c +++ b/crypto/asymmetric_keys/mscode_parser.c @@ -21,19 +21,13 @@ /* * Parse a Microsoft Individual Code Signing blob */ -int mscode_parse(struct pefile_context *ctx) +int mscode_parse(void *_ctx, const void *content_data, size_t data_len, + size_t asn1hdrlen) { - const void *content_data; - size_t data_len; - int ret; - - ret = pkcs7_get_content_data(ctx->pkcs7, &content_data, &data_len, 1); - - if (ret) { - pr_debug("PKCS#7 message does not contain data\n"); - return ret; - } + struct pefile_context *ctx = _ctx; + content_data -= asn1hdrlen; + data_len += asn1hdrlen; pr_devel("Data: %zu [%*ph]\n", data_len, (unsigned)(data_len), content_data); @@ -129,7 +123,6 @@ int mscode_note_digest(void *context, size_t hdrlen, { struct pefile_context *ctx = context; - ctx->digest = value; - ctx->digest_len = vlen; - return 0; + ctx->digest = kmemdup(value, vlen, GFP_KERNEL); + return ctx->digest ? 
0 : -ENOMEM; } diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c index e2d0edbbc71a..ab9bf5363ecd 100644 --- a/crypto/asymmetric_keys/pkcs7_key_type.c +++ b/crypto/asymmetric_keys/pkcs7_key_type.c @@ -13,12 +13,9 @@ #include #include #include +#include #include -#include -#include #include -#include -#include "pkcs7_parser.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PKCS#7 testing key type"); @@ -29,59 +26,46 @@ MODULE_PARM_DESC(pkcs7_usage, "Usage to specify when verifying the PKCS#7 message"); /* - * Preparse a PKCS#7 wrapped and validated data blob. + * Retrieve the PKCS#7 message content. */ -static int pkcs7_preparse(struct key_preparsed_payload *prep) +static int pkcs7_view_content(void *ctx, const void *data, size_t len, + size_t asn1hdrlen) { - enum key_being_used_for usage = pkcs7_usage; - struct pkcs7_message *pkcs7; - const void *data, *saved_prep_data; - size_t datalen, saved_prep_datalen; - bool trusted; + struct key_preparsed_payload *prep = ctx; + const void *saved_prep_data; + size_t saved_prep_datalen; int ret; - kenter(""); - - if (usage >= NR__KEY_BEING_USED_FOR) { - pr_err("Invalid usage type %d\n", usage); - return -EINVAL; - } - saved_prep_data = prep->data; saved_prep_datalen = prep->datalen; - pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen); - if (IS_ERR(pkcs7)) { - ret = PTR_ERR(pkcs7); - goto error; - } - - ret = pkcs7_verify(pkcs7, usage); - if (ret < 0) - goto error_free; - - ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted); - if (ret < 0) - goto error_free; - if (!trusted) - pr_warn("PKCS#7 message doesn't chain back to a trusted key\n"); - - ret = pkcs7_get_content_data(pkcs7, &data, &datalen, false); - if (ret < 0) - goto error_free; - prep->data = data; - prep->datalen = datalen; + prep->datalen = len; + ret = user_preparse(prep); + prep->data = saved_prep_data; prep->datalen = saved_prep_datalen; - -error_free: - pkcs7_free_message(pkcs7); -error: - kleave(" = %d", ret); return ret; } +/* + * Preparse a PKCS#7 wrapped and validated data blob. + */ +static int pkcs7_preparse(struct key_preparsed_payload *prep) +{ + enum key_being_used_for usage = pkcs7_usage; + + if (usage >= NR__KEY_BEING_USED_FOR) { + pr_err("Invalid usage type %d\n", usage); + return -EINVAL; + } + + return verify_pkcs7_signature(NULL, 0, + prep->data, prep->datalen, + NULL, -ENOKEY, usage, + pkcs7_view_content, prep); +} + /* * user defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 835701613125..af4cd8649117 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -168,24 +168,25 @@ EXPORT_SYMBOL_GPL(pkcs7_parse_message); * @pkcs7: The preparsed PKCS#7 message to access * @_data: Place to return a pointer to the data * @_data_len: Place to return the data length - * @want_wrapper: True if the ASN.1 object header should be included in the data + * @_headerlen: Size of ASN.1 header not included in _data * - * Get access to the data content of the PKCS#7 message, including, optionally, - * the header of the ASN.1 object that contains it. Returns -ENODATA if the - * data object was missing from the message. + * Get access to the data content of the PKCS#7 message. The size of the + * header of the ASN.1 object that contains it is also provided and can be used + * to adjust *_data and *_data_len to get the entire object. 
+ * + * Returns -ENODATA if the data object was missing from the message. */ int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_data_len, - bool want_wrapper) + size_t *_headerlen) { - size_t wrapper; - if (!pkcs7->data) return -ENODATA; - wrapper = want_wrapper ? pkcs7->data_hdrlen : 0; - *_data = pkcs7->data - wrapper; - *_data_len = pkcs7->data_len + wrapper; + *_data = pkcs7->data; + *_data_len = pkcs7->data_len; + if (_headerlen) + *_headerlen = pkcs7->data_hdrlen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_get_content_data); diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 7e8c2338ae25..265351075b0e 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "verify_pefile.h" @@ -392,9 +392,8 @@ error_no_desc: * verify_pefile_signature - Verify the signature on a PE binary image * @pebuf: Buffer containing the PE binary image * @pelen: Length of the binary image - * @trust_keyring: Signing certificates to use as starting points + * @trust_keys: Signing certificate(s) to use as starting points * @usage: The use to which the key is being put. - * @_trusted: Set to true if trustworth, false otherwise * * Validate that the certificate chain inside the PKCS#7 message inside the PE * binary image intersects keys we already know and trust. @@ -418,14 +417,10 @@ error_no_desc: * May also return -ENOMEM. */ int verify_pefile_signature(const void *pebuf, unsigned pelen, - struct key *trusted_keyring, - enum key_being_used_for usage, - bool *_trusted) + struct key *trusted_keys, + enum key_being_used_for usage) { - struct pkcs7_message *pkcs7; struct pefile_context ctx; - const void *data; - size_t datalen; int ret; kenter(""); @@ -439,19 +434,10 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, if (ret < 0) return ret; - pkcs7 = pkcs7_parse_message(pebuf + ctx.sig_offset, ctx.sig_len); - if (IS_ERR(pkcs7)) - return PTR_ERR(pkcs7); - ctx.pkcs7 = pkcs7; - - ret = pkcs7_get_content_data(ctx.pkcs7, &data, &datalen, false); - if (ret < 0 || datalen == 0) { - pr_devel("PKCS#7 message does not contain data\n"); - ret = -EBADMSG; - goto error; - } - - ret = mscode_parse(&ctx); + ret = verify_pkcs7_signature(NULL, 0, + pebuf + ctx.sig_offset, ctx.sig_len, + trusted_keys, -EKEYREJECTED, usage, + mscode_parse, &ctx); if (ret < 0) goto error; @@ -462,16 +448,8 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, * contents. */ ret = pefile_digest_pe(pebuf, pelen, &ctx); - if (ret < 0) - goto error; - - ret = pkcs7_verify(pkcs7, usage); - if (ret < 0) - goto error; - - ret = pkcs7_validate_trust(pkcs7, trusted_keyring, _trusted); error: - pkcs7_free_message(ctx.pkcs7); + kfree(ctx.digest); return ret; } diff --git a/crypto/asymmetric_keys/verify_pefile.h b/crypto/asymmetric_keys/verify_pefile.h index a133eb81a492..cd4d20930728 100644 --- a/crypto/asymmetric_keys/verify_pefile.h +++ b/crypto/asymmetric_keys/verify_pefile.h @@ -9,7 +9,6 @@ * 2 of the Licence, or (at your option) any later version. 
*/ -#include #include #include @@ -23,7 +22,6 @@ struct pefile_context { unsigned sig_offset; unsigned sig_len; const struct section_header *secs; - struct pkcs7_message *pkcs7; /* PKCS#7 MS Individual Code Signing content */ const void *digest; /* Digest */ @@ -39,4 +37,5 @@ struct pefile_context { /* * mscode_parser.c */ -extern int mscode_parse(struct pefile_context *ctx); +extern int mscode_parse(void *_ctx, const void *content_data, size_t data_len, + size_t asn1hdrlen); diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 441aff9b5aa7..8323e3e57131 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -12,6 +12,7 @@ #ifndef _CRYPTO_PKCS7_H #define _CRYPTO_PKCS7_H +#include #include struct key; @@ -26,7 +27,7 @@ extern void pkcs7_free_message(struct pkcs7_message *pkcs7); extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, const void **_data, size_t *_datalen, - bool want_wrapper); + size_t *_headerlen); /* * pkcs7_trust.c diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 2f5de5c1a3a0..b3928e801b8c 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -14,20 +14,6 @@ #ifndef _LINUX_PUBLIC_KEY_H #define _LINUX_PUBLIC_KEY_H -/* - * The use to which an asymmetric key is being put. - */ -enum key_being_used_for { - VERIFYING_MODULE_SIGNATURE, - VERIFYING_FIRMWARE_SIGNATURE, - VERIFYING_KEXEC_PE_SIGNATURE, - VERIFYING_KEY_SIGNATURE, - VERIFYING_KEY_SELF_SIGNATURE, - VERIFYING_UNSPECIFIED_SIGNATURE, - NR__KEY_BEING_USED_FOR -}; -extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; - /* * Cryptographic data for the public-key subtype of the asymmetric key type. * diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h index 70a8775bb444..d1e23dda4363 100644 --- a/include/keys/asymmetric-type.h +++ b/include/keys/asymmetric-type.h @@ -15,6 +15,7 @@ #define _KEYS_ASYMMETRIC_TYPE_H #include +#include extern struct key_type key_type_asymmetric; diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index 39fd38cfa8c9..b2d645ac35a0 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -15,6 +15,7 @@ #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING #include +#include #include extern struct key *system_trusted_keyring; @@ -29,12 +30,6 @@ static inline struct key *get_system_trusted_keyring(void) } #endif -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -extern int system_verify_data(const void *data, unsigned long len, - const void *raw_pkcs7, size_t pkcs7_len, - enum key_being_used_for usage); -#endif - #ifdef CONFIG_IMA_MOK_KEYRING extern struct key *ima_mok_keyring; extern struct key *ima_blacklist_keyring; diff --git a/include/linux/verification.h b/include/linux/verification.h new file mode 100644 index 000000000000..bb0fcf941cb7 --- /dev/null +++ b/include/linux/verification.h @@ -0,0 +1,50 @@ +/* Signature verification + * + * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _LINUX_VERIFICATION_H +#define _LINUX_VERIFICATION_H + +/* + * The use to which an asymmetric key is being put. 
+ */ +enum key_being_used_for { + VERIFYING_MODULE_SIGNATURE, + VERIFYING_FIRMWARE_SIGNATURE, + VERIFYING_KEXEC_PE_SIGNATURE, + VERIFYING_KEY_SIGNATURE, + VERIFYING_KEY_SELF_SIGNATURE, + VERIFYING_UNSPECIFIED_SIGNATURE, + NR__KEY_BEING_USED_FOR +}; +extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; + +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + +struct key; + +extern int verify_pkcs7_signature(const void *data, size_t len, + const void *raw_pkcs7, size_t pkcs7_len, + struct key *trusted_keys, + int untrusted_error, + enum key_being_used_for usage, + int (*view_content)(void *ctx, + const void *data, size_t len, + size_t asn1hdrlen), + void *ctx); + +#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION +extern int verify_pefile_signature(const void *pebuf, unsigned pelen, + struct key *trusted_keys, + enum key_being_used_for usage); +#endif + +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +#endif /* _LINUX_VERIFY_PEFILE_H */ diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h deleted file mode 100644 index da2049b5161c..000000000000 --- a/include/linux/verify_pefile.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Signed PE file verification - * - * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _LINUX_VERIFY_PEFILE_H -#define _LINUX_VERIFY_PEFILE_H - -#include - -extern int verify_pefile_signature(const void *pebuf, unsigned pelen, - struct key *trusted_keyring, - enum key_being_used_for usage, - bool *_trusted); - -#endif /* _LINUX_VERIFY_PEFILE_H */ diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 64b9dead4a07..593aace88a02 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -EBADMSG; } - return system_verify_data(mod, modlen, mod + modlen, sig_len, - VERIFYING_MODULE_SIGNATURE); + return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, + NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } -- cgit v1.2.3 From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:24 +0100 Subject: PKCS#7: Make trust determination dependent on contents of trust keyring Make the determination of the trustworthiness of a key dependent on whether a key that can verify it is present in the supplied ring of trusted keys rather than whether or not the verifying key has KEY_FLAG_TRUSTED set. verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust chain cannot be verified. 
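Illustrative sketch only (the helper below is hypothetical): after this
change a caller no longer picks an "untrusted" error code up front, it just
checks for -ENOKEY from verify_pkcs7_signature() itself, which now means the
signature chain does not intersect the supplied (or builtin) trust keyring:

  #include <linux/verification.h>
  #include <linux/printk.h>
  #include <linux/errno.h>

  static int example_check_trust(const void *data, size_t len,
  			       const void *pkcs7, size_t pkcs7_len,
  			       struct key *keyring)
  {
  	int ret = verify_pkcs7_signature(data, len, pkcs7, pkcs7_len,
  					 keyring,	/* NULL => builtin keyring */
  					 VERIFYING_UNSPECIFIED_SIGNATURE,
  					 NULL, NULL);

  	if (ret == -ENOKEY)
  		pr_warn("example: not signed with a trusted key\n");
  	return ret;
  }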
Signed-off-by: David Howells --- certs/system_keyring.c | 13 ++++--------- crypto/asymmetric_keys/pkcs7_key_type.c | 2 +- crypto/asymmetric_keys/pkcs7_parser.h | 1 - crypto/asymmetric_keys/pkcs7_trust.c | 18 +++--------------- crypto/asymmetric_keys/verify_pefile.c | 2 +- crypto/asymmetric_keys/x509_parser.h | 1 - include/crypto/pkcs7.h | 3 +-- include/linux/verification.h | 1 - kernel/module_signing.c | 2 +- 9 files changed, 11 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/certs/system_keyring.c b/certs/system_keyring.c index a83bffedc0aa..dc18869ff680 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -121,7 +121,6 @@ late_initcall(load_system_certificate_list); int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, - int untrusted_error, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, @@ -129,7 +128,6 @@ int verify_pkcs7_signature(const void *data, size_t len, void *ctx) { struct pkcs7_message *pkcs7; - bool trusted; int ret; pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len); @@ -149,13 +147,10 @@ int verify_pkcs7_signature(const void *data, size_t len, if (!trusted_keys) trusted_keys = system_trusted_keyring; - ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted); - if (ret < 0) - goto error; - - if (!trusted && untrusted_error) { - pr_err("PKCS#7 signature not signed with a trusted key\n"); - ret = untrusted_error; + ret = pkcs7_validate_trust(pkcs7, trusted_keys); + if (ret < 0) { + if (ret == -ENOKEY) + pr_err("PKCS#7 signature not signed with a trusted key\n"); goto error; } diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c index ab9bf5363ecd..3b92523882e5 100644 --- a/crypto/asymmetric_keys/pkcs7_key_type.c +++ b/crypto/asymmetric_keys/pkcs7_key_type.c @@ -62,7 +62,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep) return verify_pkcs7_signature(NULL, 0, prep->data, prep->datalen, - NULL, -ENOKEY, usage, + NULL, usage, pkcs7_view_content, prep); } diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h index d5eec31e95b6..f4e81074f5e0 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.h +++ b/crypto/asymmetric_keys/pkcs7_parser.h @@ -22,7 +22,6 @@ struct pkcs7_signed_info { struct pkcs7_signed_info *next; struct x509_certificate *signer; /* Signing certificate (in msg->certs) */ unsigned index; - bool trusted; bool unsupported_crypto; /* T if not usable due to missing crypto */ /* Message digest - the digest of the Content Data (or NULL) */ diff --git a/crypto/asymmetric_keys/pkcs7_trust.c b/crypto/asymmetric_keys/pkcs7_trust.c index b9a5487cd82d..36e77cb07bd0 100644 --- a/crypto/asymmetric_keys/pkcs7_trust.c +++ b/crypto/asymmetric_keys/pkcs7_trust.c @@ -30,7 +30,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, struct public_key_signature *sig = sinfo->sig; struct x509_certificate *x509, *last = NULL, *p; struct key *key; - bool trusted; int ret; kenter(",%u,", sinfo->index); @@ -42,10 +41,8 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, for (x509 = sinfo->signer; x509; x509 = x509->signer) { if (x509->seen) { - if (x509->verified) { - trusted = x509->trusted; + if (x509->verified) goto verified; - } kleave(" = -ENOKEY [cached]"); return -ENOKEY; } @@ -122,7 +119,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7, matched: ret = verify_signature(key, sig); - trusted = 
test_bit(KEY_FLAG_TRUSTED, &key->flags); key_put(key); if (ret < 0) { if (ret == -ENOMEM) @@ -134,12 +130,9 @@ matched: verified: if (x509) { x509->verified = true; - for (p = sinfo->signer; p != x509; p = p->signer) { + for (p = sinfo->signer; p != x509; p = p->signer) p->verified = true; - p->trusted = trusted; - } } - sinfo->trusted = trusted; kleave(" = 0"); return 0; } @@ -148,7 +141,6 @@ verified: * pkcs7_validate_trust - Validate PKCS#7 trust chain * @pkcs7: The PKCS#7 certificate to validate * @trust_keyring: Signing certificates to use as starting points - * @_trusted: Set to true if trustworth, false otherwise * * Validate that the certificate chain inside the PKCS#7 message intersects * keys we already know and trust. @@ -170,16 +162,13 @@ verified: * May also return -ENOMEM. */ int pkcs7_validate_trust(struct pkcs7_message *pkcs7, - struct key *trust_keyring, - bool *_trusted) + struct key *trust_keyring) { struct pkcs7_signed_info *sinfo; struct x509_certificate *p; int cached_ret = -ENOKEY; int ret; - *_trusted = false; - for (p = pkcs7->certs; p; p = p->next) p->seen = false; @@ -193,7 +182,6 @@ int pkcs7_validate_trust(struct pkcs7_message *pkcs7, cached_ret = -ENOPKG; continue; case 0: - *_trusted |= sinfo->trusted; cached_ret = 0; continue; default: diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index 265351075b0e..672a94c2c3ff 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -436,7 +436,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen, ret = verify_pkcs7_signature(NULL, 0, pebuf + ctx.sig_offset, ctx.sig_len, - trusted_keys, -EKEYREJECTED, usage, + trusted_keys, usage, mscode_parse, &ctx); if (ret < 0) goto error; diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index f24f4d808e7f..05eef1c68881 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -39,7 +39,6 @@ struct x509_certificate { unsigned index; bool seen; /* Infinite recursion prevention */ bool verified; - bool trusted; bool self_signed; /* T if self-signed (check unsupported_sig too) */ bool unsupported_key; /* T if key uses unsupported crypto */ bool unsupported_sig; /* T if signature uses unsupported crypto */ diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h index 8323e3e57131..583f199400a3 100644 --- a/include/crypto/pkcs7.h +++ b/include/crypto/pkcs7.h @@ -33,8 +33,7 @@ extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7, * pkcs7_trust.c */ extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7, - struct key *trust_keyring, - bool *_trusted); + struct key *trust_keyring); /* * pkcs7_verify.c diff --git a/include/linux/verification.h b/include/linux/verification.h index bb0fcf941cb7..a10549a6c7cd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -33,7 +33,6 @@ struct key; extern int verify_pkcs7_signature(const void *data, size_t len, const void *raw_pkcs7, size_t pkcs7_len, struct key *trusted_keys, - int untrusted_error, enum key_being_used_for usage, int (*view_content)(void *ctx, const void *data, size_t len, diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 593aace88a02..6a64e03b9f44 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) } return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len, - NULL, -ENOKEY, 
VERIFYING_MODULE_SIGNATURE, + NULL, VERIFYING_MODULE_SIGNATURE, NULL, NULL); } -- cgit v1.2.3 From a511e1af8b12f44c6e55786c463c9f093c214fb6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Apr 2016 16:14:26 +0100 Subject: KEYS: Move the point of trust determination to __key_link() Move the point at which a key is determined to be trustworthy to __key_link() so that we use the contents of the keyring being linked in to to determine whether the key being linked in is trusted or not. What is 'trusted' then becomes a matter of what's in the keyring. Currently, the test is done when the key is parsed, but given that at that point we can only sensibly refer to the contents of the system trusted keyring, we can only use that as the basis for working out the trustworthiness of a new key. With this change, a trusted keyring is a set of keys that once the trusted-only flag is set cannot be added to except by verification through one of the contained keys. Further, adding a key into a trusted keyring, whilst it might grant trustworthiness in the context of that keyring, does not automatically grant trustworthiness in the context of a second keyring to which it could be secondarily linked. To accomplish this, the authentication data associated with the key source must now be retained. For an X.509 cert, this means the contents of the AuthorityKeyIdentifier and the signature data. If system keyrings are disabled then restrict_link_by_builtin_trusted() resolves to restrict_link_reject(). The integrity digital signature code still works correctly with this as it was previously using KEY_FLAG_TRUSTED_ONLY, which doesn't permit anything to be added if there is no system keyring against which trust can be determined. Signed-off-by: David Howells --- certs/system_keyring.c | 20 +++++++++-- crypto/asymmetric_keys/restrict.c | 62 ++++++++++++++++---------------- crypto/asymmetric_keys/x509_parser.h | 6 ---- crypto/asymmetric_keys/x509_public_key.c | 21 +---------- include/crypto/public_key.h | 7 ++++ include/keys/system_keyring.h | 19 ++++------ kernel/module_signing.c | 2 +- security/integrity/digsig.c | 33 ++++++++++++++++- security/integrity/ima/ima_mok.c | 6 ++-- 9 files changed, 100 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 417d65882870..4e2fa8ab01d6 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -18,12 +18,26 @@ #include #include -struct key *system_trusted_keyring; -EXPORT_SYMBOL_GPL(system_trusted_keyring); +static struct key *system_trusted_keyring; extern __initconst const u8 system_certificate_list[]; extern __initconst const unsigned long system_certificate_list_size; +/** + * restrict_link_by_builtin_trusted - Restrict keyring addition by system CA + * + * Restrict the addition of keys into a keyring based on the key-to-be-added + * being vouched for by a key in the system keyring. 
+ */ +int restrict_link_by_builtin_trusted(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload) +{ + return restrict_link_by_signature(system_trusted_keyring, + type, payload); +} + /* * Load the compiled-in keys */ @@ -37,7 +51,7 @@ static __init int system_trusted_keyring_init(void) ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, - keyring_restrict_trusted_only, NULL); + restrict_link_by_builtin_trusted, NULL); if (IS_ERR(system_trusted_keyring)) panic("Can't allocate system trusted keyring\n"); return 0; diff --git a/crypto/asymmetric_keys/restrict.c b/crypto/asymmetric_keys/restrict.c index b4c10f2f5034..ac4bddf669de 100644 --- a/crypto/asymmetric_keys/restrict.c +++ b/crypto/asymmetric_keys/restrict.c @@ -1,6 +1,6 @@ /* Instantiate a public key crypto key from an X.509 Certificate * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2012, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -9,20 +9,12 @@ * 2 of the Licence, or (at your option) any later version. */ -#define pr_fmt(fmt) "X.509: "fmt +#define pr_fmt(fmt) "ASYM: "fmt #include #include -#include #include -#include -#include -#include -#include -#include -#include #include #include "asymmetric_keys.h" -#include "x509_parser.h" static bool use_builtin_keys; static struct asymmetric_key_id *ca_keyid; @@ -62,45 +54,55 @@ static int __init ca_keys_setup(char *str) __setup("ca_keys=", ca_keys_setup); #endif -/* +/** + * restrict_link_by_signature - Restrict additions to a ring of public keys + * @trust_keyring: A ring of keys that can be used to vouch for the new cert. + * @type: The type of key being added. + * @payload: The payload of the new key. + * * Check the new certificate against the ones in the trust keyring. If one of * those is the signing key and validates the new certificate, then mark the * new certificate as being trusted. * - * Return 0 if the new certificate was successfully validated, 1 if we couldn't - * find a matching parent certificate in the trusted list and an error if there - * is a matching certificate but the signature check fails. + * Returns 0 if the new certificate was accepted, -ENOKEY if we couldn't find a + * matching parent certificate in the trusted list, -EKEYREJECTED if the + * signature check fails or the key is blacklisted and some other error if + * there is a matching certificate but the signature check cannot be performed. */ -int x509_validate_trust(struct x509_certificate *cert, - struct key *trust_keyring) +int restrict_link_by_signature(struct key *trust_keyring, + const struct key_type *type, + const union key_payload *payload) { - struct public_key_signature *sig = cert->sig; + const struct public_key_signature *sig; struct key *key; - int ret = 1; + int ret; - if (!sig->auth_ids[0] && !sig->auth_ids[1]) - return 1; + pr_devel("==>%s()\n", __func__); if (!trust_keyring) + return -ENOKEY; + + if (type != &key_type_asymmetric) return -EOPNOTSUPP; + + sig = payload->data[asym_auth]; + if (!sig->auth_ids[0] && !sig->auth_ids[1]) + return 0; + if (ca_keyid && !asymmetric_key_id_partial(sig->auth_ids[1], ca_keyid)) return -EPERM; - if (cert->unsupported_sig) - return -ENOPKG; + /* See if we have a key that signed this one. 
*/ key = find_asymmetric_key(trust_keyring, sig->auth_ids[0], sig->auth_ids[1], false); if (IS_ERR(key)) - return PTR_ERR(key); + return -ENOKEY; - if (!use_builtin_keys || - test_bit(KEY_FLAG_BUILTIN, &key->flags)) { - ret = verify_signature(key, cert->sig); - if (ret == -ENOPKG) - cert->unsupported_sig = true; - } + if (use_builtin_keys && !test_bit(KEY_FLAG_BUILTIN, &key->flags)) + ret = -ENOKEY; + else + ret = verify_signature(key, sig); key_put(key); return ret; } -EXPORT_SYMBOL_GPL(x509_validate_trust); diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h index 7a802b09a509..05eef1c68881 100644 --- a/crypto/asymmetric_keys/x509_parser.h +++ b/crypto/asymmetric_keys/x509_parser.h @@ -58,9 +58,3 @@ extern int x509_decode_time(time64_t *_t, size_t hdrlen, */ extern int x509_get_sig_params(struct x509_certificate *cert); extern int x509_check_for_self_signed(struct x509_certificate *cert); - -/* - * public_key_trust.c - */ -extern int x509_validate_trust(struct x509_certificate *cert, - struct key *trust_keyring); diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 6d7f42f0de9a..fb732296cd36 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -178,31 +178,12 @@ static int x509_key_preparse(struct key_preparsed_payload *prep) cert->pub->id_type = "X509"; - /* See if we can derive the trustability of this certificate. - * - * When it comes to self-signed certificates, we cannot evaluate - * trustedness except by the fact that we obtained it from a trusted - * location. So we just rely on x509_validate_trust() failing in this - * case. - * - * Note that there's a possibility of a self-signed cert matching a - * cert that we have (most likely a duplicate that we already trust) - - * in which case it will be marked trusted. 
- */ - if (cert->unsupported_sig || cert->self_signed) { + if (cert->unsupported_sig) { public_key_signature_free(cert->sig); cert->sig = NULL; } else { pr_devel("Cert Signature: %s + %s\n", cert->sig->pkey_algo, cert->sig->hash_algo); - - ret = x509_validate_trust(cert, get_system_trusted_keyring()); - if (ret) - ret = x509_validate_trust(cert, get_ima_mok_keyring()); - if (ret == -EKEYREJECTED) - goto error_free_cert; - if (!ret) - prep->trusted = true; } /* Propose a description */ diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 96ef27b8dd41..882ca0e1e7a5 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -47,6 +47,13 @@ extern void public_key_signature_free(struct public_key_signature *sig); extern struct asymmetric_key_subtype public_key_subtype; struct key; +struct key_type; +union key_payload; + +extern int restrict_link_by_signature(struct key *trust_keyring, + const struct key_type *type, + const union key_payload *payload); + extern int verify_signature(const struct key *key, const struct public_key_signature *sig); diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index b2d645ac35a0..93715913a0b1 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -12,22 +12,17 @@ #ifndef _KEYS_SYSTEM_KEYRING_H #define _KEYS_SYSTEM_KEYRING_H +#include + #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING -#include -#include -#include +extern int restrict_link_by_builtin_trusted(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload); -extern struct key *system_trusted_keyring; -static inline struct key *get_system_trusted_keyring(void) -{ - return system_trusted_keyring; -} #else -static inline struct key *get_system_trusted_keyring(void) -{ - return NULL; -} +#define restrict_link_by_builtin_trusted restrict_link_reject #endif #ifdef CONFIG_IMA_MOK_KEYRING diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6a64e03b9f44..937c844bee4a 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include "module-internal.h" diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 659566c2200b..d647178c6bbd 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "integrity.h" @@ -40,6 +42,35 @@ static bool init_keyring __initdata = true; static bool init_keyring __initdata; #endif +#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING +/* + * Restrict the addition of keys into the IMA keyring. + * + * Any key that needs to go in .ima keyring must be signed by CA in + * either .system or .ima_mok keyrings. + */ +static int restrict_link_by_ima_mok(struct key *keyring, + const struct key_type *type, + unsigned long flags, + const union key_payload *payload) +{ + int ret; + + ret = restrict_link_by_builtin_trusted(keyring, type, flags, payload); + if (ret != -ENOKEY) + return ret; + + return restrict_link_by_signature(get_ima_mok_keyring(), + type, payload); +} +#else +/* + * If there's no system trusted keyring, then keys cannot be loaded into + * .ima_mok and added keys cannot be marked trusted. 
+ */ +#define restrict_link_by_ima_mok restrict_link_reject +#endif + int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, const char *digest, int digestlen) { @@ -84,7 +115,7 @@ int __init integrity_init_keyring(const unsigned int id) KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, - NULL, NULL); + restrict_link_by_ima_mok, NULL); if (IS_ERR(keyring[id])) { err = PTR_ERR(keyring[id]); pr_info("Can't allocate %s keyring (%d)\n", diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index ef91248cb934..2988726d30d6 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct key *ima_mok_keyring; @@ -36,7 +36,7 @@ __init int ima_mok_init(void) KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, - keyring_restrict_trusted_only, NULL); + restrict_link_by_builtin_trusted, NULL); ima_blacklist_keyring = keyring_alloc(".ima_blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), @@ -44,7 +44,7 @@ __init int ima_mok_init(void) KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, - keyring_restrict_trusted_only, NULL); + restrict_link_by_builtin_trusted, NULL); if (IS_ERR(ima_mok_keyring) || IS_ERR(ima_blacklist_keyring)) panic("Can't allocate IMA MOK or blacklist keyrings."); -- cgit v1.2.3 From 9ebc57cfaad21aacbc363eecead579269a46b493 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Apr 2016 20:39:55 -0400 Subject: tracing: Rename check_ignore_pid() to ignore_this_task() The name "check_ignore_pid" is confusing in trying to figure out if the pid should be ignored or not. Rename it to "ignore_this_task" which is pretty straight forward, as a task (not a pid) is passed in, and should if true should be ignored. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05ddc0820771..598a18675a6b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -484,7 +484,7 @@ static int cmp_pid(const void *key, const void *elt) } static bool -check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) +ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { pid_t search_pid; pid_t *pid; @@ -517,8 +517,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, prev) && - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, prev) && + ignore_this_task(pid_list, next)); } static void @@ -531,7 +531,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt, pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, next)); + ignore_this_task(pid_list, next)); } static void @@ -547,7 +547,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task) pid_list = rcu_dereference_sched(tr->filtered_pids); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, task)); + ignore_this_task(pid_list, task)); } static void @@ -564,7 +564,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task) /* Set tracing if current is enabled */ this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + ignore_this_task(pid_list, current)); } static void __ftrace_clear_event_pids(struct trace_array *tr) @@ -1561,7 +1561,7 @@ static void ignore_task_cpu(void *data) mutex_is_locked(&event_mutex)); this_cpu_write(tr->trace_buffer.data->ignore_pid, - check_ignore_pid(pid_list, current)); + ignore_this_task(pid_list, current)); } static ssize_t -- cgit v1.2.3 From f4d34a87e9c10f0ffd03d3548db6bfb200d06cdf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2016 16:27:49 -0400 Subject: tracing: Use pid bitmap instead of a pid array for set_event_pid In order to add the ability to let tasks that are filtered by the events have their children also be traced on fork (and then not traced on exit), convert the array into a pid bitmask. Most of the time the number of pids is only 32768 pids or a 4k bitmask, which is the same size as the default list currently is, and that list could grow if more pids are listed. This also greatly simplifies the code. Suggested-by: "H. 
Peter Anvin" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +- kernel/trace/trace_events.c | 221 ++++++++++++++++++++------------------------ 2 files changed, 102 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3fff4adfd431..68cbb8e10aea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,9 +177,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned int nr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 598a18675a6b..45f7cc72bf25 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -static int cmp_pid(const void *key, const void *elt) -{ - const pid_t *search_pid = key; - const pid_t *pid = elt; - - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; -} +/* Shouldn't this be in a header? */ +extern int pid_max; static bool ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { - pid_t search_pid; - pid_t *pid; + pid_t pid; /* * Return false, because if filtered_pids does not exist, @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - search_pid = task->pid; + pid = task->pid; - pid = bsearch(&search_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. 
+ */ + if (task->pid >= filtered_pids->pid_max) return true; - return false; + return !test_bit(task->pid, filtered_pids->pids); } static void @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); } @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; + unsigned long pid; + loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) + return NULL; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) return NULL; - return (void *)&pid_list->pids[*pos]; + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; + pid = (unsigned long)p_next(m, (void *)pid, &l)) + ; + return (void *)pid; } static void p_stop(struct seq_file *m, void *p) @@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_array *tr = m->private; - struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); - - (*pos)++; - - if (*pos >= pid_list->nr_pids) - return NULL; - - return (void *)&pid_list->pids[*pos]; -} - static int p_show(struct seq_file *m, void *v) { - pid_t *pid = v; + unsigned long pid = (unsigned long)v - 1; - seq_printf(m, "%d\n", *pid); + seq_printf(m, "%lu\n", pid); return 0; } @@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } -static int max_pids(struct trace_pid_list *pid_list) -{ - return (PAGE_SIZE << pid_list->order) / sizeof(pid_t); -} - static void ignore_task_cpu(void *data) { struct trace_array *tr = data; @@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, struct seq_file *m = filp->private_data; struct trace_array *tr = m->private; struct trace_pid_list *filtered_pids = NULL; - struct trace_pid_list *pid_list = NULL; + struct trace_pid_list *pid_list; struct trace_event_file *file; struct trace_parser parser; unsigned long val; @@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ssize_t read = 0; ssize_t ret = 0; pid_t pid; - int i; + int nr_pids = 0; if (!cnt) return 0; @@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, return -ENOMEM; mutex_lock(&event_mutex); + filtered_pids = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + /* - * Load as many pids into the array before doing a - * swap from 
the tr->filtered_pids to the new list. + * Always recreate a new array. The write is an all or nothing + * operation. Always create a new array when adding new pids by + * the user. If the operation fails, then the current list is + * not modified. */ + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) { + read = -ENOMEM; + goto out; + } + pid_list->pid_max = READ_ONCE(pid_max); + /* Only truncating will shrink pid_max */ + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) + pid_list->pid_max = filtered_pids->pid_max; + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + read = -ENOMEM; + goto out; + } + if (filtered_pids) { + /* copy the current bits to the new max */ + pid = find_first_bit(filtered_pids->pids, + filtered_pids->pid_max); + while (pid < filtered_pids->pid_max) { + set_bit(pid, pid_list->pids); + pid = find_next_bit(filtered_pids->pids, + filtered_pids->pid_max, + pid + 1); + nr_pids++; + } + } + while (cnt > 0) { this_pos = 0; @@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val > INT_MAX) + if (val >= pid_list->pid_max) break; pid = (pid_t)val; - ret = -ENOMEM; - if (!pid_list) { - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) - break; - - filtered_pids = rcu_dereference_protected(tr->filtered_pids, - lockdep_is_held(&event_mutex)); - if (filtered_pids) - pid_list->order = filtered_pids->order; - else - pid_list->order = 0; - - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order); - if (!pid_list->pids) - break; - - if (filtered_pids) { - pid_list->nr_pids = filtered_pids->nr_pids; - memcpy(pid_list->pids, filtered_pids->pids, - pid_list->nr_pids * sizeof(pid_t)); - } else - pid_list->nr_pids = 0; - } - - if (pid_list->nr_pids >= max_pids(pid_list)) { - pid_t *pid_page; - - pid_page = (void *)__get_free_pages(GFP_KERNEL, - pid_list->order + 1); - if (!pid_page) - break; - memcpy(pid_page, pid_list->pids, - pid_list->nr_pids * sizeof(pid_t)); - free_pages((unsigned long)pid_list->pids, pid_list->order); - - pid_list->order++; - pid_list->pids = pid_page; - } + set_bit(pid, pid_list->pids); + nr_pids++; - pid_list->pids[pid_list->nr_pids++] = pid; trace_parser_clear(&parser); ret = 0; } trace_parser_put(&parser); if (ret < 0) { - if (pid_list) - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); - mutex_unlock(&event_mutex); - return ret; - } - - if (!pid_list) { - mutex_unlock(&event_mutex); - return ret; + read = ret; + goto out; } - sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL); - - /* Remove duplicates */ - for (i = 1; i < pid_list->nr_pids; i++) { - int start = i; - - while (i < pid_list->nr_pids && - pid_list->pids[i - 1] == pid_list->pids[i]) - i++; - - if (start != i) { - if (i < pid_list->nr_pids) { - memmove(&pid_list->pids[start], &pid_list->pids[i], - (pid_list->nr_pids - i) * sizeof(pid_t)); - pid_list->nr_pids -= i - start; - i = start; - } else - pid_list->nr_pids = start; - } + if (!nr_pids) { + /* Cleared the list of pids */ + vfree(pid_list->pids); + kfree(pid_list); + read = ret; + if (!filtered_pids) + goto out; + pid_list = NULL; } - rcu_assign_pointer(tr->filtered_pids, pid_list); list_for_each_entry(file, &tr->events, list) { @@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { 
synchronize_sched(); - free_pages((unsigned long)filtered_pids->pids, filtered_pids->order); + vfree(filtered_pids->pids); kfree(filtered_pids); } else { /* @@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); + out: mutex_unlock(&event_mutex); ret = read; - *ppos += read; + if (read > 0) + *ppos += read; return ret; } -- cgit v1.2.3 From c37775d57830a36382a9774bb84eca4ce3d019cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2016 16:59:18 -0400 Subject: tracing: Add infrastructure to allow set_event_pid to follow children Add the infrastructure needed to have the PIDs in set_event_pid to automatically add PIDs of the children of the tasks that have their PIDs in set_event_pid. This will also remove PIDs from set_event_pid when a task exits This is implemented by adding hooks into the fork and exit tracepoints. On fork, the PIDs are added to the list, and on exit, they are removed. Add a new option called event_fork that when set, PIDs in set_event_pid will automatically get their children PIDs added when they fork, as well as any task that exits will have its PID removed from set_event_pid. This works for instances as well. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 84 +++++++++++++++++++++++++++++++++++++++------ 3 files changed, 79 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a2f0b9f33e9b..0d12dbde8399 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3571,6 +3571,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + if (mask == TRACE_ITER_EVENT_FORK) + trace_event_follow_fork(tr, enabled); + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68cbb8e10aea..2525042760e6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -655,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -966,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ + C(EVENT_FORK, "event-fork"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f7cc72bf25..add81dff7520 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -474,11 +474,23 @@ static void ftrace_clear_events(struct trace_array *tr) /* Shouldn't this be in a header? */ extern int pid_max; +/* Returns true if found in filter */ static bool -ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - pid_t pid; + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. 
+ */ + if (search_pid >= filtered_pids->pid_max) + return false; + + return test_bit(search_pid, filtered_pids->pids); +} +static bool +ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) +{ /* * Return false, because if filtered_pids does not exist, * all pids are good to trace. @@ -486,16 +498,68 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - pid = task->pid; + return !find_filtered_pid(filtered_pids, task->pid); +} - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (task->pid >= filtered_pids->pid_max) - return true; +static void filter_add_remove_task(struct trace_pid_list *pid_list, + struct task_struct *self, + struct task_struct *task) +{ + if (!pid_list) + return; + + /* For forks, we only add if the forking task is listed */ + if (self) { + if (!find_filtered_pid(pid_list, self->pid)) + return; + } + + /* Sorry, but we don't support pid_max changing after setting */ + if (task->pid >= pid_list->pid_max) + return; + + /* "self" is set for forks, and NULL for exits */ + if (self) + set_bit(task->pid, pid_list->pids); + else + clear_bit(task->pid, pid_list->pids); +} + +static void +event_filter_pid_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, NULL, task); +} - return !test_bit(task->pid, filtered_pids->pids); +static void +event_filter_pid_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->filtered_pids); + filter_add_remove_task(pid_list, self, task); +} + +void trace_event_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork, + tr, INT_MIN); + register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit, + tr, INT_MAX); + } else { + unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork, + tr); + unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit, + tr); + } } static void -- cgit v1.2.3 From 08d43a5fa063e03c860f2f391a30c388bcbc948e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 10 Dec 2015 12:50:50 -0600 Subject: tracing: Add lock-free tracing_map Add tracing_map, a special-purpose lock-free map for tracing. tracing_map is designed to aggregate or 'sum' one or more values associated with a specific object of type tracing_map_elt, which is associated by the map to a given key. It provides various hooks allowing per-tracer customization and is separated out into a separate file in order to allow it to be shared between multiple tracers, but isn't meant to be generally used outside of that context. The tracing_map implementation was inspired by lock-free map algorithms originated by Dr. 
Cliff Click: http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf Link: http://lkml.kernel.org/r/b43d68d1add33582a396f553c8ef705a33a6a748.1449767187.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 13 + kernel/trace/Makefile | 1 + kernel/trace/tracing_map.c | 1058 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/tracing_map.h | 282 ++++++++++++ 4 files changed, 1354 insertions(+) create mode 100644 kernel/trace/tracing_map.c create mode 100644 kernel/trace/tracing_map.h (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..48b732943e0e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -528,6 +528,19 @@ config MMIOTRACE See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. +config TRACING_MAP + bool + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + default n + help + tracing_map is a special-purpose lock-free map for tracing, + separated out as a stand-alone facility in order to allow it + to be shared between multiple tracers. It isn't meant to be + generally used outside of that context, and is normally + selected by tracers that use it. + + If in doubt, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..4255c4057aaa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c new file mode 100644 index 000000000000..dd6401dd145e --- /dev/null +++ b/kernel/trace/tracing_map.c @@ -0,0 +1,1058 @@ +/* + * tracing_map - lock-free map for tracing + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright (C) 2015 Tom Zanussi + * + * tracing_map implementation inspired by lock-free map algorithms + * originated by Dr. Cliff Click: + * + * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable + * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf + */ + +#include +#include +#include +#include + +#include "tracing_map.h" +#include "trace.h" + +/* + * NOTE: For a detailed description of the data structures used by + * these functions (such as tracing_map_elt) please see the overview + * of tracing_map data structures at the beginning of tracing_map.h. 
+ */ + +/** + * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * @n: The value to add to the sum + * + * Add n to sum i associated with the specified tracing_map_elt + * instance. The index i is the index returned by the call to + * tracing_map_add_sum_field() when the tracing map was set up. + */ +void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n) +{ + atomic64_add(n, &elt->fields[i].sum); +} + +/** + * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field + * @elt: The tracing_map_elt + * @i: The index of the given sum associated with the tracing_map_elt + * + * Retrieve the value of the sum i associated with the specified + * tracing_map_elt instance. The index i is the index returned by the + * call to tracing_map_add_sum_field() when the tracing map was set + * up. + * + * Return: The sum associated with field i for elt. + */ +u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) +{ + return (u64)atomic64_read(&elt->fields[i].sum); +} + +int tracing_map_cmp_string(void *val_a, void *val_b) +{ + char *a = val_a; + char *b = val_b; + + return strcmp(a, b); +} + +int tracing_map_cmp_none(void *val_a, void *val_b) +{ + return 0; +} + +static int tracing_map_cmp_atomic64(void *val_a, void *val_b) +{ + u64 a = atomic64_read((atomic64_t *)val_a); + u64 b = atomic64_read((atomic64_t *)val_b); + + return (a > b) ? 1 : ((a < b) ? -1 : 0); +} + +#define DEFINE_TRACING_MAP_CMP_FN(type) \ +static int tracing_map_cmp_##type(void *val_a, void *val_b) \ +{ \ + type a = *(type *)val_a; \ + type b = *(type *)val_b; \ + \ + return (a > b) ? 1 : ((a < b) ? -1 : 0); \ +} + +DEFINE_TRACING_MAP_CMP_FN(s64); +DEFINE_TRACING_MAP_CMP_FN(u64); +DEFINE_TRACING_MAP_CMP_FN(s32); +DEFINE_TRACING_MAP_CMP_FN(u32); +DEFINE_TRACING_MAP_CMP_FN(s16); +DEFINE_TRACING_MAP_CMP_FN(u16); +DEFINE_TRACING_MAP_CMP_FN(s8); +DEFINE_TRACING_MAP_CMP_FN(u8); + +tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed) +{ + tracing_map_cmp_fn_t fn = tracing_map_cmp_none; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = tracing_map_cmp_s64; + else + fn = tracing_map_cmp_u64; + break; + case 4: + if (field_is_signed) + fn = tracing_map_cmp_s32; + else + fn = tracing_map_cmp_u32; + break; + case 2: + if (field_is_signed) + fn = tracing_map_cmp_s16; + else + fn = tracing_map_cmp_u16; + break; + case 1: + if (field_is_signed) + fn = tracing_map_cmp_s8; + else + fn = tracing_map_cmp_u8; + break; + } + + return fn; +} + +static int tracing_map_add_field(struct tracing_map *map, + tracing_map_cmp_fn_t cmp_fn) +{ + int ret = -EINVAL; + + if (map->n_fields < TRACING_MAP_FIELDS_MAX) { + ret = map->n_fields; + map->fields[map->n_fields++].cmp_fn = cmp_fn; + } + + return ret; +} + +/** + * tracing_map_add_sum_field - Add a field describing a tracing_map sum + * @map: The tracing_map + * + * Add a sum field to the key and return the index identifying it in + * the map and associated tracing_map_elts. This is the index used + * for instance to update a sum for a particular tracing_map_elt using + * tracing_map_update_sum() or reading it via tracing_map_read_sum(). + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts. 
+ */ +int tracing_map_add_sum_field(struct tracing_map *map) +{ + return tracing_map_add_field(map, tracing_map_cmp_atomic64); +} + +/** + * tracing_map_add_key_field - Add a field describing a tracing_map key + * @map: The tracing_map + * @offset: The offset within the key + * @cmp_fn: The comparison function that will be used to sort on the key + * + * Let the map know there is a key and that if it's used as a sort key + * to use cmp_fn. + * + * A key can be a subset of a compound key; for that purpose, the + * offset param is used to describe where within the the compound key + * the key referenced by this key field resides. + * + * Return: The index identifying the field in the map and associated + * tracing_map_elts. + */ +int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn) + +{ + int idx = tracing_map_add_field(map, cmp_fn); + + if (idx < 0) + return idx; + + map->fields[idx].offset = offset; + + map->key_idx[map->n_keys++] = idx; + + return idx; +} + +void tracing_map_array_clear(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a->pages) + return; + + for (i = 0; i < a->n_pages; i++) + memset(a->pages[i], 0, PAGE_SIZE); +} + +void tracing_map_array_free(struct tracing_map_array *a) +{ + unsigned int i; + + if (!a) + return; + + if (!a->pages) { + kfree(a); + return; + } + + for (i = 0; i < a->n_pages; i++) { + if (!a->pages[i]) + break; + free_page((unsigned long)a->pages[i]); + } +} + +struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, + unsigned int entry_size) +{ + struct tracing_map_array *a; + unsigned int i; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return NULL; + + a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1); + a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift); + a->n_pages = n_elts / a->entries_per_page; + if (!a->n_pages) + a->n_pages = 1; + a->entry_shift = fls(a->entries_per_page) - 1; + a->entry_mask = (1 << a->entry_shift) - 1; + + a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL); + if (!a->pages) + goto free; + + for (i = 0; i < a->n_pages; i++) { + a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!a->pages[i]) + goto free; + } + out: + return a; + free: + tracing_map_array_free(a); + a = NULL; + + goto out; +} + +static void tracing_map_elt_clear(struct tracing_map_elt *elt) +{ + unsigned i; + + for (i = 0; i < elt->map->n_fields; i++) + if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) + atomic64_set(&elt->fields[i].sum, 0); + + if (elt->map->ops && elt->map->ops->elt_clear) + elt->map->ops->elt_clear(elt); +} + +static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) +{ + unsigned int i; + + tracing_map_elt_clear(elt); + + for (i = 0; i < elt->map->n_fields; i++) { + elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn; + + if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64) + elt->fields[i].offset = elt->map->fields[i].offset; + } +} + +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + kfree(elt->fields); + kfree(elt->key); + kfree(elt); +} + +static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) +{ + struct tracing_map_elt *elt; + int err = 0; + + elt = kzalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return ERR_PTR(-ENOMEM); + + elt->map = map; + + elt->key = kzalloc(map->key_size, GFP_KERNEL); + if (!elt->key) { + err = -ENOMEM; + goto free; + } + + 
elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL); + if (!elt->fields) { + err = -ENOMEM; + goto free; + } + + tracing_map_elt_init_fields(elt); + + if (map->ops && map->ops->elt_alloc) { + err = map->ops->elt_alloc(elt); + if (err) + goto free; + } + return elt; + free: + tracing_map_elt_free(elt); + + return ERR_PTR(err); +} + +static struct tracing_map_elt *get_free_elt(struct tracing_map *map) +{ + struct tracing_map_elt *elt = NULL; + int idx; + + idx = atomic_inc_return(&map->next_elt); + if (idx < map->max_elts) { + elt = *(TRACING_MAP_ELT(map->elts, idx)); + if (map->ops && map->ops->elt_init) + map->ops->elt_init(elt); + } + + return elt; +} + +static void tracing_map_free_elts(struct tracing_map *map) +{ + unsigned int i; + + if (!map->elts) + return; + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + + tracing_map_array_free(map->elts); +} + +static int tracing_map_alloc_elts(struct tracing_map *map) +{ + unsigned int i; + + map->elts = tracing_map_array_alloc(map->max_elts, + sizeof(struct tracing_map_elt *)); + if (!map->elts) + return -ENOMEM; + + for (i = 0; i < map->max_elts; i++) { + *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); + if (!(*(TRACING_MAP_ELT(map->elts, i)))) { + tracing_map_free_elts(map); + + return -ENOMEM; + } + } + + return 0; +} + +static inline bool keys_match(void *key, void *test_key, unsigned key_size) +{ + bool match = true; + + if (memcmp(key, test_key, key_size)) + match = false; + + return match; +} + +static inline struct tracing_map_elt * +__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) +{ + u32 idx, key_hash, test_key; + struct tracing_map_entry *entry; + + key_hash = jhash(key, map->key_size, 0); + if (key_hash == 0) + key_hash = 1; + idx = key_hash >> (32 - (map->map_bits + 1)); + + while (1) { + idx &= (map->map_size - 1); + entry = TRACING_MAP_ENTRY(map->map, idx); + test_key = entry->key; + + if (test_key && test_key == key_hash && entry->val && + keys_match(key, entry->val->key, map->key_size)) { + atomic64_inc(&map->hits); + return entry->val; + } + + if (!test_key) { + if (lookup_only) + break; + + if (!cmpxchg(&entry->key, 0, key_hash)) { + struct tracing_map_elt *elt; + + elt = get_free_elt(map); + if (!elt) { + atomic64_inc(&map->drops); + entry->key = 0; + break; + } + + memcpy(elt->key, key, map->key_size); + entry->val = elt; + atomic64_inc(&map->hits); + + return entry->val; + } + } + + idx++; + } + + return NULL; +} + +/** + * tracing_map_insert - Insert key and/or retrieve val from a tracing_map + * @map: The tracing_map to insert into + * @key: The key to insert + * + * Inserts a key into a tracing_map and creates and returns a new + * tracing_map_elt for it, or if the key has already been inserted by + * a previous call, returns the tracing_map_elt already associated + * with it. When the map was created, the number of elements to be + * allocated for the map was specified (internally maintained as + * 'max_elts' in struct tracing_map), and that number of + * tracing_map_elts was created by tracing_map_init(). This is the + * pre-allocated pool of tracing_map_elts that tracing_map_insert() + * will allocate from when adding new keys. Once that pool is + * exhausted, tracing_map_insert() is useless and will return NULL to + * signal that state. There are two user-visible tracing_map + * variables, 'hits' and 'drops', which are updated by this function. 
+ * Every time an element is either successfully inserted or retrieved, + * the 'hits' value is incrememented. Every time an element insertion + * fails, the 'drops' value is incremented. + * + * This is a lock-free tracing map insertion function implementing a + * modified form of Cliff Click's basic insertion algorithm. It + * requires the table size be a power of two. To prevent any + * possibility of an infinite loop we always make the internal table + * size double the size of the requested table size (max_elts * 2). + * Likewise, we never reuse a slot or resize or delete elements - when + * we've reached max_elts entries, we simply return NULL once we've + * run out of entries. Readers can at any point in time traverse the + * tracing map and safely access the key/val pairs. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If this was a newly inserted key, the val will be a newly allocated + * and associated tracing_map_elt pointer val. If the key wasn't + * found and the pool of tracing_map_elts has been exhausted, NULL is + * returned and no further insertions will succeed. + */ +struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, false); +} + +/** + * tracing_map_lookup - Retrieve val from a tracing_map + * @map: The tracing_map to perform the lookup on + * @key: The key to look up + * + * Looks up key in tracing_map and if found returns the matching + * tracing_map_elt. This is a lock-free lookup; see + * tracing_map_insert() for details on tracing_map and how it works. + * Every time an element is retrieved, the 'hits' value is + * incrememented. There is one user-visible tracing_map variable, + * 'hits', which is updated by this function. Every time an element + * is successfully retrieved, the 'hits' value is incrememented. The + * 'drops' value is never updated by this function. + * + * Return: the tracing_map_elt pointer val associated with the key. + * If the key wasn't found, NULL is returned. + */ +struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key) +{ + return __tracing_map_insert(map, key, true); +} + +/** + * tracing_map_destroy - Destroy a tracing_map + * @map: The tracing_map to destroy + * + * Frees a tracing_map along with its associated array of + * tracing_map_elts. + * + * Callers should make sure there are no readers or writers actively + * reading or inserting into the map before calling this. + */ +void tracing_map_destroy(struct tracing_map *map) +{ + if (!map) + return; + + tracing_map_free_elts(map); + + tracing_map_array_free(map->map); + kfree(map); +} + +/** + * tracing_map_clear - Clear a tracing_map + * @map: The tracing_map to clear + * + * Resets the tracing map to a cleared or initial state. The + * tracing_map_elts are all cleared, and the array of struct + * tracing_map_entry is reset to an initialized state. + * + * Callers should make sure there are no writers actively inserting + * into the map before calling this. 
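(A brief editorial illustration, not part of this patch: taken together, tracing_map_insert() and tracing_map_update_sum() form the entire hot path a client needs. The sketch below assumes the map was created with key_size equal to sizeof(u64) and that sum_idx was saved from an earlier tracing_map_add_sum_field() call; the function and parameter names are hypothetical.)

	/* Illustrative sketch only; a hypothetical caller, not code from
	 * this series.  Account one hit worth 'bytes' against 'key'.
	 */
	static void example_account_hit(struct tracing_map *map, u64 key,
					unsigned int sum_idx, u64 bytes)
	{
		struct tracing_map_elt *elt;

		elt = tracing_map_insert(map, &key);
		if (!elt)	/* element pool exhausted; counted in 'drops' */
			return;

		tracing_map_update_sum(elt, sum_idx, bytes);
	}
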
+ */ +void tracing_map_clear(struct tracing_map *map) +{ + unsigned int i; + + atomic_set(&map->next_elt, -1); + atomic64_set(&map->hits, 0); + atomic64_set(&map->drops, 0); + + tracing_map_array_clear(map->map); + + for (i = 0; i < map->max_elts; i++) + tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i))); +} + +static void set_sort_key(struct tracing_map *map, + struct tracing_map_sort_key *sort_key) +{ + map->sort_key = *sort_key; +} + +/** + * tracing_map_create - Create a lock-free map and element pool + * @map_bits: The size of the map (2 ** map_bits) + * @key_size: The size of the key for the map in bytes + * @ops: Optional client-defined tracing_map_ops instance + * @private_data: Client data associated with the map + * + * Creates and sets up a map to contain 2 ** map_bits number of + * elements (internally maintained as 'max_elts' in struct + * tracing_map). Before using, map fields should be added to the map + * with tracing_map_add_sum_field() and tracing_map_add_key_field(). + * tracing_map_init() should then be called to allocate the array of + * tracing_map_elts, in order to avoid allocating anything in the map + * insertion path. The user-specified map size reflects the maximum + * number of elements that can be contained in the table requested by + * the user - internally we double that in order to keep the table + * sparse and keep collisions manageable. + * + * A tracing_map is a special-purpose map designed to aggregate or + * 'sum' one or more values associated with a specific object of type + * tracing_map_elt, which is attached by the map to a given key. + * + * tracing_map_create() sets up the map itself, and provides + * operations for inserting tracing_map_elts, but doesn't allocate the + * tracing_map_elts themselves, or provide a means for describing the + * keys or sums associated with the tracing_map_elts. All + * tracing_map_elts for a given map have the same set of sums and + * keys, which are defined by the client using the functions + * tracing_map_add_key_field() and tracing_map_add_sum_field(). Once + * the fields are defined, the pool of elements allocated for the map + * can be created, which occurs when the client code calls + * tracing_map_init(). + * + * When tracing_map_init() returns, tracing_map_elt elements can be + * inserted into the map using tracing_map_insert(). When called, + * tracing_map_insert() grabs a free tracing_map_elt from the pool, or + * finds an existing match in the map and in either case returns it. + * The client can then use tracing_map_update_sum() and + * tracing_map_read_sum() to update or read a given sum field for the + * tracing_map_elt. + * + * The client can at any point retrieve and traverse the current set + * of inserted tracing_map_elts in a tracing_map, via + * tracing_map_sort_entries(). Sorting can be done on any field, + * including keys. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. 
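(An editorial sketch of the setup sequence just described, not part of this patch; struct example_key and the function name are hypothetical, while the calls use the API declared in tracing_map.h in this series.)

	/* Illustrative sketch only: create a map keyed on a single signed
	 * 32-bit field with one aggregated sum, then allocate its elements.
	 */
	struct example_key {
		int	pid;
	};

	static struct tracing_map *example_setup(void)
	{
		struct tracing_map *map;
		int key_idx, sum_idx;

		map = tracing_map_create(TRACING_MAP_BITS_DEFAULT,
					 sizeof(struct example_key),
					 NULL, NULL);
		if (IS_ERR(map))
			return NULL;

		key_idx = tracing_map_add_key_field(map,
					offsetof(struct example_key, pid),
					tracing_map_cmp_num(sizeof(int), 1));
		sum_idx = tracing_map_add_sum_field(map);	/* e.g. a hitcount */
		if (key_idx < 0 || sum_idx < 0)
			goto fail;

		/* pre-allocate the element pool before any insertions */
		if (tracing_map_init(map) < 0)
			goto fail;

		/* a real client would save key_idx/sum_idx for later
		 * updates and sorting */
		return map;
	 fail:
		tracing_map_destroy(map);
		return NULL;
	}
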
+ */ +struct tracing_map *tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data) +{ + struct tracing_map *map; + unsigned int i; + + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + return ERR_PTR(-EINVAL); + + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return ERR_PTR(-ENOMEM); + + map->map_bits = map_bits; + map->max_elts = (1 << map_bits); + atomic_set(&map->next_elt, -1); + + map->map_size = (1 << (map_bits + 1)); + map->ops = ops; + + map->private_data = private_data; + + map->map = tracing_map_array_alloc(map->map_size, + sizeof(struct tracing_map_entry)); + if (!map->map) + goto free; + + map->key_size = key_size; + for (i = 0; i < TRACING_MAP_KEYS_MAX; i++) + map->key_idx[i] = -1; + out: + return map; + free: + tracing_map_destroy(map); + map = ERR_PTR(-ENOMEM); + + goto out; +} + +/** + * tracing_map_init - Allocate and clear a map's tracing_map_elts + * @map: The tracing_map to initialize + * + * Allocates a clears a pool of tracing_map_elts equal to the + * user-specified size of 2 ** map_bits (internally maintained as + * 'max_elts' in struct tracing_map). Before using, the map fields + * should be added to the map with tracing_map_add_sum_field() and + * tracing_map_add_key_field(). tracing_map_init() should then be + * called to allocate the array of tracing_map_elts, in order to avoid + * allocating anything in the map insertion path. The user-specified + * map size reflects the max number of elements requested by the user + * - internally we double that in order to keep the table sparse and + * keep collisions manageable. + * + * See tracing_map.h for a description of tracing_map_ops. + * + * Return: the tracing_map pointer if successful, ERR_PTR if not. 
+ */ +int tracing_map_init(struct tracing_map *map) +{ + int err; + + if (map->n_fields < 2) + return -EINVAL; /* need at least 1 key and 1 val */ + + err = tracing_map_alloc_elts(map); + if (err) + return err; + + tracing_map_clear(map); + + return err; +} + +static int cmp_entries_dup(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + int ret = 0; + + if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + ret = 1; + + return ret; +} + +static int cmp_entries_sum(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + cmp_fn = field->cmp_fn; + + val_a = &elt_a->fields[sort_key->field_idx].sum; + val_b = &elt_b->fields[sort_key->field_idx].sum; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static int cmp_entries_key(const struct tracing_map_sort_entry **a, + const struct tracing_map_sort_entry **b) +{ + const struct tracing_map_elt *elt_a, *elt_b; + struct tracing_map_sort_key *sort_key; + struct tracing_map_field *field; + tracing_map_cmp_fn_t cmp_fn; + void *val_a, *val_b; + int ret = 0; + + elt_a = (*a)->elt; + elt_b = (*b)->elt; + + sort_key = &elt_a->map->sort_key; + + field = &elt_a->fields[sort_key->field_idx]; + + cmp_fn = field->cmp_fn; + + val_a = elt_a->key + field->offset; + val_b = elt_b->key + field->offset; + + ret = cmp_fn(val_a, val_b); + if (sort_key->descending) + ret = -ret; + + return ret; +} + +static void destroy_sort_entry(struct tracing_map_sort_entry *entry) +{ + if (!entry) + return; + + if (entry->elt_copied) + tracing_map_elt_free(entry->elt); + + kfree(entry); +} + +/** + * tracing_map_destroy_sort_entries - Destroy an array of sort entries + * @entries: The entries to destroy + * @n_entries: The number of entries in the array + * + * Destroy the elements returned by a tracing_map_sort_entries() call. 
+ */ +void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries) +{ + unsigned int i; + + for (i = 0; i < n_entries; i++) + destroy_sort_entry(entries[i]); + + vfree(entries); +} + +static struct tracing_map_sort_entry * +create_sort_entry(void *key, struct tracing_map_elt *elt) +{ + struct tracing_map_sort_entry *sort_entry; + + sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL); + if (!sort_entry) + return NULL; + + sort_entry->key = key; + sort_entry->elt = elt; + + return sort_entry; +} + +static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) +{ + struct tracing_map_elt *dup_elt; + unsigned int i; + + dup_elt = tracing_map_elt_alloc(elt->map); + if (!dup_elt) + return NULL; + + if (elt->map->ops && elt->map->ops->elt_copy) + elt->map->ops->elt_copy(dup_elt, elt); + + dup_elt->private_data = elt->private_data; + memcpy(dup_elt->key, elt->key, elt->map->key_size); + + for (i = 0; i < elt->map->n_fields; i++) { + atomic64_set(&dup_elt->fields[i].sum, + atomic64_read(&elt->fields[i].sum)); + dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; + } + + return dup_elt; +} + +static int merge_dup(struct tracing_map_sort_entry **sort_entries, + unsigned int target, unsigned int dup) +{ + struct tracing_map_elt *target_elt, *elt; + bool first_dup = (target - dup) == 1; + int i; + + if (first_dup) { + elt = sort_entries[target]->elt; + target_elt = copy_elt(elt); + if (!target_elt) + return -ENOMEM; + sort_entries[target]->elt = target_elt; + sort_entries[target]->elt_copied = true; + } else + target_elt = sort_entries[target]->elt; + + elt = sort_entries[dup]->elt; + + for (i = 0; i < elt->map->n_fields; i++) + atomic64_add(atomic64_read(&elt->fields[i].sum), + &target_elt->fields[i].sum); + + sort_entries[dup]->dup = true; + + return 0; +} + +static int merge_dups(struct tracing_map_sort_entry **sort_entries, + int n_entries, unsigned int key_size) +{ + unsigned int dups = 0, total_dups = 0; + int err, i, j; + void *key; + + if (n_entries < 2) + return total_dups; + + sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_dup, NULL); + + key = sort_entries[0]->key; + for (i = 1; i < n_entries; i++) { + if (!memcmp(sort_entries[i]->key, key, key_size)) { + dups++; total_dups++; + err = merge_dup(sort_entries, i - dups, i); + if (err) + return err; + continue; + } + key = sort_entries[i]->key; + dups = 0; + } + + if (!total_dups) + return total_dups; + + for (i = 0, j = 0; i < n_entries; i++) { + if (!sort_entries[i]->dup) { + sort_entries[j] = sort_entries[i]; + if (j++ != i) + sort_entries[i] = NULL; + } else { + destroy_sort_entry(sort_entries[i]); + sort_entries[i] = NULL; + } + } + + return total_dups; +} + +static bool is_key(struct tracing_map *map, unsigned int field_idx) +{ + unsigned int i; + + for (i = 0; i < map->n_keys; i++) + if (map->key_idx[i] == field_idx) + return true; + return false; +} + +static void sort_secondary(struct tracing_map *map, + const struct tracing_map_sort_entry **entries, + unsigned int n_entries, + struct tracing_map_sort_key *primary_key, + struct tracing_map_sort_key *secondary_key) +{ + int (*primary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + int (*secondary_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + unsigned i, start = 0, n_sub = 1; + + if (is_key(map, primary_key->field_idx)) + primary_fn = cmp_entries_key; + else + primary_fn = 
cmp_entries_sum; + + if (is_key(map, secondary_key->field_idx)) + secondary_fn = cmp_entries_key; + else + secondary_fn = cmp_entries_sum; + + for (i = 0; i < n_entries - 1; i++) { + const struct tracing_map_sort_entry **a = &entries[i]; + const struct tracing_map_sort_entry **b = &entries[i + 1]; + + if (primary_fn(a, b) == 0) { + n_sub++; + if (i < n_entries - 2) + continue; + } + + if (n_sub < 2) { + start = i + 1; + n_sub = 1; + continue; + } + + set_sort_key(map, secondary_key); + sort(&entries[start], n_sub, + sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))secondary_fn, NULL); + set_sort_key(map, primary_key); + + start = i + 1; + n_sub = 1; + } +} + +/** + * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map + * @map: The tracing_map + * @sort_key: The sort key to use for sorting + * @sort_entries: outval: pointer to allocated and sorted array of entries + * + * tracing_map_sort_entries() sorts the current set of entries in the + * map and returns the list of tracing_map_sort_entries containing + * them to the client in the sort_entries param. The client can + * access the struct tracing_map_elt element of interest directly as + * the 'elt' field of a returned struct tracing_map_sort_entry object. + * + * The sort_key has only two fields: idx and descending. 'idx' refers + * to the index of the field added via tracing_map_add_sum_field() or + * tracing_map_add_key_field() when the tracing_map was initialized. + * 'descending' is a flag that if set reverses the sort order, which + * by default is ascending. + * + * The client should not hold on to the returned array but should use + * it and call tracing_map_destroy_sort_entries() when done. + * + * Return: the number of sort_entries in the struct tracing_map_sort_entry + * array, negative on error + */ +int tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries) +{ + int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, + const struct tracing_map_sort_entry **); + struct tracing_map_sort_entry *sort_entry, **entries; + int i, n_entries, ret; + + entries = vmalloc(map->max_elts * sizeof(sort_entry)); + if (!entries) + return -ENOMEM; + + for (i = 0, n_entries = 0; i < map->map_size; i++) { + struct tracing_map_entry *entry; + + entry = TRACING_MAP_ENTRY(map->map, i); + + if (!entry->key || !entry->val) + continue; + + entries[n_entries] = create_sort_entry(entry->val->key, + entry->val); + if (!entries[n_entries++]) { + ret = -ENOMEM; + goto free; + } + } + + if (n_entries == 0) { + ret = 0; + goto free; + } + + if (n_entries == 1) { + *sort_entries = entries; + return 1; + } + + ret = merge_dups(entries, n_entries, map->key_size); + if (ret < 0) + goto free; + n_entries -= ret; + + if (is_key(map, sort_keys[0].field_idx)) + cmp_entries_fn = cmp_entries_key; + else + cmp_entries_fn = cmp_entries_sum; + + set_sort_key(map, &sort_keys[0]); + + sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *), + (int (*)(const void *, const void *))cmp_entries_fn, NULL); + + if (n_sort_keys > 1) + sort_secondary(map, + (const struct tracing_map_sort_entry **)entries, + n_entries, + &sort_keys[0], + &sort_keys[1]); + + *sort_entries = entries; + + return n_entries; + free: + tracing_map_destroy_sort_entries(entries, n_entries); + + return ret; +} diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h new file mode 100644 index 
000000000000..75718255a8ad --- /dev/null +++ b/kernel/trace/tracing_map.h @@ -0,0 +1,282 @@ +#ifndef __TRACING_MAP_H +#define __TRACING_MAP_H + +#define TRACING_MAP_BITS_DEFAULT 11 +#define TRACING_MAP_BITS_MAX 17 +#define TRACING_MAP_BITS_MIN 7 + +#define TRACING_MAP_FIELDS_MAX 4 +#define TRACING_MAP_KEYS_MAX 2 + +#define TRACING_MAP_SORT_KEYS_MAX 2 + +typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); + +/* + * This is an overview of the tracing_map data structures and how they + * relate to the tracing_map API. The details of the algorithms + * aren't discussed here - this is just a general overview of the data + * structures and how they interact with the API. + * + * The central data structure of the tracing_map is an initially + * zeroed array of struct tracing_map_entry (stored in the map field + * of struct tracing_map). tracing_map_entry is a very simple data + * structure containing only two fields: a 32-bit unsigned 'key' + * variable and a pointer named 'val'. This array of struct + * tracing_map_entry is essentially a hash table which will be + * modified by a single function, tracing_map_insert(), but which can + * be traversed and read by a user at any time (though the user does + * this indirectly via an array of tracing_map_sort_entry - see the + * explanation of that data structure in the discussion of the + * sorting-related data structures below). + * + * The central function of the tracing_map API is + * tracing_map_insert(). tracing_map_insert() hashes the + * arbitrarily-sized key passed into it into a 32-bit unsigned key. + * It then uses this key, truncated to the array size, as an index + * into the array of tracing_map_entries. If the value of the 'key' + * field of the tracing_map_entry found at that location is 0, then + * that entry is considered to be free and can be claimed, by + * replacing the 0 in the 'key' field of the tracing_map_entry with + * the new 32-bit hashed key. Once claimed, that tracing_map_entry's + * 'val' field is then used to store a unique element which will be + * forever associated with that 32-bit hashed key in the + * tracing_map_entry. + * + * That unique element now in the tracing_map_entry's 'val' field is + * an instance of tracing_map_elt, where 'elt' in the latter part of + * that variable name is short for 'element'. The purpose of a + * tracing_map_elt is to hold values specific to the the particular + * 32-bit hashed key it's assocated with. Things such as the unique + * set of aggregated sums associated with the 32-bit hashed key, along + * with a copy of the full key associated with the entry, and which + * was used to produce the 32-bit hashed key. + * + * When tracing_map_create() is called to create the tracing map, the + * user specifies (indirectly via the map_bits param, the details are + * unimportant for this discussion) the maximum number of elements + * that the map can hold (stored in the max_elts field of struct + * tracing_map). This is the maximum possible number of + * tracing_map_entries in the tracing_map_entry array which can be + * 'claimed' as described in the above discussion, and therefore is + * also the maximum number of tracing_map_elts that can be associated + * with the tracing_map_entry array in the tracing_map. Because of + * the way the insertion algorithm works, the size of the allocated + * tracing_map_entry array is always twice the maximum number of + * elements (2 * max_elts). This value is stored in the map_size + * field of struct tracing_map. 
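(A quick worked example of the sizing just described, added for illustration: with the default map_bits of 11, max_elts is 2 ** 11 = 2048 pre-allocated elements, while the internal tracing_map_entry array holds 2 * 2048 = 4096 entries.)
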
+ * + * Because tracing_map_insert() needs to work from any context, + * including from within the memory allocation functions themselves, + * both the tracing_map_entry array and a pool of max_elts + * tracing_map_elts are pre-allocated before any call is made to + * tracing_map_insert(). + * + * The tracing_map_entry array is allocated as a single block by + * tracing_map_create(). + * + * Because the tracing_map_elts are much larger objects and can't + * generally be allocated together as a single large array without + * failure, they're allocated individually, by tracing_map_init(). + * + * The pool of tracing_map_elts are allocated by tracing_map_init() + * rather than by tracing_map_create() because at the time + * tracing_map_create() is called, there isn't enough information to + * create the tracing_map_elts. Specifically,the user first needs to + * tell the tracing_map implementation how many fields the + * tracing_map_elts contain, and which types of fields they are (key + * or sum). The user does this via the tracing_map_add_sum_field() + * and tracing_map_add_key_field() functions, following which the user + * calls tracing_map_init() to finish up the tracing map setup. The + * array holding the pointers which make up the pre-allocated pool of + * tracing_map_elts is allocated as a single block and is stored in + * the elts field of struct tracing_map. + * + * There is also a set of structures used for sorting that might + * benefit from some minimal explanation. + * + * struct tracing_map_sort_key is used to drive the sort at any given + * time. By 'any given time' we mean that a different + * tracing_map_sort_key will be used at different times depending on + * whether the sort currently being performed is a primary or a + * secondary sort. + * + * The sort key is very simple, consisting of the field index of the + * tracing_map_elt field to sort on (which the user saved when adding + * the field), and whether the sort should be done in an ascending or + * descending order. + * + * For the convenience of the sorting code, a tracing_map_sort_entry + * is created for each tracing_map_elt, again individually allocated + * to avoid failures that might be expected if allocated as a single + * large array of struct tracing_map_sort_entry. + * tracing_map_sort_entry instances are the objects expected by the + * various internal sorting functions, and are also what the user + * ultimately receives after calling tracing_map_sort_entries(). + * Because it doesn't make sense for users to access an unordered and + * sparsely populated tracing_map directly, the + * tracing_map_sort_entries() function is provided so that users can + * retrieve a sorted list of all existing elements. In addition to + * the associated tracing_map_elt 'elt' field contained within the + * tracing_map_sort_entry, which is the object of interest to the + * user, tracing_map_sort_entry objects contain a number of additional + * fields which are used for caching and internal purposes and can + * safely be ignored. 
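(To make the sorting flow above concrete, here is an editorial sketch, not part of this patch, of how a client might take and walk a sorted snapshot; key_idx and sum_idx are assumed to be field indices saved when the map was set up, and the function name is hypothetical.)

	/* Illustrative sketch only. */
	static void example_dump_sorted(struct tracing_map *map,
					unsigned int key_idx,
					unsigned int sum_idx)
	{
		struct tracing_map_sort_entry **entries;
		struct tracing_map_sort_key sort_key = {
			.field_idx	= key_idx,	/* sort on the key field */
			.descending	= false,
		};
		int i, n;

		n = tracing_map_sort_entries(map, &sort_key, 1, &entries);
		if (n <= 0)	/* nothing inserted yet, or error */
			return;

		for (i = 0; i < n; i++)
			pr_info("sum: %llu\n", (unsigned long long)
				tracing_map_read_sum(entries[i]->elt, sum_idx));

		tracing_map_destroy_sort_entries(entries, n);
	}
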
+*/ + +struct tracing_map_field { + tracing_map_cmp_fn_t cmp_fn; + union { + atomic64_t sum; + unsigned int offset; + }; +}; + +struct tracing_map_elt { + struct tracing_map *map; + struct tracing_map_field *fields; + void *key; + void *private_data; +}; + +struct tracing_map_entry { + u32 key; + struct tracing_map_elt *val; +}; + +struct tracing_map_sort_key { + unsigned int field_idx; + bool descending; +}; + +struct tracing_map_sort_entry { + void *key; + struct tracing_map_elt *elt; + bool elt_copied; + bool dup; +}; + +struct tracing_map_array { + unsigned int entries_per_page; + unsigned int entry_size_shift; + unsigned int entry_shift; + unsigned int entry_mask; + unsigned int n_pages; + void **pages; +}; + +#define TRACING_MAP_ARRAY_ELT(array, idx) \ + (array->pages[idx >> array->entry_shift] + \ + ((idx & array->entry_mask) << array->entry_size_shift)) + +#define TRACING_MAP_ENTRY(array, idx) \ + ((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx)) + +#define TRACING_MAP_ELT(array, idx) \ + ((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx)) + +struct tracing_map { + unsigned int key_size; + unsigned int map_bits; + unsigned int map_size; + unsigned int max_elts; + atomic_t next_elt; + struct tracing_map_array *elts; + struct tracing_map_array *map; + const struct tracing_map_ops *ops; + void *private_data; + struct tracing_map_field fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_fields; + int key_idx[TRACING_MAP_KEYS_MAX]; + unsigned int n_keys; + struct tracing_map_sort_key sort_key; + atomic64_t hits; + atomic64_t drops; +}; + +/** + * struct tracing_map_ops - callbacks for tracing_map + * + * The methods in this structure define callback functions for various + * operations on a tracing_map or objects related to a tracing_map. + * + * For a detailed description of tracing_map_elt objects please see + * the overview of tracing_map data structures at the beginning of + * this file. + * + * All the methods below are optional. + * + * @elt_alloc: When a tracing_map_elt is allocated, this function, if + * defined, will be called and gives clients the opportunity to + * allocate additional data and attach it to the element + * (tracing_map_elt->private_data is meant for that purpose). + * Element allocation occurs before tracing begins, when the + * tracing_map_init() call is made by client code. + * + * @elt_copy: At certain points in the lifetime of an element, it may + * need to be copied. The copy should include a copy of the + * client-allocated data, which can be copied into the 'to' + * element from the 'from' element. + * + * @elt_free: When a tracing_map_elt is freed, this function is called + * and allows client-allocated per-element data to be freed. + * + * @elt_clear: This callback allows per-element client-defined data to + * be cleared, if applicable. + * + * @elt_init: This callback allows per-element client-defined data to + * be initialized when used i.e. when the element is actually + * claimed by tracing_map_insert() in the context of the map + * insertion. 
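(A sketch of a minimal client-defined tracing_map_ops, added for illustration and not part of this patch; the per-element payload struct and names are hypothetical, while the callback signatures match struct tracing_map_ops below.)

	/* Illustrative sketch only: attach a small private payload to each
	 * element.  elt_alloc runs from tracing_map_init(), i.e. before
	 * tracing starts, so GFP_KERNEL allocation is fine here.
	 */
	struct example_elt_data {
		char	comm[16];
	};

	static int example_elt_alloc(struct tracing_map_elt *elt)
	{
		elt->private_data = kzalloc(sizeof(struct example_elt_data),
					    GFP_KERNEL);
		return elt->private_data ? 0 : -ENOMEM;
	}

	static void example_elt_free(struct tracing_map_elt *elt)
	{
		kfree(elt->private_data);
	}

	static void example_elt_copy(struct tracing_map_elt *to,
				     struct tracing_map_elt *from)
	{
		memcpy(to->private_data, from->private_data,
		       sizeof(struct example_elt_data));
	}

	static const struct tracing_map_ops example_map_ops = {
		.elt_alloc	= example_elt_alloc,
		.elt_copy	= example_elt_copy,
		.elt_free	= example_elt_free,
	};

Such an example_map_ops instance would be passed as the 'ops' argument to tracing_map_create().
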
+ */ +struct tracing_map_ops { + int (*elt_alloc)(struct tracing_map_elt *elt); + void (*elt_copy)(struct tracing_map_elt *to, + struct tracing_map_elt *from); + void (*elt_free)(struct tracing_map_elt *elt); + void (*elt_clear)(struct tracing_map_elt *elt); + void (*elt_init)(struct tracing_map_elt *elt); +}; + +extern struct tracing_map * +tracing_map_create(unsigned int map_bits, + unsigned int key_size, + const struct tracing_map_ops *ops, + void *private_data); +extern int tracing_map_init(struct tracing_map *map); + +extern int tracing_map_add_sum_field(struct tracing_map *map); +extern int tracing_map_add_key_field(struct tracing_map *map, + unsigned int offset, + tracing_map_cmp_fn_t cmp_fn); + +extern void tracing_map_destroy(struct tracing_map *map); +extern void tracing_map_clear(struct tracing_map *map); + +extern struct tracing_map_elt * +tracing_map_insert(struct tracing_map *map, void *key); +extern struct tracing_map_elt * +tracing_map_lookup(struct tracing_map *map, void *key); + +extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size, + int field_is_signed); +extern int tracing_map_cmp_string(void *val_a, void *val_b); +extern int tracing_map_cmp_none(void *val_a, void *val_b); + +extern void tracing_map_update_sum(struct tracing_map_elt *elt, + unsigned int i, u64 n); +extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); +extern void tracing_map_set_field_descr(struct tracing_map *map, + unsigned int i, + unsigned int key_offset, + tracing_map_cmp_fn_t cmp_fn); +extern int +tracing_map_sort_entries(struct tracing_map *map, + struct tracing_map_sort_key *sort_keys, + unsigned int n_sort_keys, + struct tracing_map_sort_entry ***sort_entries); + +extern void +tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries, + unsigned int n_entries); +#endif /* __TRACING_MAP_H */ -- cgit v1.2.3 From 8d44f2f34fb50f40185ef6404b0047223a27ba82 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 19 Feb 2016 14:47:00 -0500 Subject: tracing: Fix TRACING_MAP Kconfig The config option for TRACING_MAP has "default n", which is not needed because the default of configs is 'n'. Also, since the TRACING_MAP has no config prompt, there's no reason to include "If in doubt, say N" in the help text. Fixed a typo in the comments of tracing_map.h. Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 3 --- kernel/trace/tracing_map.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 48b732943e0e..d39556fd863a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -531,7 +531,6 @@ config MMIOTRACE config TRACING_MAP bool depends on ARCH_HAVE_NMI_SAFE_CMPXCHG - default n help tracing_map is a special-purpose lock-free map for tracing, separated out as a stand-alone facility in order to allow it @@ -539,8 +538,6 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. - If in doubt, say N. 
- config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 75718255a8ad..1f7eda548787 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -46,7 +46,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); * That unique element now in the tracing_map_entry's 'val' field is * an instance of tracing_map_elt, where 'elt' in the latter part of * that variable name is short for 'element'. The purpose of a - * tracing_map_elt is to hold values specific to the the particular + * tracing_map_elt is to hold values specific to the particular * 32-bit hashed key it's assocated with. Things such as the unique * set of aggregated sums associated with the 32-bit hashed key, along * with a copy of the full key associated with the entry, and which -- cgit v1.2.3 From 3b772b96b8338bca2532839b2cd7802800e66037 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:41 -0600 Subject: tracing: Update some tracing_map constants and comments Make it clear exactly how many keys and values are supported through better defines, and add 1 to the vals count, since normally clients want support for at least a hitcount and two other values. Also, note the error return value for tracing_map_add_key/val_field() in the comments. Link: http://lkml.kernel.org/r/6696fa02ebc716aa344c27a571a2afaa25e5b4d4.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 4 ++-- kernel/trace/tracing_map.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index dd6401dd145e..e0f172932eca 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -163,7 +163,7 @@ static int tracing_map_add_field(struct tracing_map *map, * tracing_map_update_sum() or reading it via tracing_map_read_sum(). * * Return: The index identifying the field in the map and associated - * tracing_map_elts. + * tracing_map_elts, or -EINVAL on error. */ int tracing_map_add_sum_field(struct tracing_map *map) { @@ -184,7 +184,7 @@ int tracing_map_add_sum_field(struct tracing_map *map) * the key referenced by this key field resides. * * Return: The index identifying the field in the map and associated - * tracing_map_elts. + * tracing_map_elts, or -EINVAL on error. */ int tracing_map_add_key_field(struct tracing_map *map, unsigned int offset, diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 1f7eda548787..618838f5f30a 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -5,9 +5,10 @@ #define TRACING_MAP_BITS_MAX 17 #define TRACING_MAP_BITS_MIN 7 -#define TRACING_MAP_FIELDS_MAX 4 #define TRACING_MAP_KEYS_MAX 2 - +#define TRACING_MAP_VALS_MAX 3 +#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ + TRACING_MAP_VALS_MAX) #define TRACING_MAP_SORT_KEYS_MAX 2 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); -- cgit v1.2.3 From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:42 -0600 Subject: tracing: Add 'hist' event trigger command 'hist' triggers allow users to continually aggregate trace events, which can then be viewed afterwards by simply reading a 'hist' file containing the aggregation in a human-readable format. 
The basic idea is very simple and boils down to a mechanism whereby trace events, rather than being exhaustively dumped in raw form and viewed directly, are automatically 'compressed' into meaningful tables completely defined by the user. This is done strictly via single-line command-line commands and without the aid of any kind of programming language or interpreter. A surprising number of typical use cases can be accomplished by users via this simple mechanism. In fact, a large number of the tasks that users typically do using the more complicated script-based tracing tools, at least during the initial stages of an investigation, can be accomplished by simply specifying a set of keys and values to be used in the creation of a hash table. The Linux kernel trace event subsystem happens to provide an extensive list of keys and values ready-made for such a purpose in the form of the event format files associated with each trace event. By simply consulting the format file for field names of interest and by plugging them into the hist trigger command, users can create an endless number of useful aggregations to help with investigating various properties of the system. See Documentation/trace/events.txt for examples. hist triggers are implemented on top of the existing event trigger infrastructure, and as such are consistent with the existing triggers from a user's perspective as well. The basic syntax follows the existing trigger syntax. Users start an aggregation by writing a 'hist' trigger to the event of interest's trigger file: # echo hist:keys=xxx [ if filter] > event/trigger Once a hist trigger has been set up, by default it continually aggregates every matching event into a hash table using the event key and a value field named 'hitcount'. To view the aggregation at any point in time, simply read the 'hist' file in the same directory as the 'trigger' file: # cat event/hist The detailed syntax provides additional options for user control, and is described exhaustively in Documentation/trace/events.txt and in the virtual tracing/README file in the tracing subsystem. Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/Kconfig | 16 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 17 + kernel/trace/trace.h | 7 + kernel/trace/trace_events.c | 4 + kernel/trace/trace_events_hist.c | 849 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 1 + 8 files changed, 896 insertions(+) create mode 100644 kernel/trace/trace_events_hist.c (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0810f81b6db2..404603720650 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -407,6 +407,7 @@ enum event_trigger_type { ETT_SNAPSHOT = (1 << 1), ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), + ETT_EVENT_HIST = (1 << 4), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d39556fd863a..fafeaf803bd0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -538,6 +538,22 @@ config TRACING_MAP generally used outside of that context, and is normally selected by tracers that use it. 
+config HIST_TRIGGERS + bool "Histogram triggers" + depends on ARCH_HAVE_NMI_SAFE_CMPXCHG + select TRACING_MAP + default n + help + Hist triggers allow one or more arbitrary trace event fields + to be aggregated into hash tables and dumped to stdout by + reading a debugfs/tracefs file. They're useful for + gathering quick and dirty (though precise) summaries of + event activity as an initial guide for further investigation + using more advanced tools. + + See Documentation/trace/events.txt. + If in doubt, say N. + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 4255c4057aaa..979e7bfbde7a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0d12dbde8399..6cf8fd03b028 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,9 @@ static const char readme_msg[] = #endif #ifdef CONFIG_TRACER_SNAPSHOT "\t\t snapshot\n" +#endif +#ifdef CONFIG_HIST_TRIGGERS + "\t\t hist (see below)\n" #endif "\t example: echo traceoff > events/block/block_unplug/trigger\n" "\t echo traceoff:3 > events/block/block_unplug/trigger\n" @@ -3828,6 +3831,20 @@ static const char readme_msg[] = "\t To remove a trigger with a count:\n" "\t echo '!:0 > //trigger\n" "\t Filters can be ignored when removing a trigger.\n" +#ifdef CONFIG_HIST_TRIGGERS + " hist trigger\t- If set, event hits are aggregated into a hash table\n" + "\t Format: hist:keys=\n" + "\t [:size=#entries]\n" + "\t [if ]\n\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key named, and the value of a sum called\n" + "\t 'hitcount' is incremented. Keys correspond to fields in the\n" + "\t event's format description. Keys can be any field. The\n" + "\t 'size' parameter can be used to specify more or fewer than\n" + "\t the default 2048 entries for the hashtable size.\n\n" + "\t Reading the 'hist' file for the event will dump the hash\n" + "\t table in its entirety to stdout." 
+#endif ; static ssize_t diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2525042760e6..505f8a45f426 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1162,6 +1162,13 @@ extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; +extern const struct file_operations event_hist_fops; + +#ifdef CONFIG_HIST_TRIGGERS +extern int register_trigger_hist_cmd(void); +#else +static inline int register_trigger_hist_cmd(void) { return 0; } +#endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index add81dff7520..e7cb983ee93c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); +#ifdef CONFIG_HIST_TRIGGERS + trace_create_file("hist", 0444, file->dir, file, + &event_hist_fops); +#endif trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c new file mode 100644 index 000000000000..23b45e462117 --- /dev/null +++ b/kernel/trace/trace_events_hist.c @@ -0,0 +1,849 @@ +/* + * trace_events_hist - trace event hist triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Copyright (C) 2015 Tom Zanussi + */ + +#include +#include +#include +#include +#include + +#include "tracing_map.h" +#include "trace.h" + +struct hist_field; + +typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); + +struct hist_field { + struct ftrace_event_field *field; + unsigned long flags; + hist_field_fn_t fn; + unsigned int size; +}; + +static u64 hist_field_counter(struct hist_field *field, void *event) +{ + return 1; +} + +static u64 hist_field_string(struct hist_field *hist_field, void *event) +{ + char *addr = (char *)(event + hist_field->field->offset); + + return (u64)(unsigned long)addr; +} + +#define DEFINE_HIST_FIELD_FN(type) \ +static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ +{ \ + type *addr = (type *)(event + hist_field->field->offset); \ + \ + return (u64)*addr; \ +} + +DEFINE_HIST_FIELD_FN(s64); +DEFINE_HIST_FIELD_FN(u64); +DEFINE_HIST_FIELD_FN(s32); +DEFINE_HIST_FIELD_FN(u32); +DEFINE_HIST_FIELD_FN(s16); +DEFINE_HIST_FIELD_FN(u16); +DEFINE_HIST_FIELD_FN(s8); +DEFINE_HIST_FIELD_FN(u8); + +#define for_each_hist_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_fields; (i)++) + +#define for_each_hist_val_field(i, hist_data) \ + for ((i) = 0; (i) < (hist_data)->n_vals; (i)++) + +#define for_each_hist_key_field(i, hist_data) \ + for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) + +#define HITCOUNT_IDX 0 +#define HIST_KEY_MAX 1 +#define HIST_KEY_SIZE_MAX MAX_FILTER_STR_VAL + +enum hist_field_flags { + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, +}; + +struct hist_trigger_attrs { + char *keys_str; + unsigned int map_bits; +}; + +struct hist_trigger_data { + struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; + unsigned int n_vals; + unsigned int n_keys; + unsigned int n_fields; + unsigned int key_size; + struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; + unsigned int n_sort_keys; + struct trace_event_file *event_file; + struct hist_trigger_attrs *attrs; + struct tracing_map *map; +}; + +static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +{ + hist_field_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (field_is_signed) + fn = hist_field_s64; + else + fn = hist_field_u64; + break; + case 4: + if (field_is_signed) + fn = hist_field_s32; + else + fn = hist_field_u32; + break; + case 2: + if (field_is_signed) + fn = hist_field_s16; + else + fn = hist_field_u16; + break; + case 1: + if (field_is_signed) + fn = hist_field_s8; + else + fn = hist_field_u8; + break; + } + + return fn; +} + +static int parse_map_size(char *str) +{ + unsigned long size, map_bits; + int ret; + + strsep(&str, "="); + if (!str) { + ret = -EINVAL; + goto out; + } + + ret = kstrtoul(str, 0, &size); + if (ret) + goto out; + + map_bits = ilog2(roundup_pow_of_two(size)); + if (map_bits < TRACING_MAP_BITS_MIN || + map_bits > TRACING_MAP_BITS_MAX) + ret = -EINVAL; + else + ret = map_bits; + out: + return ret; +} + +static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) +{ + if (!attrs) + return; + + kfree(attrs->keys_str); + kfree(attrs); +} + +static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +{ + struct hist_trigger_attrs *attrs; + int ret = 0; + + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return ERR_PTR(-ENOMEM); + + while (trigger_str) { + char *str = strsep(&trigger_str, ":"); + + if ((strncmp(str, "key=", strlen("key=")) == 0) || + (strncmp(str, "keys=", strlen("keys=")) == 0)) + 
attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "size=", strlen("size=")) == 0) { + int map_bits = parse_map_size(str); + + if (map_bits < 0) { + ret = map_bits; + goto free; + } + attrs->map_bits = map_bits; + } else { + ret = -EINVAL; + goto free; + } + } + + if (!attrs->keys_str) { + ret = -EINVAL; + goto free; + } + + return attrs; + free: + destroy_hist_trigger_attrs(attrs); + + return ERR_PTR(ret); +} + +static void destroy_hist_field(struct hist_field *hist_field) +{ + kfree(hist_field); +} + +static struct hist_field *create_hist_field(struct ftrace_event_field *field, + unsigned long flags) +{ + struct hist_field *hist_field; + + if (field && is_function_field(field)) + return NULL; + + hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); + if (!hist_field) + return NULL; + + if (flags & HIST_FIELD_FL_HITCOUNT) { + hist_field->fn = hist_field_counter; + goto out; + } + + if (is_string_field(field)) { + flags |= HIST_FIELD_FL_STRING; + hist_field->fn = hist_field_string; + } else { + hist_field->fn = select_value_fn(field->size, + field->is_signed); + if (!hist_field->fn) { + destroy_hist_field(hist_field); + return NULL; + } + } + out: + hist_field->field = field; + hist_field->flags = flags; + + return hist_field; +} + +static void destroy_hist_fields(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { + if (hist_data->fields[i]) { + destroy_hist_field(hist_data->fields[i]); + hist_data->fields[i] = NULL; + } + } +} + +static int create_hitcount_val(struct hist_trigger_data *hist_data) +{ + hist_data->fields[HITCOUNT_IDX] = + create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); + if (!hist_data->fields[HITCOUNT_IDX]) + return -ENOMEM; + + hist_data->n_vals++; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + return -EINVAL; + + return 0; +} + +static int create_val_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_hitcount_val(hist_data); + + return ret; +} + +static int create_key_field(struct hist_trigger_data *hist_data, + unsigned int key_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + unsigned int key_size; + int ret = 0; + + if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) + return -EINVAL; + + flags |= HIST_FIELD_FL_KEY; + + field = trace_find_event_field(file->event_call, field_str); + if (!field) { + ret = -EINVAL; + goto out; + } + + key_size = field->size; + + hist_data->fields[key_idx] = create_hist_field(field, flags); + if (!hist_data->fields[key_idx]) { + ret = -ENOMEM; + goto out; + } + + key_size = ALIGN(key_size, sizeof(u64)); + hist_data->fields[key_idx]->size = key_size; + hist_data->key_size = key_size; + if (hist_data->key_size > HIST_KEY_SIZE_MAX) { + ret = -EINVAL; + goto out; + } + + hist_data->n_keys++; + + if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) + return -EINVAL; + + ret = key_size; + out: + return ret; +} + +static int create_key_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + unsigned int i, n_vals = hist_data->n_vals; + char *fields_str, *field_str; + int ret = -EINVAL; + + fields_str = hist_data->attrs->keys_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + ret = create_key_field(hist_data, i, file, 
field_str); + if (ret < 0) + goto out; + } + if (fields_str) { + ret = -EINVAL; + goto out; + } + ret = 0; + out: + return ret; +} + +static int create_hist_fields(struct hist_trigger_data *hist_data, + struct trace_event_file *file) +{ + int ret; + + ret = create_val_fields(hist_data, file); + if (ret) + goto out; + + ret = create_key_fields(hist_data, file); + if (ret) + goto out; + + hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; + out: + return ret; +} + +static int create_sort_keys(struct hist_trigger_data *hist_data) +{ + int ret = 0; + + hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */ + + return ret; +} + +static void destroy_hist_data(struct hist_trigger_data *hist_data) +{ + destroy_hist_trigger_attrs(hist_data->attrs); + destroy_hist_fields(hist_data); + tracing_map_destroy(hist_data->map); + kfree(hist_data); +} + +static int create_tracing_map_fields(struct hist_trigger_data *hist_data) +{ + struct tracing_map *map = hist_data->map; + struct ftrace_event_field *field; + struct hist_field *hist_field; + unsigned int i, idx; + + for_each_hist_field(i, hist_data) { + hist_field = hist_data->fields[i]; + if (hist_field->flags & HIST_FIELD_FL_KEY) { + tracing_map_cmp_fn_t cmp_fn; + + field = hist_field->field; + + if (is_string_field(field)) + cmp_fn = tracing_map_cmp_string; + else + cmp_fn = tracing_map_cmp_num(field->size, + field->is_signed); + idx = tracing_map_add_key_field(map, 0, cmp_fn); + } else + idx = tracing_map_add_sum_field(map); + + if (idx < 0) + return idx; + } + + return 0; +} + +static struct hist_trigger_data * +create_hist_data(unsigned int map_bits, + struct hist_trigger_attrs *attrs, + struct trace_event_file *file) +{ + struct hist_trigger_data *hist_data; + int ret = 0; + + hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL); + if (!hist_data) + return ERR_PTR(-ENOMEM); + + hist_data->attrs = attrs; + + ret = create_hist_fields(hist_data, file); + if (ret) + goto free; + + ret = create_sort_keys(hist_data); + if (ret) + goto free; + + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, + NULL, hist_data); + if (IS_ERR(hist_data->map)) { + ret = PTR_ERR(hist_data->map); + hist_data->map = NULL; + goto free; + } + + ret = create_tracing_map_fields(hist_data); + if (ret) + goto free; + + ret = tracing_map_init(hist_data->map); + if (ret) + goto free; + + hist_data->event_file = file; + out: + return hist_data; + free: + hist_data->attrs = NULL; + + destroy_hist_data(hist_data); + + hist_data = ERR_PTR(ret); + + goto out; +} + +static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + void *rec) +{ + struct hist_field *hist_field; + unsigned int i; + u64 hist_val; + + for_each_hist_val_field(i, hist_data) { + hist_field = hist_data->fields[i]; + hist_val = hist_field->fn(hist_field, rec); + tracing_map_update_sum(elt, i, hist_val); + } +} + +static void event_hist_trigger(struct event_trigger_data *data, void *rec) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + struct tracing_map_elt *elt; + u64 field_contents; + void *key = NULL; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) + key = (void *)(unsigned long)field_contents; + else + key = (void *)&field_contents; + } + + elt = tracing_map_insert(hist_data->map, key); + if (elt) + hist_trigger_elt_update(hist_data, elt, 
rec); +} + +static void +hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, void *key, + struct tracing_map_elt *elt) +{ + struct hist_field *key_field; + unsigned int i; + u64 uval; + + seq_puts(m, "{ "); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ", "); + + if (key_field->flags & HIST_FIELD_FL_STRING) { + seq_printf(m, "%s: %-50s", key_field->field->name, + (char *)key); + } else { + uval = *(u64 *)key; + seq_printf(m, "%s: %10llu", + key_field->field->name, uval); + } + } + + seq_puts(m, " }"); + + seq_printf(m, " hitcount: %10llu", + tracing_map_read_sum(elt, HITCOUNT_IDX)); + + seq_puts(m, "\n"); +} + +static int print_entries(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + unsigned int i, n_entries; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, + &sort_entries); + if (n_entries < 0) + return n_entries; + + for (i = 0; i < n_entries; i++) + hist_trigger_entry_print(m, hist_data, + sort_entries[i]->key, + sort_entries[i]->elt); + + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *test, *data = NULL; + struct trace_event_file *event_file; + struct hist_trigger_data *hist_data; + int n_entries, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(test, &event_file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + data = test; + break; + } + } + if (!data) + goto out_unlock; + + seq_puts(m, "# event histogram\n#\n# trigger info: "); + data->ops->print(m, data->ops, data); + seq_puts(m, "\n"); + + hist_data = data->private_data; + n_entries = print_entries(m, hist_data); + if (n_entries < 0) { + ret = n_entries; + n_entries = 0; + } + + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", + (u64)atomic64_read(&hist_data->map->hits), + n_entries, (u64)atomic64_read(&hist_data->map->drops)); + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +static int event_hist_open(struct inode *inode, struct file *file) +{ + return single_open(file, hist_show, file); +} + +const struct file_operations event_hist_fops = { + .open = event_hist_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) +{ + seq_printf(m, "%s", hist_field->field->name); +} + +static int event_hist_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + struct hist_field *key_field; + unsigned int i; + + seq_puts(m, "hist:keys="); + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (i > hist_data->n_vals) + seq_puts(m, ","); + + hist_field_print(m, key_field); + } + + seq_puts(m, ":vals="); + seq_puts(m, "hitcount"); + + seq_puts(m, ":sort="); + seq_puts(m, "hitcount"); + + seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); + + if (data->filter_str) + seq_printf(m, " if %s", data->filter_str); + + seq_puts(m, " [active]"); + + seq_putc(m, '\n'); + + return 0; +} + +static void 
event_hist_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + trigger_data_free(data); + destroy_hist_data(hist_data); + } +} + +static struct event_trigger_ops event_hist_trigger_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_trigger_init, + .free = event_hist_trigger_free, +}; + +static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) +{ + return &event_hist_trigger_ops; +} + +static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + update_cond_flag(file); + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + update_cond_flag(file); + ret--; + } + out: + return ret; +} + +static int event_hist_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) +{ + unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; + struct event_trigger_data *trigger_data; + struct hist_trigger_attrs *attrs; + struct event_trigger_ops *trigger_ops; + struct hist_trigger_data *hist_data; + char *trigger; + int ret = 0; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (k:v [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + attrs = parse_hist_trigger_attrs(trigger); + if (IS_ERR(attrs)) + return PTR_ERR(attrs); + + if (attrs->map_bits) + hist_trigger_bits = attrs->map_bits; + + hist_data = create_hist_data(hist_trigger_bits, attrs, file); + if (IS_ERR(hist_data)) { + destroy_hist_trigger_attrs(attrs); + return PTR_ERR(hist_data); + } + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out_free; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + trigger_data->private_data = hist_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + ret = 0; + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of triggers registered, + * but if it didn't register any it returns zero. Consider no + * triggers registered a failure too. 
+ */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + /* Just return zero, not the number of registered triggers */ + ret = 0; + out: + return ret; + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + + kfree(trigger_data); + + destroy_hist_data(hist_data); + goto out; +} + +static struct event_command trigger_hist_cmd = { + .name = "hist", + .trigger_type = ETT_EVENT_HIST, + .flags = EVENT_CMD_FL_NEEDS_REC, + .func = event_hist_trigger_func, + .reg = hist_register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = event_hist_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +__init int register_trigger_hist_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_cmd); + WARN_ON(ret < 0); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d67992f3bb0e..d29092afe005 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_cmd(); return 0; } -- cgit v1.2.3 From f2606835d70d2a2e6a134f01821da8149e124796 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:43 -0600 Subject: tracing: Add hist trigger support for multiple values ('vals=' param) Allow users to specify trace event fields to use in aggregated sums via a new 'vals=' keyword. Before this addition, the only aggregated sum supported was the implied value 'hitcount'. With this addition, 'hitcount' is also supported as an explicit value field, as is any numeric trace event field. This expands the hist trigger syntax from this: # echo hist:keys=xxx [ if filter] > event/trigger to this: # echo hist:keys=xxx:vals=yyy [ if filter] > event/trigger Link: http://lkml.kernel.org/r/2a5d1adb5ba6c65d7bb2148e379f2fed47f29a68.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 +++--- kernel/trace/trace_events_hist.c | 79 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6cf8fd03b028..bb62f54c5480 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3834,14 +3834,16 @@ static const char readme_msg[] = #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=\n" + "\t [:values=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" - "\t table using the key named, and the value of a sum called\n" - "\t 'hitcount' is incremented. Keys correspond to fields in the\n" - "\t event's format description. Keys can be any field. The\n" - "\t 'size' parameter can be used to specify more or fewer than\n" - "\t the default 2048 entries for the hashtable size.\n\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. Keys and values\n" + "\t correspond to fields in the event's format description. Keys\n" + "\t can be any field. 
Values must correspond to numeric fields.\n" + "\t The 'size' parameter can be used to specify more or fewer\n" + "\t than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout." #endif diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 23b45e462117..1b33590b9c8f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -84,6 +84,7 @@ enum hist_field_flags { struct hist_trigger_attrs { char *keys_str; + char *vals_str; unsigned int map_bits; }; @@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) return; kfree(attrs->keys_str); + kfree(attrs->vals_str); kfree(attrs); } @@ -183,6 +185,10 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) if ((strncmp(str, "key=", strlen("key=")) == 0) || (strncmp(str, "keys=", strlen("keys=")) == 0)) attrs->keys_str = kstrdup(str, GFP_KERNEL); + else if ((strncmp(str, "val=", strlen("val=")) == 0) || + (strncmp(str, "vals=", strlen("vals=")) == 0) || + (strncmp(str, "values=", strlen("values=")) == 0)) + attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -276,13 +282,70 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data) return 0; } +static int create_val_field(struct hist_trigger_data *hist_data, + unsigned int val_idx, + struct trace_event_file *file, + char *field_str) +{ + struct ftrace_event_field *field = NULL; + unsigned long flags = 0; + int ret = 0; + + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) + return -EINVAL; + field = trace_find_event_field(file->event_call, field_str); + if (!field) { + ret = -EINVAL; + goto out; + } + + hist_data->fields[val_idx] = create_hist_field(field, flags); + if (!hist_data->fields[val_idx]) { + ret = -ENOMEM; + goto out; + } + + ++hist_data->n_vals; + + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) + ret = -EINVAL; + out: + return ret; +} + static int create_val_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { + char *fields_str, *field_str; + unsigned int i, j; int ret; ret = create_hitcount_val(hist_data); + if (ret) + goto out; + fields_str = hist_data->attrs->vals_str; + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) + goto out; + + for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX && + j < TRACING_MAP_VALS_MAX; i++) { + field_str = strsep(&fields_str, ","); + if (!field_str) + break; + if (strcmp(field_str, "hitcount") == 0) + continue; + ret = create_val_field(hist_data, j++, file, field_str); + if (ret) + goto out; + } + if (fields_str && (strcmp(fields_str, "hitcount") != 0)) + ret = -EINVAL; + out: return ret; } @@ -552,6 +615,12 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); + for (i = 1; i < hist_data->n_vals; i++) { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } + seq_puts(m, "\n"); } @@ -659,7 +728,15 @@ static int event_hist_trigger_print(struct seq_file *m, } seq_puts(m, ":vals="); - seq_puts(m, "hitcount"); + + for_each_hist_val_field(i, hist_data) { + if (i == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + seq_puts(m, ","); + hist_field_print(m, hist_data->fields[i]); + } + } seq_puts(m, ":sort="); seq_puts(m, "hitcount"); -- cgit v1.2.3 From 
76a3b0c8ac344e1d0f436160cbb59b670b086947 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:44 -0600 Subject: tracing: Add hist trigger support for compound keys Allow users to specify multiple trace event fields to use in keys by allowing multiple fields in the 'keys=' keyword. With this addition, any unique combination of any of the fields named in the 'keys' keyword will result in a new entry being added to the hash table. Link: http://lkml.kernel.org/r/0cfa24e6ac3b0dcece7737d94aa1f322ae3afc4b.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++---- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62f54c5480..7a4c4dcf0bfe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3833,7 +3833,7 @@ static const char readme_msg[] = "\t Filters can be ignored when removing a trigger.\n" #ifdef CONFIG_HIST_TRIGGERS " hist trigger\t- If set, event hits are aggregated into a hash table\n" - "\t Format: hist:keys=\n" + "\t Format: hist:keys=\n" "\t [:values=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" @@ -3841,9 +3841,11 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field. Values must correspond to numeric fields.\n" - "\t The 'size' parameter can be used to specify more or fewer\n" - "\t than the default 2048 entries for the hashtable size.\n\n" + "\t can be any field. Compound keys consisting of up to two\n" + "\t fields can be specified by the 'keys' keyword. Values must\n" + "\t correspond to numeric fields. The 'size' parameter can be\n" + "\t used to specify more or fewer than the default 2048 entries\n" + "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout." 
#endif diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1b33590b9c8f..65fdfc6cb633 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -32,6 +32,7 @@ struct hist_field { unsigned long flags; hist_field_fn_t fn; unsigned int size; + unsigned int offset; }; static u64 hist_field_counter(struct hist_field *field, void *event) @@ -73,8 +74,7 @@ DEFINE_HIST_FIELD_FN(u8); for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) #define HITCOUNT_IDX 0 -#define HIST_KEY_MAX 1 -#define HIST_KEY_SIZE_MAX MAX_FILTER_STR_VAL +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, @@ -351,6 +351,7 @@ static int create_val_fields(struct hist_trigger_data *hist_data, static int create_key_field(struct hist_trigger_data *hist_data, unsigned int key_idx, + unsigned int key_offset, struct trace_event_file *file, char *field_str) { @@ -380,7 +381,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, key_size = ALIGN(key_size, sizeof(u64)); hist_data->fields[key_idx]->size = key_size; - hist_data->key_size = key_size; + hist_data->fields[key_idx]->offset = key_offset; + hist_data->key_size += key_size; if (hist_data->key_size > HIST_KEY_SIZE_MAX) { ret = -EINVAL; goto out; @@ -399,7 +401,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, static int create_key_fields(struct hist_trigger_data *hist_data, struct trace_event_file *file) { - unsigned int i, n_vals = hist_data->n_vals; + unsigned int i, key_offset = 0, n_vals = hist_data->n_vals; char *fields_str, *field_str; int ret = -EINVAL; @@ -411,13 +413,15 @@ static int create_key_fields(struct hist_trigger_data *hist_data, if (!fields_str) goto out; - for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) { + for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) { field_str = strsep(&fields_str, ","); if (!field_str) break; - ret = create_key_field(hist_data, i, file, field_str); + ret = create_key_field(hist_data, i, key_offset, + file, field_str); if (ret < 0) goto out; + key_offset += ret; } if (fields_str) { ret = -EINVAL; @@ -482,7 +486,10 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) else cmp_fn = tracing_map_cmp_num(field->size, field->is_signed); - idx = tracing_map_add_key_field(map, 0, cmp_fn); + idx = tracing_map_add_key_field(map, + hist_field->offset, + cmp_fn); + } else idx = tracing_map_add_sum_field(map); @@ -562,12 +569,16 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + char compound_key[HIST_KEY_SIZE_MAX]; struct hist_field *key_field; struct tracing_map_elt *elt; u64 field_contents; void *key = NULL; unsigned int i; + if (hist_data->n_keys > 1) + memset(compound_key, 0, hist_data->key_size); + for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -576,8 +587,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = (void *)(unsigned long)field_contents; else key = (void *)&field_contents; + + if (hist_data->n_keys > 1) { + memcpy(compound_key + key_field->offset, key, + key_field->size); + } } + if (hist_data->n_keys > 1) + key = compound_key; + elt = tracing_map_insert(hist_data->map, key); if (elt) hist_trigger_elt_update(hist_data, elt, rec); @@ -602,11 +621,11 @@ hist_trigger_entry_print(struct seq_file *m, if 
(key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, - (char *)key); + (char *)(key + key_field->offset)); } else { - uval = *(u64 *)key; - seq_printf(m, "%s: %10llu", - key_field->field->name, uval); + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %10llu", key_field->field->name, + uval); } } -- cgit v1.2.3 From e62347d2453474fa514fbbbc4636313d34d3c850 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:45 -0600 Subject: tracing: Add hist trigger support for user-defined sorting ('sort=' param) Allow users to specify keys and/or values to sort on. With this addition, keys and values specified using the 'keys=' and 'vals=' keywords can be used to sort the hist trigger output via a new 'sort=' keyword. If multiple sort keys are specified, the output will be sorted using the second key as a secondary sort key, etc. The default sort order is ascending; if the user wants a different sort order, '.descending' can be appended to the specific sort key. Before this addition, output was always sorted by 'hitcount' in ascending order. This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy \ [ if filter] > event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/b30a41db66ba486979c4f987aff5fab500ea53b3.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++- kernel/trace/trace_events_hist.c | 112 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7a4c4dcf0bfe..d16fa8e1cdbf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3835,6 +3835,7 @@ static const char readme_msg[] = " hist trigger\t- If set, event hits are aggregated into a hash table\n" "\t Format: hist:keys=\n" "\t [:values=]\n" + "\t [:sort=]\n" "\t [:size=#entries]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" @@ -3843,7 +3844,10 @@ static const char readme_msg[] = "\t correspond to fields in the event's format description. Keys\n" "\t can be any field. Compound keys consisting of up to two\n" "\t fields can be specified by the 'keys' keyword. Values must\n" - "\t correspond to numeric fields. The 'size' parameter can be\n" + "\t correspond to numeric fields. Sort keys consisting of up to\n" + "\t two fields can be specified using the 'sort' keyword. The\n" + "\t sort direction can be modified by appending '.descending' or\n" + "\t '.ascending' to a sort field. 
The 'size' parameter can be\n" "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 65fdfc6cb633..03ba4530c08c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -85,6 +85,7 @@ enum hist_field_flags { struct hist_trigger_attrs { char *keys_str; char *vals_str; + char *sort_key_str; unsigned int map_bits; }; @@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) if (!attrs) return; + kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); kfree(attrs); @@ -189,6 +191,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) (strncmp(str, "vals=", strlen("vals=")) == 0) || (strncmp(str, "values=", strlen("values=")) == 0)) attrs->vals_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "sort=", strlen("sort=")) == 0) + attrs->sort_key_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -450,12 +454,92 @@ static int create_hist_fields(struct hist_trigger_data *hist_data, return ret; } +static int is_descending(const char *str) +{ + if (!str) + return 0; + + if (strcmp(str, "descending") == 0) + return 1; + + if (strcmp(str, "ascending") == 0) + return 0; + + return -EINVAL; +} + static int create_sort_keys(struct hist_trigger_data *hist_data) { - int ret = 0; + char *fields_str = hist_data->attrs->sort_key_str; + struct ftrace_event_field *field = NULL; + struct tracing_map_sort_key *sort_key; + int descending, ret = 0; + unsigned int i, j; + + hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ + + if (!fields_str) + goto out; + + strsep(&fields_str, "="); + if (!fields_str) { + ret = -EINVAL; + goto out; + } + + for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + char *field_str, *field_name; + + sort_key = &hist_data->sort_keys[i]; + + field_str = strsep(&fields_str, ","); + if (!field_str) { + if (i == 0) + ret = -EINVAL; + break; + } + + if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) { + ret = -EINVAL; + break; + } - hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */ + field_name = strsep(&field_str, "."); + if (!field_name) { + ret = -EINVAL; + break; + } + + if (strcmp(field_name, "hitcount") == 0) { + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + break; + } + sort_key->descending = descending; + continue; + } + for (j = 1; j < hist_data->n_fields; j++) { + field = hist_data->fields[j]->field; + if (field && (strcmp(field_name, field->name) == 0)) { + sort_key->field_idx = j; + descending = is_descending(field_str); + if (descending < 0) { + ret = descending; + goto out; + } + sort_key->descending = descending; + break; + } + } + if (j == hist_data->n_fields) { + ret = -EINVAL; + break; + } + } + hist_data->n_sort_keys = i; + out: return ret; } @@ -758,7 +842,29 @@ static int event_hist_trigger_print(struct seq_file *m, } seq_puts(m, ":sort="); - seq_puts(m, "hitcount"); + + for (i = 0; i < hist_data->n_sort_keys; i++) { + struct tracing_map_sort_key *sort_key; + + sort_key = &hist_data->sort_keys[i]; + + if (i > 0) + seq_puts(m, ","); + + if (sort_key->field_idx == HITCOUNT_IDX) + seq_puts(m, "hitcount"); + else { + unsigned int idx = sort_key->field_idx; + + if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) + return 
-EINVAL; + + hist_field_print(m, hist_data->fields[idx]); + } + + if (sort_key->descending) + seq_puts(m, ".descending"); + } seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); -- cgit v1.2.3 From 83e99914c9e2677d8a80f2a23eca0d215d5bfb0f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:46 -0600 Subject: tracing: Add hist trigger support for pausing and continuing a trace Allow users to append 'pause' or 'continue' to an existing trigger in order to have it paused or to have a paused trace continue. This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \ [ if filter] >> event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause or cont \ [ if filter] >> event/trigger Link: http://lkml.kernel.org/r/b672a92c14702cb924cdf6fc27ea1809bed04907.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 ++++++- kernel/trace/trace_events_hist.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d16fa8e1cdbf..7d5c63dd410a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3837,6 +3837,7 @@ static const char readme_msg[] = "\t [:values=]\n" "\t [:sort=]\n" "\t [:size=#entries]\n" + "\t [:pause][:continue]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3851,7 +3852,11 @@ static const char readme_msg[] = "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout." + "\t table in its entirety to stdout.\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 
'continue' can be used to start or\n" + "\t restart a paused hist trigger.\n\n" #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 03ba4530c08c..e3fa9c89f125 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -86,6 +86,8 @@ struct hist_trigger_attrs { char *keys_str; char *vals_str; char *sort_key_str; + bool pause; + bool cont; unsigned int map_bits; }; @@ -193,6 +195,11 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "sort=", strlen("sort=")) == 0) attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strcmp(str, "pause") == 0) + attrs->pause = true; + else if ((strcmp(str, "cont") == 0) || + (strcmp(str, "continue") == 0)) + attrs->cont = true; else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -871,7 +878,10 @@ static int event_hist_trigger_print(struct seq_file *m, if (data->filter_str) seq_printf(m, " if %s", data->filter_str); - seq_puts(m, " [active]"); + if (data->paused) + seq_puts(m, " [paused]"); + else + seq_puts(m, " [active]"); seq_putc(m, '\n'); @@ -910,16 +920,30 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { + struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test; int ret = 0; list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - ret = -EEXIST; + if (hist_data->attrs->pause) + test->paused = true; + else if (hist_data->attrs->cont) + test->paused = false; + else + ret = -EEXIST; goto out; } } + if (hist_data->attrs->cont) { + ret = -ENOENT; + goto out; + } + + if (hist_data->attrs->pause) + data->paused = true; + if (data->ops->init) { ret = data->ops->init(data->ops, data); if (ret < 0) @@ -1011,7 +1035,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, * triggers registered a failure too. */ if (!ret) { - ret = -ENOENT; + if (!(attrs->pause || attrs->cont)) + ret = -ENOENT; goto out_free; } else if (ret < 0) goto out_free; -- cgit v1.2.3 From e86ae9baacfa9efe957df4fc037f079772636d76 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:47 -0600 Subject: tracing: Add hist trigger support for clearing a trace Allow users to append 'clear' to an existing trigger in order to have the hash table cleared. 
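As a concrete sketch (the event path and key field here are illustrative assumptions, not taken from this patch), an accumulated table could be reset in place between measurement runs with:

  # echo 'hist:keys=call_site:clear' >> events/kmem/kmalloc/trigger

The trigger itself stays registered and its paused/active state is left unchanged; only the accumulated table contents are dropped.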
This expands the hist trigger syntax from this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont \ [ if filter] >> event/trigger to this: # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont/clear \ [ if filter] >> event/trigger Link: http://lkml.kernel.org/r/ae15dd0d9b2f7af07a37c1ff682063e2dbcdf160.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- kernel/trace/trace_events_hist.c | 24 ++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d5c63dd410a..4097cd3763f7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3837,7 +3837,7 @@ static const char readme_msg[] = "\t [:values=]\n" "\t [:sort=]\n" "\t [:size=#entries]\n" - "\t [:pause][:continue]\n" + "\t [:pause][:continue][:clear]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3857,6 +3857,9 @@ static const char readme_msg[] = "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" "\t restart a paused hist trigger.\n\n" + "\t The 'clear' parameter will clear the contents of a running\n" + "\t hist trigger and leave its current paused/active state\n" + "\t unchanged.\n\n" #endif ; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e3fa9c89f125..83cb25e067ad 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -88,6 +88,7 @@ struct hist_trigger_attrs { char *sort_key_str; bool pause; bool cont; + bool clear; unsigned int map_bits; }; @@ -200,6 +201,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) else if ((strcmp(str, "cont") == 0) || (strcmp(str, "continue") == 0)) attrs->cont = true; + else if (strcmp(str, "clear") == 0) + attrs->clear = true; else if (strncmp(str, "size=", strlen("size=")) == 0) { int map_bits = parse_map_size(str); @@ -916,6 +919,21 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, return &event_hist_trigger_ops; } +static void hist_clear(struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + bool paused; + + paused = data->paused; + data->paused = true; + + synchronize_sched(); + + tracing_map_clear(hist_data->map); + + data->paused = paused; +} + static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -930,13 +948,15 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, test->paused = true; else if (hist_data->attrs->cont) test->paused = false; + else if (hist_data->attrs->clear) + hist_clear(test); else ret = -EEXIST; goto out; } } - if (hist_data->attrs->cont) { + if (hist_data->attrs->cont || hist_data->attrs->clear) { ret = -ENOENT; goto out; } @@ -1035,7 +1055,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, * triggers registered a failure too. 
*/ if (!ret) { - if (!(attrs->pause || attrs->cont)) + if (!(attrs->pause || attrs->cont || attrs->clear)) ret = -ENOENT; goto out_free; } else if (ret < 0) -- cgit v1.2.3 From 0c4a6b4666e8eb86dead3f09b40bb8ca4f614e4f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:48 -0600 Subject: tracing: Add hist trigger 'hex' modifier for displaying numeric fields Allow users to have numeric fields displayed as hex values in the output by appending '.hex' to field names: # echo hist:keys=aaa,bbb.hex:vals=ccc.hex ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/67bd431edda2af5798d7694818f7e8d71b6b3463.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++- kernel/trace/trace_events_hist.c | 62 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4097cd3763f7..2eb2eafbe7f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3852,7 +3852,10 @@ static const char readme_msg[] = "\t used to specify more or fewer than the default 2048 entries\n" "\t for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout.\n\n" + "\t table in its entirety to stdout. The default format used to\n" + "\t display a given field can be modified by appending any of the\n" + "\t following modifiers to the field name, as applicable:\n\n" + "\t .hex display a number as a hex value\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 
'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 83cb25e067ad..8e49eeef8867 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -80,6 +80,7 @@ enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, HIST_FIELD_FL_KEY = 2, HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, }; struct hist_trigger_attrs { @@ -303,11 +304,23 @@ static int create_val_field(struct hist_trigger_data *hist_data, { struct ftrace_event_field *field = NULL; unsigned long flags = 0; + char *field_name; int ret = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) return -EINVAL; - field = trace_find_event_field(file->event_call, field_str); + + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); if (!field) { ret = -EINVAL; goto out; @@ -372,6 +385,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct ftrace_event_field *field = NULL; unsigned long flags = 0; unsigned int key_size; + char *field_name; int ret = 0; if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) @@ -379,7 +393,17 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_KEY; - field = trace_find_event_field(file->event_call, field_str); + field_name = strsep(&field_str, "."); + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); if (!field) { ret = -EINVAL; goto out; @@ -713,7 +737,11 @@ hist_trigger_entry_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ", "); - if (key_field->flags & HIST_FIELD_FL_STRING) { + if (key_field->flags & HIST_FIELD_FL_HEX) { + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %llx", + key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); } else { @@ -729,9 +757,15 @@ hist_trigger_entry_print(struct seq_file *m, tracing_map_read_sum(elt, HITCOUNT_IDX)); for (i = 1; i < hist_data->n_vals; i++) { - seq_printf(m, " %s: %10llu", - hist_data->fields[i]->field->name, - tracing_map_read_sum(elt, i)); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } else { + seq_printf(m, " %s: %10llu", + hist_data->fields[i]->field->name, + tracing_map_read_sum(elt, i)); + } } seq_puts(m, "\n"); @@ -816,9 +850,25 @@ const struct file_operations event_hist_fops = { .release = single_release, }; +static const char *get_hist_field_flags(struct hist_field *hist_field) +{ + const char *flags_str = NULL; + + if (hist_field->flags & HIST_FIELD_FL_HEX) + flags_str = "hex"; + + return flags_str; +} + static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { seq_printf(m, "%s", hist_field->field->name); + if (hist_field->flags) { + const char *flags_str = get_hist_field_flags(hist_field); + + if (flags_str) + seq_printf(m, ".%s", flags_str); + } } static int event_hist_trigger_print(struct seq_file *m, -- cgit v1.2.3 From c6afad49d127f6d7c9957319f55173a2198b1ba8 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:49 -0600 Subject: tracing: Add hist trigger 'sym' and 'sym-offset' 
modifiers Allow users to have address fields displayed as symbols in the output by appending '.sym' or 'sym-offset' to field names: # echo hist:keys=aaa.sym,bbb.sym-offset ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/87d4935821491c0275513f0fbfb9bab8d3d3f079.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- kernel/trace/trace_events_hist.c | 29 +++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2eb2eafbe7f5..34459eb0c844 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3855,7 +3855,9 @@ static const char readme_msg[] = "\t table in its entirety to stdout. The default format used to\n" "\t display a given field can be modified by appending any of the\n" "\t following modifiers to the field name, as applicable:\n\n" - "\t .hex display a number as a hex value\n\n" + "\t .hex display a number as a hex value\n" + "\t .sym display an address as a symbol\n" + "\t .sym-offset display an address as a symbol and offset\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 8e49eeef8867..808cc06cdb61 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -77,10 +77,12 @@ DEFINE_HIST_FIELD_FN(u8); #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) enum hist_field_flags { - HIST_FIELD_FL_HITCOUNT = 1, - HIST_FIELD_FL_KEY = 2, - HIST_FIELD_FL_STRING = 4, - HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_HITCOUNT = 1, + HIST_FIELD_FL_KEY = 2, + HIST_FIELD_FL_STRING = 4, + HIST_FIELD_FL_HEX = 8, + HIST_FIELD_FL_SYM = 16, + HIST_FIELD_FL_SYM_OFFSET = 32, }; struct hist_trigger_attrs { @@ -397,6 +399,10 @@ static int create_key_field(struct hist_trigger_data *hist_data, if (field_str) { if (strcmp(field_str, "hex") == 0) flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; else { ret = -EINVAL; goto out; @@ -726,6 +732,7 @@ hist_trigger_entry_print(struct seq_file *m, struct tracing_map_elt *elt) { struct hist_field *key_field; + char str[KSYM_SYMBOL_LEN]; unsigned int i; u64 uval; @@ -741,6 +748,16 @@ hist_trigger_entry_print(struct seq_file *m, uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %llx", key_field->field->name, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYM) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol_no_offset(str, uval); + seq_printf(m, "%s: [%llx] %-45s", + key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { + uval = *(u64 *)(key + key_field->offset); + sprint_symbol(str, uval); + seq_printf(m, "%s: [%llx] %-55s", + key_field->field->name, uval, str); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -856,6 +873,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_HEX) flags_str = "hex"; + else if (hist_field->flags & HIST_FIELD_FL_SYM) + flags_str = "sym"; + else if 
(hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) + flags_str = "sym-offset"; return flags_str; } -- cgit v1.2.3 From 6b4827ad028a1ab2fc4dcf1f5e6e077018d1b770 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:50 -0600 Subject: tracing: Add hist trigger 'execname' modifier Allow users to have common_pid field values displayed as program names in the output by appending '.execname' to a common_pid field name: # echo hist:keys=common_pid.execname ... \ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/e172e81f10f5b8d1f08450e3763c850f39fbf698.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +- kernel/trace/trace_events_hist.c | 100 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 34459eb0c844..58a8bee50c6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3857,7 +3857,8 @@ static const char readme_msg[] = "\t following modifiers to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" - "\t .sym-offset display an address as a symbol and offset\n\n" + "\t .sym-offset display an address as a symbol and offset\n" + "\t .execname display a common_pid as a program name\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 808cc06cdb61..227e6c21b1eb 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -83,6 +83,7 @@ enum hist_field_flags { HIST_FIELD_FL_HEX = 8, HIST_FIELD_FL_SYM = 16, HIST_FIELD_FL_SYM_OFFSET = 32, + HIST_FIELD_FL_EXECNAME = 64, }; struct hist_trigger_attrs { @@ -232,6 +233,73 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) return ERR_PTR(ret); } +static inline void save_comm(char *comm, struct task_struct *task) +{ + if (!task->pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(task->pid < 0)) { + strcpy(comm, "<NULL>"); + return; + } + + memcpy(comm, task->comm, TASK_COMM_LEN); +} + +static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) +{ + kfree((char *)elt->private_data); +} + +static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) +{ + struct hist_trigger_data *hist_data = elt->map->private_data; + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + unsigned int size = TASK_COMM_LEN + 1; + + elt->private_data = kzalloc(size, GFP_KERNEL); + if (!elt->private_data) + return -ENOMEM; + break; + } + } + + return 0; +} + +static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, + struct tracing_map_elt *from) +{ + char *comm_from = from->private_data; + char *comm_to = to->private_data; + + if (comm_from) + memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); +} + +static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) +{ + char *comm = elt->private_data; + + if (comm) + save_comm(comm, current); +} + +static const struct tracing_map_ops hist_trigger_elt_comm_ops = { + .elt_alloc = hist_trigger_elt_comm_alloc, + .elt_copy = 
hist_trigger_elt_comm_copy, + .elt_free = hist_trigger_elt_comm_free, + .elt_init = hist_trigger_elt_comm_init, +}; + static void destroy_hist_field(struct hist_field *hist_field) { kfree(hist_field); @@ -403,6 +471,9 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_SYM; else if (strcmp(field_str, "sym-offset") == 0) flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= HIST_FIELD_FL_EXECNAME; else { ret = -EINVAL; goto out; @@ -624,11 +695,27 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) return 0; } +static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) +{ + struct hist_field *key_field; + unsigned int i; + + for_each_hist_key_field(i, hist_data) { + key_field = hist_data->fields[i]; + + if (key_field->flags & HIST_FIELD_FL_EXECNAME) + return true; + } + + return false; +} + static struct hist_trigger_data * create_hist_data(unsigned int map_bits, struct hist_trigger_attrs *attrs, struct trace_event_file *file) { + const struct tracing_map_ops *map_ops = NULL; struct hist_trigger_data *hist_data; int ret = 0; @@ -646,8 +733,11 @@ create_hist_data(unsigned int map_bits, if (ret) goto free; + if (need_tracing_map_ops(hist_data)) + map_ops = &hist_trigger_elt_comm_ops; + hist_data->map = tracing_map_create(map_bits, hist_data->key_size, - NULL, hist_data); + map_ops, hist_data); if (IS_ERR(hist_data->map)) { ret = PTR_ERR(hist_data->map); hist_data->map = NULL; @@ -758,6 +848,12 @@ hist_trigger_entry_print(struct seq_file *m, sprint_symbol(str, uval); seq_printf(m, "%s: [%llx] %-55s", key_field->field->name, uval, str); + } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { + char *comm = elt->private_data; + + uval = *(u64 *)(key + key_field->offset); + seq_printf(m, "%s: %-16s[%10llu]", + key_field->field->name, comm, uval); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -877,6 +973,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "sym"; else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) flags_str = "sym-offset"; + else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) + flags_str = "execname"; return flags_str; } -- cgit v1.2.3 From 316961988b5ec71bbf4b2ad447662770349aec13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:51 -0600 Subject: tracing: Add hist trigger 'syscall' modifier Allow users to have syscall id fields displayed as syscall names in the output by appending '.syscall' to field names: # echo hist:keys=aaa.syscall ... 
\ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/2bab1e59933d76a14b545bd2e02f80b8b08ac4d3.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 ++- kernel/trace/trace_events_hist.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 58a8bee50c6e..0dd23c5b1c34 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3858,7 +3858,8 @@ static const char readme_msg[] = "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" - "\t .execname display a common_pid as a program name\n\n" + "\t .execname display a common_pid as a program name\n" + "\t .syscall display a syscall id as a syscall name\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 227e6c21b1eb..f9e7ecb33847 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -84,6 +84,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYM = 16, HIST_FIELD_FL_SYM_OFFSET = 32, HIST_FIELD_FL_EXECNAME = 64, + HIST_FIELD_FL_SYSCALL = 128, }; struct hist_trigger_attrs { @@ -474,6 +475,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, else if ((strcmp(field_str, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; else { ret = -EINVAL; goto out; @@ -854,6 +857,16 @@ hist_trigger_entry_print(struct seq_file *m, uval = *(u64 *)(key + key_field->offset); seq_printf(m, "%s: %-16s[%10llu]", key_field->field->name, comm, uval); + } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { + const char *syscall_name; + + uval = *(u64 *)(key + key_field->offset); + syscall_name = get_syscall_name(uval); + if (!syscall_name) + syscall_name = "unknown_syscall"; + + seq_printf(m, "%s: %-30s[%3llu]", + key_field->field->name, syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -975,6 +988,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "sym-offset"; else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) flags_str = "execname"; + else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) + flags_str = "syscall"; return flags_str; } -- cgit v1.2.3 From 69a0200c2e25d61c50091549d00cfeb426c258f5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:52 -0600 Subject: tracing: Add hist trigger support for stacktraces as keys It's often useful to be able to use a stacktrace as a hash key, for keeping a count of the number of times a particular call path resulted in a trace event, for instance. Add a special key named 'stacktrace' which can be used as key in a 'keys=' param for this purpose: # echo hist:keys=stacktrace ... 
\ [ if filter] > event/trigger Link: http://lkml.kernel.org/r/87515e90b3785232a874a12156174635a348edb1.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 16 ++--- kernel/trace/trace_events_hist.c | 135 +++++++++++++++++++++++++++++---------- 2 files changed, 109 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0dd23c5b1c34..2238bfde799b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3843,14 +3843,14 @@ static const char readme_msg[] = "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" "\t correspond to fields in the event's format description. Keys\n" - "\t can be any field. Compound keys consisting of up to two\n" - "\t fields can be specified by the 'keys' keyword. Values must\n" - "\t correspond to numeric fields. Sort keys consisting of up to\n" - "\t two fields can be specified using the 'sort' keyword. The\n" - "\t sort direction can be modified by appending '.descending' or\n" - "\t '.ascending' to a sort field. The 'size' parameter can be\n" - "\t used to specify more or fewer than the default 2048 entries\n" - "\t for the hashtable size.\n\n" + "\t can be any field, or the special string 'stacktrace'.\n" + "\t Compound keys consisting of up to two fields can be specified\n" + "\t by the 'keys' keyword. Values must correspond to numeric\n" + "\t fields. Sort keys consisting of up to two fields can be\n" + "\t specified using the 'sort' keyword. The sort direction can\n" + "\t be modified by appending '.descending' or '.ascending' to a\n" + "\t sort field. The 'size' parameter can be used to specify more\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. 
The default format used to\n" "\t display a given field can be modified by appending any of the\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f9e7ecb33847..21cae42b54da 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -35,6 +35,11 @@ struct hist_field { unsigned int offset; }; +static u64 hist_field_none(struct hist_field *field, void *event) +{ + return 0; +} + static u64 hist_field_counter(struct hist_field *field, void *event) { return 1; @@ -73,8 +78,12 @@ DEFINE_HIST_FIELD_FN(u8); #define for_each_hist_key_field(i, hist_data) \ for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) +#define HIST_STACKTRACE_DEPTH 16 +#define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP 5 + #define HITCOUNT_IDX 0 -#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + sizeof(u64)) +#define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) enum hist_field_flags { HIST_FIELD_FL_HITCOUNT = 1, @@ -85,6 +94,7 @@ enum hist_field_flags { HIST_FIELD_FL_SYM_OFFSET = 32, HIST_FIELD_FL_EXECNAME = 64, HIST_FIELD_FL_SYSCALL = 128, + HIST_FIELD_FL_STACKTRACE = 256, }; struct hist_trigger_attrs { @@ -323,6 +333,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (flags & HIST_FIELD_FL_STACKTRACE) { + hist_field->fn = hist_field_none; + goto out; + } + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; hist_field->fn = hist_field_string; @@ -456,7 +471,6 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct ftrace_event_field *field = NULL; unsigned long flags = 0; unsigned int key_size; - char *field_name; int ret = 0; if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) @@ -464,33 +478,39 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_KEY; - field_name = strsep(&field_str, "."); - if (field_str) { - if (strcmp(field_str, "hex") == 0) - flags |= HIST_FIELD_FL_HEX; - else if (strcmp(field_str, "sym") == 0) - flags |= HIST_FIELD_FL_SYM; - else if (strcmp(field_str, "sym-offset") == 0) - flags |= HIST_FIELD_FL_SYM_OFFSET; - else if ((strcmp(field_str, "execname") == 0) && - (strcmp(field_name, "common_pid") == 0)) - flags |= HIST_FIELD_FL_EXECNAME; - else if (strcmp(field_str, "syscall") == 0) - flags |= HIST_FIELD_FL_SYSCALL; - else { + if (strcmp(field_str, "stacktrace") == 0) { + flags |= HIST_FIELD_FL_STACKTRACE; + key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; + } else { + char *field_name = strsep(&field_str, "."); + + if (field_str) { + if (strcmp(field_str, "hex") == 0) + flags |= HIST_FIELD_FL_HEX; + else if (strcmp(field_str, "sym") == 0) + flags |= HIST_FIELD_FL_SYM; + else if (strcmp(field_str, "sym-offset") == 0) + flags |= HIST_FIELD_FL_SYM_OFFSET; + else if ((strcmp(field_str, "execname") == 0) && + (strcmp(field_name, "common_pid") == 0)) + flags |= HIST_FIELD_FL_EXECNAME; + else if (strcmp(field_str, "syscall") == 0) + flags |= HIST_FIELD_FL_SYSCALL; + else { + ret = -EINVAL; + goto out; + } + } + + field = trace_find_event_field(file->event_call, field_name); + if (!field) { ret = -EINVAL; goto out; } - } - field = trace_find_event_field(file->event_call, field_name); - if (!field) { - ret = -EINVAL; - goto out; + key_size = field->size; } - key_size = field->size; - hist_data->fields[key_idx] = create_hist_field(field, flags); if (!hist_data->fields[key_idx]) { ret = -ENOMEM; @@ -679,7 +699,9 @@ static int 
create_tracing_map_fields(struct hist_trigger_data *hist_data) field = hist_field->field; - if (is_string_field(field)) + if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) + cmp_fn = tracing_map_cmp_none; + else if (is_string_field(field)) cmp_fn = tracing_map_cmp_string; else cmp_fn = tracing_map_cmp_num(field->size, @@ -786,7 +808,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + unsigned long entries[HIST_STACKTRACE_DEPTH]; char compound_key[HIST_KEY_SIZE_MAX]; + struct stack_trace stacktrace; struct hist_field *key_field; struct tracing_map_elt *elt; u64 field_contents; @@ -799,15 +823,27 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; - field_contents = key_field->fn(key_field, rec); - if (key_field->flags & HIST_FIELD_FL_STRING) - key = (void *)(unsigned long)field_contents; - else - key = (void *)&field_contents; + if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + stacktrace.max_entries = HIST_STACKTRACE_DEPTH; + stacktrace.entries = entries; + stacktrace.nr_entries = 0; + stacktrace.skip = HIST_STACKTRACE_SKIP; - if (hist_data->n_keys > 1) { - memcpy(compound_key + key_field->offset, key, - key_field->size); + memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); + save_stack_trace(&stacktrace); + + key = entries; + } else { + field_contents = key_field->fn(key_field, rec); + if (key_field->flags & HIST_FIELD_FL_STRING) + key = (void *)(unsigned long)field_contents; + else + key = (void *)&field_contents; + + if (hist_data->n_keys > 1) { + memcpy(compound_key + key_field->offset, key, + key_field->size); + } } } @@ -819,6 +855,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) hist_trigger_elt_update(hist_data, elt, rec); } +static void hist_trigger_stacktrace_print(struct seq_file *m, + unsigned long *stacktrace_entries, + unsigned int max_entries) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned int spaces = 8; + unsigned int i; + + for (i = 0; i < max_entries; i++) { + if (stacktrace_entries[i] == ULONG_MAX) + return; + + seq_printf(m, "%*c", 1 + spaces, ' '); + sprint_symbol(str, stacktrace_entries[i]); + seq_printf(m, "%s\n", str); + } +} + static void hist_trigger_entry_print(struct seq_file *m, struct hist_trigger_data *hist_data, void *key, @@ -826,6 +880,7 @@ hist_trigger_entry_print(struct seq_file *m, { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; + bool multiline = false; unsigned int i; u64 uval; @@ -867,6 +922,12 @@ hist_trigger_entry_print(struct seq_file *m, seq_printf(m, "%s: %-30s[%3llu]", key_field->field->name, syscall_name, uval); + } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { + seq_puts(m, "stacktrace:\n"); + hist_trigger_stacktrace_print(m, + key + key_field->offset, + HIST_STACKTRACE_DEPTH); + multiline = true; } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -877,7 +938,10 @@ hist_trigger_entry_print(struct seq_file *m, } } - seq_puts(m, " }"); + if (!multiline) + seq_puts(m, " "); + + seq_puts(m, "}"); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -1021,7 +1085,10 @@ static int event_hist_trigger_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ","); - hist_field_print(m, key_field); + if 
(key_field->flags & HIST_FIELD_FL_STACKTRACE) + seq_puts(m, "stacktrace"); + else + hist_field_print(m, key_field); } seq_puts(m, ":vals="); -- cgit v1.2.3 From 79e577cbce4c4c2bf0d64ec315adb04eda40736b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 3 Mar 2016 12:54:53 -0600 Subject: tracing: Support string type key properly The string in a trace event is usually recorded as a variable-length dynamic array, but the current hist code only supports fixed-length arrays, so it cannot handle most strings. This patch fixes that by checking the field's filter_type and using it to obtain the proper string pointer. With this, a histogram of exec() keyed on filenames can be obtained like below: # cd /sys/kernel/tracing/events/sched/sched_process_exec # echo 'hist:key=filename' > trigger # ps PID TTY TIME CMD 1 ? 00:00:00 init 29 ? 00:00:00 sh 38 ? 00:00:00 ps # ls enable filter format hist id trigger # cat hist # trigger info: hist:keys=filename:vals=hitcount:sort=hitcount:size=2048 [active] { filename: /usr/bin/ps } hitcount: 1 { filename: /usr/bin/ls } hitcount: 1 { filename: /usr/bin/cat } hitcount: 1 Totals: Hits: 3 Entries: 3 Dropped: 0 Link: http://lkml.kernel.org/r/610180d6df0cfdf11ee205452f3b241dea657233.1457029949.git.tom.zanussi@linux.intel.com Cc: Tom Zanussi Cc: Masami Hiramatsu Signed-off-by: Namhyung Kim Tested-by: Masami Hiramatsu [ Added (unsigned long) typecast to fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 21cae42b54da..418ff27cbbaf 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -52,12 +52,28 @@ static u64 hist_field_string(struct hist_field *hist_field, void *event) return (u64)(unsigned long)addr; } +static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) +{ + u32 str_item = *(u32 *)(event + hist_field->field->offset); + int str_loc = str_item & 0xffff; + char *addr = (char *)(event + str_loc); + + return (u64)(unsigned long)addr; +} + +static u64 hist_field_pstring(struct hist_field *hist_field, void *event) +{ + char **addr = (char **)(event + hist_field->field->offset); + + return (u64)(unsigned long)*addr; +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ type *addr = (type *)(event + hist_field->field->offset); \ \ - return (u64)*addr; \ + return (u64)(unsigned long)*addr; \ } DEFINE_HIST_FIELD_FN(s64); @@ -340,7 +356,13 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; - hist_field->fn = hist_field_string; + + if (field->filter_type == FILTER_STATIC_STRING) + hist_field->fn = hist_field_string; + else if (field->filter_type == FILTER_DYN_STRING) + hist_field->fn = hist_field_dynstring; + else + hist_field->fn = hist_field_pstring; } else { hist_field->fn = select_value_fn(field->size, field->is_signed); @@ -508,7 +530,10 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - key_size = field->size; + if (is_string_field(field)) /* should be last key field */ + key_size = HIST_KEY_SIZE_MAX - key_offset; + else + key_size = field->size; } hist_data->fields[key_idx] = create_hist_field(field, flags); @@ -841,8 +866,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = 
(void *)&field_contents; if (hist_data->n_keys > 1) { + /* ensure NULL-termination */ + size_t size = key_field->size - 1; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + + if (size > key_field->size - 1) + size = key_field->size - 1; + } + memcpy(compound_key + key_field->offset, key, - key_field->size); + size); } } } -- cgit v1.2.3 From 6a475cb17fe161dbde62eca7e4c2cf59edff182f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:54 -0600 Subject: tracing: Remove restriction on string position in hist trigger keys If we assume the maximum size for a string field, we don't have to worry about its position. Since we only allow two keys in a compound key and having more than one string key in a given compound key doesn't make much sense anyway, trading a bit of extra space instead of introducing an arbitrary restriction makes more sense. We also need to use the event field size for static strings when copying the contents, otherwise we get random garbage in the key. Also, cast string return values to avoid warnings on 32-bit compiles. Finally, rearrange the code without changing any functionality by moving the compound key updating code into a separate function. Link: http://lkml.kernel.org/r/8976e1ab04b66bc2700ad1ed0768a2de85ac1983.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 63 ++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 418ff27cbbaf..4f4041d76926 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -530,8 +530,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (is_string_field(field)) /* should be last key field */ - key_size = HIST_KEY_SIZE_MAX - key_offset; + if (is_string_field(field)) + key_size = MAX_FILTER_STR_VAL; else key_size = field->size; } @@ -830,9 +830,34 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, } } +static inline void add_to_key(char *compound_key, void *key, + struct hist_field *key_field, void *rec) +{ + size_t size = key_field->size; + + if (key_field->flags & HIST_FIELD_FL_STRING) { + struct ftrace_event_field *field; + + field = key_field->field; + if (field->filter_type == FILTER_DYN_STRING) + size = *(u32 *)(rec + field->offset) >> 16; + else if (field->filter_type == FILTER_PTR_STRING) + size = strlen(key); + else if (field->filter_type == FILTER_STATIC_STRING) + size = field->size; + + /* ensure NULL-termination */ + if (size > key_field->size - 1) + size = key_field->size - 1; + } + + memcpy(compound_key + key_field->offset, key, size); +} + static void event_hist_trigger(struct event_trigger_data *data, void *rec) { struct hist_trigger_data *hist_data = data->private_data; + bool use_compound_key = (hist_data->n_keys > 1); unsigned long entries[HIST_STACKTRACE_DEPTH]; char compound_key[HIST_KEY_SIZE_MAX]; struct stack_trace stacktrace; @@ -842,8 +867,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) void *key = NULL; unsigned int i; - if (hist_data->n_keys > 1) - 
memset(compound_key, 0, hist_data->key_size); + memset(compound_key, 0, hist_data->key_size); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -860,35 +884,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) key = entries; } else { field_contents = key_field->fn(key_field, rec); - if (key_field->flags & HIST_FIELD_FL_STRING) + if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; - else + use_compound_key = true; + } else key = (void *)&field_contents; - - if (hist_data->n_keys > 1) { - /* ensure NULL-termination */ - size_t size = key_field->size - 1; - - if (key_field->flags & HIST_FIELD_FL_STRING) { - struct ftrace_event_field *field; - - field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) - size = *(u32 *)(rec + field->offset) >> 16; - else if (field->filter_type == FILTER_PTR_STRING) - size = strlen(key); - - if (size > key_field->size - 1) - size = key_field->size - 1; - } - - memcpy(compound_key + key_field->offset, key, - size); - } } + + if (use_compound_key) + add_to_key(compound_key, key, key_field, rec); } - if (hist_data->n_keys > 1) + if (use_compound_key) key = compound_key; elt = tracing_map_insert(hist_data->map, key); -- cgit v1.2.3 From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:55 -0600 Subject: tracing: Add enable_hist/disable_hist triggers Similar to enable_event/disable_event triggers, these triggers enable and disable the aggregation of events into maps rather than enabling and disabling their writing into the trace buffer. They can be used to automatically start and stop hist triggers based on a matching filter condition. If there's a paused hist trigger on system:event, the following would start it when the filter condition was hit: # echo enable_hist:system:event [ if filter] > event/trigger And the following would disable a running system:event hist trigger: # echo disable_hist:system:event [ if filter] > event/trigger See Documentation/trace/events.txt for real examples. 
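As a concrete sketch of how the two pieces are meant to fit together (the events, fields, and filter below are illustrative examples chosen for this description, not anything defined by this patch), a hist trigger could be created in the paused state and then be started automatically by another event:

  # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
        /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
  # echo 'enable_hist:net:netif_receive_skb if filename=="/usr/bin/wget"' > \
        /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger

With a setup like this, the netif_receive_skb histogram only begins aggregating once a matching exec occurs; a corresponding disable_hist trigger attached to some other event could pause it again.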
Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 8 +++ kernel/trace/trace.h | 32 ++++++++++ kernel/trace/trace_events_hist.c | 115 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_trigger.c | 71 ++++++++++++---------- 5 files changed, 196 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 404603720650..5f89a5b0c7e6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -408,6 +408,7 @@ enum event_trigger_type { ETT_STACKTRACE = (1 << 2), ETT_EVENT_ENABLE = (1 << 3), ETT_EVENT_HIST = (1 << 4), + ETT_HIST_ENABLE = (1 << 5), }; extern int filter_match_preds(struct event_filter *filter, void *rec); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2238bfde799b..8430145bea12 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3807,6 +3807,10 @@ static const char readme_msg[] = "\t trigger: traceon, traceoff\n" "\t enable_event::\n" "\t disable_event::\n" +#ifdef CONFIG_HIST_TRIGGERS + "\t enable_hist::\n" + "\t disable_hist::\n" +#endif #ifdef CONFIG_STACKTRACE "\t\t stacktrace\n" #endif @@ -3867,6 +3871,10 @@ static const char readme_msg[] = "\t The 'clear' parameter will clear the contents of a running\n" "\t hist trigger and leave its current paused/active state\n" "\t unchanged.\n\n" + "\t The enable_hist and disable_hist triggers can be used to\n" + "\t have one event conditionally start and stop another event's\n" + "\t already-attached hist trigger. The syntax is analagous to\n" + "\t the enable_event and disable_event triggers.\n" #endif ; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 505f8a45f426..cab1f4bfe85b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); +extern int register_trigger_hist_enable_disable_cmds(void); #else static inline int register_trigger_hist_cmd(void) { return 0; } +static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } #endif extern int register_trigger_cmds(void); @@ -1185,6 +1187,34 @@ struct event_trigger_data { struct list_head list; }; +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" +#define ENABLE_HIST_STR "enable_hist" +#define DISABLE_HIST_STR "disable_hist" + +struct enable_trigger_data { + struct trace_event_file *file; + bool enable; + bool hist; +}; + +extern int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); +extern int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file); +extern void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); extern int 
event_trigger_init(struct event_trigger_ops *ops, struct event_trigger_data *data); @@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); extern int register_event_command(struct event_command *cmd); +extern int unregister_event_command(struct event_command *cmd); +extern int register_trigger_hist_enable_disable_cmds(void); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4f4041d76926..5d4f02792440 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void) return ret; } + +static void +hist_enable_trigger(struct event_trigger_data *data, void *rec) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (enable_data->enable) + test->paused = false; + else + test->paused = true; + break; + } + } +} + +static void +hist_enable_count_trigger(struct event_trigger_data *data, void *rec) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + hist_enable_trigger(data, rec); +} + +static struct event_trigger_ops hist_enable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_enable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_trigger_ops = { + .func = hist_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops hist_disable_count_trigger_ops = { + .func = hist_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops * +hist_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); + + if (enable) + ops = param ? &hist_enable_count_trigger_ops : + &hist_enable_trigger_ops; + else + ops = param ? 
&hist_disable_count_trigger_ops : + &hist_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_hist_enable_cmd = { + .name = ENABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_hist_disable_cmd = { + .name = DISABLE_HIST_STR, + .trigger_type = ETT_HIST_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = hist_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_hist_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_hist_enable_cmd); + unregister_event_command(&trigger_hist_disable_cmd); +} + +__init int register_trigger_hist_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_hist_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_hist_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_hist_enable_disable_cmds(); + + return ret; +} diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d29092afe005..d133f2094566 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd) * Currently we only unregister event commands from __init, so mark * this __init too. */ -static __init int unregister_event_command(struct event_command *cmd) +__init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; int ret = -ENODEV; @@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) unregister_event_command(&trigger_traceoff_cmd); } -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct enable_trigger_data { - struct trace_event_file *file; - bool enable; -}; - static void event_enable_trigger(struct event_trigger_data *data, void *rec) { @@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) event_enable_trigger(data, rec); } -static int -event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_enable_trigger_print(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; seq_printf(m, "%s:%s:%s", - enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->hist ? + (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) : + (enable_data->enable ? 
ENABLE_EVENT_STR : DISABLE_EVENT_STR), enable_data->file->event_call->class->system, trace_event_name(enable_data->file->event_call)); @@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, return 0; } -static void -event_enable_trigger_free(struct event_trigger_ops *ops, - struct event_trigger_data *data) +void event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) { struct enable_trigger_data *enable_data = data->private_data; @@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -static int -event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_func(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, struct trace_array *tr = file->tr; const char *system; const char *event; + bool hist = false; char *trigger; char *number; bool enable; @@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops, if (!event_enable_file) goto out; - enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#ifdef CONFIG_HIST_TRIGGERS + hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) || + (strcmp(cmd, DISABLE_HIST_STR) == 0)); + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; +#endif trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); ret = -ENOMEM; @@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->list); RCU_INIT_POINTER(trigger_data->filter, NULL); + enable_data->hist = hist; enable_data->enable = enable; enable_data->file = event_enable_file; trigger_data->private_data = enable_data; @@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops, goto out; } -static int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) { struct enable_trigger_data *enable_data = data->private_data; struct enable_trigger_data *test_enable_data; @@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob, list_for_each_entry_rcu(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && + (test->cmd_ops->trigger_type == + data->cmd_ops->trigger_type) && (test_enable_data->file == enable_data->file)) { ret = -EEXIST; goto out; @@ -1343,10 +1345,10 @@ out: return ret; } -static void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct enable_trigger_data *test_enable_data = test->private_data; struct enable_trigger_data *enable_data; @@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob, list_for_each_entry_rcu(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && + (data->cmd_ops->trigger_type == + 
test->cmd_ops->trigger_type) && (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); @@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param) struct event_trigger_ops *ops; bool enable; +#ifdef CONFIG_HIST_TRIGGERS + enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) || + (strcmp(cmd, ENABLE_HIST_STR) == 0)); +#else enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; - +#endif if (enable) ops = param ? &event_enable_count_trigger_ops : &event_enable_trigger_ops; @@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void) register_trigger_snapshot_cmd(); register_trigger_stacktrace_cmd(); register_trigger_enable_disable_cmds(); + register_trigger_hist_enable_disable_cmds(); register_trigger_hist_cmd(); return 0; -- cgit v1.2.3 From 52a7f16dedff8f23d03df3ea556dec95b92a5801 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:57 -0600 Subject: tracing: Add support for multiple hist triggers per event Allow users to define any number of hist triggers per trace event. Any number of hist triggers may be added for a given event, which may differ by key, value, or filter. Reading the event's 'hist' file will display the output of all the hist triggers defined on an event concatenated in the order they were defined. Link: http://lkml.kernel.org/r/48a0c8dd34c344571de880fb35e211c6d9a28961.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- Documentation/trace/events.txt | 154 +++++++++++++++++++++++++++++++++-- kernel/trace/trace.c | 8 +- kernel/trace/trace_events_hist.c | 172 +++++++++++++++++++++++++++++++-------- 3 files changed, 293 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index db223e85bd2f..cc0216760b78 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -550,12 +550,14 @@ The following commands are supported: 'hist' triggers add a 'hist' file to each event's subdirectory. Reading the 'hist' file for the event will dump the hash table in - its entirety to stdout. Each printed hash table entry is a simple - list of the keys and values comprising the entry; keys are printed - first and are delineated by curly braces, and are followed by the - set of value fields for the entry. By default, numeric fields are - displayed as base-10 integers. This can be modified by appending - any of the following modifiers to the field name: + its entirety to stdout. If there are multiple hist triggers + attached to an event, there will be a table for each trigger in the + output. Each printed hash table entry is a simple list of the keys + and values comprising the entry; keys are printed first and are + delineated by curly braces, and are followed by the set of value + fields for the entry. By default, numeric fields are displayed as + base-10 integers. This can be modified by appending any of the + following modifiers to the field name: .hex display a number as a hex value .sym display an address as a symbol @@ -1667,3 +1669,143 @@ The following commands are supported: . . . + + The following example demonstrates how multiple hist triggers can be + attached to a given event. This capability can be useful for + creating a set of different summaries derived from the same set of + events, or for comparing the effects of different filters, among + other things. 
+ + # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=skbaddr.hex:vals=len' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:keys=len:vals=common_preempt_count' >> \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + + The above set of commands create four triggers differing only in + their filters, along with a completely different though fairly + nonsensical trigger. Note that in order to append multiple hist + triggers to the same file, you should use the '>>' operator to + append them ('>' will also add the new hist trigger, but will remove + any existing hist triggers beforehand). + + Displaying the contents of the 'hist' file for the event shows the + contents of all five histograms: + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist + + # event histogram + # + # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active] + # + + { len: 176 } hitcount: 1 common_preempt_count: 0 + { len: 223 } hitcount: 1 common_preempt_count: 0 + { len: 4854 } hitcount: 1 common_preempt_count: 0 + { len: 395 } hitcount: 1 common_preempt_count: 0 + { len: 177 } hitcount: 1 common_preempt_count: 0 + { len: 446 } hitcount: 1 common_preempt_count: 0 + { len: 1601 } hitcount: 1 common_preempt_count: 0 + . + . + . + { len: 1280 } hitcount: 66 common_preempt_count: 0 + { len: 116 } hitcount: 81 common_preempt_count: 40 + { len: 708 } hitcount: 112 common_preempt_count: 0 + { len: 46 } hitcount: 221 common_preempt_count: 0 + { len: 1264 } hitcount: 458 common_preempt_count: 0 + + Totals: + Hits: 1428 + Entries: 147 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130 + { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280 + { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280 + { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115 + { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46 + { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118 + { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60 + { skbaddr: ffff880100065900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116 + { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280 + { skbaddr: ffff880100064700 } hitcount: 1 len: 365 + { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60 + . + . + . 
+ { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677 + { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052 + { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589 + { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326 + { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678 + { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678 + { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589 + { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307 + { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032 + + Totals: + Hits: 1451 + Entries: 318 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active] + # + + + Totals: + Hits: 0 + Entries: 0 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active] + # + + { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492 + { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212 + { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212 + { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854 + { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636 + { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924 + { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356 + { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420 + { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996 + + Totals: + Hits: 14 + Entries: 12 + Dropped: 0 + + + # event histogram + # + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active] + # + + + Totals: + Hits: 0 + Entries: 0 + Dropped: 0 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8430145bea12..e89b2ed76d2d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3856,9 +3856,11 @@ static const char readme_msg[] = "\t sort field. The 'size' parameter can be used to specify more\n" "\t or fewer than the default 2048 entries for the hashtable size.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" - "\t table in its entirety to stdout. The default format used to\n" - "\t display a given field can be modified by appending any of the\n" - "\t following modifiers to the field name, as applicable:\n\n" + "\t table in its entirety to stdout. If there are multiple hist\n" + "\t triggers attached to an event, there will be a table for each\n" + "\t trigger in the output. 
The default format used to display a\n" + "\t given field can be modified by appending any of the following\n" + "\t modifiers to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5d4f02792440..4b02f8ab4dd3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1032,33 +1032,18 @@ static int print_entries(struct seq_file *m, return n_entries; } -static int hist_show(struct seq_file *m, void *v) +static void hist_trigger_show(struct seq_file *m, + struct event_trigger_data *data, int n) { - struct event_trigger_data *test, *data = NULL; - struct trace_event_file *event_file; struct hist_trigger_data *hist_data; int n_entries, ret = 0; - mutex_lock(&event_mutex); - - event_file = event_file_data(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } - - list_for_each_entry_rcu(test, &event_file->triggers, list) { - if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - data = test; - break; - } - } - if (!data) - goto out_unlock; + if (n > 0) + seq_puts(m, "\n\n"); seq_puts(m, "# event histogram\n#\n# trigger info: "); data->ops->print(m, data->ops, data); - seq_puts(m, "\n"); + seq_puts(m, "#\n\n"); hist_data = data->private_data; n_entries = print_entries(m, hist_data); @@ -1070,6 +1055,27 @@ static int hist_show(struct seq_file *m, void *v) seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); +} + +static int hist_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct trace_event_file *event_file; + int n = 0, ret = 0; + + mutex_lock(&event_mutex); + + event_file = event_file_data(m->private); + if (unlikely(!event_file)) { + ret = -ENODEV; + goto out_unlock; + } + + list_for_each_entry_rcu(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) + hist_trigger_show(m, data, n++); + } + out_unlock: mutex_unlock(&event_mutex); @@ -1233,6 +1239,54 @@ static void hist_clear(struct event_trigger_data *data) data->paused = paused; } +static bool hist_trigger_match(struct event_trigger_data *data, + struct event_trigger_data *data_test) +{ + struct tracing_map_sort_key *sort_key, *sort_key_test; + struct hist_trigger_data *hist_data, *hist_data_test; + struct hist_field *key_field, *key_field_test; + unsigned int i; + + hist_data = data->private_data; + hist_data_test = data_test->private_data; + + if (hist_data->n_vals != hist_data_test->n_vals || + hist_data->n_fields != hist_data_test->n_fields || + hist_data->n_sort_keys != hist_data_test->n_sort_keys) + return false; + + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + + for_each_hist_field(i, hist_data) { + key_field = hist_data->fields[i]; + key_field_test = hist_data_test->fields[i]; + + if (key_field->flags != key_field_test->flags) + return false; + if (key_field->field != key_field_test->field) + return false; + if (key_field->offset != key_field_test->offset) + return false; + } + + for (i = 0; i < hist_data->n_sort_keys; i++) { + sort_key = &hist_data->sort_keys[i]; + sort_key_test = &hist_data_test->sort_keys[i]; + + if (sort_key->field_idx != sort_key_test->field_idx || + sort_key->descending != 
sort_key_test->descending) + return false; + } + + if (data->filter_str && + (strcmp(data->filter_str, data_test->filter_str) != 0)) + return false; + + return true; +} + static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) @@ -1243,6 +1297,8 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test)) + continue; if (hist_data->attrs->pause) test->paused = true; else if (hist_data->attrs->cont) @@ -1282,6 +1338,44 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, return ret; } +static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + struct event_trigger_data *test; + bool unregistered = false; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + if (!hist_trigger_match(data, test)) + continue; + unregistered = true; + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + break; + } + } + + if (unregistered && test->ops->free) + test->ops->free(test->ops, test); +} + +static void hist_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { + list_del_rcu(&test->list); + trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + static int event_hist_trigger_func(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param) @@ -1331,22 +1425,19 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger_data->private_data = hist_data; + /* if param is non-empty, it's supposed to be a filter */ + if (param && cmd_ops->set_filter) { + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + } + if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); ret = 0; goto out_free; } - if (!param) /* if param is non-empty, it's supposed to be a filter */ - goto out_reg; - - if (!cmd_ops->set_filter) - goto out_reg; - - ret = cmd_ops->set_filter(param, trigger_data, file); - if (ret < 0) - goto out_free; - out_reg: ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); /* * The above returns on success the # of triggers registered, @@ -1379,7 +1470,8 @@ static struct event_command trigger_hist_cmd = { .flags = EVENT_CMD_FL_NEEDS_REC, .func = event_hist_trigger_func, .reg = hist_register_trigger, - .unreg = unregister_trigger, + .unreg = hist_unregister_trigger, + .unreg_all = hist_unreg_all, .get_trigger_ops = event_hist_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1406,7 +1498,6 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) test->paused = false; else test->paused = true; - break; } } } @@ -1469,12 +1560,28 @@ hist_enable_get_trigger_ops(char *cmd, char *param) return ops; } +static void hist_enable_unreg_all(struct trace_event_file *file) +{ + struct event_trigger_data *test; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) { + list_del_rcu(&test->list); + update_cond_flag(file); + 
trace_event_trigger_enable_disable(file, 0); + if (test->ops->free) + test->ops->free(test->ops, test); + } + } +} + static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, .func = event_enable_trigger_func, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; @@ -1485,6 +1592,7 @@ static struct event_command trigger_hist_disable_cmd = { .func = event_enable_trigger_func, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, + .unreg_all = hist_enable_unreg_all, .get_trigger_ops = hist_enable_get_trigger_ops, .set_filter = set_trigger_filter, }; -- cgit v1.2.3 From db1388b4ffa9e31e9ff0abacc3bdb121bec8c688 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:58 -0600 Subject: tracing: Add support for named triggers Named triggers are sets of triggers that share a common set of trigger data. An example of functionality that could benefit from this type of capability would be a set of inlined probes that would each contribute event counts, for example, to a shared counter data structure. The first named trigger registered with a given name owns the common trigger data that the others subsequently registered with the same name will reference. The functions defined here allow users to add, delete, and find named triggers. It also adds functions to pause and unpause named triggers; since named triggers act upon common data, they should also be paused and unpaused as a group. Link: http://lkml.kernel.org/r/c09ff648360f65b10a3e321eddafe18060b4a04f.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 13 ++++ kernel/trace/trace_events_trigger.c | 143 ++++++++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cab1f4bfe85b..727a3d28bce5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1184,7 +1184,11 @@ struct event_trigger_data { char *filter_str; void *private_data; bool paused; + bool paused_tmp; struct list_head list; + char *name; + struct list_head named_list; + struct event_trigger_data *named_data; }; /* Avoid typos */ @@ -1227,6 +1231,15 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); +extern struct event_trigger_data *find_named_trigger(const char *name); +extern bool is_named_trigger(struct event_trigger_data *test); +extern int save_named_trigger(const char *name, + struct event_trigger_data *data); +extern void del_named_trigger(struct event_trigger_data *data); +extern void pause_named_trigger(struct event_trigger_data *data); +extern void unpause_named_trigger(struct event_trigger_data *data); +extern void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d133f2094566..a975571cde24 100644 --- a/kernel/trace/trace_events_trigger.c +++ 
b/kernel/trace/trace_events_trigger.c @@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); @@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str, return ret; } +static LIST_HEAD(named_triggers); + +/** + * find_named_trigger - Find the common named trigger associated with @name + * @name: The name of the set of named triggers to find the common data for + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * returns the common trigger data associated with that first + * registered instance. + * + * Return: the common trigger data for the given named trigger on + * success, NULL otherwise. + */ +struct event_trigger_data *find_named_trigger(const char *name) +{ + struct event_trigger_data *data; + + if (!name) + return NULL; + + list_for_each_entry(data, &named_triggers, named_list) { + if (data->named_data) + continue; + if (strcmp(data->name, name) == 0) + return data; + } + + return NULL; +} + +/** + * is_named_trigger - determine if a given trigger is a named trigger + * @test: The trigger data to test + * + * Return: true if 'test' is a named trigger, false otherwise. + */ +bool is_named_trigger(struct event_trigger_data *test) +{ + struct event_trigger_data *data; + + list_for_each_entry(data, &named_triggers, named_list) { + if (test == data) + return true; + } + + return false; +} + +/** + * save_named_trigger - save the trigger in the named trigger list + * @name: The name of the named trigger set + * @data: The trigger data to save + * + * Return: 0 if successful, negative error otherwise. + */ +int save_named_trigger(const char *name, struct event_trigger_data *data) +{ + data->name = kstrdup(name, GFP_KERNEL); + if (!data->name) + return -ENOMEM; + + list_add(&data->named_list, &named_triggers); + + return 0; +} + +/** + * del_named_trigger - delete a trigger from the named trigger list + * @data: The trigger data to delete + */ +void del_named_trigger(struct event_trigger_data *data) +{ + kfree(data->name); + data->name = NULL; + + list_del(&data->named_list); +} + +static void __pause_named_trigger(struct event_trigger_data *data, bool pause) +{ + struct event_trigger_data *test; + + list_for_each_entry(test, &named_triggers, named_list) { + if (strcmp(test->name, data->name) == 0) { + if (pause) { + test->paused_tmp = test->paused; + test->paused = true; + } else { + test->paused = test->paused_tmp; + } + } + } +} + +/** + * pause_named_trigger - Pause all named triggers with the same name + * @data: The trigger data of a named trigger to pause + * + * Pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * pausing only one is meaningless, so pausing one named trigger needs + * to pause all triggers with the same name. 
+ */ +void pause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, true); +} + +/** + * unpause_named_trigger - Un-pause all named triggers with the same name + * @data: The trigger data of a named trigger to unpause + * + * Un-pauses a named trigger along with all other triggers having the + * same name. Because named triggers share a common set of data, + * unpausing only one is meaningless, so unpausing one named trigger + * needs to unpause all triggers with the same name. + */ +void unpause_named_trigger(struct event_trigger_data *data) +{ + __pause_named_trigger(data, false); +} + +/** + * set_named_trigger_data - Associate common named trigger data + * @data: The trigger data of a named trigger to unpause + * + * Named triggers are sets of triggers that share a common set of + * trigger data. The first named trigger registered with a given name + * owns the common trigger data that the others subsequently + * registered with the same name will reference. This function + * associates the common trigger data from the first trigger with the + * given trigger. + */ +void set_named_trigger_data(struct event_trigger_data *data, + struct event_trigger_data *named_data) +{ + data->named_data = named_data; +} + static void traceon_trigger(struct event_trigger_data *data, void *rec) { -- cgit v1.2.3 From 5463bfda327b1f7310556ef3136533e27c774f13 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 3 Mar 2016 12:54:59 -0600 Subject: tracing: Add support for named hist triggers Allow users to define 'named' hist triggers. All triggers created with the same 'name=xxx' option will update the same shared histogram data. This expands the hist trigger syntax from this: # echo hist:keys=xxx ... [ if filter] > event/trigger to this: # echo hist:name=xxx:keys=xxx ... [ if filter] > event/trigger Named histograms must use a 'compatible' set of keys and values, which means each event added to a set of named triggers must have the same names and types. Reading the 'hist' file of any of the participating events will produce the same output as any other participating event, which is to be expected since they share the same data. Link: http://lkml.kernel.org/r/1dbc84ee3322a75daaf5b3ef1d0cc0a2fb682fc7.1457029949.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Tested-by: Masami Hiramatsu Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt --- Documentation/trace/events.txt | 274 +++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.c | 14 +- kernel/trace/trace_events_hist.c | 148 ++++++++++++++++++--- 3 files changed, 407 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index cc0216760b78..08d74d75150d 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -524,7 +524,7 @@ The following commands are supported: hist:keys=[:values=] [:sort=][:size=#entries][:pause][:continue] - [:clear] [if ] + [:clear][:name=histname1] [if ] When a matching event is hit, an entry is added to a hash table using the key(s) and value(s) named. Keys and values correspond to @@ -546,18 +546,28 @@ The following commands are supported: specified by the 'sort' keyword. If more than one field is specified, the result will be a 'sort within a sort': the first key is taken to be the primary sort key and the second the secondary - key. + key. 
If a hist trigger is given a name using the 'name' parameter, + its histogram data will be shared with other triggers of the same + name, and trigger hits will update this common data. Only triggers + with 'compatible' fields can be combined in this way; triggers are + 'compatible' if the fields named in the trigger share the same + number and type of fields and those fields also have the same names. + Note that any two events always share the compatible 'hitcount' and + 'stacktrace' fields and can therefore be combined using those + fields, however pointless that may be. 'hist' triggers add a 'hist' file to each event's subdirectory. Reading the 'hist' file for the event will dump the hash table in its entirety to stdout. If there are multiple hist triggers attached to an event, there will be a table for each trigger in the - output. Each printed hash table entry is a simple list of the keys - and values comprising the entry; keys are printed first and are - delineated by curly braces, and are followed by the set of value - fields for the entry. By default, numeric fields are displayed as - base-10 integers. This can be modified by appending any of the - following modifiers to the field name: + output. The table displayed for a named trigger will be the same as + any other instance having the same name. Each printed hash table + entry is a simple list of the keys and values comprising the entry; + keys are printed first and are delineated by curly braces, and are + followed by the set of value fields for the entry. By default, + numeric fields are displayed as base-10 integers. This can be + modified by appending any of the following modifiers to the field + name: .hex display a number as a hex value .sym display an address as a symbol @@ -1809,3 +1819,251 @@ The following commands are supported: Hits: 0 Entries: 0 Dropped: 0 + + Named triggers can be used to have triggers share a common set of + histogram data. This capability is mostly useful for combining the + output of events generated by tracepoints contained inside inline + functions, but names can be used in a hist trigger on any event. 
+ For example, these two triggers when hit will update the same 'len' + field in the shared 'foo' histogram data: + + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + You can see that they're updating common histogram data by reading + each event's hist files at the same time: + + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist; + cat /sys/kernel/debug/tracing/events/net/netif_rx/hist + + # event histogram + # + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 + + Totals: + Hits: 81 + Entries: 42 + Dropped: 0 + # event histogram + # + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] + # + + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcc500 
} hitcount: 1 len: 260 + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 + + Totals: + Hits: 81 + Entries: 42 + Dropped: 0 + + And here's an example that shows how to combine histogram data from + any two events even if they don't share any 'compatible' fields + other than 'hitcount' and 'stacktrace'. 
These commands create a + couple of triggers named 'bar' using those fields: + + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ + /sys/kernel/debug/tracing/events/net/netif_rx/trigger + + And displaying the output of either shows some interesting if + somewhat confusing output: + + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist + + # event histogram + # + # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] + # + + { stacktrace: + _do_fork+0x18e/0x330 + kernel_thread+0x29/0x30 + kthreadd+0x154/0x1b0 + ret_from_fork+0x3f/0x70 + } hitcount: 1 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx_ni+0x20/0x70 + dev_loopback_xmit+0xaa/0xd0 + ip_mc_output+0x126/0x240 + ip_local_out_sk+0x31/0x40 + igmp_send_report+0x1e9/0x230 + igmp_timer_expire+0xe9/0x120 + call_timer_fn+0x39/0xf0 + run_timer_softirq+0x1e1/0x290 + __do_softirq+0xfd/0x290 + irq_exit+0x98/0xb0 + smp_apic_timer_interrupt+0x4a/0x60 + apic_timer_interrupt+0x6d/0x80 + cpuidle_enter+0x17/0x20 + call_cpuidle+0x3b/0x60 + cpu_startup_entry+0x22d/0x310 + } hitcount: 1 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx_ni+0x20/0x70 + dev_loopback_xmit+0xaa/0xd0 + ip_mc_output+0x17f/0x240 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x13e/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + SYSC_sendto+0xef/0x170 + SyS_sendto+0xe/0x10 + entry_SYSCALL_64_fastpath+0x12/0x6a + } hitcount: 2 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + ___sys_sendmsg+0x14e/0x270 + } hitcount: 76 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + ___sys_sendmsg+0x269/0x270 + } hitcount: 77 + { stacktrace: + netif_rx_internal+0xb2/0xd0 + netif_rx+0x1c/0x60 + loopback_xmit+0x6c/0xb0 + dev_hard_start_xmit+0x219/0x3a0 + __dev_queue_xmit+0x415/0x4f0 + dev_queue_xmit_sk+0x13/0x20 + ip_finish_output2+0x237/0x340 + ip_finish_output+0x113/0x1d0 + ip_output+0x66/0xc0 + ip_local_out_sk+0x31/0x40 + ip_send_skb+0x1a/0x50 + udp_send_skb+0x16d/0x270 + udp_sendmsg+0x2bf/0x980 + inet_sendmsg+0x67/0xa0 + sock_sendmsg+0x38/0x50 + SYSC_sendto+0xef/0x170 + } hitcount: 88 + { stacktrace: + _do_fork+0x18e/0x330 + SyS_clone+0x19/0x20 + entry_SYSCALL_64_fastpath+0x12/0x6a + } hitcount: 244 + + Totals: + Hits: 489 + Entries: 7 + Dropped: 0 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e89b2ed76d2d..4e342d354c12 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3842,6 +3842,7 @@ static const char readme_msg[] = "\t [:sort=]\n" "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" + "\t 
[:name=histname1]\n" "\t [if ]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -3854,13 +3855,18 @@ static const char readme_msg[] = "\t specified using the 'sort' keyword. The sort direction can\n" "\t be modified by appending '.descending' or '.ascending' to a\n" "\t sort field. The 'size' parameter can be used to specify more\n" - "\t or fewer than the default 2048 entries for the hashtable size.\n\n" + "\t or fewer than the default 2048 entries for the hashtable size.\n" + "\t If a hist trigger is given a name using the 'name' parameter,\n" + "\t its histogram data will be shared with other triggers of the\n" + "\t same name, and trigger hits will update this common data.\n\n" "\t Reading the 'hist' file for the event will dump the hash\n" "\t table in its entirety to stdout. If there are multiple hist\n" "\t triggers attached to an event, there will be a table for each\n" - "\t trigger in the output. The default format used to display a\n" - "\t given field can be modified by appending any of the following\n" - "\t modifiers to the field name, as applicable:\n\n" + "\t trigger in the output. The table displayed for a named\n" + "\t trigger will be the same as any other instance having the\n" + "\t same name. The default format used to display a given field\n" + "\t can be modified by appending any of the following modifiers\n" + "\t to the field name, as applicable:\n\n" "\t .hex display a number as a hex value\n" "\t .sym display an address as a symbol\n" "\t .sym-offset display an address as a symbol and offset\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 4b02f8ab4dd3..bdea5603cbde 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -117,6 +117,7 @@ struct hist_trigger_attrs { char *keys_str; char *vals_str; char *sort_key_str; + char *name; bool pause; bool cont; bool clear; @@ -200,6 +201,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) if (!attrs) return; + kfree(attrs->name); kfree(attrs->sort_key_str); kfree(attrs->keys_str); kfree(attrs->vals_str); @@ -227,6 +229,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) attrs->vals_str = kstrdup(str, GFP_KERNEL); else if (strncmp(str, "sort=", strlen("sort=")) == 0) attrs->sort_key_str = kstrdup(str, GFP_KERNEL); + else if (strncmp(str, "name=", strlen("name=")) == 0) + attrs->name = kstrdup(str, GFP_KERNEL); else if (strcmp(str, "pause") == 0) attrs->pause = true; else if ((strcmp(str, "cont") == 0) || @@ -1131,7 +1135,12 @@ static int event_hist_trigger_print(struct seq_file *m, struct hist_field *key_field; unsigned int i; - seq_puts(m, "hist:keys="); + seq_puts(m, "hist:"); + + if (data->name) + seq_printf(m, "%s:", data->name); + + seq_puts(m, "keys="); for_each_hist_key_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -1196,6 +1205,19 @@ static int event_hist_trigger_print(struct seq_file *m, return 0; } +static int event_hist_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct hist_trigger_data *hist_data = data->private_data; + + if (!data->ref && hist_data->attrs->name) + save_named_trigger(hist_data->attrs->name, data); + + data->ref++; + + return 0; +} + static void event_hist_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data) { @@ -1206,6 +1228,8 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, 
data->ref--; if (!data->ref) { + if (data->name) + del_named_trigger(data); trigger_data_free(data); destroy_hist_data(hist_data); } @@ -1214,10 +1238,44 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, static struct event_trigger_ops event_hist_trigger_ops = { .func = event_hist_trigger, .print = event_hist_trigger_print, - .init = event_trigger_init, + .init = event_hist_trigger_init, .free = event_hist_trigger_free, }; +static int event_hist_trigger_named_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + + save_named_trigger(data->named_data->name, data); + + event_hist_trigger_init(ops, data->named_data); + + return 0; +} + +static void event_hist_trigger_named_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + event_hist_trigger_free(ops, data->named_data); + + data->ref--; + if (!data->ref) { + del_named_trigger(data); + trigger_data_free(data); + } +} + +static struct event_trigger_ops event_hist_trigger_named_ops = { + .func = event_hist_trigger, + .print = event_hist_trigger_print, + .init = event_hist_trigger_named_init, + .free = event_hist_trigger_named_free, +}; + static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, char *param) { @@ -1227,26 +1285,54 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, static void hist_clear(struct event_trigger_data *data) { struct hist_trigger_data *hist_data = data->private_data; - bool paused; - paused = data->paused; - data->paused = true; + if (data->name) + pause_named_trigger(data); synchronize_sched(); tracing_map_clear(hist_data->map); - data->paused = paused; + if (data->name) + unpause_named_trigger(data); +} + +static bool compatible_field(struct ftrace_event_field *field, + struct ftrace_event_field *test_field) +{ + if (field == test_field) + return true; + if (field == NULL || test_field == NULL) + return false; + if (strcmp(field->name, test_field->name) != 0) + return false; + if (strcmp(field->type, test_field->type) != 0) + return false; + if (field->size != test_field->size) + return false; + if (field->is_signed != test_field->is_signed) + return false; + + return true; } static bool hist_trigger_match(struct event_trigger_data *data, - struct event_trigger_data *data_test) + struct event_trigger_data *data_test, + struct event_trigger_data *named_data, + bool ignore_filter) { struct tracing_map_sort_key *sort_key, *sort_key_test; struct hist_trigger_data *hist_data, *hist_data_test; struct hist_field *key_field, *key_field_test; unsigned int i; + if (named_data && (named_data != data_test) && + (named_data != data_test->named_data)) + return false; + + if (!named_data && is_named_trigger(data_test)) + return false; + hist_data = data->private_data; hist_data_test = data_test->private_data; @@ -1255,9 +1341,11 @@ static bool hist_trigger_match(struct event_trigger_data *data, hist_data->n_sort_keys != hist_data_test->n_sort_keys) return false; - if ((data->filter_str && !data_test->filter_str) || - (!data->filter_str && data_test->filter_str)) - return false; + if (!ignore_filter) { + if ((data->filter_str && !data_test->filter_str) || + (!data->filter_str && data_test->filter_str)) + return false; + } for_each_hist_field(i, hist_data) { key_field = hist_data->fields[i]; @@ -1265,7 +1353,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, if (key_field->flags != key_field_test->flags) return false; - if (key_field->field != 
key_field_test->field) + if (!compatible_field(key_field->field, key_field_test->field)) return false; if (key_field->offset != key_field_test->offset) return false; @@ -1280,7 +1368,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return false; } - if (data->filter_str && + if (!ignore_filter && data->filter_str && (strcmp(data->filter_str, data_test->filter_str) != 0)) return false; @@ -1292,12 +1380,26 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, struct trace_event_file *file) { struct hist_trigger_data *hist_data = data->private_data; - struct event_trigger_data *test; + struct event_trigger_data *test, *named_data = NULL; int ret = 0; + if (hist_data->attrs->name) { + named_data = find_named_trigger(hist_data->attrs->name); + if (named_data) { + if (!hist_trigger_match(data, named_data, named_data, + true)) { + ret = -EINVAL; + goto out; + } + } + } + + if (hist_data->attrs->name && !named_data) + goto new; + list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test)) + if (!hist_trigger_match(data, test, named_data, false)) continue; if (hist_data->attrs->pause) test->paused = true; @@ -1310,12 +1412,19 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } } - + new: if (hist_data->attrs->cont || hist_data->attrs->clear) { ret = -ENOENT; goto out; } + if (named_data) { + destroy_hist_data(data->private_data); + data->private_data = named_data->private_data; + set_named_trigger_data(data, named_data); + data->ops = &event_hist_trigger_named_ops; + } + if (hist_data->attrs->pause) data->paused = true; @@ -1329,6 +1438,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret++; update_cond_flag(file); + if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); update_cond_flag(file); @@ -1342,12 +1452,16 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { - struct event_trigger_data *test; + struct hist_trigger_data *hist_data = data->private_data; + struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + if (hist_data->attrs->name) + named_data = find_named_trigger(hist_data->attrs->name); + list_for_each_entry_rcu(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { - if (!hist_trigger_match(data, test)) + if (!hist_trigger_match(data, test, named_data, false)) continue; unregistered = true; list_del_rcu(&test->list); -- cgit v1.2.3 From 4b94f5b7b4a5ffd885609bd033af2ecca0c9cc54 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 3 Mar 2016 12:55:02 -0600 Subject: tracing: Add hist trigger 'log2' modifier Allow users to have numeric fields displayed as log2 values in case value range is very wide by appending '.log2' to field names. 
For example, # echo 'hist:key=bytes_req' > kmalloc/trigger # cat kmalloc/hist { bytes_req: 504 } hitcount: 1 { bytes_req: 11 } hitcount: 1 { bytes_req: 104 } hitcount: 1 { bytes_req: 48 } hitcount: 1 { bytes_req: 2048 } hitcount: 1 { bytes_req: 4096 } hitcount: 1 { bytes_req: 240 } hitcount: 1 { bytes_req: 392 } hitcount: 1 { bytes_req: 13 } hitcount: 1 { bytes_req: 28 } hitcount: 1 { bytes_req: 12 } hitcount: 1 { bytes_req: 64 } hitcount: 2 { bytes_req: 128 } hitcount: 2 { bytes_req: 32 } hitcount: 2 { bytes_req: 8 } hitcount: 11 { bytes_req: 10 } hitcount: 13 { bytes_req: 24 } hitcount: 25 { bytes_req: 160 } hitcount: 29 { bytes_req: 16 } hitcount: 33 { bytes_req: 80 } hitcount: 36 When using '.log2' modifier, the output looks like: # echo 'hist:key=bytes_req.log2' > kmalloc/trigger # cat kmalloc/hist { bytes_req: ~ 2^12 } hitcount: 1 { bytes_req: ~ 2^11 } hitcount: 1 { bytes_req: ~ 2^9 } hitcount: 2 { bytes_req: ~ 2^6 } hitcount: 3 { bytes_req: ~ 2^3 } hitcount: 13 { bytes_req: ~ 2^5 } hitcount: 19 { bytes_req: ~ 2^8 } hitcount: 49 { bytes_req: ~ 2^7 } hitcount: 57 { bytes_req: ~ 2^4 } hitcount: 74 Link: http://lkml.kernel.org/r/7ff396b246c6a881f46b979735fddf05a0d6c71a.1457029949.git.tom.zanussi@linux.intel.com Cc: Tom Zanussi Cc: Masami Hiramatsu Signed-off-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + kernel/trace/trace_events_hist.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4e342d354c12..988a35263fdd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3872,6 +3872,7 @@ static const char readme_msg[] = "\t .sym-offset display an address as a symbol and offset\n" "\t .execname display a common_pid as a program name\n" "\t .syscall display a syscall id as a syscall name\n\n" + "\t .log2 display log2 value rather than raw number\n\n" "\t The 'pause' parameter can be used to pause an existing hist\n" "\t trigger or to start a hist trigger but not log any events\n" "\t until told to do so. 
'continue' can be used to start or\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bdea5603cbde..23df38aab503 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -68,6 +68,13 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event) return (u64)(unsigned long)*addr; } +static u64 hist_field_log2(struct hist_field *hist_field, void *event) +{ + u64 val = *(u64 *)(event + hist_field->field->offset); + + return (u64) ilog2(roundup_pow_of_two(val)); +} + #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ { \ @@ -111,6 +118,7 @@ enum hist_field_flags { HIST_FIELD_FL_EXECNAME = 64, HIST_FIELD_FL_SYSCALL = 128, HIST_FIELD_FL_STACKTRACE = 256, + HIST_FIELD_FL_LOG2 = 512, }; struct hist_trigger_attrs { @@ -358,6 +366,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (flags & HIST_FIELD_FL_LOG2) { + hist_field->fn = hist_field_log2; + goto out; + } + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; @@ -522,6 +535,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, flags |= HIST_FIELD_FL_EXECNAME; else if (strcmp(field_str, "syscall") == 0) flags |= HIST_FIELD_FL_SYSCALL; + else if (strcmp(field_str, "log2") == 0) + flags |= HIST_FIELD_FL_LOG2; else { ret = -EINVAL; goto out; @@ -980,6 +995,9 @@ hist_trigger_entry_print(struct seq_file *m, key + key_field->offset, HIST_STACKTRACE_DEPTH); multiline = true; + } else if (key_field->flags & HIST_FIELD_FL_LOG2) { + seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + *(u64 *)(key + key_field->offset)); } else if (key_field->flags & HIST_FIELD_FL_STRING) { seq_printf(m, "%s: %-50s", key_field->field->name, (char *)(key + key_field->offset)); @@ -1112,6 +1130,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) flags_str = "execname"; else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) flags_str = "syscall"; + else if (hist_field->flags & HIST_FIELD_FL_LOG2) + flags_str = "log2"; return flags_str; } -- cgit v1.2.3 From d50c744ecde7ee3ba4d7ffb0e1c55e7a2f6bbc8e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 8 Mar 2016 17:17:15 -0500 Subject: tracing: Fix unsigned comparison to zero in hist trigger code Fengguang Wu's bot found two comparisons of unsigned integers to zero. These were real bugs, as it would miss error conditions returned to zero. 
trace_events_hist.c:426:6-9: WARNING: Unsigned expression compared with zero: idx < 0 trace_events_hist.c:568:5-14: WARNING: Unsigned expression compared with zero: n_entries < 0 Reported-by: kbuild test robot Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 23df38aab503..f98b6b3a2804 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -734,7 +734,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) struct tracing_map *map = hist_data->map; struct ftrace_event_field *field; struct hist_field *hist_field; - unsigned int i, idx; + int i, idx; for_each_hist_field(i, hist_data) { hist_field = hist_data->fields[i]; @@ -1036,7 +1036,7 @@ static int print_entries(struct seq_file *m, { struct tracing_map_sort_entry **sort_entries = NULL; struct tracing_map *map = hist_data->map; - unsigned int i, n_entries; + int i, n_entries; n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, hist_data->n_sort_keys, -- cgit v1.2.3 From 205506228b69be4efbb3809957a69420f26a8d9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 25 Apr 2016 22:40:12 -0400 Subject: tracing: Do not inherit event-fork option for instances As the event-fork option requires doing work when enabled and disabled, it can not be passed down to created instances. The instance must clear this flag when it is created, and must clear it when its removed. As more options may be created with this need, a macro ZEROED_TRACE_FLAGS is created that holds the flags that must not be inherited by the top level instance, and must be cleared on removal of instances. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 988a35263fdd..5e3ad3481e4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec) #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD) +/* trace_flags that are default zero for instances */ +#define ZEROED_TRACE_FLAGS \ + TRACE_ITER_EVENT_FORK /* * The global_trace is the descriptor that holds the tracing @@ -6710,7 +6713,7 @@ static int instance_mkdir(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; - tr->trace_flags = global_trace.trace_flags; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -6784,6 +6787,12 @@ static int instance_rmdir(const char *name) list_del(&tr->list); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1 << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1 << i, 0); + } + tracing_set_nop(tr); event_trace_del_tracer(tr); ftrace_destroy_function_files(tr); -- cgit v1.2.3 From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 25 Apr 2016 23:22:35 -0400 Subject: ext4: fix races between changing inode journal mode and ext4_writepages In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). 
While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 4 ++++ fs/ext4/inode.c | 15 ++++++++++++--- fs/ext4/super.c | 4 ++++ kernel/locking/percpu-rwsem.c | 1 + 4 files changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cb00b1119ec9..ba5aecc07fbc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif @@ -1508,6 +1509,9 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. */ + struct percpu_rw_semaphore s_journal_flag_rwsem; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 779ef4c11bc1..4d8ebbe00456 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); + if (dax_mapping(mapping)) { + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + goto out_writepages; + } /* * No pages to write? 
This is mainly a kludge to avoid starting @@ -2786,6 +2789,7 @@ retry: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); return ret; } @@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's @@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } + percpu_down_write(&sbi->s_journal_flag_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); + if (val) up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 304c712dbe12..20c5d52253b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -3930,6 +3931,9 @@ no_journal: if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely -- cgit v1.2.3 From 4812952f9c94f67b3cc78ad0a6482bf182aa5a44 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 23 Apr 2016 13:23:47 +0300 Subject: tracing: checking for NULL instead of IS_ERR() tracing_map_elt_alloc() returns ERR_PTRs on error, never NULL. 
Fixes: 08d43a5fa063 ('tracing: Add lock-free tracing_map') Link: http://lkml.kernel.org/r/20160423102347.GA11136@mwanda Acked-by: Tom Zanussi Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index e0f172932eca..e7dfc5eec669 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -814,7 +814,7 @@ static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) unsigned int i; dup_elt = tracing_map_elt_alloc(elt->map); - if (!dup_elt) + if (IS_ERR(dup_elt)) return NULL; if (elt->map->ops && elt->map->ops->elt_copy) -- cgit v1.2.3 From 432480c58219eff32904b879eb3fcc1d268a3b06 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 25 Apr 2016 14:01:27 -0500 Subject: tracing: Add check for NULL event field when creating hist field Smatch flagged create_hist_field() as possibly being able to dereference a NULL pointer, although the current code exits in all cases where the event field could be NULL, so it's not actually a problem. Still, to prevent future changes to the code from overlooking new cases, make the NULL pointer check explicit and warn once in that case. Link: http://lkml.kernel.org/r/cfbc003f534a3e441b4313272fd412310aba6336.1461610073.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_hist.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f98b6b3a2804..0c05b8a99806 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -371,6 +371,9 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, goto out; } + if (WARN_ON_ONCE(!field)) + goto out; + if (is_string_field(field)) { flags |= HIST_FIELD_FL_STRING; -- cgit v1.2.3 From 6e4cf657defa8fb06dbf9ed91adb78414758f259 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 25 Apr 2016 14:01:28 -0500 Subject: tracing: Handle tracing_map_alloc_elts() error path correctly If tracing_map_elt_alloc() fails, it will return ERR_PTR() instead of NULL, so change the check to IS_ERR(). We also need to set the failed entry in the map->elts array to NULL instead of ERR_PTR() so tracing_map_free_elts() doesn't try freeing an ERR_PTR(). tracing_map_free_elts() should also zero out what it frees so a reentrant call won't find previously freed elements.
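The second half of the change follows a common defensive pattern: NULL out each slot as it is freed, so a later or re-entrant free pass is harmless because kfree(NULL) is a no-op. A minimal sketch of that pattern, with made-up names (not taken from the patch):

	#include <linux/slab.h>

	/* Hypothetical teardown helper: clear each slot after freeing it. */
	static void free_slots(void **slots, unsigned int n)
	{
		unsigned int i;

		for (i = 0; i < n; i++) {
			kfree(slots[i]);
			slots[i] = NULL;	/* a repeated call now skips this slot safely */
		}
	}
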
Link: http://lkml.kernel.org/r/f29d03b00bce3aac8cf151a8a30e6c83e5fee66d.1461610073.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/tracing_map.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index e7dfc5eec669..0a689bbb78ef 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -366,10 +366,13 @@ static void tracing_map_free_elts(struct tracing_map *map) if (!map->elts) return; - for (i = 0; i < map->max_elts; i++) + for (i = 0; i < map->max_elts; i++) { tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i))); + *(TRACING_MAP_ELT(map->elts, i)) = NULL; + } tracing_map_array_free(map->elts); + map->elts = NULL; } static int tracing_map_alloc_elts(struct tracing_map *map) @@ -383,7 +386,8 @@ static int tracing_map_alloc_elts(struct tracing_map *map) for (i = 0; i < map->max_elts; i++) { *(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map); - if (!(*(TRACING_MAP_ELT(map->elts, i)))) { + if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) { + *(TRACING_MAP_ELT(map->elts, i)) = NULL; tracing_map_free_elts(map); return -ENOMEM; -- cgit v1.2.3 From 4afe6495e5cb3c352d95f07512cbb227e607e2ce Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Mon, 18 Apr 2016 15:23:29 +0800 Subject: tracing: Don't use the address of the buffer array name in copy_from_user With the following code snippet: ... char buf[64]; ... if (copy_from_user(&buf, ubuf, cnt)) ... Even though the value of "&buf" equals "buf", but there is no need to get the address of the "buf" again. Use "buf" instead of "&buf". Link: http://lkml.kernel.org/r/20160418152329.18b72bea@debian Signed-off-by: Wang Xiaoqiang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5e3ad3481e4b..46028d47d252 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3664,7 +3664,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -4537,7 +4537,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (cnt > MAX_TRACER_SIZE) cnt = MAX_TRACER_SIZE; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -5327,7 +5327,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (cnt >= sizeof(buf)) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; -- cgit v1.2.3 From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 21 Apr 2016 14:14:01 -0400 Subject: audit: add tty field to LOGIN event The tty field was missing from AUDIT_LOGIN events. Refactor code to create a new function audit_get_tty(), using it to replace the call in audit_log_task_info() and to add it to audit_log_set_loginuid(). Lock and bump the kref to protect it, adding audit_put_tty() alias to decrement it. 
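For context, the intended usage pairs a counted get with a put around the point where the tty name is logged, so the tty cannot go away mid-logging. The following is a sketch of that calling pattern, not code copied from the hunks below; the audit buffer 'ab' is assumed to be already started.

	#include <linux/audit.h>
	#include <linux/sched.h>
	#include <linux/tty.h>

	/* Sketch of the caller-side pattern the new helpers enable. */
	static void log_current_tty(struct audit_buffer *ab)
	{
		struct tty_struct *tty = audit_get_tty(current);	/* takes a kref */

		audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
		audit_put_tty(tty);					/* drops the kref */
	}
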
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 24 ++++++++++++++++++++++++ kernel/audit.c | 18 +++++------------- kernel/auditsc.c | 8 ++++++-- 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index b40ed5df5542..32cdafb312d8 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,6 +26,7 @@ #include #include #include +#include #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -343,6 +344,23 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) return tsk->sessionid; } +static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + struct tty_struct *tty = NULL; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (tsk->signal) + tty = tty_kref_get(tsk->signal->tty); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return tty; +} + +static inline void audit_put_tty(struct tty_struct *tty) +{ + tty_kref_put(tty); +} + extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); @@ -500,6 +518,12 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return -1; } +static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + return NULL; +} +static inline void audit_put_tty(struct tty_struct *tty) +{ } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, diff --git a/kernel/audit.c b/kernel/audit.c index f52fbefede09..384374a1d232 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -64,7 +64,6 @@ #include #endif #include -#include #include #include @@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; char comm[sizeof(tsk->comm)]; - char *tty; + struct tty_struct *tty; if (!ab) return; /* tsk == current */ cred = current_cred(); - - spin_lock_irq(&tsk->sighand->siglock); - if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) - tty = tsk->signal->tty->name; - else - tty = "(none)"; - spin_unlock_irq(&tsk->sighand->siglock); - + tty = audit_get_tty(tsk); audit_log_format(ab, " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" @@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->egid), from_kgid(&init_user_ns, cred->sgid), from_kgid(&init_user_ns, cred->fsgid), - tty, audit_get_sessionid(tsk)); - + tty ? 
tty_name(tty) : "(none)", + audit_get_sessionid(tsk)); + audit_put_tty(tty); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); - audit_log_d_path_exe(ab, tsk->mm); audit_log_task_context(ab); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..71e14d836e69 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, { struct audit_buffer *ab; uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; if (!audit_enabled) return; @@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(current); ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, oldsessionid, sessionid, !rc); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); audit_log_end(ab); } -- cgit v1.2.3 From 7132e2d669bd42c3783327f301aaac5f4463299b Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 25 Apr 2016 18:56:14 -0300 Subject: ftrace: Match dot symbols when searching functions on ppc64 In the ppc64 big endian ABI, function symbols point to function descriptors. The symbols which point to the function entry points have a dot in front of the function name. Consequently, when the ftrace filter mechanism searches for the symbol corresponding to an entry point address, it gets the dot symbol. As a result, ftrace filter users have to be aware of this ABI detail on ppc64 and prepend a dot to the function name when setting the filter. The perf probe command insulates the user from this by ignoring the dot in front of the symbol name when matching function names to symbols, but the sysfs interface does not. This patch makes the ftrace filter mechanism do the same when searching symbols. Fixes the following failure in ftracetest's kprobe_ftrace.tc: .../kprobe_ftrace.tc: line 9: echo: write error: Invalid argument That failure is on this line of kprobe_ftrace.tc: echo _do_fork > set_ftrace_filter This is because there's no _do_fork entry in the functions list: # cat available_filter_functions | grep _do_fork ._do_fork This change introduces no regressions on the perf and ftracetest testsuite results. Cc: Steven Rostedt Cc: Ingo Molnar Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Thiago Jung Bauermann Acked-by: Steven Rostedt Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/ftrace.c | 10 ++++++++++ kernel/trace/ftrace.c | 12 ++++++++++++ 2 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 9dac18dabd03..1123a4d8d8dd 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr) return sys_call_table[nr*2]; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ + +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) +char *arch_ftrace_match_adjust(char *str, const char *search) +{ + if (str[0] == '.' 
&& search[0] != '.') + return str + 1; + else + return str; +} +#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e8d792da963..a6c8252d7776 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3456,11 +3456,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) -- cgit v1.2.3 From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 26 Apr 2016 21:22:22 -0400 Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header The functions event_trigger_unlock_commit() and event_trigger_unlock_commit_regs() are no longer used outside the tracing system. Move them out of the generic headers and into the local one. Along with __event_trigger_test_discard() that is only used by them. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 94 -------------------------------------------- kernel/trace/trace.h | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5f89a5b0c7e6..70a181cb3585 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -452,100 +452,6 @@ trace_trigger_soft_disabled(struct trace_event_file *file) return false; } -/* - * Helper function for event_trigger_unlock_commit{_regs}(). - * If there are event triggers attached to this event that requires - * filtering against its fields, then they wil be called as the - * entry already holds the field information of the current event. - * - * It also checks if the event should be discarded or not. - * It is to be discarded if the event is soft disabled and the - * event was only recorded to process triggers, or if the event - * filter is active and this event did not match the filters. - * - * Returns true if the event is discarded, false otherwise. - */ -static inline bool -__event_trigger_test_discard(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, - enum event_trigger_type *tt) -{ - unsigned long eflags = file->flags; - - if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry); - - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) - ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; - - return true; -} - -/** - * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. 
It also tests the event against filters and - * if the event is soft disabled and should be discarded. - */ -static inline void -event_trigger_unlock_commit(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); - - if (tt) - event_triggers_post_call(file, tt, entry); -} - -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @irq_flags: The state of the interrupts at the start of the event - * @pc: The state of the preempt count at the start of the event. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). - */ -static inline void -event_trigger_unlock_commit_regs(struct trace_event_file *file, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned long irq_flags, int pc, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(file->tr, buffer, event, - irq_flags, pc, regs); - - if (tt) - event_triggers_post_call(file, tt, entry); -} - #ifdef CONFIG_BPF_EVENTS unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); #else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 727a3d28bce5..c0eac7b1e5a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,100 @@ struct trace_subsystem_dir { int nr_events; }; +/* + * Helper function for event_trigger_unlock_commit{_regs}(). + * If there are event triggers attached to this event that requires + * filtering against its fields, then they wil be called as the + * entry already holds the field information of the current event. + * + * It also checks if the event should be discarded or not. + * It is to be discarded if the event is soft disabled and the + * event was only recorded to process triggers, or if the event + * filter is active and this event did not match the filters. + * + * Returns true if the event is discarded, false otherwise. 
+ */ +static inline bool +__event_trigger_test_discard(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, + enum event_trigger_type *tt) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_COND) + *tt = event_triggers_call(file, entry); + + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) + ring_buffer_discard_commit(buffer, event); + else if (!filter_check_discard(file, entry, buffer, event)) + return false; + + return true; +} + +/** + * event_trigger_unlock_commit - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + */ +static inline void +event_trigger_unlock_commit(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + +/** + * event_trigger_unlock_commit_regs - handle triggers and finish event commit + * @file: The file pointer assoctiated to the event + * @buffer: The ring buffer that the event is being written to + * @event: The event meta data in the ring buffer + * @entry: The event itself + * @irq_flags: The state of the interrupts at the start of the event + * @pc: The state of the preempt count at the start of the event. + * + * This is a helper function to handle triggers that require data + * from the event itself. It also tests the event against filters and + * if the event is soft disabled and should be discarded. + * + * Same as event_trigger_unlock_commit() but calls + * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). + */ +static inline void +event_trigger_unlock_commit_regs(struct trace_event_file *file, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + void *entry, unsigned long irq_flags, int pc, + struct pt_regs *regs) +{ + enum event_trigger_type tt = ETT_NONE; + + if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) + trace_buffer_unlock_commit_regs(file->tr, buffer, event, + irq_flags, pc, regs); + + if (tt) + event_triggers_post_call(file, tt, entry); +} + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -- cgit v1.2.3 From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Apr 2016 10:13:46 -0400 Subject: tracing: Make filter_check_discard() local Nothing outside of the tracing directory calls filter_check_discard() or check_filter_check_discard(). They should not be called by modules. Move their prototypes into the local tracing header and remove their EXPORT_SYMBOL() macros. 
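The prototypes now sit next to their only users; a minimal plain-C sketch of what keeping a symbol directory-private looks like (illustrative names, not the real tracing declarations):

/*
 * Illustrative only: once the prototype moves out of the public header,
 * only files that include the subsystem-local header can call the
 * function, and with no EXPORT_SYMBOL() the symbol is also unavailable
 * to modules at link time.  The local header would contain just:
 *
 *     int filter_check_discard_example(const struct rec *r);
 */
#include <stdio.h>

struct rec { int level; };

int filter_check_discard_example(const struct rec *r)
{
    return r->level < 3;            /* pretend "filter did not match" */
}

int main(void)
{
    struct rec r = { .level = 1 };

    printf("discard? %d\n", filter_check_discard_example(&r));
    return 0;
}
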
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 6 ------ kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 6 ++++++ 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70a181cb3585..bb383af35cc7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -413,12 +413,6 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); -extern int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); -extern int call_filter_check_discard(struct trace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, void *rec); extern void event_triggers_post_call(struct trace_event_file *file, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46028d47d252..02f5a5f51d49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec, return 0; } -EXPORT_SYMBOL_GPL(filter_check_discard); int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, @@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -EXPORT_SYMBOL_GPL(call_filter_check_discard); static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c0eac7b1e5a6..ee8691c66bfe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,6 +1065,12 @@ struct trace_subsystem_dir { int nr_events; }; +extern int filter_check_discard(struct trace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); +extern int call_filter_check_discard(struct trace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From 9cbb1506ab2db987c160e7fc50665bf47b5b6fa1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 27 Apr 2016 11:09:42 -0400 Subject: tracing: Fold filter_check_discard() into its only user The function filter_check_discard() is small and only called by one user, its code can be folded into that one caller and make the code a bit less comlplex. 
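A small stand-alone C sketch of the same kind of fold, with invented flag names, shows how the caller ends up expressing the whole discard decision in one condition:

#include <stdbool.h>
#include <stdio.h>

/* Invented flags and predicate, used only to illustrate folding a
 * one-user helper into its caller. */
#define FL_SOFT_DISABLED (1u << 0)
#define FL_FILTERED      (1u << 1)

static bool filter_match(int value)
{
    return value > 0;               /* stand-in for the real predicate */
}

/* Before the fold, the filter test lived in a separate helper whose
 * result the caller had to invert.  After the fold, one condition
 * covers both the soft-disabled case and the failed filter match. */
static bool test_discard(unsigned int flags, int value)
{
    if ((flags & FL_SOFT_DISABLED) ||
        ((flags & FL_FILTERED) && !filter_match(value))) {
        /* the real code calls ring_buffer_discard_commit() here */
        return true;
    }
    return false;
}

int main(void)
{
    printf("%d\n", test_discard(FL_FILTERED, -1));  /* 1: discarded */
    printf("%d\n", test_discard(0, -1));            /* 0: committed */
    return 0;
}
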
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ------------- kernel/trace/trace.h | 13 ++++++------- 2 files changed, 6 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02f5a5f51d49..1ba54e241c8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -306,19 +306,6 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} - int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee8691c66bfe..0862e7559548 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1065,9 +1065,6 @@ struct trace_subsystem_dir { int nr_events; }; -extern int filter_check_discard(struct trace_event_file *file, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event); extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); @@ -1096,12 +1093,14 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags)) + if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || + (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && + !filter_match_preds(file->filter, entry))) { ring_buffer_discard_commit(buffer, event); - else if (!filter_check_discard(file, entry, buffer, event)) - return false; + return true; + } - return true; + return false; } /** -- cgit v1.2.3 From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 Apr 2016 12:04:13 -0400 Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header The functions trace_buffer_unlock_commit() and the _regs() version are only used within the kernel/trace directory. Move them to the local header and remove the export as well. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 9 --------- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 10 ++++++++++ 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index bb383af35cc7..48cc5e19c5f5 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -158,15 +158,6 @@ struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_buffer_unlock_commit_regs(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc, - struct pt_regs *regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ba54e241c8d..94e7e4d11b79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); static struct ring_buffer *temp_buffer; @@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } -EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); void trace_current_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0862e7559548..bd5ae56dec7a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1068,6 +1068,16 @@ struct trace_subsystem_dir { extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_buffer_unlock_commit_regs(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + struct pt_regs *regs); /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 16:12:39 -0400 Subject: tracing: Remove unused function trace_current_buffer_discard_commit() The function trace_current_buffer_discard_commit() has no callers, remove it. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 2 -- kernel/trace/trace.c | 8 -------- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 48cc5e19c5f5..356c39b3abbb 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -158,8 +158,6 @@ struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 94e7e4d11b79..e5bdb9accf52 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } -EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, @@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } -void trace_current_buffer_discard_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - ring_buffer_discard_commit(buffer, event); -} -EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); - void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, -- cgit v1.2.3 From 33fddff24d05d71f97722cb7deec4964d39d10dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 17:44:01 -0400 Subject: tracing: Have trace_buffer_unlock_commit() call the _regs version with NULL There's no real difference between trace_buffer_unlock_commit() and trace_buffer_unlock_commit_regs() except that the former passes NULL to ftrace_stack_trace() instead of regs. Have the former be a static inline of the latter which passes NULL for regs. 
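When two functions differ only in one trailing argument, keeping the general one and turning the other into a static inline forwarder that passes NULL removes the duplicated body at no runtime cost. A self-contained sketch of that shape, with invented names:

#include <stdio.h>

struct regs { int dummy; };     /* stand-in for struct pt_regs */

/* The general version keeps the real body. */
static void commit_regs(const char *what, int pc, const struct regs *regs)
{
    printf("commit %s, pc=%d, regs=%s\n",
           what, pc, regs ? "present" : "none");
}

/* The former near-duplicate becomes a one-line forwarder passing NULL. */
static inline void commit(const char *what, int pc)
{
    commit_regs(what, pc, NULL);
}

int main(void)
{
    struct regs r = { 0 };

    commit("event", 1);             /* prints "... regs=none"    */
    commit_regs("event", 2, &r);    /* prints "... regs=present" */
    return 0;
}
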
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 ----------- kernel/trace/trace.h | 13 +++++++++---- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5bdb9accf52..41bf14412666 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1686,17 +1686,6 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve ring_buffer_unlock_commit(buffer, event); } -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc) -{ - __buffer_unlock_commit(buffer, event); - - ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL); - ftrace_trace_userstack(buffer, flags, pc); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bd5ae56dec7a..10156a09103f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1069,15 +1069,20 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags, int pc); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs); + +static inline void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); +} + /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires -- cgit v1.2.3 From 9b9db275051cd9191e7776c4fd79ccd4318aa2dc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 18:10:21 -0400 Subject: tracing: Remove one use of trace_current_buffer_lock_reserve() The only user of trace_current_buffer_lock_reserve() is in the boot up self tests. Restructure the code a little to have that code use what everything else uses: trace_event_buffer_lock_reserve(). 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7cb983ee93c..da1eeb6190e3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3392,7 +3392,7 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); -static struct trace_array *event_tr; +static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, @@ -3416,17 +3416,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, local_save_flags(flags); - event = trace_current_buffer_lock_reserve(&buffer, - TRACE_FN, sizeof(*entry), - flags, pc); + event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file, + TRACE_FN, sizeof(*entry), + flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; - trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc); - + event_trigger_unlock_commit(&event_trace_file, buffer, event, + entry, flags, pc); out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); preempt_enable_notrace(); @@ -3441,9 +3441,11 @@ static struct ftrace_ops trace_ops __initdata = static __init void event_trace_self_test_with_function(void) { int ret; - event_tr = top_trace_array(); - if (WARN_ON(!event_tr)) + + event_trace_file.tr = top_trace_array(); + if (WARN_ON(!event_trace_file.tr)) return; + ret = register_ftrace_function(&trace_ops); if (WARN_ON(ret < 0)) { pr_info("Failed to enable function tracer for event tests\n"); -- cgit v1.2.3 From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 29 Apr 2016 18:11:54 -0400 Subject: tracing: Remove unused function trace_current_buffer_lock_reserve() trace_current_buffer_lock_reserve() has no more users. Remove it. 
Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 4 ---- kernel/trace/trace.c | 10 ---------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 356c39b3abbb..3111a1efdad6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -154,10 +154,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, struct trace_event_file *trace_file, int type, unsigned long len, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, - int type, unsigned long len, - unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 41bf14412666..c09e8ffadc73 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, - int type, unsigned long len, - unsigned long flags, int pc) -{ - *current_rb = global_trace.trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); -} - void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, -- cgit v1.2.3 From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 2 May 2016 21:30:04 -0400 Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it. Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 3 -- kernel/trace/trace_events_filter.c | 71 ++++++-------------------------------- 2 files changed, 10 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3111a1efdad6..ba6456302e34 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -214,7 +214,6 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, TRACE_EVENT_FL_WAS_ENABLED_BIT, - TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, TRACE_EVENT_FL_KPROBE_BIT, TRACE_EVENT_FL_UPROBE_BIT, @@ -229,7 +228,6 @@ enum { * WAS_ENABLED - Set and stays set when an event was ever enabled * (used for module unloading, if a module event is enabled, * it is best to clear the buffers that used it). 
- * USE_CALL_FILTER - For trace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint * KPROBE - Event is a kprobe * UPROBE - Event is a uprobe @@ -240,7 +238,6 @@ enum { TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), - TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), TRACE_EVENT_FL_UPROBE = (1 << TRACE_EVENT_FL_UPROBE_BIT), diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b3f5051cd4e9..d1d27bf37a19 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps, static inline struct event_filter *event_filter(struct trace_event_file *file) { - if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - return file->event_call->filter; - else - return file->filter; + return file->filter; } /* caller must hold event_mutex */ @@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_FILTERED; - else - file->flags &= ~EVENT_FILE_FL_FILTERED; + file->flags &= ~EVENT_FILE_FL_FILTERED; } static void __free_filter(struct event_filter *filter) @@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) static inline void __remove_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - filter_disable(file); - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - remove_filter_string(call->filter); - else - remove_filter_string(file->filter); + remove_filter_string(file->filter); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, static inline void __free_subsystem_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { - __free_filter(call->filter); - call->filter = NULL; - } else { - __free_filter(file->filter); - file->filter = NULL; - } + __free_filter(file->filter); + file->filter = NULL; } static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, @@ -1718,69 +1698,38 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_FILTERED; - else - file->flags |= EVENT_FILE_FL_FILTERED; + file->flags |= EVENT_FILE_FL_FILTERED; } static inline void event_set_filter(struct trace_event_file *file, struct event_filter *filter) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - rcu_assign_pointer(call->filter, filter); - else - rcu_assign_pointer(file->filter, filter); + rcu_assign_pointer(file->filter, filter); } static inline void event_clear_filter(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - 
RCU_INIT_POINTER(call->filter, NULL); - else - RCU_INIT_POINTER(file->filter, NULL); + RCU_INIT_POINTER(file->filter, NULL); } static inline void event_set_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; + file->flags |= EVENT_FILE_FL_NO_SET_FILTER; } static inline void event_clear_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - - if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; - else - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; + file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; } static inline bool event_no_set_filter_flag(struct trace_event_file *file) { - struct trace_event_call *call = file->event_call; - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) return true; - if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && - (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) - return true; - return false; } -- cgit v1.2.3 From 0fc1b09ff1ff404ddf753f5ffa5cd0adc8fdcdc9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 May 2016 17:15:43 -0400 Subject: tracing: Use temp buffer when filtering events Filtering of events requires the data to be written to the ring buffer before it can be decided to filter or not. This is because the parameters of the filter are based on the result that is written to the ring buffer and not on the parameters that are passed into the trace functions. The ftrace ring buffer is optimized for writing into the ring buffer and committing. The discard procedure used when filtering decides the event should be discarded is much more heavy weight. Thus, using a temporary filter when filtering events can speed things up drastically. Without a temp buffer we have: # trace-cmd start -p nop # perf stat -r 10 hackbench 50 0.790706626 seconds time elapsed ( +- 0.71% ) # trace-cmd start -e all # perf stat -r 10 hackbench 50 1.566904059 seconds time elapsed ( +- 0.27% ) # trace-cmd start -e all -f 'common_preempt_count==20' # perf stat -r 10 hackbench 50 1.690598511 seconds time elapsed ( +- 0.19% ) # trace-cmd start -e all -f 'common_preempt_count!=20' # perf stat -r 10 hackbench 50 1.707486364 seconds time elapsed ( +- 0.30% ) The first run above is without any tracing, just to get a based figure. hackbench takes ~0.79 seconds to run on the system. The second run enables tracing all events where nothing is filtered. This increases the time by 100% and hackbench takes 1.57 seconds to run. The third run filters all events where the preempt count will equal "20" (this should never happen) thus all events are discarded. This takes 1.69 seconds to run. This is 10% slower than just committing the events! The last run enables all events and filters where the filter will commit all events, and this takes 1.70 seconds to run. The filtering overhead is approximately 10%. Thus, the discard and commit of an event from the ring buffer may be about the same time. 
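Before the with-patch numbers below, a minimal userspace sketch of the idea may help: stage the record in a cheap scratch buffer first, and only copy it into the expensive shared buffer when the filter accepts it. The names and sizes here are invented; the real code uses per-CPU pages and the ftrace ring buffer.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Toy model: "ring" plays the expensive shared buffer, "scratch" the cheap
 * staging area.  Discarding from scratch is simply doing nothing. */
static char ring[4096];
static size_t ring_len;

static char scratch[256];

static bool filter_accepts(const char *msg)
{
    return strstr(msg, "drop") == NULL;     /* pretend filter */
}

static void trace_event(const char *msg)
{
    size_t len = strlen(msg) + 1;

    if (len > sizeof(scratch))
        return;
    memcpy(scratch, msg, len);              /* stage the event cheaply */

    if (!filter_accepts(scratch))
        return;                             /* discard == do nothing   */

    if (ring_len + len <= sizeof(ring)) {   /* commit == one copy      */
        memcpy(ring + ring_len, scratch, len);
        ring_len += len;
    }
}

int main(void)
{
    trace_event("keep: first");
    trace_event("drop: second");
    trace_event("keep: third");
    printf("ring holds %zu bytes\n", ring_len);
    return 0;
}
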
With this patch, the numbers change: # trace-cmd start -p nop # perf stat -r 10 hackbench 50 0.778233033 seconds time elapsed ( +- 0.38% ) # trace-cmd start -e all # perf stat -r 10 hackbench 50 1.582102692 seconds time elapsed ( +- 0.28% ) # trace-cmd start -e all -f 'common_preempt_count==20' # perf stat -r 10 hackbench 50 1.309230710 seconds time elapsed ( +- 0.22% ) # trace-cmd start -e all -f 'common_preempt_count!=20' # perf stat -r 10 hackbench 50 1.786001924 seconds time elapsed ( +- 0.20% ) The first run is again the base with no tracing. The second run is all tracing with no filtering. It is a little slower, but that may be well within the noise. The third run shows that discarding all events only took 1.3 seconds. This is a speed up of 23%! The discard is much faster than even the commit. The one downside is shown in the last run. Events that are not discarded by the filter will take longer to add, this is due to the extra copy of the event. Cc: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 154 +++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 19 ++++- kernel/trace/trace_events.c | 10 +++ kernel/trace/trace_events_filter.c | 10 +++ 4 files changed, 185 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c09e8ffadc73..8a4bd6b68a0b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -312,7 +312,7 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, { if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); + __trace_event_discard_commit(buffer, event); return 1; } @@ -1660,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, @@ -1669,21 +1679,136 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) { - struct trace_entry *ent = ring_buffer_event_data(event); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + +DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DEFINE_PER_CPU(int, trace_buffered_event_cnt); +static int trace_buffered_event_ref; + +/** + * trace_buffered_event_enable - enable buffering events + * + * When events are being filtered, it is quicker to use a temporary + * buffer to write the event data into if there's a likely chance + * that it will not be committed. The discard of the ring buffer + * is not as fast as committing, and is much slower than copying + * a commit. + * + * When an event is to be filtered, allocate per cpu buffers to + * write the event data into, and if the event is filtered and discarded + * it is simply dropped, otherwise, the entire data is to be committed + * in one shot. 
+ */ +void trace_buffered_event_enable(void) +{ + struct ring_buffer_event *event; + struct page *page; + int cpu; - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (trace_buffered_event_ref++) + return; + + for_each_tracing_cpu(cpu) { + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY, 0); + if (!page) + goto failed; + + event = page_address(page); + memset(event, 0, sizeof(*event)); + + per_cpu(trace_buffered_event, cpu) = event; + + preempt_disable(); + if (cpu == smp_processor_id() && + this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + preempt_enable(); } - return event; + return; + failed: + trace_buffered_event_disable(); +} + +static void enable_trace_buffered_event(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); + this_cpu_dec(trace_buffered_event_cnt); +} + +static void disable_trace_buffered_event(void *data) +{ + this_cpu_inc(trace_buffered_event_cnt); +} + +/** + * trace_buffered_event_disable - disable buffering events + * + * When a filter is removed, it is faster to not use the buffered + * events, and to commit directly into the ring buffer. Free up + * the temp buffers when there are no more users. This requires + * special synchronization with current events. + */ +void trace_buffered_event_disable(void) +{ + int cpu; + + WARN_ON_ONCE(!mutex_is_locked(&event_mutex)); + + if (WARN_ON_ONCE(!trace_buffered_event_ref)) + return; + + if (--trace_buffered_event_ref) + return; + + preempt_disable(); + /* For each CPU, set the buffer as used. */ + smp_call_function_many(tracing_buffer_mask, + disable_trace_buffered_event, NULL, 1); + preempt_enable(); + + /* Wait for all current users to finish */ + synchronize_sched(); + + for_each_tracing_cpu(cpu) { + free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); + per_cpu(trace_buffered_event, cpu) = NULL; + } + /* + * Make sure trace_buffered_event is NULL before clearing + * trace_buffered_event_cnt. 
+ */ + smp_wmb(); + + preempt_disable(); + /* Do the work on each cpu */ + smp_call_function_many(tracing_buffer_mask, + enable_trace_buffered_event, NULL, 1); + preempt_enable(); } void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); - ring_buffer_unlock_commit(buffer, event); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); } static struct ring_buffer *temp_buffer; @@ -1695,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, unsigned long flags, int pc) { struct ring_buffer_event *entry; + int val; *current_rb = trace_file->tr->trace_buffer.buffer; + + if ((trace_file->flags & + (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + (entry = this_cpu_read(trace_buffered_event))) { + /* Try to use the per cpu buffer first */ + val = this_cpu_inc_return(trace_buffered_event_cnt); + if (val == 1) { + trace_event_setup(entry, type, flags, pc); + entry->array[0] = len; + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); + } + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10156a09103f..5167c366d6b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1083,6 +1083,23 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); } +DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); +DECLARE_PER_CPU(int, trace_buffered_event_cnt); +void trace_buffered_event_disable(void); +void trace_buffered_event_enable(void); + +static inline void +__trace_event_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (this_cpu_read(trace_buffered_event) == event) { + /* Simply release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + return; + } + ring_buffer_discard_commit(buffer, event); +} + /* * Helper function for event_trigger_unlock_commit{_regs}(). 
* If there are event triggers attached to this event that requires @@ -1111,7 +1128,7 @@ __event_trigger_test_discard(struct trace_event_file *file, if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && !filter_match_preds(file->filter, entry))) { - ring_buffer_discard_commit(buffer, event); + __trace_event_discard_commit(buffer, event); return true; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index da1eeb6190e3..4d006707b947 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -363,6 +363,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -445,6 +446,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } + /* Enable or disable use of trace_buffered_event */ + if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != + (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + trace_buffered_event_enable(); + else + trace_buffered_event_disable(); + } + return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d1d27bf37a19..9daa9b3bc6d9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -823,7 +823,12 @@ static void __free_preds(struct event_filter *filter) static void filter_disable(struct trace_event_file *file) { + unsigned long old_flags = file->flags; + file->flags &= ~EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_disable(); } static void __free_filter(struct event_filter *filter) @@ -1698,7 +1703,12 @@ fail: static inline void event_set_filtered_flag(struct trace_event_file *file) { + unsigned long old_flags = file->flags; + file->flags |= EVENT_FILE_FL_FILTERED; + + if (old_flags != file->flags) + trace_buffered_event_enable(); } static inline void event_set_filter(struct trace_event_file *file, -- cgit v1.2.3 From 470bf1f27a1472264d18c84b324389509f0e30b3 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 24 Mar 2016 02:46:33 +0100 Subject: seccomp: Fix comment typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop accidentally repeated word in comment. Signed-off-by: Mickaël Salaün Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e1e5a354854e..6c9bb62ed046 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -915,7 +915,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, fprog = filter->prog->orig_prog; if (!fprog) { - /* This must be a new non-cBPF filter, since we save every + /* This must be a new non-cBPF filter, since we save * every cBPF filter's orig_prog above when * CONFIG_CHECKPOINT_RESTORE is enabled. */ -- cgit v1.2.3 From cc622420798c4bcf093785d872525087a7798db9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Apr 2016 17:35:29 +0200 Subject: gcov: disable for COMPILE_TEST Enabling gcov is counterproductive to compile testing: it significantly increases the kernel image size, compile time, and it produces lots of false positive "may be used uninitialized" warnings as the result of missed optimizations. 
This is in line with how UBSAN_SANITIZE_ALL and PROFILE_ALL_BRANCHES work, both of which have similar problems. With an ARM allmodconfig kernel, I see the build time drop from 283 minutes CPU time to 225 minutes, and the vmlinux size drops from 43MB to 26MB. Signed-off-by: Arnd Bergmann Acked-by: Peter Oberparleiter Signed-off-by: Michal Marek --- kernel/gcov/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n -- cgit v1.2.3 From c983f0e8677d5d686ea81de13f9c9ac34f21a2a7 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 29 Mar 2016 09:35:32 +0100 Subject: seccomp: Get compat syscalls from asm-generic header Move retrieval of compat syscall numbers into inline function defined in asm-generic header so that arches may override it. [ralf@linux-mips.org: Resolve merge conflict.] Suggested-by: Paul Burton Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Will Drewry Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12978/ Signed-off-by: Ralf Baechle --- include/asm-generic/seccomp.h | 14 ++++++++++++++ kernel/seccomp.c | 9 +-------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h index c9ccafa0d99a..e74072d23e69 100644 --- a/include/asm-generic/seccomp.h +++ b/include/asm-generic/seccomp.h @@ -29,4 +29,18 @@ #define __NR_seccomp_sigreturn __NR_rt_sigreturn #endif +#ifdef CONFIG_COMPAT +#ifndef get_compat_mode1_syscalls +static inline const int *get_compat_mode1_syscalls(void) +{ + static const int mode1_syscalls_32[] = { + __NR_seccomp_read_32, __NR_seccomp_write_32, + __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, + 0, /* null terminated */ + }; + return mode1_syscalls_32; +} +#endif +#endif /* CONFIG_COMPAT */ + #endif /* _ASM_GENERIC_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e1e5a354854e..737436ebb4fe 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -518,19 +518,12 @@ static int mode1_syscalls[] = { 0, /* null terminated */ }; -#ifdef CONFIG_COMPAT -static int mode1_syscalls_32[] = { - __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ -}; -#endif - static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = mode1_syscalls_32; + syscall_whitelist = get_compat_mode1_syscalls(); #endif do { if (*syscall_whitelist == this_syscall) -- cgit v1.2.3 From cb4253aa0f77f20be018970dbe5d01d78b930ef9 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 29 Mar 2016 09:35:34 +0100 Subject: secomp: Constify mode1 syscall whitelist These values are constant and should be marked as such. 
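Const-qualifying such a table typically lets the compiler place it in a read-only section, so the syscall whitelist cannot be scribbled on at run time. A stand-alone sketch (the numbers below are made up, not the real seccomp syscall list):

#include <stdio.h>

/* A const, file-scope table normally lands in .rodata, so a stray write
 * through a cast pointer typically faults instead of silently changing
 * the policy.  The terminator value is chosen for this example only. */
static const int whitelist[] = { 0, 1, 60, 15, -1 /* terminator */ };

static int is_allowed(int nr)
{
    const int *p;

    for (p = whitelist; *p != -1; p++)
        if (*p == nr)
            return 1;
    return 0;
}

int main(void)
{
    printf("syscall 60 allowed? %d\n", is_allowed(60));
    printf("syscall 2  allowed? %d\n", is_allowed(2));
    return 0;
}
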
Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: Will Drewry Cc: Andy Lutomirski Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12979/ Signed-off-by: Ralf Baechle --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 737436ebb4fe..a0ffcb1a2bee 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -513,14 +513,14 @@ static void seccomp_send_sigsys(int syscall, int reason) * To be fully secure this must be combined with rlimit * to limit the stack allocations too. */ -static int mode1_syscalls[] = { +static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, 0, /* null terminated */ }; static void __secure_computing_strict(int this_syscall) { - int *syscall_whitelist = mode1_syscalls; + const int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) syscall_whitelist = get_compat_mode1_syscalls(); -- cgit v1.2.3 From a831100aeefbe6d9f3e47a3e2712f82c042f1f5c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 May 2016 16:34:53 -0300 Subject: perf core: Generalize max_stack sysctl handler So that it can be used for other stack related knobs, such as the upcoming one to tweak the max number of of contexts per stack sample. In all those cases we can only change the value if there are no perf sessions collecting stacks, so they need to grab that mutex, etc. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8t3fk94wuzp8m2z1n4gc0s17@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/callchain.c | 5 +++-- kernel/sysctl.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9325e7dcba1..7fc89939ede9 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -228,7 +228,8 @@ exit_put: int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int new_value = sysctl_perf_event_max_stack, ret; + int *value = table->data; + int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; @@ -240,7 +241,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write, if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else - sysctl_perf_event_max_stack = new_value; + *value = new_value; mutex_unlock(&callchain_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..0ec6907a16b3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1149,7 +1149,7 @@ static struct ctl_table kern_table[] = { }, { .procname = "perf_event_max_stack", - .data = NULL, /* filled in by handler */ + .data = &sysctl_perf_event_max_stack, .maxlen = sizeof(sysctl_perf_event_max_stack), .mode = 0644, .proc_handler = perf_event_max_stack_handler, -- cgit v1.2.3 From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 28 Apr 2016 12:30:53 -0300 Subject: perf core: Pass max stack as a perf_callchain_entry context This makes perf_callchain_{user,kernel}() receive the max stack as context for the perf_callchain_entry, instead of accessing the global sysctl_perf_event_max_stack. 
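A compact sketch of this style of refactor, with invented names: bundle the output buffer and its per-call limit into one context argument so the stack walkers stop reading a global knob.

#include <stdio.h>

#define MAX_ENTRIES 8

struct callchain_entry {
    unsigned int nr;
    unsigned long ip[MAX_ENTRIES];
};

/* The context carries the output buffer together with its per-call limit,
 * so the walkers no longer consult a global "max stack" value directly. */
struct callchain_ctx {
    struct callchain_entry *entry;
    unsigned int max_stack;
};

static int store(struct callchain_ctx *ctx, unsigned long addr)
{
    if (ctx->entry->nr >= ctx->max_stack)
        return -1;                          /* stop the walk */
    ctx->entry->ip[ctx->entry->nr++] = addr;
    return 0;
}

static void walk_fake_stack(struct callchain_ctx *ctx)
{
    unsigned long addr;

    for (addr = 0x1000; store(ctx, addr) == 0; addr += 0x10)
        ;                                   /* pretend frame walk */
}

int main(void)
{
    struct callchain_entry e = { 0 };
    struct callchain_ctx ctx = { .entry = &e, .max_stack = 4 };

    walk_fake_stack(&ctx);
    printf("captured %u frames\n", e.nr);   /* 4, not MAX_ENTRIES */
    return 0;
}
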
Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/arc/kernel/perf_event.c | 6 +++--- arch/arm/kernel/perf_callchain.c | 10 +++++----- arch/arm64/kernel/perf_callchain.c | 14 +++++++------- arch/metag/kernel/perf_callchain.c | 10 +++++----- arch/mips/kernel/perf_event.c | 12 ++++++------ arch/powerpc/perf/callchain.c | 14 +++++++------- arch/s390/kernel/perf_event.c | 4 ++-- arch/sh/kernel/perf_callchain.c | 4 ++-- arch/sparc/kernel/perf_event.c | 14 +++++++------- arch/tile/kernel/perf_event.c | 6 +++--- arch/x86/events/core.c | 14 +++++++------- arch/xtensa/kernel/perf_event.c | 10 +++++----- include/linux/perf_event.h | 16 +++++++++++----- kernel/bpf/stackmap.c | 3 ++- kernel/events/callchain.c | 20 ++++++++++++-------- 15 files changed, 84 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 8b134cfe5e1f..6fd48021324b 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -48,7 +48,7 @@ struct arc_callchain_trace { static int callchain_trace(unsigned int addr, void *data) { struct arc_callchain_trace *ctrl = data; - struct perf_callchain_entry *entry = ctrl->perf_stuff; + struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff; perf_callchain_store(entry, addr); if (ctrl->depth++ < 3) @@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data) } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct arc_callchain_trace ctrl = { .depth = 0, @@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { /* * User stack can't be unwound trivially with kernel dwarf unwinder diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 27563befa8a2..bc552e813e7b 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -31,7 +31,7 @@ struct frame_tail { */ static struct frame_tail __user * user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct frame_tail buftail; unsigned long err; @@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct frame_tail __user *tail; @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->nr < sysctl_perf_event_max_stack) && + while ((entry->entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } @@ -89,13 +89,13 @@ static int callchain_trace(struct stackframe *fr, void *data) { - struct perf_callchain_entry *entry = data; + struct 
perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, fr->pc); return 0; } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe fr; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 32c3c6e70119..0d60150057cf 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -31,7 +31,7 @@ struct frame_tail { */ static struct frame_tail __user * user_backtrace(struct frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct frame_tail buftail; unsigned long err; @@ -76,7 +76,7 @@ struct compat_frame_tail { static struct compat_frame_tail __user * compat_user_backtrace(struct compat_frame_tail __user *tail, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct compat_frame_tail buftail; unsigned long err; @@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, } #endif /* CONFIG_COMPAT */ -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->nr < sysctl_perf_event_max_stack && + while (entry->entry->nr < entry->max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->nr < sysctl_perf_event_max_stack) && + while ((entry->entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif @@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry, */ static int callchain_trace(struct stackframe *frame, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, frame->pc); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe frame; diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index 252abc12a5a3..b3261a98b15b 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr) static struct metag_frame __user * user_backtrace(struct metag_frame __user *user_frame, - struct perf_callchain_entry *entry) + struct perf_callchain_entry_ctx *entry) { struct metag_frame frame; unsigned long calladdr; @@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp = regs->ctx.AX[0].U0; struct metag_frame __user *frame; @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) --frame; - while ((entry->nr < sysctl_perf_event_max_stack) && frame) + while ((entry->entry->nr < entry->max_stack) && frame) frame = user_backtrace(frame, entry); } @@ 
-78,13 +78,13 @@ static int callchain_trace(struct stackframe *fr, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, fr->pc); return 0; } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stackframe fr; diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 5021c546ad07..22395c7d7030 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -25,8 +25,8 @@ * the user stack callchains, we will add it here. */ -static void save_raw_perf_callchain(struct perf_callchain_entry *entry, - unsigned long reg29) +static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry, + unsigned long reg29) { unsigned long *sp = (unsigned long *)reg29; unsigned long addr; @@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->nr >= sysctl_perf_event_max_stack) + if (entry->entry->nr >= entry->max_stack) break; } } } -void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) { unsigned long sp = regs->regs[29]; #ifdef CONFIG_KALLSYMS @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } do { perf_callchain_store(entry, pc); - if (entry->nr >= sysctl_perf_event_max_stack) + if (entry->entry->nr >= entry->max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 22d9015c1acc..c9260c1dfdbc 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp) puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct perf_callchain_entry *entry, +static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) return rc; } -static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, +static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp, return mctx->mc_gregs; } -static void perf_callchain_user_32(struct perf_callchain_entry *entry, +static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned int sp, next_sp; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct 
perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; @@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (current_is_64bit()) perf_callchain_user_64(entry, regs); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index c3e4099b60a5..87035fa58bbe 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register); static int __perf_callchain_kernel(void *data, unsigned long address) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, address); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (user_mode(regs)) diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c index cc80b614b5fa..fa2c0cd23eaa 100644 --- a/arch/sh/kernel/perf_callchain.c +++ b/arch/sh/kernel/perf_callchain.c @@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name) static void callchain_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; if (reliable) perf_callchain_store(entry, addr); @@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain_store(entry, regs->pc); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index a4b8b5aed21c..bcc5376db74b 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void) } pure_initcall(init_hw_perf_events); -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ksp, fp; @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, } } #endif - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } static inline int @@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size) return (__range_not_ok(fp, size, TASK_SIZE) == 0); } -static void perf_callchain_user_64(struct perf_callchain_entry *entry, +static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ufp; @@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } -static void perf_callchain_user_32(struct perf_callchain_entry *entry, +static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long ufp; @@ 
-1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->nr < sysctl_perf_event_max_stack); + } while (entry->entry->nr < entry->max_stack); } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { u64 saved_fault_address = current_thread_info()->fault_address; u8 saved_fault_code = get_thread_fault_code(); diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index 8767060d70fb..6394c1ccb68e 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events); /* * Tile specific backtracing code for perf_events. */ -static inline void perf_callchain(struct perf_callchain_entry *entry, +static inline void perf_callchain(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct KBacktraceIterator kbt; @@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry, } } -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain(entry, regs); } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { perf_callchain(entry, regs); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5e5e76a52f58..07f2b01cfb72 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name) static int backtrace_address(void *data, unsigned long addr, int reliable) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; return perf_callchain_store(entry, addr); } @@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = { }; void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ @@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment) #include static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. 
*/ unsigned long ss_base, cs_base; @@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #else static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; @@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; pagefault_disable(); - while (entry->nr < sysctl_perf_event_max_stack) { + while (entry->entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c index a6b00b3af429..ef90479e0397 100644 --- a/arch/xtensa/kernel/perf_event.c +++ b/arch/xtensa/kernel/perf_event.c @@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event) static int callchain_trace(struct stackframe *frame, void *data) { - struct perf_callchain_entry *entry = data; + struct perf_callchain_entry_ctx *entry = data; perf_callchain_store(entry, frame->pc); return 0; } -void perf_callchain_kernel(struct perf_callchain_entry *entry, +void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack, + xtensa_backtrace_kernel(regs, entry->max_stack, callchain_trace, NULL, entry); } -void perf_callchain_user(struct perf_callchain_entry *entry, +void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - xtensa_backtrace_user(regs, sysctl_perf_event_max_stack, + xtensa_backtrace_user(regs, entry->max_stack, callchain_trace, entry); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9e1c3ada91c4..dbd18246b36e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,11 @@ struct perf_callchain_entry { __u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ }; +struct perf_callchain_entry_ctx { + struct perf_callchain_entry *entry; + u32 max_stack; +}; + struct perf_raw_record { u32 size; void *data; @@ -1063,19 +1068,20 @@ extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); -extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs); -extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs); +extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); +extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); extern int 
sysctl_perf_event_max_stack; -static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip) +static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - if (entry->nr < sysctl_perf_event_max_stack) { + struct perf_callchain_entry *entry = ctx->entry; + if (entry->nr < ctx->max_stack) { entry->ip[entry->nr++] = ip; return 0; } else { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index f5a19548be12..a82d7605db3f 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 7fc89939ede9..af95ad92893a 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); @@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) goto exit_put; + ctx.entry = entry; + ctx.max_stack = max_stack; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 May 2016 18:08:32 -0300 Subject: perf core: Add a 'nr' field to perf_event_callchain_context We will use it to count how many addresses are in the entry->ip[] array, excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really return the number of entries specified by the user via the relevant sysctl, kernel.perf_event_max_contexts, or via the per event perf_event_attr.sample_max_stack knob. 
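As a rough illustration of the accounting that this context structure enables, here is a minimal, self-contained userspace sketch (hypothetical type and function names, not the kernel API; the real code uses perf_callchain_entry_ctx and perf_callchain_store as in the diff below):

    /* Toy model: a ctx wrapper caps how many frames get stored. */
    #include <stdio.h>

    struct entry { unsigned int nr; unsigned long long ip[128]; };
    struct entry_ctx { struct entry *entry; unsigned int max_stack; unsigned int nr; };

    /* Mirrors the shape of perf_callchain_store(): refuse further stores
     * once the ctx-local counter reaches max_stack. */
    static int ctx_store(struct entry_ctx *ctx, unsigned long long ip)
    {
            if (ctx->nr < ctx->max_stack) {
                    ctx->entry->ip[ctx->entry->nr++] = ip;
                    ++ctx->nr;
                    return 0;
            }
            return -1; /* no more room, stop walking the stack */
    }

    int main(void)
    {
            struct entry e = { 0 };
            struct entry_ctx ctx = { &e, 3, 0 };
            unsigned long long ip;

            for (ip = 0x1000; ip < 0x1008; ip++)
                    if (ctx_store(&ctx, ip))
                            break;

            printf("stored %u of 8 candidate frames (max_stack=%u)\n", e.nr, ctx.max_stack);
            return 0;
    }

With max_stack set to 3, only the first three of the eight candidate addresses are recorded, which is exactly the cut-off behaviour the callchain walkers rely on.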
This way we keep the perf_sample->ip_callchain->nr meaning, that is the number of entries, be it real addresses or PERF_CONTEXT_ entries, while honouring the max_stack knobs, i.e. the end result will be max_stack entries if we have at least that many entries in a given stack trace. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/arm/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/perf_callchain.c | 4 ++-- arch/metag/kernel/perf_callchain.c | 2 +- arch/mips/kernel/perf_event.c | 4 ++-- arch/powerpc/perf/callchain.c | 4 ++-- arch/sparc/kernel/perf_event.c | 6 +++--- arch/x86/events/core.c | 4 ++-- include/linux/perf_event.h | 6 ++++-- kernel/events/callchain.c | 3 +-- 9 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index bc552e813e7b..22bf1f64d99a 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs tail = (struct frame_tail __user *)regs->ARM_fp - 1; - while ((entry->entry->nr < entry->max_stack) && + while ((entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = user_backtrace(tail, entry); } diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 0d60150057cf..713ca824f266 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct frame_tail __user *)regs->regs[29]; - while (entry->entry->nr < entry->max_stack && + while (entry->nr < entry->max_stack && tail && !((unsigned long)tail & 0xf)) tail = user_backtrace(tail, entry); } else { @@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, tail = (struct compat_frame_tail __user *)regs->compat_fp - 1; - while ((entry->entry->nr < entry->max_stack) && + while ((entry->nr < entry->max_stack) && tail && !((unsigned long)tail & 0x3)) tail = compat_user_backtrace(tail, entry); #endif diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c index b3261a98b15b..3e8e048040df 100644 --- a/arch/metag/kernel/perf_callchain.c +++ b/arch/metag/kernel/perf_callchain.c @@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs --frame; - while ((entry->entry->nr < entry->max_stack) && frame) + while ((entry->nr < entry->max_stack) && frame) frame = user_backtrace(frame, entry); } diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c index 22395c7d7030..d64056e0bb56 100644 --- a/arch/mips/kernel/perf_event.c +++ b/arch/mips/kernel/perf_event.c @@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry, addr = *sp++; if (__kernel_text_address(addr)) { perf_callchain_store(entry, addr); - if (entry->entry->nr >= entry->max_stack) + if (entry->nr >= entry->max_stack) break; } } @@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } do { perf_callchain_store(entry, pc); - if (entry->entry->nr >= entry->max_stack) + if (entry->nr >= entry->max_stack) break; pc = unwind_stack(current, &sp, pc, &ra); } while (pc); diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 
c9260c1dfdbc..f68f213dc36c 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index bcc5376db74b..710f3278d448 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, } } #endif - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } static inline int @@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, pc = sf.callers_pc; ufp = (unsigned long)sf.fp + STACK_BIAS; perf_callchain_store(entry, pc); - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, @@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, ufp = (unsigned long)sf.fp; } perf_callchain_store(entry, pc); - } while (entry->entry->nr < entry->max_stack); + } while (entry->nr < entry->max_stack); } void diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 07f2b01cfb72..5de96a18cd9c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; @@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs return; pagefault_disable(); - while (entry->entry->nr < entry->max_stack) { + while (entry->nr < entry->max_stack) { unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dbd18246b36e..3803bb1a862b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -64,6 +64,7 @@ struct perf_callchain_entry { struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; + u32 nr; }; struct perf_raw_record { @@ -1080,9 +1081,10 @@ extern int sysctl_perf_event_max_stack; static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - struct perf_callchain_entry *entry = ctx->entry; - if (entry->nr < ctx->max_stack) { + if (ctx->nr < ctx->max_stack) { + struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; + ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index af95ad92893a..8774ff86debb 100644 --- a/kernel/events/callchain.c +++ 
b/kernel/events/callchain.c @@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; - - entry->nr = init_nr; + ctx.nr = entry->nr = init_nr; if (kernel && !user_mode(regs)) { if (add_mark) -- cgit v1.2.3 From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:01:50 -0300 Subject: perf core: Add perf_callchain_store_context() helper We need to have different helpers to account how many contexts we have in the sample and for real addresses, so do it now as a prep patch, to ease review. Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- arch/powerpc/perf/callchain.c | 6 +++--- include/linux/perf_event.h | 2 ++ kernel/events/callchain.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index f68f213dc36c..f62597dbd757 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re next_ip = regs->nip; lr = regs->link; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } @@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3803bb1a862b..2024b14cc2b1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1079,6 +1079,8 @@ extern void put_callchain_buffers(void); extern int sysctl_perf_event_max_stack; +#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context) + static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8774ff86debb..ca645736a983 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); perf_callchain_kernel(&ctx, regs); } @@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(&ctx, PERF_CONTEXT_USER); + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); perf_callchain_user(&ctx, regs); } } -- cgit v1.2.3 From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 May 2016 13:06:21 -0300 Subject: perf core: Separate accounting of
contexts and real addresses in a stack trace The perf_sample->ip_callchain->nr value includes all the entries in the ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc}, while what the user expects is that what is in the kernel.perf_event_max_stack sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be honoured in terms of IP addresses in the stack trace. So allocate a bunch of extra entries for contexts, and do the accounting via perf_callchain_entry_ctx struct members. A new sysctl, kernel.perf_event_max_contexts_per_stack is also introduced for investigating possible bugs in the callchain implementation by some arch. Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: David Ahern Cc: Frederic Weisbecker Cc: He Kuang Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Cc: Zefan Li Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/perf_event.h | 18 ++++++++++++++++-- include/uapi/linux/perf_event.h | 1 + kernel/events/callchain.c | 10 +++++++++- kernel/sysctl.c | 9 +++++++++ 5 files changed, 49 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index daabdd7ee543..a3683ce2a2f3 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -61,6 +61,7 @@ show up in /proc/sys/kernel: - perf_cpu_time_max_percent - perf_event_paranoid - perf_event_max_stack +- perf_event_max_contexts_per_stack - pid_max - powersave-nap [ PPC only ] - printk @@ -668,6 +669,19 @@ The default value is 127. ============================================================== +perf_event_max_contexts_per_stack: + +Controls maximum number of stack frame context entries for +(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for +instance, when using 'perf record -g' or 'perf trace --call-graph fp'. + +This can only be done when no events are in use that have callchains +enabled, otherwise writing to this file will return -EBUSY. + +The default value is 8. + +============================================================== + pid_max: PID allocation wrap value. 
When the kernel's next PID value diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2024b14cc2b1..6b87be908790 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -65,6 +65,8 @@ struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; + short contexts; + bool contexts_maxed; }; struct perf_raw_record { @@ -1078,12 +1080,24 @@ extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); extern int sysctl_perf_event_max_stack; +extern int sysctl_perf_event_max_contexts_per_stack; -#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context) +static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) +{ + if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { + struct perf_callchain_entry *entry = ctx->entry; + entry->ip[entry->nr++] = ip; + ++ctx->contexts; + return 0; + } else { + ctx->contexts_maxed = true; + return -1; /* no more room, stop walking the stack */ + } +} static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { - if (ctx->nr < ctx->max_stack) { + if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 43fc8d213472..36ce552cf6a9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -862,6 +862,7 @@ enum perf_event_type { }; #define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index ca645736a983..179ef4640964 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -19,11 +19,13 @@ struct callchain_cpus_entries { }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + - sizeof(__u64) * sysctl_perf_event_max_stack); + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); @@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, ctx.entry = entry; ctx.max_stack = max_stack; ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) @@ -228,6 +232,10 @@ exit_put: return entry; } +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. 
+ */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ec6907a16b3..bec4c11c47d6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &six_hundred_forty_kb, }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = &zero, + .extra2 = &one_thousand, + }, #endif #ifdef CONFIG_KMEMCHECK { -- cgit v1.2.3 From 60f05e86cf3e8c5f379fe5ba94634fcec17dd67e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 18 May 2016 17:55:28 +0530 Subject: cpufreq: schedutil: Improve print messages with pr_fmt Prefix print messages with KBUILD_MODNAME, i.e. 'cpufreq_schedutil: '. This helps to keep similar formatting for all the print messages particular to a file and identify those easily in kernel logs. It's already done this way for the rest of the governors. Along with that, remove the (now) redundant bits from a print message. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 154ae3a51e86..14c4aa25cc45 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -9,6 +9,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); - pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret); + pr_err("initialization failed (error %d)\n", ret); return ret; } -- cgit v1.2.3 From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:02 -0700 Subject: time: add missing implementation for timespec64_add_safe() timespec64_add_safe() has been defined in time64.h for 64 bit systems. But, 32 bit systems only have an extern function prototype defined. Provide a definition for the above function. The function will be necessary as part of y2038 changes. struct timespec is not y2038 safe. All references to timespec will be replaced by struct timespec64. The function is meant to be a replacement for timespec_add_safe(). The implementation is similar to timespec_add_safe(). Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 4 +--- kernel/time/time.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/time64.h b/include/linux/time64.h index 367d5af899e8..1778937221bf 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n /* * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME_T_MAX if the returned value would be - * smaller then either of the arguments. + * overflow.
It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { diff --git a/kernel/time/time.c b/kernel/time/time.c index a4064b612066..cb1f83eb5599 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } + +#if __BITS_PER_LONG != 64 + +/* + * Add two timespec64 values and do a safety check for overflow. + * It's assumed that both values are valid (>= 0). + * And, each timespec64 is in normalized form. + */ +struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs) +{ + struct timespec64 res; + + set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) { + res.tv_sec = TIME64_MAX; + res.tv_nsec = 0; + } + + return res; +} + +#endif -- cgit v1.2.3 From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Thu, 19 May 2016 17:09:08 -0700 Subject: time: remove timespec_add_safe() All references to timespec_add_safe() now use timespec64_add_safe(). The plan is to replace struct timespec references with struct timespec64 throughout the kernel as timespec is not y2038 safe. Drop timespec_add_safe() and use timespec64_add_safe() for all architectures. Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Acked-by: John Stultz Cc: Thomas Gleixner Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/time64.h | 15 +++++++-------- kernel/time/time.c | 4 ---- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/time64.h b/include/linux/time64.h index 1778937221bf..7e5d2fa9ac46 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec * # define timespec64_equal timespec_equal # define timespec64_compare timespec_compare # define set_normalized_timespec64 set_normalized_timespec -# define timespec64_add_safe timespec_add_safe # define timespec64_add timespec_add # define timespec64_sub timespec_sub # define timespec64_valid timespec_valid @@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); -/* - * timespec64_add_safe assumes both values are positive and checks for - * overflow. It will return TIME64_MAX in case of overflow. - */ -extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, - const struct timespec64 rhs); - static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { @@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) #endif +/* + * timespec64_add_safe assumes both values are positive and checks for + * overflow. It will return TIME64_MAX in case of overflow. 
+ */ +extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, + const struct timespec64 rhs); + #endif /* _LINUX_TIME64_H */ diff --git a/kernel/time/time.c b/kernel/time/time.c index cb1f83eb5599..667b9335f5d6 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs, return res; } -#if __BITS_PER_LONG != 64 - /* * Add two timespec64 values and do a safety check for overflow. * It's assumed that both values are valid (>= 0). @@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } - -#endif -- cgit v1.2.3 From 02a982a6ec631d871571f940ca13817551759884 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:26 -0700 Subject: workqueue: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int) Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f5068e94003..6751b18fd9ac 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -437,7 +437,7 @@ static void *work_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int work_fixup_init(void *addr, enum debug_obj_state state) +static bool work_fixup_init(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -445,9 +445,9 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_init(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -456,7 +456,7 @@ static int work_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int work_fixup_activate(void *addr, enum debug_obj_state state) +static bool work_fixup_activate(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -471,16 +471,16 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); - return 0; + return false; } WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -488,7 +488,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int work_fixup_free(void *addr, enum debug_obj_state state) +static bool work_fixup_free(void *addr, enum debug_obj_state state) { struct work_struct *work = addr; @@ -496,9 +496,9 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: cancel_work_sync(work); debug_object_free(work, &work_debug_descr); - return 1; + return true; default: - return 0; + return false; } } -- cgit v1.2.3 From e3252464da222ef2c2ca52dff383a824080ea3d5 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:29 -0700 Subject: timer: update debugobjects fixup 
callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/hrtimer.c | 18 +++++++++--------- kernel/time/timer.c | 30 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index fa0b983290cf..f962a58c0957 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -342,9 +342,9 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -353,19 +353,19 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_NOTAVAILABLE: WARN_ON_ONCE(1); - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -373,7 +373,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; @@ -381,9 +381,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 73164c3aa56b..be33481a4da1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -493,7 +493,7 @@ static void *timer_debug_hint(void *addr) * fixup_init is called when: * - an active object is initialized */ -static int timer_fixup_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -501,9 +501,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -518,7 +518,7 @@ static void stub_timer(unsigned long data) * - an active object is activated * - an unknown object is activated (might be a statically initialized object) */ -static int timer_fixup_activate(void *addr, enum debug_obj_state state) +static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -534,18 +534,18 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } - return 0; + return false; case ODEBUG_STATE_ACTIVE: WARN_ON(1); default: - return 0; + return false; } } @@ -553,7 +553,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * fixup_free is called when: * - an active object is freed */ -static int timer_fixup_free(void *addr, enum debug_obj_state state) +static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -561,9 +561,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); - return 1; + return true; default: - return 0; + return false; } } @@ -571,7 +571,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ -static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; @@ -584,13 +584,13 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) * is tracked in the object tracker. */ debug_object_init(timer, &timer_debug_descr); - return 0; + return false; } else { setup_timer(timer, stub_timer, 0); - return 1; + return true; } default: - return 0; + return false; } } -- cgit v1.2.3 From 3263d28eb5b93b3c1b024366df87b1d0c774228f Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:32 -0700 Subject: rcu: update debugobjects fixup callbacks return type Update the return type to use bool instead of int, corresponding to change (debugobjects: make fixup functions return bool instead of int). Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/update.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3ccdc8eebc5a..a9df198eb22d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -386,7 +386,7 @@ void destroy_rcu_head(struct rcu_head *head) * - an unknown object is activated (might be a statically initialized object) * Activation is performed internally by call_rcu(). */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) { struct rcu_head *head = addr; @@ -399,9 +399,9 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) */ debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); - return 0; + return false; default: - return 1; + return true; } } -- cgit v1.2.3 From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001 From: "Du, Changbin" Date: Thu, 19 May 2016 17:09:41 -0700 Subject: debugobjects: insulate non-fixup logic related to static obj from fixup callbacks When activating a static object we need to make sure that the object is tracked in the object tracker. If it is a non-static object then the activation is illegal. In the previous implementation, each subsystem needed to take care of this in its fixup callbacks.
Actually, we can put it into the debugobjects core. Thus we can save duplicated code, and have *pure* fixup callbacks. To achieve this, a new callback "is_static_object" is introduced to let the type specific code decide whether an object is static or not. If it is, we track it in the object tracker; otherwise we give a warning and invoke the fixup callback. This change has passed the debugobjects selftest, and I also did some testing with all debugobjects support enabled. Finally, I have a concern about the fixups: can a fixup change an object that is in an incorrect state? The 'addr' may not point to any valid object if a non-static object is not tracked, so changing such an object can overwrite someone else's memory and cause unexpected behaviour. For example, timer_fixup_activate binds the timer to the function stub_timer. Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com [changbin.du@intel.com: improve code comments where invoke the new is_static_object callback] Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com Signed-off-by: Du, Changbin Cc: Jonathan Corbet Cc: Josh Triplett Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tejun Heo Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/debugobjects.h | 2 ++ kernel/rcu/update.c | 26 +++-------------------- kernel/time/hrtimer.c | 7 +------ kernel/time/timer.c | 43 +++++++++++++------------------------- kernel/workqueue.c | 42 ++++++++----------------------------- lib/debugobjects.c | 49 +++++++++++++++++++++++++++++--------------- 6 files changed, 60 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index a899f10c9365..46056cb161fc 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -38,6 +38,7 @@ struct debug_obj { * @name: name of the object typee * @debug_hint: function returning address, which have associated * kernel symbol, to allow identify the object + * @is_static_object return true if the obj is static, otherwise return false * @fixup_init: fixup function, which is called when the init check * fails. All fixup functions must return true if fixup * was successful, otherwise return false @@ -53,6 +54,7 @@ struct debug_obj { struct debug_obj_descr { const char *name; void *(*debug_hint)(void *addr); + bool (*is_static_object)(void *addr); bool (*fixup_init)(void *addr, enum debug_obj_state state); bool (*fixup_activate)(void *addr, enum debug_obj_state state); bool (*fixup_destroy)(void *addr, enum debug_obj_state state); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a9df198eb22d..3e888cd5a594 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +static bool rcuhead_is_static_object(void *addr) { - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker.
- */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return false; - default: - return true; - } + return true; } /** @@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, + .is_static_object = rcuhead_is_static_object, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f962a58c0957..8c7392c4fdbd 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - WARN_ON_ONCE(1); - return false; - case ODEBUG_STATE_ACTIVE: WARN_ON(1); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index be33481a4da1..3a95f9728778 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr) return ((struct timer_list *) addr)->function; } +static bool timer_is_static_object(void *addr) +{ + struct timer_list *timer = addr; + + return (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC); +} + /* * fixup_init is called when: * - an active object is initialized @@ -516,30 +524,16 @@ static void stub_timer(unsigned long data) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (timer->entry.pprev == NULL && - timer->entry.next == TIMER_ENTRY_STATIC) { - debug_object_init(timer, &timer_debug_descr); - debug_object_activate(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } - return false; + setup_timer(timer, stub_timer, 0); + return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); @@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.next == TIMER_ENTRY_STATIC) { - /* - * This is not really a fixup. The timer was - * statically initialized. We just make sure that it - * is tracked in the object tracker. 
- */ - debug_object_init(timer, &timer_debug_descr); - return false; - } else { - setup_timer(timer, stub_timer, 0); - return true; - } + setup_timer(timer, stub_timer, 0); + return true; default: return false; } @@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, + .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6751b18fd9ac..e1c0e996b5ae 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr) return ((struct work_struct *) addr)->func; } +static bool work_is_static_object(void *addr) +{ + struct work_struct *work = addr; + + return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work)); +} + /* * fixup_init is called when: * - an active object is initialized @@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state) } } -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - */ -static bool work_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct work_struct *work = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. The work struct was - * statically initialized. We just make sure that it - * is tracked in the object tracker. - */ - if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { - debug_object_init(work, &work_debug_descr); - debug_object_activate(work, &work_debug_descr); - return false; - } - WARN_ON_ONCE(1); - return false; - - case ODEBUG_STATE_ACTIVE: - WARN_ON(1); - - default: - return false; - } -} - /* * fixup_free is called when: * - an active object is freed @@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", .debug_hint = work_debug_hint, + .is_static_object = work_is_static_object, .fixup_init = work_fixup_init, - .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, }; diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2f07c8c697b8..a8e12601eb37 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * This happens when a static object is activated. We - * let the type specific code decide whether this is - * true or not. + * We are here when a static object is activated. We + * let the type specific code confirm whether this is + * true or not. if true, we just make sure that the + * static object is tracked in the object tracker. If + * not, this must be a bug, so we try to fix it up. */ - if (debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE)) { + if (descr->is_static_object && descr->is_static_object(addr)) { + /* track this static object */ + debug_object_init(addr, descr); + debug_object_activate(addr, descr); + } else { debug_print_object(&o, "activate"); - return -EINVAL; + ret = debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE); + return ret ? 
0 : -EINVAL; } return 0; } @@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); /* - * Maybe the object is static. Let the type specific - * code decide what to do. + * Maybe the object is static, and we let the type specific + * code confirm. Track this static object if true, else invoke + * fixup. */ - if (debug_object_fixup(descr->fixup_assert_init, addr, - ODEBUG_STATE_NOTAVAILABLE)) + if (descr->is_static_object && descr->is_static_object(addr)) { + /* Track this static object */ + debug_object_init(addr, descr); + } else { debug_print_object(&o, "assert_init"); + debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE); + } return; } @@ -792,6 +805,13 @@ struct self_test { static __initdata struct debug_obj_descr descr_type_test; +static bool __init is_static_object(void *addr) +{ + struct self_test *obj = addr; + + return obj->static_init; +} + /* * fixup_init is called when: * - an active object is initialized @@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state) /* * fixup_activate is called when: * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) + * - an unknown non-static object is activated */ static bool __init fixup_activate(void *addr, enum debug_obj_state state) { @@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (obj->static_init == 1) { - debug_object_init(obj, &descr_type_test); - debug_object_activate(obj, &descr_type_test); - return false; - } return true; - case ODEBUG_STATE_ACTIVE: debug_object_deactivate(obj, &descr_type_test); debug_object_activate(obj, &descr_type_test); @@ -916,6 +930,7 @@ out: static __initdata struct debug_obj_descr descr_type_test = { .name = "selftest", + .is_static_object = is_static_object, .fixup_init = fixup_init, .fixup_activate = fixup_activate, .fixup_destroy = fixup_destroy, -- cgit v1.2.3 From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Thu, 19 May 2016 17:09:56 -0700 Subject: kernel/padata.c: removed unused code By accident I stumbled across code that has never been used. This driver has EXPORT_SYMBOL functions, and the only user of the code is pcrypt.c, but this only uses a subset of the exported symbols. According to 'git log -G', the functions, padata_set_cpumasks, padata_add_cpu, and padata_remove_cpu have never been used since they were first introduced. This patch removes the unused code. On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than 4k smaller. kbuild_hp> size $KBUILD_OUTPUT/vmlinux text data bss dec hex filename 10566658 4678360 1122304 16367322 f9beda vmlinux 10561984 4678360 1122304 16362648 f9ac98 vmlinux On another config, 32 bit, the saving is about 0.5k bytes. kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux 6012005 2409513 2785280 11206798 ab008e vmlinux 6011491 2409513 2785280 11206284 aafe8c vmlinux Signed-off-by: Richard Cochran Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/padata.h | 5 ---- kernel/padata.c | 64 -------------------------------------------------- 2 files changed, 69 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 438694650471..113ee626a4dc 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst, extern void padata_do_serial(struct padata_priv *padata); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); -extern int padata_set_cpumasks(struct padata_instance *pinst, - cpumask_var_t pcpumask, - cpumask_var_t cbcpumask); -extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask); -extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask); extern int padata_start(struct padata_instance *pinst); extern void padata_stop(struct padata_instance *pinst); extern int padata_register_cpumask_notifier(struct padata_instance *pinst, diff --git a/kernel/padata.c b/kernel/padata.c index b38bea9c466a..67ddd4acde9d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -606,33 +606,6 @@ out_replace: return 0; } -/** - * padata_set_cpumasks - Set both parallel and serial cpumasks. The first - * one is used by parallel workers and the second one - * by the wokers doing serialization. - * - * @pinst: padata instance - * @pcpumask: the cpumask to use for parallel workers - * @cbcpumask: the cpumsak to use for serial workers - */ -int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, - cpumask_var_t cbcpumask) -{ - int err; - - mutex_lock(&pinst->lock); - get_online_cpus(); - - err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); - - put_online_cpus(); - mutex_unlock(&pinst->lock); - - return err; - -} -EXPORT_SYMBOL(padata_set_cpumasks); - /** * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value * equivalent to @cpumask. @@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) return 0; } - /** - * padata_add_cpu - add a cpu to one or both(parallel and serial) - * padata cpumasks. - * - * @pinst: padata instance - * @cpu: cpu to add - * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. 
- * The @mask may be any combination of the following flags: - * PADATA_CPU_SERIAL - serial cpumask - * PADATA_CPU_PARALLEL - parallel cpumask - */ - -int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) -{ - int err; - - if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) - return -EINVAL; - - mutex_lock(&pinst->lock); - - get_online_cpus(); - if (mask & PADATA_CPU_SERIAL) - cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); - if (mask & PADATA_CPU_PARALLEL) - cpumask_set_cpu(cpu, pinst->cpumask.pcpu); - - err = __padata_add_cpu(pinst, cpu); - put_online_cpus(); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_add_cpu); - static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd = NULL; @@ -1091,7 +1028,6 @@ err_free_inst: err: return NULL; } -EXPORT_SYMBOL(padata_alloc); /** * padata_free - free a padata instance -- cgit v1.2.3 From 19d795b677bda354644cfb87a196b087fdc2a965 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 May 2016 17:09:59 -0700 Subject: kernel/padata.c: hide unused functions A recent cleanup removed some exported functions that were not used anywhere, which in turn exposed the fact that some other functions in the same file are only used in some configurations. We now get a warning about them when CONFIG_HOTPLUG_CPU is disabled: kernel/padata.c:670:12: error: '__padata_remove_cpu' defined but not used [-Werror=unused-function] static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) ^~~~~~~~~~~~~~~~~~~ kernel/padata.c:650:12: error: '__padata_add_cpu' defined but not used [-Werror=unused-function] static int __padata_add_cpu(struct padata_instance *pinst, int cpu) This rearranges the code so the __padata_remove_cpu/__padata_add_cpu functions are within the #ifdef that protects the code that calls them. 
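The pattern here is generic: a static helper that is only referenced from code compiled under CONFIG_HOTPLUG_CPU has to live inside the same #ifdef, otherwise gcc flags it with -Wunused-function whenever the option is off. A minimal, self-contained sketch of the idea (FEATURE_HOTPLUG and the function names are illustrative stand-ins, not the actual padata symbols):

    #include <stdio.h>

    #ifdef FEATURE_HOTPLUG
    /*
     * Only called from the callback below, so it is compiled out together
     * with it; hoisting it above the #ifdef would trigger -Wunused-function
     * in a FEATURE_HOTPLUG=n build.
     */
    static int __example_add_cpu(int cpu)
    {
        return cpu >= 0 ? 0 : -1;
    }

    static int example_hotplug_callback(int cpu)
    {
        return __example_add_cpu(cpu);
    }
    #endif

    int main(void)
    {
    #ifdef FEATURE_HOTPLUG
        printf("callback returned %d\n", example_hotplug_callback(1));
    #else
        printf("feature compiled out\n");
    #endif
        return 0;
    }

Compiling this with -Wall both with and without -DFEATURE_HOTPLUG stays warning-free, which is the property the rearrangement restores for CONFIG_HOTPLUG_CPU=n kernels.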
[akpm@linux-foundation.org: coding-style fixes] Fixes: 4ba6d78c671e ("kernel/padata.c: removed unused code") Signed-off-by: Arnd Bergmann Cc: Richard Cochran Cc: Steffen Klassert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/padata.c | 74 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 67ddd4acde9d..993278895ccc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -647,6 +647,43 @@ out: } EXPORT_SYMBOL(padata_set_cpumask); +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err = -EINVAL; + + __padata_start(pinst); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + static int __padata_add_cpu(struct padata_instance *pinst, int cpu) { struct parallel_data *pd; @@ -726,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) } EXPORT_SYMBOL(padata_remove_cpu); -/** - * padata_start - start the parallel processing - * - * @pinst: padata instance to start - */ -int padata_start(struct padata_instance *pinst) -{ - int err = 0; - - mutex_lock(&pinst->lock); - - if (pinst->flags & PADATA_INVALID) - err =-EINVAL; - - __padata_start(pinst); - - mutex_unlock(&pinst->lock); - - return err; -} -EXPORT_SYMBOL(padata_start); - -/** - * padata_stop - stop the parallel processing - * - * @pinst: padata instance to stop - */ -void padata_stop(struct padata_instance *pinst) -{ - mutex_lock(&pinst->lock); - __padata_stop(pinst); - mutex_unlock(&pinst->lock); -} -EXPORT_SYMBOL(padata_stop); - -#ifdef CONFIG_HOTPLUG_CPU - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || -- cgit v1.2.3 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that field for reference count of the struct page is _count and atomic type. They would try to handle it directly and this could break the purpose of page reference count tracepoint. To prevent direct _count modification, this patch rename it to _refcount and add warning message on the code. After that, developer who need to handle reference count will find that field should not be accessed directly. [akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. 
Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 10 +++++----- arch/tile/mm/init.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++---------- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++-- fs/proc/page.c | 2 +- include/linux/mm.h | 2 +- include/linux/mm_types.h | 14 ++++++++----- include/linux/page_ref.h | 26 ++++++++++++------------- include/linux/pagemap.h | 8 ++++---- kernel/kexec_core.c | 2 +- mm/huge_memory.c | 4 ++-- mm/internal.h | 2 +- mm/page_alloc.c | 4 ++-- mm/slub.c | 4 ++-- mm/vmscan.c | 4 ++-- 17 files changed, 58 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index d9cb65cf5cfd..fb0e1f2a19cc 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock. Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate in head page's ->_count. + - get_page()/put_page() and GUP operate in head page's ->_refcount. - - ->_count in tail pages is always zero: get_page_unless_zero() never + - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeed on tail pages. - map/unmap of the pages with PTE entry increment/decrement ->_mapcount @@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). -split_huge_page uses migration entries to stabilize page->_count and +split_huge_page uses migration entries to stabilize page->_refcount and page->_mapcount. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). -All tail pages has zero ->_count until atomic_add(). It prevent scanner +All tail pages has zero ->_refcount until atomic_add(). It prevent scanner from geting reference to tail page up to the point. After the atomic_add() -we don't care about ->_count value. We already known how many references +we don't care about ->_refcount value. We already known how many references with should uncharge from head page. For head page get_page_unless_zero() will succeed and we don't mind. It's diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index a0582b7f41d3..adce25462b0d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end) * Hacky direct set to avoid unnecessary * lock take/release for EVERY page here. */ - p->_count.counter = 0; + p->_refcount.counter = 0; p->_mapcount.counter = -1; } init_page_count(page); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822f44..d597e432e195 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. 
*/ static void bio_pageinc(struct bio *bio) diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index d9d6022c5aca..d2209147dc89 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma) if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _counts */ + /* drop page _refcounts */ for (pg = 0; pg < msc->nr_pages; pg++) { struct page *page = msc_buffer_get_page(msc, pg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f3456798c596..bd947704b59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i))) goto err_unmap; - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_add(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -452,8 +452,8 @@ err_unmap: while (--i >= 0) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq, */ split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->dma_info.page[i]._count); + page_ref_add(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz, PCI_DMA_FROMDEVICE); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->dma_info.page[i]._count); + page_ref_sub(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(&wi->dma_info.page[i]); } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 3aabfc0adefe..73dd525fbf08 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev, /* Incr page ref count to reuse on allocation failure * so that it doesn't get freed while freeing SKB. */ - atomic_inc(¤t_bd->data->_count); + page_ref_inc(current_bd->data); goto out; } @@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget) * freeing SKB. 
*/ - atomic_inc(&sw_rx_data->data->_count); + page_ref_inc(sw_rx_data->data); rxq->rx_alloc_errors++; qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num); diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 727f799757ab..1193a54ea2b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -734,7 +734,7 @@ static inline void get_page(struct page *page) page = compound_head(page); /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. + * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2d75b4fa86c..1fda9c99ef95 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,9 +73,9 @@ struct page { unsigned long counters; #else /* - * Keep _count separate from slub cmpxchg_double data. - * As the rest of the double word is protected by - * slab_lock but _count is not. + * Keep _refcount separate from slub cmpxchg_double + * data. As the rest of the double word is protected by + * slab_lock but _refcount is not. */ unsigned counters; #endif @@ -97,7 +97,11 @@ struct page { }; int units; /* SLOB */ }; - atomic_t _count; /* Usage count, see below. */ + /* + * Usage count, *USE WRAPPER FUNCTION* + * when manual accounting. See page_ref.h + */ + atomic_t _refcount; }; unsigned int active; /* SLAB */ }; @@ -248,7 +252,7 @@ struct page_frag_cache { __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. + * containing page->_refcount every time we allocate a fragment. 
*/ unsigned int pagecnt_bias; bool pfmemalloc; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index e596d5d9540e..8b5e0a9f2431 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v) static inline int page_ref_count(struct page *page) { - return atomic_read(&page->_count); + return atomic_read(&page->_refcount); } static inline int page_count(struct page *page) { - return atomic_read(&compound_head(page)->_count); + return atomic_read(&compound_head(page)->_refcount); } static inline void set_page_count(struct page *page, int v) { - atomic_set(&page->_count, v); + atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) __page_ref_set(page, v); } @@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { - atomic_add(nr, &page->_count); + atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { - atomic_sub(nr, &page->_count); + atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { - atomic_inc(&page->_count); + atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { - atomic_dec(&page->_count); + atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - int ret = atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); @@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) static inline int page_ref_dec_and_test(struct page *page) { - int ret = atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); @@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page) static inline int page_ref_dec_return(struct page *page) { - int ret = atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); @@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page) static inline int page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) static inline int page_ref_freeze(struct page *page, int count) { - int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) __page_ref_freeze(page, count, ret); @@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) 
VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - atomic_set(&page->_count, count); + atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7e1ab155c67c..fe1513ffb7bf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. - * If the page is free (_count == 0), then _count is untouched, and 0 - * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * If the page is free (_refcount == 0), then _refcount is untouched, and 0 + * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _count. + * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher @@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold); * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * - * Remove-side that cares about stability of _count (eg. reclaim) has the + * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..1c03dfb4abfd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee126d4d1..f8ac8f582fd8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. 
If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/internal.h b/mm/internal.h index b79abb6721cf..098a89e3b97c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..4ce57f938b7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) diff --git a/mm/slub.c b/mm/slub.c index 8671de2e5b12..cf1faa4d3992 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61f4822..d3a02ac3eed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. @@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. 
*/ -- cgit v1.2.3 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. [mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/setup.c | 4 +--- arch/x86/mm/numa.c | 4 +--- include/linux/nodemask.h | 11 ++++++++++- kernel/cpuset.c | 8 +------- lib/Makefile | 2 +- lib/nodemask.c | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 4 +--- mm/memcontrol.c | 4 +--- mm/mempolicy.c | 24 ++---------------------- mm/page_isolation.c | 9 +++------ mm/slab.c | 13 +++---------- 11 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 lib/nodemask.c (limited to 'kernel') diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index a992238e9b58..153020abd2f5 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void) cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]); cpu_2_node[best_cpu] = node; cpumask_clear_cpu(best_cpu, &unbound_cpus); - node = next_node(node, default_nodes); - if (node == MAX_NUMNODES) - node = first_node(default_nodes); + node = next_node_in(node, default_nodes); } /* Print out node assignments and set defaults for disabled cpus */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..9c086c57105c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 6e85889cf9ab..f746e44d4046 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -43,8 +43,10 @@ * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or - * MAX_NUMNODES. + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set @@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp) return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +/* + * Find the next present node in src, starting after node n, wrapping around to + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. 
+ */ +#define next_node_in(n, src) __next_node_in((n), &(src)) +int __next_node_in(int node, const nodemask_t *srcp); + static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..611cc69af8f0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/lib/Makefile b/lib/Makefile index 931396ada5eb..42b69185f963 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000000..e42a5bf44d33 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include +#include +#include + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..5856093f9062 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -937,9 +937,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5c41bd..6740c4c2b550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. 
* No LRU may hold pages because all pages are UNEVICTABLE or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..8d369cee0cd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include #include -#include #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..67bedd18429c 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. */ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/slab.c b/mm/slab.c index d81565a92864..c11bf5007952 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } -- cgit v1.2.3 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. 
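For instance, a test might poke the knob immediately before sampling /proc/meminfo; a rough userspace sketch (assumes root, since the file is mode 0600, and a kernel carrying this patch):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/proc/sys/vm/stat_refresh", "r");

        if (!f) {
            perror("stat_refresh");
            return EXIT_FAILURE;
        }
        /* the knob returns no data; opening and reading it just triggers
         * the flush (a write would work equally well)
         */
        (void)fgets(line, sizeof(line), f);
        fclose(f);

        f = fopen("/proc/meminfo", "r");
        if (!f) {
            perror("meminfo");
            return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);    /* now reflects the freshly folded per-cpu deltas */
        fclose(f);
        return 0;
    }

A read that fails with EINVAL is the negative-counter warning described next.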
Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 +++++++++++ include/linux/vmstat.h | 4 +++ kernel/sysctl.c | 7 ++++++ mm/vmstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fece3121..720355cbdf45 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: - panic_on_oom - percpu_pagelist_fraction - stat_interval +- stat_refresh - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -755,6 +756,19 @@ is 1 second. ============================================================== +stat_refresh + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) 
+ +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c4a5fb..02fce415b3d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -193,6 +193,10 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); +struct ctl_table; +int vmstat_refresh(struct ctl_table *, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9adacbd9..c831be32a1a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { -- cgit v1.2.3 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact if there's a single root CPU set, it must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way where static keys eliminate branching overhead with jump labels. 
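For reference, the API the patch moves to works roughly as follows (an illustrative kernel-context sketch with placeholder names; the real key is cpusets_enabled_key):

    #include <linux/jump_label.h>

    DEFINE_STATIC_KEY_FALSE(example_enabled_key);

    static inline bool example_enabled(void)
    {
        /* with jump labels this is patched to a plain nop while the key
         * is off, so the common "feature disabled" case pays no
         * conditional branch at all
         */
        return static_branch_unlikely(&example_enabled_key);
    }

    static void example_get(void)
    {
        static_branch_inc(&example_enabled_key);    /* first user flips the branch on */
    }

    static void example_put(void)
    {
        static_branch_dec(&example_enabled_key);    /* last user flips it back off */
    }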
This patch converts it so that static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and they key was enabled. This is needed for get_page_from_freelist() where we want to also avoid the relatively slower check when ALLOC_CPUSET is not set in alloc_flags. The impact on the page allocator microbenchmark is less than expected but the cleanup in itself is worthwhile. 4.6.0-rc2 4.6.0-rc2 multcheck-v1r20 cpuset-v1r20 Min alloc-odr0-1 348.00 ( 0.00%) 348.00 ( 0.00%) Min alloc-odr0-2 254.00 ( 0.00%) 254.00 ( 0.00%) Min alloc-odr0-4 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-16 173.00 ( 0.00%) 171.00 ( 1.16%) Min alloc-odr0-32 166.00 ( 0.00%) 163.00 ( 1.81%) Min alloc-odr0-64 162.00 ( 0.00%) 159.00 ( 1.85%) Min alloc-odr0-128 160.00 ( 0.00%) 157.00 ( 1.88%) Min alloc-odr0-256 169.00 ( 0.00%) 166.00 ( 1.78%) Min alloc-odr0-512 180.00 ( 0.00%) 180.00 ( 0.00%) Min alloc-odr0-1024 188.00 ( 0.00%) 187.00 ( 0.53%) Min alloc-odr0-2048 194.00 ( 0.00%) 193.00 ( 0.52%) Min alloc-odr0-4096 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-8192 202.00 ( 0.00%) 201.00 ( 0.50%) Min alloc-odr0-16384 203.00 ( 0.00%) 202.00 ( 0.49%) Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Acked-by: Zefan Li Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++-------------- kernel/cpuset.c | 14 +++++++------- mm/page_alloc.c | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..bfc204e70338 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,26 +16,26 @@ #ifdef CONFIG_CPUSETS -extern struct static_key cpusets_enabled_key; +extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { - return static_key_false(&cpusets_enabled_key); + return static_branch_unlikely(&cpusets_enabled_key); } static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key) + 1; + return static_key_count(&cpusets_enabled_key.key) + 1; } static inline void cpuset_inc(void) { - static_key_slow_inc(&cpusets_enabled_key); + static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_key_slow_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_enabled_key); } extern int cpuset_init(void); @@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask); -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask); + if (cpusets_enabled()) + return __cpuset_node_allowed(node, gfp_mask); + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return __cpuset_node_allowed(zone_to_nid(z), gfp_mask); 
+} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + if (cpusets_enabled()) + return __cpuset_zone_allowed(z, gfp_mask); + return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return 1; + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return 1; + return true; +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69af8f0..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf7a13311b5..39c441bb8d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2859,7 +2859,7 @@ zonelist_scan: if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual -- cgit v1.2.3 From 6112a300c9e41993cc0dc56ac393743d28381284 Mon Sep 17 00:00:00 2001 From: Soumya PN Date: Tue, 17 May 2016 21:31:14 +0530 Subject: ftrace: Don't disable irqs when taking the tasklist_lock read_lock In ftrace.c inside the function alloc_retstack_tasklist() (which will be invoked when function_graph tracing is on) the tasklist_lock is being held as reader while iterating through a list of threads. Here the lock is being held as reader with irqs disabled. The tasklist_lock is never write_locked in interrupt context so it is safe to not disable interrupts for the duration of read_lock in this block which, can be significant, given the block of code iterates through all threads. Hence changing the code to call read_lock() and read_unlock() instead of read_lock_irqsave() and read_unlock_irqrestore(). 
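In outline, the change amounts to this (an illustrative sketch, not the ftrace code itself; the per-thread body is elided):

    #include <linux/sched.h>
    #include <linux/spinlock.h>

    static void walk_all_threads_example(void)
    {
        struct task_struct *g, *t;

        /* was: read_lock_irqsave(&tasklist_lock, flags); */
        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
            /* per-thread work; this walk can be long, so leaving
             * interrupts enabled for its duration is the whole point
             */
        } while_each_thread(g, t);
        /* was: read_unlock_irqrestore(&tasklist_lock, flags); */
        read_unlock(&tasklist_lock);
    }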
A similar change was made in commits: 8063e41d2ffc ("tracing: Change syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread()")' and 3472eaa1f12e ("sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()")' Link: http://lkml.kernel.org/r/1463500874-77480-1-git-send-email-soumya.p.n@hpe.com Signed-off-by: Soumya PN Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b1870fbd2b67..a6804823a058 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5713,7 +5713,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) { int i; int ret = 0; - unsigned long flags; int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; struct task_struct *g, *t; @@ -5729,7 +5728,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } } - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); do_each_thread(g, t) { if (start == end) { ret = -EAGAIN; @@ -5747,7 +5746,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } while_each_thread(g, t); unlock: - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); free: for (i = start; i < end; i++) kfree(ret_stack_list[i]); -- cgit v1.2.3 From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 May 2016 14:14:28 +0200 Subject: bpf: rather use get_random_int for randomizations Start address randomization and blinding in BPF currently use prandom_u32(). prandom_u32() values are not exposed to unpriviledged user space to my knowledge, but given other kernel facilities such as ASLR, stack canaries, etc make use of stronger get_random_int(), we better make use of it here as well given blinding requests successively new random values. get_random_int() has minimal entropy pool depletion, is not cryptographically secure, but doesn't need to be for our use cases here. Suggested-by: Hannes Frederic Sowa Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f1e8a0def99b..b94a36550591 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->pages = size / PAGE_SIZE; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (prandom_u32() % hole) & ~(alignment - 1); + start = (get_random_int() % hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, struct bpf_insn *to_buff) { struct bpf_insn *to = to_buff; - u32 imm_rnd = prandom_u32(); + u32 imm_rnd = get_random_int(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); -- cgit v1.2.3 From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 May 2016 17:22:48 -0500 Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the bpf filesystem. Looking at the code I saw a broken usage of mount_ns with current->nsproxy->mnt_ns. 
As the code does not acquire a reference to the mount namespace it can not possibly be correct to store the mount namespace on the superblock as it does. Replace mount_ns with mount_nodev so that each mount of the bpf filesystem returns a distinct instance, and the code is not buggy. In discussion with Hannes Frederic Sowa it was reported that the use of mount_ns was an attempt to have one bpf instance per mount namespace, in an attempt to keep resources that pin resources from hiding. That intent simply does not work, the vfs is not built to allow that kind of behavior. Which means that the bpf filesystem really is buggy both semantically and in it's implemenation as it does not nor can it implement the original intent. This change is userspace visible, but my experience with similar filesystems leads me to believe nothing will break with a model of each mount of the bpf filesystem is distinct from all others. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Cc: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: "Eric W. Biederman" Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 71b75d9c81da..04be7021f848 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) static struct dentry *bpf_mount(struct file_system_type *type, int flags, const char *dev_name, void *data) { - return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); + return mount_nodev(type, flags, data, bpf_fill_super); } static struct file_system_type bpf_fs_type = { -- cgit v1.2.3 From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:13 -0700 Subject: bpf: support decreasing order in direct packet access when packet headers are accessed in 'decreasing' order (like TCP port may be fetched before the program reads IP src) the llvm may generate the following code: [...] // R7=pkt(id=0,off=22,r=70) r2 = *(u32 *)(r7 +0) // good access [...] r7 += 40 // R7=pkt(id=0,off=62,r=70) r8 = *(u32 *)(r7 +0) // good access [...] r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range // it's doing *(u32*)(skb->data + 42) Fix verifier to recognize such code pattern Alos turned out that 'off > range' condition is not a verifier bug. It's a buggy program that may do something like: if (ptr + 50 > data_end) return 0; ptr += 60; *(u32*)ptr; in such case emit "invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message, so all information is available for the program author to fix the program. Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a08d66215245..d54e34874579 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, { struct reg_state *regs = env->cur_state.regs; struct reg_state *reg = ®s[regno]; - int linear_size = (int) reg->range - (int) reg->off; - if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) { - verbose("verifier bug\n"); - return -EFAULT; - } - if (off < 0 || off + size > linear_size) { - verbose("invalid access to packet, off=%d size=%d, allowed=%d\n", - off, size, linear_size); + off += reg->off; + if (off < 0 || off + size > reg->range) { + verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } return 0; -- cgit v1.2.3 From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 19 May 2016 18:17:14 -0700 Subject: bpf: teach verifier to recognize imm += ptr pattern Humans don't write C code like: u8 *ptr = skb->data; int imm = 4; imm += ptr; but from llvm backend point of view 'imm' and 'ptr' are registers and imm += ptr may be preferred vs ptr += imm depending which register value will be used further in the code, while verifier can only recognize ptr += imm. That caused small unrelated changes in the C code of the bpf program to trigger rejection by the verifier. Therefore teach the verifier to recognize both ptr += imm and imm += ptr. For example: when R6=pkt(id=0,off=0,r=62) R7=imm22 after r7 += r6 instruction will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62) Fixes: 969bf05eb3ce ("bpf: direct packet access") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d54e34874579..668e07903c8f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *regs = env->cur_state.regs; struct reg_state *dst_reg = ®s[insn->dst_reg]; struct reg_state *src_reg = ®s[insn->src_reg]; + struct reg_state tmp_reg; s32 imm; if (BPF_SRC(insn->code) == BPF_K) { @@ -1267,6 +1268,19 @@ add_imm: */ dst_reg->off += imm; } else { + if (src_reg->type == PTR_TO_PACKET) { + /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ + tmp_reg = *dst_reg; /* save r7 state */ + *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */ + src_reg = &tmp_reg; /* pretend it's src_reg state */ + /* if the checks below reject it, the copy won't matter, + * since we're rejecting the whole program. 
If all ok, + * then imm22 state will be added to r7 + * and r7 will be pkt(id=0,off=22,r=62) while + * r6 will stay as pkt(id=0,off=0,r=62) + */ + } + if (src_reg->type == CONST_IMM) { /* pkt_ptr += reg where reg is known constant */ imm = src_reg->imm; @@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && - dst_reg->type == PTR_TO_PACKET) { + (dst_reg->type == PTR_TO_PACKET || + (BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == PTR_TO_PACKET))) { /* ptr_to_packet += K|X */ return check_packet_ptr_add(env, insn); } else if (BPF_CLASS(insn->code) == BPF_ALU64 && -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 +++++ kernel/fork.c | 50 +++++++++++++++++++++++++++++++++--------------- mm/oom_kill.c | 8 ++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fda9c99ef95..d553855503e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40eabf176ce2..479e3cade7e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. 
Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0376efa79ec..c0e37dd1422f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk) */ set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + mmput_async(mm); return ret; } -- cgit v1.2.3 From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:20 -0700 Subject: exit_thread: accept a task parameter to be exited We need to call exit_thread from copy_process in a fail path. So make it accept task_struct as a parameter. [v2] * s390: exit_thread_runtime_instr doesn't make sense to be called for non-current tasks. * arm: fix the comment in vfp_thread_copy * change 'me' to 'tsk' for task_struct * now we can change only archs that actually have exit_thread [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 4 ++-- arch/arm/vfp/vfpmodule.c | 4 ---- arch/avr32/kernel/process.c | 4 ++-- arch/cris/arch-v32/kernel/process.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/process.c | 14 +++++++------- arch/metag/kernel/process.c | 6 +++--- arch/mn10300/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 5 +++-- arch/sh/kernel/process_64.c | 5 ++--- arch/sparc/kernel/process_32.c | 12 ++++++------ arch/sparc/kernel/process_64.c | 4 ++-- arch/tile/kernel/process.c | 4 ++-- arch/x86/kernel/process.c | 5 ++--- arch/xtensa/kernel/process.c | 4 ++-- include/linux/sched.h | 4 ++-- kernel/exit.c | 2 +- 17 files changed, 42 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 4adfb46e3ee9..a647d6642f3e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - thread_notify(THREAD_NOTIFY_EXIT, current_thread_info()); + thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk)); } void flush_thread(void) diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 2a61e4b04600..73085d3482ed 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread) * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. * THREAD_NOTIFY_EXIT - * - the thread (v) will be running on the local CPU, so - * v === current_thread_info() - * - thread->cpu is the local CPU number at the time it is accessed, - * but may change at any time. * - we could be preempted if tree preempt rcu is enabled, so * it is unsafe to use thread->cpu. 
*/ diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 42a53e740a7e..68e5b9dac059 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -62,9 +62,9 @@ void machine_restart(char *cmd) /* * Free current thread data structures etc */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - ocd_disable(current); + ocd_disable(tsk); } void flush_thread(void) diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index c7ce784a393c..4d1afa9f9fd3 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -33,9 +33,9 @@ void default_idle(void) */ extern void deconfigure_bp(long pid); -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - deconfigure_bp(current->pid); + deconfigure_bp(tsk->pid); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9cd607b06964..2436ad5f92c1 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg /* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) + * called only from exit_thread() + * we come here only if the task has a context attached (loaded or masked) */ void pfm_exit_thread(struct task_struct *task) diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index b51514957620..aae6c4dc7ae7 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -570,22 +570,22 @@ flush_thread (void) } /* - * Clean up state associated with current thread. This is called when + * Clean up state associated with a thread. This is called when * the thread calls exit(). */ void -exit_thread (void) +exit_thread (struct task_struct *tsk) { - ia64_drop_fpu(current); + ia64_drop_fpu(tsk); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); + if (tsk->thread.pfm_context) + pfm_exit_thread(tsk); /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); + if (tsk->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(tsk); #endif } diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c index 7f546183a0f0..35062796edf2 100644 --- a/arch/metag/kernel/process.c +++ b/arch/metag/kernel/process.c @@ -345,10 +345,10 @@ void flush_thread(void) /* * Free current thread data structures etc. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - clear_fpu(¤t->thread); - clear_dsp(¤t->thread); + clear_fpu(&tsk->thread); + clear_dsp(&tsk->thread); } /* TODO: figure out how to unwind the kernel stack here to figure out diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 74a96ccf7451..cbede4e88dee 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs) /* * free current thread data structures etc.. 
*/ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_fpu(current); + exit_fpu(tsk); } void flush_thread(void) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 481d7a83efc6..bba4fa74b321 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -68,9 +68,10 @@ extern void kernel_thread_starter(void); /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - exit_thread_runtime_instr(); + if (tsk == current) + exit_thread_runtime_instr(); } void flush_thread(void) diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index e2062e643341..9d3e9916555d 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { /* * See arch/sparc/kernel/process.c for the precedent for doing @@ -307,9 +307,8 @@ void exit_thread(void) * which it would get safely nulled. */ #ifdef CONFIG_SH_FPU - if (last_task_used_math == current) { + if (last_task_used_math == tsk) last_task_used_math = NULL; - } #endif } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index c5113c7ce2fd..b7780a5bef11 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifndef CONFIG_SMP - if(last_task_used_math == current) { + if (last_task_used_math == tsk) { #else - if (test_thread_flag(TIF_USEDFPU)) { + if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) { #endif /* Keep process from leaving FPU in a bogon state. */ put_psr(get_psr() | PSR_EF); - fpsave(¤t->thread.float_regs[0], ¤t->thread.fsr, - ¤t->thread.fpqueue[0], ¤t->thread.fpqdepth); + fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr, + &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth); #ifndef CONFIG_SMP last_task_used_math = NULL; #else - clear_thread_flag(TIF_USEDFPU); + clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU); #endif } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index c16ef1af1843..fa14402b33f9 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk) } /* Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct thread_info *t = current_thread_info(); + struct thread_info *t = task_thread_info(tsk); if (t->utraps) { if (t->utraps[0] < 2) diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index b5f30d376ce1..6b705ccc9cc1 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -541,7 +541,7 @@ void flush_thread(void) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #ifdef CONFIG_HARDWALL /* @@ -550,7 +550,7 @@ void exit_thread(void) * the last reference to a hardwall fd, it would already have * been released and deactivated at this point.) 
*/ - hardwall_deactivate_all(current); + hardwall_deactivate_all(tsk); #endif } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2915d54e9dd5..96becbbb52e0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) /* * Free current thread data structures etc.. */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { - struct task_struct *me = current; - struct thread_struct *t = &me->thread; + struct thread_struct *t = &tsk->thread; unsigned long *bp = t->io_bitmap_ptr; struct fpu *fpu = &t->fpu; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 5bbfed81c97b..e0ded48561db 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -115,10 +115,10 @@ void arch_cpu_idle(void) /* * This is called when the thread calls exit(). */ -void exit_thread(void) +void exit_thread(struct task_struct *tsk) { #if XTENSA_HAVE_COPROCESSORS - coprocessor_release_all(current_thread_info()); + coprocessor_release_all(task_thread_info(tsk)); #endif } diff --git a/include/linux/sched.h b/include/linux/sched.h index 167c0d4bf3fa..02bdab4d6db7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2771,9 +2771,9 @@ static inline int copy_thread_tls( extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD -extern void exit_thread(void); +extern void exit_thread(struct task_struct *tsk); #else -static inline void exit_thread(void) +static inline void exit_thread(struct task_struct *tsk) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index fd90195667e1..75b34fe835b2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -746,7 +746,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent -- cgit v1.2.3 From 0740aa5f6375681c57488c4ea55d05a0341cfc9c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 20 May 2016 17:00:25 -0700 Subject: fork: free thread in copy_process on failure When using this program (as root): #include #include #include #include #include #include #include #define ITER 1000 #define FORKERS 15 #define THREADS (6000/FORKERS) // 1850 is proc max static void fork_100_wait() { unsigned a, to_wait = 0; printf("\t%d forking %d\n", THREADS, getpid()); for (a = 0; a < THREADS; a++) { switch (fork()) { case 0: usleep(1000); exit(0); break; case -1: break; default: to_wait++; break; } } printf("\t%d forked from %d, waiting for %d\n", THREADS, getpid(), to_wait); for (a = 0; a < to_wait; a++) wait(NULL); printf("\t%d waited from %d\n", THREADS, getpid()); } static void run_forkers() { pid_t forkers[FORKERS]; unsigned a; for (a = 0; a < FORKERS; a++) { switch ((forkers[a] = fork())) { case 0: fork_100_wait(); exit(0); break; case -1: err(1, "DIE fork of %d'th forker", a); break; default: break; } } for (a = 0; a < FORKERS; a++) waitpid(forkers[a], NULL, 0); } int main() { unsigned a; int ret; ret = ioperm(10, 20, 0); if (ret < 0) err(1, "ioperm"); for (a = 0; a < ITER; a++) run_forkers(); return 0; } kmemleak reports many occurences of this leak: unreferenced object 0xffff8805917c8000 (size 8192): comm "fork-leak", pid 2932, jiffies 4295354292 (age 1871.028s) hex dump (first 32 bytes): ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ................ 
backtrace: [] kmemdup+0x25/0x50 [] copy_thread_tls+0x6c3/0x9a0 [] copy_process+0x1a84/0x5790 [] wake_up_new_task+0x2d5/0x6f0 [] _do_fork+0x12d/0x820 ... Due to the leakage of the memory items which should have been freed in arch/x86/kernel/process.c:exit_thread(). Make sure the memory is freed when fork fails later in copy_process. This is done by calling exit_thread with the thread to kill. Signed-off-by: Jiri Slaby Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf Cc: Chris Zankel Cc: David Howells Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Guan Xuetao Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Hogan Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Jonas Bonn Cc: Koichi Yasutake Cc: Lennox Wu Cc: Ley Foon Tan Cc: Mark Salter Cc: Martin Schwidefsky Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mikael Starvik Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Henderson Cc: Richard Kuo Cc: Richard Weinberger Cc: Russell King Cc: Steven Miao Cc: Thomas Gleixner Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8fbed7194af1..103d78fd8f75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1490,7 +1490,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns_for_children); if (IS_ERR(pid)) { retval = PTR_ERR(pid); - goto bad_fork_cleanup_io; + goto bad_fork_cleanup_thread; } } @@ -1652,6 +1652,8 @@ bad_fork_cancel_cgroup: bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); +bad_fork_cleanup_thread: + exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); -- cgit v1.2.3 From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:33 -0700 Subject: printk/nmi: generic solution for safe printk in NMI printk() takes some locks and could not be used a safe way in NMI context. The chance of a deadlock is real especially when printing stacks from all CPUs. This particular problem has been addressed on x86 by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). The patchset brings two big advantages. First, it makes the NMI backtraces safe on all architectures for free. Second, it makes all NMI messages almost safe on all architectures (the temporary buffer is limited. We still should keep the number of messages in NMI context at minimum). Note that there already are several messages printed in NMI context: WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE handlers. These are not easy to avoid. This patch reuses most of the code and makes it generic. It is useful for all messages and architectures that support NMI. The alternative printk_func is set when entering and is reseted when leaving NMI context. It queues IRQ work to copy the messages into the main ring buffer in a safe context. __printk_nmi_flush() copies all available messages and reset the buffer. Then we could use a simple cmpxchg operations to get synchronized with writers. There is also used a spinlock to get synchronized with other flushers. 
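For example, the writer side boils down to the following cmpxchg loop; this is a simplified sketch of vprintk_nmi() from the patch below (s is the per-CPU struct nmi_seq_buf, and the lost-message accounting is left out):

again:
	len = atomic_read(&s->len);
	if (len >= sizeof(s->buffer))
		return 0;	/* no space left, the message is dropped */

	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);

	/* commit the write; retry if a flusher reset s->len in the meantime */
	if (atomic_cmpxchg(&s->len, len, len + add) != len)
		goto again;
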
We do not longer use seq_buf because it depends on external lock. It would be hard to make all supported operations safe for a lockless use. It would be confusing and error prone to make only some operations safe. The code is put into separate printk/nmi.c as suggested by Steven Rostedt. It needs a per-CPU buffer and is compiled only on architectures that call nmi_enter(). This is achieved by the new HAVE_NMI Kconfig flag. The are MN10300 and Xtensa architectures. We need to clean up NMI handling there first. Let's do it separately. The patch is heavily based on the draft from Peter Zijlstra, see https://lkml.org/lkml/2015/6/10/327 [arnd@arndb.de: printk-nmi: use %zu format string for size_t] [akpm@linux-foundation.org: min_t->min - all types are size_t here] Signed-off-by: Petr Mladek Suggested-by: Peter Zijlstra Suggested-by: Steven Rostedt Cc: Jan Kara Acked-by: Russell King [arm part] Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 + arch/arm/Kconfig | 1 + arch/arm/kernel/smp.c | 2 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/cris/Kconfig | 1 + arch/mips/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/tile/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/kernel/apic/hw_nmi.c | 1 - include/linux/hardirq.h | 2 + include/linux/percpu.h | 3 - include/linux/printk.h | 12 ++- init/Kconfig | 5 + init/main.c | 1 + kernel/printk/Makefile | 1 + kernel/printk/internal.h | 44 +++++++++ kernel/printk/nmi.c | 219 ++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 19 +--- lib/nmi_backtrace.c | 89 +---------------- 24 files changed, 306 insertions(+), 107 deletions(-) create mode 100644 kernel/printk/internal.h create mode 100644 kernel/printk/nmi.c (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 0f298f9123dc..8f84fd268dee 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -187,7 +187,11 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_NMI + bool + config HAVE_NMI_WATCHDOG + depends on HAVE_NMI bool # # An arch should select this if it provides all these things: diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 956d3575426c..90542db1220d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -67,6 +67,7 @@ config ARM select HAVE_KRETPROBES if (HAVE_KPROBES) select HAVE_MEMBLOCK select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select HAVE_OPROFILE if (HAVE_PERF_EVENTS) select HAVE_OPTPROBES if !THUMB2_KERNEL select HAVE_PERF_EVENTS diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index baee70267f29..df90bc59bfce 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; case IPI_CPU_BACKTRACE: + printk_nmi_enter(); irq_enter(); nmi_cpu_backtrace(regs); irq_exit(); + printk_nmi_exit(); break; default: diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index e43519a2ca89..7e75d45e20cd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -18,6 +18,7 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA + select HAVE_NMI help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 
a63c12259e77..28c63fea786d 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -40,6 +40,7 @@ config BLACKFIN select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select HAVE_NMI config GENERIC_CSUM def_bool y diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 5c0ca8ae9293..deba2662b9f3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -70,6 +70,7 @@ config CRIS select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32 select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32 + select HAVE_NMI config HZ int diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5663f411c225..8040fb1845b4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -48,6 +48,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_CMOS_UPDATE select HAVE_MOD_ARCH_SPECIFIC + select HAVE_NMI select VIRT_TO_BUS select MODULES_USE_ELF_REL if MODULES select MODULES_USE_ELF_RELA if MODULES && 64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f0403b58ae8b..01f7464d9fea 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,6 +155,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e2c9aaaf64b2..1c3c43d9d1b5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -166,6 +166,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select HAVE_NMI config SCHED_OMIT_FRAME_POINTER diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cb93af8f8017..f6254341c065 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select HAVE_NMI help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 27b3a0ad40a0..1012f7ffcdf5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -79,6 +79,7 @@ config SPARC64 select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_NMI config ARCH_DEFCONFIG string diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 174746225577..76989b878f3c 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -30,6 +30,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select HAVE_NMI if USE_PMC select EDAC_SUPPORT select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ff5b3be95d4..0a7b885964ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 045e424fb368..7788ce643bf4 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,7 +18,6 @@ #include #include #include -#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index dfd59d6bc6f0..c683996110b1 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h 
@@ -61,6 +61,7 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ + printk_nmi_enter(); \ lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ @@ -77,6 +78,7 @@ extern void irq_exit(void); preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ + printk_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4bc6dafb703e..56939d3f6e53 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -129,7 +129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) -/* To avoid include hell, as printk can not declare this, we declare it here */ -DECLARE_PER_CPU(printk_func_t, printk_func); - #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 9ccbdf2c1453..51dd6b824fe2 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); +#ifdef CONFIG_PRINTK_NMI +extern void printk_nmi_init(void); +extern void printk_nmi_enter(void); +extern void printk_nmi_exit(void); +extern void printk_nmi_flush(void); +#else +static inline void printk_nmi_init(void) { } +static inline void printk_nmi_enter(void) { } +static inline void printk_nmi_exit(void) { } +static inline void printk_nmi_flush(void) { } +#endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/init/Kconfig b/init/Kconfig index 79a91a2c0444..bccc1d607be5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1454,6 +1454,11 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. +config PRINTK_NMI + def_bool y + depends on PRINTK + depends on HAVE_NMI + config BUG bool "BUG() support" if EXPERT default y diff --git a/init/main.c b/init/main.c index 2075fafaad59..fa9b2bdde183 100644 --- a/init/main.c +++ b/init/main.c @@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void) timekeeping_init(); time_init(); sched_clock_postinit(); + printk_nmi_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 85405bdcf2b3..abb0042a427b 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,2 +1,3 @@ obj-y = printk.o +obj-$(CONFIG_PRINTK_NMI) += nmi.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h new file mode 100644 index 000000000000..2de99faedfc1 --- /dev/null +++ b/kernel/printk/internal.h @@ -0,0 +1,44 @@ +/* + * internal.h - printk internal definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ +#include + +typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args); + +int __printf(1, 0) vprintk_default(const char *fmt, va_list args); + +#ifdef CONFIG_PRINTK_NMI + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it temporary stores the strings into a per-CPU buffer. + * The alternative implementation is chosen transparently + * via per-CPU variable. + */ +DECLARE_PER_CPU(printk_func_t, printk_func); +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return this_cpu_read(printk_func)(fmt, args); +} + +#else /* CONFIG_PRINTK_NMI */ + +static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) +{ + return vprintk_default(fmt, args); +} + +#endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c new file mode 100644 index 000000000000..303cf0d15e57 --- /dev/null +++ b/kernel/printk/nmi.c @@ -0,0 +1,219 @@ +/* + * nmi.c - Safe printk in NMI context + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * printk() could not take logbuf_lock in NMI context. Instead, + * it uses an alternative implementation that temporary stores + * the strings into a per-CPU buffer. The content of the buffer + * is later flushed into the main ring buffer via IRQ work. + * + * The alternative implementation is chosen transparently + * via @printk_func per-CPU variable. + * + * The implementation allows to flush the strings also from another CPU. + * There are situations when we want to make sure that all buffers + * were handled or when IRQs are blocked. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; +static int printk_nmi_irq_ready; + +#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) + +struct nmi_seq_buf { + atomic_t len; /* length of written data */ + struct irq_work work; /* IRQ work that flushes the buffer */ + unsigned char buffer[NMI_LOG_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); + +/* + * Safe printk() for NMI context. It uses a per-CPU buffer to + * store the message. NMIs are not nested, so there is always only + * one writer running. But the buffer might get flushed from another + * CPU, so we need to be careful. + */ +static int vprintk_nmi(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + int add = 0; + size_t len; + +again: + len = atomic_read(&s->len); + + if (len >= sizeof(s->buffer)) + return 0; + + /* + * Make sure that all old data have been read before the buffer was + * reseted. This is not needed when we just append data. + */ + if (!len) + smp_rmb(); + + add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + + /* + * Do it once again if the buffer has been flushed in the meantime. 
+ * Note that atomic_cmpxchg() is an implicit memory barrier that + * makes sure that the data were written before updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, len + add) != len) + goto again; + + /* Get flushed in a more safe context. */ + if (add && printk_nmi_irq_ready) { + /* Make sure that IRQ work is really initialized. */ + smp_rmb(); + irq_work_queue(&s->work); + } + + return add; +} + +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + +/* + * Flush data from the associated per_CPU buffer. The function + * can be called either via IRQ work or independently. + */ +static void __printk_nmi_flush(struct irq_work *work) +{ + static raw_spinlock_t read_lock = + __RAW_SPIN_LOCK_INITIALIZER(read_lock); + struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); + unsigned long flags; + size_t len, size; + int i, last_i; + + /* + * The lock has two functions. First, one reader has to flush all + * available message to make the lockless synchronization with + * writers easier. Second, we do not want to mix messages from + * different CPUs. This is especially important when printing + * a backtrace. + */ + raw_spin_lock_irqsave(&read_lock, flags); + + i = 0; +more: + len = atomic_read(&s->len); + + /* + * This is just a paranoid check that nobody has manipulated + * the buffer an unexpected way. If we printed something then + * @len must only increase. + */ + if (i && i >= len) + pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", + i, len); + + if (!len) + goto out; /* Someone else has already flushed the buffer. */ + + /* Make sure that data has been written up to the @len */ + smp_rmb(); + + size = min(len, sizeof(s->buffer)); + last_i = i; + + /* Print line by line. */ + for (; i < size; i++) { + if (s->buffer[i] == '\n') { + print_nmi_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < size) { + print_nmi_seq_line(s, last_i, size - 1); + pr_cont("\n"); + } + + /* + * Check that nothing has got added in the meantime and truncate + * the buffer. Note that atomic_cmpxchg() is an implicit memory + * barrier that makes sure that the data were copied before + * updating s->len. + */ + if (atomic_cmpxchg(&s->len, len, 0) != len) + goto more; + +out: + raw_spin_unlock_irqrestore(&read_lock, flags); +} + +/** + * printk_nmi_flush - flush all per-cpu nmi buffers. + * + * The buffers are flushed automatically via IRQ work. This function + * is useful only when someone wants to be sure that all buffers have + * been flushed at some point. + */ +void printk_nmi_flush(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); +} + +void __init printk_nmi_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu); + + init_irq_work(&s->work, __printk_nmi_flush); + } + + /* Make sure that IRQ works are initialized before enabling. */ + smp_wmb(); + printk_nmi_irq_ready = 1; + + /* Flush pending messages that did not have scheduled IRQ works. 
*/ + printk_nmi_flush(); +} + +void printk_nmi_enter(void) +{ + this_cpu_write(printk_func, vprintk_nmi); +} + +void printk_nmi_exit(void) +{ + this_cpu_write(printk_func, vprintk_default); +} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bfbf284e4218..71eba0607034 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,7 @@ #include "console_cmdline.h" #include "braille.h" +#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(vprintk_default); -/* - * This allows printk to be diverted to another function per cpu. - * This is useful for calling printk functions from within NMI - * without worrying about race conditions that can lock up the - * box. - */ -DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; - /** * printk - print a kernel message * @fmt: format string @@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; */ asmlinkage __visible int printk(const char *fmt, ...) { - printk_func_t vprintk_func; va_list args; int r; va_start(args, fmt); - - /* - * If a caller overrides the per_cpu printk_func, then it needs - * to disable preemption when calling printk(). Otherwise - * the printk_func should be set to the default. No need to - * disable preemption here. - */ - vprintk_func = this_cpu_read(printk_func); r = vprintk_func(fmt, args); - va_end(args); return r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 6019c53c669e..26caf51cc238 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,33 +16,14 @@ #include #include #include -#include #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) -{ - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); -} - /* * When raise() is called it will be is passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() @@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end) void nmi_trigger_all_cpu_backtrace(bool include_self, void (*raise)(cpumask_t *mask)) { - struct nmi_seq_buf *s; - int i, cpu, this_cpu = get_cpu(); + int i, this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, if (!include_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("Sending NMI to %s CPUs:\n", (include_self ? 
"all" : "other")); @@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, } /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. */ - for_each_cpu(cpu, &printtrace_mask) { - int len, last_i = 0; + printk_nmi_flush(); - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. - */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; -} - bool nmi_cpu_backtrace(struct pt_regs *regs) { int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); pr_warn("NMI backtrace for cpu %d\n", cpu); if (regs) show_regs(regs); else dump_stack(); - this_cpu_write(printk_func, printk_func_save); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } -- cgit v1.2.3 From b522deabc6f18e4f938d93a84f345f2cbf3347d1 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:36 -0700 Subject: printk/nmi: warn when some message has been lost in NMI context We could not resize the temporary buffer in NMI context. Let's warn if a message is lost. This is rather theoretical. printk() should not be used in NMI. The only sensible use is when we want to print backtrace from all CPUs. The current buffer should be enough for this purpose. 
[akpm@linux-foundation.org: whitespace fixlet] Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/internal.h | 11 +++++++++++ kernel/printk/nmi.c | 5 ++++- kernel/printk/printk.c | 10 ++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 2de99faedfc1..341bedccc065 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -34,6 +34,12 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return this_cpu_read(printk_func)(fmt, args); } +extern atomic_t nmi_message_lost; +static inline int get_nmi_message_lost(void) +{ + return atomic_xchg(&nmi_message_lost, 0); +} + #else /* CONFIG_PRINTK_NMI */ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) @@ -41,4 +47,9 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args) return vprintk_default(fmt, args); } +static inline int get_nmi_message_lost(void) +{ + return 0; +} + #endif /* CONFIG_PRINTK_NMI */ diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 303cf0d15e57..572f94922230 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -39,6 +39,7 @@ */ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; +atomic_t nmi_message_lost; #define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) @@ -64,8 +65,10 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) + if (len >= sizeof(s->buffer)) { + atomic_inc(&nmi_message_lost); return 0; + } /* * Make sure that all old data have been read before the buffer was diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 71eba0607034..e38579d730f4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1617,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + int nmi_message_lost; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static unsigned int logbuf_cpu = UINT_MAX; @@ -1667,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level, strlen(recursion_msg)); } + nmi_message_lost = get_nmi_message_lost(); + if (unlikely(nmi_message_lost)) { + text_len = scnprintf(textbuf, sizeof(textbuf), + "BAD LUCK: lost %d message(s) from NMI context!", + nmi_message_lost); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, textbuf, text_len); + } + /* * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. -- cgit v1.2.3 From 427934b8714ec130b068d1c9d88f25b24aaede32 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:39 -0700 Subject: printk/nmi: increase the size of NMI buffer and make it configurable Testing has shown that the backtrace sometimes does not fit into the 4kB temporary buffer that is used in NMI context. The warnings are gone when I double the temporary buffer size. This patch doubles the buffer size and makes it configurable. 
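For example, a config like the following (the value here is only an illustration) reserves 16 KB per CPU:

	CONFIG_NMI_LOG_BUF_SHIFT=14

minus the small header (an atomic_t length and a struct irq_work) that shares the allocation; the default stays at 13, i.e. 8 KB.
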
Note that this problem existed even in the x86-specific implementation that was added by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all CPUs"). Nobody noticed it because it did not print any warnings. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Russell King Cc: Daniel Thompson Cc: Jiri Kosina Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: David Miller Cc: Daniel Thompson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 22 ++++++++++++++++++++++ kernel/printk/nmi.c | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index bccc1d607be5..a9c4aefd5436 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -862,6 +862,28 @@ config LOG_CPU_MAX_BUF_SHIFT 13 => 8 KB for each CPU 12 => 4 KB for each CPU +config NMI_LOG_BUF_SHIFT + int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)" + range 10 21 + default 13 + depends on PRINTK_NMI + help + Select the size of a per-CPU buffer where NMI messages are temporary + stored. They are copied to the main log buffer in a safe context + to avoid a deadlock. The value defines the size as a power of 2. + + NMI messages are rare and limited. The largest one is when + a backtrace is printed. It usually fits into 4KB. Select + 8KB if you want to be on the safe side. + + Examples: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 572f94922230..bf08557d7e3d 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -41,7 +41,8 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; static int printk_nmi_irq_ready; atomic_t nmi_message_lost; -#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work)) +#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) - \ + sizeof(atomic_t) - sizeof(struct irq_work)) struct nmi_seq_buf { atomic_t len; /* length of written data */ -- cgit v1.2.3 From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 May 2016 17:00:42 -0700 Subject: printk/nmi: flush NMI messages on the system panic In NMI context, printk() messages are stored into per-CPU buffers to avoid a possible deadlock. They are normally flushed to the main ring buffer via an IRQ work. But the work is never called when the system calls panic() in the very same NMI handler. This patch tries to flush NMI buffers before the crash dump is generated. In this case it does not risk a double release and bails out when the logbuf_lock is already taken. The aim is to get the messages into the main ring buffer when possible. It makes them better accessible in the vmcore. Then the patch tries to flush the buffers second time when other CPUs are down. It might be more aggressive and reset logbuf_lock. The aim is to get the messages available for the consequent kmsg_dump() and console_flush_on_panic() calls. The patch causes vprintk_emit() to be called even in NMI context again. But it is done via printk_deferred() so that the console handling is skipped. Consoles use internal locks and we could not prevent a deadlock easily. They are explicitly called later when the crash dump is not generated, see console_flush_on_panic(). 
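The resulting ordering in panic() is roughly the following (simplified from the kernel/panic.c hunk below):

	if (!crash_kexec_post_notifiers) {
		printk_nmi_flush_on_panic();	/* 1st attempt: before the crash dump */
		__crash_kexec(NULL);
	}
	smp_send_stop();
	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
	printk_nmi_flush_on_panic();	/* 2nd attempt: the other CPUs are down now */
	kmsg_dump(KMSG_DUMP_PANIC);
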
Signed-off-by: Petr Mladek Cc: Benjamin Herrenschmidt Cc: Daniel Thompson Cc: David Miller Cc: Ingo Molnar Cc: Jan Kara Cc: Jiri Kosina Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 ++ kernel/kexec_core.c | 1 + kernel/panic.c | 6 +++++- kernel/printk/internal.h | 2 ++ kernel/printk/nmi.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/printk/printk.c | 2 +- 6 files changed, 49 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 51dd6b824fe2..f4da695fd615 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -127,11 +127,13 @@ extern void printk_nmi_init(void); extern void printk_nmi_enter(void); extern void printk_nmi_exit(void); extern void printk_nmi_flush(void); +extern void printk_nmi_flush_on_panic(void); #else static inline void printk_nmi_init(void) { } static inline void printk_nmi_enter(void) { } static inline void printk_nmi_exit(void) { } static inline void printk_nmi_flush(void) { } +static inline void printk_nmi_flush_on_panic(void) { } #endif /* PRINTK_NMI */ #ifdef CONFIG_PRINTK diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1c03dfb4abfd..d5d408252992 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ + printk_nmi_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/panic.c b/kernel/panic.c index 535c96510a44..8aa74497cc5a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -160,8 +160,10 @@ void panic(const char *fmt, ...) * * Bypass the panic_cpu check and call __crash_kexec directly. */ - if (!crash_kexec_post_notifiers) + if (!crash_kexec_post_notifiers) { + printk_nmi_flush_on_panic(); __crash_kexec(NULL); + } /* * Note smp_send_stop is the usual smp shutdown function, which @@ -176,6 +178,8 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + /* Call flush even twice. It tries harder with a single online CPU */ + printk_nmi_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 341bedccc065..7fd2838fa417 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args); #ifdef CONFIG_PRINTK_NMI +extern raw_spinlock_t logbuf_lock; + /* * printk() could not take logbuf_lock in NMI context. Instead, * it temporary stores the strings into a per-CPU buffer. diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index bf08557d7e3d..b69eb8a2876f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) { const char *buf = s->buffer + start; - printk("%.*s", (end - start) + 1, buf); + /* + * The buffers are flushed in NMI only on panic. The messages must + * go only into the ring buffer at this stage. Consoles will get + * explicitly called later when a crashdump is not generated. 
+ */ + if (in_nmi()) + printk_deferred("%.*s", (end - start) + 1, buf); + else + printk("%.*s", (end - start) + 1, buf); + } /* @@ -194,6 +204,33 @@ void printk_nmi_flush(void) __printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work); } +/** + * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system + * goes down. + * + * Similar to printk_nmi_flush() but it can be called even in NMI context when + * the system goes down. It does the best effort to get NMI messages into + * the main ring buffer. + * + * Note that it could try harder when there is only one CPU online. + */ +void printk_nmi_flush_on_panic(void) +{ + /* + * Make sure that we could access the main ring buffer. + * Do not risk a double release when more CPUs are up. + */ + if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (num_online_cpus() > 1) + return; + + debug_locks_off(); + raw_spin_lock_init(&logbuf_lock); + } + + printk_nmi_flush(); +} + void __init printk_nmi_init(void) { int cpu; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e38579d730f4..60cdf6386763 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -245,7 +245,7 @@ __packed __aligned(4) * within the scheduler's rq lock. It must be released before calling * console_unlock() or anything else that might wake up a process. */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); +DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); -- cgit v1.2.3 From ede9c27749b9b35efdffa4f63a39f819d7913752 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 May 2016 17:01:10 -0700 Subject: kernel/sysctl_binary.c: use generic UUID library UUID library provides uuid_be type and uuid_be_to_bin() function. This substitutes open coded variant by generic library calls. 
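For example, the parsing side now reduces to something like this sketch (error handling trimmed):

	#include <linux/uuid.h>

	char buf[UUID_STRING_LEN + 1];	/* "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + NUL */
	uuid_be uuid;

	if (uuid_be_to_bin(buf, &uuid))
		return -EIO;	/* malformed string */
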
Signed-off-by: Andy Shevchenko Reviewed-by: Matt Fleming Cc: Dmitry Kasatkin Cc: Mimi Zohar Cc: Rasmus Villemoes Cc: Arnd Bergmann Cc: "Theodore Ts'o" Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 10a1d7dc9313..6eb99c17dbd8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file, /* Only supports reads */ if (oldval && oldlen) { - char buf[40], *str = buf; - unsigned char uuid[16]; - int i; + char buf[UUID_STRING_LEN + 1]; + uuid_be uuid; result = kernel_read(file, 0, buf, sizeof(buf) - 1); if (result < 0) @@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file, buf[result] = '\0'; - /* Convert the uuid to from a string to binary */ - for (i = 0; i < 16; i++) { - result = -EIO; - if (!isxdigit(str[0]) || !isxdigit(str[1])) - goto out; - - uuid[i] = (hex_to_bin(str[0]) << 4) | - hex_to_bin(str[1]); - str += 2; - if (*str == '-') - str++; - } + result = -EIO; + if (uuid_be_to_bin(buf, &uuid)) + goto out; if (oldlen > 16) oldlen = 16; result = -EFAULT; - if (copy_to_user(oldval, uuid, oldlen)) + if (copy_to_user(oldval, &uuid, oldlen)) goto out; copied = oldlen; -- cgit v1.2.3 From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:33 -0700 Subject: radix-tree: introduce radix_tree_empty Commit e61452365372 ("radix_tree: add support for multi-order entries") left the impression that the support for multiorder radix tree entries was functional. As soon as Ross tried to use it, it became apparent that my testing was completely inadequate, and it didn't even work a little bit for orders that were not a multiple of shift. This series of patches is the result of about 6 weeks of redesign, reimplementation, testing, arguing and hair-pulling. The great news is that the test-suite is now far better than it was. That's reflected in the diffstat for the test-suite alone: 12 files changed, 436 insertions(+), 28 deletions(-) The highlight for users of the tree is that the restriction on the order of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix tree now supports any order between 0 and 64. For those who are interested in how the tree works, patch 9 is probably the most interesting one as it introduces the new machinery for handling sibling entries. I've tried to be fair in attributing authorship to the person who contributed the majority of the code in each patch; Ross has been an invaluable partner in the development of this support and it's fair to say that each of us has code in every commit. I should also express my appreciation of the 0day testing. It prompted me that I was bloating the tinyconfig in an unacceptable way, and it bisected to a commit which contained a rather nasty memory-corruption bug. This patch (of 29): The irqdomain code was checking for 0 or 1 entries, not 0 entries like the comment said they were. Introduce a new helper that will actually check for an empty tree. 
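For example (the tree name below is purely illustrative):

	RADIX_TREE(my_tree, GFP_KERNEL);

	if (radix_tree_empty(&my_tree))
		pr_debug("nothing to tear down\n");

which replaces open-coded peeks at root->rnode or, as in the irqdomain hunk below, at the old ->height field.
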
Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 +++++ kernel/irq/irqdomain.c | 7 +------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 51a97ac8bfbf..83f708e5db59 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -136,6 +136,11 @@ do { \ (root)->rnode = NULL; \ } while (0) +static inline bool radix_tree_empty(struct radix_tree_root *root) +{ + return root->rnode == NULL; +} + /** * Radix-tree synchronization * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d65f6f31a5b3..8798b6c9e945 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain) { mutex_lock(&irq_domain_mutex); - /* - * radix_tree_delete() takes care of destroying the root - * node when all entries are removed. Shout if there are - * any mappings left. - */ - WARN_ON(domain->revmap_tree.height); + WARN_ON(!radix_tree_empty(&domain->revmap_tree)); list_del(&domain->link); -- cgit v1.2.3 From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 17:21:27 -0700 Subject: x86: remove more uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_from_user_inatomic()" is mostly the special cases for the constant size, and it's actually almost never relevant. Most users aren't actually using a constant size anyway, and the few cases that do small constant copies are better off just using __get_user() instead. So get rid of the unnecessary complexity. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_32.h | 26 -------------------------- kernel/events/uprobes.c | 3 +-- kernel/futex.c | 2 +- mm/maccess.c | 3 +-- 4 files changed, 3 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 537cc883ea29..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. 
- */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..c01f733ff2e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/futex.c b/kernel/futex.c index c20f06f38ef3..ee25f5ba4aca 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; -- cgit v1.2.3 From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 22 May 2016 23:16:18 +0200 Subject: bpf, inode: disallow userns mounts Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns to mount the bpf filesystem"), which removes the FS_USERNS_MOUNT flag. The original idea was to have a per mountns instance instead of a single global fs instance, but that didn't work out and we had to switch to mount_nodev() model. The intent of that middle ground was that we avoid users who don't play nice to create endless instances of bpf fs which are difficult to control and discover from an admin point of view, but at the same time it would have allowed us to be more flexible with regard to namespaces. Therefore, since we now did the switch to mount_nodev() as a fix where individual instances are created, we also need to remove userns mount flag along with it to avoid running into mentioned situation. I don't expect any breakage at this early point in time with removing the flag and we can revisit this later should the requirement for this come up with future users. This and commit e27f4a942a0e have been split to facilitate tracking should any of them run into the unlikely case of causing a regression. Fixes: b2197755b263 ("bpf: add support for persistent maps/progs") Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 04be7021f848..318858edb1cd 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = { .name = "bpf", .mount = bpf_mount, .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, }; MODULE_ALIAS_FS("bpf"); -- cgit v1.2.3 From f43edca7ed08fc02279f2a62015da5cb6aa0ad61 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 23 May 2016 16:22:26 -0700 Subject: ELF/MIPS build fix CONFIG_MIPS32_N32=y but CONFIG_BINFMT_ELF disabled results in the following linker errors: arch/mips/built-in.o: In function `elf_core_dump': binfmt_elfn32.c:(.text+0x23dbc): undefined reference to `elf_core_extra_phdrs' binfmt_elfn32.c:(.text+0x246e4): undefined reference to `elf_core_extra_data_size' binfmt_elfn32.c:(.text+0x248d0): undefined reference to `elf_core_write_extra_phdrs' binfmt_elfn32.c:(.text+0x24ac4): undefined reference to `elf_core_write_extra_data' CONFIG_MIPS32_O32=y but CONFIG_BINFMT_ELF disabled results in the following linker errors: arch/mips/built-in.o: In function `elf_core_dump': binfmt_elfo32.c:(.text+0x28a04): undefined reference to `elf_core_extra_phdrs' binfmt_elfo32.c:(.text+0x29330): undefined reference to `elf_core_extra_data_size' binfmt_elfo32.c:(.text+0x2951c): undefined reference to `elf_core_write_extra_phdrs' binfmt_elfo32.c:(.text+0x29710): undefined reference to `elf_core_write_extra_data' This is because binfmt_elfn32 and binfmt_elfo32 are using symbols from elfcore but for these configurations elfcore will not be built. Fixed by making elfcore selectable by a separate config symbol which unlike the current mechanism can also be used from other directories than kernel/, then having each flavor of ELF that relies on elfcore.o, select it in Kconfig, including CONFIG_MIPS32_N32 and CONFIG_MIPS32_O32 which fixes this issue. Link: http://lkml.kernel.org/r/20160520141705.GA1913@linux-mips.org Signed-off-by: Ralf Baechle Reviewed-by: James Hogan Cc: "Maciej W. Rozycki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 1 + fs/Kconfig.binfmt | 8 ++++++++ kernel/Makefile | 4 +--- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 8040fb1845b4..46938847e794 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3117,6 +3117,7 @@ config MIPS32_N32 config BINFMT_ELF32 bool default y if MIPS32_O32 || MIPS32_N32 + select ELFCORE endmenu diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 2d0cbbd14cfc..72c03354c14b 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,7 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" depends on MMU && (BROKEN || !FRV) + select ELFCORE default y ---help--- ELF (Executable and Linkable Format) is a format for libraries and @@ -26,6 +27,7 @@ config BINFMT_ELF config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF + select ELFCORE config ARCH_BINFMT_ELF_STATE bool @@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load segments of a binary to be located in memory independently of each @@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. 
+config ELFCORE + bool + help + This option enables kernel/elfcore.o. + config CORE_DUMP_DEFAULT_ELF_HEADERS bool "Write ELF core dumps with partial segments" default y diff --git a/kernel/Makefile b/kernel/Makefile index f0c40bf49d9f..e2ec54e2b952 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o -obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o +obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ -- cgit v1.2.3 From bf959931ddb88c4e4366e96dd22e68fa0db9527c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 May 2016 16:23:50 -0700 Subject: wait/ptrace: assume __WALL if the child is traced The following program (simplified version of generated by syzkaller) #include #include #include #include #include void *thread_func(void *arg) { ptrace(PTRACE_TRACEME, 0,0,0); return 0; } int main(void) { pthread_t thread; if (fork()) return 0; while (getppid() != 1) ; pthread_create(&thread, NULL, thread_func, NULL); pthread_join(thread, NULL); return 0; } creates an unreapable zombie if /sbin/init doesn't use __WALL. This is not a kernel bug, at least in a sense that everything works as expected: debugger should reap a traced sub-thread before it can reap the leader, but without __WALL/__WCLONE do_wait() ignores sub-threads. Unfortunately, it seems that /sbin/init in most (all?) distributions doesn't use it and we have to change the kernel to avoid the problem. Note also that most init's use sys_waitid() which doesn't allow __WALL, so the necessary user-space fix is not that trivial. This patch just adds the "ptrace" check into eligible_child(). To some degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal is mostly ignored when the tracee reports to debugger. Or WSTOPPED, the tracer doesn't need to set this flag to wait for the stopped tracee. This obviously means the user-visible change: __WCLONE and __WALL no longer have any meaning for debugger. And I can only hope that this won't break something, but at least strace/gdb won't suffer. We could make a more conservative change. Say, we can take __WCLONE into account, or !thread_group_leader(). But it would be nice to not complicate these historical/confusing checks. Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Cc: Denys Vlasenko Cc: Jan Kratochvil Cc: "Michael Kerrisk (man-pages)" Cc: Pedro Alves Cc: Roland McGrath Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 75b34fe835b2..44fbe6edd7fe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. 
(Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; -- cgit v1.2.3 From 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 23 May 2016 16:23:53 -0700 Subject: wait: allow sys_waitid() to accept __WNOTHREAD/__WCLONE/__WALL I see no reason why waitid() can't support other linux-specific flags allowed in sys_wait4(). In particular this change can help if we reconsider the previous change ("wait/ptrace: assume __WALL if the child is traced") which adds the "automagical" __WALL for debugger. Signed-off-by: Oleg Nesterov Cc: Dmitry Vyukov Cc: Denys Vlasenko Cc: Jan Kratochvil Cc: "Michael Kerrisk (man-pages)" Cc: Pedro Alves Cc: Roland McGrath Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 44fbe6edd7fe..9e6e1356e6bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1535,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, enum pid_type type; long ret; - if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) + if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| + __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; -- cgit v1.2.3 From 747800efbe8b98459f48d1d9d742298f8283f8fa Mon Sep 17 00:00:00 2001 From: Wang Xiaoqiang Date: Mon, 23 May 2016 16:23:59 -0700 Subject: kernel/signal.c: convert printk(KERN_ ...) to pr_(...) Use pr_ instead of printk(KERN_ ). 
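For reference, the two spellings are equivalent: pr_info(), pr_cont() and friends expand to printk() with the matching KERN_* level (plus the file's pr_fmt() prefix, if one is defined). A sketch, not from the patch:

        #include <linux/printk.h>

        static void example_log(int sig)
        {
                printk(KERN_INFO "dropped signal %d\n", sig);   /* old spelling */
                pr_info("dropped signal %d\n", sig);            /* same output, shorter */
        }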
Signed-off-by: Wang Xiaoqiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index ab122a2cee41..96e9bc40667f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig) if (!__ratelimit(&ratelimit_state)) return; - printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } @@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, static void print_fatal_signal(int signr) { struct pt_regs *regs = signal_pt_regs(); - printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr); + pr_info("potentially unexpected fatal signal %d.\n", signr); #if defined(__i386__) && !defined(__arch_um__) - printk(KERN_INFO "code at %08lx: ", regs->ip); + pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { @@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr) if (get_user(insn, (unsigned char *)(regs->ip + i))) break; - printk(KERN_CONT "%02x ", insn); + pr_cont("%02x ", insn); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); #endif preempt_disable(); show_regs(regs); -- cgit v1.2.3 From 725fc629ff2545b061407305ae51016c9f928fce Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 23 May 2016 16:24:05 -0700 Subject: kernek/fork.c: allocate idle task for a CPU always on its local node Linux preallocates the task structs of the idle tasks for all possible CPUs. This currently means they all end up on node 0. This also implies that the cache line of MWAIT, which is around the flags field in the task struct, are all located in node 0. We see a noticeable performance improvement on Knights Landing CPUs when the cache lines used for MWAIT are located in the local nodes of the CPUs using them. I would expect this to give a (likely slight) improvement on other systems too. 
The patch implements placing the idle task in the node of its CPUs, by passing the right target node to copy_process() [akpm@linux-foundation.org: use NUMA_NO_NODE, not a bare -1] Link: http://lkml.kernel.org/r/1463492694-15833-1-git-send-email-andi@firstfloor.org Signed-off-by: Andi Kleen Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 103d78fd8f75..e67d7b773348 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -340,13 +340,14 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ } -static struct task_struct *dup_task_struct(struct task_struct *orig) +static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; struct thread_info *ti; - int node = tsk_fork_get_node(orig); int err; + if (node == NUMA_NO_NODE) + node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; @@ -1276,7 +1277,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls) + unsigned long tls, + int node) { int retval; struct task_struct *p; @@ -1328,7 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; retval = -ENOMEM; - p = dup_task_struct(current); + p = dup_task_struct(current, node); if (!p) goto fork_out; @@ -1706,7 +1708,8 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1751,7 +1754,7 @@ long _do_fork(unsigned long clone_flags, } p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls); + child_tidptr, NULL, trace, tls, NUMA_NO_NODE); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. -- cgit v1.2.3 From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:10 -0700 Subject: kexec: introduce a protection mechanism for the crashkernel reserved memory For the cases that some kernel (module) path stamps the crash reserved memory(already mapped by the kernel) where has been loaded the second kernel data, the kdump kernel will probably fail to boot when panic happens (or even not happens) leaving the culprit at large, this is unacceptable. The patch introduces a mechanism for detecting such cases: 1) After each crash kexec loading, it simply marks the reserved memory regions readonly since we no longer access it after that. When someone stamps the region, the first kernel will panic and trigger the kdump. The weak arch_kexec_protect_crashkres() is introduced to do the actual protection. 2) To allow multiple loading, once 1) was done we also need to remark the reserved memory to readwrite each time a system call related to kdump is made. The weak arch_kexec_unprotect_crashkres() is introduced to do the actual protection. The architecture can make its specific implementation by overriding arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres(). 
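To make the hook concrete, a hypothetical architecture override might look roughly like the sketch below. It is not part of this patch and assumes the architecture exposes set_memory_ro()/set_memory_rw() for its linear mapping and that crashk_res describes the whole reserved range:

        #include <linux/ioport.h>
        #include <linux/kernel.h>
        #include <linux/kexec.h>
        #include <linux/mm.h>

        /* Illustrative only: make the crashkernel range read-only once an
         * image has been loaded into it. set_memory_ro()/set_memory_rw()
         * are assumed arch-provided helpers. */
        void arch_kexec_protect_crashkres(void)
        {
                unsigned long addr = (unsigned long)__va(crashk_res.start);
                int pages = DIV_ROUND_UP(resource_size(&crashk_res), PAGE_SIZE);

                set_memory_ro(addr, pages);
        }

        /* Counterpart: drop back to read-write around the kexec syscalls. */
        void arch_kexec_unprotect_crashkres(void)
        {
                unsigned long addr = (unsigned long)__va(crashk_res.start);
                int pages = DIV_ROUND_UP(resource_size(&crashk_res), PAGE_SIZE);

                set_memory_rw(addr, pages);
        }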
Signed-off-by: Xunlei Pang Cc: Eric Biederman Cc: Dave Young Cc: Minfei Huang Cc: Vivek Goyal Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 ++ kernel/kexec.c | 9 ++++++++- kernel/kexec_core.c | 6 ++++++ kernel/kexec_file.c | 8 +++++++- 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 2cc643c6e870..643ff4a3fbf6 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int relsec); int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, unsigned int relsec); +void arch_kexec_protect_crashkres(void); +void arch_kexec_unprotect_crashkres(void); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; diff --git a/kernel/kexec.c b/kernel/kexec.c index ee70aef5cd81..b44cb3f5a15c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) + if (flags & KEXEC_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } + if (nr_segments > 0) { unsigned long i; @@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, image = xchg(dest_image, image); out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d5d408252992..48b73cc8e425 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void) void __weak crash_unmap_reserved_pages(void) {} + +void __weak arch_kexec_protect_crashkres(void) +{} + +void __weak arch_kexec_unprotect_crashkres(void) +{} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index c72d2ff5896e..503bc2d348e5 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; -- cgit v1.2.3 From 917a35605f09c0d16aeb2e92c7fbff562e19a116 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 23 May 2016 16:24:16 -0700 Subject: kexec: make a pair of map/unmap reserved pages in error path For some arch, kexec shall map the reserved pages, then use them, when we try to start the kdump service. kexec may return directly, without unmapping the reserved pages, if it fails during starting service. To fix it, we make a pair of map/unmap reserved pages both in generic path and error path. This patch only affects s390. Other architectures don't implement the interface of crash_unmap_reserved_pages and crash_map_reserved_pages. It isn't an urgent patch.
Kernel can work well without any risk, although the reserved pages are not unmapped before returning in error path. Signed-off-by: Minfei Huang Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Xunlei Pang Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index b44cb3f5a15c..4b49aa71304f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -194,22 +194,25 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, segments, flags); } if (result) - goto out; + goto unmap_page; if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) - goto out; + goto unmap_page; for (i = 0; i < nr_segments; i++) { result = kimage_load_segment(image, &image->segment[i]); if (result) - goto out; + goto unmap_page; } kimage_terminate(image); +unmap_page: if (flags & KEXEC_ON_CRASH) crash_unmap_reserved_pages(); + if (result) + goto out; } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); -- cgit v1.2.3 From 0eea08678ebe9f7d8ef98fed974a5bf1a0dd2dd2 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Mon, 23 May 2016 16:24:19 -0700 Subject: kexec: do a cleanup for function kexec_load There are a lof of work to be done in function kexec_load, not only for allocating structs and loading initram, but also for some misc. To make it more clear, wrap a new function do_kexec_load which is used to allocate structs and load initram. And the pre-work will be done in kexec_load. Signed-off-by: Minfei Huang Cc: Vivek Goyal Cc: "Eric W. Biederman" Cc: Xunlei Pang Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 125 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 69 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b49aa71304f..b73dc211fcfd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -103,6 +103,74 @@ out_free_image: return ret; } +static int do_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int ret; + + if (flags & KEXEC_ON_CRASH) { + dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; + } + + if (nr_segments == 0) { + /* Uninstall image */ + kimage_free(xchg(dest_image, NULL)); + return 0; + } + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before + * we corrupt it. 
+ */ + kimage_free(xchg(&kexec_crash_image, NULL)); + } + + ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); + if (ret) + return ret; + + if (flags & KEXEC_ON_CRASH) + crash_map_reserved_pages(); + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < nr_segments; i++) { + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* Install the new kernel and uninstall the old */ + image = xchg(dest_image, image); + +out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + + /* + * Once the reserved memory is mapped, we should unmap this memory + * before returning + */ + if (flags & KEXEC_ON_CRASH) + crash_unmap_reserved_pages(); + kimage_free(image); + return ret; +} + /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -127,7 +195,6 @@ out_free_image: SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. */ @@ -152,9 +219,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -166,63 +230,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) { - dest_image = &kexec_crash_image; - if (kexec_crash_image) - arch_kexec_unprotect_crashkres(); - } - - if (nr_segments > 0) { - unsigned long i; - - if (flags & KEXEC_ON_CRASH) { - /* - * Loading another kernel to switch to if this one - * crashes. Free any current crash dump kernel before - * we corrupt it. - */ - - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. 
*/ - - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - } - if (result) - goto unmap_page; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto unmap_page; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto unmap_page; - } - kimage_terminate(image); -unmap_page: - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - if (result) - goto out; - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + result = do_kexec_load(entry, nr_segments, segments, flags); -out: if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); mutex_unlock(&kexec_mutex); - kimage_free(image); return result; } -- cgit v1.2.3 From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Mon, 23 May 2016 16:24:22 -0700 Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and arch_kexec_protect(unprotect)_crashkres() Commit 3f625002581b ("kexec: introduce a protection mechanism for the crashkernel reserved memory") is a similar mechanism for protecting the crash kernel reserved memory to previous crash_map/unmap_reserved_pages() implementation, the new one is more generic in name and cleaner in code (besides, some arch may not be allowed to unmap the pgtable). Therefore, this patch consolidates them, and uses the new arch_kexec_protect(unprotect)_crashkres() to replace former crash_map/unmap_reserved_pages() which by now has been only used by S390. The consolidation work needs the crash memory to be mapped initially, this is done in machine_kdump_pm_init() which is after reserve_crashkernel(). Once kdump kernel is loaded, the new arch_kexec_protect_crashkres() implemented for S390 will actually unmap the pgtable like before. Signed-off-by: Xunlei Pang Signed-off-by: Michael Holzheu Acked-by: Michael Holzheu Cc: Heiko Carstens Cc: "Eric W. 
Biederman" Cc: Minfei Huang Cc: Vivek Goyal Cc: Dave Young Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/machine_kexec.c | 28 ++++++++++++++++++---------- include/linux/kexec.h | 2 -- kernel/kexec.c | 12 ------------ kernel/kexec_core.c | 11 ++--------- 4 files changed, 20 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 2f1b7217c25c..0e64f08d3d69 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -43,13 +43,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, switch (action) { case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - if (crashk_res.start) - crash_map_reserved_pages(); + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - if (crashk_res.start) - crash_unmap_reserved_pages(); + if (kexec_crash_image) + arch_kexec_protect_crashkres(); break; default: return NOTIFY_DONE; @@ -60,6 +60,8 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, static int __init machine_kdump_pm_init(void) { pm_notifier(machine_kdump_pm_cb, 0); + /* Create initial mapping for crashkernel memory */ + arch_kexec_unprotect_crashkres(); return 0; } arch_initcall(machine_kdump_pm_init); @@ -146,6 +148,8 @@ static int kdump_csum_valid(struct kimage *image) #endif } +#ifdef CONFIG_CRASH_DUMP + /* * Map or unmap crashkernel memory */ @@ -167,21 +171,25 @@ static void crash_map_pages(int enable) } /* - * Map crashkernel memory + * Unmap crashkernel memory */ -void crash_map_reserved_pages(void) +void arch_kexec_protect_crashkres(void) { - crash_map_pages(1); + if (crashk_res.end) + crash_map_pages(0); } /* - * Unmap crashkernel memory + * Map crashkernel memory */ -void crash_unmap_reserved_pages(void) +void arch_kexec_unprotect_crashkres(void) { - crash_map_pages(0); + if (crashk_res.end) + crash_map_pages(1); } +#endif + /* * Give back memory to hypervisor before new kdump is loaded */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 643ff4a3fbf6..e8acb2b43dd9 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -230,8 +230,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); void crash_save_vmcoreinfo(void); -void crash_map_reserved_pages(void); -void crash_unmap_reserved_pages(void); void arch_crash_save_vmcoreinfo(void); __printf(1, 2) void vmcoreinfo_append_str(const char *fmt, ...); diff --git a/kernel/kexec.c b/kernel/kexec.c index b73dc211fcfd..4384672d3245 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, if (ret) return ret; - if (flags & KEXEC_ON_CRASH) - crash_map_reserved_pages(); - if (flags & KEXEC_PRESERVE_CONTEXT) image->preserve_context = 1; @@ -161,12 +158,6 @@ out: if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - /* - * Once the reserved memory is mapped, we should unmap this memory - * before returning - */ - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); kimage_free(image); return ret; } @@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, result = do_kexec_load(entry, nr_segments, segments, flags); - if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) - arch_kexec_protect_crashkres(); - 
mutex_unlock(&kexec_mutex); return result; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 48b73cc8e425..56b3ed0927b0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size) crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1553,17 +1551,12 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) -{} - -void __weak crash_unmap_reserved_pages(void) -{} - void __weak arch_kexec_protect_crashkres(void) {} -- cgit v1.2.3 From 7c051267931a9be9c6620cc17b362bc6ee6dedc8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:48 -0700 Subject: mm, fork: make dup_mmap wait for mmap_sem for write killable dup_mmap needs to lock current's mm mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e67d7b773348..47887bba944f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -414,7 +414,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; uprobe_start_dup_mmap(); - down_write(&oldmm->mmap_sem); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -526,6 +529,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); +fail_uprobe_end: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: -- cgit v1.2.3 From 17b0573d77fbd547cbf1d711b90d269bdcc1efd3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:26:05 -0700 Subject: prctl: make PR_SET_THP_DISABLE wait for mmap_sem killable PR_SET_THP_DISABLE requires mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. 
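The pattern applied by this series is the same in every caller; a sketch with an invented helper name, assuming only that the caller is allowed to sleep:

        #include <linux/rwsem.h>
        #include <linux/sched.h>

        static int example_update_mm(struct mm_struct *mm)
        {
                /* Sleep on mmap_sem, but give up with -EINTR if a fatal
                 * signal (e.g. from the OOM killer) arrives while waiting. */
                if (down_write_killable(&mm->mmap_sem))
                        return -EINTR;

                /* ... modify the address space under the lock ... */

                up_write(&mm->mmap_sem);
                return 0;
        }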
Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Alex Thorlton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index cf8ba545c7d3..89d5be418157 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) return -EINVAL; - down_write(&me->mm->mmap_sem); + if (down_write_killable(&me->mm->mmap_sem)) + return -EINTR; if (arg2) me->mm->def_flags |= VM_NOHUGEPAGE; else -- cgit v1.2.3 From 598fdc1d66674264e122ca9d007ad822e98d8b8d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:26:08 -0700 Subject: uprobes: wait for mmap_sem for write killable xol_add_vma needs mmap_sem for write. If the waiting task gets killed by the oom killer it would block oom_reaper from asynchronous address space reclaim and reduce the chances of timely OOM resolving. Wait for the lock in the killable mode and return with EINTR if the task got killed while waiting. Do not warn in dup_xol_work if __create_xol_area failed due to fatal signal pending because this is usually considered a kernel issue. Signed-off-by: Michal Hocko Acked-by: Oleg Nesterov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c01f733ff2e1..b7a525ab2083 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1130,7 +1130,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; @@ -1469,7 +1471,8 @@ static void dup_xol_work(struct callback_head *work) if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->dup_xol_addr)) + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } -- cgit v1.2.3 From 59fa5860204ffc95128d60cba9f54f9740a42c7d Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 24 May 2016 11:42:30 +0100 Subject: genirq: Fix missing return value in irq_destroy_ipi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7cec18a3906b changed the return type of irq_destroy_ipi to int, but missed adding a value to one return statement. 
Fix this to silence the resulting compiler warning: kernel/irq/ipi.c In function ‘irq_destroy_ipi’: kernel/irq/ipi.c:128:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type] Fixes: 7cec18a3906b "genirq: Add error code reporting to irq_{reserve,destroy}_ipi" Signed-off-by: Matt Redfearn Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/1464086550-24734-1-git-send-email-matt.redfearn@imgtec.com Signed-off-by: Thomas Gleixner --- kernel/irq/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index c42742208e5e..89b49f6773f0 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest) domain = data->domain; if (WARN_ON(domain == NULL)) - return; + return -EINVAL; if (!irq_domain_is_ipi(domain)) { pr_warn("Trying to destroy a non IPI domain!\n"); -- cgit v1.2.3 From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2016 11:19:07 +0200 Subject: sched/core: Fix remote wakeups Commit: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") ... introduced a bug: Mike Galbraith found that it introduced a performance regression, while Paul E. McKenney reported lost wakeups and bisected it to this commit. The reason is that I mis-read ttwu_queue() such that I assumed any wakeup that got a remote queue must have had the task migrated. Since this is not so; we need to transfer this information between queueing the wakeup and actually doing the wakeup. Use a new task_struct::sched_flag for this, we already write to sched_contributes_to_load in the wakeup path so this is a hot and modified cacheline. Reported-by: Paul E. McKenney Reported-by: Mike Galbraith Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Ben Segall Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matt Fleming Cc: Morten Rasmussen Cc: Oleg Nesterov Cc: Paul Turner Cc: Pavan Kondeti Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: byungchul.park@lge.com Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration") Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6cc0df970f1a..e053517a88b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1533,6 +1533,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_remote_wakeup:1; unsigned :0; /* force alignment to the next boundary */ /* unserialized, strictly 'current' */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 404c0784b1fc..7f2cae4620c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void) cookie = lockdep_pin_lock(&rq->lock); while (llist) { + int wake_flags = 0; + p = llist_entry(llist, struct task_struct, wake_entry); llist = llist_next(llist); - /* - * See ttwu_queue(); we only call ttwu_queue_remote() when - * its a x-cpu wakeup. 
- */ - ttwu_do_activate(rq, p, WF_MIGRATED, cookie); + + if (p->sched_remote_wakeup) + wake_flags = WF_MIGRATED; + + ttwu_do_activate(rq, p, wake_flags, cookie); } lockdep_unpin_lock(&rq->lock, cookie); @@ -1819,10 +1821,12 @@ void scheduler_ipi(void) irq_exit(); } -static void ttwu_queue_remote(struct task_struct *p, int cpu) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); + p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { if (!set_nr_if_polling(rq->idle)) smp_send_reschedule(cpu); @@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ - ttwu_queue_remote(p, cpu); + ttwu_queue_remote(p, cpu, wake_flags); return; } #endif -- cgit v1.2.3 From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 May 2016 00:04:58 -0400 Subject: add down_write_killable_nested() Signed-off-by: Al Viro --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index d1c12d160ace..d37fbb34d06f 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -156,6 +156,7 @@ extern void downgrade_write(struct rw_semaphore *sem); */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); +extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ @@ -176,6 +177,7 @@ extern void up_read_non_owner(struct rw_semaphore *sem); # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) +# define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c817216c1615..2e853ad93a3a 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + rwsem_release(&sem->dep_map, 1, _RET_IP_); + return -EINTR; + } + + rwsem_set_owner(sem); + return 0; +} + +EXPORT_SYMBOL(down_write_killable_nested); + void up_read_non_owner(struct rw_semaphore *sem) { __up_read(sem); -- cgit v1.2.3 From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 26 May 2016 15:16:22 -0700 Subject: mm: oom_reaper: remove some bloat mmput_async is currently used only from the oom_reaper which is defined only for CONFIG_MMU. We can save work_struct in mm_struct for !CONFIG_MMU. 
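For illustration only (the caller below is hypothetical), any future user outside the oom_reaper would now have to guard the async variant, since it no longer exists on !CONFIG_MMU builds:

        #include <linux/sched.h>

        static void example_put_mm(struct mm_struct *mm)
        {
        #ifdef CONFIG_MMU
                mmput_async(mm);        /* defer a possibly-final mmput() to a workqueue */
        #else
                mmput(mm);              /* no oom_reaper and no async variant without an MMU */
        #endif
        }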
[akpm@linux-foundation.org: fix typo, per Minchan] Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz Reported-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Minchan Kim Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 4 +++- kernel/fork.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d553855503e6..ca3e517980a0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -514,7 +514,9 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif +#ifdef CONFIG_MMU struct work_struct async_put_work; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 23e075dcdfe4..6e42ada26345 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2745,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -/* same as above but performs the slow path from the async kontext. Can +#ifdef CONFIG_MMU +/* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ extern void mmput_async(struct mm_struct *); +#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 47887bba944f..5c2c355aa97f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +#ifdef CONFIG_MMU static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); @@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm) schedule_work(&mm->async_put_work); } } +#endif /** * set_mm_exe_file - change a reference to the mm's executable file -- cgit v1.2.3 From 287980e49ffc0f6d911601e7e352a812ed27768e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 27 May 2016 23:23:25 +0200 Subject: remove lots of IS_ERR_VALUE abuses Most users of IS_ERR_VALUE() in the kernel are wrong, as they pass an 'int' into a function that takes an 'unsigned long' argument. This happens to work because the type is sign-extended on 64-bit architectures before it gets converted into an unsigned type. However, anything that passes an 'unsigned short' or 'unsigned int' argument into IS_ERR_VALUE() is guaranteed to be broken, as are 8-bit integers and types that are wider than 'unsigned long'. Andrzej Hajda has already fixed a lot of the worst abusers that were causing actual bugs, but it would be nice to prevent any users that are not passing 'unsigned long' arguments. This patch changes all users of IS_ERR_VALUE() that I could find on 32-bit ARM randconfig builds and x86 allmodconfig. For the moment, this doesn't change the definition of IS_ERR_VALUE() because there are probably still architecture specific users elsewhere. Almost all the warnings I got are for files that are better off using 'if (err)' or 'if (err < 0)'. The only legitimate user I could find that we get a warning for is the (32-bit only) freescale fman driver, so I did not remove the IS_ERR_VALUE() there but changed the type to 'unsigned long'. 
For 9pfs, I just worked around one user whose calling conventions are so obscure that I did not dare change the behavior. I was using this definition for testing: #define IS_ERR_VALUE(x) ((unsigned long*)NULL == (typeof (x)*)NULL && \ unlikely((unsigned long long)(x) >= (unsigned long long)(typeof(x))-MAX_ERRNO)) which ends up making all 16-bit or wider types work correctly with the most plausible interpretation of what IS_ERR_VALUE() was supposed to return according to its users, but also causes a compile-time warning for any users that do not pass an 'unsigned long' argument. I suggested this approach earlier this year, but back then we ended up deciding to just fix the users that are obviously broken. After the initial warning that caused me to get involved in the discussion (fs/gfs2/dir.c) showed up again in the mainline kernel, Linus asked me to send the whole thing again. [ Updated the 9p parts as per Al Viro - Linus ] Signed-off-by: Arnd Bergmann Cc: Andrzej Hajda Cc: Andrew Morton Link: https://lkml.org/lkml/2016/1/7/363 Link: https://lkml.org/lkml/2016/5/27/486 Acked-by: Srinivas Kandagatla # For nvmem part Signed-off-by: Linus Torvalds --- drivers/acpi/acpi_dbg.c | 22 +++++++++++----------- drivers/ata/sata_highbank.c | 2 +- drivers/clk/tegra/clk-tegra210.c | 2 +- drivers/cpufreq/omap-cpufreq.c | 2 +- drivers/crypto/caam/ctrl.c | 2 +- drivers/dma/sun4i-dma.c | 16 ++++++++-------- drivers/gpio/gpio-xlp.c | 2 +- drivers/gpu/drm/sti/sti_vtg.c | 4 ++-- drivers/gpu/drm/tilcdc/tilcdc_tfp410.c | 2 +- drivers/gpu/host1x/hw/intr_hw.c | 2 +- drivers/iommu/arm-smmu-v3.c | 18 +++++++++--------- drivers/iommu/arm-smmu.c | 8 ++++---- drivers/irqchip/irq-clps711x.c | 2 +- drivers/irqchip/irq-gic.c | 2 +- drivers/irqchip/irq-hip04.c | 2 +- drivers/irqchip/spear-shirq.c | 2 +- drivers/media/i2c/adp1653.c | 10 +++++----- drivers/media/platform/s5p-tv/mixer_drv.c | 2 +- drivers/mfd/twl4030-irq.c | 2 +- drivers/mmc/core/mmc.c | 4 ++-- drivers/mmc/host/dw_mmc.c | 6 +++--- drivers/mmc/host/sdhci-esdhc-imx.c | 2 +- drivers/mmc/host/sdhci-of-at91.c | 2 +- drivers/mmc/host/sdhci.c | 4 ++-- drivers/net/ethernet/freescale/fman/fman.c | 2 +- drivers/net/ethernet/freescale/fman/fman_muram.c | 4 ++-- drivers/net/ethernet/freescale/fman/fman_muram.h | 4 ++-- drivers/net/wireless/ti/wlcore/spi.c | 4 ++-- drivers/nvmem/core.c | 22 +++++++++++----------- drivers/tty/serial/amba-pl011.c | 2 +- drivers/tty/serial/sprd_serial.c | 2 +- drivers/video/fbdev/da8xx-fb.c | 4 ++-- fs/afs/write.c | 4 ---- fs/binfmt_flat.c | 6 +++--- fs/gfs2/dir.c | 15 +++++++++------ kernel/pid.c | 2 +- net/9p/client.c | 8 ++++---- sound/soc/qcom/lpass-platform.c | 4 ++-- 38 files changed, 102 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c index 15e4604efba7..1f4128487dd4 100644 --- a/drivers/acpi/acpi_dbg.c +++ b/drivers/acpi/acpi_dbg.c @@ -265,7 +265,7 @@ static int acpi_aml_write_kern(const char *buf, int len) char *p; ret = acpi_aml_lock_write(crc, ACPI_AML_OUT_KERN); - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; /* sync tail before inserting logs */ smp_mb(); @@ -286,7 +286,7 @@ static int acpi_aml_readb_kern(void) char *p; ret = acpi_aml_lock_read(crc, ACPI_AML_IN_KERN); - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; /* sync head before removing cmds */ smp_rmb(); @@ -330,7 +330,7 @@ again: goto again; break; } - if (IS_ERR_VALUE(ret)) + if (ret < 0) break; size += ret; count -= ret; @@ -373,7 +373,7 @@ again: if (ret == 0) goto again; } - if 
(IS_ERR_VALUE(ret)) + if (ret < 0) break; *(msg + size) = (char)ret; size++; @@ -526,7 +526,7 @@ static int acpi_aml_open(struct inode *inode, struct file *file) } acpi_aml_io.users++; err_lock: - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { if (acpi_aml_active_reader == file) acpi_aml_active_reader = NULL; } @@ -587,7 +587,7 @@ static int acpi_aml_read_user(char __user *buf, int len) char *p; ret = acpi_aml_lock_read(crc, ACPI_AML_OUT_USER); - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; /* sync head before removing logs */ smp_rmb(); @@ -602,7 +602,7 @@ static int acpi_aml_read_user(char __user *buf, int len) crc->tail = (crc->tail + n) & (ACPI_AML_BUF_SIZE - 1); ret = n; out: - acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !IS_ERR_VALUE(ret)); + acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !ret); return ret; } @@ -634,7 +634,7 @@ again: goto again; } } - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { if (!acpi_aml_running()) ret = 0; break; @@ -657,7 +657,7 @@ static int acpi_aml_write_user(const char __user *buf, int len) char *p; ret = acpi_aml_lock_write(crc, ACPI_AML_IN_USER); - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; /* sync tail before inserting cmds */ smp_mb(); @@ -672,7 +672,7 @@ static int acpi_aml_write_user(const char __user *buf, int len) crc->head = (crc->head + n) & (ACPI_AML_BUF_SIZE - 1); ret = n; out: - acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !IS_ERR_VALUE(ret)); + acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !ret); return n; } @@ -704,7 +704,7 @@ again: goto again; } } - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { if (!acpi_aml_running()) ret = 0; break; diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c index 8638d575b2b9..aafb8cc03523 100644 --- a/drivers/ata/sata_highbank.c +++ b/drivers/ata/sata_highbank.c @@ -197,7 +197,7 @@ static void highbank_set_em_messages(struct device *dev, for (i = 0; i < SGPIO_PINS; i++) { err = of_get_named_gpio(np, "calxeda,sgpio-gpio", i); - if (IS_ERR_VALUE(err)) + if (err < 0) return; pdata->sgpio_gpio[i] = err; diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index b8551813ec43..456cf586d2c2 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -1221,7 +1221,7 @@ static int tegra210_pll_fixed_mdiv_cfg(struct clk_hw *hw, p = rate >= params->vco_min ? 1 : -EINVAL; } - if (IS_ERR_VALUE(p)) + if (p < 0) return -EINVAL; cfg->m = tegra_pll_get_fixed_mdiv(hw, input_rate); diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index cead9bec4843..376e63ca94e8 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -54,7 +54,7 @@ static int omap_target(struct cpufreq_policy *policy, unsigned int index) freq = new_freq * 1000; ret = clk_round_rate(policy->clk, freq); - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { dev_warn(mpu_dev, "CPUfreq: Cannot find matching frequency for %lu\n", freq); diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c index 44d30b45f3cc..5ad5f3009ae0 100644 --- a/drivers/crypto/caam/ctrl.c +++ b/drivers/crypto/caam/ctrl.c @@ -402,7 +402,7 @@ int caam_get_era(void) ret = of_property_read_u32(caam_node, "fsl,sec-era", &prop); of_node_put(caam_node); - return IS_ERR_VALUE(ret) ? -ENOTSUPP : prop; + return ret ? 
-ENOTSUPP : prop; } EXPORT_SYMBOL(caam_get_era); diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c index e0df233dde92..57aa227bfadb 100644 --- a/drivers/dma/sun4i-dma.c +++ b/drivers/dma/sun4i-dma.c @@ -461,25 +461,25 @@ generate_ndma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest, /* Source burst */ ret = convert_burst(sconfig->src_maxburst); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret); /* Destination burst */ ret = convert_burst(sconfig->dst_maxburst); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret); /* Source bus width */ ret = convert_buswidth(sconfig->src_addr_width); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret); /* Destination bus width */ ret = convert_buswidth(sconfig->dst_addr_width); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret); @@ -518,25 +518,25 @@ generate_ddma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest, /* Source burst */ ret = convert_burst(sconfig->src_maxburst); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret); /* Destination burst */ ret = convert_burst(sconfig->dst_maxburst); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret); /* Source bus width */ ret = convert_buswidth(sconfig->src_addr_width); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret); /* Destination bus width */ ret = convert_buswidth(sconfig->dst_addr_width); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto fail; promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret); diff --git a/drivers/gpio/gpio-xlp.c b/drivers/gpio/gpio-xlp.c index 08897dc11915..1a33a19d95b9 100644 --- a/drivers/gpio/gpio-xlp.c +++ b/drivers/gpio/gpio-xlp.c @@ -393,7 +393,7 @@ static int xlp_gpio_probe(struct platform_device *pdev) irq_base = irq_alloc_descs(-1, 0, gc->ngpio, 0); else irq_base = irq_alloc_descs(-1, XLP_GPIO_IRQ_BASE, gc->ngpio, 0); - if (IS_ERR_VALUE(irq_base)) { + if (irq_base < 0) { dev_err(&pdev->dev, "Failed to allocate IRQ numbers\n"); return irq_base; } diff --git a/drivers/gpu/drm/sti/sti_vtg.c b/drivers/gpu/drm/sti/sti_vtg.c index 32c7986b63ab..6bf4ce466d20 100644 --- a/drivers/gpu/drm/sti/sti_vtg.c +++ b/drivers/gpu/drm/sti/sti_vtg.c @@ -437,7 +437,7 @@ static int vtg_probe(struct platform_device *pdev) return -EPROBE_DEFER; } else { vtg->irq = platform_get_irq(pdev, 0); - if (IS_ERR_VALUE(vtg->irq)) { + if (vtg->irq < 0) { DRM_ERROR("Failed to get VTG interrupt\n"); return vtg->irq; } @@ -447,7 +447,7 @@ static int vtg_probe(struct platform_device *pdev) ret = devm_request_threaded_irq(dev, vtg->irq, vtg_irq, vtg_irq_thread, IRQF_ONESHOT, dev_name(dev), vtg); - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { DRM_ERROR("Failed to register VTG interrupt\n"); return ret; } diff --git a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c index 7716f42f8aab..6b8c5b3bf588 100644 --- a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c +++ b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c @@ -342,7 +342,7 @@ static int tfp410_probe(struct platform_device *pdev) tfp410_mod->gpio = of_get_named_gpio_flags(node, "powerdn-gpio", 0, NULL); - if (IS_ERR_VALUE(tfp410_mod->gpio)) { + if (tfp410_mod->gpio < 0) { dev_warn(&pdev->dev, "No power down GPIO\n"); } else { ret = gpio_request(tfp410_mod->gpio, 
"DVI_PDn"); diff --git a/drivers/gpu/host1x/hw/intr_hw.c b/drivers/gpu/host1x/hw/intr_hw.c index 498b37e39058..e1e31e9e67cd 100644 --- a/drivers/gpu/host1x/hw/intr_hw.c +++ b/drivers/gpu/host1x/hw/intr_hw.c @@ -85,7 +85,7 @@ static int _host1x_intr_init_host_sync(struct host1x *host, u32 cpm, err = devm_request_irq(host->dev, host->intr_syncpt_irq, syncpt_thresh_isr, IRQF_SHARED, "host1x_syncpt", host); - if (IS_ERR_VALUE(err)) { + if (err < 0) { WARN_ON(1); return err; } diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index ebab33e77d67..94b68213c50d 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1477,7 +1477,7 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg; asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits); - if (IS_ERR_VALUE(asid)) + if (asid < 0) return asid; cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3, @@ -1508,7 +1508,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits); - if (IS_ERR_VALUE(vmid)) + if (vmid < 0) return vmid; cfg->vmid = (u16)vmid; @@ -1569,7 +1569,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) smmu_domain->pgtbl_ops = pgtbl_ops; ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); - if (IS_ERR_VALUE(ret)) + if (ret < 0) free_io_pgtable_ops(pgtbl_ops); return ret; @@ -1642,7 +1642,7 @@ static void arm_smmu_detach_dev(struct device *dev) struct arm_smmu_group *smmu_group = arm_smmu_group_get(dev); smmu_group->ste.bypass = true; - if (IS_ERR_VALUE(arm_smmu_install_ste_for_group(smmu_group))) + if (arm_smmu_install_ste_for_group(smmu_group) < 0) dev_warn(dev, "failed to install bypass STE\n"); smmu_group->domain = NULL; @@ -1694,7 +1694,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) smmu_group->ste.bypass = domain->type == IOMMU_DOMAIN_DMA; ret = arm_smmu_install_ste_for_group(smmu_group); - if (IS_ERR_VALUE(ret)) + if (ret < 0) smmu_group->domain = NULL; out_unlock: @@ -2235,7 +2235,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) arm_smmu_evtq_handler, arm_smmu_evtq_thread, 0, "arm-smmu-v3-evtq", smmu); - if (IS_ERR_VALUE(ret)) + if (ret < 0) dev_warn(smmu->dev, "failed to enable evtq irq\n"); } @@ -2244,7 +2244,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) ret = devm_request_irq(smmu->dev, irq, arm_smmu_cmdq_sync_handler, 0, "arm-smmu-v3-cmdq-sync", smmu); - if (IS_ERR_VALUE(ret)) + if (ret < 0) dev_warn(smmu->dev, "failed to enable cmdq-sync irq\n"); } @@ -2252,7 +2252,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) if (irq) { ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler, 0, "arm-smmu-v3-gerror", smmu); - if (IS_ERR_VALUE(ret)) + if (ret < 0) dev_warn(smmu->dev, "failed to enable gerror irq\n"); } @@ -2264,7 +2264,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu) arm_smmu_priq_thread, 0, "arm-smmu-v3-priq", smmu); - if (IS_ERR_VALUE(ret)) + if (ret < 0) dev_warn(smmu->dev, "failed to enable priq irq\n"); else diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index e206ce7a4e4b..9345a3fcb706 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -950,7 +950,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, ret = __arm_smmu_alloc_bitmap(smmu->context_map, 
start, smmu->num_context_banks); - if (IS_ERR_VALUE(ret)) + if (ret < 0) goto out_unlock; cfg->cbndx = ret; @@ -989,7 +989,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx]; ret = request_irq(irq, arm_smmu_context_fault, IRQF_SHARED, "arm-smmu-context-fault", domain); - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n", cfg->irptndx, irq); cfg->irptndx = INVALID_IRPTNDX; @@ -1099,7 +1099,7 @@ static int arm_smmu_master_configure_smrs(struct arm_smmu_device *smmu, for (i = 0; i < cfg->num_streamids; ++i) { int idx = __arm_smmu_alloc_bitmap(smmu->smr_map, 0, smmu->num_mapping_groups); - if (IS_ERR_VALUE(idx)) { + if (idx < 0) { dev_err(smmu->dev, "failed to allocate free SMR\n"); goto err_free_smrs; } @@ -1233,7 +1233,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) /* Ensure that the domain is finalised */ ret = arm_smmu_init_domain_context(domain, smmu); - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; /* diff --git a/drivers/irqchip/irq-clps711x.c b/drivers/irqchip/irq-clps711x.c index eb5eb0cd414d..2223b3f15d68 100644 --- a/drivers/irqchip/irq-clps711x.c +++ b/drivers/irqchip/irq-clps711x.c @@ -182,7 +182,7 @@ static int __init _clps711x_intc_init(struct device_node *np, writel_relaxed(0, clps711x_intc->intmr[2]); err = irq_alloc_descs(-1, 0, ARRAY_SIZE(clps711x_irqs), numa_node_id()); - if (IS_ERR_VALUE(err)) + if (err < 0) goto out_iounmap; clps711x_intc->ops.map = clps711x_intc_irq_map; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index b4e647179346..fbc4ae2afd29 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -1123,7 +1123,7 @@ static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start, irq_base = irq_alloc_descs(irq_start, 16, gic_irqs, numa_node_id()); - if (IS_ERR_VALUE(irq_base)) { + if (irq_base < 0) { WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", irq_start); irq_base = irq_start; diff --git a/drivers/irqchip/irq-hip04.c b/drivers/irqchip/irq-hip04.c index 9688d2e2a636..9e25d8ce08e5 100644 --- a/drivers/irqchip/irq-hip04.c +++ b/drivers/irqchip/irq-hip04.c @@ -402,7 +402,7 @@ hip04_of_init(struct device_node *node, struct device_node *parent) nr_irqs -= hwirq_base; /* calculate # of irqs to allocate */ irq_base = irq_alloc_descs(-1, hwirq_base, nr_irqs, numa_node_id()); - if (IS_ERR_VALUE(irq_base)) { + if (irq_base < 0) { pr_err("failed to allocate IRQ numbers\n"); return -EINVAL; } diff --git a/drivers/irqchip/spear-shirq.c b/drivers/irqchip/spear-shirq.c index 1ccd2abed65f..1518ba31a80c 100644 --- a/drivers/irqchip/spear-shirq.c +++ b/drivers/irqchip/spear-shirq.c @@ -232,7 +232,7 @@ static int __init shirq_init(struct spear_shirq **shirq_blocks, int block_nr, nr_irqs += shirq_blocks[i]->nr_irqs; virq_base = irq_alloc_descs(-1, 0, nr_irqs, 0); - if (IS_ERR_VALUE(virq_base)) { + if (virq_base < 0) { pr_err("%s: irq desc alloc failed\n", __func__); goto err_unmap; } diff --git a/drivers/media/i2c/adp1653.c b/drivers/media/i2c/adp1653.c index 9e1731c565e7..e191e295c951 100644 --- a/drivers/media/i2c/adp1653.c +++ b/drivers/media/i2c/adp1653.c @@ -95,7 +95,7 @@ static int adp1653_get_fault(struct adp1653_flash *flash) int rval; fault = i2c_smbus_read_byte_data(client, ADP1653_REG_FAULT); - if (IS_ERR_VALUE(fault)) + if (fault < 0) return fault; flash->fault |= fault; @@ -105,13 +105,13 @@ static int adp1653_get_fault(struct 
adp1653_flash *flash) /* Clear faults. */ rval = i2c_smbus_write_byte_data(client, ADP1653_REG_OUT_SEL, 0); - if (IS_ERR_VALUE(rval)) + if (rval < 0) return rval; flash->led_mode->val = V4L2_FLASH_LED_MODE_NONE; rval = adp1653_update_hw(flash); - if (IS_ERR_VALUE(rval)) + if (rval) return rval; return flash->fault; @@ -158,7 +158,7 @@ static int adp1653_get_ctrl(struct v4l2_ctrl *ctrl) int rval; rval = adp1653_get_fault(flash); - if (IS_ERR_VALUE(rval)) + if (rval) return rval; ctrl->cur.val = 0; @@ -184,7 +184,7 @@ static int adp1653_set_ctrl(struct v4l2_ctrl *ctrl) int rval; rval = adp1653_get_fault(flash); - if (IS_ERR_VALUE(rval)) + if (rval) return rval; if ((rval & (ADP1653_REG_FAULT_FLT_SCP | ADP1653_REG_FAULT_FLT_OT | diff --git a/drivers/media/platform/s5p-tv/mixer_drv.c b/drivers/media/platform/s5p-tv/mixer_drv.c index 5ef67774971d..8a5d19469ddc 100644 --- a/drivers/media/platform/s5p-tv/mixer_drv.c +++ b/drivers/media/platform/s5p-tv/mixer_drv.c @@ -146,7 +146,7 @@ int mxr_power_get(struct mxr_device *mdev) /* returning 1 means that power is already enabled, * so zero success be returned */ - if (IS_ERR_VALUE(ret)) + if (ret < 0) return ret; return 0; } diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c index 40e51b0baa46..b46c0cfc27d9 100644 --- a/drivers/mfd/twl4030-irq.c +++ b/drivers/mfd/twl4030-irq.c @@ -696,7 +696,7 @@ int twl4030_init_irq(struct device *dev, int irq_num) nr_irqs = TWL4030_PWR_NR_IRQS + TWL4030_CORE_NR_IRQS; irq_base = irq_alloc_descs(-1, 0, nr_irqs, 0); - if (IS_ERR_VALUE(irq_base)) { + if (irq_base < 0) { dev_err(dev, "Fail to allocate IRQ descs\n"); return irq_base; } diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index b81b08f81325..c984321d1881 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1276,7 +1276,7 @@ static int mmc_select_hs200(struct mmc_card *card) * switch to HS200 mode if bus width is set successfully. 
*/ err = mmc_select_bus_width(card); - if (!IS_ERR_VALUE(err)) { + if (!err) { val = EXT_CSD_TIMING_HS200 | card->drive_strength << EXT_CSD_DRV_STR_SHIFT; err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, @@ -1583,7 +1583,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, } else if (mmc_card_hs(card)) { /* Select the desired bus width optionally */ err = mmc_select_bus_width(card); - if (!IS_ERR_VALUE(err)) { + if (!err) { err = mmc_select_hs_ddr(card); if (err) goto free_card; diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 829a6eebcdce..2cc6123b1df9 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -1431,7 +1431,7 @@ static int dw_mci_get_ro(struct mmc_host *mmc) int gpio_ro = mmc_gpio_get_ro(mmc); /* Use platform get_ro function, else try on board write protect */ - if (!IS_ERR_VALUE(gpio_ro)) + if (gpio_ro >= 0) read_only = gpio_ro; else read_only = @@ -1454,7 +1454,7 @@ static int dw_mci_get_cd(struct mmc_host *mmc) if ((mmc->caps & MMC_CAP_NEEDS_POLL) || (mmc->caps & MMC_CAP_NONREMOVABLE)) present = 1; - else if (!IS_ERR_VALUE(gpio_cd)) + else if (gpio_cd >= 0) present = gpio_cd; else present = (mci_readl(slot->host, CDETECT) & (1 << slot->id)) @@ -2927,7 +2927,7 @@ static void dw_mci_enable_cd(struct dw_mci *host) if (slot->mmc->caps & MMC_CAP_NEEDS_POLL) return; - if (IS_ERR_VALUE(mmc_gpio_get_cd(slot->mmc))) + if (mmc_gpio_get_cd(slot->mmc) < 0) break; } if (i == host->num_slots) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 2d300d87cda8..9d3ae1f4bd3c 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1011,7 +1011,7 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev, if (ret) return ret; - if (!IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc))) + if (mmc_gpio_get_cd(host->mmc) >= 0) host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION; return 0; diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 25f779e09d8e..d4cef713d246 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -289,7 +289,7 @@ static int sdhci_at91_probe(struct platform_device *pdev) * to enable polling via device tree with broken-cd property. */ if (!(host->mmc->caps & MMC_CAP_NONREMOVABLE) && - IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc))) { + mmc_gpio_get_cd(host->mmc) < 0) { host->mmc->caps |= MMC_CAP_NEEDS_POLL; host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION; } diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index e010ea4eb6f5..0e3d7c056cb1 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1624,7 +1624,7 @@ static int sdhci_get_cd(struct mmc_host *mmc) * Try slot gpio detect, if defined it take precedence * over build in controller functionality */ - if (!IS_ERR_VALUE(gpio_cd)) + if (gpio_cd >= 0) return !!gpio_cd; /* If polling, assume that the card is always present. 
*/ @@ -3077,7 +3077,7 @@ int sdhci_add_host(struct sdhci_host *host) if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) && !(mmc->caps & MMC_CAP_NONREMOVABLE) && - IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc))) + mmc_gpio_get_cd(host->mmc) < 0) mmc->caps |= MMC_CAP_NEEDS_POLL; /* If there are external regulators, get them */ diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index bcb9dccada4d..1de2e1e51c2b 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -615,7 +615,7 @@ struct fman { struct fman_cfg *cfg; struct muram_info *muram; /* cam section in muram */ - int cam_offset; + unsigned long cam_offset; size_t cam_size; /* Fifo in MURAM */ int fifo_offset; diff --git a/drivers/net/ethernet/freescale/fman/fman_muram.c b/drivers/net/ethernet/freescale/fman/fman_muram.c index 4eb0e9ac7182..47394c45b6e8 100644 --- a/drivers/net/ethernet/freescale/fman/fman_muram.c +++ b/drivers/net/ethernet/freescale/fman/fman_muram.c @@ -129,7 +129,7 @@ unsigned long fman_muram_offset_to_vbase(struct muram_info *muram, * * Return: address of the allocated memory; NULL otherwise. */ -int fman_muram_alloc(struct muram_info *muram, size_t size) +unsigned long fman_muram_alloc(struct muram_info *muram, size_t size) { unsigned long vaddr; @@ -150,7 +150,7 @@ int fman_muram_alloc(struct muram_info *muram, size_t size) * * Free an allocated memory from FM-MURAM partition. */ -void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size) +void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size) { unsigned long addr = fman_muram_offset_to_vbase(muram, offset); diff --git a/drivers/net/ethernet/freescale/fman/fman_muram.h b/drivers/net/ethernet/freescale/fman/fman_muram.h index dbf0af9e5bb5..889649ad8931 100644 --- a/drivers/net/ethernet/freescale/fman/fman_muram.h +++ b/drivers/net/ethernet/freescale/fman/fman_muram.h @@ -44,8 +44,8 @@ struct muram_info *fman_muram_init(phys_addr_t base, size_t size); unsigned long fman_muram_offset_to_vbase(struct muram_info *muram, unsigned long offset); -int fman_muram_alloc(struct muram_info *muram, size_t size); +unsigned long fman_muram_alloc(struct muram_info *muram, size_t size); -void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size); +void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size); #endif /* __FM_MURAM_EXT */ diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c index 020ac1a4b408..cea9443c22a6 100644 --- a/drivers/net/wireless/ti/wlcore/spi.c +++ b/drivers/net/wireless/ti/wlcore/spi.c @@ -382,7 +382,7 @@ static int wlcore_probe_of(struct spi_device *spi, struct wl12xx_spi_glue *glue, ret = of_property_read_u32(dt_node, "ref-clock-frequency", &pdev_data->ref_clock_freq); - if (IS_ERR_VALUE(ret)) { + if (ret) { dev_err(glue->dev, "can't get reference clock frequency (%d)\n", ret); return ret; @@ -425,7 +425,7 @@ static int wl1271_probe(struct spi_device *spi) } ret = wlcore_probe_of(spi, glue, &pdev_data); - if (IS_ERR_VALUE(ret)) { + if (ret) { dev_err(glue->dev, "can't get device tree parameters (%d)\n", ret); return ret; diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index bb4ea123547f..965911d9b36a 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -113,7 +113,7 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj, rc = nvmem_reg_read(nvmem, pos, buf, count); - if (IS_ERR_VALUE(rc)) 
+ if (rc) return rc; return count; @@ -147,7 +147,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj, rc = nvmem_reg_write(nvmem, pos, buf, count); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; return count; @@ -366,7 +366,7 @@ static int nvmem_add_cells(struct nvmem_device *nvmem, } rval = nvmem_cell_info_to_nvmem_cell(nvmem, &info[i], cells[i]); - if (IS_ERR_VALUE(rval)) { + if (rval) { kfree(cells[i]); goto err; } @@ -963,7 +963,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem, rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; /* shift bits in-place */ @@ -998,7 +998,7 @@ void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len) return ERR_PTR(-ENOMEM); rc = __nvmem_cell_read(nvmem, cell, buf, len); - if (IS_ERR_VALUE(rc)) { + if (rc) { kfree(buf); return ERR_PTR(rc); } @@ -1083,7 +1083,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len) if (cell->bit_offset || cell->nbits) kfree(buf); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; return len; @@ -1111,11 +1111,11 @@ ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem, return -EINVAL; rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; rc = __nvmem_cell_read(nvmem, &cell, buf, &len); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; return len; @@ -1141,7 +1141,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem, return -EINVAL; rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; return nvmem_cell_write(&cell, buf, cell.bytes); @@ -1170,7 +1170,7 @@ int nvmem_device_read(struct nvmem_device *nvmem, rc = nvmem_reg_read(nvmem, offset, buf, bytes); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; return bytes; @@ -1198,7 +1198,7 @@ int nvmem_device_write(struct nvmem_device *nvmem, rc = nvmem_reg_write(nvmem, offset, buf, bytes); - if (IS_ERR_VALUE(rc)) + if (rc) return rc; diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index a2aa655f56c4..1b7331e40d79 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2360,7 +2360,7 @@ static int pl011_probe_dt_alias(int index, struct device *dev) return ret; ret = of_alias_get_id(np, "serial"); - if (IS_ERR_VALUE(ret)) { + if (ret < 0) { seen_dev_without_alias = true; ret = index; } else { diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c index 18971063f95f..699447aa8b43 100644 --- a/drivers/tty/serial/sprd_serial.c +++ b/drivers/tty/serial/sprd_serial.c @@ -654,7 +654,7 @@ static int sprd_probe_dt_alias(int index, struct device *dev) return ret; ret = of_alias_get_id(np, "serial"); - if (IS_ERR_VALUE(ret)) + if (ret < 0) ret = index; else if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) { dev_warn(dev, "requested serial port %d not available.\n", ret); diff --git a/drivers/video/fbdev/da8xx-fb.c b/drivers/video/fbdev/da8xx-fb.c index d8d583d32a37..c229b1a0d13b 100644 --- a/drivers/video/fbdev/da8xx-fb.c +++ b/drivers/video/fbdev/da8xx-fb.c @@ -713,7 +713,7 @@ static int da8xx_fb_config_clk_divider(struct da8xx_fb_par *par, if (par->lcdc_clk_rate != lcdc_clk_rate) { ret = clk_set_rate(par->lcdc_clk, lcdc_clk_rate); - if (IS_ERR_VALUE(ret)) { + if (ret) { dev_err(par->dev, "unable to set clock rate at %u\n", lcdc_clk_rate); @@ -784,7 +784,7 @@ static int lcd_init(struct da8xx_fb_par *par, const struct lcd_ctrl_config *cfg, int ret = 0; ret = da8xx_fb_calc_config_clk_divider(par, 
panel); - if (IS_ERR_VALUE(ret)) { + if (ret) { dev_err(par->dev, "unable to configure clock\n"); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 65de439bdc4f..14d506efd1aa 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) return 0; result = generic_file_write_iter(iocb, from); - if (IS_ERR_VALUE(result)) { - _leave(" = %zd", result); - return result; - } _leave(" = %zd", result); return result; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f723cd3a455c..caf9e39bb82b 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) "(%d != %d)", (unsigned) r, curid, id); goto failed; } else if ( ! p->lib_list[id].loaded && - IS_ERR_VALUE(load_flat_shared_library(id, p))) { + load_flat_shared_library(id, p) < 0) { printk("BINFMT_FLAT: failed to load library %d", id); goto failed; } @@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) res = prepare_binprm(&bprm); - if (!IS_ERR_VALUE(res)) + if (!res) res = load_flat_file(&bprm, libs, id, NULL); abort_creds(bprm.cred); @@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm) stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); - if (IS_ERR_VALUE(res)) + if (res < 0) return res; /* Update data segment pointers for all libraries */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 4a01f30e9995..271d93905bac 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { __be64 *hash; + int error; hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); - *leaf_out = be64_to_cpu(*(hash + index)); - return 0; + error = PTR_ERR_OR_ZERO(hash); + + if (!error) + *leaf_out = be64_to_cpu(*(hash + index)); + + return error; } static int get_first_leaf(struct gfs2_inode *dip, u32 index, @@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!IS_ERR_VALUE(error)) + if (!error) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (IS_ERR_VALUE(error)) + if (error) return error; /* Get the old leaf block */ diff --git a/kernel/pid.c b/kernel/pid.c index 4d73a834c7e6..f66162f2359b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); - if (IS_ERR_VALUE(nr)) { + if (nr < 0) { retval = nr; goto out_free; } diff --git a/net/9p/client.c b/net/9p/client.c index ea79ee9a7348..3fc94a49ccd5 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (err) goto out_err; - if (p9_is_proto_dotu(c)) + if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; - if (!err || !IS_ERR_VALUE(err)) { + if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", @@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, if (err) goto out_err; - if (p9_is_proto_dotu(c)) + if (p9_is_proto_dotu(c) && 
ecode < 512) err = -ecode; - if (!err || !IS_ERR_VALUE(err)) { + if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", diff --git a/sound/soc/qcom/lpass-platform.c b/sound/soc/qcom/lpass-platform.c index 6e8665430bd5..ddfe34434765 100644 --- a/sound/soc/qcom/lpass-platform.c +++ b/sound/soc/qcom/lpass-platform.c @@ -491,7 +491,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime) data->rdma_ch = v->alloc_dma_channel(drvdata, SNDRV_PCM_STREAM_PLAYBACK); - if (IS_ERR_VALUE(data->rdma_ch)) + if (data->rdma_ch < 0) return data->rdma_ch; drvdata->substream[data->rdma_ch] = psubstream; @@ -518,7 +518,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime) data->wrdma_ch = v->alloc_dma_channel(drvdata, SNDRV_PCM_STREAM_CAPTURE); - if (IS_ERR_VALUE(data->wrdma_ch)) + if (data->wrdma_ch < 0) goto capture_alloc_err; drvdata->substream[data->wrdma_ch] = csubstream; -- cgit v1.2.3 From c08376ac97cb202ec65320f3d90d5c4c5e2adb0b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 26 May 2016 17:21:05 -0700 Subject: timer: Export destroy_hrtimer_on_stack() hrtimer_init_on_stack() needs a matching call to destroy_hrtimer_on_stack(), so both need to be exported. Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8c7392c4fdbd..e99df0ff1d42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } -- cgit v1.2.3 From 9bd616e3dbedfc103f158197c8ad93678849b1ed Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 1 Jun 2016 18:52:16 +0100 Subject: cpuidle: Do not access cpuidle_devices when !CONFIG_CPU_IDLE The cpuidle_devices per-CPU variable is only defined when CPU_IDLE is enabled. Commit c8cc7d4de7a4 ("sched/idle: Reorganize the idle loop") removed the #ifdef CONFIG_CPU_IDLE around cpuidle_idle_call() with the compiler optimising away __this_cpu_read(cpuidle_devices). However, with CONFIG_UBSAN && !CONFIG_CPU_IDLE, this optimisation no longer happens and the kernel fails to link since cpuidle_devices is not defined. This patch introduces an accessor function for the current CPU cpuidle device (returning NULL when !CONFIG_CPU_IDLE) and uses it in cpuidle_idle_call(). Signed-off-by: Catalin Marinas Cc: 4.5+ # 4.5+ Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpuidle.h | 3 +++ kernel/sched/idle.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..07b83d32f66c 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -152,6 +152,8 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); +static inline struct cpuidle_device *cpuidle_get_device(void) +{return __this_cpu_read(cpuidle_devices); } #else static inline void disable_cpuidle(void) { } static inline bool cpuidle_not_available(struct cpuidle_driver *drv, @@ -187,6 +189,7 @@ static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } +static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif #if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..c5aeedf4e93a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void cpuidle_idle_call(void) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; -- cgit v1.2.3 From 0422e83d84ae24b933e4b0d4c1e0f0b4ae8a0a3b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 26 May 2016 21:08:17 +0100 Subject: locking/ww_mutex: Report recursive ww_mutex locking early Recursive locking for ww_mutexes was originally conceived as an exception. However, it is heavily used by the DRM atomic modesetting code. Currently, the recursive deadlock is checked after we have queued up for a busy-spin and as we never release the lock, we spin until kicked, whereupon the deadlock is discovered and reported. A simple solution for the now common problem is to move the recursive deadlock discovery to the first action when taking the ww_mutex. Suggested-by: Maarten Lankhorst Signed-off-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Maarten Lankhorst Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1464293297-19777-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..79d2d765a75f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); -- cgit v1.2.3 From 5b6c1b4d46b0dae4edea636a776d09f2064f4cd7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 4 Jun 2016 20:50:59 +0200 Subject: bpf, trace: use READ_ONCE for retrieving file ptr In bpf_perf_event_read() and bpf_perf_event_output(), we must use READ_ONCE() for fetching the struct file pointer, which could get updated concurrently, so we must prevent the compiler from potential refetching. We already do this with tail calls for fetching the related bpf_prog, but not so on stored perf events. Semantics for both are the same with regards to updates. Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 780bcbe1d4de..720b7bb01d43 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -198,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; @@ -247,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - file = (struct file *)array->ptrs[index]; + file = READ_ONCE(array->ptrs[index]); if (unlikely(!file)) return -ENOENT; -- cgit v1.2.3 From 2c610022711675ee908b903d242f0b90e1db661f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 10:19:51 +0200 Subject: locking/qspinlock: Fix spin_unlock_wait() some more While this prior commit: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()") ... fixes spin_is_locked() and spin_unlock_wait() for the usage in ipc/sem and netfilter, it does not in fact work right for the usage in task_work and futex. So while the 2 locks crossed problem: spin_lock(A) spin_lock(B) if (!spin_is_locked(B)) spin_unlock_wait(A) foo() foo(); ... 
works with the smp_mb() injected by both spin_is_locked() and spin_unlock_wait(), this is not sufficient for: flag = 1; smp_mb(); spin_lock() spin_unlock_wait() if (!flag) // add to lockless list // iterate lockless list ... because in this scenario, the store from spin_lock() can be delayed past the load of flag, uncrossing the variables and loosing the guarantee. This patch reworks spin_is_locked() and spin_unlock_wait() to work in both cases by exploiting the observation that while the lock byte store can be delayed, the contender must have registered itself visibly in other state contained in the word. It also allows for architectures to override both functions, as PPC and ARM64 have an additional issue for which we currently have no generic solution. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Giovanni Gherdovich Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: stable@vger.kernel.org # v4.2 and later Fixes: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()") Signed-off-by: Ingo Molnar --- include/asm-generic/qspinlock.h | 53 ++++++++++++------------------------ kernel/locking/qspinlock.c | 60 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 6bd05700d8c9..05f05f17a7c2 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -21,38 +21,34 @@ #include +/** + * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock + * @lock : Pointer to queued spinlock structure + * + * There is a very slight possibility of live-lock if the lockers keep coming + * and the waiter is just unfortunate enough to not see any unlock state. + */ +#ifndef queued_spin_unlock_wait +extern void queued_spin_unlock_wait(struct qspinlock *lock); +#endif + /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure * Return: 1 if it is locked, 0 otherwise */ +#ifndef queued_spin_is_locked static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* - * queued_spin_lock_slowpath() can ACQUIRE the lock before - * issuing the unordered store that sets _Q_LOCKED_VAL. - * - * See both smp_cond_acquire() sites for more detail. - * - * This however means that in code like: - * - * spin_lock(A) spin_lock(B) - * spin_unlock_wait(B) spin_is_locked(A) - * do_something() do_something() - * - * Both CPUs can end up running do_something() because the store - * setting _Q_LOCKED_VAL will pass through the loads in - * spin_unlock_wait() and/or spin_is_locked(). + * See queued_spin_unlock_wait(). * - * Avoid this by issuing a full memory barrier between the spin_lock() - * and the loads in spin_unlock_wait() and spin_is_locked(). - * - * Note that regular mutual exclusion doesn't care about this - * delayed store. + * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL + * isn't immediately observable. */ - smp_mb(); - return atomic_read(&lock->val) & _Q_LOCKED_MASK; + return atomic_read(&lock->val); } +#endif /** * queued_spin_value_unlocked - is the spinlock structure unlocked? 
@@ -122,21 +118,6 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock) } #endif -/** - * queued_spin_unlock_wait - wait until current lock holder releases the lock - * @lock : Pointer to queued spinlock structure - * - * There is a very slight possibility of live-lock if the lockers keep coming - * and the waiter is just unfortunate enough to not see any unlock state. - */ -static inline void queued_spin_unlock_wait(struct qspinlock *lock) -{ - /* See queued_spin_is_locked() */ - smp_mb(); - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); -} - #ifndef virt_spin_lock static __always_inline bool virt_spin_lock(struct qspinlock *lock) { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..5fc8c311b8fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,66 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. + * + * There are two paths that can issue the unordered store: + * + * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 + * + * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 + * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. + */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ + u32 val; + + for (;;) { + val = atomic_read(&lock->val); + + if (!val) /* not locked, we're done */ + goto done; + + if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ + break; + + /* not locked, but pending, wait until we observe the lock */ + cpu_relax(); + } + + /* any unlock is good */ + while (atomic_read(&lock->val) & _Q_LOCKED_MASK) + cpu_relax(); + +done: + smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif + #endif /* _GEN_PV_LOCK_SLOWPATH */ /** -- cgit v1.2.3 From 62a92c8f553e49270a0ee391b8733da71ab0aebc Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 7 Jun 2016 15:44:15 +0300 Subject: perf/core: Remove a redundant check There is no way to end up in _free_event() with event::pmu being NULL. The latter is initialized in event allocation path and remains set forever. In case of allocation failure, the error path doesn't use _free_event(). 
Having the check, however, suggests that it is possible to have a event::pmu==NULL situation in _free_event() and confuses the robots. This patch gets rid of the check. Reported-by: Dan Carpenter Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: eranian@google.com Cc: vince@deater.net Link: http://lkml.kernel.org/r/1465303455-26032-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 050a290c72c7..87e945d6ebb8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } + exclusive_event_destroy(event); + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } -- cgit v1.2.3 From 9c57259117b9c25472a3fa6d5a14d6bb3b647e87 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2016 17:58:40 -0500 Subject: sched/debug: Fix /proc/sched_debug regression Commit: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") ... introduced a bug when CONFIG_SCHEDSTATS is enabled and the runtime tunable is disabled (which is the default). The wait-time, sum-exec, and sum-sleep fields are missing from the /proc/sched_debug file in the runnable_tasks section. Fix it with a new schedstat_val() macro which returns the field value when schedstats is enabled and zero otherwise. The macro works with both SCHEDSTATS and !SCHEDSTATS. I put the macro in stats.h since it might end up being useful in other places. 
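For illustration, a minimal user-space model of the schedstat_val() idea (the CONFIG switch, struct and main() below are stand-ins for the example, not kernel code; the real macro lands in kernel/sched/stats.h as shown in the hunk below):

/*
 * Model: the accessor compiles in both configurations and simply reads
 * as zero while schedstats is disabled, so callers need no #ifdefs.
 */
#include <stdio.h>
#include <stdbool.h>

#define CONFIG_SCHEDSTATS 1			/* flip to 0 to model !CONFIG_SCHEDSTATS */

static bool schedstats_enabled;			/* runtime tunable, off by default */

#if CONFIG_SCHEDSTATS
# define schedstat_val(p, field)	(schedstats_enabled ? (p)->field : 0ULL)
#else
# define schedstat_val(p, field)	0ULL
#endif

struct task_stats { unsigned long long wait_sum; };

int main(void)
{
	struct task_stats t = { .wait_sum = 12345 };

	printf("wait_sum=%llu\n", schedstat_val(&t, wait_sum));	/* 0: stats off */
	schedstats_enabled = true;
	printf("wait_sum=%llu\n", schedstat_val(&t, wait_sum));	/* 12345 */
	return 0;
}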
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/bcda7c2790cf2ccbe586a28c02dd7b6fe7749a2b.1464994423.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 15 ++++----------- kernel/sched/stats.h | 3 +++ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cf905f655ba1..0368c393a336 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) { - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); - } -#else + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - 0LL, 0L, + SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), SPLIT_NS(p->se.sum_exec_runtime), - 0LL, 0L); -#endif + SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); + #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) # define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) # define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) +# define schedstat_val(rq, field) ((schedstat_enabled()) ? 
(rq)->field : 0) + #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) +# define schedstat_val(rq, field) 0 #endif #ifdef CONFIG_SCHED_INFO -- cgit v1.2.3 From 4698f88c06b893f2acc0b443004a53bf490fde7c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 7 Jun 2016 14:43:16 -0500 Subject: sched/debug: Fix 'schedstats=enable' cmdline option The 'schedstats=enable' option doesn't work, and also produces the following warning during boot: WARNING: CPU: 0 PID: 0 at /home/jpoimboe/git/linux/kernel/jump_label.c:61 static_key_slow_inc+0x8c/0xa0 static_key_slow_inc used before call to jump_label_init Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc1+ #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 0000000000000086 3ae3475a4bea95d4 ffffffff81e03da8 ffffffff8143fc83 ffffffff81e03df8 0000000000000000 ffffffff81e03de8 ffffffff810b1ffb 0000003d00000096 ffffffff823514d0 ffff88007ff197c8 0000000000000000 Call Trace: [] dump_stack+0x85/0xc2 [] __warn+0xcb/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] static_key_slow_inc+0x8c/0xa0 [] static_key_enable+0x16/0x40 [] setup_schedstats+0x29/0x94 [] unknown_bootoption+0x89/0x191 [] parse_args+0x297/0x4b0 [] start_kernel+0x1d8/0x4a9 [] ? set_init_arg+0x55/0x55 [] ? early_idt_handler_array+0x120/0x120 [] x86_64_start_reservations+0x2f/0x31 [] x86_64_start_kernel+0x14a/0x16d The problem is that it tries to update the 'sched_schedstats' static key before jump labels have been initialized. Changing jump_label_init() to be called earlier before parse_early_param() wouldn't fix it: it would still fail trying to poke_text() because mm isn't yet initialized. Instead, just create a temporary '__sched_schedstats' variable which can be copied to the static key later during sched_init() after jump labels have been initialized. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/453775fe3433bed65731a583e228ccea806d18cd.1465322027.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..385c947482e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +#ifdef CONFIG_SCHEDSTATS + DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS static void set_schedstats(bool enabled) { if (enabled) @@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str) if (!str) goto out; + /* + * This code is called before jump labels have been set up, so we can't + * change the static branch directly just yet. Instead set a temporary + * variable so init_schedstats() can do it later. 
+ */ if (!strcmp(str, "enable")) { - set_schedstats(true); + __sched_schedstats = true; ret = 1; } else if (!strcmp(str, "disable")) { - set_schedstats(false); + __sched_schedstats = false; ret = 1; } out: @@ -2293,6 +2300,11 @@ out: } __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ + set_schedstats(__sched_schedstats); +} + #ifdef CONFIG_PROC_SYSCTL int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write, set_schedstats(state); return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */ /* * fork()/clone()-time setup: @@ -7487,6 +7501,8 @@ void __init sched_init(void) #endif init_sched_fair_class(); + init_schedstats(); + scheduler_running = 1; } -- cgit v1.2.3 From 077fa7aed17de5022e44bf07dbaf732078b7b5b2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 8 Jun 2016 14:25:22 +0100 Subject: futex: Calculate the futex key based on a tail page for file-based futexes Mike Galbraith reported that the LTP test case futex_wake04 was broken by commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()"). This test case uses futexes backed by hugetlbfs pages and so there is an associated inode with a futex stored on such pages. The problem is that the key is being calculated based on the head page index of the hugetlbfs page and not the tail page. Prior to the optimisation, the page lock was used to stabilise mappings and pin the inode is file-backed which is overkill. If the page was a compound page, the head page was automatically looked up as part of the page lock operation but the tail page index was used to calculate the futex key. After the optimisation, the compound head is looked up early and the page lock is only relied upon to identify truncated pages, special pages or a shmem page moving to swapcache. The head page is looked up because without the page lock, special care has to be taken to pin the inode correctly. However, the tail page is still required to calculate the futex key so this patch records the tail page. On vanilla 4.6, the output of the test case is; futex_wake04 0 TINFO : Hugepagesize 2097152 futex_wake04 1 TFAIL : futex_wake04.c:126: Bug: wait_thread2 did not wake after 30 secs. With the patch applied futex_wake04 0 TINFO : Hugepagesize 2097152 futex_wake04 1 TPASS : Hi hydra, thread2 awake! 
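As an aside, a small user-space model of why the tail page index matters for the key of a file-backed compound page; the address constants and helpers below are invented for the example and are not the kernel's basepage_index():

/*
 * Two futexes in different 4K subpages of one 2M huge page must get
 * different keys.  Keying on the head page collapses them onto one
 * index; keying on the tail page, as the fix does, keeps them apart.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_SHIFT	21			/* 2 MiB huge page */

static unsigned long head_based_index(unsigned long uaddr)
{
	return (uaddr >> HPAGE_SHIFT) << (HPAGE_SHIFT - PAGE_SHIFT);
}

static unsigned long tail_based_index(unsigned long uaddr)
{
	return uaddr >> PAGE_SHIFT;		/* base page actually touched */
}

int main(void)
{
	unsigned long futex1 = 0x40000000UL;		/* first 4K subpage  */
	unsigned long futex2 = 0x40000000UL + 0x1000;	/* second 4K subpage */

	printf("head: %lu vs %lu\n", head_based_index(futex1), head_based_index(futex2));
	printf("tail: %lu vs %lu\n", tail_based_index(futex1), tail_based_index(futex2));
	return 0;
}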
Fixes: 65d8fc777f6d "futex: Remove requirement for lock_page() in get_futex_key()" Reported-and-tested-by: Mike Galbraith Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Cc: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20160608132522.GM2469@suse.de Signed-off-by: Thomas Gleixner --- kernel/futex.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ee25f5ba4aca..33664f70e2d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *tail; struct address_space *mapping; int err, ro = 0; @@ -530,7 +530,15 @@ again: * considered here and page lock forces unnecessarily serialization * From this point on, mapping will be re-verified if necessary and * page lock will be acquired only if it is unavoidable - */ + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. For + * base pages, there is no tail page and tail == page. + */ + tail = page; page = compound_head(page); mapping = READ_ONCE(page->mapping); @@ -654,7 +662,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.inode = inode; - key->shared.pgoff = basepage_index(page); + key->shared.pgoff = basepage_index(tail); rcu_read_unlock(); } -- cgit v1.2.3 From ba62bafe942b159a6109cbec780d36496e06b6c5 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 8 Jun 2016 15:33:53 -0700 Subject: kernel/relay.c: fix potential memory leak When relay_open_buf() fails in relay_open(), code will goto free_bufs, but chan is nowhere freed. Link: http://lkml.kernel.org/r/1464777927-19675-1-git-send-email-yizhouzhou@ict.ac.cn Signed-off-by: Zhouyi Zhou Cc: Jens Axboe Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -614,6 +614,7 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); + kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); -- cgit v1.2.3 From 29d6455178a09e1dc340380c582b13356227e8df Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 1 Jun 2016 11:55:07 +0200 Subject: sched: panic on corrupted stack end Until now, hitting this BUG_ON caused a recursive oops (because oops handling involves do_exit(), which calls into the scheduler, which in turn raises an oops), which caused stuff below the stack to be overwritten until a panic happened (e.g. via an oops in interrupt context, caused by the overwritten CPU index in the thread_info). Just panic directly. 
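A user-space analogy of the recursion being avoided, purely illustrative (abort() stands in for panic(); none of this is kernel code):

/*
 * If the corruption report is routed through a path that re-enters the
 * code that detected it, the report recurses and scribbles further down
 * the stack.  Failing hard on the spot avoids that.
 */
#include <stdio.h>
#include <stdlib.h>

static int stack_end_corrupted = 1;
static int old_behaviour;		/* set to 1 to see the recursive report */

static void schedule_debug(void);

static void oops_then_exit(void)
{
	fprintf(stderr, "oops: corrupted stack end\n");
	schedule_debug();		/* do_exit() re-enters the scheduler... */
}

static void schedule_debug(void)
{
	if (!stack_end_corrupted)
		return;
	if (old_behaviour) {
		oops_then_exit();	/* ...which detects it again: recursion */
	} else {
		fprintf(stderr, "panic: corrupted stack end detected inside scheduler\n");
		abort();		/* the fix: stop right here */
	}
}

int main(void)
{
	schedule_debug();
	return 0;
}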
Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d1f7149f8704..11546a6ed5df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3047,7 +3047,8 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { #ifdef CONFIG_SCHED_STACK_END_CHECK - BUG_ON(task_stack_end_corrupted(prev)); + if (task_stack_end_corrupted(prev)) + panic("corrupted stack end detected inside scheduler\n"); #endif if (unlikely(in_atomic_preempt_off())) { -- cgit v1.2.3 From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. 
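A short pthread sketch of the class of bug being closed, illustrative only (the variable names and numbers are invented; the actual fix simply takes rq->lock around post_init_entity_util_avg(), as the hunks below show):

/*
 * The updater stores an unclamped intermediate value before clamping it.
 * A reader that does not hold the same lock can observe load_avg as
 * (unsigned long)-1 and divide by (load_avg + 1) == 0.  Taking the lock,
 * as the reader below does, makes the intermediate state unobservable.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long load_avg = 5;

static void *update_load_avg(void *arg)
{
	unsigned long removed = 6;

	(void)arg;
	pthread_mutex_lock(&rq_lock);
	load_avg = load_avg - removed;		/* intermediate: wraps to ULONG_MAX */
	if ((long)load_avg < 0)
		load_avg = 0;			/* clamp, like max_t(long, ..., 0) */
	pthread_mutex_unlock(&rq_lock);
	return NULL;
}

static void post_init_util_avg(unsigned long util)
{
	pthread_mutex_lock(&rq_lock);		/* the fix: serialize with the updater */
	printf("util_avg = %lu\n", util / (load_avg + 1));
	pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, update_load_avg, NULL);
	post_init_util_avg(1024);
	pthread_join(t, NULL);
	return 0;
}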
Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..13d0896aff87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4e33ad12bb68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; -- cgit v1.2.3 From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Jun 2016 02:32:09 -0500 Subject: sched/debug: Fix deadlock when enabling sched events I see a hang when enabling sched events: echo 1 > /sys/kernel/debug/tracing/events/sched/enable The printk buffer shows: BUG: spinlock recursion on CPU#1, swapper/1/0 lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 ... Call Trace: [] dump_stack+0x85/0xc2 [] spin_dump+0x78/0xc0 [] do_raw_spin_lock+0x11a/0x150 [] _raw_spin_lock+0x61/0x80 [] ? try_to_wake_up+0x256/0x4e0 [] try_to_wake_up+0x256/0x4e0 [] ? _raw_spin_unlock_irqrestore+0x4a/0x80 [] wake_up_process+0x15/0x20 [] insert_work+0x84/0xc0 [] __queue_work+0x18f/0x660 [] queue_work_on+0x46/0x90 [] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper] [] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper] [] soft_cursor+0x1ad/0x230 [] bit_cursor+0x649/0x680 [] ? 
update_attr.isra.2+0x90/0x90 [] fbcon_cursor+0x14a/0x1c0 [] hide_cursor+0x28/0x90 [] vt_console_print+0x3bf/0x3f0 [] call_console_drivers.constprop.24+0x183/0x200 [] console_unlock+0x3d4/0x610 [] vprintk_emit+0x3c5/0x610 [] vprintk_default+0x29/0x40 [] printk+0x57/0x73 [] enqueue_entity+0xc2e/0xc70 [] enqueue_task_fair+0x59/0xab0 [] ? kvm_sched_clock_read+0x9/0x20 [] ? sched_clock+0x9/0x10 [] activate_task+0x5c/0xa0 [] ttwu_do_activate+0x54/0xb0 [] sched_ttwu_pending+0x7a/0xb0 [] scheduler_ipi+0x61/0x170 [] smp_trace_reschedule_interrupt+0x4f/0x2a0 [] trace_reschedule_interrupt+0x96/0xa0 [] ? native_safe_halt+0x6/0x10 [] ? trace_hardirqs_on+0xd/0x10 [] default_idle+0x20/0x1a0 [] arch_cpu_idle+0xf/0x20 [] default_idle_call+0x2f/0x50 [] cpu_startup_entry+0x37e/0x450 [] start_secondary+0x160/0x1a0 Note the hang only occurs when echoing the above from a physical serial console, not from an ssh session. The bug is caused by a deadlock where the task is trying to grab the rq lock twice because printk()'s aren't safe in sched code. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e33ad12bb68..a2348deab7a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3246,7 +3246,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " "kernel parameter schedstats=enabled or " "kernel.sched_schedstats=1\n"); -- cgit v1.2.3 From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. 
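The structural point is worth spelling out: the watchdog reset has to happen inside the potentially slow loop, not once after it. Below is a stand-alone sketch of that pattern with a toy software watchdog; touch_watchdog(), check_watchdog() and slow_print() are invented for illustration, and only touch_nmi_watchdog() and touch_all_softlockup_watchdogs() in the patch itself are real kernel interfaces:

#include <stdio.h>
#include <time.h>

static time_t watchdog_deadline;

/* Push the deadline out; equivalent in spirit to petting the NMI/softlockup watchdogs. */
static void touch_watchdog(int timeout)
{
	watchdog_deadline = time(NULL) + timeout;
}

/* In the kernel the check runs on other CPUs; here we just poll it inline. */
static void check_watchdog(void)
{
	if (time(NULL) > watchdog_deadline)
		fprintf(stderr, "soft lockup: loop ran too long without petting the watchdog\n");
}

/* Stands in for sched_show_task() writing to a slow serial console. */
static void slow_print(int i)
{
	printf("task %d ... lots of slow output ...\n", i);
}

int main(void)
{
	int i;

	touch_watchdog(10);
	for (i = 0; i < 1000; i++) {
		/* Pet the watchdog on every iteration; doing it once after the
		 * loop can be too late when the whole listing takes longer than
		 * the watchdog timeout. */
		touch_watchdog(10);
		slow_print(i);
		check_watchdog();
	}
	return 0;
}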
Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..4135ac83cb65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -- cgit v1.2.3 From df4565f9ebdc4d6dc50edc6e8fed08004e328332 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 24 May 2016 14:05:05 +0200 Subject: kernel/kcov: unproxify debugfs file's fops Since commit 49d200deaa68 ("debugfs: prevent access to removed files' private data"), a debugfs file's file_operations methods get proxied through lifetime aware wrappers. However, only a certain subset of the file_operations members is supported by debugfs and ->mmap isn't among them -- it appears to be NULL from the VFS layer's perspective. This behaviour breaks the /sys/kernel/debug/kcov file introduced concurrently with commit 5c9a8750a640 ("kernel: add kcov code coverage"). Since that file never gets removed, there is no file removal race and thus, a lifetime checking proxy isn't needed. Avoid the proxying for /sys/kernel/debug/kcov by creating it via debugfs_create_file_unsafe() rather than debugfs_create_file(). Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data") Fixes: 5c9a8750a640 ("kernel: add kcov code coverage") Reported-by: Sasha Levin Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- kernel/kcov.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = { static int __init kcov_init(void) { - if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + /* + * The kcov debugfs file won't ever get removed and thus, + * there is no need to protect it against removal races. The + * use of debugfs_create_file_unsafe() is actually safe here. 
+ */ + if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { pr_err("failed to create kcov in debugfs\n"); return -ENOMEM; } -- cgit v1.2.3 From 8974189222159154c55f24ddad33e3613960521a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 10:50:40 +0200 Subject: sched/fair: Fix cfs_rq avg tracking underflow As per commit: b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization") > the code generated from update_cfs_rq_load_avg(): > > if (atomic_long_read(&cfs_rq->removed_load_avg)) { > s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > sa->load_avg = max_t(long, sa->load_avg - r, 0); > sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); > removed_load = 1; > } > > turns into: > > ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax > ffffffff8108706b: 48 85 c0 test %rax,%rax > ffffffff8108706e: 74 40 je ffffffff810870b0 > ffffffff81087070: 4c 89 f8 mov %r15,%rax > ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) > ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) > ffffffff8108707e: 4c 89 f9 mov %r15,%rcx > ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx > ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) > ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx > > Which you'll note ends up with sa->load_avg -= r in memory at > ffffffff8108707a. So I _should_ have looked at other unserialized users of ->load_avg, but alas. Luckily nikbor reported a similar /0 from task_h_load() which instantly triggered recollection of this here problem. Aside from the intermediate value hitting memory and causing problems, there's another problem: the underflow detection relies on the signed bit. This reduces the effective width of the variables, IOW its effectively the same as having these variables be of signed type. This patch changes to a different means of unsigned underflow detection to not rely on the signed bit. This allows the variables to use the 'full' unsigned range. And it does so with explicit LOAD - STORE to ensure any intermediate value will never be visible in memory, allowing these unserialized loads. Note: GCC generates crap code for this, might warrant a look later. Note2: I say 'full' above, if we end up at U*_MAX we'll still explode; maybe we should do clamping on add too. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: kernel@kyup.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2348deab7a3..2ae68f0e3bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) } } +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
+ */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; } @@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); cfs_rq_util_change(cfs_rq); } -- cgit v1.2.3 From 70c8217acd4383e069fe1898bbad36ea4fcdbdcc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 16:10:42 -0400 Subject: tracing: Handle NULL formats in hold_module_trace_bprintk_format() If a task uses a non constant string for the format parameter in trace_printk(), then the trace_printk_fmt variable is set to NULL. This variable is then saved in the __trace_printk_fmt section. The function hold_module_trace_bprintk_format() checks to see if duplicate formats are used by modules, and reuses them if so (saves them to the list if it is new). But this function calls lookup_format() that does a strcmp() to the value (which is now NULL) and can cause a kernel oops. This wasn't an issue till 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") which added "__used" to the trace_printk_fmt variable, and before that, the kernel simply optimized it out (no NULL value was saved). The fix is simply to handle the NULL pointer in lookup_format() and have the caller ignore the value if it was NULL. 
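The defensive shape of the fix can be shown in a few lines of user-space C; fmt_table below is an invented stand-in for the kernel's saved bprintk formats, and plain NULL is returned instead of ERR_PTR(-EINVAL) to keep the sketch self-contained. The one line that matters is the early return before the NULL pointer can reach strcmp():

#include <stdio.h>
#include <string.h>

static const char *fmt_table[] = {
	"hello %d\n",
	"bye %s\n",
};

/* NULL-safe lookup: without the early check, strcmp(NULL, ...) is exactly the
 * oops described above when a non-constant trace_printk() format was saved as NULL. */
static const char *lookup_format(const char *fmt)
{
	size_t i;

	if (!fmt)
		return NULL;
	for (i = 0; i < sizeof(fmt_table) / sizeof(fmt_table[0]); i++) {
		if (strcmp(fmt_table[i], fmt) == 0)
			return fmt_table[i];
	}
	return NULL;
}

int main(void)
{
	printf("found: %s", lookup_format("hello %d\n"));
	if (!lookup_format(NULL))
		printf("NULL format rejected instead of crashing\n");
	return 0;
}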
Link: http://lkml.kernel.org/r/1464769870-18344-1-git-send-email-zhengjun.xing@intel.com Reported-by: xingzhen Acked-by: Namhyung Kim Fixes: 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } -- cgit v1.2.3 From 6720a305df74ca30bcc10fc316881641b6ff0c80 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:11:17 -0700 Subject: locking: avoid passing around 'thread_info' in mutex debugging code None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting back and forth between the two ("ti->task" to get the task_struct from the thread_info, and "task_thread_info(task)" to go the other way). No semantic change. Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/locking/mutex-debug.c | 12 ++++++------ kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.c | 6 +++--- kernel/locking/mutex.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + 
struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 79d2d765a75f..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -- cgit v1.2.3 From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jun 2016 18:52:17 +0200 Subject: locking/static_key: Fix concurrent static_key_slow_inc() The following scenario is possible: CPU 1 CPU 2 static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 0, no increment jump_label_lock() atomic_inc_return() -> key.enabled == 1 now static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 1, inc to 2 return ** static key is wrong! jump_label_update() jump_label_unlock() Testing the static key at the point marked by (**) will follow the wrong path for jumps that have not been patched yet. This can actually happen when creating many KVM virtual machines with userspace LAPIC emulation; just run several copies of the following program: #include #include #include #include int main(void) { for (;;) { int kvmfd = open("/dev/kvm", O_RDONLY); int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); close(ioctl(vmfd, KVM_CREATE_VCPU, 1)); close(vmfd); close(kvmfd); } return 0; } Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call. The static key's purpose is to skip NULL pointer checks and indeed one of the processes eventually dereferences NULL. As explained in the commit that introduced the bug: 706249c222f6 ("locking/static_keys: Rework update logic") jump_label_update() needs key.enabled to be true. The solution adopted here is to temporarily make key.enabled == -1, and use go down the slow path when key.enabled <= 0. 
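The ordering trick is easier to follow in a compact user-space analogue. The sketch below uses C11 atomics and invented names (enabled, slow_inc, a plain mutex in place of jump_label_lock); it is not the kernel code, but it shows why the fast path must refuse to increment until the count is strictly positive:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int enabled;		/* 0 = off, -1 = first enable in progress, >0 = use count */
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

static void slow_inc(void)
{
	int v = atomic_load(&enabled);

	/* Fast path: only bump the count when it is already positive, i.e. the
	 * first caller has *finished* patching. An inc_not_zero would also
	 * succeed while enabled == -1, which is exactly the reported bug. */
	while (v > 0) {
		if (atomic_compare_exchange_weak(&enabled, &v, v + 1))
			return;
	}

	pthread_mutex_lock(&update_lock);
	if (atomic_load(&enabled) == 0) {
		atomic_store(&enabled, -1);	/* non-zero, so the update sees the key as enabled... */
		/* ...jump_label_update() would run here... */
		atomic_store(&enabled, 1);	/* ...while other callers are held on the mutex */
	} else {
		atomic_fetch_add(&enabled, 1);
	}
	pthread_mutex_unlock(&update_lock);
}

int main(void)
{
	slow_inc();
	slow_inc();
	printf("enabled = %d\n", atomic_load(&enabled));	/* 2 */
	return 0;
}

Callers that arrive while enabled == -1 fall through to the mutex and wait, so nobody can return from slow_inc() before the first caller has finished the update.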
Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 706249c222f6 ("locking/static_keys: Rework update logic") Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com [ Small stylistic edits to the changelog and the code. ] Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 16 +++++++++++++--- kernel/jump_label.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0536524bb9eb..68904469fba1 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -117,13 +117,18 @@ struct module; #include +#ifdef HAVE_JUMP_LABEL + static inline int static_key_count(struct static_key *key) { - return atomic_read(&key->enabled); + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + return n >= 0 ? n : 1; } -#ifdef HAVE_JUMP_LABEL - #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_MASK 1UL @@ -162,6 +167,11 @@ extern void jump_label_apply_nops(struct module *mod); #else /* !HAVE_JUMP_LABEL */ +static inline int static_key_count(struct static_key *key) +{ + return atomic_read(&key->enabled); +} + static __always_inline void jump_label_init(void) { static_key_initialized = true; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. 
+ */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); -- cgit v1.2.3 From 094f469172e00d6ab0a3130b0e01c83b3cf3a98d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:01 +0300 Subject: sched/fair: Initialize throttle_count for new task-groups lazily Cgroup created inside throttled group must inherit current throttle_count. Broken throttle_count allows to nominate throttled entries as a next buddy, later this leads to null pointer dereference in pick_next_task_fair(). This patch initialize cfs_rq->throttle_count at first enqueue: laziness allows to skip locking all rq at group creation. Lazy approach also allows to skip full sub-tree scan at throttling hierarchy (not in this patch). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ae68f0e3bf5..8c5d8c0c8827 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4202,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -437,7 +437,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:15 +0300 Subject: sched/fair: Do not announce throttled next buddy in dequeue_task_fair() Hierarchy could be already throttled at this point. Throttled next buddy could trigger a NULL pointer dereference in pick_next_task_fair(). 
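The invariant behind the one-line fix: a "next buddy" hint must never point into a throttled hierarchy, because the picker assumes the hint is runnable. A toy model of that rule, with all names invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct entity {
	const char *name;
	bool throttled;			/* stands in for throttled_hierarchy(cfs_rq) */
};

static struct entity *next_buddy;	/* scheduling hint consulted by the picker */

static void set_next_buddy(struct entity *se)
{
	next_buddy = se;
}

/* Dequeue path: only advertise the parent as next buddy when its hierarchy is
 * not throttled; otherwise the later pick would chase the hint into an empty
 * (throttled) group, which is the NULL dereference described above. */
static void dequeue_sleep(struct entity *parent)
{
	if (parent && !parent->throttled)
		set_next_buddy(parent);
}

int main(void)
{
	struct entity ok = { "runnable group", false };
	struct entity throttled = { "throttled group", true };

	dequeue_sleep(&ok);
	printf("buddy: %s\n", next_buddy ? next_buddy->name : "(none)");
	dequeue_sleep(&throttled);
	printf("buddy: %s\n", next_buddy ? next_buddy->name : "(none)");	/* hint unchanged */
	return 0;
}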
Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5d8c0c8827..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4537,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v1.2.3 From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4135ac83cb65..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_active(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) + continue; + if (!cpu_online(dest_cpu)) continue; goto out; } -- cgit v1.2.3 From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. 
But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 +-- arch/ia64/Kconfig | 2 +- arch/ia64/include/asm/thread_info.h | 8 +++--- arch/mn10300/include/asm/thread_info.h | 2 +- arch/mn10300/kernel/kgdb.c | 3 +- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 3 +- include/linux/sched.h | 2 +- init/main.c | 4 +-- kernel/fork.c | 50 +++++++++++++++++----------------- 10 files changed, 41 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index e9734796531f..15996290fed4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -226,8 +226,8 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool -# Select if arch has its private alloc_thread_info() function -config ARCH_THREAD_INFO_ALLOCATOR +# Select if arch has its private alloc_thread_stack() function +config ARCH_THREAD_STACK_ALLOCATOR bool # Select if arch wants to size task_struct dynamically via arch_task_struct_size: diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index f80758cb7157..e109ee95e919 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -45,7 +45,7 @@ config IA64 select GENERIC_SMP_IDLE_THREAD select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR - select ARCH_THREAD_INFO_ALLOCATOR + select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA select GENERIC_TIME_VSYSCALL_OLD select SYSCTL_ARCH_UNALIGN_NO_WARN diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index aa995b67c3f5..d1212b84fb83 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,15 +48,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info_node(tsk, node) \ - ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define 
current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) +#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_info(ti) /* nothing */ +#define free_thread_stack(ti) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 4861a78c7160..f5f90bbf019d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void) } #ifndef CONFIG_KGDB -void arch_release_thread_info(struct thread_info *ti); +void arch_release_thread_stack(unsigned long *stack); #endif #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c index 99770823451a..2d7986c386fe 100644 --- a/arch/mn10300/kernel/kgdb.c +++ b/arch/mn10300/kernel/kgdb.c @@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs) * single-step state is cleared. At this point the breakpoints should have * been removed by __switch_to(). */ -void arch_release_thread_info(struct thread_info *ti) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *ti = (void *)stack; if (kgdb_sstep_thread == ti) { kgdb_sstep_thread = NULL; diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 4b7cef9e94e0..c1467ac59ce6 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -78,7 +78,7 @@ struct thread_info { #ifndef __ASSEMBLY__ -void arch_release_thread_info(struct thread_info *info); +void arch_release_thread_stack(unsigned long *stack); /* How to get the thread information struct from C. 
*/ register unsigned long stack_pointer __asm__("sp"); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 6b705ccc9cc1..a465d8372edd 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -73,8 +73,9 @@ void arch_cpu_idle(void) /* * Release a thread_info structure */ -void arch_release_thread_info(struct thread_info *info) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *info = (void *)stack; struct single_step_state *step_state = info->step_state; if (step_state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..253538f29ade 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3007,7 +3007,7 @@ static inline int object_is_on_stack(void *obj) return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -extern void thread_info_cache_init(void); +extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) diff --git a/init/main.c b/init/main.c index 4c17fda5c2ff..826fd57fa3aa 100644 --- a/init/main.c +++ b/init/main.c @@ -453,7 +453,7 @@ void __init __weak smp_setup_processor_id(void) } # if THREAD_SIZE >= PAGE_SIZE -void __init __weak thread_info_cache_init(void) +void __init __weak thread_stack_cache_init(void) { } #endif @@ -627,7 +627,7 @@ asmlinkage __visible void __init start_kernel(void) /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif - thread_info_cache_init(); + thread_stack_cache_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..37b9439b8c07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? 
page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); + struct page *page = virt_to_page(stack); memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 74070542099c66d87aebeacd7b54dc0e8b6a73f9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Jun 2016 14:50:16 -0700 Subject: oom, suspend: fix oom_reaper vs. oom_killer_disable race Tetsuo has reported the following potential oom_killer_disable vs. 
oom_reaper race: (1) freeze_processes() starts freezing user space threads. (2) Somebody (maybe a kernel thread) calls out_of_memory(). (3) The OOM killer calls mark_oom_victim() on a user space thread P1 which is already in __refrigerator(). (4) oom_killer_disable() sets oom_killer_disabled = true. (5) P1 leaves __refrigerator() and enters do_exit(). (6) The OOM reaper calls exit_oom_victim(P1) before P1 can call exit_oom_victim() itself. (7) oom_killer_disable() returns while P1 has not yet finished. (8) P1 performs IO/interferes with the freezer. This situation is unfortunate. We cannot move oom_killer_disable after all the freezable kernel threads are frozen because the oom victim might depend on some of those kthreads to make forward progress to exit, so we could deadlock. It is also far from trivial to teach the oom_reaper not to call exit_oom_victim() because then we would lose the forward-progress guarantee of the OOM killer and oom_killer_disable, because exit_mm->mmput might block and never call exit_oom_victim. It seems the easiest way forward is to work around this race by calling try_to_freeze_tasks again after oom_killer_disable. This will make sure that all the tasks are frozen or it bails out. Fixes: 449d777d7ad6 ("mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper") Link: http://lkml.kernel.org/r/1466597634-16199-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,6 +146,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... "); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; -- cgit v1.2.3 From 9521d39976db20f8ef9b56af66661482a17d5364 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 25 Jun 2016 21:53:30 +1000 Subject: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE Commit b235beea9e99 ("Clarify naming of thread info/stack allocators") breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE: kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack' kernel/fork.c:355:8: error: assignment from incompatible pointer type stack = alloc_thread_stack_node(tsk, node); ^ Fix it by renaming free_stack() to free_thread_stack(), and updating the return type of alloc_thread_stack_node().
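The breakage is a plain C pattern, independent of the kernel: when an #if/#else pair is supposed to provide the same helpers for two configurations, callers written against one branch fail to compile on configs that take the other. A minimal reproduction with invented names (SMALL_STACKS plays the role of THREAD_SIZE < PAGE_SIZE, and the libc allocators stand in for the page and kmem_cache allocators):

#include <stdio.h>
#include <stdlib.h>

#define SMALL_STACKS 1			/* plays the role of THREAD_SIZE < PAGE_SIZE */

#if !SMALL_STACKS
/* "Large stack" branch: both helpers use the agreed-upon names. */
static unsigned long *alloc_thread_stack_node(int node)
{
	(void)node;
	return malloc(4096);
}
static void free_thread_stack(unsigned long *stack)
{
	free(stack);
}
#else
/* "Small stack" branch. Before the fix this branch returned struct thread_info *
 * from the allocator and named the free helper free_stack(), so configs taking
 * this path hit the implicit-declaration and pointer-type errors quoted above. */
static unsigned long *alloc_thread_stack_node(int node)
{
	(void)node;
	return calloc(1, 512);		/* stands in for kmem_cache_alloc_node() */
}
static void free_thread_stack(unsigned long *stack)
{
	free(stack);			/* stands in for kmem_cache_free() */
}
#endif

int main(void)
{
	unsigned long *stack = alloc_thread_stack_node(0);

	printf("stack at %p\n", (void *)stack);
	free_thread_stack(stack);
	return 0;
}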
Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 37b9439b8c07..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -183,13 +183,13 @@ static inline void free_thread_stack(unsigned long *stack) # else static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_stack(unsigned long *stack) +static void free_thread_stack(unsigned long *stack) { kmem_cache_free(thread_stack_cache, stack); } -- cgit v1.2.3