From 90dc0d1ce890419f977e460b8258d25187dde64f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 6 Dec 2019 16:27:41 -0600 Subject: of: Rework and simplify phandle cache to use a fixed size The phandle cache was added to speed up of_find_node_by_phandle() by avoiding walking the whole DT to find a matching phandle. The implementation has several shortcomings: - The cache is designed to work on a linear set of phandle values. This is true for dtc generated DTs, but not for other cases such as Power. - The cache isn't enabled until of_core_init() and a typical system may see hundreds of calls to of_find_node_by_phandle() before that point. - The cache is freed and re-allocated when the number of phandles changes. - It takes a raw spinlock around a memory allocation which breaks on RT. Change the implementation to a fixed size and use hash_32() as the cache index. This greatly simplifies the implementation. It avoids the need for any re-alloc of the cache and taking a reference on nodes in the cache. We only have a single source of removing cache entries which is of_detach_node(). Using hash_32() removes any assumption on phandle values improving the hit rate for non-linear phandle values. The effect on linear values using hash_32() is about a 10% collision. The chances of thrashing on colliding values seems to be low. To compare performance, I used a RK3399 board which is a pretty typical system. I found that just measuring boot time as done previously is noisy and may be impacted by other things. Also bringing up secondary cores causes some issues with measuring, so I booted with 'nr_cpus=1'. With no caching, calls to of_find_node_by_phandle() take about 20124 us for 1248 calls. There's an additional 288 calls before time keeping is up. Using the average time per hit/miss with the cache, we can calculate these calls to take 690 us (277 hit / 11 miss) with a 128 entry cache and 13319 us with no cache or an uninitialized cache. Comparing the 3 implementations the time spent in of_find_node_by_phandle() is: no cache: 20124 us (+ 13319 us) 128 entry cache: 5134 us (+ 690 us) current cache: 819 us (+ 13319 us) We could move the allocation of the cache earlier to improve the current cache, but that just further complicates the situation as it needs to be after slab is up, so we can't do it when unflattening (which uses memblock). Reported-by: Sebastian Andrzej Siewior Cc: Michael Ellerman Cc: Segher Boessenkool Cc: Frank Rowand Acked-by: Sebastian Andrzej Siewior Reviewed-by: Frank Rowand Tested-by: Frank Rowand Signed-off-by: Rob Herring --- drivers/of/base.c | 130 ++++++++---------------------------------------- drivers/of/dynamic.c | 2 +- drivers/of/of_private.h | 6 +-- drivers/of/overlay.c | 10 ---- 4 files changed, 24 insertions(+), 124 deletions(-) (limited to 'drivers/of') diff --git a/drivers/of/base.c b/drivers/of/base.c index db7fbc0c0893..8d173fb3552a 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -135,115 +135,38 @@ int __weak of_node_to_nid(struct device_node *np) } #endif -/* - * Assumptions behind phandle_cache implementation: - * - phandle property values are in a contiguous range of 1..n - * - * If the assumptions do not hold, then - * - the phandle lookup overhead reduction provided by the cache - * will likely be less - */ +#define OF_PHANDLE_CACHE_BITS 7 +#define OF_PHANDLE_CACHE_SZ BIT(OF_PHANDLE_CACHE_BITS) -static struct device_node **phandle_cache; -static u32 phandle_cache_mask; +static struct device_node *phandle_cache[OF_PHANDLE_CACHE_SZ]; -/* - * Caller must hold devtree_lock. - */ -static void __of_free_phandle_cache(void) +static u32 of_phandle_cache_hash(phandle handle) { - u32 cache_entries = phandle_cache_mask + 1; - u32 k; - - if (!phandle_cache) - return; - - for (k = 0; k < cache_entries; k++) - of_node_put(phandle_cache[k]); - - kfree(phandle_cache); - phandle_cache = NULL; + return hash_32(handle, OF_PHANDLE_CACHE_BITS); } -int of_free_phandle_cache(void) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&devtree_lock, flags); - - __of_free_phandle_cache(); - - raw_spin_unlock_irqrestore(&devtree_lock, flags); - - return 0; -} -#if !defined(CONFIG_MODULES) -late_initcall_sync(of_free_phandle_cache); -#endif - /* * Caller must hold devtree_lock. */ -void __of_free_phandle_cache_entry(phandle handle) +void __of_phandle_cache_inv_entry(phandle handle) { - phandle masked_handle; + u32 handle_hash; struct device_node *np; if (!handle) return; - masked_handle = handle & phandle_cache_mask; - - if (phandle_cache) { - np = phandle_cache[masked_handle]; - if (np && handle == np->phandle) { - of_node_put(np); - phandle_cache[masked_handle] = NULL; - } - } -} - -void of_populate_phandle_cache(void) -{ - unsigned long flags; - u32 cache_entries; - struct device_node *np; - u32 phandles = 0; - - raw_spin_lock_irqsave(&devtree_lock, flags); - - __of_free_phandle_cache(); - - for_each_of_allnodes(np) - if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL) - phandles++; - - if (!phandles) - goto out; + handle_hash = of_phandle_cache_hash(handle); - cache_entries = roundup_pow_of_two(phandles); - phandle_cache_mask = cache_entries - 1; - - phandle_cache = kcalloc(cache_entries, sizeof(*phandle_cache), - GFP_ATOMIC); - if (!phandle_cache) - goto out; - - for_each_of_allnodes(np) - if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL) { - of_node_get(np); - phandle_cache[np->phandle & phandle_cache_mask] = np; - } - -out: - raw_spin_unlock_irqrestore(&devtree_lock, flags); + np = phandle_cache[handle_hash]; + if (np && handle == np->phandle) + phandle_cache[handle_hash] = NULL; } void __init of_core_init(void) { struct device_node *np; - of_populate_phandle_cache(); /* Create the kset, and register existing nodes */ mutex_lock(&of_mutex); @@ -253,8 +176,11 @@ void __init of_core_init(void) pr_err("failed to register existing nodes\n"); return; } - for_each_of_allnodes(np) + for_each_of_allnodes(np) { __of_attach_node_sysfs(np); + if (np->phandle && !phandle_cache[of_phandle_cache_hash(np->phandle)]) + phandle_cache[of_phandle_cache_hash(np->phandle)] = np; + } mutex_unlock(&of_mutex); /* Symlink in /proc as required by userspace ABI */ @@ -1235,36 +1161,24 @@ struct device_node *of_find_node_by_phandle(phandle handle) { struct device_node *np = NULL; unsigned long flags; - phandle masked_handle; + u32 handle_hash; if (!handle) return NULL; + handle_hash = of_phandle_cache_hash(handle); + raw_spin_lock_irqsave(&devtree_lock, flags); - masked_handle = handle & phandle_cache_mask; - - if (phandle_cache) { - if (phandle_cache[masked_handle] && - handle == phandle_cache[masked_handle]->phandle) - np = phandle_cache[masked_handle]; - if (np && of_node_check_flag(np, OF_DETACHED)) { - WARN_ON(1); /* did not uncache np on node removal */ - of_node_put(np); - phandle_cache[masked_handle] = NULL; - np = NULL; - } - } + if (phandle_cache[handle_hash] && + handle == phandle_cache[handle_hash]->phandle) + np = phandle_cache[handle_hash]; if (!np) { for_each_of_allnodes(np) if (np->phandle == handle && !of_node_check_flag(np, OF_DETACHED)) { - if (phandle_cache) { - /* will put when removed from cache */ - of_node_get(np); - phandle_cache[masked_handle] = np; - } + phandle_cache[handle_hash] = np; break; } } diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c index 49b16f76d78e..08fd823edac9 100644 --- a/drivers/of/dynamic.c +++ b/drivers/of/dynamic.c @@ -276,7 +276,7 @@ void __of_detach_node(struct device_node *np) of_node_set_flag(np, OF_DETACHED); /* race with of_find_node_by_phandle() prevented by devtree_lock */ - __of_free_phandle_cache_entry(np->phandle); + __of_phandle_cache_inv_entry(np->phandle); } /** diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 66294d29942a..207863c151a5 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -84,15 +84,11 @@ static inline void __of_detach_node_sysfs(struct device_node *np) {} int of_resolve_phandles(struct device_node *tree); #endif -#if defined(CONFIG_OF_DYNAMIC) -void __of_free_phandle_cache_entry(phandle handle); -#endif +void __of_phandle_cache_inv_entry(phandle handle); #if defined(CONFIG_OF_OVERLAY) void of_overlay_mutex_lock(void); void of_overlay_mutex_unlock(void); -int of_free_phandle_cache(void); -void of_populate_phandle_cache(void); #else static inline void of_overlay_mutex_lock(void) {}; static inline void of_overlay_mutex_unlock(void) {}; diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 9617b7df7c4d..97fe92c1f1d2 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -974,8 +974,6 @@ static int of_overlay_apply(const void *fdt, struct device_node *tree, goto err_free_overlay_changeset; } - of_populate_phandle_cache(); - ret = __of_changeset_apply_notify(&ovcs->cset); if (ret) pr_err("overlay apply changeset entry notify error %d\n", ret); @@ -1218,17 +1216,9 @@ int of_overlay_remove(int *ovcs_id) list_del(&ovcs->ovcs_list); - /* - * Disable phandle cache. Avoids race condition that would arise - * from removing cache entry when the associated node is deleted. - */ - of_free_phandle_cache(); - ret_apply = 0; ret = __of_changeset_revert_entries(&ovcs->cset, &ret_apply); - of_populate_phandle_cache(); - if (ret) { if (ret_apply) devicetree_state_flags |= DTSF_REVERT_FAIL; -- cgit v1.2.3 From 628d0e1cb7be79a20d2aca568411809e11ea343e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 31 Dec 2019 14:46:44 +0100 Subject: of: overlay: Remove blank line between assignment and check There used to be blank lines between assignment and check of the __of_changeset_revert_entries() result, to make the phandle cache management operations stand out. After the removal of those operations in commit 90dc0d1ce890419f ("of: Rework and simplify phandle cache to use a fixed size"), there is no longer a reason to have such a blank line. Remove the blank line, to rejoin visibly the status assignement and check, and to match coding style. Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- drivers/of/overlay.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/of') diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 97fe92c1f1d2..c9219fddf44b 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -1218,7 +1218,6 @@ int of_overlay_remove(int *ovcs_id) ret_apply = 0; ret = __of_changeset_revert_entries(&ovcs->cset, &ret_apply); - if (ret) { if (ret_apply) devicetree_state_flags |= DTSF_REVERT_FAIL; -- cgit v1.2.3 From dabf6b36b83a18d57e3d4b9d50544ed040d86255 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 26 Jan 2020 22:52:47 +1100 Subject: of: Add OF_DMA_DEFAULT_COHERENT & select it on powerpc There's an OF helper called of_dma_is_coherent(), which checks if a device has a "dma-coherent" property to see if the device is coherent for DMA. But on some platforms devices are coherent by default, and on some platforms it's not possible to update existing device trees to add the "dma-coherent" property. So add a Kconfig symbol to allow arch code to tell of_dma_is_coherent() that devices are coherent by default, regardless of the presence of the property. Select that symbol on powerpc when NOT_COHERENT_CACHE is not set, ie. when the system has a coherent cache. Fixes: 92ea637edea3 ("of: introduce of_dma_is_coherent() helper") Cc: stable@vger.kernel.org # v3.16+ Reported-by: Christian Zigotzky Tested-by: Christian Zigotzky Signed-off-by: Michael Ellerman Reviewed-by: Ulf Hansson Signed-off-by: Rob Herring --- arch/powerpc/Kconfig | 1 + drivers/of/Kconfig | 4 ++++ drivers/of/address.c | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/of') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..19f5aa8ac9a3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -238,6 +238,7 @@ config PPC select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_SG_DMA_LENGTH select OF + select OF_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE select OF_EARLY_FLATTREE select OLD_SIGACTION if PPC32 select OLD_SIGSUSPEND diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 37c2ccbefecd..d91618641be6 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -103,4 +103,8 @@ config OF_OVERLAY config OF_NUMA bool +config OF_DMA_DEFAULT_COHERENT + # arches should select this if DMA is coherent by default for OF devices + bool + endif # OF diff --git a/drivers/of/address.c b/drivers/of/address.c index 99c1b8058559..e8a39c3ec4d4 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -995,12 +995,16 @@ out: * @np: device node * * It returns true if "dma-coherent" property was found - * for this device in DT. + * for this device in the DT, or if DMA is coherent by + * default for OF devices on the current platform. */ bool of_dma_is_coherent(struct device_node *np) { struct device_node *node = of_node_get(np); + if (IS_ENABLED(CONFIG_OF_DMA_DEFAULT_COHERENT)) + return true; + while (node) { if (of_property_read_bool(node, "dma-coherent")) { of_node_put(node); -- cgit v1.2.3