From c710fcc5d95a5e0d1648c40c0b101e198bfc3459 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:42:55 -0800 Subject: ACPI: NUMA: Establish a new drivers/acpi/numa/ directory Currently hmat.c lives under an "hmat" directory which does not enhance the description of the file. The initial motivation for giving hmat.c its own directory was to delineate it as mm functionality in contrast to ACPI device driver functionality. As ACPI continues to play an increasing role in conveying memory location and performance topology information to the OS take the opportunity to co-locate these NUMA relevant tables in a combined directory. numa.c is renamed to srat.c and moved to drivers/acpi/numa/ along with hmat.c. Signed-off-by: Dan Williams Reviewed-by: Dave Hansen Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/acpi/Kconfig | 9 +- drivers/acpi/Makefile | 3 +- drivers/acpi/hmat/Kconfig | 11 - drivers/acpi/hmat/Makefile | 2 - drivers/acpi/hmat/hmat.c | 751 --------------------------------------------- drivers/acpi/numa.c | 489 ----------------------------- drivers/acpi/numa/Kconfig | 17 + drivers/acpi/numa/Makefile | 3 + drivers/acpi/numa/hmat.c | 751 +++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/numa/srat.c | 489 +++++++++++++++++++++++++++++ 10 files changed, 1262 insertions(+), 1263 deletions(-) delete mode 100644 drivers/acpi/hmat/Kconfig delete mode 100644 drivers/acpi/hmat/Makefile delete mode 100644 drivers/acpi/hmat/hmat.c delete mode 100644 drivers/acpi/numa.c create mode 100644 drivers/acpi/numa/Kconfig create mode 100644 drivers/acpi/numa/Makefile create mode 100644 drivers/acpi/numa/hmat.c create mode 100644 drivers/acpi/numa/srat.c (limited to 'drivers') diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index ebe1e9e5fd81..8c7c46065e9d 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -319,12 +319,6 @@ config ACPI_THERMAL To compile this driver as a module, choose M here: the module will be 
called thermal. -config ACPI_NUMA - bool "NUMA support" - depends on NUMA - depends on (X86 || IA64 || ARM64) - default y if IA64 || ARM64 - config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" @@ -473,8 +467,7 @@ config ACPI_REDUCED_HARDWARE_ONLY If you are unsure what to do, do not enable this option. source "drivers/acpi/nfit/Kconfig" -source "drivers/acpi/hmat/Kconfig" - +source "drivers/acpi/numa/Kconfig" source "drivers/acpi/apei/Kconfig" source "drivers/acpi/dptf/Kconfig" diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 5d361e4e3405..f08a661274e8 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -55,7 +55,6 @@ acpi-$(CONFIG_X86) += acpi_cmos_rtc.o acpi-$(CONFIG_X86) += x86/apple.o acpi-$(CONFIG_X86) += x86/utils.o acpi-$(CONFIG_DEBUG_FS) += debugfs.o -acpi-$(CONFIG_ACPI_NUMA) += numa.o acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o acpi-y += acpi_lpat.o acpi-$(CONFIG_ACPI_LPIT) += acpi_lpit.o @@ -80,7 +79,7 @@ obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI) += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o obj-$(CONFIG_ACPI_NFIT) += nfit/ -obj-$(CONFIG_ACPI_HMAT) += hmat/ +obj-$(CONFIG_ACPI_NUMA) += numa/ obj-$(CONFIG_ACPI) += acpi_memhotplug.o obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o obj-$(CONFIG_ACPI_BATTERY) += battery.o diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/hmat/Kconfig deleted file mode 100644 index 95a29964dbea..000000000000 --- a/drivers/acpi/hmat/Kconfig +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -config ACPI_HMAT - bool "ACPI Heterogeneous Memory Attribute Table Support" - depends on ACPI_NUMA - select HMEM_REPORTING - help - If set, this option has the kernel parse and report the - platform's ACPI HMAT (Heterogeneous Memory Attributes Table), - register memory initiators with their targets, and export - performance attributes through the node's sysfs device if - provided. 
diff --git a/drivers/acpi/hmat/Makefile b/drivers/acpi/hmat/Makefile deleted file mode 100644 index 1c20ef36a385..000000000000 --- a/drivers/acpi/hmat/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_ACPI_HMAT) := hmat.o diff --git a/drivers/acpi/hmat/hmat.c b/drivers/acpi/hmat/hmat.c deleted file mode 100644 index 8b0de8a3c647..000000000000 --- a/drivers/acpi/hmat/hmat.c +++ /dev/null @@ -1,751 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2019, Intel Corporation. - * - * Heterogeneous Memory Attributes Table (HMAT) representation - * - * This program parses and reports the platform's HMAT tables, and registers - * the applicable attributes with the node's interfaces. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static u8 hmat_revision; - -static LIST_HEAD(targets); -static LIST_HEAD(initiators); -static LIST_HEAD(localities); - -static DEFINE_MUTEX(target_lock); - -/* - * The defined enum order is used to prioritize attributes to break ties when - * selecting the best performing node. 
- */ -enum locality_types { - WRITE_LATENCY, - READ_LATENCY, - WRITE_BANDWIDTH, - READ_BANDWIDTH, -}; - -static struct memory_locality *localities_types[4]; - -struct target_cache { - struct list_head node; - struct node_cache_attrs cache_attrs; -}; - -struct memory_target { - struct list_head node; - unsigned int memory_pxm; - unsigned int processor_pxm; - struct node_hmem_attrs hmem_attrs; - struct list_head caches; - struct node_cache_attrs cache_attrs; - bool registered; -}; - -struct memory_initiator { - struct list_head node; - unsigned int processor_pxm; -}; - -struct memory_locality { - struct list_head node; - struct acpi_hmat_locality *hmat_loc; -}; - -static struct memory_initiator *find_mem_initiator(unsigned int cpu_pxm) -{ - struct memory_initiator *initiator; - - list_for_each_entry(initiator, &initiators, node) - if (initiator->processor_pxm == cpu_pxm) - return initiator; - return NULL; -} - -static struct memory_target *find_mem_target(unsigned int mem_pxm) -{ - struct memory_target *target; - - list_for_each_entry(target, &targets, node) - if (target->memory_pxm == mem_pxm) - return target; - return NULL; -} - -static __init void alloc_memory_initiator(unsigned int cpu_pxm) -{ - struct memory_initiator *initiator; - - if (pxm_to_node(cpu_pxm) == NUMA_NO_NODE) - return; - - initiator = find_mem_initiator(cpu_pxm); - if (initiator) - return; - - initiator = kzalloc(sizeof(*initiator), GFP_KERNEL); - if (!initiator) - return; - - initiator->processor_pxm = cpu_pxm; - list_add_tail(&initiator->node, &initiators); -} - -static __init void alloc_memory_target(unsigned int mem_pxm) -{ - struct memory_target *target; - - target = find_mem_target(mem_pxm); - if (target) - return; - - target = kzalloc(sizeof(*target), GFP_KERNEL); - if (!target) - return; - - target->memory_pxm = mem_pxm; - target->processor_pxm = PXM_INVAL; - list_add_tail(&target->node, &targets); - INIT_LIST_HEAD(&target->caches); -} - -static __init const char *hmat_data_type(u8 type) 
-{ - switch (type) { - case ACPI_HMAT_ACCESS_LATENCY: - return "Access Latency"; - case ACPI_HMAT_READ_LATENCY: - return "Read Latency"; - case ACPI_HMAT_WRITE_LATENCY: - return "Write Latency"; - case ACPI_HMAT_ACCESS_BANDWIDTH: - return "Access Bandwidth"; - case ACPI_HMAT_READ_BANDWIDTH: - return "Read Bandwidth"; - case ACPI_HMAT_WRITE_BANDWIDTH: - return "Write Bandwidth"; - default: - return "Reserved"; - } -} - -static __init const char *hmat_data_type_suffix(u8 type) -{ - switch (type) { - case ACPI_HMAT_ACCESS_LATENCY: - case ACPI_HMAT_READ_LATENCY: - case ACPI_HMAT_WRITE_LATENCY: - return " nsec"; - case ACPI_HMAT_ACCESS_BANDWIDTH: - case ACPI_HMAT_READ_BANDWIDTH: - case ACPI_HMAT_WRITE_BANDWIDTH: - return " MB/s"; - default: - return ""; - } -} - -static u32 hmat_normalize(u16 entry, u64 base, u8 type) -{ - u32 value; - - /* - * Check for invalid and overflow values - */ - if (entry == 0xffff || !entry) - return 0; - else if (base > (UINT_MAX / (entry))) - return 0; - - /* - * Divide by the base unit for version 1, convert latency from - * picosenonds to nanoseconds if revision 2. 
- */ - value = entry * base; - if (hmat_revision == 1) { - if (value < 10) - return 0; - value = DIV_ROUND_UP(value, 10); - } else if (hmat_revision == 2) { - switch (type) { - case ACPI_HMAT_ACCESS_LATENCY: - case ACPI_HMAT_READ_LATENCY: - case ACPI_HMAT_WRITE_LATENCY: - value = DIV_ROUND_UP(value, 1000); - break; - default: - break; - } - } - return value; -} - -static void hmat_update_target_access(struct memory_target *target, - u8 type, u32 value) -{ - switch (type) { - case ACPI_HMAT_ACCESS_LATENCY: - target->hmem_attrs.read_latency = value; - target->hmem_attrs.write_latency = value; - break; - case ACPI_HMAT_READ_LATENCY: - target->hmem_attrs.read_latency = value; - break; - case ACPI_HMAT_WRITE_LATENCY: - target->hmem_attrs.write_latency = value; - break; - case ACPI_HMAT_ACCESS_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; - target->hmem_attrs.write_bandwidth = value; - break; - case ACPI_HMAT_READ_BANDWIDTH: - target->hmem_attrs.read_bandwidth = value; - break; - case ACPI_HMAT_WRITE_BANDWIDTH: - target->hmem_attrs.write_bandwidth = value; - break; - default: - break; - } -} - -static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) -{ - struct memory_locality *loc; - - loc = kzalloc(sizeof(*loc), GFP_KERNEL); - if (!loc) { - pr_notice_once("Failed to allocate HMAT locality\n"); - return; - } - - loc->hmat_loc = hmat_loc; - list_add_tail(&loc->node, &localities); - - switch (hmat_loc->data_type) { - case ACPI_HMAT_ACCESS_LATENCY: - localities_types[READ_LATENCY] = loc; - localities_types[WRITE_LATENCY] = loc; - break; - case ACPI_HMAT_READ_LATENCY: - localities_types[READ_LATENCY] = loc; - break; - case ACPI_HMAT_WRITE_LATENCY: - localities_types[WRITE_LATENCY] = loc; - break; - case ACPI_HMAT_ACCESS_BANDWIDTH: - localities_types[READ_BANDWIDTH] = loc; - localities_types[WRITE_BANDWIDTH] = loc; - break; - case ACPI_HMAT_READ_BANDWIDTH: - localities_types[READ_BANDWIDTH] = loc; - break; - case ACPI_HMAT_WRITE_BANDWIDTH: - 
localities_types[WRITE_BANDWIDTH] = loc; - break; - default: - break; - } -} - -static __init int hmat_parse_locality(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_hmat_locality *hmat_loc = (void *)header; - struct memory_target *target; - unsigned int init, targ, total_size, ipds, tpds; - u32 *inits, *targs, value; - u16 *entries; - u8 type, mem_hier; - - if (hmat_loc->header.length < sizeof(*hmat_loc)) { - pr_notice("HMAT: Unexpected locality header length: %d\n", - hmat_loc->header.length); - return -EINVAL; - } - - type = hmat_loc->data_type; - mem_hier = hmat_loc->flags & ACPI_HMAT_MEMORY_HIERARCHY; - ipds = hmat_loc->number_of_initiator_Pds; - tpds = hmat_loc->number_of_target_Pds; - total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + - sizeof(*inits) * ipds + sizeof(*targs) * tpds; - if (hmat_loc->header.length < total_size) { - pr_notice("HMAT: Unexpected locality header length:%d, minimum required:%d\n", - hmat_loc->header.length, total_size); - return -EINVAL; - } - - pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%d Target Domains:%d Base:%lld\n", - hmat_loc->flags, hmat_data_type(type), ipds, tpds, - hmat_loc->entry_base_unit); - - inits = (u32 *)(hmat_loc + 1); - targs = inits + ipds; - entries = (u16 *)(targs + tpds); - for (init = 0; init < ipds; init++) { - alloc_memory_initiator(inits[init]); - for (targ = 0; targ < tpds; targ++) { - value = hmat_normalize(entries[init * tpds + targ], - hmat_loc->entry_base_unit, - type); - pr_info(" Initiator-Target[%d-%d]:%d%s\n", - inits[init], targs[targ], value, - hmat_data_type_suffix(type)); - - if (mem_hier == ACPI_HMAT_MEMORY) { - target = find_mem_target(targs[targ]); - if (target && target->processor_pxm == inits[init]) - hmat_update_target_access(target, type, value); - } - } - } - - if (mem_hier == ACPI_HMAT_MEMORY) - hmat_add_locality(hmat_loc); - - return 0; -} - -static __init int hmat_parse_cache(union acpi_subtable_headers *header, - 
const unsigned long end) -{ - struct acpi_hmat_cache *cache = (void *)header; - struct memory_target *target; - struct target_cache *tcache; - u32 attrs; - - if (cache->header.length < sizeof(*cache)) { - pr_notice("HMAT: Unexpected cache header length: %d\n", - cache->header.length); - return -EINVAL; - } - - attrs = cache->cache_attributes; - pr_info("HMAT: Cache: Domain:%d Size:%llu Attrs:%08x SMBIOS Handles:%d\n", - cache->memory_PD, cache->cache_size, attrs, - cache->number_of_SMBIOShandles); - - target = find_mem_target(cache->memory_PD); - if (!target) - return 0; - - tcache = kzalloc(sizeof(*tcache), GFP_KERNEL); - if (!tcache) { - pr_notice_once("Failed to allocate HMAT cache info\n"); - return 0; - } - - tcache->cache_attrs.size = cache->cache_size; - tcache->cache_attrs.level = (attrs & ACPI_HMAT_CACHE_LEVEL) >> 4; - tcache->cache_attrs.line_size = (attrs & ACPI_HMAT_CACHE_LINE_SIZE) >> 16; - - switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) { - case ACPI_HMAT_CA_DIRECT_MAPPED: - tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP; - break; - case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING: - tcache->cache_attrs.indexing = NODE_CACHE_INDEXED; - break; - case ACPI_HMAT_CA_NONE: - default: - tcache->cache_attrs.indexing = NODE_CACHE_OTHER; - break; - } - - switch ((attrs & ACPI_HMAT_WRITE_POLICY) >> 12) { - case ACPI_HMAT_CP_WB: - tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_BACK; - break; - case ACPI_HMAT_CP_WT: - tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_THROUGH; - break; - case ACPI_HMAT_CP_NONE: - default: - tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_OTHER; - break; - } - list_add_tail(&tcache->node, &target->caches); - - return 0; -} - -static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_hmat_proximity_domain *p = (void *)header; - struct memory_target *target = NULL; - - if (p->header.length != sizeof(*p)) { - pr_notice("HMAT: Unexpected address range 
header length: %d\n", - p->header.length); - return -EINVAL; - } - - if (hmat_revision == 1) - pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%d\n", - p->reserved3, p->reserved4, p->flags, p->processor_PD, - p->memory_PD); - else - pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%d\n", - p->flags, p->processor_PD, p->memory_PD); - - if (p->flags & ACPI_HMAT_MEMORY_PD_VALID && hmat_revision == 1) { - target = find_mem_target(p->memory_PD); - if (!target) { - pr_debug("HMAT: Memory Domain missing from SRAT\n"); - return -EINVAL; - } - } - if (target && p->flags & ACPI_HMAT_PROCESSOR_PD_VALID) { - int p_node = pxm_to_node(p->processor_PD); - - if (p_node == NUMA_NO_NODE) { - pr_debug("HMAT: Invalid Processor Domain\n"); - return -EINVAL; - } - target->processor_pxm = p_node; - } - - return 0; -} - -static int __init hmat_parse_subtable(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_hmat_structure *hdr = (void *)header; - - if (!hdr) - return -EINVAL; - - switch (hdr->type) { - case ACPI_HMAT_TYPE_PROXIMITY: - return hmat_parse_proximity_domain(header, end); - case ACPI_HMAT_TYPE_LOCALITY: - return hmat_parse_locality(header, end); - case ACPI_HMAT_TYPE_CACHE: - return hmat_parse_cache(header, end); - default: - return -EINVAL; - } -} - -static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_srat_mem_affinity *ma = (void *)header; - - if (!ma) - return -EINVAL; - if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) - return 0; - alloc_memory_target(ma->proximity_domain); - return 0; -} - -static u32 hmat_initiator_perf(struct memory_target *target, - struct memory_initiator *initiator, - struct acpi_hmat_locality *hmat_loc) -{ - unsigned int ipds, tpds, i, idx = 0, tdx = 0; - u32 *inits, *targs; - u16 *entries; - - ipds = hmat_loc->number_of_initiator_Pds; - tpds = hmat_loc->number_of_target_Pds; - inits = (u32 
*)(hmat_loc + 1); - targs = inits + ipds; - entries = (u16 *)(targs + tpds); - - for (i = 0; i < ipds; i++) { - if (inits[i] == initiator->processor_pxm) { - idx = i; - break; - } - } - - if (i == ipds) - return 0; - - for (i = 0; i < tpds; i++) { - if (targs[i] == target->memory_pxm) { - tdx = i; - break; - } - } - if (i == tpds) - return 0; - - return hmat_normalize(entries[idx * tpds + tdx], - hmat_loc->entry_base_unit, - hmat_loc->data_type); -} - -static bool hmat_update_best(u8 type, u32 value, u32 *best) -{ - bool updated = false; - - if (!value) - return false; - - switch (type) { - case ACPI_HMAT_ACCESS_LATENCY: - case ACPI_HMAT_READ_LATENCY: - case ACPI_HMAT_WRITE_LATENCY: - if (!*best || *best > value) { - *best = value; - updated = true; - } - break; - case ACPI_HMAT_ACCESS_BANDWIDTH: - case ACPI_HMAT_READ_BANDWIDTH: - case ACPI_HMAT_WRITE_BANDWIDTH: - if (!*best || *best < value) { - *best = value; - updated = true; - } - break; - } - - return updated; -} - -static int initiator_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct memory_initiator *ia; - struct memory_initiator *ib; - unsigned long *p_nodes = priv; - - ia = list_entry(a, struct memory_initiator, node); - ib = list_entry(b, struct memory_initiator, node); - - set_bit(ia->processor_pxm, p_nodes); - set_bit(ib->processor_pxm, p_nodes); - - return ia->processor_pxm - ib->processor_pxm; -} - -static void hmat_register_target_initiators(struct memory_target *target) -{ - static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); - struct memory_initiator *initiator; - unsigned int mem_nid, cpu_nid; - struct memory_locality *loc = NULL; - u32 best = 0; - int i; - - mem_nid = pxm_to_node(target->memory_pxm); - /* - * If the Address Range Structure provides a local processor pxm, link - * only that one. Otherwise, find the best performance attributes and - * register all initiators that match. 
- */ - if (target->processor_pxm != PXM_INVAL) { - cpu_nid = pxm_to_node(target->processor_pxm); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - return; - } - - if (list_empty(&localities)) - return; - - /* - * We need the initiator list sorted so we can use bitmap_clear for - * previously set initiators when we find a better memory accessor. - * We'll also use the sorting to prime the candidate nodes with known - * initiators. - */ - bitmap_zero(p_nodes, MAX_NUMNODES); - list_sort(p_nodes, &initiators, initiator_cmp); - for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { - loc = localities_types[i]; - if (!loc) - continue; - - best = 0; - list_for_each_entry(initiator, &initiators, node) { - u32 value; - - if (!test_bit(initiator->processor_pxm, p_nodes)) - continue; - - value = hmat_initiator_perf(target, initiator, loc->hmat_loc); - if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) - bitmap_clear(p_nodes, 0, initiator->processor_pxm); - if (value != best) - clear_bit(initiator->processor_pxm, p_nodes); - } - if (best) - hmat_update_target_access(target, loc->hmat_loc->data_type, best); - } - - for_each_set_bit(i, p_nodes, MAX_NUMNODES) { - cpu_nid = pxm_to_node(i); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - } -} - -static void hmat_register_target_cache(struct memory_target *target) -{ - unsigned mem_nid = pxm_to_node(target->memory_pxm); - struct target_cache *tcache; - - list_for_each_entry(tcache, &target->caches, node) - node_add_cache(mem_nid, &tcache->cache_attrs); -} - -static void hmat_register_target_perf(struct memory_target *target) -{ - unsigned mem_nid = pxm_to_node(target->memory_pxm); - node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); -} - -static void hmat_register_target(struct memory_target *target) -{ - int nid = pxm_to_node(target->memory_pxm); - - /* - * Skip offline nodes. 
This can happen when memory - * marked EFI_MEMORY_SP, "specific purpose", is applied - * to all the memory in a promixity domain leading to - * the node being marked offline / unplugged, or if - * memory-only "hotplug" node is offline. - */ - if (nid == NUMA_NO_NODE || !node_online(nid)) - return; - - mutex_lock(&target_lock); - if (!target->registered) { - hmat_register_target_initiators(target); - hmat_register_target_cache(target); - hmat_register_target_perf(target); - target->registered = true; - } - mutex_unlock(&target_lock); -} - -static void hmat_register_targets(void) -{ - struct memory_target *target; - - list_for_each_entry(target, &targets, node) - hmat_register_target(target); -} - -static int hmat_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_target *target; - struct memory_notify *mnb = arg; - int pxm, nid = mnb->status_change_nid; - - if (nid == NUMA_NO_NODE || action != MEM_ONLINE) - return NOTIFY_OK; - - pxm = node_to_pxm(nid); - target = find_mem_target(pxm); - if (!target) - return NOTIFY_OK; - - hmat_register_target(target); - return NOTIFY_OK; -} - -static struct notifier_block hmat_callback_nb = { - .notifier_call = hmat_callback, - .priority = 2, -}; - -static __init void hmat_free_structures(void) -{ - struct memory_target *target, *tnext; - struct memory_locality *loc, *lnext; - struct memory_initiator *initiator, *inext; - struct target_cache *tcache, *cnext; - - list_for_each_entry_safe(target, tnext, &targets, node) { - list_for_each_entry_safe(tcache, cnext, &target->caches, node) { - list_del(&tcache->node); - kfree(tcache); - } - list_del(&target->node); - kfree(target); - } - - list_for_each_entry_safe(initiator, inext, &initiators, node) { - list_del(&initiator->node); - kfree(initiator); - } - - list_for_each_entry_safe(loc, lnext, &localities, node) { - list_del(&loc->node); - kfree(loc); - } -} - -static __init int hmat_init(void) -{ - struct acpi_table_header *tbl; - enum 
acpi_hmat_type i; - acpi_status status; - - if (srat_disabled()) - return 0; - - status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl); - if (ACPI_FAILURE(status)) - return 0; - - if (acpi_table_parse_entries(ACPI_SIG_SRAT, - sizeof(struct acpi_table_srat), - ACPI_SRAT_TYPE_MEMORY_AFFINITY, - srat_parse_mem_affinity, 0) < 0) - goto out_put; - acpi_put_table(tbl); - - status = acpi_get_table(ACPI_SIG_HMAT, 0, &tbl); - if (ACPI_FAILURE(status)) - goto out_put; - - hmat_revision = tbl->revision; - switch (hmat_revision) { - case 1: - case 2: - break; - default: - pr_notice("Ignoring HMAT: Unknown revision:%d\n", hmat_revision); - goto out_put; - } - - for (i = ACPI_HMAT_TYPE_PROXIMITY; i < ACPI_HMAT_TYPE_RESERVED; i++) { - if (acpi_table_parse_entries(ACPI_SIG_HMAT, - sizeof(struct acpi_table_hmat), i, - hmat_parse_subtable, 0) < 0) { - pr_notice("Ignoring HMAT: Invalid table"); - goto out_put; - } - } - hmat_register_targets(); - - /* Keep the table and structures if the notifier may use them */ - if (!register_hotmemory_notifier(&hmat_callback_nb)) - return 0; -out_put: - hmat_free_structures(); - acpi_put_table(tbl); - return 0; -} -subsys_initcall(hmat_init); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c deleted file mode 100644 index eadbf90e65d1..000000000000 --- a/drivers/acpi/numa.c +++ /dev/null @@ -1,489 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * acpi_numa.c - ACPI NUMA support - * - * Copyright (C) 2002 Takayoshi Kochi - */ - -#define pr_fmt(fmt) "ACPI: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static nodemask_t nodes_found_map = NODE_MASK_NONE; - -/* maps to convert between proximity domain and logical node ID */ -static int pxm_to_node_map[MAX_PXM_DOMAINS] - = { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; -static int node_to_pxm_map[MAX_NUMNODES] - = { [0 ... 
MAX_NUMNODES - 1] = PXM_INVAL }; - -unsigned char acpi_srat_revision __initdata; -int acpi_numa __initdata; - -int pxm_to_node(int pxm) -{ - if (pxm < 0) - return NUMA_NO_NODE; - return pxm_to_node_map[pxm]; -} - -int node_to_pxm(int node) -{ - if (node < 0) - return PXM_INVAL; - return node_to_pxm_map[node]; -} - -static void __acpi_map_pxm_to_node(int pxm, int node) -{ - if (pxm_to_node_map[pxm] == NUMA_NO_NODE || node < pxm_to_node_map[pxm]) - pxm_to_node_map[pxm] = node; - if (node_to_pxm_map[node] == PXM_INVAL || pxm < node_to_pxm_map[node]) - node_to_pxm_map[node] = pxm; -} - -int acpi_map_pxm_to_node(int pxm) -{ - int node; - - if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) - return NUMA_NO_NODE; - - node = pxm_to_node_map[pxm]; - - if (node == NUMA_NO_NODE) { - if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) - return NUMA_NO_NODE; - node = first_unset_node(nodes_found_map); - __acpi_map_pxm_to_node(pxm, node); - node_set(node, nodes_found_map); - } - - return node; -} -EXPORT_SYMBOL(acpi_map_pxm_to_node); - -/** - * acpi_map_pxm_to_online_node - Map proximity ID to online node - * @pxm: ACPI proximity ID - * - * This is similar to acpi_map_pxm_to_node(), but always returns an online - * node. When the mapped node from a given proximity ID is offline, it - * looks up the node distance table and returns the nearest online node. - * - * ACPI device drivers, which are called after the NUMA initialization has - * completed in the kernel, can call this interface to obtain their device - * NUMA topology from ACPI tables. Such drivers do not have to deal with - * offline nodes. A node may be offline when a device proximity ID is - * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. - * "numa=off" on x86. 
- */ -int acpi_map_pxm_to_online_node(int pxm) -{ - int node, min_node; - - node = acpi_map_pxm_to_node(pxm); - - if (node == NUMA_NO_NODE) - node = 0; - - min_node = node; - if (!node_online(node)) { - int min_dist = INT_MAX, dist, n; - - for_each_online_node(n) { - dist = node_distance(node, n); - if (dist < min_dist) { - min_dist = dist; - min_node = n; - } - } - } - - return min_node; -} -EXPORT_SYMBOL(acpi_map_pxm_to_online_node); - -static void __init -acpi_table_print_srat_entry(struct acpi_subtable_header *header) -{ - switch (header->type) { - case ACPI_SRAT_TYPE_CPU_AFFINITY: - { - struct acpi_srat_cpu_affinity *p = - (struct acpi_srat_cpu_affinity *)header; - pr_debug("SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", - p->apic_id, p->local_sapic_eid, - p->proximity_domain_lo, - (p->flags & ACPI_SRAT_CPU_ENABLED) ? - "enabled" : "disabled"); - } - break; - - case ACPI_SRAT_TYPE_MEMORY_AFFINITY: - { - struct acpi_srat_mem_affinity *p = - (struct acpi_srat_mem_affinity *)header; - pr_debug("SRAT Memory (0x%llx length 0x%llx) in proximity domain %d %s%s%s\n", - (unsigned long long)p->base_address, - (unsigned long long)p->length, - p->proximity_domain, - (p->flags & ACPI_SRAT_MEM_ENABLED) ? - "enabled" : "disabled", - (p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - " hot-pluggable" : "", - (p->flags & ACPI_SRAT_MEM_NON_VOLATILE) ? - " non-volatile" : ""); - } - break; - - case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: - { - struct acpi_srat_x2apic_cpu_affinity *p = - (struct acpi_srat_x2apic_cpu_affinity *)header; - pr_debug("SRAT Processor (x2apicid[0x%08x]) in proximity domain %d %s\n", - p->apic_id, - p->proximity_domain, - (p->flags & ACPI_SRAT_CPU_ENABLED) ? 
- "enabled" : "disabled"); - } - break; - - case ACPI_SRAT_TYPE_GICC_AFFINITY: - { - struct acpi_srat_gicc_affinity *p = - (struct acpi_srat_gicc_affinity *)header; - pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n", - p->acpi_processor_uid, - p->proximity_domain, - (p->flags & ACPI_SRAT_GICC_ENABLED) ? - "enabled" : "disabled"); - } - break; - - default: - pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", - header->type); - break; - } -} - -/* - * A lot of BIOS fill in 10 (= no distance) everywhere. This messes - * up the NUMA heuristics which wants the local node to have a smaller - * distance than the others. - * Do some quick checks here and only use the SLIT if it passes. - */ -static int __init slit_valid(struct acpi_table_slit *slit) -{ - int i, j; - int d = slit->locality_count; - for (i = 0; i < d; i++) { - for (j = 0; j < d; j++) { - u8 val = slit->entry[d*i + j]; - if (i == j) { - if (val != LOCAL_DISTANCE) - return 0; - } else if (val <= LOCAL_DISTANCE) - return 0; - } - } - return 1; -} - -void __init bad_srat(void) -{ - pr_err("SRAT: SRAT not used.\n"); - acpi_numa = -1; -} - -int __init srat_disabled(void) -{ - return acpi_numa < 0; -} - -#if defined(CONFIG_X86) || defined(CONFIG_ARM64) -/* - * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for - * I/O localities since SRAT does not list them. I/O localities are - * not supported at this point. 
- */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - int i, j; - - for (i = 0; i < slit->locality_count; i++) { - const int from_node = pxm_to_node(i); - - if (from_node == NUMA_NO_NODE) - continue; - - for (j = 0; j < slit->locality_count; j++) { - const int to_node = pxm_to_node(j); - - if (to_node == NUMA_NO_NODE) - continue; - - numa_set_distance(from_node, to_node, - slit->entry[slit->locality_count * i + j]); - } - } -} - -/* - * Default callback for parsing of the Proximity Domain <-> Memory - * Area mappings - */ -int __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - u64 start, end; - u32 hotpluggable; - int node, pxm; - - if (srat_disabled()) - goto out_err; - if (ma->header.length < sizeof(struct acpi_srat_mem_affinity)) { - pr_err("SRAT: Unexpected header length: %d\n", - ma->header.length); - goto out_err_bad_srat; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - goto out_err; - hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; - if (hotpluggable && !IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) - goto out_err; - - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - if (acpi_srat_revision <= 1) - pxm &= 0xff; - - node = acpi_map_pxm_to_node(pxm); - if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { - pr_err("SRAT: Too many proximity domains.\n"); - goto out_err_bad_srat; - } - - if (numa_add_memblk(node, start, end) < 0) { - pr_err("SRAT: Failed to add memblk to node %u [mem %#010Lx-%#010Lx]\n", - node, (unsigned long long) start, - (unsigned long long) end - 1); - goto out_err_bad_srat; - } - - node_set(node, numa_nodes_parsed); - - pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", - node, pxm, - (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? " hotplug" : "", - ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); - - /* Mark hotplug range in memblock. 
*/ - if (hotpluggable && memblock_mark_hotplug(start, ma->length)) - pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", - (unsigned long long)start, (unsigned long long)end - 1); - - max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); - - return 0; -out_err_bad_srat: - bad_srat(); -out_err: - return -EINVAL; -} -#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ - -static int __init acpi_parse_slit(struct acpi_table_header *table) -{ - struct acpi_table_slit *slit = (struct acpi_table_slit *)table; - - if (!slit_valid(slit)) { - pr_info("SLIT table looks invalid. Not used.\n"); - return -EINVAL; - } - acpi_numa_slit_init(slit); - - return 0; -} - -void __init __weak -acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) -{ - pr_warn("Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); -} - -static int __init -acpi_parse_x2apic_affinity(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_srat_x2apic_cpu_affinity *processor_affinity; - - processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; - if (!processor_affinity) - return -EINVAL; - - acpi_table_print_srat_entry(&header->common); - - /* let architecture-dependent part to do it */ - acpi_numa_x2apic_affinity_init(processor_affinity); - - return 0; -} - -static int __init -acpi_parse_processor_affinity(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_srat_cpu_affinity *processor_affinity; - - processor_affinity = (struct acpi_srat_cpu_affinity *)header; - if (!processor_affinity) - return -EINVAL; - - acpi_table_print_srat_entry(&header->common); - - /* let architecture-dependent part to do it */ - acpi_numa_processor_affinity_init(processor_affinity); - - return 0; -} - -static int __init -acpi_parse_gicc_affinity(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_srat_gicc_affinity *processor_affinity; - - processor_affinity = (struct 
acpi_srat_gicc_affinity *)header; - if (!processor_affinity) - return -EINVAL; - - acpi_table_print_srat_entry(&header->common); - - /* let architecture-dependent part to do it */ - acpi_numa_gicc_affinity_init(processor_affinity); - - return 0; -} - -static int __initdata parsed_numa_memblks; - -static int __init -acpi_parse_memory_affinity(union acpi_subtable_headers * header, - const unsigned long end) -{ - struct acpi_srat_mem_affinity *memory_affinity; - - memory_affinity = (struct acpi_srat_mem_affinity *)header; - if (!memory_affinity) - return -EINVAL; - - acpi_table_print_srat_entry(&header->common); - - /* let architecture-dependent part to do it */ - if (!acpi_numa_memory_affinity_init(memory_affinity)) - parsed_numa_memblks++; - return 0; -} - -static int __init acpi_parse_srat(struct acpi_table_header *table) -{ - struct acpi_table_srat *srat = (struct acpi_table_srat *)table; - - acpi_srat_revision = srat->header.revision; - - /* Real work done in acpi_table_parse_srat below. */ - - return 0; -} - -static int __init -acpi_table_parse_srat(enum acpi_srat_type id, - acpi_tbl_entry_handler handler, unsigned int max_entries) -{ - return acpi_table_parse_entries(ACPI_SIG_SRAT, - sizeof(struct acpi_table_srat), id, - handler, max_entries); -} - -int __init acpi_numa_init(void) -{ - int cnt = 0; - - if (acpi_disabled) - return -EINVAL; - - /* - * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= - * SRAT cpu entries could have different order with that in MADT. - * So go over all cpu entries in SRAT to get apicid to node mapping. 
- */ - - /* SRAT: System Resource Affinity Table */ - if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { - struct acpi_subtable_proc srat_proc[3]; - - memset(srat_proc, 0, sizeof(srat_proc)); - srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; - srat_proc[0].handler = acpi_parse_processor_affinity; - srat_proc[1].id = ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY; - srat_proc[1].handler = acpi_parse_x2apic_affinity; - srat_proc[2].id = ACPI_SRAT_TYPE_GICC_AFFINITY; - srat_proc[2].handler = acpi_parse_gicc_affinity; - - acpi_table_parse_entries_array(ACPI_SIG_SRAT, - sizeof(struct acpi_table_srat), - srat_proc, ARRAY_SIZE(srat_proc), 0); - - cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, - acpi_parse_memory_affinity, 0); - } - - /* SLIT: System Locality Information Table */ - acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); - - if (cnt < 0) - return cnt; - else if (!parsed_numa_memblks) - return -ENOENT; - return 0; -} - -static int acpi_get_pxm(acpi_handle h) -{ - unsigned long long pxm; - acpi_status status; - acpi_handle handle; - acpi_handle phandle = h; - - do { - handle = phandle; - status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); - if (ACPI_SUCCESS(status)) - return pxm; - status = acpi_get_parent(handle, &phandle); - } while (ACPI_SUCCESS(status)); - return -1; -} - -int acpi_get_node(acpi_handle handle) -{ - int pxm; - - pxm = acpi_get_pxm(handle); - - return acpi_map_pxm_to_node(pxm); -} -EXPORT_SYMBOL(acpi_get_node); diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig new file mode 100644 index 000000000000..acbd5aa76e40 --- /dev/null +++ b/drivers/acpi/numa/Kconfig @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +config ACPI_NUMA + bool "NUMA support" + depends on NUMA + depends on (X86 || IA64 || ARM64) + default y if IA64 || ARM64 + +config ACPI_HMAT + bool "ACPI Heterogeneous Memory Attribute Table Support" + depends on ACPI_NUMA + select HMEM_REPORTING + help + If set, this option has the kernel parse and report 
the + platform's ACPI HMAT (Heterogeneous Memory Attributes Table), + register memory initiators with their targets, and export + performance attributes through the node's sysfs device if + provided. diff --git a/drivers/acpi/numa/Makefile b/drivers/acpi/numa/Makefile new file mode 100644 index 000000000000..517a6c689a94 --- /dev/null +++ b/drivers/acpi/numa/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ACPI_NUMA) += srat.o +obj-$(CONFIG_ACPI_HMAT) += hmat.o diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c new file mode 100644 index 000000000000..8b0de8a3c647 --- /dev/null +++ b/drivers/acpi/numa/hmat.c @@ -0,0 +1,751 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019, Intel Corporation. + * + * Heterogeneous Memory Attributes Table (HMAT) representation + * + * This program parses and reports the platform's HMAT tables, and registers + * the applicable attributes with the node's interfaces. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u8 hmat_revision; + +static LIST_HEAD(targets); +static LIST_HEAD(initiators); +static LIST_HEAD(localities); + +static DEFINE_MUTEX(target_lock); + +/* + * The defined enum order is used to prioritize attributes to break ties when + * selecting the best performing node. 
+ */ +enum locality_types { + WRITE_LATENCY, + READ_LATENCY, + WRITE_BANDWIDTH, + READ_BANDWIDTH, +}; + +static struct memory_locality *localities_types[4]; + +struct target_cache { + struct list_head node; + struct node_cache_attrs cache_attrs; +}; + +struct memory_target { + struct list_head node; + unsigned int memory_pxm; + unsigned int processor_pxm; + struct node_hmem_attrs hmem_attrs; + struct list_head caches; + struct node_cache_attrs cache_attrs; + bool registered; +}; + +struct memory_initiator { + struct list_head node; + unsigned int processor_pxm; +}; + +struct memory_locality { + struct list_head node; + struct acpi_hmat_locality *hmat_loc; +}; + +static struct memory_initiator *find_mem_initiator(unsigned int cpu_pxm) +{ + struct memory_initiator *initiator; + + list_for_each_entry(initiator, &initiators, node) + if (initiator->processor_pxm == cpu_pxm) + return initiator; + return NULL; +} + +static struct memory_target *find_mem_target(unsigned int mem_pxm) +{ + struct memory_target *target; + + list_for_each_entry(target, &targets, node) + if (target->memory_pxm == mem_pxm) + return target; + return NULL; +} + +static __init void alloc_memory_initiator(unsigned int cpu_pxm) +{ + struct memory_initiator *initiator; + + if (pxm_to_node(cpu_pxm) == NUMA_NO_NODE) + return; + + initiator = find_mem_initiator(cpu_pxm); + if (initiator) + return; + + initiator = kzalloc(sizeof(*initiator), GFP_KERNEL); + if (!initiator) + return; + + initiator->processor_pxm = cpu_pxm; + list_add_tail(&initiator->node, &initiators); +} + +static __init void alloc_memory_target(unsigned int mem_pxm) +{ + struct memory_target *target; + + target = find_mem_target(mem_pxm); + if (target) + return; + + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) + return; + + target->memory_pxm = mem_pxm; + target->processor_pxm = PXM_INVAL; + list_add_tail(&target->node, &targets); + INIT_LIST_HEAD(&target->caches); +} + +static __init const char *hmat_data_type(u8 type) 
+{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + return "Access Latency"; + case ACPI_HMAT_READ_LATENCY: + return "Read Latency"; + case ACPI_HMAT_WRITE_LATENCY: + return "Write Latency"; + case ACPI_HMAT_ACCESS_BANDWIDTH: + return "Access Bandwidth"; + case ACPI_HMAT_READ_BANDWIDTH: + return "Read Bandwidth"; + case ACPI_HMAT_WRITE_BANDWIDTH: + return "Write Bandwidth"; + default: + return "Reserved"; + } +} + +static __init const char *hmat_data_type_suffix(u8 type) +{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + return " nsec"; + case ACPI_HMAT_ACCESS_BANDWIDTH: + case ACPI_HMAT_READ_BANDWIDTH: + case ACPI_HMAT_WRITE_BANDWIDTH: + return " MB/s"; + default: + return ""; + } +} + +static u32 hmat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + else if (base > (UINT_MAX / (entry))) + return 0; + + /* + * Divide by the base unit for version 1, convert latency from + * picoseconds to nanoseconds if revision 2.
+ */ + value = entry * base; + if (hmat_revision == 1) { + if (value < 10) + return 0; + value = DIV_ROUND_UP(value, 10); + } else if (hmat_revision == 2) { + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + } + return value; +} + +static void hmat_update_target_access(struct memory_target *target, + u8 type, u32 value) +{ + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + target->hmem_attrs.read_latency = value; + target->hmem_attrs.write_latency = value; + break; + case ACPI_HMAT_READ_LATENCY: + target->hmem_attrs.read_latency = value; + break; + case ACPI_HMAT_WRITE_LATENCY: + target->hmem_attrs.write_latency = value; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + target->hmem_attrs.read_bandwidth = value; + target->hmem_attrs.write_bandwidth = value; + break; + case ACPI_HMAT_READ_BANDWIDTH: + target->hmem_attrs.read_bandwidth = value; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + target->hmem_attrs.write_bandwidth = value; + break; + default: + break; + } +} + +static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) +{ + struct memory_locality *loc; + + loc = kzalloc(sizeof(*loc), GFP_KERNEL); + if (!loc) { + pr_notice_once("Failed to allocate HMAT locality\n"); + return; + } + + loc->hmat_loc = hmat_loc; + list_add_tail(&loc->node, &localities); + + switch (hmat_loc->data_type) { + case ACPI_HMAT_ACCESS_LATENCY: + localities_types[READ_LATENCY] = loc; + localities_types[WRITE_LATENCY] = loc; + break; + case ACPI_HMAT_READ_LATENCY: + localities_types[READ_LATENCY] = loc; + break; + case ACPI_HMAT_WRITE_LATENCY: + localities_types[WRITE_LATENCY] = loc; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + localities_types[READ_BANDWIDTH] = loc; + localities_types[WRITE_BANDWIDTH] = loc; + break; + case ACPI_HMAT_READ_BANDWIDTH: + localities_types[READ_BANDWIDTH] = loc; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + 
localities_types[WRITE_BANDWIDTH] = loc; + break; + default: + break; + } +} + +static __init int hmat_parse_locality(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_locality *hmat_loc = (void *)header; + struct memory_target *target; + unsigned int init, targ, total_size, ipds, tpds; + u32 *inits, *targs, value; + u16 *entries; + u8 type, mem_hier; + + if (hmat_loc->header.length < sizeof(*hmat_loc)) { + pr_notice("HMAT: Unexpected locality header length: %d\n", + hmat_loc->header.length); + return -EINVAL; + } + + type = hmat_loc->data_type; + mem_hier = hmat_loc->flags & ACPI_HMAT_MEMORY_HIERARCHY; + ipds = hmat_loc->number_of_initiator_Pds; + tpds = hmat_loc->number_of_target_Pds; + total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + + sizeof(*inits) * ipds + sizeof(*targs) * tpds; + if (hmat_loc->header.length < total_size) { + pr_notice("HMAT: Unexpected locality header length:%d, minimum required:%d\n", + hmat_loc->header.length, total_size); + return -EINVAL; + } + + pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%d Target Domains:%d Base:%lld\n", + hmat_loc->flags, hmat_data_type(type), ipds, tpds, + hmat_loc->entry_base_unit); + + inits = (u32 *)(hmat_loc + 1); + targs = inits + ipds; + entries = (u16 *)(targs + tpds); + for (init = 0; init < ipds; init++) { + alloc_memory_initiator(inits[init]); + for (targ = 0; targ < tpds; targ++) { + value = hmat_normalize(entries[init * tpds + targ], + hmat_loc->entry_base_unit, + type); + pr_info(" Initiator-Target[%d-%d]:%d%s\n", + inits[init], targs[targ], value, + hmat_data_type_suffix(type)); + + if (mem_hier == ACPI_HMAT_MEMORY) { + target = find_mem_target(targs[targ]); + if (target && target->processor_pxm == inits[init]) + hmat_update_target_access(target, type, value); + } + } + } + + if (mem_hier == ACPI_HMAT_MEMORY) + hmat_add_locality(hmat_loc); + + return 0; +} + +static __init int hmat_parse_cache(union acpi_subtable_headers *header, + 
const unsigned long end) +{ + struct acpi_hmat_cache *cache = (void *)header; + struct memory_target *target; + struct target_cache *tcache; + u32 attrs; + + if (cache->header.length < sizeof(*cache)) { + pr_notice("HMAT: Unexpected cache header length: %d\n", + cache->header.length); + return -EINVAL; + } + + attrs = cache->cache_attributes; + pr_info("HMAT: Cache: Domain:%d Size:%llu Attrs:%08x SMBIOS Handles:%d\n", + cache->memory_PD, cache->cache_size, attrs, + cache->number_of_SMBIOShandles); + + target = find_mem_target(cache->memory_PD); + if (!target) + return 0; + + tcache = kzalloc(sizeof(*tcache), GFP_KERNEL); + if (!tcache) { + pr_notice_once("Failed to allocate HMAT cache info\n"); + return 0; + } + + tcache->cache_attrs.size = cache->cache_size; + tcache->cache_attrs.level = (attrs & ACPI_HMAT_CACHE_LEVEL) >> 4; + tcache->cache_attrs.line_size = (attrs & ACPI_HMAT_CACHE_LINE_SIZE) >> 16; + + switch ((attrs & ACPI_HMAT_CACHE_ASSOCIATIVITY) >> 8) { + case ACPI_HMAT_CA_DIRECT_MAPPED: + tcache->cache_attrs.indexing = NODE_CACHE_DIRECT_MAP; + break; + case ACPI_HMAT_CA_COMPLEX_CACHE_INDEXING: + tcache->cache_attrs.indexing = NODE_CACHE_INDEXED; + break; + case ACPI_HMAT_CA_NONE: + default: + tcache->cache_attrs.indexing = NODE_CACHE_OTHER; + break; + } + + switch ((attrs & ACPI_HMAT_WRITE_POLICY) >> 12) { + case ACPI_HMAT_CP_WB: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_BACK; + break; + case ACPI_HMAT_CP_WT: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_THROUGH; + break; + case ACPI_HMAT_CP_NONE: + default: + tcache->cache_attrs.write_policy = NODE_CACHE_WRITE_OTHER; + break; + } + list_add_tail(&tcache->node, &target->caches); + + return 0; +} + +static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_proximity_domain *p = (void *)header; + struct memory_target *target = NULL; + + if (p->header.length != sizeof(*p)) { + pr_notice("HMAT: Unexpected address range 
header length: %d\n", + p->header.length); + return -EINVAL; + } + + if (hmat_revision == 1) + pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%d\n", + p->reserved3, p->reserved4, p->flags, p->processor_PD, + p->memory_PD); + else + pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%d\n", + p->flags, p->processor_PD, p->memory_PD); + + if (p->flags & ACPI_HMAT_MEMORY_PD_VALID && hmat_revision == 1) { + target = find_mem_target(p->memory_PD); + if (!target) { + pr_debug("HMAT: Memory Domain missing from SRAT\n"); + return -EINVAL; + } + } + if (target && p->flags & ACPI_HMAT_PROCESSOR_PD_VALID) { + int p_node = pxm_to_node(p->processor_PD); + + if (p_node == NUMA_NO_NODE) { + pr_debug("HMAT: Invalid Processor Domain\n"); + return -EINVAL; + } + target->processor_pxm = p_node; + } + + return 0; +} + +static int __init hmat_parse_subtable(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_hmat_structure *hdr = (void *)header; + + if (!hdr) + return -EINVAL; + + switch (hdr->type) { + case ACPI_HMAT_TYPE_PROXIMITY: + return hmat_parse_proximity_domain(header, end); + case ACPI_HMAT_TYPE_LOCALITY: + return hmat_parse_locality(header, end); + case ACPI_HMAT_TYPE_CACHE: + return hmat_parse_cache(header, end); + default: + return -EINVAL; + } +} + +static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_mem_affinity *ma = (void *)header; + + if (!ma) + return -EINVAL; + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return 0; + alloc_memory_target(ma->proximity_domain); + return 0; +} + +static u32 hmat_initiator_perf(struct memory_target *target, + struct memory_initiator *initiator, + struct acpi_hmat_locality *hmat_loc) +{ + unsigned int ipds, tpds, i, idx = 0, tdx = 0; + u32 *inits, *targs; + u16 *entries; + + ipds = hmat_loc->number_of_initiator_Pds; + tpds = hmat_loc->number_of_target_Pds; + inits = (u32 
*)(hmat_loc + 1); + targs = inits + ipds; + entries = (u16 *)(targs + tpds); + + for (i = 0; i < ipds; i++) { + if (inits[i] == initiator->processor_pxm) { + idx = i; + break; + } + } + + if (i == ipds) + return 0; + + for (i = 0; i < tpds; i++) { + if (targs[i] == target->memory_pxm) { + tdx = i; + break; + } + } + if (i == tpds) + return 0; + + return hmat_normalize(entries[idx * tpds + tdx], + hmat_loc->entry_base_unit, + hmat_loc->data_type); +} + +static bool hmat_update_best(u8 type, u32 value, u32 *best) +{ + bool updated = false; + + if (!value) + return false; + + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + if (!*best || *best > value) { + *best = value; + updated = true; + } + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + case ACPI_HMAT_READ_BANDWIDTH: + case ACPI_HMAT_WRITE_BANDWIDTH: + if (!*best || *best < value) { + *best = value; + updated = true; + } + break; + } + + return updated; +} + +static int initiator_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct memory_initiator *ia; + struct memory_initiator *ib; + unsigned long *p_nodes = priv; + + ia = list_entry(a, struct memory_initiator, node); + ib = list_entry(b, struct memory_initiator, node); + + set_bit(ia->processor_pxm, p_nodes); + set_bit(ib->processor_pxm, p_nodes); + + return ia->processor_pxm - ib->processor_pxm; +} + +static void hmat_register_target_initiators(struct memory_target *target) +{ + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); + struct memory_initiator *initiator; + unsigned int mem_nid, cpu_nid; + struct memory_locality *loc = NULL; + u32 best = 0; + int i; + + mem_nid = pxm_to_node(target->memory_pxm); + /* + * If the Address Range Structure provides a local processor pxm, link + * only that one. Otherwise, find the best performance attributes and + * register all initiators that match. 
+ */ + if (target->processor_pxm != PXM_INVAL) { + cpu_nid = pxm_to_node(target->processor_pxm); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + return; + } + + if (list_empty(&localities)) + return; + + /* + * We need the initiator list sorted so we can use bitmap_clear for + * previously set initiators when we find a better memory accessor. + * We'll also use the sorting to prime the candidate nodes with known + * initiators. + */ + bitmap_zero(p_nodes, MAX_NUMNODES); + list_sort(p_nodes, &initiators, initiator_cmp); + for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { + loc = localities_types[i]; + if (!loc) + continue; + + best = 0; + list_for_each_entry(initiator, &initiators, node) { + u32 value; + + if (!test_bit(initiator->processor_pxm, p_nodes)) + continue; + + value = hmat_initiator_perf(target, initiator, loc->hmat_loc); + if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) + bitmap_clear(p_nodes, 0, initiator->processor_pxm); + if (value != best) + clear_bit(initiator->processor_pxm, p_nodes); + } + if (best) + hmat_update_target_access(target, loc->hmat_loc->data_type, best); + } + + for_each_set_bit(i, p_nodes, MAX_NUMNODES) { + cpu_nid = pxm_to_node(i); + register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); + } +} + +static void hmat_register_target_cache(struct memory_target *target) +{ + unsigned mem_nid = pxm_to_node(target->memory_pxm); + struct target_cache *tcache; + + list_for_each_entry(tcache, &target->caches, node) + node_add_cache(mem_nid, &tcache->cache_attrs); +} + +static void hmat_register_target_perf(struct memory_target *target) +{ + unsigned mem_nid = pxm_to_node(target->memory_pxm); + node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); +} + +static void hmat_register_target(struct memory_target *target) +{ + int nid = pxm_to_node(target->memory_pxm); + + /* + * Skip offline nodes. 
This can happen when memory + * marked EFI_MEMORY_SP, "specific purpose", is applied + * to all the memory in a proximity domain leading to + * the node being marked offline / unplugged, or if + * memory-only "hotplug" node is offline. + */ + if (nid == NUMA_NO_NODE || !node_online(nid)) + return; + + mutex_lock(&target_lock); + if (!target->registered) { + hmat_register_target_initiators(target); + hmat_register_target_cache(target); + hmat_register_target_perf(target); + target->registered = true; + } + mutex_unlock(&target_lock); +} + +static void hmat_register_targets(void) +{ + struct memory_target *target; + + list_for_each_entry(target, &targets, node) + hmat_register_target(target); +} + +static int hmat_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_target *target; + struct memory_notify *mnb = arg; + int pxm, nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE || action != MEM_ONLINE) + return NOTIFY_OK; + + pxm = node_to_pxm(nid); + target = find_mem_target(pxm); + if (!target) + return NOTIFY_OK; + + hmat_register_target(target); + return NOTIFY_OK; +} + +static struct notifier_block hmat_callback_nb = { + .notifier_call = hmat_callback, + .priority = 2, +}; + +static __init void hmat_free_structures(void) +{ + struct memory_target *target, *tnext; + struct memory_locality *loc, *lnext; + struct memory_initiator *initiator, *inext; + struct target_cache *tcache, *cnext; + + list_for_each_entry_safe(target, tnext, &targets, node) { + list_for_each_entry_safe(tcache, cnext, &target->caches, node) { + list_del(&tcache->node); + kfree(tcache); + } + list_del(&target->node); + kfree(target); + } + + list_for_each_entry_safe(initiator, inext, &initiators, node) { + list_del(&initiator->node); + kfree(initiator); + } + + list_for_each_entry_safe(loc, lnext, &localities, node) { + list_del(&loc->node); + kfree(loc); + } +} + +static __init int hmat_init(void) +{ + struct acpi_table_header *tbl; + enum
acpi_hmat_type i; + acpi_status status; + + if (srat_disabled()) + return 0; + + status = acpi_get_table(ACPI_SIG_SRAT, 0, &tbl); + if (ACPI_FAILURE(status)) + return 0; + + if (acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + ACPI_SRAT_TYPE_MEMORY_AFFINITY, + srat_parse_mem_affinity, 0) < 0) + goto out_put; + acpi_put_table(tbl); + + status = acpi_get_table(ACPI_SIG_HMAT, 0, &tbl); + if (ACPI_FAILURE(status)) + goto out_put; + + hmat_revision = tbl->revision; + switch (hmat_revision) { + case 1: + case 2: + break; + default: + pr_notice("Ignoring HMAT: Unknown revision:%d\n", hmat_revision); + goto out_put; + } + + for (i = ACPI_HMAT_TYPE_PROXIMITY; i < ACPI_HMAT_TYPE_RESERVED; i++) { + if (acpi_table_parse_entries(ACPI_SIG_HMAT, + sizeof(struct acpi_table_hmat), i, + hmat_parse_subtable, 0) < 0) { + pr_notice("Ignoring HMAT: Invalid table"); + goto out_put; + } + } + hmat_register_targets(); + + /* Keep the table and structures if the notifier may use them */ + if (!register_hotmemory_notifier(&hmat_callback_nb)) + return 0; +out_put: + hmat_free_structures(); + acpi_put_table(tbl); + return 0; +} +subsys_initcall(hmat_init); diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c new file mode 100644 index 000000000000..eadbf90e65d1 --- /dev/null +++ b/drivers/acpi/numa/srat.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * acpi_numa.c - ACPI NUMA support + * + * Copyright (C) 2002 Takayoshi Kochi + */ + +#define pr_fmt(fmt) "ACPI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static nodemask_t nodes_found_map = NODE_MASK_NONE; + +/* maps to convert between proximity domain and logical node ID */ +static int pxm_to_node_map[MAX_PXM_DOMAINS] + = { [0 ... MAX_PXM_DOMAINS - 1] = NUMA_NO_NODE }; +static int node_to_pxm_map[MAX_NUMNODES] + = { [0 ... 
MAX_NUMNODES - 1] = PXM_INVAL }; + +unsigned char acpi_srat_revision __initdata; +int acpi_numa __initdata; + +int pxm_to_node(int pxm) +{ + if (pxm < 0) + return NUMA_NO_NODE; + return pxm_to_node_map[pxm]; +} + +int node_to_pxm(int node) +{ + if (node < 0) + return PXM_INVAL; + return node_to_pxm_map[node]; +} + +static void __acpi_map_pxm_to_node(int pxm, int node) +{ + if (pxm_to_node_map[pxm] == NUMA_NO_NODE || node < pxm_to_node_map[pxm]) + pxm_to_node_map[pxm] = node; + if (node_to_pxm_map[node] == PXM_INVAL || pxm < node_to_pxm_map[node]) + node_to_pxm_map[node] = pxm; +} + +int acpi_map_pxm_to_node(int pxm) +{ + int node; + + if (pxm < 0 || pxm >= MAX_PXM_DOMAINS || numa_off) + return NUMA_NO_NODE; + + node = pxm_to_node_map[pxm]; + + if (node == NUMA_NO_NODE) { + if (nodes_weight(nodes_found_map) >= MAX_NUMNODES) + return NUMA_NO_NODE; + node = first_unset_node(nodes_found_map); + __acpi_map_pxm_to_node(pxm, node); + node_set(node, nodes_found_map); + } + + return node; +} +EXPORT_SYMBOL(acpi_map_pxm_to_node); + +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. 
+ */ +int acpi_map_pxm_to_online_node(int pxm) +{ + int node, min_node; + + node = acpi_map_pxm_to_node(pxm); + + if (node == NUMA_NO_NODE) + node = 0; + + min_node = node; + if (!node_online(node)) { + int min_dist = INT_MAX, dist, n; + + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + } + + return min_node; +} +EXPORT_SYMBOL(acpi_map_pxm_to_online_node); + +static void __init +acpi_table_print_srat_entry(struct acpi_subtable_header *header) +{ + switch (header->type) { + case ACPI_SRAT_TYPE_CPU_AFFINITY: + { + struct acpi_srat_cpu_affinity *p = + (struct acpi_srat_cpu_affinity *)header; + pr_debug("SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n", + p->apic_id, p->local_sapic_eid, + p->proximity_domain_lo, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? + "enabled" : "disabled"); + } + break; + + case ACPI_SRAT_TYPE_MEMORY_AFFINITY: + { + struct acpi_srat_mem_affinity *p = + (struct acpi_srat_mem_affinity *)header; + pr_debug("SRAT Memory (0x%llx length 0x%llx) in proximity domain %d %s%s%s\n", + (unsigned long long)p->base_address, + (unsigned long long)p->length, + p->proximity_domain, + (p->flags & ACPI_SRAT_MEM_ENABLED) ? + "enabled" : "disabled", + (p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? + " hot-pluggable" : "", + (p->flags & ACPI_SRAT_MEM_NON_VOLATILE) ? + " non-volatile" : ""); + } + break; + + case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: + { + struct acpi_srat_x2apic_cpu_affinity *p = + (struct acpi_srat_x2apic_cpu_affinity *)header; + pr_debug("SRAT Processor (x2apicid[0x%08x]) in proximity domain %d %s\n", + p->apic_id, + p->proximity_domain, + (p->flags & ACPI_SRAT_CPU_ENABLED) ? 
+ "enabled" : "disabled"); + } + break; + + case ACPI_SRAT_TYPE_GICC_AFFINITY: + { + struct acpi_srat_gicc_affinity *p = + (struct acpi_srat_gicc_affinity *)header; + pr_debug("SRAT Processor (acpi id[0x%04x]) in proximity domain %d %s\n", + p->acpi_processor_uid, + p->proximity_domain, + (p->flags & ACPI_SRAT_GICC_ENABLED) ? + "enabled" : "disabled"); + } + break; + + default: + pr_warn("Found unsupported SRAT entry (type = 0x%x)\n", + header->type); + break; + } +} + +/* + * A lot of BIOS fill in 10 (= no distance) everywhere. This messes + * up the NUMA heuristics which wants the local node to have a smaller + * distance than the others. + * Do some quick checks here and only use the SLIT if it passes. + */ +static int __init slit_valid(struct acpi_table_slit *slit) +{ + int i, j; + int d = slit->locality_count; + for (i = 0; i < d; i++) { + for (j = 0; j < d; j++) { + u8 val = slit->entry[d*i + j]; + if (i == j) { + if (val != LOCAL_DISTANCE) + return 0; + } else if (val <= LOCAL_DISTANCE) + return 0; + } + } + return 1; +} + +void __init bad_srat(void) +{ + pr_err("SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +int __init srat_disabled(void) +{ + return acpi_numa < 0; +} + +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) +/* + * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them. I/O localities are + * not supported at this point. 
+ */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) { + const int from_node = pxm_to_node(i); + + if (from_node == NUMA_NO_NODE) + continue; + + for (j = 0; j < slit->locality_count; j++) { + const int to_node = pxm_to_node(j); + + if (to_node == NUMA_NO_NODE) + continue; + + numa_set_distance(from_node, to_node, + slit->entry[slit->locality_count * i + j]); + } + } +} + +/* + * Default callback for parsing of the Proximity Domain <-> Memory + * Area mappings + */ +int __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + u64 start, end; + u32 hotpluggable; + int node, pxm; + + if (srat_disabled()) + goto out_err; + if (ma->header.length < sizeof(struct acpi_srat_mem_affinity)) { + pr_err("SRAT: Unexpected header length: %d\n", + ma->header.length); + goto out_err_bad_srat; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + goto out_err; + hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; + if (hotpluggable && !IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + goto out_err; + + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; + + node = acpi_map_pxm_to_node(pxm); + if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) { + pr_err("SRAT: Too many proximity domains.\n"); + goto out_err_bad_srat; + } + + if (numa_add_memblk(node, start, end) < 0) { + pr_err("SRAT: Failed to add memblk to node %u [mem %#010Lx-%#010Lx]\n", + node, (unsigned long long) start, + (unsigned long long) end - 1); + goto out_err_bad_srat; + } + + node_set(node, numa_nodes_parsed); + + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1, + hotpluggable ? " hotplug" : "", + ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); + + /* Mark hotplug range in memblock. 
*/ + if (hotpluggable && memblock_mark_hotplug(start, ma->length)) + pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", + (unsigned long long)start, (unsigned long long)end - 1); + + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + + return 0; +out_err_bad_srat: + bad_srat(); +out_err: + return -EINVAL; +} +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ + +static int __init acpi_parse_slit(struct acpi_table_header *table) +{ + struct acpi_table_slit *slit = (struct acpi_table_slit *)table; + + if (!slit_valid(slit)) { + pr_info("SLIT table looks invalid. Not used.\n"); + return -EINVAL; + } + acpi_numa_slit_init(slit); + + return 0; +} + +void __init __weak +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + pr_warn("Found unsupported x2apic [0x%08x] SRAT entry\n", pa->apic_id); +} + +static int __init +acpi_parse_x2apic_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_x2apic_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_x2apic_cpu_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_x2apic_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_processor_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_cpu_affinity *processor_affinity; + + processor_affinity = (struct acpi_srat_cpu_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_processor_affinity_init(processor_affinity); + + return 0; +} + +static int __init +acpi_parse_gicc_affinity(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_srat_gicc_affinity *processor_affinity; + + processor_affinity = (struct 
acpi_srat_gicc_affinity *)header; + if (!processor_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + acpi_numa_gicc_affinity_init(processor_affinity); + + return 0; +} + +static int __initdata parsed_numa_memblks; + +static int __init +acpi_parse_memory_affinity(union acpi_subtable_headers * header, + const unsigned long end) +{ + struct acpi_srat_mem_affinity *memory_affinity; + + memory_affinity = (struct acpi_srat_mem_affinity *)header; + if (!memory_affinity) + return -EINVAL; + + acpi_table_print_srat_entry(&header->common); + + /* let architecture-dependent part to do it */ + if (!acpi_numa_memory_affinity_init(memory_affinity)) + parsed_numa_memblks++; + return 0; +} + +static int __init acpi_parse_srat(struct acpi_table_header *table) +{ + struct acpi_table_srat *srat = (struct acpi_table_srat *)table; + + acpi_srat_revision = srat->header.revision; + + /* Real work done in acpi_table_parse_srat below. */ + + return 0; +} + +static int __init +acpi_table_parse_srat(enum acpi_srat_type id, + acpi_tbl_entry_handler handler, unsigned int max_entries) +{ + return acpi_table_parse_entries(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), id, + handler, max_entries); +} + +int __init acpi_numa_init(void) +{ + int cnt = 0; + + if (acpi_disabled) + return -EINVAL; + + /* + * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= + * SRAT cpu entries could have different order with that in MADT. + * So go over all cpu entries in SRAT to get apicid to node mapping. 
+ */ + + /* SRAT: System Resource Affinity Table */ + if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { + struct acpi_subtable_proc srat_proc[3]; + + memset(srat_proc, 0, sizeof(srat_proc)); + srat_proc[0].id = ACPI_SRAT_TYPE_CPU_AFFINITY; + srat_proc[0].handler = acpi_parse_processor_affinity; + srat_proc[1].id = ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY; + srat_proc[1].handler = acpi_parse_x2apic_affinity; + srat_proc[2].id = ACPI_SRAT_TYPE_GICC_AFFINITY; + srat_proc[2].handler = acpi_parse_gicc_affinity; + + acpi_table_parse_entries_array(ACPI_SIG_SRAT, + sizeof(struct acpi_table_srat), + srat_proc, ARRAY_SIZE(srat_proc), 0); + + cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, + acpi_parse_memory_affinity, 0); + } + + /* SLIT: System Locality Information Table */ + acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); + + if (cnt < 0) + return cnt; + else if (!parsed_numa_memblks) + return -ENOENT; + return 0; +} + +static int acpi_get_pxm(acpi_handle h) +{ + unsigned long long pxm; + acpi_status status; + acpi_handle handle; + acpi_handle phandle = h; + + do { + handle = phandle; + status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); + if (ACPI_SUCCESS(status)) + return pxm; + status = acpi_get_parent(handle, &phandle); + } while (ACPI_SUCCESS(status)); + return -1; +} + +int acpi_get_node(acpi_handle handle) +{ + int pxm; + + pxm = acpi_get_pxm(handle); + + return acpi_map_pxm_to_node(pxm); +} +EXPORT_SYMBOL(acpi_get_node); -- cgit v1.2.3 From fe3e5e65c06edb1c56e64e567f053e243142001f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:00 -0800 Subject: efi: Enumerate EFI_MEMORY_SP UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The intent of this bit is to allow the OS to identify precious or scarce memory resources and optionally manage it separately from EfiConventionalMemory. 
As defined, older OSes that do not know about this attribute are permitted to ignore it and the memory will be handled according to the OS default policy for the given memory type. In other words, this "specific purpose" hint is deliberately weaker than EfiReservedMemoryType in that the system continues to operate if the OS takes no action on the attribute. The risk of taking no action is potentially unwanted / unmovable kernel allocations from the designated resource that prevent the full realization of the "specific purpose". For example, consider a system with a high-bandwidth memory pool. Older kernels are permitted to boot and consume that memory as conventional "System-RAM"; newer kernels may arrange for that memory to be set aside (soft reserved) by the system administrator for a dedicated high-bandwidth memory aware application to consume. Specifically, this mechanism allows for the elimination of scenarios where platform firmware tries to game OS policy by lying about ACPI SLIT values, i.e. claiming that a precious memory resource has a high distance to trigger the OS to avoid it by default. This reservation hint allows platform-firmware to instead tell the truth about performance characteristics by indicating to OS memory management to put immovable allocations elsewhere. Implement simple detection of the bit for EFI memory table dumps and save the kernel policy for a follow-on change. Reviewed-by: Ard Biesheuvel Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J.
Wysocki --- drivers/firmware/efi/efi.c | 5 +++-- include/linux/efi.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index e98bbf8e56d9..f8f8e273d809 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -842,15 +842,16 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_RO | EFI_MEMORY_WP | EFI_MEMORY_RP | EFI_MEMORY_XP | - EFI_MEMORY_NV | + EFI_MEMORY_NV | EFI_MEMORY_SP | EFI_MEMORY_RUNTIME | EFI_MEMORY_MORE_RELIABLE)) snprintf(pos, size, "|attr=0x%016llx]", (unsigned long long)attr); else snprintf(pos, size, - "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + "|%3s|%2s|%2s|%2s|%2s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", attr & EFI_MEMORY_RUNTIME ? "RUN" : "", attr & EFI_MEMORY_MORE_RELIABLE ? "MR" : "", + attr & EFI_MEMORY_SP ? "SP" : "", attr & EFI_MEMORY_NV ? "NV" : "", attr & EFI_MEMORY_XP ? "XP" : "", attr & EFI_MEMORY_RP ? 
"RP" : "", diff --git a/include/linux/efi.h b/include/linux/efi.h index d87acf62958e..78c75992b313 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -112,6 +112,7 @@ typedef struct { #define EFI_MEMORY_MORE_RELIABLE \ ((u64)0x0000000000010000ULL) /* higher reliability */ #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ +#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 -- cgit v1.2.3 From 6950e31b35fdf4588cbbdec1813091bb02cf8871 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:05 -0800 Subject: x86/efi: Push EFI_MEMMAP check into leaf routines In preparation for adding another EFI_MEMMAP dependent call that needs to occur before e820__memblock_setup() fixup the existing efi calls to check for EFI_MEMMAP internally. This ends up being cleaner than the alternative of checking EFI_MEMMAP multiple times in setup_arch(). Reviewed-by: Dave Hansen Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- arch/x86/include/asm/efi.h | 9 ++++++++- arch/x86/kernel/setup.c | 18 ++++++++---------- arch/x86/platform/efi/efi.c | 3 +++ arch/x86/platform/efi/quirks.c | 3 +++ drivers/firmware/efi/esrt.c | 3 +++ drivers/firmware/efi/fake_mem.c | 2 +- include/linux/efi.h | 1 - 7 files changed, 26 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 43a82e59c59d..45f853bce869 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -140,7 +140,6 @@ extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -extern void efi_reserve_boot_services(void); struct efi_setup_data { u64 fw_vendor; @@ -244,6 +243,8 @@ static inline bool efi_is_64bit(void) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); +extern void efi_find_mirror(void); +extern void efi_reserve_boot_services(void); #else static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) @@ -254,6 +255,12 @@ static inline bool efi_is_table_address(unsigned long phys_addr) { return false; } +static inline void efi_find_mirror(void) +{ +} +static inline void efi_reserve_boot_services(void) +{ +} #endif /* CONFIG_EFI */ #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 77ea96b794bd..1c4b866bc184 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1122,17 +1122,15 @@ void __init setup_arch(char **cmdline_p) reserve_bios_regions(); - if (efi_enabled(EFI_MEMMAP)) { - efi_fake_memmap(); - efi_find_mirror(); - efi_esrt_init(); + efi_fake_memmap(); + efi_find_mirror(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be - * called after ExitBootServices(). This is, in fact, a lie. 
- */ - efi_reserve_boot_services(); - } + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ + efi_reserve_boot_services(); /* preallocate 4k for mptable mpc */ e820__memblock_alloc_reserved_mpc_new(); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 425e025341db..e6e41b118d68 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -128,6 +128,9 @@ void __init efi_find_mirror(void) efi_memory_desc_t *md; u64 mirror_size = 0, total_size = 0; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 3b9fd679cea9..7675cf754d90 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -320,6 +320,9 @@ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; + if (!efi_enabled(EFI_MEMMAP)) + return; + for_each_efi_memory_desc(md) { u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index d6dd5f503fa2..2762e0662bf4 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -246,6 +246,9 @@ void __init efi_esrt_init(void) int rc; phys_addr_t end; + if (!efi_enabled(EFI_MEMMAP)) + return; + pr_debug("esrt-init: loading.\n"); if (!esrt_table_exists()) return; diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 9501edc0fcfb..526b45331d96 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -44,7 +44,7 @@ void __init efi_fake_memmap(void) void *new_memmap; int i; - if (!nr_fake_mem) + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) return; /* count up the number of EFI memory descriptor */ diff --git a/include/linux/efi.h 
b/include/linux/efi.h index 78c75992b313..44c85b559e15 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1045,7 +1045,6 @@ extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if pos extern efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, bool nonblocking); -extern void efi_find_mirror(void); #else static inline efi_status_t efi_query_variable_store(u32 attributes, -- cgit v1.2.3 From b617c5266eedbef2ccbb90931bb9175faa4ae0bc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:11 -0800 Subject: efi: Common enable/disable infrastructure for EFI soft reservation UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The proposed Linux behavior for specific purpose memory is that it is reserved for direct-access (device-dax) by default and not available for any kernel usage, not even as an OOM fallback. Later, through udev scripts or another init mechanism, these device-dax claimed ranges can be reconfigured and hot-added to the available System-RAM with a unique node identifier. This device-dax management scheme implements "soft" in the "soft reserved" designation by allowing some or all of the reservation to be recovered as typical memory. This policy can be disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with efi=nosoftreserve. As for this patch, define the common helpers to determine if the EFI_MEMORY_SP attribute should be honored. The determination needs to be made early to prevent the kernel from being loaded into soft-reserved memory, or otherwise allowing early allocations to land there. Follow-on changes are needed per architecture to leverage these helpers in their respective mem-init paths. Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 9 ++++++++- drivers/firmware/efi/Kconfig | 21 +++++++++++++++++++++ drivers/firmware/efi/efi.c | 8 ++++++++ drivers/firmware/efi/libstub/efi-stub-helper.c | 19 +++++++++++++++++++ include/linux/efi.h | 14 ++++++++++++++ 5 files changed, 70 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a84a83f8881e..2359dc56d82c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1168,7 +1168,8 @@ Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map", "nochunk", "noruntime", "debug" } + Format: { "old_map", "nochunk", "noruntime", "debug", + "nosoftreserve" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. @@ -1177,6 +1178,12 @@ firmware implementations. noruntime : disable EFI runtime services support debug: enable misc debug output + nosoftreserve: The EFI_MEMORY_SP (Specific Purpose) + attribute may cause the kernel to reserve the + memory range for a memory mapping driver to + claim. Specify efi=nosoftreserve to disable this + reservation and treat the memory by its base type + (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM"). efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index b248870a9806..bcc378c19ebe 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -75,6 +75,27 @@ config EFI_MAX_FAKE_MEM Ranges can be set up to this value using comma-separated list. The default value is 8. 
+config EFI_SOFT_RESERVE + bool "Reserve EFI Specific Purpose Memory" + depends on EFI && EFI_STUB && ACPI_HMAT + default ACPI_HMAT + help + On systems that have mixed performance classes of memory EFI + may indicate specific purpose memory with an attribute (See + EFI_MEMORY_SP in UEFI 2.8). A memory range tagged with this + attribute may have unique performance characteristics compared + to the system's general purpose "System RAM" pool. On the + expectation that such memory has application specific usage, + and its base EFI memory type is "conventional" answer Y to + arrange for the kernel to reserve it as a "Soft Reserved" + resource, and set aside for direct-access (device-dax) by + default. The memory range can later be optionally assigned to + the page allocator by system administrator policy via the + device-dax kmem facility. Say N to have the kernel treat this + memory as "System RAM" by default. + + If unsure, say Y. + config EFI_PARAMS_FROM_FDT bool help diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f8f8e273d809..e1cb915b45c6 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -81,6 +81,11 @@ bool efi_runtime_disabled(void) return disable_runtime; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_enabled(EFI_MEM_NO_SOFT_RESERVE); +} + static int __init parse_efi_cmdline(char *str) { if (!str) { @@ -94,6 +99,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + return 0; } early_param("efi", parse_efi_cmdline); diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 35dbc2791c97..e02579907f2e 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -32,6 +32,7 @@ static unsigned long __chunk_size = 
EFI_READ_CHUNK_SIZE; static int __section(.data) __nokaslr; static int __section(.data) __quiet; static int __section(.data) __novamap; +static bool __section(.data) efi_nosoftreserve; int __pure nokaslr(void) { @@ -45,6 +46,10 @@ int __pure novamap(void) { return __novamap; } +bool __pure __efi_soft_reserve_enabled(void) +{ + return !efi_nosoftreserve; +} #define EFI_MMAP_NR_SLACK_SLOTS 8 @@ -211,6 +216,10 @@ again: if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -305,6 +314,10 @@ efi_status_t efi_low_alloc_above(efi_system_table_t *sys_table_arg, if (desc->type != EFI_CONVENTIONAL_MEMORY) continue; + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + if (desc->num_pages < nr_pages) continue; @@ -484,6 +497,12 @@ efi_status_t efi_parse_options(char const *cmdline) __novamap = 1; } + if (IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) && + !strncmp(str, "nosoftreserve", 7)) { + str += strlen("nosoftreserve"); + efi_nosoftreserve = 1; + } + /* Group words together, delimited by "," */ while (*str && *str != ' ' && *str != ',') str++; diff --git a/include/linux/efi.h b/include/linux/efi.h index 44c85b559e15..88654910ce29 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1202,6 +1202,7 @@ extern int __init efi_setup_pcdp_console(char *); #define EFI_DBG 8 /* Print additional debug info at runtime */ #define EFI_NX_PE_DATA 9 /* Can runtime data regions be mapped non-executable? */ #define EFI_MEM_ATTR 10 /* Did firmware publish an EFI_MEMORY_ATTRIBUTES table? */ +#define EFI_MEM_NO_SOFT_RESERVE 11 /* Is the kernel configured to ignore soft reservations? 
*/ #ifdef CONFIG_EFI /* @@ -1212,6 +1213,14 @@ static inline bool efi_enabled(int feature) return test_bit(feature, &efi.flags) != 0; } extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); + +bool __pure __efi_soft_reserve_enabled(void); + +static inline bool __pure efi_soft_reserve_enabled(void) +{ + return IS_ENABLED(CONFIG_EFI_SOFT_RESERVE) + && __efi_soft_reserve_enabled(); +} #else static inline bool efi_enabled(int feature) { @@ -1225,6 +1234,11 @@ efi_capsule_pending(int *reset_type) { return false; } + +static inline bool efi_soft_reserve_enabled(void) +{ + return false; +} #endif extern int efi_status_to_err(efi_status_t status); -- cgit v1.2.3 From 16993c0f0a43213e23666ea40e9163887f593ac7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:21 -0800 Subject: arm/efi: EFI soft reservation to memblock UEFI 2.8 defines an EFI_MEMORY_SP attribute bit to augment the interpretation of the EFI Memory Types as "reserved for a specific purpose". The proposed Linux behavior for specific purpose memory is that it is reserved for direct-access (device-dax) by default and not available for any kernel usage, not even as an OOM fallback. Later, through udev scripts or another init mechanism, these device-dax claimed ranges can be reconfigured and hot-added to the available System-RAM with a unique node identifier. This device-dax management scheme implements "soft" in the "soft reserved" designation by allowing some or all of the reservation to be recovered as typical memory. This policy can be disabled at compile-time with CONFIG_EFI_SOFT_RESERVE=n, or runtime with efi=nosoftreserve. For this patch, update the ARM paths that consider EFI_CONVENTIONAL_MEMORY to optionally take the EFI_MEMORY_SP attribute into account as a reservation indicator. Publish the soft reservation as IORES_DESC_SOFT_RESERVED memory, similar to x86. 
(Based on an original patch by Ard) Reviewed-by: Ard Biesheuvel Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- arch/arm64/mm/mmu.c | 2 ++ drivers/firmware/efi/arm-init.c | 9 +++++++++ drivers/firmware/efi/arm-runtime.c | 24 ++++++++++++++++++++++++ drivers/firmware/efi/libstub/arm32-stub.c | 5 +++++ drivers/firmware/efi/libstub/random.c | 4 ++++ 5 files changed, 44 insertions(+) (limited to 'drivers') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 60c929f3683b..2c385fe05fde 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1061,6 +1061,8 @@ int arch_add_memory(int nid, u64 start, u64 size, __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); + memblock_clear_nomap(start, size); + return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, restrictions); } diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 311cd349a862..904fa09e6a6b 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -163,6 +163,15 @@ static __init int is_usable_memory(efi_memory_desc_t *md) case EFI_BOOT_SERVICES_DATA: case EFI_CONVENTIONAL_MEMORY: case EFI_PERSISTENT_MEMORY: + /* + * Special purpose memory is 'soft reserved', which means it + * is set aside initially, but can be hotplugged back in or + * be assigned to the dax driver after boot. + */ + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return false; + /* * According to the spec, these regions are no longer reserved * after calling ExitBootServices(). 
However, we can only use diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index e2ac5fa5531b..899b803842bb 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -121,6 +121,30 @@ static int __init arm_enable_runtime_services(void) return 0; } + if (efi_soft_reserve_enabled()) { + efi_memory_desc_t *md; + + for_each_efi_memory_desc(md) { + int md_size = md->num_pages << EFI_PAGE_SHIFT; + struct resource *res; + + if (!(md->attribute & EFI_MEMORY_SP)) + continue; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (WARN_ON(!res)) + break; + + res->start = md->phys_addr; + res->end = md->phys_addr + md_size - 1; + res->name = "Soft Reserved"; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_SOFT_RESERVED; + + insert_resource(&iomem_resource, res); + } + } + if (efi_runtime_disabled()) { pr_info("EFI runtime services will be disabled.\n"); return 0; diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 41213bf5fcf5..4566640de650 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -146,6 +146,11 @@ static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg, continue; case EFI_CONVENTIONAL_MEMORY: + /* Skip soft reserved conventional memory */ + if (efi_soft_reserve_enabled() && + (desc->attribute & EFI_MEMORY_SP)) + continue; + /* * Reserve the intersection between this entry and the * region. 
diff --git a/drivers/firmware/efi/libstub/random.c b/drivers/firmware/efi/libstub/random.c index b4b1d1dcb5fd..6c188695e730 100644 --- a/drivers/firmware/efi/libstub/random.c +++ b/drivers/firmware/efi/libstub/random.c @@ -46,6 +46,10 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, if (md->type != EFI_CONVENTIONAL_MEMORY) return 0; + if (efi_soft_reserve_enabled() && + (md->attribute & EFI_MEMORY_SP)) + return 0; + region_end = min((u64)ULONG_MAX, md->phys_addr + md->num_pages*EFI_PAGE_SIZE - 1); first_slot = round_up(md->phys_addr, align); -- cgit v1.2.3 From 199c8471761273b7e287914cee968ddf21dfbfe0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:26 -0800 Subject: x86/efi: Add efi_fake_mem support for EFI_MEMORY_SP Given that EFI_MEMORY_SP is platform BIOS policy decision for marking memory ranges as "reserved for a specific purpose" there will inevitably be scenarios where the BIOS omits the attribute in situations where it is desired. Unlike other attributes if the OS wants to reserve this memory from the kernel the reservation needs to happen early in init. So early, in fact, that it needs to happen before e820__memblock_setup() which is a pre-requisite for efi_fake_memmap() that wants to allocate memory for the updated table. Introduce an x86 specific efi_fake_memmap_early() that can search for attempts to set EFI_MEMORY_SP via efi_fake_mem and update the e820 table accordingly. The KASLR code that scans the command line looking for user-directed memory reservations also needs to be updated to consider "efi_fake_mem=nn@ss:0x40000" requests. Acked-by: Ard Biesheuvel Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 10 +++- arch/x86/boot/compressed/kaslr.c | 42 ++++++++++++--- arch/x86/include/asm/efi.h | 8 +++ arch/x86/platform/efi/efi.c | 2 + drivers/firmware/efi/Makefile | 5 +- drivers/firmware/efi/fake_mem.c | 24 ++++----- drivers/firmware/efi/fake_mem.h | 10 ++++ drivers/firmware/efi/x86_fake_mem.c | 69 +++++++++++++++++++++++++ 8 files changed, 147 insertions(+), 23 deletions(-) create mode 100644 drivers/firmware/efi/fake_mem.h create mode 100644 drivers/firmware/efi/x86_fake_mem.c (limited to 'drivers') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2359dc56d82c..5eee3ea05ac5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1196,15 +1196,21 @@ updating original EFI memory map. Region of memory which aa attribute is added to is from ss to ss+nn. + If efi_fake_mem=2G@4G:0x10000,2G@0x10a0000000:0x10000 is specified, EFI_MEMORY_MORE_RELIABLE(0x10000) attribute is added to range 0x100000000-0x180000000 and 0x10a0000000-0x1120000000. + If efi_fake_mem=8G@9G:0x40000 is specified, the + EFI_MEMORY_SP(0x40000) attribute is added to + range 0x240000000-0x43fffffff. + Using this parameter you can do debugging of EFI memmap - related feature. For example, you can do debugging of + related features. For example, you can do debugging of Address Range Mirroring feature even if your box - doesn't support it. + doesn't support it, or mark specific memory as + "soft reserved". efivar_ssdt= [EFI; X86] Name of an EFI variable that contains an SSDT that is to be dynamically loaded by Linux. 
If there are diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index ff6fa81949cd..da0eedd5635d 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -132,8 +132,14 @@ char *skip_spaces(const char *str) #include "../../../../lib/ctype.c" #include "../../../../lib/cmdline.c" +enum parse_mode { + PARSE_MEMMAP, + PARSE_EFI, +}; + static int -parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +parse_memmap(char *p, unsigned long long *start, unsigned long long *size, + enum parse_mode mode) { char *oldp; @@ -156,8 +162,29 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) *start = memparse(p + 1, &p); return 0; case '@': - /* memmap=nn@ss specifies usable region, should be skipped */ - *size = 0; + if (mode == PARSE_MEMMAP) { + /* + * memmap=nn@ss specifies usable region, should + * be skipped + */ + *size = 0; + } else { + unsigned long long flags; + + /* + * efi_fake_mem=nn@ss:attr the attr specifies + * flags that might imply a soft-reservation. 
+ */ + *start = memparse(p + 1, &p); + if (p && *p == ':') { + p++; + if (kstrtoull(p, 0, &flags) < 0) + *size = 0; + else if (flags & EFI_MEMORY_SP) + return 0; + } + *size = 0; + } /* Fall through */ default: /* @@ -172,7 +199,7 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size) return -EINVAL; } -static void mem_avoid_memmap(char *str) +static void mem_avoid_memmap(enum parse_mode mode, char *str) { static int i; @@ -187,7 +214,7 @@ static void mem_avoid_memmap(char *str) if (k) *k++ = 0; - rc = parse_memmap(str, &start, &size); + rc = parse_memmap(str, &start, &size, mode); if (rc < 0) break; str = k; @@ -238,7 +265,6 @@ static void parse_gb_huge_pages(char *param, char *val) } } - static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); @@ -271,7 +297,7 @@ static void handle_mem_options(void) } if (!strcmp(param, "memmap")) { - mem_avoid_memmap(val); + mem_avoid_memmap(PARSE_MEMMAP, val); } else if (strstr(param, "hugepages")) { parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { @@ -284,6 +310,8 @@ static void handle_mem_options(void) goto out; mem_limit = mem_size; + } else if (!strcmp(param, "efi_fake_mem")) { + mem_avoid_memmap(PARSE_EFI, val); } } diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 45f853bce869..d028e9acdf1c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -263,4 +263,12 @@ static inline void efi_reserve_boot_services(void) } #endif /* CONFIG_EFI */ +#ifdef CONFIG_EFI_FAKE_MEMMAP +extern void __init efi_fake_memmap_early(void); +#else +static inline void efi_fake_memmap_early(void) +{ +} +#endif + #endif /* _ASM_X86_EFI_H */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 8609dccea096..38d44f36d5ed 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -262,6 +262,8 @@ int __init efi_memblock_x86_reserve_range(void) if (add_efi_memmap || do_efi_soft_reserve()) 
do_add_efi_memmap(); + efi_fake_memmap_early(); + WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", efi.memmap.desc_version); diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index 4ac2de4dfa72..554d795270d9 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -20,13 +20,16 @@ obj-$(CONFIG_UEFI_CPER) += cper.o obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o obj-$(CONFIG_EFI_RUNTIME_WRAPPERS) += runtime-wrappers.o obj-$(CONFIG_EFI_STUB) += libstub/ -obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o +obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_map.o obj-$(CONFIG_EFI_BOOTLOADER_CONTROL) += efibc.o obj-$(CONFIG_EFI_TEST) += test/ obj-$(CONFIG_EFI_DEV_PATH_PARSER) += dev-path-parser.o obj-$(CONFIG_APPLE_PROPERTIES) += apple-properties.o obj-$(CONFIG_EFI_RCI2_TABLE) += rci2-table.o +fake_map-y += fake_mem.o +fake_map-$(CONFIG_X86) += x86_fake_mem.o + arm-obj-$(CONFIG_EFI) := arm-init.o arm-runtime.o obj-$(CONFIG_ARM) += $(arm-obj-y) obj-$(CONFIG_ARM64) += $(arm-obj-y) diff --git a/drivers/firmware/efi/fake_mem.c b/drivers/firmware/efi/fake_mem.c index 526b45331d96..bb9fc70d0cfa 100644 --- a/drivers/firmware/efi/fake_mem.c +++ b/drivers/firmware/efi/fake_mem.c @@ -17,12 +17,10 @@ #include #include #include -#include +#include "fake_mem.h" -#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM - -static struct efi_mem_range fake_mems[EFI_MAX_FAKEMEM]; -static int nr_fake_mem; +struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +int nr_fake_mem; static int __init cmp_fake_mem(const void *x1, const void *x2) { @@ -50,7 +48,7 @@ void __init efi_fake_memmap(void) /* count up the number of EFI memory descriptor */ for (i = 0; i < nr_fake_mem; i++) { for_each_efi_memory_desc(md) { - struct range *r = &fake_mems[i].range; + struct range *r = &efi_fake_mems[i].range; new_nr_map += efi_memmap_split_count(md, r); } @@ -70,7 +68,7 @@ void __init efi_fake_memmap(void) } for (i = 0; i < nr_fake_mem; i++) - 
efi_memmap_insert(&efi.memmap, new_memmap, &fake_mems[i]); + efi_memmap_insert(&efi.memmap, new_memmap, &efi_fake_mems[i]); /* swap into new EFI memmap */ early_memunmap(new_memmap, efi.memmap.desc_size * new_nr_map); @@ -104,22 +102,22 @@ static int __init setup_fake_mem(char *p) if (nr_fake_mem >= EFI_MAX_FAKEMEM) break; - fake_mems[nr_fake_mem].range.start = start; - fake_mems[nr_fake_mem].range.end = start + mem_size - 1; - fake_mems[nr_fake_mem].attribute = attribute; + efi_fake_mems[nr_fake_mem].range.start = start; + efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1; + efi_fake_mems[nr_fake_mem].attribute = attribute; nr_fake_mem++; if (*p == ',') p++; } - sort(fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), + sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range), cmp_fake_mem, NULL); for (i = 0; i < nr_fake_mem; i++) pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]", - fake_mems[i].attribute, fake_mems[i].range.start, - fake_mems[i].range.end); + efi_fake_mems[i].attribute, efi_fake_mems[i].range.start, + efi_fake_mems[i].range.end); return *p == '\0' ? 0 : -EINVAL; } diff --git a/drivers/firmware/efi/fake_mem.h b/drivers/firmware/efi/fake_mem.h new file mode 100644 index 000000000000..d52791af4b18 --- /dev/null +++ b/drivers/firmware/efi/fake_mem.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __EFI_FAKE_MEM_H__ +#define __EFI_FAKE_MEM_H__ +#include + +#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM + +extern struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM]; +extern int nr_fake_mem; +#endif /* __EFI_FAKE_MEM_H__ */ diff --git a/drivers/firmware/efi/x86_fake_mem.c b/drivers/firmware/efi/x86_fake_mem.c new file mode 100644 index 000000000000..e5d6d5a1b240 --- /dev/null +++ b/drivers/firmware/efi/x86_fake_mem.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2019 Intel Corporation. All rights reserved. 
*/ +#include +#include +#include "fake_mem.h" + +void __init efi_fake_memmap_early(void) +{ + int i; + + /* + * The late efi_fake_mem() call can handle all requests if + * EFI_MEMORY_SP support is disabled. + */ + if (!efi_soft_reserve_enabled()) + return; + + if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem) + return; + + /* + * Given that efi_fake_memmap() needs to perform memblock + * allocations it needs to run after e820__memblock_setup(). + * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given + * address range that potentially needs to mark the memory as + * reserved prior to e820__memblock_setup(). Update e820 + * directly if EFI_MEMORY_SP is specified for an + * EFI_CONVENTIONAL_MEMORY descriptor. + */ + for (i = 0; i < nr_fake_mem; i++) { + struct efi_mem_range *mem = &efi_fake_mems[i]; + efi_memory_desc_t *md; + u64 m_start, m_end; + + if ((mem->attribute & EFI_MEMORY_SP) == 0) + continue; + + m_start = mem->range.start; + m_end = mem->range.end; + for_each_efi_memory_desc(md) { + u64 start, end; + + if (md->type != EFI_CONVENTIONAL_MEMORY) + continue; + + start = md->phys_addr; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1; + + if (m_start <= end && m_end >= start) + /* fake range overlaps descriptor */; + else + continue; + + /* + * Trim the boundary of the e820 update to the + * descriptor in case the fake range overlaps + * !EFI_CONVENTIONAL_MEMORY + */ + start = max(start, m_start); + end = min(end, m_end); + + if (end <= start) + continue; + e820__range_update(start, end - start + 1, E820_TYPE_RAM, + E820_TYPE_SOFT_RESERVED); + e820__update_table(e820_table); + } + } +} -- cgit v1.2.3 From 33dd70752cd76f4d883a165a674f13121a4155ed Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:31 -0800 Subject: lib: Uplevel the pmem "region" ida to a global allocator In preparation for handling platform differentiated memory types beyond persistent memory, uplevel the "region" identifier to a global number space. 
This enables a device-dax instance to be registered to any memory type with guaranteed unique names. Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/nvdimm/Kconfig | 1 + drivers/nvdimm/core.c | 1 - drivers/nvdimm/nd-core.h | 1 - drivers/nvdimm/region_devs.c | 13 ++++--------- include/linux/memregion.h | 19 +++++++++++++++++++ lib/Kconfig | 3 +++ lib/Makefile | 1 + lib/memregion.c | 18 ++++++++++++++++++ 8 files changed, 46 insertions(+), 11 deletions(-) create mode 100644 include/linux/memregion.h create mode 100644 lib/memregion.c (limited to 'drivers') diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 36af7af6b7cf..b7d1eb38b27d 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -4,6 +4,7 @@ menuconfig LIBNVDIMM depends on PHYS_ADDR_T_64BIT depends on HAS_IOMEM depends on BLK_DEV + select MEMREGION help Generic support for non-volatile memory devices including ACPI-6-NFIT defined resources. On platforms that define an diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 9204f1e9fd14..e592c4964674 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -455,7 +455,6 @@ static __exit void libnvdimm_exit(void) nd_region_exit(); nvdimm_exit(); nvdimm_bus_exit(); - nd_region_devs_exit(); nvdimm_devs_exit(); } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 25fa121104d0..aa059439fca0 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -114,7 +114,6 @@ struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev); int __init nvdimm_bus_init(void); void nvdimm_bus_exit(void); void nvdimm_devs_exit(void); -void nd_region_devs_exit(void); struct nd_region; void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev); void nd_region_create_ns_seed(struct nd_region *nd_region); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index ef423ba1a711..fbf34cf688f4 100644 --- 
a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -3,6 +3,7 @@ * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. */ #include +#include #include #include #include @@ -19,7 +20,6 @@ */ #include -static DEFINE_IDA(region_ida); static DEFINE_PER_CPU(int, flush_idx); static int nvdimm_map_flush(struct device *dev, struct nvdimm *nvdimm, int dimm, @@ -133,7 +133,7 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } free_percpu(nd_region->lane); - ida_simple_remove(&region_ida, nd_region->id); + memregion_free(nd_region->id); if (is_nd_blk(dev)) kfree(to_nd_blk_region(dev)); else @@ -985,7 +985,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!region_buf) return NULL; - nd_region->id = ida_simple_get(&region_ida, 0, 0, GFP_KERNEL); + nd_region->id = memregion_alloc(GFP_KERNEL); if (nd_region->id < 0) goto err_id; @@ -1044,7 +1044,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return nd_region; err_percpu: - ida_simple_remove(&region_ida, nd_region->id); + memregion_free(nd_region->id); err_id: kfree(region_buf); return NULL; @@ -1216,8 +1216,3 @@ int nd_region_conflict(struct nd_region *nd_region, resource_size_t start, return device_for_each_child(&nvdimm_bus->dev, &ctx, region_conflict); } - -void __exit nd_region_devs_exit(void) -{ - ida_destroy(&region_ida); -} diff --git a/include/linux/memregion.h b/include/linux/memregion.h new file mode 100644 index 000000000000..7de7c0a1444e --- /dev/null +++ b/include/linux/memregion.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MEMREGION_H_ +#define _MEMREGION_H_ +#include +#include + +#ifdef CONFIG_MEMREGION +int memregion_alloc(gfp_t gfp); +void memregion_free(int id); +#else +static inline int memregion_alloc(gfp_t gfp) +{ + return -ENOMEM; +} +void memregion_free(int id) +{ +} +#endif +#endif /* _MEMREGION_H_ */ diff --git a/lib/Kconfig b/lib/Kconfig index 183f92a297ca..0dc043ac271d
100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -606,6 +606,9 @@ config ARCH_NO_SG_CHAIN config ARCH_HAS_PMEM_API bool +config MEMREGION + bool + # use memcpy to implement user copies for nommu architectures config UACCESS_MEMCPY bool diff --git a/lib/Makefile b/lib/Makefile index c5892807e06f..2fb7b47018f1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_SG_POOL) += sg_pool.o +obj-$(CONFIG_MEMREGION) += memregion.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o obj-$(CONFIG_IRQ_POLL) += irq_poll.o diff --git a/lib/memregion.c b/lib/memregion.c new file mode 100644 index 000000000000..77c85b5251da --- /dev/null +++ b/lib/memregion.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* identifiers for device / performance-differentiated memory regions */ +#include +#include + +static DEFINE_IDA(memregion_ids); + +int memregion_alloc(gfp_t gfp) +{ + return ida_alloc(&memregion_ids, gfp); +} +EXPORT_SYMBOL(memregion_alloc); + +void memregion_free(int id) +{ + ida_free(&memregion_ids, id); +} +EXPORT_SYMBOL(memregion_free); -- cgit v1.2.3 From 460370ab20b6cc174256e46e192adf01e730faf6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:37 -0800 Subject: dax: Fix alloc_dax_region() compile warning PFN flags are (unsigned long long), fix the alloc_dax_region() calling convention to fix warnings of the form: >> include/linux/pfn_t.h:18:17: warning: large integer implicitly truncated to unsigned type [-Woverflow] #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3)) Reported-by: kbuild test robot Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- drivers/dax/bus.c | 2 +- drivers/dax/bus.h | 2 +- drivers/dax/dax-private.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 8fafbeab510a..eccdda1f7b71 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -227,7 +227,7 @@ static void dax_region_unregister(void *region) struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, int target_node, unsigned int align, - unsigned long pfn_flags) + unsigned long long pfn_flags) { struct dax_region *dax_region; diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index 8619e3299943..9e4eba67e8b9 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -11,7 +11,7 @@ struct dax_region; void dax_region_put(struct dax_region *dax_region); struct dax_region *alloc_dax_region(struct device *parent, int region_id, struct resource *res, int target_node, unsigned int align, - unsigned long flags); + unsigned long long flags); enum dev_dax_subsys { DEV_DAX_BUS, diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index 6ccca3b890d6..3107ce80e809 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -32,7 +32,7 @@ struct dax_region { struct device *dev; unsigned int align; struct resource res; - unsigned long pfn_flags; + unsigned long long pfn_flags; }; /** -- cgit v1.2.3 From a6c7f4c6aea5f4ca6056b06cec7ebd79f8c23e33 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:43 -0800 Subject: device-dax: Add a driver for "hmem" devices Platform firmware like EFI/ACPI may publish "hmem" platform devices. Such a device is a performance differentiated memory range likely reserved for an application specific use case. The driver gives access to 100% of the capacity via a device-dax mmap instance by default. 
However, if over-subscription and other kernel memory management is desired, the resulting dax device can be assigned to the core-mm via the kmem driver. This consumes "hmem" devices; the producer of "hmem" devices is saved for a follow-on patch so that it can reference the new CONFIG_DEV_DAX_HMEM symbol to gate performing the enumeration work. Reported-by: kbuild test robot Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/dax/Kconfig | 27 ++++++++++++++++++----- drivers/dax/Makefile | 2 ++ drivers/dax/hmem.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/memregion.h | 4 ++++ 4 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 drivers/dax/hmem.c (limited to 'drivers') diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index f33c73e4af41..3b6c06f07326 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -32,19 +32,36 @@ config DEV_DAX_PMEM Say M if unsure +config DEV_DAX_HMEM + tristate "HMEM DAX: direct access to 'specific purpose' memory" + depends on EFI_SOFT_RESERVE + default DEV_DAX + help + EFI 2.8 platforms, and others, may advertise 'specific purpose' + memory. For example, a high bandwidth memory pool. The + indication from platform firmware is meant to reserve the + memory from typical usage by default. This driver creates + device-dax instances for these memory ranges, and that also + enables the possibility to assign them to the DEV_DAX_KMEM + driver to override the reservation and add them to kernel + "System RAM" pool. + + Say M if unsure. + config DEV_DAX_KMEM tristate "KMEM DAX: volatile-use of persistent memory" default DEV_DAX depends on DEV_DAX depends on MEMORY_HOTPLUG # for add_memory() and friends help - Support access to persistent memory as if it were RAM. This - allows easier use of persistent memory by unmodified - applications.
+ Support access to persistent, or other performance + differentiated memory as if it were System RAM. This allows + easier use of persistent memory by unmodified applications, or + adds core kernel memory services to heterogeneous memory types + (HMEM) marked "reserved" by platform firmware. To use this feature, a DAX device must be unbound from the - device_dax driver (PMEM DAX) and bound to this kmem driver - on each boot. + device_dax driver and bound to this kmem driver on each boot. Say N if unsure. diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 81f7d54dadfb..80065b38b3c4 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -2,9 +2,11 @@ obj-$(CONFIG_DAX) += dax.o obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o +obj-$(CONFIG_DEV_DAX_HMEM) += dax_hmem.o dax-y := super.o dax-y += bus.o device_dax-y := device.o +dax_hmem-y := hmem.o obj-y += pmem/ diff --git a/drivers/dax/hmem.c b/drivers/dax/hmem.c new file mode 100644 index 000000000000..fe7214daf62e --- /dev/null +++ b/drivers/dax/hmem.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "bus.h" + +static int dax_hmem_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dev_pagemap pgmap = { }; + struct dax_region *dax_region; + struct memregion_info *mri; + struct dev_dax *dev_dax; + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + mri = dev->platform_data; + memcpy(&pgmap.res, res, sizeof(*res)); + + dax_region = alloc_dax_region(dev, pdev->id, res, mri->target_node, + PMD_SIZE, PFN_DEV|PFN_MAP); + if (!dax_region) + return -ENOMEM; + + dev_dax = devm_create_dev_dax(dax_region, 0, &pgmap); + if (IS_ERR(dev_dax)) + return PTR_ERR(dev_dax); + + /* child dev_dax instances now own the lifetime of the dax_region */ + dax_region_put(dax_region); + return 0; +} + +static int dax_hmem_remove(struct platform_device 
*pdev) +{ + /* devm handles teardown */ + return 0; +} + +static struct platform_driver dax_hmem_driver = { + .probe = dax_hmem_probe, + .remove = dax_hmem_remove, + .driver = { + .name = "hmem", + }, +}; + +module_platform_driver(dax_hmem_driver); + +MODULE_ALIAS("platform:hmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 7de7c0a1444e..e11595256cac 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -4,6 +4,10 @@ #include #include +struct memregion_info { + int target_node; +}; + #ifdef CONFIG_MEMREGION int memregion_alloc(gfp_t gfp); void memregion_free(int id); -- cgit v1.2.3 From 0f847f8c0813c8ad7df5174c8f27bcba5926b972 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:49 -0800 Subject: ACPI: NUMA: HMAT: Register HMAT at device_initcall level In preparation for registering device-dax instances for accessing EFI specific-purpose memory, arrange for the HMAT registration to occur later in the init process. Critically HMAT initialization needs to occur after e820__reserve_resources_late() which is the point at which the iomem resource tree is populated with "Application Reserved" (IORES_DESC_APPLICATION_RESERVED). e820__reserve_resources_late() happens at subsys_initcall time. Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 8b0de8a3c647..00e0a270ece3 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -748,4 +748,4 @@ out_put: acpi_put_table(tbl); return 0; } -subsys_initcall(hmat_init); +device_initcall(hmat_init); -- cgit v1.2.3 From cf8741ac57ed48613e49559d3e5ae43f56291e4c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Nov 2019 17:43:55 -0800 Subject: ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device Memory that has been tagged EFI_MEMORY_SP, and has performance properties described by the ACPI HMAT is expected to have an application specific consumer. Those consumers may want 100% of the memory capacity to be reserved from any usage by the kernel. By default, with this enabling, a platform device is created to represent this differentiated resource. The device-dax "hmem" driver claims these devices by default and provides an mmap interface for the target application. If the administrator prefers, the hmem resource range can be made available to the core-mm via the device-dax hotplug facility, kmem, to online the memory with its own numa node. This was tested with an emulated HMAT produced by qemu (with the pending HMAT enabling patches), and "efi_fake_mem=8G@9G:0x40000" on the kernel command line to mark the memory ranges associated with node2 and node3 as EFI_MEMORY_SP. 
qemu numa configuration options: -numa node,mem=4G,cpus=0-19,nodeid=0 -numa node,mem=4G,cpus=20-39,nodeid=1 -numa node,mem=4G,nodeid=2 -numa node,mem=4G,nodeid=3 -numa dist,src=0,dst=0,val=10 -numa dist,src=0,dst=1,val=21 -numa dist,src=0,dst=2,val=21 -numa dist,src=0,dst=3,val=21 -numa dist,src=1,dst=0,val=21 -numa dist,src=1,dst=1,val=10 -numa dist,src=1,dst=2,val=21 -numa dist,src=1,dst=3,val=21 -numa dist,src=2,dst=0,val=21 -numa dist,src=2,dst=1,val=21 -numa dist,src=2,dst=2,val=10 -numa dist,src=2,dst=3,val=21 -numa dist,src=3,dst=0,val=21 -numa dist,src=3,dst=1,val=21 -numa dist,src=3,dst=2,val=21 -numa dist,src=3,dst=3,val=10 -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5 -numa hmat-lb,initiator=0,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5 -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10 -numa hmat-lb,initiator=0,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10 -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15 -numa hmat-lb,initiator=0,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15 -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20 -numa hmat-lb,initiator=0,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20 -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-latency,base-lat=10,latency=10 -numa hmat-lb,initiator=1,target=0,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=10 -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-latency,base-lat=10,latency=5 -numa hmat-lb,initiator=1,target=1,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=5 -numa hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-latency,base-lat=10,latency=15 -numa 
hmat-lb,initiator=1,target=2,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=15 -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-latency,base-lat=10,latency=20 -numa hmat-lb,initiator=1,target=3,hierarchy=memory,data-type=access-bandwidth,base-bw=20,bandwidth=20 Result: [ { "path":"\/platform\/hmem.1", "id":1, "size":"4.00 GiB (4.29 GB)", "align":2097152, "devices":[ { "chardev":"dax1.0", "size":"4.00 GiB (4.29 GB)" } ] }, { "path":"\/platform\/hmem.0", "id":0, "size":"4.00 GiB (4.29 GB)", "align":2097152, "devices":[ { "chardev":"dax0.0", "size":"4.00 GiB (4.29 GB)" } ] } ] [..] 240000000-43fffffff : Soft Reserved 240000000-33fffffff : hmem.0 240000000-33fffffff : dax0.0 340000000-43fffffff : hmem.1 340000000-43fffffff : dax1.0 Reviewed-by: Dave Hansen Signed-off-by: Dan Williams Acked-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/Kconfig | 1 + drivers/acpi/numa/hmat.c | 136 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 125 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/numa/Kconfig b/drivers/acpi/numa/Kconfig index acbd5aa76e40..fcf2e556d69d 100644 --- a/drivers/acpi/numa/Kconfig +++ b/drivers/acpi/numa/Kconfig @@ -9,6 +9,7 @@ config ACPI_HMAT bool "ACPI Heterogeneous Memory Attribute Table Support" depends on ACPI_NUMA select HMEM_REPORTING + select MEMREGION help If set, this option has the kernel parse and report the platform's ACPI HMAT (Heterogeneous Memory Attributes Table), diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 00e0a270ece3..1ce366a7bc55 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -8,12 +8,18 @@ * the applicable attributes with the node's interfaces. 
*/ +#define pr_fmt(fmt) "acpi/hmat: " fmt +#define dev_fmt(fmt) "acpi/hmat: " fmt + #include #include #include #include #include +#include +#include #include +#include #include #include #include @@ -49,6 +55,7 @@ struct memory_target { struct list_head node; unsigned int memory_pxm; unsigned int processor_pxm; + struct resource memregions; struct node_hmem_attrs hmem_attrs; struct list_head caches; struct node_cache_attrs cache_attrs; @@ -104,22 +111,36 @@ static __init void alloc_memory_initiator(unsigned int cpu_pxm) list_add_tail(&initiator->node, &initiators); } -static __init void alloc_memory_target(unsigned int mem_pxm) +static __init void alloc_memory_target(unsigned int mem_pxm, + resource_size_t start, resource_size_t len) { struct memory_target *target; target = find_mem_target(mem_pxm); - if (target) - return; - - target = kzalloc(sizeof(*target), GFP_KERNEL); - if (!target) - return; + if (!target) { + target = kzalloc(sizeof(*target), GFP_KERNEL); + if (!target) + return; + target->memory_pxm = mem_pxm; + target->processor_pxm = PXM_INVAL; + target->memregions = (struct resource) { + .name = "ACPI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, + }; + list_add_tail(&target->node, &targets); + INIT_LIST_HEAD(&target->caches); + } - target->memory_pxm = mem_pxm; - target->processor_pxm = PXM_INVAL; - list_add_tail(&target->node, &targets); - INIT_LIST_HEAD(&target->caches); + /* + * There are potentially multiple ranges per PXM, so record each + * in the per-target memregions resource tree. 
+ */ + if (!__request_region(&target->memregions, start, len, "memory target", + IORESOURCE_MEM)) + pr_warn("failed to reserve %#llx - %#llx in pxm: %d\n", + start, start + len, mem_pxm); } static __init const char *hmat_data_type(u8 type) @@ -452,7 +473,7 @@ static __init int srat_parse_mem_affinity(union acpi_subtable_headers *header, return -EINVAL; if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) return 0; - alloc_memory_target(ma->proximity_domain); + alloc_memory_target(ma->proximity_domain, ma->base_address, ma->length); return 0; } @@ -613,10 +634,91 @@ static void hmat_register_target_perf(struct memory_target *target) node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0); } +static void hmat_register_target_device(struct memory_target *target, + struct resource *r) +{ + /* define a clean / non-busy resource for the platform device */ + struct resource res = { + .start = r->start, + .end = r->end, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev; + struct memregion_info info; + int rc, id; + + rc = region_intersects(res.start, resource_size(&res), IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED); + if (rc != REGION_INTERSECTS) + return; + + id = memregion_alloc(GFP_KERNEL); + if (id < 0) { + pr_err("memregion allocation failure for %pr\n", &res); + return; + } + + pdev = platform_device_alloc("hmem", id); + if (!pdev) { + pr_err("hmem device allocation failure for %pr\n", &res); + goto out_pdev; + } + + pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->memory_pxm); + info = (struct memregion_info) { + .target_node = acpi_map_pxm_to_node(target->memory_pxm), + }; + rc = platform_device_add_data(pdev, &info, sizeof(info)); + if (rc < 0) { + pr_err("hmem memregion_info allocation failure for %pr\n", &res); + goto out_pdev; + } + + rc = platform_device_add_resources(pdev, &res, 1); + if (rc < 0) { + pr_err("hmem resource allocation failure for %pr\n", &res); + goto out_resource; + } + + rc = platform_device_add(pdev); + if (rc < 0) { + 
dev_err(&pdev->dev, "device add failed for %pr\n", &res); + goto out_resource; + } + + return; + +out_resource: + put_device(&pdev->dev); +out_pdev: + memregion_free(id); +} + +static __init void hmat_register_target_devices(struct memory_target *target) +{ + struct resource *res; + + /* + * Do not bother creating devices if no driver is available to + * consume them. + */ + if (!IS_ENABLED(CONFIG_DEV_DAX_HMEM)) + return; + + for (res = target->memregions.child; res; res = res->sibling) + hmat_register_target_device(target, res); +} + static void hmat_register_target(struct memory_target *target) { int nid = pxm_to_node(target->memory_pxm); + /* + * Devices may belong to either an offline or online + * node, so unconditionally add them. + */ + hmat_register_target_devices(target); + /* * Skip offline nodes. This can happen when memory * marked EFI_MEMORY_SP, "specific purpose", is applied @@ -677,11 +779,21 @@ static __init void hmat_free_structures(void) struct target_cache *tcache, *cnext; list_for_each_entry_safe(target, tnext, &targets, node) { + struct resource *res, *res_next; + list_for_each_entry_safe(tcache, cnext, &target->caches, node) { list_del(&tcache->node); kfree(tcache); } + list_del(&target->node); + res = target->memregions.child; + while (res) { + res_next = res->sibling; + __release_region(&target->memregions, res->start, + resource_size(res)); + res = res_next; + } kfree(target); } -- cgit v1.2.3 From 4caa525b783b0abe7bc06e41220b337ba311bbf7 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Mon, 28 Oct 2019 10:11:18 +0100 Subject: ACPI: HMAT: don't mix pxm and nid when setting memory target processor_pxm On systems where PXMs and nids are in different order, memory initiators exposed in sysfs could be wrong: On dual-socket CLX with SNC enabled (4 nodes, 1 and 2 swapped between PXMs and nids), node1 would only get node2 as initiator, and node2 would only get node1. 
With this patch, we get node1 as the only initiator of itself, and node2 as the only initiator of itself, as expected. This should likely go to stable up to 5.2. Signed-off-by: Brice Goglin Signed-off-by: Rafael J. Wysocki --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 1ce366a7bc55..42cafeaac336 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -438,7 +438,7 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade pr_debug("HMAT: Invalid Processor Domain\n"); return -EINVAL; } - target->processor_pxm = p_node; + target->processor_pxm = p->processor_PD; } return 0; -- cgit v1.2.3 From 59b2c5b63587a9ed2292ccce32fd69d8de815036 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 11 Nov 2019 16:34:26 -0500 Subject: ACPI: NUMA: HMAT: fix a section mismatch Commit cf8741ac57ed ("ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device") introduced a linker warning, WARNING: vmlinux.o(.text+0x64ec3c): Section mismatch in reference from the function hmat_register_target() to the function .init.text:hmat_register_target_devices() The function hmat_register_target() references the function __init hmat_register_target_devices(). Since hmat_register_target() is also called from hmat_callback(), and then register_hotmemory_notifier(), where it should not be freed when hmat_init() is done, it indicates that the __init annotation of hmat_register_target_devices() is incorrect. Fixes: cf8741ac57ed ("ACPI: NUMA: HMAT: Register "soft reserved" memory as an "hmem" device") Signed-off-by: Qian Cai Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/numa/hmat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 42cafeaac336..600ae3babd15 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -694,7 +694,7 @@ out_pdev: memregion_free(id); } -static __init void hmat_register_target_devices(struct memory_target *target) +static void hmat_register_target_devices(struct memory_target *target) { struct resource *res; -- cgit v1.2.3 From 0f1839d0888700389e3062b4787046d61780d6d9 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Wed, 30 Oct 2019 14:34:03 +0800 Subject: ACPI: HMAT: use %u instead of %d to print u32 values Use %u instead of %d to print u32 values to expand the value range, especially when latency or bandwidth value is bigger than INT_MAX. Then HMAT latency can support up to 4.29s and bandwidth can support up to 4PB/s. Reviewed-by: Dan Williams Reviewed-by: Jingqi Liu Signed-off-by: Tao Xu Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/numa/hmat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 600ae3babd15..2c32cfb72370 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -293,7 +293,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, u8 type, mem_hier; if (hmat_loc->header.length < sizeof(*hmat_loc)) { - pr_notice("HMAT: Unexpected locality header length: %d\n", + pr_notice("HMAT: Unexpected locality header length: %u\n", hmat_loc->header.length); return -EINVAL; } @@ -305,12 +305,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds + sizeof(*inits) * ipds + sizeof(*targs) * tpds; if (hmat_loc->header.length < total_size) { - pr_notice("HMAT: Unexpected locality header length:%d, minimum required:%d\n", + pr_notice("HMAT: Unexpected locality header length:%u, minimum required:%u\n", hmat_loc->header.length, total_size); return -EINVAL; } - pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%d Target Domains:%d Base:%lld\n", + pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n", hmat_loc->flags, hmat_data_type(type), ipds, tpds, hmat_loc->entry_base_unit); @@ -323,7 +323,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header, value = hmat_normalize(entries[init * tpds + targ], hmat_loc->entry_base_unit, type); - pr_info(" Initiator-Target[%d-%d]:%d%s\n", + pr_info(" Initiator-Target[%u-%u]:%u%s\n", inits[init], targs[targ], value, hmat_data_type_suffix(type)); @@ -350,13 +350,13 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header, u32 attrs; if (cache->header.length < sizeof(*cache)) { - pr_notice("HMAT: Unexpected cache header length: %d\n", + pr_notice("HMAT: Unexpected cache header length: %u\n", 
cache->header.length); return -EINVAL; } attrs = cache->cache_attributes; - pr_info("HMAT: Cache: Domain:%d Size:%llu Attrs:%08x SMBIOS Handles:%d\n", + pr_info("HMAT: Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n", cache->memory_PD, cache->cache_size, attrs, cache->number_of_SMBIOShandles); @@ -411,17 +411,17 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade struct memory_target *target = NULL; if (p->header.length != sizeof(*p)) { - pr_notice("HMAT: Unexpected address range header length: %d\n", + pr_notice("HMAT: Unexpected address range header length: %u\n", p->header.length); return -EINVAL; } if (hmat_revision == 1) - pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%u Memory Domain:%u\n", p->reserved3, p->reserved4, p->flags, p->processor_PD, p->memory_PD); else - pr_info("HMAT: Memory Flags:%04x Processor Domain:%d Memory Domain:%d\n", + pr_info("HMAT: Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n", p->flags, p->processor_PD, p->memory_PD); if (p->flags & ACPI_HMAT_MEMORY_PD_VALID && hmat_revision == 1) { -- cgit v1.2.3