From d81056b5278c520f1dede99bd59c14366ac8b1e1 Mon Sep 17 00:00:00 2001 From: Lukasz Anaczkowski Date: Wed, 9 Sep 2015 15:47:29 +0200 Subject: x86, ACPI: Handle apic/x2apic entries in MADT in correct order ACPI specifies the following rules when listing APIC IDs: (1) Boot processor is listed first (2) For multi-threaded processors, BIOS should list the first logical processor of each of the individual multi-threaded processors in MADT before listing any of the second logical processors. (3) APIC IDs < 0xFF should be listed in APIC subtable, APIC IDs >= 0xFF should be listed in X2APIC subtable Because of above, when there's more than 0xFF logical CPUs, BIOS interleaves APIC/X2APIC subtables. Assuming, there's 72 cores, 72 hyper-threads each, 288 CPUs total, listing is like this: APIC (0,4,8, .., 252) X2APIC (258,260,264, .. 284) APIC (1,5,9,...,253) X2APIC (259,261,265,...,285) APIC (2,6,10,...,254) X2APIC (260,262,266,..,286) APIC (3,7,11,...,251) X2APIC (255,261,262,266,..,287) Now, before this patch, due to how ACPI MADT subtables were parsed (BSP then X2APIC then APIC), kernel enumerated CPUs in reverted order (i.e. high APIC IDs were getting low logical IDs, and low APIC IDs were getting high logical IDs). This is wrong for the following reasons: () it's hard to predict how cores and threads are enumerated () when it's hard to predict, s/w threads cannot be properly affinitized causing significant performance impact due to e.g. inproper cache sharing () enumeration is inconsistent with how threads are enumerated on other Intel Xeon processors So, order in which MADT APIC/X2APIC handlers are passed is reverse and both handlers are passed to be called during same MADT table to walk to achieve correct CPU enumeration. In scenario when someone boots kernel with options 'maxcpus=72 nox2apic', in result less cores may be booted, since some of the CPUs the kernel will try to use will have APIC ID >= 0xFF. In such case, one should not pass 'nox2apic'. Disclimer: code parsing MADT APIC/X2APIC has not been touched since 2009, when X2APIC support was initially added. I do not know why MADT parsing code was added in the reversed order in the first place. I guess it didn't matter at that time since nobody cared about cores with APIC IDs >= 0xFF, right? This patch is based on work of "Yinghai Lu " previously published at https://lkml.org/lkml/2013/1/21/563 Here's the explanation why parsing interface needs to be changed and why simpler approach will not work https://lkml.org/lkml/2015/9/7/285 Signed-off-by: Lukasz Anaczkowski Acked-by: Thomas Gleixner (commit message) Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index ded848c20e05..e75907601a41 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -976,6 +976,8 @@ static int __init acpi_parse_madt_lapic_entries(void) { int count; int x2count = 0; + int ret; + struct acpi_subtable_proc madt_proc[2]; if (!cpu_has_apic) return -ENODEV; @@ -999,10 +1001,22 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + + x2count = madt_proc[0].count; + count = madt_proc[1].count; } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); -- cgit v1.2.3 From 6a35fc2d6c22bafe45117cdc5d8cee332244edbb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Wed, 14 Oct 2015 16:11:59 -0700 Subject: cpufreq: intel_pstate: get P1 from TAR when available After Ivybridge, the max non turbo ratio obtained from platform info msr is not always guaranteed P1 on client platforms. The max non turbo activation ratio (TAR), determines the max for the current level of TDP. The ratio in platform info is physical max. The TAR MSR can be locked, so updating this value is not possible on all platforms. This change gets this ratio from MSR TURBO_ACTIVATION_RATIO if available, but also do some sanity checking to make sure that this value is correct. The sanity check involves reading the TDP ratio for the current tdp control value when platform has configurable TDP present and matching TAC with this. Signed-off-by: Srinivas Pandruvada Acked-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/msr-index.h | 7 +++++++ drivers/cpufreq/intel_pstate.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b8c14bb7fc8f..9f3905697f12 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -206,6 +206,13 @@ #define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0 #define MSR_RING_PERF_LIMIT_REASONS 0x000006B1 +/* Config TDP MSRs */ +#define MSR_CONFIG_TDP_NOMINAL 0x00000648 +#define MSR_CONFIG_TDP_LEVEL1 0x00000649 +#define MSR_CONFIG_TDP_LEVEL2 0x0000064A +#define MSR_CONFIG_TDP_CONTROL 0x0000064B +#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C + /* Hardware P state interface */ #define MSR_PPERF 0x0000064e #define MSR_PERF_LIMIT_REASONS 0x0000064f diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3af9dd7332e6..da92d0201d52 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -43,7 +43,6 @@ #define int_tofp(X) ((int64_t)(X) << FRAC_BITS) #define fp_toint(X) ((X) >> FRAC_BITS) - static inline int32_t mul_fp(int32_t x, int32_t y) { return ((int64_t)x * (int64_t)y) >> FRAC_BITS; @@ -593,10 +592,42 @@ static int core_get_min_pstate(void) static int core_get_max_pstate(void) { - u64 value; + u64 tar; + u64 plat_info; + int max_pstate; + int err; + + rdmsrl(MSR_PLATFORM_INFO, plat_info); + max_pstate = (plat_info >> 8) & 0xFF; + + err = rdmsrl_safe(MSR_TURBO_ACTIVATION_RATIO, &tar); + if (!err) { + /* Do some sanity checking for safety */ + if (plat_info & 0x600000000) { + u64 tdp_ctrl; + u64 tdp_ratio; + int tdp_msr; + + err = rdmsrl_safe(MSR_CONFIG_TDP_CONTROL, &tdp_ctrl); + if (err) + goto skip_tar; + + tdp_msr = MSR_CONFIG_TDP_NOMINAL + tdp_ctrl; + err = rdmsrl_safe(tdp_msr, &tdp_ratio); + if (err) + goto skip_tar; + + if (tdp_ratio - 1 == tar) { + max_pstate = tar; + pr_debug("max_pstate=TAC %x\n", max_pstate); + } else { + goto skip_tar; + } + } + } - rdmsrl(MSR_PLATFORM_INFO, value); - return (value >> 8) & 0xFF; +skip_tar: + return max_pstate; } static int core_get_turbo_pstate(void) -- cgit v1.2.3 From a3669868d99c8647d780895d83e74d9a921eba2b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 14 Oct 2015 14:29:40 +0800 Subject: ACPI/PCI: Reset acpi_root_dev->domain to 0 when pci_ignore_seg is set Reset acpi_root_dev->domain to 0 when pci_ignore_seg is set to keep consistence between ACPI PCI root device and PCI host bridge device. Acked-by: Bjorn Helgaas Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ff9911707160..5bc018559cc4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -401,7 +401,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) int node; if (pci_ignore_seg) - domain = 0; + root->segment = domain = 0; if (domain && !pci_domains_supported) { printk(KERN_WARNING "pci_bus %04x:%02x: " -- cgit v1.2.3 From 4d6b4e69a245e9df4b84dba387596086cb66887d Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 14 Oct 2015 14:29:41 +0800 Subject: x86/PCI/ACPI: Use common interface to support PCI host bridge Use common interface to simplify ACPI PCI host bridge implementation. Signed-off-by: Jiang Liu Reviewed-by: Hanjun Guo Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- arch/x86/pci/acpi.c | 294 ++++++++++++++++------------------------------------ 1 file changed, 87 insertions(+), 207 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 5bc018559cc4..3cd69832d7f4 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,16 +4,15 @@ #include #include #include +#include #include #include struct pci_root_info { - struct acpi_device *bridge; - char name[16]; + struct acpi_pci_root_info common; struct pci_sysdata sd; #ifdef CONFIG_PCI_MMCONFIG bool mcfg_added; - u16 segment; u8 start_bus; u8 end_bus; #endif @@ -178,15 +177,18 @@ static int check_segment(u16 seg, struct device *dev, char *estr) return 0; } -static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, - u8 end, phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { - int result; - struct device *dev = &info->bridge->dev; + int result, seg; + struct pci_root_info *info; + struct acpi_pci_root *root = ci->root; + struct device *dev = &ci->bridge->dev; - info->start_bus = start; - info->end_bus = end; + info = container_of(ci, struct pci_root_info, common); + info->start_bus = (u8)root->secondary.start; + info->end_bus = (u8)root->secondary.end; info->mcfg_added = false; + seg = info->sd.domain; /* return success if MMCFG is not in use */ if (raw_pci_ext_ops && raw_pci_ext_ops != &pci_mmcfg) @@ -195,7 +197,8 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, if (!(pci_probe & PCI_PROBE_MMCONF)) return check_segment(seg, dev, "MMCONFIG is disabled,"); - result = pci_mmconfig_insert(dev, seg, start, end, addr); + result = pci_mmconfig_insert(dev, seg, info->start_bus, info->end_bus, + root->mcfg_addr); if (result == 0) { /* enable MMCFG if it hasn't been enabled yet */ if (raw_pci_ext_ops == NULL) @@ -208,134 +211,55 @@ static int setup_mcfg_map(struct pci_root_info *info, u16 seg, u8 start, return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { + struct pci_root_info *info; + + info = container_of(ci, struct pci_root_info, common); if (info->mcfg_added) { - pci_mmconfig_delete(info->segment, info->start_bus, - info->end_bus); + pci_mmconfig_delete(info->sd.domain, + info->start_bus, info->end_bus); info->mcfg_added = false; } } #else -static int setup_mcfg_map(struct pci_root_info *info, - u16 seg, u8 start, u8 end, - phys_addr_t addr) +static int setup_mcfg_map(struct acpi_pci_root_info *ci) { return 0; } -static void teardown_mcfg_map(struct pci_root_info *info) + +static void teardown_mcfg_map(struct acpi_pci_root_info *ci) { } #endif -static void validate_resources(struct device *dev, struct list_head *crs_res, - unsigned long type) +static int pci_acpi_root_get_node(struct acpi_pci_root *root) { - LIST_HEAD(list); - struct resource *res1, *res2, *root = NULL; - struct resource_entry *tmp, *entry, *entry2; - - BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0); - root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource; - - list_splice_init(crs_res, &list); - resource_list_for_each_entry_safe(entry, tmp, &list) { - bool free = false; - resource_size_t end; - - res1 = entry->res; - if (!(res1->flags & type)) - goto next; - - /* Exclude non-addressable range or non-addressable portion */ - end = min(res1->end, root->end); - if (end <= res1->start) { - dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n", - res1); - free = true; - goto next; - } else if (res1->end != end) { - dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n", - res1, (unsigned long long)end + 1, - (unsigned long long)res1->end); - res1->end = end; - } - - resource_list_for_each_entry(entry2, crs_res) { - res2 = entry2->res; - if (!(res2->flags & type)) - continue; - - /* - * I don't like throwing away windows because then - * our resources no longer match the ACPI _CRS, but - * the kernel resource tree doesn't allow overlaps. - */ - if (resource_overlaps(res1, res2)) { - res2->start = min(res1->start, res2->start); - res2->end = max(res1->end, res2->end); - dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n", - res2, res1); - free = true; - goto next; - } - } + int busnum = root->secondary.start; + struct acpi_device *device = root->device; + int node = acpi_get_node(device->handle); -next: - resource_list_del(entry); - if (free) - resource_list_free_entry(entry); - else - resource_list_add_tail(entry, crs_res); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); } + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + + return node; } -static void add_resources(struct pci_root_info *info, - struct list_head *resources, - struct list_head *crs_res) +static int pci_acpi_root_init_info(struct acpi_pci_root_info *ci) { - struct resource_entry *entry, *tmp; - struct resource *res, *conflict, *root = NULL; - - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM); - validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO); - - resource_list_for_each_entry_safe(entry, tmp, crs_res) { - res = entry->res; - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - BUG_ON(res); - - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_info(&info->bridge->dev, - "ignoring host bridge window %pR (conflicts with %s %pR)\n", - res, conflict->name, conflict); - resource_list_destroy_entry(entry); - } - } - - list_splice_tail(crs_res, resources); + return setup_mcfg_map(ci); } -static void release_pci_root_info(struct pci_host_bridge *bridge) +static void pci_acpi_root_release_info(struct acpi_pci_root_info *ci) { - struct resource *res; - struct resource_entry *entry; - struct pci_root_info *info = bridge->release_data; - - resource_list_for_each_entry(entry, &bridge->windows) { - res = entry->res; - if (res->parent && - (res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) - release_resource(res); - } - - teardown_mcfg_map(info); - kfree(info); + teardown_mcfg_map(ci); + kfree(container_of(ci, struct pci_root_info, common)); } /* @@ -358,47 +282,44 @@ static bool resource_is_pcicfg_ioport(struct resource *res) res->start == 0xCF8 && res->end == 0xCFF; } -static void probe_pci_root_info(struct pci_root_info *info, - struct acpi_device *device, - int busnum, int domain, - struct list_head *list) +static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) { - int ret; + struct acpi_device *device = ci->bridge; + int busnum = ci->root->secondary.start; struct resource_entry *entry, *tmp; + int status; - sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); - info->bridge = device; - ret = acpi_dev_get_resources(device, list, - acpi_dev_filter_resource_type_cb, - (void *)(IORESOURCE_IO | IORESOURCE_MEM)); - if (ret < 0) - dev_warn(&device->dev, - "failed to parse _CRS method, error code %d\n", ret); - else if (ret == 0) - dev_dbg(&device->dev, - "no IO and memory resources present in _CRS\n"); - else - resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_DISABLED) || - resource_is_pcicfg_ioport(entry->res)) + status = acpi_pci_probe_root_resources(ci); + if (pci_use_crs) { + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) + if (resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); - else - entry->res->name = info->name; - } + return status; + } + + resource_list_for_each_entry_safe(entry, tmp, &ci->resources) { + dev_printk(KERN_DEBUG, &device->dev, + "host bridge window %pR (ignored)\n", entry->res); + resource_list_destroy_entry(entry); + } + x86_pci_root_bus_resources(busnum, &ci->resources); + + return 0; } +static struct acpi_pci_root_ops acpi_pci_root_ops = { + .pci_ops = &pci_root_ops, + .init_info = pci_acpi_root_init_info, + .release_info = pci_acpi_root_release_info, + .prepare_resources = pci_acpi_root_prepare_resources, +}; + struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { - struct acpi_device *device = root->device; - struct pci_root_info *info; int domain = root->segment; int busnum = root->secondary.start; - struct resource_entry *res_entry; - LIST_HEAD(crs_res); - LIST_HEAD(resources); + int node = pci_acpi_root_get_node(root); struct pci_bus *bus; - struct pci_sysdata *sd; - int node; if (pci_ignore_seg) root->segment = domain = 0; @@ -410,71 +331,33 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - node = acpi_get_node(device->handle); - if (node == NUMA_NO_NODE) { - node = x86_pci_root_bus_node(busnum); - if (node != 0 && node != NUMA_NO_NODE) - dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", - node); - } - - if (node != NUMA_NO_NODE && !node_online(node)) - node = NUMA_NO_NODE; - - info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); - if (!info) { - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (out of memory)\n", domain, busnum); - return NULL; - } - - sd = &info->sd; - sd->domain = domain; - sd->node = node; - sd->companion = device; - bus = pci_find_bus(domain, busnum); if (bus) { /* * If the desired bus has been scanned already, replace * its bus->sysdata. */ - memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(info); - } else { - /* insert busn res at first */ - pci_add_resource(&resources, &root->secondary); + struct pci_sysdata sd = { + .domain = domain, + .node = node, + .companion = root->device + }; - /* - * _CRS with no apertures is normal, so only fall back to - * defaults or native bridge info if we're ignoring _CRS. - */ - probe_pci_root_info(info, device, busnum, domain, &crs_res); - if (pci_use_crs) { - add_resources(info, &resources, &crs_res); - } else { - resource_list_for_each_entry(res_entry, &crs_res) - dev_printk(KERN_DEBUG, &device->dev, - "host bridge window %pR (ignored)\n", - res_entry->res); - resource_list_free(&crs_res); - x86_pci_root_bus_resources(busnum, &resources); - } - - if (!setup_mcfg_map(info, domain, (u8)root->secondary.start, - (u8)root->secondary.end, root->mcfg_addr)) - bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, - sd, &resources); - - if (bus) { - pci_scan_child_bus(bus); - pci_set_host_bridge_release( - to_pci_host_bridge(bus->bridge), - release_pci_root_info, info); - } else { - resource_list_free(&resources); - teardown_mcfg_map(info); - kfree(info); + memcpy(bus->sysdata, &sd, sizeof(sd)); + } else { + struct pci_root_info *info; + + info = kzalloc_node(sizeof(*info), GFP_KERNEL, node); + if (!info) + dev_err(&root->device->dev, + "pci_bus %04x:%02x: ignored (out of memory)\n", + domain, busnum); + else { + info->sd.domain = domain; + info->sd.node = node; + info->sd.companion = root->device; + bus = acpi_pci_root_create(root, &acpi_pci_root_ops, + &info->common, &info->sd); } } @@ -487,9 +370,6 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) pcie_bus_configure_settings(child); } - if (bus && node != NUMA_NO_NODE) - dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); - return bus; } -- cgit v1.2.3