From 51d07566045787b99219d809639c8724506fc78a Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Tue, 25 Jan 2011 11:10:06 -0800 Subject: bq20z75: Add support for charge properties Adding support for charge properties for gas gauge. Also ensuring that battery mode is correct now for energy as well as charge properties by setting it on the fly. I also added 2 functions to power_supply.h to help identify the units for specific properties more easily by power supplies. Signed-off-by: Rhyland Klein Signed-off-by: Anton Vorontsov --- include/linux/power_supply.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 7d7325685c42..e3419fc5541e 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -213,4 +213,48 @@ extern void power_supply_unregister(struct power_supply *psy); /* For APM emulation, think legacy userspace. */ extern struct class *power_supply_class; +static inline bool power_supply_is_amp_property(enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: + case POWER_SUPPLY_PROP_CHARGE_EMPTY_DESIGN: + case POWER_SUPPLY_PROP_CHARGE_FULL: + case POWER_SUPPLY_PROP_CHARGE_EMPTY: + case POWER_SUPPLY_PROP_CHARGE_NOW: + case POWER_SUPPLY_PROP_CHARGE_AVG: + case POWER_SUPPLY_PROP_CHARGE_COUNTER: + case POWER_SUPPLY_PROP_CURRENT_MAX: + case POWER_SUPPLY_PROP_CURRENT_NOW: + case POWER_SUPPLY_PROP_CURRENT_AVG: + return 1; + default: + break; + } + + return 0; +} + +static inline bool power_supply_is_watt_property(enum power_supply_property psp) +{ + switch (psp) { + case POWER_SUPPLY_PROP_ENERGY_FULL_DESIGN: + case POWER_SUPPLY_PROP_ENERGY_EMPTY_DESIGN: + case POWER_SUPPLY_PROP_ENERGY_FULL: + case POWER_SUPPLY_PROP_ENERGY_EMPTY: + case POWER_SUPPLY_PROP_ENERGY_NOW: + case POWER_SUPPLY_PROP_ENERGY_AVG: + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_VOLTAGE_MIN: + case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: + case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + case POWER_SUPPLY_PROP_VOLTAGE_AVG: + return 1; + default: + break; + } + + return 0; +} + #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3 From 0b9536c957095eb1497828aa51b34ac695f67eae Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Fri, 7 Jan 2011 16:28:16 +0000 Subject: leds: Add ability to blink via simple trigger As blink API is now available, it's possible to add ability to blink via simple trigger. Signed-off-by: Vasily Khoruzhick Signed-off-by: Anton Vorontsov --- drivers/leds/led-triggers.c | 20 ++++++++++++++++++++ include/linux/leds.h | 3 +++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c index c41eb6180c9c..4bebae733349 100644 --- a/drivers/leds/led-triggers.c +++ b/drivers/leds/led-triggers.c @@ -231,6 +231,26 @@ void led_trigger_event(struct led_trigger *trigger, } EXPORT_SYMBOL_GPL(led_trigger_event); +void led_trigger_blink(struct led_trigger *trigger, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct list_head *entry; + + if (!trigger) + return; + + read_lock(&trigger->leddev_list_lock); + list_for_each(entry, &trigger->led_cdevs) { + struct led_classdev *led_cdev; + + led_cdev = list_entry(entry, struct led_classdev, trig_list); + led_blink_set(led_cdev, delay_on, delay_off); + } + read_unlock(&trigger->leddev_list_lock); +} +EXPORT_SYMBOL_GPL(led_trigger_blink); + void led_trigger_register_simple(const char *name, struct led_trigger **tp) { struct led_trigger *trigger; diff --git a/include/linux/leds.h b/include/linux/leds.h index 0f19df9e37b0..ffd5c3d91193 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -145,6 +145,9 @@ extern void led_trigger_register_simple(const char *name, extern void led_trigger_unregister_simple(struct led_trigger *trigger); extern void led_trigger_event(struct led_trigger *trigger, enum led_brightness event); +extern void led_trigger_blink(struct led_trigger *trigger, + unsigned long *delay_on, + unsigned long *delay_off); #else -- cgit v1.2.3 From 6501f728c56f831626d52b236023e556bca37f51 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Fri, 7 Jan 2011 18:28:17 +0200 Subject: power_supply: Add new LED trigger charging-blink-solid-full Add new trigger to power_supply LEDs. It will blink when battery is charging, and stay solid when battery is charged. It's usefull to indicate battery state when there's only one LED available. Signed-off-by: Vasily Khoruzhick Signed-off-by: Anton Vorontsov --- drivers/power/power_supply_leds.c | 19 +++++++++++++++++++ include/linux/power_supply.h | 2 ++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/power/power_supply_leds.c b/drivers/power/power_supply_leds.c index 031a554837f7..da25eb94e5c6 100644 --- a/drivers/power/power_supply_leds.c +++ b/drivers/power/power_supply_leds.c @@ -21,6 +21,8 @@ static void power_supply_update_bat_leds(struct power_supply *psy) { union power_supply_propval status; + unsigned long delay_on = 0; + unsigned long delay_off = 0; if (psy->get_property(psy, POWER_SUPPLY_PROP_STATUS, &status)) return; @@ -32,16 +34,22 @@ static void power_supply_update_bat_leds(struct power_supply *psy) led_trigger_event(psy->charging_full_trig, LED_FULL); led_trigger_event(psy->charging_trig, LED_OFF); led_trigger_event(psy->full_trig, LED_FULL); + led_trigger_event(psy->charging_blink_full_solid_trig, + LED_FULL); break; case POWER_SUPPLY_STATUS_CHARGING: led_trigger_event(psy->charging_full_trig, LED_FULL); led_trigger_event(psy->charging_trig, LED_FULL); led_trigger_event(psy->full_trig, LED_OFF); + led_trigger_blink(psy->charging_blink_full_solid_trig, + &delay_on, &delay_off); break; default: led_trigger_event(psy->charging_full_trig, LED_OFF); led_trigger_event(psy->charging_trig, LED_OFF); led_trigger_event(psy->full_trig, LED_OFF); + led_trigger_event(psy->charging_blink_full_solid_trig, + LED_OFF); break; } } @@ -64,15 +72,24 @@ static int power_supply_create_bat_triggers(struct power_supply *psy) if (!psy->full_trig_name) goto full_failed; + psy->charging_blink_full_solid_trig_name = kasprintf(GFP_KERNEL, + "%s-charging-blink-full-solid", psy->name); + if (!psy->charging_blink_full_solid_trig_name) + goto charging_blink_full_solid_failed; + led_trigger_register_simple(psy->charging_full_trig_name, &psy->charging_full_trig); led_trigger_register_simple(psy->charging_trig_name, &psy->charging_trig); led_trigger_register_simple(psy->full_trig_name, &psy->full_trig); + led_trigger_register_simple(psy->charging_blink_full_solid_trig_name, + &psy->charging_blink_full_solid_trig); goto success; +charging_blink_full_solid_failed: + kfree(psy->full_trig_name); full_failed: kfree(psy->charging_trig_name); charging_failed: @@ -88,6 +105,8 @@ static void power_supply_remove_bat_triggers(struct power_supply *psy) led_trigger_unregister_simple(psy->charging_full_trig); led_trigger_unregister_simple(psy->charging_trig); led_trigger_unregister_simple(psy->full_trig); + led_trigger_unregister_simple(psy->charging_blink_full_solid_trig); + kfree(psy->charging_blink_full_solid_trig_name); kfree(psy->full_trig_name); kfree(psy->charging_trig_name); kfree(psy->charging_full_trig_name); diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index e3419fc5541e..20f23fef63cc 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -173,6 +173,8 @@ struct power_supply { char *full_trig_name; struct led_trigger *online_trig; char *online_trig_name; + struct led_trigger *charging_blink_full_solid_trig; + char *charging_blink_full_solid_trig_name; #endif }; -- cgit v1.2.3 From 7fb7ba588c0f276609609565b21fcc853284a9a0 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 24 May 2010 19:55:27 +0200 Subject: bq27x00: Add bq27000 support This patch adds support for the bq27000 battery to the bq27x00 driver. The bq27000 is similar to the bq27200 except that it uses the HDQ bus instead of I2C to communicate with the host system. The driver is implemented as a platform driver. The driver expects to be provided with a read callback function through its platform data. The read function is assumed to do the lowlevel HDQ handling and read out the value of a certain register. Signed-off-by: Lars-Peter Clausen Tested-by: Grazvydas Ignotas --- drivers/power/Kconfig | 14 +++ drivers/power/bq27x00_battery.c | 184 +++++++++++++++++++++++++++++++--- include/linux/power/bq27x00_battery.h | 19 ++++ 3 files changed, 202 insertions(+), 15 deletions(-) create mode 100644 include/linux/power/bq27x00_battery.h (limited to 'include') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 61bf5d724139..52a462fc6b84 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -117,10 +117,24 @@ config BATTERY_BQ20Z75 config BATTERY_BQ27x00 tristate "BQ27x00 battery driver" + help + Say Y here to enable support for batteries with BQ27x00 (I2C/HDQ) chips. + +config BATTERY_BQ27X00_I2C + bool "BQ27200/BQ27500 support" + depends on BATTERY_BQ27x00 depends on I2C + default y help Say Y here to enable support for batteries with BQ27x00 (I2C) chips. +config BATTERY_BQ27X00_PLATFORM + bool "BQ27000 support" + depends on BATTERY_BQ27x00 + default y + help + Say Y here to enable support for batteries with BQ27000 (HDQ) chips. + config BATTERY_DA9030 tristate "DA9030 battery driver" depends on PMIC_DA903X diff --git a/drivers/power/bq27x00_battery.c b/drivers/power/bq27x00_battery.c index def951d8fc2b..44bc76be0780 100644 --- a/drivers/power/bq27x00_battery.c +++ b/drivers/power/bq27x00_battery.c @@ -3,6 +3,7 @@ * * Copyright (C) 2008 Rodolfo Giometti * Copyright (C) 2008 Eurotech S.p.A. + * Copyright (C) 2010-2011 Lars-Peter Clausen * * Based on a previous work by Copyright (C) 2008 Texas Instruments, Inc. * @@ -27,6 +28,8 @@ #include #include +#include + #define DRIVER_VERSION "1.1.0" #define BQ27x00_REG_TEMP 0x06 @@ -46,12 +49,6 @@ #define BQ27000_RS 20 /* Resistor sense */ -/* If the system has several batteries we need a different name for each - * of them... - */ -static DEFINE_IDR(battery_id); -static DEFINE_MUTEX(battery_mutex); - struct bq27x00_device_info; struct bq27x00_access_methods { int (*read)(struct bq27x00_device_info *, u8 reg, int *rt_value, @@ -317,9 +314,15 @@ static int bq27x00_powersupply_init(struct bq27x00_device_info *di) return 0; } -/* - * i2c specific code + +/* i2c specific code */ +#ifdef CONFIG_BATTERY_BQ27X00_I2C + +/* If the system has several batteries we need a different name for each + * of them... */ +static DEFINE_IDR(battery_id); +static DEFINE_MUTEX(battery_mutex); static int bq27x00_read_i2c(struct bq27x00_device_info *di, u8 reg, int *rt_value, bool single) @@ -434,10 +437,6 @@ static int bq27x00_battery_remove(struct i2c_client *client) return 0; } -/* - * Module stuff - */ - static const struct i2c_device_id bq27x00_id[] = { { "bq27200", BQ27000 }, /* bq27200 is same as bq27000, but with i2c */ { "bq27500", BQ27500 }, @@ -453,13 +452,167 @@ static struct i2c_driver bq27x00_battery_driver = { .id_table = bq27x00_id, }; +static inline int bq27x00_battery_i2c_init(void) +{ + int ret = i2c_add_driver(&bq27x00_battery_driver); + if (ret) + printk(KERN_ERR "Unable to register BQ27x00 i2c driver\n"); + + return ret; +} + +static inline void bq27x00_battery_i2c_exit(void) +{ + i2c_del_driver(&bq27x00_battery_driver); +} + +#else + +static inline int bq27x00_battery_i2c_init(void) { return 0; } +static inline void bq27x00_battery_i2c_exit(void) {}; + +#endif + +/* platform specific code */ +#ifdef CONFIG_BATTERY_BQ27X00_PLATFORM + +static int bq27000_read_platform(struct bq27x00_device_info *di, u8 reg, + int *rt_value, bool single) +{ + struct device *dev = di->dev; + struct bq27000_platform_data *pdata = dev->platform_data; + unsigned int timeout = 3; + int upper, lower; + int temp; + + if (!single) { + /* Make sure the value has not changed in between reading the + * lower and the upper part */ + upper = pdata->read(dev, reg + 1); + do { + temp = upper; + if (upper < 0) + return upper; + + lower = pdata->read(dev, reg); + if (lower < 0) + return lower; + + upper = pdata->read(dev, reg + 1); + } while (temp != upper && --timeout); + + if (timeout == 0) + return -EIO; + + *rt_value = (upper << 8) | lower; + } else { + lower = pdata->read(dev, reg); + if (lower < 0) + return lower; + *rt_value = lower; + } + return 0; +} + +static int __devinit bq27000_battery_probe(struct platform_device *pdev) +{ + struct bq27x00_device_info *di; + struct bq27000_platform_data *pdata = pdev->dev.platform_data; + int ret; + + if (!pdata) { + dev_err(&pdev->dev, "no platform_data supplied\n"); + return -EINVAL; + } + + if (!pdata->read) { + dev_err(&pdev->dev, "no hdq read callback supplied\n"); + return -EINVAL; + } + + di = kzalloc(sizeof(*di), GFP_KERNEL); + if (!di) { + dev_err(&pdev->dev, "failed to allocate device info data\n"); + return -ENOMEM; + } + + platform_set_drvdata(pdev, di); + + di->dev = &pdev->dev; + di->chip = BQ27000; + + di->bat.name = pdata->name ?: dev_name(&pdev->dev); + di->bus.read = &bq27000_read_platform; + + ret = bq27x00_powersupply_init(di); + if (ret) + goto err_free; + + return 0; + +err_free: + platform_set_drvdata(pdev, NULL); + kfree(di); + + return ret; +} + +static int __devexit bq27000_battery_remove(struct platform_device *pdev) +{ + struct bq27x00_device_info *di = platform_get_drvdata(pdev); + + power_supply_unregister(&di->bat); + platform_set_drvdata(pdev, NULL); + kfree(di); + + return 0; +} + +static struct platform_driver bq27000_battery_driver = { + .probe = bq27000_battery_probe, + .remove = __devexit_p(bq27000_battery_remove), + .driver = { + .name = "bq27000-battery", + .owner = THIS_MODULE, + }, +}; + +static inline int bq27x00_battery_platform_init(void) +{ + int ret = platform_driver_register(&bq27000_battery_driver); + if (ret) + printk(KERN_ERR "Unable to register BQ27000 platform driver\n"); + + return ret; +} + +static inline void bq27x00_battery_platform_exit(void) +{ + platform_driver_unregister(&bq27000_battery_driver); +} + +#else + +static inline int bq27x00_battery_platform_init(void) { return 0; } +static inline void bq27x00_battery_platform_exit(void) {}; + +#endif + +/* + * Module stuff + */ + static int __init bq27x00_battery_init(void) { int ret; - ret = i2c_add_driver(&bq27x00_battery_driver); + ret = bq27x00_battery_i2c_init(); + if (ret) + return ret; + + ret = bq27x00_battery_platform_init(); if (ret) - printk(KERN_ERR "Unable to register BQ27x00 driver\n"); + bq27x00_battery_i2c_exit(); return ret; } @@ -467,7 +620,8 @@ module_init(bq27x00_battery_init); static void __exit bq27x00_battery_exit(void) { - i2c_del_driver(&bq27x00_battery_driver); + bq27x00_battery_platform_exit(); + bq27x00_battery_i2c_exit(); } module_exit(bq27x00_battery_exit); diff --git a/include/linux/power/bq27x00_battery.h b/include/linux/power/bq27x00_battery.h new file mode 100644 index 000000000000..a857f719bf40 --- /dev/null +++ b/include/linux/power/bq27x00_battery.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_BQ27X00_BATTERY_H__ +#define __LINUX_BQ27X00_BATTERY_H__ + +/** + * struct bq27000_plaform_data - Platform data for bq27000 devices + * @name: Name of the battery. If NULL the driver will fallback to "bq27000". + * @read: HDQ read callback. + * This function should provide access to the HDQ bus the battery is + * connected to. + * The first parameter is a pointer to the battery device, the second the + * register to be read. The return value should either be the content of + * the passed register or an error value. + */ +struct bq27000_platform_data { + const char *name; + int (*read)(struct device *dev, unsigned int); +}; + +#endif -- cgit v1.2.3 From bb879101606dd7235d8f4ecd0f707b63281d0838 Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Mon, 28 Feb 2011 16:55:28 -0800 Subject: bq20z75: Add optional battery detect gpio Adding support for an optional gpio for battery detection. This is passed in through the i2c platform data. It also accepts another field, battery_detect_present to signify the gpio state which means the battery is present, either 0 (low) or 1 (high). Signed-off-by: Rhyland Klein Signed-off-by: Anton Vorontsov --- drivers/power/bq20z75.c | 160 ++++++++++++++++++++++++++++++++++-------- include/linux/power/bq20z75.h | 37 ++++++++++ 2 files changed, 166 insertions(+), 31 deletions(-) create mode 100644 include/linux/power/bq20z75.h (limited to 'include') diff --git a/drivers/power/bq20z75.c b/drivers/power/bq20z75.c index 4141775e5ff6..a51e98d74d34 100644 --- a/drivers/power/bq20z75.c +++ b/drivers/power/bq20z75.c @@ -25,6 +25,10 @@ #include #include #include +#include +#include + +#include enum { REG_MANUFACTURER_DATA, @@ -141,8 +145,13 @@ static enum power_supply_property bq20z75_properties[] = { }; struct bq20z75_info { - struct i2c_client *client; - struct power_supply power_supply; + struct i2c_client *client; + struct power_supply power_supply; + struct bq20z75_platform_data *pdata; + bool is_present; + bool gpio_detect; + bool enable_detection; + int irq; }; static int bq20z75_read_word_data(struct i2c_client *client, u8 address) @@ -179,6 +188,18 @@ static int bq20z75_get_battery_presence_and_health( union power_supply_propval *val) { s32 ret; + struct bq20z75_info *bq20z75_device = i2c_get_clientdata(client); + + if (psp == POWER_SUPPLY_PROP_PRESENT && + bq20z75_device->gpio_detect) { + ret = gpio_get_value( + bq20z75_device->pdata->battery_detect); + if (ret == bq20z75_device->pdata->battery_detect_present) + val->intval = 1; + else + val->intval = 0; + return ret; + } /* Write to ManufacturerAccess with * ManufacturerAccess command and then @@ -192,8 +213,11 @@ static int bq20z75_get_battery_presence_and_health( ret = bq20z75_read_word_data(client, bq20z75_data[REG_MANUFACTURER_DATA].addr); - if (ret < 0) + if (ret < 0) { + if (psp == POWER_SUPPLY_PROP_PRESENT) + val->intval = 0; /* battery removed */ return ret; + } if (ret < bq20z75_data[REG_MANUFACTURER_DATA].min_value || ret > bq20z75_data[REG_MANUFACTURER_DATA].max_value) { @@ -397,8 +421,7 @@ static int bq20z75_get_property(struct power_supply *psy, enum power_supply_property psp, union power_supply_propval *val) { - int ps_index; - int ret; + int ret = 0; struct bq20z75_info *bq20z75_device = container_of(psy, struct bq20z75_info, power_supply); struct i2c_client *client = bq20z75_device->client; @@ -407,8 +430,6 @@ static int bq20z75_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_PRESENT: case POWER_SUPPLY_PROP_HEALTH: ret = bq20z75_get_battery_presence_and_health(client, psp, val); - if (ret) - return ret; break; case POWER_SUPPLY_PROP_TECHNOLOGY: @@ -422,20 +443,15 @@ static int bq20z75_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_CHARGE_FULL: case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN: case POWER_SUPPLY_PROP_CAPACITY: - ps_index = bq20z75_get_property_index(client, psp); - if (ps_index < 0) - return ps_index; - - ret = bq20z75_get_battery_capacity(client, ps_index, psp, val); - if (ret) - return ret; + ret = bq20z75_get_property_index(client, psp); + if (ret < 0) + break; + ret = bq20z75_get_battery_capacity(client, ret, psp, val); break; case POWER_SUPPLY_PROP_SERIAL_NUMBER: ret = bq20z75_get_battery_serial_number(client, val); - if (ret) - return ret; break; case POWER_SUPPLY_PROP_STATUS: @@ -446,14 +462,11 @@ static int bq20z75_get_property(struct power_supply *psy, case POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG: case POWER_SUPPLY_PROP_TIME_TO_FULL_AVG: case POWER_SUPPLY_PROP_VOLTAGE_MAX_DESIGN: - ps_index = bq20z75_get_property_index(client, psp); - if (ps_index < 0) - return ps_index; - - ret = bq20z75_get_battery_property(client, ps_index, psp, val); - if (ret) - return ret; + ret = bq20z75_get_property_index(client, psp); + if (ret < 0) + break; + ret = bq20z75_get_battery_property(client, ret, psp, val); break; default: @@ -462,26 +475,51 @@ static int bq20z75_get_property(struct power_supply *psy, return -EINVAL; } - /* Convert units to match requirements for power supply class */ - bq20z75_unit_adjustment(client, psp, val); + if (!bq20z75_device->enable_detection) + goto done; + + if (!bq20z75_device->gpio_detect && + bq20z75_device->is_present != (ret >= 0)) { + bq20z75_device->is_present = (ret >= 0); + power_supply_changed(&bq20z75_device->power_supply); + } + +done: + if (!ret) { + /* Convert units to match requirements for power supply class */ + bq20z75_unit_adjustment(client, psp, val); + } dev_dbg(&client->dev, "%s: property = %d, value = %d\n", __func__, psp, val->intval); - return 0; + return ret; } -static int bq20z75_probe(struct i2c_client *client, +static irqreturn_t bq20z75_irq(int irq, void *devid) +{ + struct power_supply *battery = devid; + + power_supply_changed(battery); + + return IRQ_HANDLED; +} + +static int __devinit bq20z75_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct bq20z75_info *bq20z75_device; + struct bq20z75_platform_data *pdata = client->dev.platform_data; int rc; + int irq; bq20z75_device = kzalloc(sizeof(struct bq20z75_info), GFP_KERNEL); if (!bq20z75_device) return -ENOMEM; bq20z75_device->client = client; + bq20z75_device->enable_detection = false; + bq20z75_device->gpio_detect = false; bq20z75_device->power_supply.name = "battery"; bq20z75_device->power_supply.type = POWER_SUPPLY_TYPE_BATTERY; bq20z75_device->power_supply.properties = bq20z75_properties; @@ -489,26 +527,86 @@ static int bq20z75_probe(struct i2c_client *client, ARRAY_SIZE(bq20z75_properties); bq20z75_device->power_supply.get_property = bq20z75_get_property; + if (pdata) { + bq20z75_device->gpio_detect = + gpio_is_valid(pdata->battery_detect); + bq20z75_device->pdata = pdata; + } + i2c_set_clientdata(client, bq20z75_device); + if (!bq20z75_device->gpio_detect) + goto skip_gpio; + + rc = gpio_request(pdata->battery_detect, dev_name(&client->dev)); + if (rc) { + dev_warn(&client->dev, "Failed to request gpio: %d\n", rc); + bq20z75_device->gpio_detect = false; + goto skip_gpio; + } + + rc = gpio_direction_input(pdata->battery_detect); + if (rc) { + dev_warn(&client->dev, "Failed to get gpio as input: %d\n", rc); + gpio_free(pdata->battery_detect); + bq20z75_device->gpio_detect = false; + goto skip_gpio; + } + + irq = gpio_to_irq(pdata->battery_detect); + if (irq <= 0) { + dev_warn(&client->dev, "Failed to get gpio as irq: %d\n", irq); + gpio_free(pdata->battery_detect); + bq20z75_device->gpio_detect = false; + goto skip_gpio; + } + + rc = request_irq(irq, bq20z75_irq, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + dev_name(&client->dev), &bq20z75_device->power_supply); + if (rc) { + dev_warn(&client->dev, "Failed to request irq: %d\n", rc); + gpio_free(pdata->battery_detect); + bq20z75_device->gpio_detect = false; + goto skip_gpio; + } + + bq20z75_device->irq = irq; + +skip_gpio: + rc = power_supply_register(&client->dev, &bq20z75_device->power_supply); if (rc) { dev_err(&client->dev, "%s: Failed to register power supply\n", __func__); - kfree(bq20z75_device); - return rc; + goto exit_psupply; } dev_info(&client->dev, "%s: battery gas gauge device registered\n", client->name); return 0; + +exit_psupply: + if (bq20z75_device->irq) + free_irq(bq20z75_device->irq, &bq20z75_device->power_supply); + if (bq20z75_device->gpio_detect) + gpio_free(pdata->battery_detect); + + kfree(bq20z75_device); + + return rc; } -static int bq20z75_remove(struct i2c_client *client) +static int __devexit bq20z75_remove(struct i2c_client *client) { struct bq20z75_info *bq20z75_device = i2c_get_clientdata(client); + if (bq20z75_device->irq) + free_irq(bq20z75_device->irq, &bq20z75_device->power_supply); + if (bq20z75_device->gpio_detect) + gpio_free(bq20z75_device->pdata->battery_detect); + power_supply_unregister(&bq20z75_device->power_supply); kfree(bq20z75_device); bq20z75_device = NULL; @@ -544,7 +642,7 @@ static const struct i2c_device_id bq20z75_id[] = { static struct i2c_driver bq20z75_battery_driver = { .probe = bq20z75_probe, - .remove = bq20z75_remove, + .remove = __devexit_p(bq20z75_remove), .suspend = bq20z75_suspend, .resume = bq20z75_resume, .id_table = bq20z75_id, diff --git a/include/linux/power/bq20z75.h b/include/linux/power/bq20z75.h new file mode 100644 index 000000000000..0e1b8a26a9b1 --- /dev/null +++ b/include/linux/power/bq20z75.h @@ -0,0 +1,37 @@ +/* + * Gas Gauge driver for TI's BQ20Z75 + * + * Copyright (c) 2010, NVIDIA Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __LINUX_POWER_BQ20Z75_H_ +#define __LINUX_POWER_BQ20Z75_H_ + +#include +#include + +/** + * struct bq20z75_platform_data - platform data for bq20z75 devices + * @battery_detect: GPIO which is used to detect battery presence + * @battery_detect_present: gpio state when battery is present (0 / 1) + */ +struct bq20z75_platform_data { + int battery_detect; + int battery_detect_present; +}; + +#endif -- cgit v1.2.3 From ff28fcef1bedcfbdf49500fee1573dc2f3eedb22 Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Mon, 28 Feb 2011 16:55:29 -0800 Subject: bq20z75: Add i2c retry mechanism With the support of platform data, now adding support for option i2c retries on read/write failures. Ths is specified through the optional platform data. Signed-off-by: Rhyland Klein Signed-off-by: Anton Vorontsov --- drivers/power/bq20z75.c | 37 +++++++++++++++++++++++++++++++------ include/linux/power/bq20z75.h | 2 ++ 2 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/power/bq20z75.c b/drivers/power/bq20z75.c index a51e98d74d34..e82d10e25e8c 100644 --- a/drivers/power/bq20z75.c +++ b/drivers/power/bq20z75.c @@ -156,30 +156,55 @@ struct bq20z75_info { static int bq20z75_read_word_data(struct i2c_client *client, u8 address) { - s32 ret; + struct bq20z75_info *bq20z75_device = i2c_get_clientdata(client); + s32 ret = 0; + int retries = 1; + + if (bq20z75_device->pdata) + retries = max(bq20z75_device->pdata->i2c_retry_count + 1, 1); + + while (retries > 0) { + ret = i2c_smbus_read_word_data(client, address); + if (ret >= 0) + break; + retries--; + } - ret = i2c_smbus_read_word_data(client, address); if (ret < 0) { - dev_err(&client->dev, + dev_warn(&client->dev, "%s: i2c read at address 0x%x failed\n", __func__, address); return ret; } + return le16_to_cpu(ret); } static int bq20z75_write_word_data(struct i2c_client *client, u8 address, u16 value) { - s32 ret; + struct bq20z75_info *bq20z75_device = i2c_get_clientdata(client); + s32 ret = 0; + int retries = 1; + + if (bq20z75_device->pdata) + retries = max(bq20z75_device->pdata->i2c_retry_count + 1, 1); + + while (retries > 0) { + ret = i2c_smbus_write_word_data(client, address, + le16_to_cpu(value)); + if (ret >= 0) + break; + retries--; + } - ret = i2c_smbus_write_word_data(client, address, le16_to_cpu(value)); if (ret < 0) { - dev_err(&client->dev, + dev_warn(&client->dev, "%s: i2c write to address 0x%x failed\n", __func__, address); return ret; } + return 0; } diff --git a/include/linux/power/bq20z75.h b/include/linux/power/bq20z75.h index 0e1b8a26a9b1..b0843b68af92 100644 --- a/include/linux/power/bq20z75.h +++ b/include/linux/power/bq20z75.h @@ -28,10 +28,12 @@ * struct bq20z75_platform_data - platform data for bq20z75 devices * @battery_detect: GPIO which is used to detect battery presence * @battery_detect_present: gpio state when battery is present (0 / 1) + * @i2c_retry_count: # of times to retry on i2c IO failure */ struct bq20z75_platform_data { int battery_detect; int battery_detect_present; + int i2c_retry_count; }; #endif -- cgit v1.2.3 From 35c9d267665230cf44445be616d491d3763a5cd3 Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Mon, 28 Feb 2011 16:55:31 -0800 Subject: power_supply: Update power_supply_is_watt_property Update the power_supply_is_watt_property function to include POWER_NOW. Signed-off-by: Rhyland Klein Signed-off-by: Anton Vorontsov --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 20f23fef63cc..204c18dfdc9e 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -251,6 +251,7 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp) case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN: case POWER_SUPPLY_PROP_VOLTAGE_NOW: case POWER_SUPPLY_PROP_VOLTAGE_AVG: + case POWER_SUPPLY_PROP_POWER_NOW: return 1; default: break; -- cgit v1.2.3 From 422028b1ca4c07995af82a18abced022ff4c296c Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Oct 2010 11:12:07 +0200 Subject: drbd: New configuration parameters for dealing with network congestion net { on_congestion {block|pull-ahead|disconnect}; congestion-fill {sectors}; congestion-extents {al-extents}; } Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 7 +++++++ include/linux/drbd.h | 7 +++++++ include/linux/drbd_limits.h | 9 +++++++++ include/linux/drbd_nl.h | 3 +++ 4 files changed, 26 insertions(+) (limited to 'include') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 9e27d82a9a19..f969d8717e23 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1323,6 +1323,8 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, new_conf->wire_protocol = DRBD_PROT_C; new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; + new_conf->on_congestion = DRBD_ON_CONGESTION_DEF; + new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF; if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { retcode = ERR_MANDATORY_TAG; @@ -1344,6 +1346,11 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } } + if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) { + retcode = ERR_CONG_NOT_PROTO_A; + goto fail; + } + if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { retcode = ERR_DISCARD; goto fail; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index ef44c7a0638c..03a08baabf11 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -96,6 +96,12 @@ enum drbd_on_no_data { OND_SUSPEND_IO }; +enum drbd_on_congestion { + OC_BLOCK, + OC_PULL_AHEAD, + OC_DISCONNECT, +}; + /* KEEP the order, do not delete or insert. Only append. */ enum drbd_ret_codes { ERR_CODE_BASE = 100, @@ -146,6 +152,7 @@ enum drbd_ret_codes { ERR_PERM = 152, ERR_NEED_APV_93 = 153, ERR_STONITH_AND_PROT_A = 154, + ERR_CONG_NOT_PROTO_A = 155, /* insert new ones above this line */ AFTER_LAST_ERR_CODE diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 4ac33f34b77e..abf418724e52 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -129,6 +129,7 @@ #define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR +#define DRBD_ON_CONGESTION_DEF OC_BLOCK #define DRBD_MAX_BIO_BVECS_MIN 0 #define DRBD_MAX_BIO_BVECS_MAX 128 @@ -154,5 +155,13 @@ #define DRBD_C_MIN_RATE_MAX (4 << 20) #define DRBD_C_MIN_RATE_DEF 4096 +#define DRBD_CONG_FILL_MIN 0 +#define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */ +#define DRBD_CONG_FILL_DEF 0 + +#define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN +#define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX +#define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF + #undef RANGE #endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index ade91107c9a5..8cde3945d1f7 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -56,6 +56,9 @@ NL_PACKET(net_conf, 5, NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) + NL_INTEGER( 81, T_MAY_IGNORE, on_congestion) + NL_INTEGER( 82, T_MAY_IGNORE, cong_fill) + NL_INTEGER( 83, T_MAY_IGNORE, cong_extents) /* 59 addr_family was available in GIT, never released */ NL_BIT( 60, T_MANDATORY, mind_af) NL_BIT( 27, T_MAY_IGNORE, want_lose) -- cgit v1.2.3 From 67531718d8f1259f01ab84c2aa25f7b03c7afd46 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Oct 2010 12:21:30 +0200 Subject: drbd: Implemented two new connection states Ahead/Behind In this connection mode, the ahead node no longer replicates application IO. The behind's disk becomes out dated. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_int.h | 2 ++ drivers/block/drbd/drbd_main.c | 12 ++++++++++-- drivers/block/drbd/drbd_receiver.c | 3 +++ drivers/block/drbd/drbd_req.c | 23 +++++++++++++++++++++++ drivers/block/drbd/drbd_strings.c | 4 +++- include/linux/drbd.h | 4 ++++ 6 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c804e44b9455..21b7439438cd 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -2217,6 +2217,8 @@ static inline int drbd_state_is_stable(union drbd_state s) case C_VERIFY_T: case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: + case C_AHEAD: + case C_BEHIND: /* maybe stable, look at the disk state */ break; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e81d009dd061..46f27d6c0b21 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -871,16 +871,19 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state if (ns.conn >= C_CONNECTED && ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || - (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { + (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T) || + ns.conn >= C_AHEAD)) { switch (ns.conn) { case C_WF_BITMAP_T: case C_PAUSED_SYNC_T: + case C_BEHIND: ns.disk = D_OUTDATED; break; case C_CONNECTED: case C_WF_BITMAP_S: case C_SYNC_SOURCE: case C_PAUSED_SYNC_S: + case C_AHEAD: ns.disk = D_UP_TO_DATE; break; case C_SYNC_TARGET: @@ -893,16 +896,18 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state } if (ns.conn >= C_CONNECTED && - (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { + (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED || ns.conn >= C_AHEAD)) { switch (ns.conn) { case C_CONNECTED: case C_WF_BITMAP_T: case C_PAUSED_SYNC_T: case C_SYNC_TARGET: + case C_BEHIND: ns.pdsk = D_UP_TO_DATE; break; case C_WF_BITMAP_S: case C_PAUSED_SYNC_S: + case C_AHEAD: /* remap any consistent state to D_OUTDATED, * but disallow "upgrade" of not even consistent states. */ @@ -1374,6 +1379,9 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) drbd_send_state(mdev); + if (os.conn != C_AHEAD && ns.conn == C_AHEAD) + drbd_send_state(mdev); + /* We are in the progress to start a full sync... */ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index f3052d871d31..b19e8b2c4ce5 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3179,6 +3179,9 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned if (ns.conn == C_WF_REPORT_PARAMS) ns.conn = C_CONNECTED; + if (peer_state.conn == C_AHEAD) + ns.conn = C_BEHIND; + if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && get_ldev_if_state(mdev, D_NEGOTIATING)) { int cr; /* consider resync */ diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5c60d77d447c..60288fb3c4d7 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -948,6 +948,29 @@ allocate_barrier: ? queue_for_net_write : queue_for_net_read); } + + if (remote && mdev->net_conf->on_congestion != OC_BLOCK) { + int congested = 0; + + if (mdev->net_conf->cong_fill && + atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { + dev_info(DEV, "Congestion-fill threshold reached\n"); + congested = 1; + } + + if (mdev->act_log->used >= mdev->net_conf->cong_extents) { + dev_info(DEV, "Congestion-extents threshold reached\n"); + congested = 1; + } + + if (congested) { + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); + else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); + } + } + spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... */ diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 85179e1fb50a..5b970adc3b6f 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -48,6 +48,8 @@ static const char *drbd_conn_s_names[] = { [C_PAUSED_SYNC_T] = "PausedSyncT", [C_VERIFY_S] = "VerifyS", [C_VERIFY_T] = "VerifyT", + [C_AHEAD] = "Ahead", + [C_BEHIND] = "Behind", }; static const char *drbd_role_s_names[] = { @@ -92,7 +94,7 @@ static const char *drbd_state_sw_errors[] = { const char *drbd_conn_str(enum drbd_conns s) { /* enums are unsigned... */ - return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; + return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s]; } const char *drbd_role_str(enum drbd_role s) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 03a08baabf11..23f31be6f00d 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -206,6 +206,10 @@ enum drbd_conns { C_VERIFY_T, C_PAUSED_SYNC_S, C_PAUSED_SYNC_T, + + C_AHEAD, + C_BEHIND, + C_MASK = 31 }; -- cgit v1.2.3 From 73a01a18b9c28a0fab1131ece5b0a9bc00a879b8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Oct 2010 14:33:00 +0200 Subject: drbd: New packet for Ahead/Behind mode: P_OUT_OF_SYNC Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 10 +++++---- drivers/block/drbd/drbd_int.h | 14 +++++++++++- drivers/block/drbd/drbd_main.c | 10 +++++++++ drivers/block/drbd/drbd_receiver.c | 10 +++++++++ drivers/block/drbd/drbd_req.c | 44 +++++++++++++++++++++++++++----------- drivers/block/drbd/drbd_req.h | 4 +++- drivers/block/drbd/drbd_worker.c | 16 ++++++++++++++ include/linux/drbd.h | 2 +- 8 files changed, 91 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b4adb58c7472..33f6cc537d08 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -1007,22 +1007,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, * called by tl_clear and drbd_send_dblock (==drbd_make_request). * so this can be _any_ process. */ -void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, +int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line) { unsigned long sbnr, ebnr, lbnr, flags; sector_t esector, nr_sectors; - unsigned int enr, count; + unsigned int enr, count = 0; struct lc_element *e; if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); - return; + return 0; } if (!get_ldev(mdev)) - return; /* no disk, no metadata, no bitmap to set bits in */ + return 0; /* no disk, no metadata, no bitmap to set bits in */ nr_sectors = drbd_get_capacity(mdev->this_bdev); esector = sector + (size >> 9) - 1; @@ -1052,6 +1052,8 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, out: put_ldev(mdev); + + return count; } static diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 21b7439438cd..471331236826 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -212,6 +212,7 @@ enum drbd_packets { /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ + P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ P_MAX_CMD = 0x28, P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ @@ -269,6 +270,7 @@ static inline const char *cmdname(enum drbd_packets cmd) [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", [P_COMPRESSED_BITMAP] = "CBitmap", [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", [P_MAX_CMD] = NULL, }; @@ -550,6 +552,13 @@ struct p_discard { u32 pad; } __packed; +struct p_block_desc { + struct p_header80 head; + u64 sector; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + /* Valid values for the encoding field. * Bump proto version when changing this. */ enum drbd_bitmap_code { @@ -647,6 +656,7 @@ union p_polymorph { struct p_block_req block_req; struct p_delay_probe93 delay_probe93; struct p_rs_uuid rs_uuid; + struct p_block_desc block_desc; } __packed; /**********************************************************************/ @@ -1221,6 +1231,7 @@ extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, struct p_data *dp, int data_size); extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, sector_t sector, int blksize, u64 block_id); +extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req); extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, struct drbd_epoch_entry *e); extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); @@ -1534,6 +1545,7 @@ extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int); extern void resync_timer_fn(unsigned long data); @@ -1626,7 +1638,7 @@ extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); #define drbd_set_in_sync(mdev, sector, size) \ __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) -extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, +extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, const char *file, const unsigned int line); #define drbd_set_out_of_sync(mdev, sector, size) \ __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 46f27d6c0b21..0dc93f43a476 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2634,6 +2634,16 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, return ok; } +int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req) +{ + struct p_block_desc p; + + p.sector = cpu_to_be64(req->sector); + p.blksize = cpu_to_be32(req->size); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p)); +} + /* drbd_send distinguishes two cases: diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b19e8b2c4ce5..04a08e7541cc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3562,6 +3562,15 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, u return TRUE; } +static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size) +{ + struct p_block_desc *p = &mdev->data.rbuf.block_desc; + + drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + + return TRUE; +} + typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive); struct data_cmd { @@ -3592,6 +3601,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, + [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, /* anything missing from this table is in * the asender_tbl, see get_asender_cmd */ [P_MAX_CMD] = { 0, 0, NULL }, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 60288fb3c4d7..a8d1ff2bda27 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -142,7 +142,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, /* before we can signal completion to the upper layers, * we may need to close the current epoch */ - if (mdev->state.conn >= C_CONNECTED && + if (mdev->state.conn >= C_CONNECTED && mdev->state.conn < C_AHEAD && req->epoch == mdev->newest_tle->br_number) queue_barrier(mdev); @@ -545,6 +545,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; + case queue_for_send_oos: + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_oos; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + case oos_handed_to_network: + /* actually the same */ case send_canceled: /* treat it the same */ case send_failed: @@ -756,7 +764,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) const sector_t sector = bio->bi_sector; struct drbd_tl_epoch *b = NULL; struct drbd_request *req; - int local, remote; + int local, remote, send_oos = 0; int err = -EIO; int ret = 0; @@ -820,8 +828,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) } remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || - (mdev->state.pdsk == D_INCONSISTENT && - mdev->state.conn >= C_CONNECTED)); + (mdev->state.pdsk >= D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED && + mdev->state.conn < C_AHEAD)); + send_oos = (rw == WRITE && mdev->state.conn == C_AHEAD && + mdev->state.pdsk >= D_INCONSISTENT); if (!(local || remote) && !is_susp(mdev->state)) { if (__ratelimit(&drbd_ratelimit_state)) @@ -835,7 +846,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) * but there is a race between testing the bit and pointer outside the * spinlock, and grabbing the spinlock. * if we lost that race, we retry. */ - if (rw == WRITE && remote && + if (rw == WRITE && (remote || send_oos) && mdev->unused_spare_tle == NULL && test_bit(CREATE_BARRIER, &mdev->flags)) { allocate_barrier: @@ -860,11 +871,15 @@ allocate_barrier: goto fail_free_complete; } - if (remote) { + if (remote || send_oos) { remote = (mdev->state.pdsk == D_UP_TO_DATE || - (mdev->state.pdsk == D_INCONSISTENT && - mdev->state.conn >= C_CONNECTED)); - if (!remote) + (mdev->state.pdsk >= D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED && + mdev->state.conn < C_AHEAD)); + send_oos = (rw == WRITE && mdev->state.conn == C_AHEAD && + mdev->state.pdsk >= D_INCONSISTENT); + + if (!(remote || send_oos)) dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); if (!(local || remote)) { dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); @@ -877,7 +892,7 @@ allocate_barrier: mdev->unused_spare_tle = b; b = NULL; } - if (rw == WRITE && remote && + if (rw == WRITE && (remote || send_oos) && mdev->unused_spare_tle == NULL && test_bit(CREATE_BARRIER, &mdev->flags)) { /* someone closed the current epoch @@ -900,7 +915,7 @@ allocate_barrier: * barrier packet. To get the write ordering right, we only have to * make sure that, if this is a write request and it triggered a * barrier packet, this request is queued within the same spinlock. */ - if (remote && mdev->unused_spare_tle && + if ((remote || send_oos) && mdev->unused_spare_tle && test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { _tl_add_barrier(mdev, mdev->unused_spare_tle); mdev->unused_spare_tle = NULL; @@ -948,8 +963,11 @@ allocate_barrier: ? queue_for_net_write : queue_for_net_read); } + if (send_oos && drbd_set_out_of_sync(mdev, sector, size)) + _req_mod(req, queue_for_send_oos); - if (remote && mdev->net_conf->on_congestion != OC_BLOCK) { + if (remote && + mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) { int congested = 0; if (mdev->net_conf->cong_fill && @@ -964,6 +982,8 @@ allocate_barrier: } if (congested) { + queue_barrier(mdev); + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 69d350fe7c1e..40d3dcd8fc81 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -82,14 +82,16 @@ enum drbd_req_event { to_be_submitted, /* XXX yes, now I am inconsistent... - * these two are not "events" but "actions" + * these are not "events" but "actions" * oh, well... */ queue_for_net_write, queue_for_net_read, + queue_for_send_oos, send_canceled, send_failed, handed_over_to_network, + oos_handed_to_network, connection_lost_while_pending, read_retry_remote_canceled, recv_acked_by_peer, diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 782d87237cb4..67499077c482 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1237,6 +1237,22 @@ int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); } +int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled); + return 1; + } + + ok = drbd_send_oos(mdev, req); + req_mod(req, oos_handed_to_network); + + return ok; +} + /** * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request * @mdev: DRBD device. diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 23f31be6f00d..41da654cc0b1 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -56,7 +56,7 @@ extern const char *drbd_buildtag(void); #define REL_VERSION "8.3.9" #define API_VERSION 88 #define PRO_VERSION_MIN 86 -#define PRO_VERSION_MAX 95 +#define PRO_VERSION_MAX 96 enum drbd_io_error_p { -- cgit v1.2.3 From 42ff269d1022a86be4f526cf674998c47b7ab856 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 24 Nov 2010 10:11:14 +0100 Subject: drbd: add packet_type 27 (return_code_only) to netlink api In case we ever should add an other packet type, we must not reuse 27, as that currently used for "empty" return code only replies. Document it as such. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 6 ++++-- include/linux/drbd_nl.h | 6 +++++- include/linux/drbd_tag_magic.h | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 80a389d24cdd..6a6dde6c51c6 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2195,7 +2195,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms goto fail; } - if (nlp->packet_type >= P_nl_after_last_packet) { + if (nlp->packet_type >= P_nl_after_last_packet || + nlp->packet_type == P_return_code_only) { retcode = ERR_PACKET_NR; goto fail; } @@ -2219,7 +2220,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms reply = (struct drbd_nl_cfg_reply *) cn_reply->data; reply->packet_type = - cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; + cm->reply_body_size ? nlp->packet_type : P_return_code_only; reply->minor = nlp->drbd_minor; reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ /* reply->tag_list; might be modified by cm->function. */ @@ -2525,6 +2526,7 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code) cn_reply->len = sizeof(struct drbd_nl_cfg_reply); cn_reply->flags = 0; + reply->packet_type = P_return_code_only; reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; reply->ret_code = ret_code; diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 8cde3945d1f7..6fc614b06c4d 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -146,9 +146,13 @@ NL_PACKET(new_c_uuid, 26, NL_BIT( 63, T_MANDATORY, clear_bm) ) +#ifdef NL_RESPONSE +NL_RESPONSE(return_code_only, 27) +#endif + #undef NL_PACKET #undef NL_INTEGER #undef NL_INT64 #undef NL_BIT #undef NL_STRING - +#undef NL_RESPONSE diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h index fcdff8410e99..f14a165e82dc 100644 --- a/include/linux/drbd_tag_magic.h +++ b/include/linux/drbd_tag_magic.h @@ -7,6 +7,7 @@ /* declare packet_type enums */ enum packet_types { #define NL_PACKET(name, number, fields) P_ ## name = number, +#define NL_RESPONSE(name, number) P_ ## name = number, #define NL_INTEGER(pn, pr, member) #define NL_INT64(pn, pr, member) #define NL_BIT(pn, pr, member) -- cgit v1.2.3 From 2561b9c1f1d63077c41903fc6ad58dc9ec47248b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 3 Dec 2010 15:22:48 +0100 Subject: drbd: --force option for disconnect As the network connection can be lost at any time, a --force option for disconnect is just a matter of completeness. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 15 +++++++++++++++ include/linux/drbd_nl.h | 4 +++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6a6dde6c51c6..cd0459f0403f 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1531,6 +1531,21 @@ static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl struct drbd_nl_cfg_reply *reply) { int retcode; + struct disconnect dc; + + memset(&dc, 0, sizeof(struct disconnect)); + if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (dc.force) { + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn >= C_WF_CONNECTION) + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL); + spin_unlock_irq(&mdev->req_lock); + goto done; + } retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h index 6fc614b06c4d..ab6159e4fcf0 100644 --- a/include/linux/drbd_nl.h +++ b/include/linux/drbd_nl.h @@ -69,7 +69,9 @@ NL_PACKET(net_conf, 5, NL_BIT( 70, T_MANDATORY, dry_run) ) -NL_PACKET(disconnect, 6, ) +NL_PACKET(disconnect, 6, + NL_BIT( 84, T_MAY_IGNORE, force) +) NL_PACKET(resize, 7, NL_INT64( 29, T_MAY_IGNORE, resize_size) -- cgit v1.2.3 From 116676ca621a862a8124969772f4dd61c8b40eee Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 8 Dec 2010 13:33:11 +0100 Subject: drbd: Rename enum drbd_ret_codes to enum drbd_ret_code Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_nl.c | 4 ++-- include/linux/drbd.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index f43e2aa354a6..8d69e3a1b3c2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3570,7 +3570,7 @@ void drbd_md_sync(struct drbd_conf *mdev) * @mdev: DRBD device. * @bdev: Device from which the meta data should be read in. * - * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case + * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. */ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cd0459f0403f..fe336592e538 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -849,7 +849,7 @@ static void drbd_suspend_al(struct drbd_conf *mdev) static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { - enum drbd_ret_codes retcode; + enum drbd_ret_code retcode; enum determine_dev_size dd; sector_t max_possible_sectors; sector_t min_md_device_sectors; @@ -1278,7 +1278,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { int i, ns; - enum drbd_ret_codes retcode; + enum drbd_ret_code retcode; struct net_conf *new_conf = NULL; struct crypto_hash *tfm = NULL; struct crypto_hash *integrity_w_tfm = NULL; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 41da654cc0b1..d92f989036c9 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -103,7 +103,7 @@ enum drbd_on_congestion { }; /* KEEP the order, do not delete or insert. Only append. */ -enum drbd_ret_codes { +enum drbd_ret_code { ERR_CODE_BASE = 100, NO_ERROR = 101, ERR_LOCAL_ADDR = 102, -- cgit v1.2.3 From c8b325632f0e5ffdaeca3d1f3be77c9399316a40 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 8 Dec 2010 01:06:16 +0100 Subject: drbd: Rename enum drbd_state_ret_codes to enum drbd_state_rv Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 5 +++-- drivers/block/drbd/drbd_strings.c | 2 +- include/linux/drbd.h | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8d69e3a1b3c2..cddf311b7429 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -510,8 +510,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); -static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, - union drbd_state mask, union drbd_state val) +static enum drbd_state_rv +_req_st_cond(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val) { union drbd_state os, ns; unsigned long flags; diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 5b970adc3b6f..c44a2a602772 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -107,7 +107,7 @@ const char *drbd_disk_str(enum drbd_disk_state s) return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; } -const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) +const char *drbd_set_st_err_str(enum drbd_state_rv err) { return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : err > SS_TWO_PRIMARIES ? "TOO_LARGE" diff --git a/include/linux/drbd.h b/include/linux/drbd.h index d92f989036c9..d10431fab004 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -270,7 +270,7 @@ union drbd_state { unsigned int i; }; -enum drbd_state_ret_codes { +enum drbd_state_rv { SS_CW_NO_NEED = 4, SS_CW_SUCCESS = 3, SS_NOTHING_TO_DO = 2, @@ -301,7 +301,7 @@ enum drbd_state_ret_codes { extern const char *drbd_conn_str(enum drbd_conns); extern const char *drbd_role_str(enum drbd_role); extern const char *drbd_disk_str(enum drbd_disk_state); -extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); +extern const char *drbd_set_st_err_str(enum drbd_state_rv); #define SHARED_SECRET_MAX 64 -- cgit v1.2.3 From 2b8a90b55533c66258a1ff0fb27b8cffa95665c4 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Mon, 10 Jan 2011 11:15:17 +0100 Subject: drbd: Corrected off-by-one error in DRBD_MINOR_COUNT_MAX Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 7 ++++--- include/linux/drbd_limits.h | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 4074d6699307..da98bff7c333 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -85,7 +85,8 @@ MODULE_AUTHOR("Philipp Reisner , " MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); MODULE_VERSION(REL_VERSION); MODULE_LICENSE("GPL"); -MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (" + __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); #include @@ -115,7 +116,7 @@ module_param(fault_devs, int, 0644); #endif /* module parameter, defined */ -unsigned int minor_count = 32; +unsigned int minor_count = DRBD_MINOR_COUNT_DEF; int disable_sendpage; int allow_oos; unsigned int cn_idx = CN_IDX_DRBD; @@ -3456,7 +3457,7 @@ int __init drbd_init(void) return -EINVAL; } - if (1 > minor_count || minor_count > 255) { + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { printk(KERN_ERR "drbd: invalid minor_count (%d)\n", minor_count); #ifdef MODULE diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index abf418724e52..bb264a5732de 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -16,7 +16,8 @@ #define DEBUG_RANGE_CHECK 0 #define DRBD_MINOR_COUNT_MIN 1 -#define DRBD_MINOR_COUNT_MAX 255 +#define DRBD_MINOR_COUNT_MAX 256 +#define DRBD_MINOR_COUNT_DEF 32 #define DRBD_DIALOG_REFRESH_MIN 0 #define DRBD_DIALOG_REFRESH_MAX 600 -- cgit v1.2.3 From cd88d030d41a9b0100fd5fee872024e6ebc8b276 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 20 Jan 2011 11:46:41 +0100 Subject: drbd: Provide hints with the error message when clearing the sync pause flag When the user clears the sync-pause flag, and sync stays in pause state, give hints to the user, why it still is in pause state. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 12 ++++++++++-- include/linux/drbd.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 434b621f76a9..ffe3a97fef9b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1952,9 +1952,17 @@ static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n struct drbd_nl_cfg_reply *reply) { int retcode = NO_ERROR; + union drbd_state s; - if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) - retcode = ERR_PAUSE_IS_CLEAR; + if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = mdev->state; + if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { + retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : + s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; + } else { + retcode = ERR_PAUSE_IS_CLEAR; + } + } reply->ret_code = retcode; return 0; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index d10431fab004..ba5c785d3f7d 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -153,6 +153,8 @@ enum drbd_ret_code { ERR_NEED_APV_93 = 153, ERR_STONITH_AND_PROT_A = 154, ERR_CONG_NOT_PROTO_A = 155, + ERR_PIC_AFTER_DEP = 156, + ERR_PIC_PEER_DEP = 157, /* insert new ones above this line */ AFTER_LAST_ERR_CODE -- cgit v1.2.3 From c5a91619793d444e5103ec5841045bf878718398 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 25 Jan 2011 17:33:38 +0100 Subject: drbd: Remove unused function atodb_endio() Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_actlog.c | 27 --------------------------- drivers/block/drbd/drbd_worker.c | 15 ++++++--------- include/linux/drbd.h | 2 +- 3 files changed, 7 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index a6050791401b..2a1642bc451d 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -542,33 +542,6 @@ cancel: return 1; } -static void atodb_endio(struct bio *bio, int error) -{ - struct drbd_atodb_wait *wc = bio->bi_private; - struct drbd_conf *mdev = wc->mdev; - struct page *page; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - if (!error && !uptodate) - error = -EIO; - - drbd_chk_io_error(mdev, error, true); - if (error && wc->error == 0) - wc->error = error; - - if (atomic_dec_and_test(&wc->count)) - complete(&wc->io_done); - - page = bio->bi_io_vec[0].bv_page; - put_page(page); - bio_put(bio); - mdev->bm_writ_cnt++; - put_ldev(mdev); -} - /** * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents * @mdev: DRBD device. diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index cfd324b9f95b..3d70d8d015d9 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -44,15 +44,12 @@ static int w_make_resync_request(struct drbd_conf *mdev, -/* defined here: - drbd_md_io_complete - drbd_endio_sec - drbd_endio_pri - - * more endio handlers: - atodb_endio in drbd_actlog.c - drbd_bm_async_io_complete in drbd_bitmap.c - +/* endio handlers: + * drbd_md_io_complete (defined here) + * drbd_endio_pri (defined here) + * drbd_endio_sec (defined here) + * bm_async_io_complete (defined in drbd_bitmap.c) + * * For all these callbacks, note the following: * The callbacks will be called in irq context by the IDE drivers, * and in Softirqs/Tasklets/BH context by the SCSI drivers. diff --git a/include/linux/drbd.h b/include/linux/drbd.h index ba5c785d3f7d..d18d673ebc78 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.9" +#define REL_VERSION "8.3.10" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 96 -- cgit v1.2.3 From c7519dbf6f4b4408229d279d799c938ffdd06f21 Mon Sep 17 00:00:00 2001 From: Jarkko Lavinen Date: Mon, 14 Feb 2011 16:16:09 +0200 Subject: mtd_blkdevs: Add background processing support Add a new background method into mtd_blktrans_ops, add background support into mtd_blktrans_thread(), and add mtd_blktrans_cease_background(). If the mtd blktrans dev has the background support, the thread will call background function when the request queue becomes empty. The background operation may run as long as needs to until mtd_blktrans_cease_background() tells to stop. Signed-off-by: Jarkko Lavinen Tested-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtd_blkdevs.c | 26 ++++++++++++++++++++++++++ include/linux/mtd/blktrans.h | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 344ac1037ee7..e0b5f6442171 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -119,11 +119,22 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, } } +int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev) +{ + if (kthread_should_stop()) + return 1; + + return !elv_queue_empty(dev->rq); +} +EXPORT_SYMBOL_GPL(mtd_blktrans_cease_background); + static int mtd_blktrans_thread(void *arg) { struct mtd_blktrans_dev *dev = arg; + struct mtd_blktrans_ops *tr = dev->tr; struct request_queue *rq = dev->rq; struct request *req = NULL; + int background_done = 0; spin_lock_irq(rq->queue_lock); @@ -131,6 +142,19 @@ static int mtd_blktrans_thread(void *arg) int res; if (!req && !(req = blk_fetch_request(rq))) { + if (tr->background && !background_done) { + spin_unlock_irq(rq->queue_lock); + mutex_lock(&dev->lock); + tr->background(dev); + mutex_unlock(&dev->lock); + spin_lock_irq(rq->queue_lock); + /* + * Do background processing just once per idle + * period. + */ + background_done = 1; + continue; + } set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) @@ -152,6 +176,8 @@ static int mtd_blktrans_thread(void *arg) if (!__blk_end_request_cur(req, res)) req = NULL; + + background_done = 0; } if (req) diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index 26529ebd59cc..66bec4bf2b0a 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -62,6 +62,7 @@ struct mtd_blktrans_ops { unsigned long block, char *buffer); int (*discard)(struct mtd_blktrans_dev *dev, unsigned long block, unsigned nr_blocks); + void (*background)(struct mtd_blktrans_dev *dev); /* Block layer ioctls */ int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo); @@ -85,6 +86,7 @@ extern int register_mtd_blktrans(struct mtd_blktrans_ops *tr); extern int deregister_mtd_blktrans(struct mtd_blktrans_ops *tr); extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev); extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev); +extern int mtd_blktrans_cease_background(struct mtd_blktrans_dev *dev); #endif /* __MTD_TRANS_H__ */ -- cgit v1.2.3 From dcfb81d61da1367e52f7f7e3ceff0d0044c3c7ee Mon Sep 17 00:00:00 2001 From: David Griego Date: Tue, 30 Nov 2010 15:32:05 +0530 Subject: mtd: NOR flash driver for OMAP-L137/AM17x OMAP-L137/AM17x has limited number of dedicated EMIFA address pins, enough to interface directly to an SDRAM. If a device such as an asynchronous flash needs to be attached to the EMIFA, then either GPIO pins or a chip select may be used to control the flash device's upper address lines. This patch adds support for the NOR flash on the OMAP-L137/ AM17x user interface daughter board using the latch-addr-flash MTD mapping driver which allows flashes to be partially physically addressed. The upper address lines are set by a board specific code which is a separate patch. Signed-off-by: David Griego Signed-off-by: Aleksey Makarov Signed-off-by: Sergei Shtylyov Signed-off-by: Savinay Dharmappa Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/maps/Kconfig | 9 ++ drivers/mtd/maps/Makefile | 1 + drivers/mtd/maps/latch-addr-flash.c | 272 +++++++++++++++++++++++++++++++++++ include/linux/mtd/latch-addr-flash.h | 29 ++++ 4 files changed, 311 insertions(+) create mode 100644 drivers/mtd/maps/latch-addr-flash.c create mode 100644 include/linux/mtd/latch-addr-flash.h (limited to 'include') diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 803072a71ed7..44b1f46458ca 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -552,4 +552,13 @@ config MTD_PISMO When built as a module, it will be called pismo.ko +config MTD_LATCH_ADDR + tristate "Latch-assisted Flash Chip Support" + depends on MTD_COMPLEX_MAPPINGS + help + Map driver which allows flashes to be partially physically addressed + and have the upper address lines set by a board specific code. + + If compiled as a module, it will be called latch-addr-flash. + endmenu diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index c7869c7a6b18..08533bd5cba7 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -59,3 +59,4 @@ obj-$(CONFIG_MTD_RBTX4939) += rbtx4939-flash.o obj-$(CONFIG_MTD_VMU) += vmu-flash.o obj-$(CONFIG_MTD_GPIO_ADDR) += gpio-addr-flash.o obj-$(CONFIG_MTD_BCM963XX) += bcm963xx-flash.o +obj-$(CONFIG_MTD_LATCH_ADDR) += latch-addr-flash.o diff --git a/drivers/mtd/maps/latch-addr-flash.c b/drivers/mtd/maps/latch-addr-flash.c new file mode 100644 index 000000000000..ee2548085334 --- /dev/null +++ b/drivers/mtd/maps/latch-addr-flash.c @@ -0,0 +1,272 @@ +/* + * Interface for NOR flash driver whose high address lines are latched + * + * Copyright © 2000 Nicolas Pitre + * Copyright © 2005-2008 Analog Devices Inc. + * Copyright © 2008 MontaVista Software, Inc. + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRIVER_NAME "latch-addr-flash" + +struct latch_addr_flash_info { + struct mtd_info *mtd; + struct map_info map; + struct resource *res; + + void (*set_window)(unsigned long offset, void *data); + void *data; + + /* cache; could be found out of res */ + unsigned long win_mask; + + int nr_parts; + struct mtd_partition *parts; + + spinlock_t lock; +}; + +static map_word lf_read(struct map_info *map, unsigned long ofs) +{ + struct latch_addr_flash_info *info; + map_word datum; + + info = (struct latch_addr_flash_info *)map->map_priv_1; + + spin_lock(&info->lock); + + info->set_window(ofs, info->data); + datum = inline_map_read(map, info->win_mask & ofs); + + spin_unlock(&info->lock); + + return datum; +} + +static void lf_write(struct map_info *map, map_word datum, unsigned long ofs) +{ + struct latch_addr_flash_info *info; + + info = (struct latch_addr_flash_info *)map->map_priv_1; + + spin_lock(&info->lock); + + info->set_window(ofs, info->data); + inline_map_write(map, datum, info->win_mask & ofs); + + spin_unlock(&info->lock); +} + +static void lf_copy_from(struct map_info *map, void *to, + unsigned long from, ssize_t len) +{ + struct latch_addr_flash_info *info = + (struct latch_addr_flash_info *) map->map_priv_1; + unsigned n; + + while (len > 0) { + n = info->win_mask + 1 - (from & info->win_mask); + if (n > len) + n = len; + + spin_lock(&info->lock); + + info->set_window(from, info->data); + memcpy_fromio(to, map->virt + (from & info->win_mask), n); + + spin_unlock(&info->lock); + + to += n; + from += n; + len -= n; + } +} + +static char *rom_probe_types[] = { "cfi_probe", NULL }; + +static char *part_probe_types[] = { "cmdlinepart", NULL }; + +static int latch_addr_flash_remove(struct platform_device *dev) +{ + struct latch_addr_flash_info *info; + struct latch_addr_flash_data *latch_addr_data; + + info = platform_get_drvdata(dev); + if (info == NULL) + return 0; + platform_set_drvdata(dev, NULL); + + latch_addr_data = dev->dev.platform_data; + + if (info->mtd != NULL) { + if (mtd_has_partitions()) { + if (info->nr_parts) { + del_mtd_partitions(info->mtd); + kfree(info->parts); + } else if (latch_addr_data->nr_parts) { + del_mtd_partitions(info->mtd); + } else { + del_mtd_device(info->mtd); + } + } else { + del_mtd_device(info->mtd); + } + map_destroy(info->mtd); + } + + if (info->map.virt != NULL) + iounmap(info->map.virt); + + if (info->res != NULL) + release_mem_region(info->res->start, resource_size(info->res)); + + kfree(info); + + if (latch_addr_data->done) + latch_addr_data->done(latch_addr_data->data); + + return 0; +} + +static int __devinit latch_addr_flash_probe(struct platform_device *dev) +{ + struct latch_addr_flash_data *latch_addr_data; + struct latch_addr_flash_info *info; + resource_size_t win_base = dev->resource->start; + resource_size_t win_size = resource_size(dev->resource); + char **probe_type; + int chipsel; + int err; + + latch_addr_data = dev->dev.platform_data; + if (latch_addr_data == NULL) + return -ENODEV; + + pr_notice("latch-addr platform flash device: %#llx byte " + "window at %#.8llx\n", + (unsigned long long)win_size, (unsigned long long)win_base); + + chipsel = dev->id; + + if (latch_addr_data->init) { + err = latch_addr_data->init(latch_addr_data->data, chipsel); + if (err != 0) + return err; + } + + info = kzalloc(sizeof(struct latch_addr_flash_info), GFP_KERNEL); + if (info == NULL) { + err = -ENOMEM; + goto done; + } + + platform_set_drvdata(dev, info); + + info->res = request_mem_region(win_base, win_size, DRIVER_NAME); + if (info->res == NULL) { + dev_err(&dev->dev, "Could not reserve memory region\n"); + err = -EBUSY; + goto free_info; + } + + info->map.name = DRIVER_NAME; + info->map.size = latch_addr_data->size; + info->map.bankwidth = latch_addr_data->width; + + info->map.phys = NO_XIP; + info->map.virt = ioremap(win_base, win_size); + if (!info->map.virt) { + err = -ENOMEM; + goto free_res; + } + + info->map.map_priv_1 = (unsigned long)info; + + info->map.read = lf_read; + info->map.copy_from = lf_copy_from; + info->map.write = lf_write; + info->set_window = latch_addr_data->set_window; + info->data = latch_addr_data->data; + info->win_mask = win_size - 1; + + spin_lock_init(&info->lock); + + for (probe_type = rom_probe_types; !info->mtd && *probe_type; + probe_type++) + info->mtd = do_map_probe(*probe_type, &info->map); + + if (info->mtd == NULL) { + dev_err(&dev->dev, "map_probe failed\n"); + err = -ENODEV; + goto iounmap; + } + info->mtd->owner = THIS_MODULE; + + if (mtd_has_partitions()) { + + err = parse_mtd_partitions(info->mtd, + (const char **)part_probe_types, + &info->parts, 0); + if (err > 0) { + add_mtd_partitions(info->mtd, info->parts, err); + return 0; + } + if (latch_addr_data->nr_parts) { + pr_notice("Using latch-addr-flash partition information\n"); + add_mtd_partitions(info->mtd, latch_addr_data->parts, + latch_addr_data->nr_parts); + return 0; + } + } + add_mtd_device(info->mtd); + return 0; + +iounmap: + iounmap(info->map.virt); +free_res: + release_mem_region(info->res->start, resource_size(info->res)); +free_info: + kfree(info); +done: + if (latch_addr_data->done) + latch_addr_data->done(latch_addr_data->data); + return err; +} + +static struct platform_driver latch_addr_flash_driver = { + .probe = latch_addr_flash_probe, + .remove = __devexit_p(latch_addr_flash_remove), + .driver = { + .name = DRIVER_NAME, + }, +}; + +static int __init latch_addr_flash_init(void) +{ + return platform_driver_register(&latch_addr_flash_driver); +} +module_init(latch_addr_flash_init); + +static void __exit latch_addr_flash_exit(void) +{ + platform_driver_unregister(&latch_addr_flash_driver); +} +module_exit(latch_addr_flash_exit); + +MODULE_AUTHOR("David Griego "); +MODULE_DESCRIPTION("MTD map driver for flashes addressed physically with upper " + "address lines being set board specifically"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mtd/latch-addr-flash.h b/include/linux/mtd/latch-addr-flash.h new file mode 100644 index 000000000000..e94b8e128074 --- /dev/null +++ b/include/linux/mtd/latch-addr-flash.h @@ -0,0 +1,29 @@ +/* + * Interface for NOR flash driver whose high address lines are latched + * + * Copyright © 2008 MontaVista Software, Inc. + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#ifndef __LATCH_ADDR_FLASH__ +#define __LATCH_ADDR_FLASH__ + +struct map_info; +struct mtd_partition; + +struct latch_addr_flash_data { + unsigned int width; + unsigned int size; + + int (*init)(void *data, int cs); + void (*done)(void *data); + void (*set_window)(unsigned long offset, void *data); + void *data; + + unsigned int nr_parts; + struct mtd_partition *parts; +}; + +#endif -- cgit v1.2.3 From b3dcfd35244e1cb8dc8dfa5c05013b133dbb437a Mon Sep 17 00:00:00 2001 From: Roman Tereshonkov Date: Thu, 17 Feb 2011 13:44:41 +0200 Subject: mtd: onenand: add new option to control initial onenand unlocking A new option ONENAND_SKIP_INITIAL_UNLOCKING is added. This allows to disable initial onenand unlocking when the driver is initialized. Signed-off-by: Roman Tereshonkov Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 3 ++- include/linux/mtd/onenand.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 4205b9423b89..56a8b2005bda 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -4085,7 +4085,8 @@ int onenand_scan(struct mtd_info *mtd, int maxchips) mtd->writebufsize = mtd->writesize; /* Unlock whole block */ - this->unlock_all(mtd); + if (!(this->options & ONENAND_SKIP_INITIAL_UNLOCKING)) + this->unlock_all(mtd); ret = this->scan_bbt(mtd); if ((!FLEXONENAND(this)) || ret) diff --git a/include/linux/mtd/onenand.h b/include/linux/mtd/onenand.h index ae418e41d8f5..52b6f187bf49 100644 --- a/include/linux/mtd/onenand.h +++ b/include/linux/mtd/onenand.h @@ -198,6 +198,7 @@ struct onenand_chip { #define ONENAND_SKIP_UNLOCK_CHECK (0x0100) #define ONENAND_PAGEBUF_ALLOC (0x1000) #define ONENAND_OOBBUF_ALLOC (0x2000) +#define ONENAND_SKIP_INITIAL_UNLOCKING (0x4000) #define ONENAND_IS_4KB_PAGE(this) \ (this->options & ONENAND_HAS_4KB_PAGE) -- cgit v1.2.3 From 437aa565e2656776a7104aaacd792fe789ea8b2d Mon Sep 17 00:00:00 2001 From: Ivan Djelic Date: Fri, 11 Mar 2011 11:05:32 +0100 Subject: lib: add shared BCH ECC library MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a new software BCH encoding/decoding library, similar to the shared Reed-Solomon library. Binary BCH (Bose-Chaudhuri-Hocquenghem) codes are widely used to correct errors in NAND flash devices requiring more than 1-bit ecc correction; they are generally better suited for NAND flash than RS codes because NAND bit errors do not occur in bursts. Latest SLC NAND devices typically require at least 4-bit ecc protection per 512 bytes block. This library provides software encoding/decoding, but may also be used with ASIC/SoC hardware BCH engines to perform error correction. It is being currently used for this purpose on an OMAP3630 board (4bit/8bit HW BCH). It has also been used to decode raw dumps of NAND devices with on-die BCH ecc engines (e.g. Micron 4bit ecc SLC devices). Latest NAND devices (including SLC) can exhibit high error rates (typically a dozen or more bitflips per hour during stress tests); in order to minimize the performance impact of error correction, this library implements recently developed algorithms for fast polynomial root finding (see bch.c header for details) instead of the traditional exhaustive Chien root search; a few performance figures are provided below: Platform: arm926ejs @ 468 MHz, 32 KiB icache, 16 KiB dcache BCH ecc : 4-bit per 512 bytes Encoding average throughput: 250 Mbits/s Error correction time (compared with Chien search): average worst average (Chien) worst (Chien) ---------------------------------------------------------- 1 bit 8.5 µs 11 µs 200 µs 383 µs 2 bit 9.7 µs 12.5 µs 477 µs 728 µs 3 bit 18.1 µs 20.6 µs 758 µs 1010 µs 4 bit 19.5 µs 23 µs 1028 µs 1280 µs In the above figures, "worst" is meant in terms of error pattern, not in terms of cache miss / page faults effects (not taken into account here). The library has been extensively tested on the following platforms: x86, x86_64, arm926ejs, omap3630, qemu-ppc64, qemu-mips. Signed-off-by: Ivan Djelic Signed-off-by: David Woodhouse --- include/linux/bch.h | 79 +++ lib/Kconfig | 39 ++ lib/Makefile | 1 + lib/bch.c | 1368 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1487 insertions(+) create mode 100644 include/linux/bch.h create mode 100644 lib/bch.c (limited to 'include') diff --git a/include/linux/bch.h b/include/linux/bch.h new file mode 100644 index 000000000000..295b4ef153bb --- /dev/null +++ b/include/linux/bch.h @@ -0,0 +1,79 @@ +/* + * Generic binary BCH encoding/decoding library + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright © 2011 Parrot S.A. + * + * Author: Ivan Djelic + * + * Description: + * + * This library provides runtime configurable encoding/decoding of binary + * Bose-Chaudhuri-Hocquenghem (BCH) codes. +*/ +#ifndef _BCH_H +#define _BCH_H + +#include + +/** + * struct bch_control - BCH control structure + * @m: Galois field order + * @n: maximum codeword size in bits (= 2^m-1) + * @t: error correction capability in bits + * @ecc_bits: ecc exact size in bits, i.e. generator polynomial degree (<=m*t) + * @ecc_bytes: ecc max size (m*t bits) in bytes + * @a_pow_tab: Galois field GF(2^m) exponentiation lookup table + * @a_log_tab: Galois field GF(2^m) log lookup table + * @mod8_tab: remainder generator polynomial lookup tables + * @ecc_buf: ecc parity words buffer + * @ecc_buf2: ecc parity words buffer + * @xi_tab: GF(2^m) base for solving degree 2 polynomial roots + * @syn: syndrome buffer + * @cache: log-based polynomial representation buffer + * @elp: error locator polynomial + * @poly_2t: temporary polynomials of degree 2t + */ +struct bch_control { + unsigned int m; + unsigned int n; + unsigned int t; + unsigned int ecc_bits; + unsigned int ecc_bytes; +/* private: */ + uint16_t *a_pow_tab; + uint16_t *a_log_tab; + uint32_t *mod8_tab; + uint32_t *ecc_buf; + uint32_t *ecc_buf2; + unsigned int *xi_tab; + unsigned int *syn; + int *cache; + struct gf_poly *elp; + struct gf_poly *poly_2t[4]; +}; + +struct bch_control *init_bch(int m, int t, unsigned int prim_poly); + +void free_bch(struct bch_control *bch); + +void encode_bch(struct bch_control *bch, const uint8_t *data, + unsigned int len, uint8_t *ecc); + +int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, + const uint8_t *recv_ecc, const uint8_t *calc_ecc, + const unsigned int *syn, unsigned int *errloc); + +#endif /* _BCH_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 0ee67e08ad3e..b9fef78ed04f 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -154,6 +154,45 @@ config REED_SOLOMON_ENC16 config REED_SOLOMON_DEC16 boolean +# +# BCH support is selected if needed +# +config BCH + tristate + +config BCH_CONST_PARAMS + boolean + help + Drivers may select this option to force specific constant + values for parameters 'm' (Galois field order) and 't' + (error correction capability). Those specific values must + be set by declaring default values for symbols BCH_CONST_M + and BCH_CONST_T. + Doing so will enable extra compiler optimizations, + improving encoding and decoding performance up to 2x for + usual (m,t) values (typically such that m*t < 200). + When this option is selected, the BCH library supports + only a single (m,t) configuration. This is mainly useful + for NAND flash board drivers requiring known, fixed BCH + parameters. + +config BCH_CONST_M + int + range 5 15 + help + Constant value for Galois field order 'm'. If 'k' is the + number of data bits to protect, 'm' should be chosen such + that (k + m*t) <= 2**m - 1. + Drivers should declare a default value for this symbol if + they select option BCH_CONST_PARAMS. + +config BCH_CONST_T + int + help + Constant value for error correction capability in bits 't'. + Drivers should declare a default value for this symbol if + they select option BCH_CONST_PARAMS. + # # Textsearch support is select'ed if needed # diff --git a/lib/Makefile b/lib/Makefile index cbb774f7d41d..0544c814d827 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ +obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ obj-$(CONFIG_XZ_DEC) += xz/ diff --git a/lib/bch.c b/lib/bch.c new file mode 100644 index 000000000000..bc89dfe4d1b3 --- /dev/null +++ b/lib/bch.c @@ -0,0 +1,1368 @@ +/* + * Generic binary BCH encoding/decoding library + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright © 2011 Parrot S.A. + * + * Author: Ivan Djelic + * + * Description: + * + * This library provides runtime configurable encoding/decoding of binary + * Bose-Chaudhuri-Hocquenghem (BCH) codes. + * + * Call init_bch to get a pointer to a newly allocated bch_control structure for + * the given m (Galois field order), t (error correction capability) and + * (optional) primitive polynomial parameters. + * + * Call encode_bch to compute and store ecc parity bytes to a given buffer. + * Call decode_bch to detect and locate errors in received data. + * + * On systems supporting hw BCH features, intermediate results may be provided + * to decode_bch in order to skip certain steps. See decode_bch() documentation + * for details. + * + * Option CONFIG_BCH_CONST_PARAMS can be used to force fixed values of + * parameters m and t; thus allowing extra compiler optimizations and providing + * better (up to 2x) encoding performance. Using this option makes sense when + * (m,t) are fixed and known in advance, e.g. when using BCH error correction + * on a particular NAND flash device. + * + * Algorithmic details: + * + * Encoding is performed by processing 32 input bits in parallel, using 4 + * remainder lookup tables. + * + * The final stage of decoding involves the following internal steps: + * a. Syndrome computation + * b. Error locator polynomial computation using Berlekamp-Massey algorithm + * c. Error locator root finding (by far the most expensive step) + * + * In this implementation, step c is not performed using the usual Chien search. + * Instead, an alternative approach described in [1] is used. It consists in + * factoring the error locator polynomial using the Berlekamp Trace algorithm + * (BTA) down to a certain degree (4), after which ad hoc low-degree polynomial + * solving techniques [2] are used. The resulting algorithm, called BTZ, yields + * much better performance than Chien search for usual (m,t) values (typically + * m >= 13, t < 32, see [1]). + * + * [1] B. Biswas, V. Herbert. Efficient root finding of polynomials over fields + * of characteristic 2, in: Western European Workshop on Research in Cryptology + * - WEWoRC 2009, Graz, Austria, LNCS, Springer, July 2009, to appear. + * [2] [Zin96] V.A. Zinoviev. On the solution of equations of degree 10 over + * finite fields GF(2^q). In Rapport de recherche INRIA no 2829, 1996. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_BCH_CONST_PARAMS) +#define GF_M(_p) (CONFIG_BCH_CONST_M) +#define GF_T(_p) (CONFIG_BCH_CONST_T) +#define GF_N(_p) ((1 << (CONFIG_BCH_CONST_M))-1) +#else +#define GF_M(_p) ((_p)->m) +#define GF_T(_p) ((_p)->t) +#define GF_N(_p) ((_p)->n) +#endif + +#define BCH_ECC_WORDS(_p) DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 32) +#define BCH_ECC_BYTES(_p) DIV_ROUND_UP(GF_M(_p)*GF_T(_p), 8) + +#ifndef dbg +#define dbg(_fmt, args...) do {} while (0) +#endif + +/* + * represent a polynomial over GF(2^m) + */ +struct gf_poly { + unsigned int deg; /* polynomial degree */ + unsigned int c[0]; /* polynomial terms */ +}; + +/* given its degree, compute a polynomial size in bytes */ +#define GF_POLY_SZ(_d) (sizeof(struct gf_poly)+((_d)+1)*sizeof(unsigned int)) + +/* polynomial of degree 1 */ +struct gf_poly_deg1 { + struct gf_poly poly; + unsigned int c[2]; +}; + +/* + * same as encode_bch(), but process input data one byte at a time + */ +static void encode_bch_unaligned(struct bch_control *bch, + const unsigned char *data, unsigned int len, + uint32_t *ecc) +{ + int i; + const uint32_t *p; + const int l = BCH_ECC_WORDS(bch)-1; + + while (len--) { + p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(*data++)) & 0xff); + + for (i = 0; i < l; i++) + ecc[i] = ((ecc[i] << 8)|(ecc[i+1] >> 24))^(*p++); + + ecc[l] = (ecc[l] << 8)^(*p); + } +} + +/* + * convert ecc bytes to aligned, zero-padded 32-bit ecc words + */ +static void load_ecc8(struct bch_control *bch, uint32_t *dst, + const uint8_t *src) +{ + uint8_t pad[4] = {0, 0, 0, 0}; + unsigned int i, nwords = BCH_ECC_WORDS(bch)-1; + + for (i = 0; i < nwords; i++, src += 4) + dst[i] = (src[0] << 24)|(src[1] << 16)|(src[2] << 8)|src[3]; + + memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords); + dst[nwords] = (pad[0] << 24)|(pad[1] << 16)|(pad[2] << 8)|pad[3]; +} + +/* + * convert 32-bit ecc words to ecc bytes + */ +static void store_ecc8(struct bch_control *bch, uint8_t *dst, + const uint32_t *src) +{ + uint8_t pad[4]; + unsigned int i, nwords = BCH_ECC_WORDS(bch)-1; + + for (i = 0; i < nwords; i++) { + *dst++ = (src[i] >> 24); + *dst++ = (src[i] >> 16) & 0xff; + *dst++ = (src[i] >> 8) & 0xff; + *dst++ = (src[i] >> 0) & 0xff; + } + pad[0] = (src[nwords] >> 24); + pad[1] = (src[nwords] >> 16) & 0xff; + pad[2] = (src[nwords] >> 8) & 0xff; + pad[3] = (src[nwords] >> 0) & 0xff; + memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords); +} + +/** + * encode_bch - calculate BCH ecc parity of data + * @bch: BCH control structure + * @data: data to encode + * @len: data length in bytes + * @ecc: ecc parity data, must be initialized by caller + * + * The @ecc parity array is used both as input and output parameter, in order to + * allow incremental computations. It should be of the size indicated by member + * @ecc_bytes of @bch, and should be initialized to 0 before the first call. + * + * The exact number of computed ecc parity bits is given by member @ecc_bits of + * @bch; it may be less than m*t for large values of t. + */ +void encode_bch(struct bch_control *bch, const uint8_t *data, + unsigned int len, uint8_t *ecc) +{ + const unsigned int l = BCH_ECC_WORDS(bch)-1; + unsigned int i, mlen; + unsigned long m; + uint32_t w, r[l+1]; + const uint32_t * const tab0 = bch->mod8_tab; + const uint32_t * const tab1 = tab0 + 256*(l+1); + const uint32_t * const tab2 = tab1 + 256*(l+1); + const uint32_t * const tab3 = tab2 + 256*(l+1); + const uint32_t *pdata, *p0, *p1, *p2, *p3; + + if (ecc) { + /* load ecc parity bytes into internal 32-bit buffer */ + load_ecc8(bch, bch->ecc_buf, ecc); + } else { + memset(bch->ecc_buf, 0, sizeof(r)); + } + + /* process first unaligned data bytes */ + m = ((unsigned long)data) & 3; + if (m) { + mlen = (len < (4-m)) ? len : 4-m; + encode_bch_unaligned(bch, data, mlen, bch->ecc_buf); + data += mlen; + len -= mlen; + } + + /* process 32-bit aligned data words */ + pdata = (uint32_t *)data; + mlen = len/4; + data += 4*mlen; + len -= 4*mlen; + memcpy(r, bch->ecc_buf, sizeof(r)); + + /* + * split each 32-bit word into 4 polynomials of weight 8 as follows: + * + * 31 ...24 23 ...16 15 ... 8 7 ... 0 + * xxxxxxxx yyyyyyyy zzzzzzzz tttttttt + * tttttttt mod g = r0 (precomputed) + * zzzzzzzz 00000000 mod g = r1 (precomputed) + * yyyyyyyy 00000000 00000000 mod g = r2 (precomputed) + * xxxxxxxx 00000000 00000000 00000000 mod g = r3 (precomputed) + * xxxxxxxx yyyyyyyy zzzzzzzz tttttttt mod g = r0^r1^r2^r3 + */ + while (mlen--) { + /* input data is read in big-endian format */ + w = r[0]^cpu_to_be32(*pdata++); + p0 = tab0 + (l+1)*((w >> 0) & 0xff); + p1 = tab1 + (l+1)*((w >> 8) & 0xff); + p2 = tab2 + (l+1)*((w >> 16) & 0xff); + p3 = tab3 + (l+1)*((w >> 24) & 0xff); + + for (i = 0; i < l; i++) + r[i] = r[i+1]^p0[i]^p1[i]^p2[i]^p3[i]; + + r[l] = p0[l]^p1[l]^p2[l]^p3[l]; + } + memcpy(bch->ecc_buf, r, sizeof(r)); + + /* process last unaligned bytes */ + if (len) + encode_bch_unaligned(bch, data, len, bch->ecc_buf); + + /* store ecc parity bytes into original parity buffer */ + if (ecc) + store_ecc8(bch, ecc, bch->ecc_buf); +} +EXPORT_SYMBOL_GPL(encode_bch); + +static inline int modulo(struct bch_control *bch, unsigned int v) +{ + const unsigned int n = GF_N(bch); + while (v >= n) { + v -= n; + v = (v & n) + (v >> GF_M(bch)); + } + return v; +} + +/* + * shorter and faster modulo function, only works when v < 2N. + */ +static inline int mod_s(struct bch_control *bch, unsigned int v) +{ + const unsigned int n = GF_N(bch); + return (v < n) ? v : v-n; +} + +static inline int deg(unsigned int poly) +{ + /* polynomial degree is the most-significant bit index */ + return fls(poly)-1; +} + +static inline int parity(unsigned int x) +{ + /* + * public domain code snippet, lifted from + * http://www-graphics.stanford.edu/~seander/bithacks.html + */ + x ^= x >> 1; + x ^= x >> 2; + x = (x & 0x11111111U) * 0x11111111U; + return (x >> 28) & 1; +} + +/* Galois field basic operations: multiply, divide, inverse, etc. */ + +static inline unsigned int gf_mul(struct bch_control *bch, unsigned int a, + unsigned int b) +{ + return (a && b) ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+ + bch->a_log_tab[b])] : 0; +} + +static inline unsigned int gf_sqr(struct bch_control *bch, unsigned int a) +{ + return a ? bch->a_pow_tab[mod_s(bch, 2*bch->a_log_tab[a])] : 0; +} + +static inline unsigned int gf_div(struct bch_control *bch, unsigned int a, + unsigned int b) +{ + return a ? bch->a_pow_tab[mod_s(bch, bch->a_log_tab[a]+ + GF_N(bch)-bch->a_log_tab[b])] : 0; +} + +static inline unsigned int gf_inv(struct bch_control *bch, unsigned int a) +{ + return bch->a_pow_tab[GF_N(bch)-bch->a_log_tab[a]]; +} + +static inline unsigned int a_pow(struct bch_control *bch, int i) +{ + return bch->a_pow_tab[modulo(bch, i)]; +} + +static inline int a_log(struct bch_control *bch, unsigned int x) +{ + return bch->a_log_tab[x]; +} + +static inline int a_ilog(struct bch_control *bch, unsigned int x) +{ + return mod_s(bch, GF_N(bch)-bch->a_log_tab[x]); +} + +/* + * compute 2t syndromes of ecc polynomial, i.e. ecc(a^j) for j=1..2t + */ +static void compute_syndromes(struct bch_control *bch, uint32_t *ecc, + unsigned int *syn) +{ + int i, j, s; + unsigned int m; + uint32_t poly; + const int t = GF_T(bch); + + s = bch->ecc_bits; + + /* make sure extra bits in last ecc word are cleared */ + m = ((unsigned int)s) & 31; + if (m) + ecc[s/32] &= ~((1u << (32-m))-1); + memset(syn, 0, 2*t*sizeof(*syn)); + + /* compute v(a^j) for j=1 .. 2t-1 */ + do { + poly = *ecc++; + s -= 32; + while (poly) { + i = deg(poly); + for (j = 0; j < 2*t; j += 2) + syn[j] ^= a_pow(bch, (j+1)*(i+s)); + + poly ^= (1 << i); + } + } while (s > 0); + + /* v(a^(2j)) = v(a^j)^2 */ + for (j = 0; j < t; j++) + syn[2*j+1] = gf_sqr(bch, syn[j]); +} + +static void gf_poly_copy(struct gf_poly *dst, struct gf_poly *src) +{ + memcpy(dst, src, GF_POLY_SZ(src->deg)); +} + +static int compute_error_locator_polynomial(struct bch_control *bch, + const unsigned int *syn) +{ + const unsigned int t = GF_T(bch); + const unsigned int n = GF_N(bch); + unsigned int i, j, tmp, l, pd = 1, d = syn[0]; + struct gf_poly *elp = bch->elp; + struct gf_poly *pelp = bch->poly_2t[0]; + struct gf_poly *elp_copy = bch->poly_2t[1]; + int k, pp = -1; + + memset(pelp, 0, GF_POLY_SZ(2*t)); + memset(elp, 0, GF_POLY_SZ(2*t)); + + pelp->deg = 0; + pelp->c[0] = 1; + elp->deg = 0; + elp->c[0] = 1; + + /* use simplified binary Berlekamp-Massey algorithm */ + for (i = 0; (i < t) && (elp->deg <= t); i++) { + if (d) { + k = 2*i-pp; + gf_poly_copy(elp_copy, elp); + /* e[i+1](X) = e[i](X)+di*dp^-1*X^2(i-p)*e[p](X) */ + tmp = a_log(bch, d)+n-a_log(bch, pd); + for (j = 0; j <= pelp->deg; j++) { + if (pelp->c[j]) { + l = a_log(bch, pelp->c[j]); + elp->c[j+k] ^= a_pow(bch, tmp+l); + } + } + /* compute l[i+1] = max(l[i]->c[l[p]+2*(i-p]) */ + tmp = pelp->deg+k; + if (tmp > elp->deg) { + elp->deg = tmp; + gf_poly_copy(pelp, elp_copy); + pd = d; + pp = 2*i; + } + } + /* di+1 = S(2i+3)+elp[i+1].1*S(2i+2)+...+elp[i+1].lS(2i+3-l) */ + if (i < t-1) { + d = syn[2*i+2]; + for (j = 1; j <= elp->deg; j++) + d ^= gf_mul(bch, elp->c[j], syn[2*i+2-j]); + } + } + dbg("elp=%s\n", gf_poly_str(elp)); + return (elp->deg > t) ? -1 : (int)elp->deg; +} + +/* + * solve a m x m linear system in GF(2) with an expected number of solutions, + * and return the number of found solutions + */ +static int solve_linear_system(struct bch_control *bch, unsigned int *rows, + unsigned int *sol, int nsol) +{ + const int m = GF_M(bch); + unsigned int tmp, mask; + int rem, c, r, p, k, param[m]; + + k = 0; + mask = 1 << m; + + /* Gaussian elimination */ + for (c = 0; c < m; c++) { + rem = 0; + p = c-k; + /* find suitable row for elimination */ + for (r = p; r < m; r++) { + if (rows[r] & mask) { + if (r != p) { + tmp = rows[r]; + rows[r] = rows[p]; + rows[p] = tmp; + } + rem = r+1; + break; + } + } + if (rem) { + /* perform elimination on remaining rows */ + tmp = rows[p]; + for (r = rem; r < m; r++) { + if (rows[r] & mask) + rows[r] ^= tmp; + } + } else { + /* elimination not needed, store defective row index */ + param[k++] = c; + } + mask >>= 1; + } + /* rewrite system, inserting fake parameter rows */ + if (k > 0) { + p = k; + for (r = m-1; r >= 0; r--) { + if ((r > m-1-k) && rows[r]) + /* system has no solution */ + return 0; + + rows[r] = (p && (r == param[p-1])) ? + p--, 1u << (m-r) : rows[r-p]; + } + } + + if (nsol != (1 << k)) + /* unexpected number of solutions */ + return 0; + + for (p = 0; p < nsol; p++) { + /* set parameters for p-th solution */ + for (c = 0; c < k; c++) + rows[param[c]] = (rows[param[c]] & ~1)|((p >> c) & 1); + + /* compute unique solution */ + tmp = 0; + for (r = m-1; r >= 0; r--) { + mask = rows[r] & (tmp|1); + tmp |= parity(mask) << (m-r); + } + sol[p] = tmp >> 1; + } + return nsol; +} + +/* + * this function builds and solves a linear system for finding roots of a degree + * 4 affine monic polynomial X^4+aX^2+bX+c over GF(2^m). + */ +static int find_affine4_roots(struct bch_control *bch, unsigned int a, + unsigned int b, unsigned int c, + unsigned int *roots) +{ + int i, j, k; + const int m = GF_M(bch); + unsigned int mask = 0xff, t, rows[16] = {0,}; + + j = a_log(bch, b); + k = a_log(bch, a); + rows[0] = c; + + /* buid linear system to solve X^4+aX^2+bX+c = 0 */ + for (i = 0; i < m; i++) { + rows[i+1] = bch->a_pow_tab[4*i]^ + (a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^ + (b ? bch->a_pow_tab[mod_s(bch, j)] : 0); + j++; + k += 2; + } + /* + * transpose 16x16 matrix before passing it to linear solver + * warning: this code assumes m < 16 + */ + for (j = 8; j != 0; j >>= 1, mask ^= (mask << j)) { + for (k = 0; k < 16; k = (k+j+1) & ~j) { + t = ((rows[k] >> j)^rows[k+j]) & mask; + rows[k] ^= (t << j); + rows[k+j] ^= t; + } + } + return solve_linear_system(bch, rows, roots, 4); +} + +/* + * compute root r of a degree 1 polynomial over GF(2^m) (returned as log(1/r)) + */ +static int find_poly_deg1_roots(struct bch_control *bch, struct gf_poly *poly, + unsigned int *roots) +{ + int n = 0; + + if (poly->c[0]) + /* poly[X] = bX+c with c!=0, root=c/b */ + roots[n++] = mod_s(bch, GF_N(bch)-bch->a_log_tab[poly->c[0]]+ + bch->a_log_tab[poly->c[1]]); + return n; +} + +/* + * compute roots of a degree 2 polynomial over GF(2^m) + */ +static int find_poly_deg2_roots(struct bch_control *bch, struct gf_poly *poly, + unsigned int *roots) +{ + int n = 0, i, l0, l1, l2; + unsigned int u, v, r; + + if (poly->c[0] && poly->c[1]) { + + l0 = bch->a_log_tab[poly->c[0]]; + l1 = bch->a_log_tab[poly->c[1]]; + l2 = bch->a_log_tab[poly->c[2]]; + + /* using z=a/bX, transform aX^2+bX+c into z^2+z+u (u=ac/b^2) */ + u = a_pow(bch, l0+l2+2*(GF_N(bch)-l1)); + /* + * let u = sum(li.a^i) i=0..m-1; then compute r = sum(li.xi): + * r^2+r = sum(li.(xi^2+xi)) = sum(li.(a^i+Tr(a^i).a^k)) = + * u + sum(li.Tr(a^i).a^k) = u+a^k.Tr(sum(li.a^i)) = u+a^k.Tr(u) + * i.e. r and r+1 are roots iff Tr(u)=0 + */ + r = 0; + v = u; + while (v) { + i = deg(v); + r ^= bch->xi_tab[i]; + v ^= (1 << i); + } + /* verify root */ + if ((gf_sqr(bch, r)^r) == u) { + /* reverse z=a/bX transformation and compute log(1/r) */ + roots[n++] = modulo(bch, 2*GF_N(bch)-l1- + bch->a_log_tab[r]+l2); + roots[n++] = modulo(bch, 2*GF_N(bch)-l1- + bch->a_log_tab[r^1]+l2); + } + } + return n; +} + +/* + * compute roots of a degree 3 polynomial over GF(2^m) + */ +static int find_poly_deg3_roots(struct bch_control *bch, struct gf_poly *poly, + unsigned int *roots) +{ + int i, n = 0; + unsigned int a, b, c, a2, b2, c2, e3, tmp[4]; + + if (poly->c[0]) { + /* transform polynomial into monic X^3 + a2X^2 + b2X + c2 */ + e3 = poly->c[3]; + c2 = gf_div(bch, poly->c[0], e3); + b2 = gf_div(bch, poly->c[1], e3); + a2 = gf_div(bch, poly->c[2], e3); + + /* (X+a2)(X^3+a2X^2+b2X+c2) = X^4+aX^2+bX+c (affine) */ + c = gf_mul(bch, a2, c2); /* c = a2c2 */ + b = gf_mul(bch, a2, b2)^c2; /* b = a2b2 + c2 */ + a = gf_sqr(bch, a2)^b2; /* a = a2^2 + b2 */ + + /* find the 4 roots of this affine polynomial */ + if (find_affine4_roots(bch, a, b, c, tmp) == 4) { + /* remove a2 from final list of roots */ + for (i = 0; i < 4; i++) { + if (tmp[i] != a2) + roots[n++] = a_ilog(bch, tmp[i]); + } + } + } + return n; +} + +/* + * compute roots of a degree 4 polynomial over GF(2^m) + */ +static int find_poly_deg4_roots(struct bch_control *bch, struct gf_poly *poly, + unsigned int *roots) +{ + int i, l, n = 0; + unsigned int a, b, c, d, e = 0, f, a2, b2, c2, e4; + + if (poly->c[0] == 0) + return 0; + + /* transform polynomial into monic X^4 + aX^3 + bX^2 + cX + d */ + e4 = poly->c[4]; + d = gf_div(bch, poly->c[0], e4); + c = gf_div(bch, poly->c[1], e4); + b = gf_div(bch, poly->c[2], e4); + a = gf_div(bch, poly->c[3], e4); + + /* use Y=1/X transformation to get an affine polynomial */ + if (a) { + /* first, eliminate cX by using z=X+e with ae^2+c=0 */ + if (c) { + /* compute e such that e^2 = c/a */ + f = gf_div(bch, c, a); + l = a_log(bch, f); + l += (l & 1) ? GF_N(bch) : 0; + e = a_pow(bch, l/2); + /* + * use transformation z=X+e: + * z^4+e^4 + a(z^3+ez^2+e^2z+e^3) + b(z^2+e^2) +cz+ce+d + * z^4 + az^3 + (ae+b)z^2 + (ae^2+c)z+e^4+be^2+ae^3+ce+d + * z^4 + az^3 + (ae+b)z^2 + e^4+be^2+d + * z^4 + az^3 + b'z^2 + d' + */ + d = a_pow(bch, 2*l)^gf_mul(bch, b, f)^d; + b = gf_mul(bch, a, e)^b; + } + /* now, use Y=1/X to get Y^4 + b/dY^2 + a/dY + 1/d */ + if (d == 0) + /* assume all roots have multiplicity 1 */ + return 0; + + c2 = gf_inv(bch, d); + b2 = gf_div(bch, a, d); + a2 = gf_div(bch, b, d); + } else { + /* polynomial is already affine */ + c2 = d; + b2 = c; + a2 = b; + } + /* find the 4 roots of this affine polynomial */ + if (find_affine4_roots(bch, a2, b2, c2, roots) == 4) { + for (i = 0; i < 4; i++) { + /* post-process roots (reverse transformations) */ + f = a ? gf_inv(bch, roots[i]) : roots[i]; + roots[i] = a_ilog(bch, f^e); + } + n = 4; + } + return n; +} + +/* + * build monic, log-based representation of a polynomial + */ +static void gf_poly_logrep(struct bch_control *bch, + const struct gf_poly *a, int *rep) +{ + int i, d = a->deg, l = GF_N(bch)-a_log(bch, a->c[a->deg]); + + /* represent 0 values with -1; warning, rep[d] is not set to 1 */ + for (i = 0; i < d; i++) + rep[i] = a->c[i] ? mod_s(bch, a_log(bch, a->c[i])+l) : -1; +} + +/* + * compute polynomial Euclidean division remainder in GF(2^m)[X] + */ +static void gf_poly_mod(struct bch_control *bch, struct gf_poly *a, + const struct gf_poly *b, int *rep) +{ + int la, p, m; + unsigned int i, j, *c = a->c; + const unsigned int d = b->deg; + + if (a->deg < d) + return; + + /* reuse or compute log representation of denominator */ + if (!rep) { + rep = bch->cache; + gf_poly_logrep(bch, b, rep); + } + + for (j = a->deg; j >= d; j--) { + if (c[j]) { + la = a_log(bch, c[j]); + p = j-d; + for (i = 0; i < d; i++, p++) { + m = rep[i]; + if (m >= 0) + c[p] ^= bch->a_pow_tab[mod_s(bch, + m+la)]; + } + } + } + a->deg = d-1; + while (!c[a->deg] && a->deg) + a->deg--; +} + +/* + * compute polynomial Euclidean division quotient in GF(2^m)[X] + */ +static void gf_poly_div(struct bch_control *bch, struct gf_poly *a, + const struct gf_poly *b, struct gf_poly *q) +{ + if (a->deg >= b->deg) { + q->deg = a->deg-b->deg; + /* compute a mod b (modifies a) */ + gf_poly_mod(bch, a, b, NULL); + /* quotient is stored in upper part of polynomial a */ + memcpy(q->c, &a->c[b->deg], (1+q->deg)*sizeof(unsigned int)); + } else { + q->deg = 0; + q->c[0] = 0; + } +} + +/* + * compute polynomial GCD (Greatest Common Divisor) in GF(2^m)[X] + */ +static struct gf_poly *gf_poly_gcd(struct bch_control *bch, struct gf_poly *a, + struct gf_poly *b) +{ + struct gf_poly *tmp; + + dbg("gcd(%s,%s)=", gf_poly_str(a), gf_poly_str(b)); + + if (a->deg < b->deg) { + tmp = b; + b = a; + a = tmp; + } + + while (b->deg > 0) { + gf_poly_mod(bch, a, b, NULL); + tmp = b; + b = a; + a = tmp; + } + + dbg("%s\n", gf_poly_str(a)); + + return a; +} + +/* + * Given a polynomial f and an integer k, compute Tr(a^kX) mod f + * This is used in Berlekamp Trace algorithm for splitting polynomials + */ +static void compute_trace_bk_mod(struct bch_control *bch, int k, + const struct gf_poly *f, struct gf_poly *z, + struct gf_poly *out) +{ + const int m = GF_M(bch); + int i, j; + + /* z contains z^2j mod f */ + z->deg = 1; + z->c[0] = 0; + z->c[1] = bch->a_pow_tab[k]; + + out->deg = 0; + memset(out, 0, GF_POLY_SZ(f->deg)); + + /* compute f log representation only once */ + gf_poly_logrep(bch, f, bch->cache); + + for (i = 0; i < m; i++) { + /* add a^(k*2^i)(z^(2^i) mod f) and compute (z^(2^i) mod f)^2 */ + for (j = z->deg; j >= 0; j--) { + out->c[j] ^= z->c[j]; + z->c[2*j] = gf_sqr(bch, z->c[j]); + z->c[2*j+1] = 0; + } + if (z->deg > out->deg) + out->deg = z->deg; + + if (i < m-1) { + z->deg *= 2; + /* z^(2(i+1)) mod f = (z^(2^i) mod f)^2 mod f */ + gf_poly_mod(bch, z, f, bch->cache); + } + } + while (!out->c[out->deg] && out->deg) + out->deg--; + + dbg("Tr(a^%d.X) mod f = %s\n", k, gf_poly_str(out)); +} + +/* + * factor a polynomial using Berlekamp Trace algorithm (BTA) + */ +static void factor_polynomial(struct bch_control *bch, int k, struct gf_poly *f, + struct gf_poly **g, struct gf_poly **h) +{ + struct gf_poly *f2 = bch->poly_2t[0]; + struct gf_poly *q = bch->poly_2t[1]; + struct gf_poly *tk = bch->poly_2t[2]; + struct gf_poly *z = bch->poly_2t[3]; + struct gf_poly *gcd; + + dbg("factoring %s...\n", gf_poly_str(f)); + + *g = f; + *h = NULL; + + /* tk = Tr(a^k.X) mod f */ + compute_trace_bk_mod(bch, k, f, z, tk); + + if (tk->deg > 0) { + /* compute g = gcd(f, tk) (destructive operation) */ + gf_poly_copy(f2, f); + gcd = gf_poly_gcd(bch, f2, tk); + if (gcd->deg < f->deg) { + /* compute h=f/gcd(f,tk); this will modify f and q */ + gf_poly_div(bch, f, gcd, q); + /* store g and h in-place (clobbering f) */ + *h = &((struct gf_poly_deg1 *)f)[gcd->deg].poly; + gf_poly_copy(*g, gcd); + gf_poly_copy(*h, q); + } + } +} + +/* + * find roots of a polynomial, using BTZ algorithm; see the beginning of this + * file for details + */ +static int find_poly_roots(struct bch_control *bch, unsigned int k, + struct gf_poly *poly, unsigned int *roots) +{ + int cnt; + struct gf_poly *f1, *f2; + + switch (poly->deg) { + /* handle low degree polynomials with ad hoc techniques */ + case 1: + cnt = find_poly_deg1_roots(bch, poly, roots); + break; + case 2: + cnt = find_poly_deg2_roots(bch, poly, roots); + break; + case 3: + cnt = find_poly_deg3_roots(bch, poly, roots); + break; + case 4: + cnt = find_poly_deg4_roots(bch, poly, roots); + break; + default: + /* factor polynomial using Berlekamp Trace Algorithm (BTA) */ + cnt = 0; + if (poly->deg && (k <= GF_M(bch))) { + factor_polynomial(bch, k, poly, &f1, &f2); + if (f1) + cnt += find_poly_roots(bch, k+1, f1, roots); + if (f2) + cnt += find_poly_roots(bch, k+1, f2, roots+cnt); + } + break; + } + return cnt; +} + +#if defined(USE_CHIEN_SEARCH) +/* + * exhaustive root search (Chien) implementation - not used, included only for + * reference/comparison tests + */ +static int chien_search(struct bch_control *bch, unsigned int len, + struct gf_poly *p, unsigned int *roots) +{ + int m; + unsigned int i, j, syn, syn0, count = 0; + const unsigned int k = 8*len+bch->ecc_bits; + + /* use a log-based representation of polynomial */ + gf_poly_logrep(bch, p, bch->cache); + bch->cache[p->deg] = 0; + syn0 = gf_div(bch, p->c[0], p->c[p->deg]); + + for (i = GF_N(bch)-k+1; i <= GF_N(bch); i++) { + /* compute elp(a^i) */ + for (j = 1, syn = syn0; j <= p->deg; j++) { + m = bch->cache[j]; + if (m >= 0) + syn ^= a_pow(bch, m+j*i); + } + if (syn == 0) { + roots[count++] = GF_N(bch)-i; + if (count == p->deg) + break; + } + } + return (count == p->deg) ? count : 0; +} +#define find_poly_roots(_p, _k, _elp, _loc) chien_search(_p, len, _elp, _loc) +#endif /* USE_CHIEN_SEARCH */ + +/** + * decode_bch - decode received codeword and find bit error locations + * @bch: BCH control structure + * @data: received data, ignored if @calc_ecc is provided + * @len: data length in bytes, must always be provided + * @recv_ecc: received ecc, if NULL then assume it was XORed in @calc_ecc + * @calc_ecc: calculated ecc, if NULL then calc_ecc is computed from @data + * @syn: hw computed syndrome data (if NULL, syndrome is calculated) + * @errloc: output array of error locations + * + * Returns: + * The number of errors found, or -EBADMSG if decoding failed, or -EINVAL if + * invalid parameters were provided + * + * Depending on the available hw BCH support and the need to compute @calc_ecc + * separately (using encode_bch()), this function should be called with one of + * the following parameter configurations - + * + * by providing @data and @recv_ecc only: + * decode_bch(@bch, @data, @len, @recv_ecc, NULL, NULL, @errloc) + * + * by providing @recv_ecc and @calc_ecc: + * decode_bch(@bch, NULL, @len, @recv_ecc, @calc_ecc, NULL, @errloc) + * + * by providing ecc = recv_ecc XOR calc_ecc: + * decode_bch(@bch, NULL, @len, NULL, ecc, NULL, @errloc) + * + * by providing syndrome results @syn: + * decode_bch(@bch, NULL, @len, NULL, NULL, @syn, @errloc) + * + * Once decode_bch() has successfully returned with a positive value, error + * locations returned in array @errloc should be interpreted as follows - + * + * if (errloc[n] >= 8*len), then n-th error is located in ecc (no need for + * data correction) + * + * if (errloc[n] < 8*len), then n-th error is located in data and can be + * corrected with statement data[errloc[n]/8] ^= 1 << (errloc[n] % 8); + * + * Note that this function does not perform any data correction by itself, it + * merely indicates error locations. + */ +int decode_bch(struct bch_control *bch, const uint8_t *data, unsigned int len, + const uint8_t *recv_ecc, const uint8_t *calc_ecc, + const unsigned int *syn, unsigned int *errloc) +{ + const unsigned int ecc_words = BCH_ECC_WORDS(bch); + unsigned int nbits; + int i, err, nroots; + uint32_t sum; + + /* sanity check: make sure data length can be handled */ + if (8*len > (bch->n-bch->ecc_bits)) + return -EINVAL; + + /* if caller does not provide syndromes, compute them */ + if (!syn) { + if (!calc_ecc) { + /* compute received data ecc into an internal buffer */ + if (!data || !recv_ecc) + return -EINVAL; + encode_bch(bch, data, len, NULL); + } else { + /* load provided calculated ecc */ + load_ecc8(bch, bch->ecc_buf, calc_ecc); + } + /* load received ecc or assume it was XORed in calc_ecc */ + if (recv_ecc) { + load_ecc8(bch, bch->ecc_buf2, recv_ecc); + /* XOR received and calculated ecc */ + for (i = 0, sum = 0; i < (int)ecc_words; i++) { + bch->ecc_buf[i] ^= bch->ecc_buf2[i]; + sum |= bch->ecc_buf[i]; + } + if (!sum) + /* no error found */ + return 0; + } + compute_syndromes(bch, bch->ecc_buf, bch->syn); + syn = bch->syn; + } + + err = compute_error_locator_polynomial(bch, syn); + if (err > 0) { + nroots = find_poly_roots(bch, 1, bch->elp, errloc); + if (err != nroots) + err = -1; + } + if (err > 0) { + /* post-process raw error locations for easier correction */ + nbits = (len*8)+bch->ecc_bits; + for (i = 0; i < err; i++) { + if (errloc[i] >= nbits) { + err = -1; + break; + } + errloc[i] = nbits-1-errloc[i]; + errloc[i] = (errloc[i] & ~7)|(7-(errloc[i] & 7)); + } + } + return (err >= 0) ? err : -EBADMSG; +} +EXPORT_SYMBOL_GPL(decode_bch); + +/* + * generate Galois field lookup tables + */ +static int build_gf_tables(struct bch_control *bch, unsigned int poly) +{ + unsigned int i, x = 1; + const unsigned int k = 1 << deg(poly); + + /* primitive polynomial must be of degree m */ + if (k != (1u << GF_M(bch))) + return -1; + + for (i = 0; i < GF_N(bch); i++) { + bch->a_pow_tab[i] = x; + bch->a_log_tab[x] = i; + if (i && (x == 1)) + /* polynomial is not primitive (a^i=1 with 0a_pow_tab[GF_N(bch)] = 1; + bch->a_log_tab[0] = 0; + + return 0; +} + +/* + * compute generator polynomial remainder tables for fast encoding + */ +static void build_mod8_tables(struct bch_control *bch, const uint32_t *g) +{ + int i, j, b, d; + uint32_t data, hi, lo, *tab; + const int l = BCH_ECC_WORDS(bch); + const int plen = DIV_ROUND_UP(bch->ecc_bits+1, 32); + const int ecclen = DIV_ROUND_UP(bch->ecc_bits, 32); + + memset(bch->mod8_tab, 0, 4*256*l*sizeof(*bch->mod8_tab)); + + for (i = 0; i < 256; i++) { + /* p(X)=i is a small polynomial of weight <= 8 */ + for (b = 0; b < 4; b++) { + /* we want to compute (p(X).X^(8*b+deg(g))) mod g(X) */ + tab = bch->mod8_tab + (b*256+i)*l; + data = i << (8*b); + while (data) { + d = deg(data); + /* subtract X^d.g(X) from p(X).X^(8*b+deg(g)) */ + data ^= g[0] >> (31-d); + for (j = 0; j < ecclen; j++) { + hi = (d < 31) ? g[j] << (d+1) : 0; + lo = (j+1 < plen) ? + g[j+1] >> (31-d) : 0; + tab[j] ^= hi|lo; + } + } + } + } +} + +/* + * build a base for factoring degree 2 polynomials + */ +static int build_deg2_base(struct bch_control *bch) +{ + const int m = GF_M(bch); + int i, j, r; + unsigned int sum, x, y, remaining, ak = 0, xi[m]; + + /* find k s.t. Tr(a^k) = 1 and 0 <= k < m */ + for (i = 0; i < m; i++) { + for (j = 0, sum = 0; j < m; j++) + sum ^= a_pow(bch, i*(1 << j)); + + if (sum) { + ak = bch->a_pow_tab[i]; + break; + } + } + /* find xi, i=0..m-1 such that xi^2+xi = a^i+Tr(a^i).a^k */ + remaining = m; + memset(xi, 0, sizeof(xi)); + + for (x = 0; (x <= GF_N(bch)) && remaining; x++) { + y = gf_sqr(bch, x)^x; + for (i = 0; i < 2; i++) { + r = a_log(bch, y); + if (y && (r < m) && !xi[r]) { + bch->xi_tab[r] = x; + xi[r] = 1; + remaining--; + dbg("x%d = %x\n", r, x); + break; + } + y ^= ak; + } + } + /* should not happen but check anyway */ + return remaining ? -1 : 0; +} + +static void *bch_alloc(size_t size, int *err) +{ + void *ptr; + + ptr = kmalloc(size, GFP_KERNEL); + if (ptr == NULL) + *err = 1; + return ptr; +} + +/* + * compute generator polynomial for given (m,t) parameters. + */ +static uint32_t *compute_generator_polynomial(struct bch_control *bch) +{ + const unsigned int m = GF_M(bch); + const unsigned int t = GF_T(bch); + int n, err = 0; + unsigned int i, j, nbits, r, word, *roots; + struct gf_poly *g; + uint32_t *genpoly; + + g = bch_alloc(GF_POLY_SZ(m*t), &err); + roots = bch_alloc((bch->n+1)*sizeof(*roots), &err); + genpoly = bch_alloc(DIV_ROUND_UP(m*t+1, 32)*sizeof(*genpoly), &err); + + if (err) { + kfree(genpoly); + genpoly = NULL; + goto finish; + } + + /* enumerate all roots of g(X) */ + memset(roots , 0, (bch->n+1)*sizeof(*roots)); + for (i = 0; i < t; i++) { + for (j = 0, r = 2*i+1; j < m; j++) { + roots[r] = 1; + r = mod_s(bch, 2*r); + } + } + /* build generator polynomial g(X) */ + g->deg = 0; + g->c[0] = 1; + for (i = 0; i < GF_N(bch); i++) { + if (roots[i]) { + /* multiply g(X) by (X+root) */ + r = bch->a_pow_tab[i]; + g->c[g->deg+1] = 1; + for (j = g->deg; j > 0; j--) + g->c[j] = gf_mul(bch, g->c[j], r)^g->c[j-1]; + + g->c[0] = gf_mul(bch, g->c[0], r); + g->deg++; + } + } + /* store left-justified binary representation of g(X) */ + n = g->deg+1; + i = 0; + + while (n > 0) { + nbits = (n > 32) ? 32 : n; + for (j = 0, word = 0; j < nbits; j++) { + if (g->c[n-1-j]) + word |= 1u << (31-j); + } + genpoly[i++] = word; + n -= nbits; + } + bch->ecc_bits = g->deg; + +finish: + kfree(g); + kfree(roots); + + return genpoly; +} + +/** + * init_bch - initialize a BCH encoder/decoder + * @m: Galois field order, should be in the range 5-15 + * @t: maximum error correction capability, in bits + * @prim_poly: user-provided primitive polynomial (or 0 to use default) + * + * Returns: + * a newly allocated BCH control structure if successful, NULL otherwise + * + * This initialization can take some time, as lookup tables are built for fast + * encoding/decoding; make sure not to call this function from a time critical + * path. Usually, init_bch() should be called on module/driver init and + * free_bch() should be called to release memory on exit. + * + * You may provide your own primitive polynomial of degree @m in argument + * @prim_poly, or let init_bch() use its default polynomial. + * + * Once init_bch() has successfully returned a pointer to a newly allocated + * BCH control structure, ecc length in bytes is given by member @ecc_bytes of + * the structure. + */ +struct bch_control *init_bch(int m, int t, unsigned int prim_poly) +{ + int err = 0; + unsigned int i, words; + uint32_t *genpoly; + struct bch_control *bch = NULL; + + const int min_m = 5; + const int max_m = 15; + + /* default primitive polynomials */ + static const unsigned int prim_poly_tab[] = { + 0x25, 0x43, 0x83, 0x11d, 0x211, 0x409, 0x805, 0x1053, 0x201b, + 0x402b, 0x8003, + }; + +#if defined(CONFIG_BCH_CONST_PARAMS) + if ((m != (CONFIG_BCH_CONST_M)) || (t != (CONFIG_BCH_CONST_T))) { + printk(KERN_ERR "bch encoder/decoder was configured to support " + "parameters m=%d, t=%d only!\n", + CONFIG_BCH_CONST_M, CONFIG_BCH_CONST_T); + goto fail; + } +#endif + if ((m < min_m) || (m > max_m)) + /* + * values of m greater than 15 are not currently supported; + * supporting m > 15 would require changing table base type + * (uint16_t) and a small patch in matrix transposition + */ + goto fail; + + /* sanity checks */ + if ((t < 1) || (m*t >= ((1 << m)-1))) + /* invalid t value */ + goto fail; + + /* select a primitive polynomial for generating GF(2^m) */ + if (prim_poly == 0) + prim_poly = prim_poly_tab[m-min_m]; + + bch = kzalloc(sizeof(*bch), GFP_KERNEL); + if (bch == NULL) + goto fail; + + bch->m = m; + bch->t = t; + bch->n = (1 << m)-1; + words = DIV_ROUND_UP(m*t, 32); + bch->ecc_bytes = DIV_ROUND_UP(m*t, 8); + bch->a_pow_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_pow_tab), &err); + bch->a_log_tab = bch_alloc((1+bch->n)*sizeof(*bch->a_log_tab), &err); + bch->mod8_tab = bch_alloc(words*1024*sizeof(*bch->mod8_tab), &err); + bch->ecc_buf = bch_alloc(words*sizeof(*bch->ecc_buf), &err); + bch->ecc_buf2 = bch_alloc(words*sizeof(*bch->ecc_buf2), &err); + bch->xi_tab = bch_alloc(m*sizeof(*bch->xi_tab), &err); + bch->syn = bch_alloc(2*t*sizeof(*bch->syn), &err); + bch->cache = bch_alloc(2*t*sizeof(*bch->cache), &err); + bch->elp = bch_alloc((t+1)*sizeof(struct gf_poly_deg1), &err); + + for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++) + bch->poly_2t[i] = bch_alloc(GF_POLY_SZ(2*t), &err); + + if (err) + goto fail; + + err = build_gf_tables(bch, prim_poly); + if (err) + goto fail; + + /* use generator polynomial for computing encoding tables */ + genpoly = compute_generator_polynomial(bch); + if (genpoly == NULL) + goto fail; + + build_mod8_tables(bch, genpoly); + kfree(genpoly); + + err = build_deg2_base(bch); + if (err) + goto fail; + + return bch; + +fail: + free_bch(bch); + return NULL; +} +EXPORT_SYMBOL_GPL(init_bch); + +/** + * free_bch - free the BCH control structure + * @bch: BCH control structure to release + */ +void free_bch(struct bch_control *bch) +{ + unsigned int i; + + if (bch) { + kfree(bch->a_pow_tab); + kfree(bch->a_log_tab); + kfree(bch->mod8_tab); + kfree(bch->ecc_buf); + kfree(bch->ecc_buf2); + kfree(bch->xi_tab); + kfree(bch->syn); + kfree(bch->cache); + kfree(bch->elp); + + for (i = 0; i < ARRAY_SIZE(bch->poly_2t); i++) + kfree(bch->poly_2t[i]); + + kfree(bch); + } +} +EXPORT_SYMBOL_GPL(free_bch); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ivan Djelic "); +MODULE_DESCRIPTION("Binary BCH encoder/decoder"); -- cgit v1.2.3 From 1065cda8a1a57d0d3bfccdce1a655a84439d24e0 Mon Sep 17 00:00:00 2001 From: Steffen Sledz Date: Thu, 10 Mar 2011 09:05:12 +0100 Subject: mtd: cfi: add support for AMIC flashes (e.g. A29L160AT) Signed-off-by: Steffen Sledz Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0002.c | 1 + include/linux/mtd/cfi.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index 7e9c4e9c274a..f9a5331e9445 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -349,6 +349,7 @@ static struct cfi_fixup cfi_fixup_table[] = { { CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri }, #ifdef AMD_BOOTLOC_BUG { CFI_MFR_AMD, CFI_ID_ANY, fixup_amd_bootblock }, + { CFI_MFR_AMIC, CFI_ID_ANY, fixup_amd_bootblock }, { CFI_MFR_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock }, #endif { CFI_MFR_AMD, 0x0050, fixup_use_secsi }, diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index a9baee6864af..0d823f2dd667 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -535,6 +535,7 @@ struct cfi_fixup { #define CFI_MFR_CONTINUATION 0x007F #define CFI_MFR_AMD 0x0001 +#define CFI_MFR_AMIC 0x0037 #define CFI_MFR_ATMEL 0x001F #define CFI_MFR_EON 0x001C #define CFI_MFR_FUJITSU 0x0004 -- cgit v1.2.3 From 193bd40026443835e1b96c79d5efe559d01509ae Mon Sep 17 00:00:00 2001 From: Ivan Djelic Date: Fri, 11 Mar 2011 11:05:33 +0100 Subject: mtd: nand: add software BCH ECC support This patch adds software BCH ECC support to mtd, in order to handle recent NAND device ecc requirements (4 bits or more). It does so by adding a new ecc mode (NAND_ECC_SOFT_BCH) for use by board drivers, and a new Kconfig option to enable BCH support. It relies on the generic BCH library introduced in a previous patch. When a board driver uses mode NAND_ECC_SOFT_BCH, it should also set fields chip->ecc.size and chip->ecc.bytes to select BCH ecc data size and required error correction capability. See nand_bch_init() documentation for details. It has been tested on the following platforms using mtd-utils, UBI and UBIFS: x86 (with nandsim), arm926ejs. Signed-off-by: Ivan Djelic Signed-off-by: David Woodhouse --- drivers/mtd/nand/Kconfig | 15 +++ drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 40 ++++++- drivers/mtd/nand/nand_bch.c | 243 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 3 + include/linux/mtd/nand_bch.h | 72 +++++++++++++ 6 files changed, 373 insertions(+), 1 deletion(-) create mode 100644 drivers/mtd/nand/nand_bch.c create mode 100644 include/linux/mtd/nand_bch.h (limited to 'include') diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index c89592239bc7..78205ac2b10f 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -31,6 +31,21 @@ config MTD_NAND_VERIFY_WRITE device thinks the write was successful, a bit could have been flipped accidentally due to device wear or something else. +config MTD_NAND_BCH + tristate + select BCH + depends on MTD_NAND_ECC_BCH + default MTD_NAND + +config MTD_NAND_ECC_BCH + bool "Support software BCH ECC" + default n + help + This enables support for software BCH error correction. Binary BCH + codes are more powerful and cpu intensive than traditional Hamming + ECC codes. They are used with NAND devices requiring more than 1 bit + of error correction. + config MTD_SM_COMMON tristate default n diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 8ad6faec72cb..5745d831168e 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_MTD_NAND) += nand.o obj-$(CONFIG_MTD_NAND_ECC) += nand_ecc.o +obj-$(CONFIG_MTD_NAND_BCH) += nand_bch.o obj-$(CONFIG_MTD_NAND_IDS) += nand_ids.o obj-$(CONFIG_MTD_SM_COMMON) += sm_common.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index da7604050347..85cfc061d41c 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -3248,7 +3249,7 @@ int nand_scan_tail(struct mtd_info *mtd) /* * If no default placement scheme is given, select an appropriate one */ - if (!chip->ecc.layout) { + if (!chip->ecc.layout && (chip->ecc.mode != NAND_ECC_SOFT_BCH)) { switch (mtd->oobsize) { case 8: chip->ecc.layout = &nand_oob_8; @@ -3351,6 +3352,40 @@ int nand_scan_tail(struct mtd_info *mtd) chip->ecc.bytes = 3; break; + case NAND_ECC_SOFT_BCH: + if (!mtd_nand_has_bch()) { + printk(KERN_WARNING "CONFIG_MTD_ECC_BCH not enabled\n"); + BUG(); + } + chip->ecc.calculate = nand_bch_calculate_ecc; + chip->ecc.correct = nand_bch_correct_data; + chip->ecc.read_page = nand_read_page_swecc; + chip->ecc.read_subpage = nand_read_subpage; + chip->ecc.write_page = nand_write_page_swecc; + chip->ecc.read_page_raw = nand_read_page_raw; + chip->ecc.write_page_raw = nand_write_page_raw; + chip->ecc.read_oob = nand_read_oob_std; + chip->ecc.write_oob = nand_write_oob_std; + /* + * Board driver should supply ecc.size and ecc.bytes values to + * select how many bits are correctable; see nand_bch_init() + * for details. + * Otherwise, default to 4 bits for large page devices + */ + if (!chip->ecc.size && (mtd->oobsize >= 64)) { + chip->ecc.size = 512; + chip->ecc.bytes = 7; + } + chip->ecc.priv = nand_bch_init(mtd, + chip->ecc.size, + chip->ecc.bytes, + &chip->ecc.layout); + if (!chip->ecc.priv) { + printk(KERN_WARNING "BCH ECC initialization failed!\n"); + BUG(); + } + break; + case NAND_ECC_NONE: printk(KERN_WARNING "NAND_ECC_NONE selected by board driver. " "This is not recommended !!\n"); @@ -3501,6 +3536,9 @@ void nand_release(struct mtd_info *mtd) { struct nand_chip *chip = mtd->priv; + if (chip->ecc.mode == NAND_ECC_SOFT_BCH) + nand_bch_free((struct nand_bch_control *)chip->ecc.priv); + #ifdef CONFIG_MTD_PARTITIONS /* Deregister partitions */ del_mtd_partitions(mtd); diff --git a/drivers/mtd/nand/nand_bch.c b/drivers/mtd/nand/nand_bch.c new file mode 100644 index 000000000000..0f931e757116 --- /dev/null +++ b/drivers/mtd/nand/nand_bch.c @@ -0,0 +1,243 @@ +/* + * This file provides ECC correction for more than 1 bit per block of data, + * using binary BCH codes. It relies on the generic BCH library lib/bch.c. + * + * Copyright © 2011 Ivan Djelic + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this file; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct nand_bch_control - private NAND BCH control structure + * @bch: BCH control structure + * @ecclayout: private ecc layout for this BCH configuration + * @errloc: error location array + * @eccmask: XOR ecc mask, allows erased pages to be decoded as valid + */ +struct nand_bch_control { + struct bch_control *bch; + struct nand_ecclayout ecclayout; + unsigned int *errloc; + unsigned char *eccmask; +}; + +/** + * nand_bch_calculate_ecc - [NAND Interface] Calculate ECC for data block + * @mtd: MTD block structure + * @buf: input buffer with raw data + * @code: output buffer with ECC + */ +int nand_bch_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf, + unsigned char *code) +{ + const struct nand_chip *chip = mtd->priv; + struct nand_bch_control *nbc = chip->ecc.priv; + unsigned int i; + + memset(code, 0, chip->ecc.bytes); + encode_bch(nbc->bch, buf, chip->ecc.size, code); + + /* apply mask so that an erased page is a valid codeword */ + for (i = 0; i < chip->ecc.bytes; i++) + code[i] ^= nbc->eccmask[i]; + + return 0; +} +EXPORT_SYMBOL(nand_bch_calculate_ecc); + +/** + * nand_bch_correct_data - [NAND Interface] Detect and correct bit error(s) + * @mtd: MTD block structure + * @buf: raw data read from the chip + * @read_ecc: ECC from the chip + * @calc_ecc: the ECC calculated from raw data + * + * Detect and correct bit errors for a data byte block + */ +int nand_bch_correct_data(struct mtd_info *mtd, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + const struct nand_chip *chip = mtd->priv; + struct nand_bch_control *nbc = chip->ecc.priv; + unsigned int *errloc = nbc->errloc; + int i, count; + + count = decode_bch(nbc->bch, NULL, chip->ecc.size, read_ecc, calc_ecc, + NULL, errloc); + if (count > 0) { + for (i = 0; i < count; i++) { + if (errloc[i] < (chip->ecc.size*8)) + /* error is located in data, correct it */ + buf[errloc[i] >> 3] ^= (1 << (errloc[i] & 7)); + /* else error in ecc, no action needed */ + + DEBUG(MTD_DEBUG_LEVEL0, "%s: corrected bitflip %u\n", + __func__, errloc[i]); + } + } else if (count < 0) { + printk(KERN_ERR "ecc unrecoverable error\n"); + count = -1; + } + return count; +} +EXPORT_SYMBOL(nand_bch_correct_data); + +/** + * nand_bch_init - [NAND Interface] Initialize NAND BCH error correction + * @mtd: MTD block structure + * @eccsize: ecc block size in bytes + * @eccbytes: ecc length in bytes + * @ecclayout: output default layout + * + * Returns: + * a pointer to a new NAND BCH control structure, or NULL upon failure + * + * Initialize NAND BCH error correction. Parameters @eccsize and @eccbytes + * are used to compute BCH parameters m (Galois field order) and t (error + * correction capability). @eccbytes should be equal to the number of bytes + * required to store m*t bits, where m is such that 2^m-1 > @eccsize*8. + * + * Example: to configure 4 bit correction per 512 bytes, you should pass + * @eccsize = 512 (thus, m=13 is the smallest integer such that 2^m-1 > 512*8) + * @eccbytes = 7 (7 bytes are required to store m*t = 13*4 = 52 bits) + */ +struct nand_bch_control * +nand_bch_init(struct mtd_info *mtd, unsigned int eccsize, unsigned int eccbytes, + struct nand_ecclayout **ecclayout) +{ + unsigned int m, t, eccsteps, i; + struct nand_ecclayout *layout; + struct nand_bch_control *nbc = NULL; + unsigned char *erased_page; + + if (!eccsize || !eccbytes) { + printk(KERN_WARNING "ecc parameters not supplied\n"); + goto fail; + } + + m = fls(1+8*eccsize); + t = (eccbytes*8)/m; + + nbc = kzalloc(sizeof(*nbc), GFP_KERNEL); + if (!nbc) + goto fail; + + nbc->bch = init_bch(m, t, 0); + if (!nbc->bch) + goto fail; + + /* verify that eccbytes has the expected value */ + if (nbc->bch->ecc_bytes != eccbytes) { + printk(KERN_WARNING "invalid eccbytes %u, should be %u\n", + eccbytes, nbc->bch->ecc_bytes); + goto fail; + } + + eccsteps = mtd->writesize/eccsize; + + /* if no ecc placement scheme was provided, build one */ + if (!*ecclayout) { + + /* handle large page devices only */ + if (mtd->oobsize < 64) { + printk(KERN_WARNING "must provide an oob scheme for " + "oobsize %d\n", mtd->oobsize); + goto fail; + } + + layout = &nbc->ecclayout; + layout->eccbytes = eccsteps*eccbytes; + + /* reserve 2 bytes for bad block marker */ + if (layout->eccbytes+2 > mtd->oobsize) { + printk(KERN_WARNING "no suitable oob scheme available " + "for oobsize %d eccbytes %u\n", mtd->oobsize, + eccbytes); + goto fail; + } + /* put ecc bytes at oob tail */ + for (i = 0; i < layout->eccbytes; i++) + layout->eccpos[i] = mtd->oobsize-layout->eccbytes+i; + + layout->oobfree[0].offset = 2; + layout->oobfree[0].length = mtd->oobsize-2-layout->eccbytes; + + *ecclayout = layout; + } + + /* sanity checks */ + if (8*(eccsize+eccbytes) >= (1 << m)) { + printk(KERN_WARNING "eccsize %u is too large\n", eccsize); + goto fail; + } + if ((*ecclayout)->eccbytes != (eccsteps*eccbytes)) { + printk(KERN_WARNING "invalid ecc layout\n"); + goto fail; + } + + nbc->eccmask = kmalloc(eccbytes, GFP_KERNEL); + nbc->errloc = kmalloc(t*sizeof(*nbc->errloc), GFP_KERNEL); + if (!nbc->eccmask || !nbc->errloc) + goto fail; + /* + * compute and store the inverted ecc of an erased ecc block + */ + erased_page = kmalloc(eccsize, GFP_KERNEL); + if (!erased_page) + goto fail; + + memset(erased_page, 0xff, eccsize); + memset(nbc->eccmask, 0, eccbytes); + encode_bch(nbc->bch, erased_page, eccsize, nbc->eccmask); + kfree(erased_page); + + for (i = 0; i < eccbytes; i++) + nbc->eccmask[i] ^= 0xff; + + return nbc; +fail: + nand_bch_free(nbc); + return NULL; +} +EXPORT_SYMBOL(nand_bch_init); + +/** + * nand_bch_free - [NAND Interface] Release NAND BCH ECC resources + * @nbc: NAND BCH control structure + */ +void nand_bch_free(struct nand_bch_control *nbc) +{ + if (nbc) { + free_bch(nbc->bch); + kfree(nbc->errloc); + kfree(nbc->eccmask); + kfree(nbc); + } +} +EXPORT_SYMBOL(nand_bch_free); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ivan Djelic "); +MODULE_DESCRIPTION("NAND software BCH ECC support"); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 1f489b247a29..ae67ef56a8f5 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -140,6 +140,7 @@ typedef enum { NAND_ECC_HW, NAND_ECC_HW_SYNDROME, NAND_ECC_HW_OOB_FIRST, + NAND_ECC_SOFT_BCH, } nand_ecc_modes_t; /* @@ -339,6 +340,7 @@ struct nand_hw_control { * @prepad: padding information for syndrome based ecc generators * @postpad: padding information for syndrome based ecc generators * @layout: ECC layout control struct pointer + * @priv: pointer to private ecc control data * @hwctl: function to control hardware ecc generator. Must only * be provided if an hardware ECC is available * @calculate: function for ecc calculation or readback from ecc hardware @@ -362,6 +364,7 @@ struct nand_ecc_ctrl { int prepad; int postpad; struct nand_ecclayout *layout; + void *priv; void (*hwctl)(struct mtd_info *mtd, int mode); int (*calculate)(struct mtd_info *mtd, const uint8_t *dat, uint8_t *ecc_code); diff --git a/include/linux/mtd/nand_bch.h b/include/linux/mtd/nand_bch.h new file mode 100644 index 000000000000..74acf5367556 --- /dev/null +++ b/include/linux/mtd/nand_bch.h @@ -0,0 +1,72 @@ +/* + * Copyright © 2011 Ivan Djelic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This file is the header for the NAND BCH ECC implementation. + */ + +#ifndef __MTD_NAND_BCH_H__ +#define __MTD_NAND_BCH_H__ + +struct mtd_info; +struct nand_bch_control; + +#if defined(CONFIG_MTD_NAND_ECC_BCH) + +static inline int mtd_nand_has_bch(void) { return 1; } + +/* + * Calculate BCH ecc code + */ +int nand_bch_calculate_ecc(struct mtd_info *mtd, const u_char *dat, + u_char *ecc_code); + +/* + * Detect and correct bit errors + */ +int nand_bch_correct_data(struct mtd_info *mtd, u_char *dat, u_char *read_ecc, + u_char *calc_ecc); +/* + * Initialize BCH encoder/decoder + */ +struct nand_bch_control * +nand_bch_init(struct mtd_info *mtd, unsigned int eccsize, + unsigned int eccbytes, struct nand_ecclayout **ecclayout); +/* + * Release BCH encoder/decoder resources + */ +void nand_bch_free(struct nand_bch_control *nbc); + +#else /* !CONFIG_MTD_NAND_ECC_BCH */ + +static inline int mtd_nand_has_bch(void) { return 0; } + +static inline int +nand_bch_calculate_ecc(struct mtd_info *mtd, const u_char *dat, + u_char *ecc_code) +{ + return -1; +} + +static inline int +nand_bch_correct_data(struct mtd_info *mtd, unsigned char *buf, + unsigned char *read_ecc, unsigned char *calc_ecc) +{ + return -1; +} + +static inline struct nand_bch_control * +nand_bch_init(struct mtd_info *mtd, unsigned int eccsize, + unsigned int eccbytes, struct nand_ecclayout **ecclayout) +{ + return NULL; +} + +static inline void nand_bch_free(struct nand_bch_control *nbc) {} + +#endif /* CONFIG_MTD_NAND_ECC_BCH */ + +#endif /* __MTD_NAND_BCH_H__ */ -- cgit v1.2.3 From 19b01b5fbf0b78930b3b06ee6080539c17b5d1fd Mon Sep 17 00:00:00 2001 From: Ilija Hadzic Date: Fri, 18 Mar 2011 16:58:04 -0500 Subject: drm/kernel: vblank wait on crtc > 1 Below is a patch against drm-next branch of 2.6.38-rc8+ kernel that adds the capability to wait on vblank events for CRTCs that are greater than 1 and thus cannot be represented with primary/secondary flags in the legacy interface. It was discussed on the dri-devel list in these two threads: http://lists.freedesktop.org/archives/dri-devel/2011-March/009009.html http://lists.freedesktop.org/archives/dri-devel/2011-March/009025.html This patch extends the interface to drm_wait_vblank ioctl so that crtc>1 can be represented. It also adds a new capability to drm_getcap ioctl so that the user space can check whether the new interface to drm_wait_vblank is supported (and fall back to the legacy interface if not) Signed-off-by: Ilija Hadzic Reviewed-by: Mario Kleiner Acked-by: Mario Kleiner Reviewed-by: Alex Deucher Tested-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_ioctl.c | 3 +++ drivers/gpu/drm/drm_irq.c | 15 ++++++++++----- include/drm/drm.h | 3 +++ 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 7f6912a16761..3617b4c4bb57 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -280,6 +280,9 @@ int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv) if (dev->driver->dumb_create) req->value = 1; break; + case DRM_CAP_HIGH_CRTC: + req->value = 1; + break; default: return -EINVAL; } diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c index a34ef97d3c81..741457bd1c46 100644 --- a/drivers/gpu/drm/drm_irq.c +++ b/drivers/gpu/drm/drm_irq.c @@ -1125,7 +1125,7 @@ int drm_wait_vblank(struct drm_device *dev, void *data, { union drm_wait_vblank *vblwait = data; int ret = 0; - unsigned int flags, seq, crtc; + unsigned int flags, seq, crtc, high_crtc; if ((!drm_dev_to_irq(dev)) || (!dev->irq_enabled)) return -EINVAL; @@ -1134,16 +1134,21 @@ int drm_wait_vblank(struct drm_device *dev, void *data, return -EINVAL; if (vblwait->request.type & - ~(_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK)) { + ~(_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | + _DRM_VBLANK_HIGH_CRTC_MASK)) { DRM_ERROR("Unsupported type value 0x%x, supported mask 0x%x\n", vblwait->request.type, - (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK)); + (_DRM_VBLANK_TYPES_MASK | _DRM_VBLANK_FLAGS_MASK | + _DRM_VBLANK_HIGH_CRTC_MASK)); return -EINVAL; } flags = vblwait->request.type & _DRM_VBLANK_FLAGS_MASK; - crtc = flags & _DRM_VBLANK_SECONDARY ? 1 : 0; - + high_crtc = (vblwait->request.type & _DRM_VBLANK_HIGH_CRTC_MASK); + if (high_crtc) + crtc = high_crtc >> _DRM_VBLANK_HIGH_CRTC_SHIFT; + else + crtc = flags & _DRM_VBLANK_SECONDARY ? 1 : 0; if (crtc >= dev->num_crtcs) return -EINVAL; diff --git a/include/drm/drm.h b/include/drm/drm.h index 9ac431396176..99cd07433fab 100644 --- a/include/drm/drm.h +++ b/include/drm/drm.h @@ -469,6 +469,8 @@ enum drm_vblank_seq_type { _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ }; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 16 +#define _DRM_VBLANK_HIGH_CRTC_MASK 0x001F0000 #define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) #define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ @@ -753,6 +755,7 @@ struct drm_event_vblank { }; #define DRM_CAP_DUMB_BUFFER 0x1 +#define DRM_CAP_HIGH_CRTC 0x2 /* typedef area */ #ifndef __KERNEL__ -- cgit v1.2.3 From c2cc7028e41c76e44b6e247c4b495c7523b23c87 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 20 Mar 2011 20:08:48 -0400 Subject: jbd2: add the b_cow_tid field to journal_head struct The b_cow_tid field will be used by the ext4 snapshots code to store the transaction id when the buffer was last cowed. Merging this patch to mainline will allow users to test ext4 snapshots as a standalone module, without the need to patch and install a development kernel. On 64bit machines this field uses fills in a padding "hole" and does not increase the size of the struct. On a 32bit machine this patch increases the size of the struct from 60 to 64 bytes. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" --- include/linux/journal-head.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index 525aac3c97df..44e95d0a721f 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -40,6 +40,13 @@ struct journal_head { */ unsigned b_modified; + /* + * This feild tracks the last transaction id in which this buffer + * has been cowed + * [jbd_lock_bh_state()] + */ + unsigned b_cow_tid; + /* * Copy of the buffer data frozen for writing to the log. * [jbd_lock_bh_state()] -- cgit v1.2.3 From 93737456d68ddcb86232f669b83da673dd12e351 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Sun, 20 Mar 2011 21:13:43 -0400 Subject: jbd2: add COW fields to struct jbd2_journal_handle Add fields needed for the copy-on-write ext4 development work. The h_cowing flag is used by ext4 snapshots code to mark the task in COWING state. The h_XXX_credits fields are used to track buffer credits usage (accounted by COW and non-COW operations). The h_cow_XXX fields are used as per task debugging counters. Merging this commit into mainline will allow users to test ext4 snapshots as a standalone module, without the need to patch and install a development kernel. Signed-off-by: Amir Goldstein Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 27e79c27ba08..a32dcaec04e1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -432,13 +432,35 @@ struct jbd2_journal_handle int h_err; /* Flags [no locking] */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ - unsigned int h_aborted: 1; /* fatal error on handle */ + unsigned int h_sync:1; /* sync-on-close */ + unsigned int h_jdata:1; /* force data journaling */ + unsigned int h_aborted:1; /* fatal error on handle */ + unsigned int h_cowing:1; /* COWing block to snapshot */ + + /* Number of buffers requested by user: + * (before adding the COW credits factor) */ + unsigned int h_base_credits:14; + + /* Number of buffers the user is allowed to dirty: + * (counts only buffers dirtied when !h_cowing) */ + unsigned int h_user_credits:14; + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map h_lockdep_map; #endif + +#ifdef CONFIG_JBD2_DEBUG + /* COW debugging counters: */ + unsigned int h_cow_moved; /* blocks moved to snapshot */ + unsigned int h_cow_copied; /* blocks copied to snapshot */ + unsigned int h_cow_ok_jh; /* blocks already COWed during current + transaction */ + unsigned int h_cow_ok_bitmap; /* blocks not set in COW bitmap */ + unsigned int h_cow_ok_mapped;/* blocks already mapped in snapshot */ + unsigned int h_cow_bitmaps; /* COW bitmaps created */ + unsigned int h_cow_excluded; /* blocks set in exclude bitmap */ +#endif }; -- cgit v1.2.3 From 8b8bae901ce23addbdcdb54fa1696fb2d049feb5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 5 Mar 2011 13:21:51 +0100 Subject: PCI/ACPI: Report ASPM support to BIOS if not disabled from command line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to distinguish the situation in which ASPM support is disabled from the command line or through .config from the situation in which it is disabled, because the hardware or BIOS can't handle it. In the former case we should not report ASPM support to the BIOS through ACPI _OSC, but in the latter case we should do that. Introduce pcie_aspm_support_enabled() that can be used by acpi_pci_root_add() to determine whether or not it should report ASPM support to the BIOS through _OSC. Cc: stable@kernel.org References: https://bugzilla.kernel.org/show_bug.cgi?id=29722 References: https://bugzilla.kernel.org/show_bug.cgi?id=20232 Reported-and-tested-by: Ortwin Glück Reviewed-by: Kenji Kaneshige Tested-by: Kenji Kaneshige Signed-off-by: Rafael J. Wysocki Signed-off-by: Jesse Barnes --- drivers/acpi/pci_root.c | 2 +- drivers/pci/pcie/aspm.c | 7 +++++++ include/linux/pci.h | 7 +++---- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index 85249395623b..c7358dd68b31 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -564,7 +564,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device) /* Indicate support for various _OSC capabilities. */ if (pci_ext_cfg_avail(root->bus->self)) flags |= OSC_EXT_PCI_CONFIG_SUPPORT; - if (pcie_aspm_enabled()) + if (pcie_aspm_support_enabled()) flags |= OSC_ACTIVE_STATE_PWR_SUPPORT | OSC_CLOCK_PWR_CAPABILITY_SUPPORT; if (pci_msi_enabled()) diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 3188cd96b338..bbdb4fd85b9c 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -69,6 +69,7 @@ struct pcie_link_state { }; static int aspm_disabled, aspm_force, aspm_clear_state; +static bool aspm_support_enabled = true; static DEFINE_MUTEX(aspm_lock); static LIST_HEAD(link_list); @@ -896,6 +897,7 @@ static int __init pcie_aspm_disable(char *str) { if (!strcmp(str, "off")) { aspm_disabled = 1; + aspm_support_enabled = false; printk(KERN_INFO "PCIe ASPM is disabled\n"); } else if (!strcmp(str, "force")) { aspm_force = 1; @@ -930,3 +932,8 @@ int pcie_aspm_enabled(void) } EXPORT_SYMBOL(pcie_aspm_enabled); +bool pcie_aspm_support_enabled(void) +{ + return aspm_support_enabled; +} +EXPORT_SYMBOL(pcie_aspm_support_enabled); diff --git a/include/linux/pci.h b/include/linux/pci.h index 16c9f2e61977..96f70d7e058d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1002,12 +1002,11 @@ extern bool pcie_ports_auto; #endif #ifndef CONFIG_PCIEASPM -static inline int pcie_aspm_enabled(void) -{ - return 0; -} +static inline int pcie_aspm_enabled(void) { return 0; } +static inline bool pcie_aspm_support_enabled(void) { return false; } #else extern int pcie_aspm_enabled(void); +extern bool pcie_aspm_support_enabled(void); #endif #ifdef CONFIG_PCIEAER -- cgit v1.2.3 From 1a680b7c325882188865f05b9a88d32f75f26495 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Mon, 21 Mar 2011 03:29:08 +0000 Subject: PCI: PCIe links may not get configured for ASPM under POWERSAVE mode v3 -> v2: Moved ASPM enabling logic to pci_set_power_state() v2 -> v1: Preserved the logic in pci_raw_set_power_state() : Added ASPM enabling logic after scanning Root Bridge : http://marc.info/?l=linux-pci&m=130046996216391&w=2 v1 : http://marc.info/?l=linux-pci&m=130013164703283&w=2 The assumption made in commit 41cd766b065970ff6f6c89dd1cf55fa706c84a3d (PCI: Don't enable aspm before drivers have had a chance to veto it) that pci_enable_device() will result in re-configuring ASPM when aspm_policy is POWERSAVE is no longer valid. This is due to commit 97c145f7c87453cec90e91238fba5fe2c1561b32 (PCI: read current power state at enable time) which resets dev->current_state to D0. Due to this the call to pcie_aspm_pm_state_change() is never made. Note the equality check (below) that returns early: ./drivers/pci/pci.c: pci_raw_set_pci_power_state() 546 /* Check if we're already there */ 547 if (dev->current_state == state) 548 return 0; Therefore OSPM never configures the PCIe links for ASPM to turn them "on". Fix it by configuring ASPM from the pci_enable_device() code path. This also allows a driver such as the e1000e networking driver a chance to disable ASPM (L0s, L1), if need be, prior to enabling the device. A driver may perform this action if the device is known to mis-behave wrt ASPM. Signed-off-by: Naga Chumbalkar Acked-by: Rafael J. Wysocki Cc: Matthew Garrett Signed-off-by: Jesse Barnes --- drivers/pci/pci.c | 6 ++++++ drivers/pci/pcie/aspm.c | 22 ++++++++++++++++++++++ include/linux/pci-aspm.h | 4 ++++ 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b714d787bddd..2472e7177b4b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -740,6 +740,12 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state) if (!__pci_complete_power_transition(dev, state)) error = 0; + /* + * When aspm_policy is "powersave" this call ensures + * that ASPM is configured. + */ + if (!error && dev->bus->self) + pcie_aspm_powersave_config_link(dev->bus->self); return error; } diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index bbdb4fd85b9c..e61b82e611c2 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -708,6 +708,28 @@ void pcie_aspm_pm_state_change(struct pci_dev *pdev) up_read(&pci_bus_sem); } +void pcie_aspm_powersave_config_link(struct pci_dev *pdev) +{ + struct pcie_link_state *link = pdev->link_state; + + if (aspm_disabled || !pci_is_pcie(pdev) || !link) + return; + + if (aspm_policy != POLICY_POWERSAVE) + return; + + if ((pdev->pcie_type != PCI_EXP_TYPE_ROOT_PORT) && + (pdev->pcie_type != PCI_EXP_TYPE_DOWNSTREAM)) + return; + + down_read(&pci_bus_sem); + mutex_lock(&aspm_lock); + pcie_config_aspm_path(link); + pcie_set_clkpm(link, policy_to_clkpm_state(link)); + mutex_unlock(&aspm_lock); + up_read(&pci_bus_sem); +} + /* * pci_disable_link_state - disable pci device's link state, so the link will * never enter specific states diff --git a/include/linux/pci-aspm.h b/include/linux/pci-aspm.h index ce6810512c66..67cb3ae38016 100644 --- a/include/linux/pci-aspm.h +++ b/include/linux/pci-aspm.h @@ -26,6 +26,7 @@ extern void pcie_aspm_init_link_state(struct pci_dev *pdev); extern void pcie_aspm_exit_link_state(struct pci_dev *pdev); extern void pcie_aspm_pm_state_change(struct pci_dev *pdev); +extern void pcie_aspm_powersave_config_link(struct pci_dev *pdev); extern void pci_disable_link_state(struct pci_dev *pdev, int state); extern void pcie_clear_aspm(void); extern void pcie_no_aspm(void); @@ -39,6 +40,9 @@ static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev) { } +static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) +{ +} static inline void pci_disable_link_state(struct pci_dev *pdev, int state) { } -- cgit v1.2.3 From 858022aa6fad90ec86c567cbf54682a61dd39a01 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:33:02 -0700 Subject: wireless: fix 80211 kernel-doc warnings Fix many of each of these warnings: Warning(include/net/cfg80211.h:519): No description found for parameter 'rxrate' Warning(include/net/mac80211.h:1163): bad line: Signed-off-by: Randy Dunlap Cc: Johannes Berg Signed-off-by: John W. Linville --- include/net/cfg80211.h | 3 ++- include/net/mac80211.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 60f7876b6da8..b2b9d28cb4ab 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -486,7 +486,8 @@ struct rate_info { * @plink_state: mesh peer link state * @signal: signal strength of last received packet in dBm * @signal_avg: signal strength average in dBm - * @txrate: current unicast bitrate to this station + * @txrate: current unicast bitrate from this station + * @rxrate: current unicast bitrate to this station * @rx_packets: packets received from this station * @tx_packets: packets transmitted to this station * @tx_retries: cumulative retry counts diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8650e7bf2ed0..cefe1b37c493 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1160,7 +1160,7 @@ enum ieee80211_hw_flags { * @napi_weight: weight used for NAPI polling. You must specify an * appropriate value here if a napi_poll operation is provided * by your driver. - + * * @max_rx_aggregation_subframes: maximum buffer size (number of * sub-frames) to be used for A-MPDU block ack receiver * aggregation. -- cgit v1.2.3 From b31268ac793fd300da66b9c28bbf0a200339ab96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2011 17:02:00 -0400 Subject: FS: Use stable writes when not doing a bulk flush If we're only doing a single write, and there are no other unstable writes being queued up, we might want to just flip to using a stable write RPC call. Reviewed-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 3 +++ fs/nfs/write.c | 16 +++++++++++++--- include/linux/nfs_fs.h | 2 ++ include/linux/nfs_page.h | 1 + 4 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 23e794410669..fd85618149a1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -223,6 +223,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_count = 0; desc->pg_bsize = bsize; desc->pg_base = 0; + desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_doio = doio; desc->pg_ioflags = io_flags; @@ -335,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { + desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; + desc->pg_moreio = 0; } return 1; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 47a3ad63e0d5..4d686ee53244 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -179,8 +179,8 @@ static int wb_priority(struct writeback_control *wbc) if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI; - return 0; + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; } /* @@ -863,7 +863,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & FLUSH_STABLE) { + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { data->args.stable = NFS_DATA_SYNC; if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; @@ -912,6 +912,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -1002,6 +1008,10 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) if ((!lseg) && list_is_singular(&data->pages)) lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + /* Set up the argument struct */ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f88522b10a38..cb2add401f25 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -33,6 +33,8 @@ #define FLUSH_STABLE 4 /* commit to stable storage */ #define FLUSH_LOWPRI 8 /* low priority background flush */ #define FLUSH_HIGHPRI 16 /* high priority memory reclaim flush */ +#define FLUSH_COND_STABLE 32 /* conditional stable write - only stable + * if everything fits in one RPC */ #ifdef __KERNEL__ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 90907ada6d52..92d54c81f51e 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -57,6 +57,7 @@ struct nfs_pageio_descriptor { size_t pg_count; size_t pg_bsize; unsigned int pg_base; + char pg_moreio; struct inode *pg_inode; int (*pg_doio)(struct nfs_pageio_descriptor *); -- cgit v1.2.3 From 0562e0bad483d10e9651fbb8f21dc3d0bad57374 Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Mon, 21 Mar 2011 21:38:05 -0400 Subject: ext4: add more tracepoints and use dev_t in the trace buffer - Add more ext4 tracepoints. - Change ext4 tracepoints to use dev_t field with MAJOR/MINOR macros so that we can save 4 bytes in the ring buffer on some platforms. - Add sync_mode to ext4_da_writepages, ext4_da_write_pages, and ext4_da_writepages_result tracepoints. Also remove for_reclaim field from ext4_da_writepages since it is usually not very useful. Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 3 + fs/ext4/extents.c | 13 +- fs/ext4/fsync.c | 14 +- fs/ext4/ialloc.c | 1 + fs/ext4/inode.c | 24 +- fs/ext4/namei.c | 3 + include/trace/events/ext4.h | 775 +++++++++++++++++++++++++++++++++----------- include/trace/events/jbd2.h | 78 ++--- 8 files changed, 658 insertions(+), 253 deletions(-) (limited to 'include') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index adf96b822781..97b970e7dd13 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -21,6 +21,8 @@ #include "ext4_jbd2.h" #include "mballoc.h" +#include + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. */ + trace_ext4_read_block_bitmap_load(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9ea1bc64ca6d..f46f6e3c02d7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,8 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" +#include + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, if (unlikely(!bh)) goto err; if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); if (bh_submit_read(bh) < 0) { put_bh(bh); goto err; @@ -3297,7 +3301,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; @@ -3305,6 +3309,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { @@ -3525,6 +3530,8 @@ out2: ext4_ext_drop_refs(path); kfree(path); } + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); return err ? err : allocated; } @@ -3658,6 +3665,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3673,6 +3681,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ret = inode_newsize_ok(inode, (len + offset)); if (ret) { mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } retry: @@ -3717,6 +3726,8 @@ retry: goto retry; } mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 7829b287822a..7f74019d6d77 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -164,20 +164,20 @@ int ext4_sync_file(struct file *file, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, datasync); + trace_ext4_sync_file_enter(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = ext4_flush_completed_IO(inode); if (ret < 0) - return ret; + goto out; if (!journal) { ret = generic_file_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) ext4_sync_parent(inode); - return ret; + goto out; } /* @@ -194,8 +194,10 @@ int ext4_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) - return ext4_force_commit(inode->i_sb); + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; if (jbd2_log_start_commit(journal, commit_tid)) { @@ -215,5 +217,7 @@ int ext4_sync_file(struct file *file, int datasync) ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a679a482c986..254e6b98b5b4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. */ + trace_ext4_load_inode_bitmap(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fc8c0ce84315..f44307a21136 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int count = 0; ext4_fsblk_t first_block = 0; + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, @@ -1058,6 +1059,8 @@ cleanup: partial--; } out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); return err; } @@ -3379,6 +3382,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + trace_ext4_readpage(page); return mpage_readpage(page, ext4_get_block); } @@ -3413,6 +3417,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_invalidatepage(page, offset); + /* * free any io_end structure allocated for buffers to be discarded */ @@ -3434,6 +3440,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_releasepage(page); + WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -3792,11 +3800,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + ssize_t ret; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); - - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + else + ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + trace_ext4_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); + return ret; } /* @@ -4425,6 +4438,8 @@ void ext4_truncate(struct inode *inode) ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; + trace_ext4_truncate_enter(inode); + if (!ext4_can_truncate(inode)) return; @@ -4435,6 +4450,7 @@ void ext4_truncate(struct inode *inode) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_ext_truncate(inode); + trace_ext4_truncate_exit(inode); return; } @@ -4564,6 +4580,7 @@ out_stop: ext4_orphan_del(handle, inode); ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); } /* @@ -4695,6 +4712,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ad87584aa8d2..f9f83878843a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -40,6 +40,7 @@ #include "xattr.h" #include "acl.h" +#include /* * define how far ahead to read directories while searching them. */ @@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle; + trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) end_unlink: ext4_journal_stop(handle); brelse(bh); + trace_ext4_unlink_exit(dentry, retval); return retval; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index e5e345fb2a5c..e09592d2f916 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -21,8 +21,7 @@ TRACE_EVENT(ext4_free_inode, TP_ARGS(inode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( uid_t, uid ) @@ -31,8 +30,7 @@ TRACE_EVENT(ext4_free_inode, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->uid = inode->i_uid; @@ -41,9 +39,9 @@ TRACE_EVENT(ext4_free_inode, ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->mode, - __entry->uid, __entry->gid, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) ); @@ -53,21 +51,19 @@ TRACE_EVENT(ext4_request_inode, TP_ARGS(dir, mode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev_major = MAJOR(dir->i_sb->s_dev); - __entry->dev_minor = MINOR(dir->i_sb->s_dev); + __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %lu mode 0%o", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); @@ -77,23 +73,21 @@ TRACE_EVENT(ext4_allocate_inode, TP_ARGS(inode, dir, mode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); @@ -104,21 +98,19 @@ TRACE_EVENT(ext4_evict_inode, TP_ARGS(inode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( int, nlink ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %lu nlink %d", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nlink) ); @@ -128,21 +120,19 @@ TRACE_EVENT(ext4_drop_inode, TP_ARGS(inode, drop), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( int, drop ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %lu drop %d", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->drop) ); @@ -152,21 +142,19 @@ TRACE_EVENT(ext4_mark_inode_dirty, TP_ARGS(inode, IP), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, ip ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %lu caller %pF", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); @@ -176,21 +164,19 @@ TRACE_EVENT(ext4_begin_ordered_truncate, TP_ARGS(inode, new_size), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, new_size ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = new_size; ), TP_printk("dev %d,%d ino %lu new_size %lld", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (long long) __entry->new_size) ); @@ -203,8 +189,7 @@ DECLARE_EVENT_CLASS(ext4__write_begin, TP_ARGS(inode, pos, len, flags), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -212,8 +197,7 @@ DECLARE_EVENT_CLASS(ext4__write_begin, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; @@ -221,7 +205,7 @@ DECLARE_EVENT_CLASS(ext4__write_begin, ), TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->flags) ); @@ -249,8 +233,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -258,8 +241,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; @@ -267,9 +249,9 @@ DECLARE_EVENT_CLASS(ext4__write_end, ), TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->pos, - __entry->len, __entry->copied) + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, @@ -310,22 +292,20 @@ TRACE_EVENT(ext4_writepage, TP_ARGS(inode, page), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = page->index; ), TP_printk("dev %d,%d ino %lu page_index %lu", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->index) ); @@ -335,43 +315,39 @@ TRACE_EVENT(ext4_da_writepages, TP_ARGS(inode, wbc), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) + __field( int, sync_mode ) __field( char, for_kupdate ) - __field( char, for_reclaim ) __field( char, range_cyclic ) __field( pgoff_t, writeback_index ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; + __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; - __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " - "range_start %llu range_end %llu " - "for_kupdate %d for_reclaim %d " - "range_cyclic %d writeback_index %lu", - __entry->dev_major, __entry->dev_minor, + "range_start %llu range_end %llu sync_mode %d" + "for_kupdate %d range_cyclic %d writeback_index %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, - __entry->for_kupdate, __entry->for_reclaim, - __entry->range_cyclic, + __entry->range_end, __entry->sync_mode, + __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); @@ -381,8 +357,7 @@ TRACE_EVENT(ext4_da_write_pages, TP_ARGS(inode, mpd), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, b_blocknr ) __field( __u32, b_size ) @@ -390,11 +365,11 @@ TRACE_EVENT(ext4_da_write_pages, __field( unsigned long, first_page ) __field( int, io_done ) __field( int, pages_written ) + __field( int, sync_mode ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->b_blocknr = mpd->b_blocknr; __entry->b_size = mpd->b_size; @@ -402,14 +377,18 @@ TRACE_EVENT(ext4_da_write_pages, __entry->first_page = mpd->first_page; __entry->io_done = mpd->io_done; __entry->pages_written = mpd->pages_written; + __entry->sync_mode = mpd->wbc->sync_mode; ), - TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", - __entry->dev_major, __entry->dev_minor, + TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x " + "first_page %lu io_done %d pages_written %d sync_mode %d", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->b_blocknr, __entry->b_size, __entry->b_state, __entry->first_page, - __entry->io_done, __entry->pages_written) + __entry->io_done, __entry->pages_written, + __entry->sync_mode + ) ); TRACE_EVENT(ext4_da_writepages_result, @@ -419,35 +398,100 @@ TRACE_EVENT(ext4_da_writepages_result, TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) + __field( int, sync_mode ) __field( char, more_io ) __field( pgoff_t, writeback_index ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; + __entry->sync_mode = wbc->sync_mode; __entry->more_io = wbc->more_io; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu", - __entry->dev_major, __entry->dev_minor, + TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " + " more_io %d sync_mode %d writeback_index %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, - __entry->more_io, + __entry->more_io, __entry->sync_mode, (unsigned long) __entry->writeback_index) ); +DECLARE_EVENT_CLASS(ext4__page_op, + TP_PROTO(struct page *page), + + TP_ARGS(page), + + TP_STRUCT__entry( + __field( pgoff_t, index ) + __field( ino_t, ino ) + __field( dev_t, dev ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->index) +); + +DEFINE_EVENT(ext4__page_op, ext4_readpage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +DEFINE_EVENT(ext4__page_op, ext4_releasepage, + + TP_PROTO(struct page *page), + + TP_ARGS(page) +); + +TRACE_EVENT(ext4_invalidatepage, + TP_PROTO(struct page *page, unsigned long offset), + + TP_ARGS(page, offset), + + TP_STRUCT__entry( + __field( pgoff_t, index ) + __field( unsigned long, offset ) + __field( ino_t, ino ) + __field( dev_t, dev ) + + ), + + TP_fast_assign( + __entry->index = page->index; + __entry->offset = offset; + __entry->ino = page->mapping->host->i_ino; + __entry->dev = page->mapping->host->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %lu page_index %lu offset %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->index, __entry->offset) +); + TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), @@ -455,22 +499,20 @@ TRACE_EVENT(ext4_discard_blocks, TP_ARGS(sb, blk, count), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); @@ -481,8 +523,7 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_ARGS(ac, pa), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) @@ -491,8 +532,7 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, ), TP_fast_assign( - __entry->dev_major = MAJOR(ac->ac_sb->s_dev); - __entry->dev_minor = MINOR(ac->ac_sb->s_dev); + __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; @@ -500,9 +540,9 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->pa_pstart, - __entry->pa_len, __entry->pa_lstart) + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, @@ -530,8 +570,7 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_ARGS(sb, inode, pa, block, count), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) @@ -539,16 +578,16 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %lu block %llu count %u", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->block, __entry->count) + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, @@ -558,22 +597,20 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_ARGS(sb, pa), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); @@ -583,20 +620,18 @@ TRACE_EVENT(ext4_discard_preallocations, TP_ARGS(inode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); @@ -606,20 +641,19 @@ TRACE_EVENT(ext4_mb_discard_preallocations, TP_ARGS(sb, needed), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", - __entry->dev_major, __entry->dev_minor, __entry->needed) + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->needed) ); TRACE_EVENT(ext4_request_blocks, @@ -628,8 +662,7 @@ TRACE_EVENT(ext4_request_blocks, TP_ARGS(ar), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( unsigned int, len ) @@ -642,8 +675,7 @@ TRACE_EVENT(ext4_request_blocks, ), TP_fast_assign( - __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); - __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); + __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->flags = ar->flags; __entry->len = ar->len; @@ -655,8 +687,9 @@ TRACE_EVENT(ext4_request_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - __entry->dev_major, __entry->dev_minor, + TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu " + "lleft %llu lright %llu pleft %llu pright %llu ", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->flags, __entry->len, (unsigned long long) __entry->logical, @@ -673,8 +706,7 @@ TRACE_EVENT(ext4_allocate_blocks, TP_ARGS(ar, block), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, flags ) @@ -688,8 +720,7 @@ TRACE_EVENT(ext4_allocate_blocks, ), TP_fast_assign( - __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); - __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); + __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->flags = ar->flags; @@ -702,10 +733,11 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->flags, - __entry->len, __entry->block, + TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu " + "goal %llu lleft %llu lright %llu pleft %llu pright %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->flags, __entry->len, __entry->block, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -721,8 +753,7 @@ TRACE_EVENT(ext4_free_blocks, TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, block ) @@ -731,8 +762,7 @@ TRACE_EVENT(ext4_free_blocks, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->block = block; @@ -741,20 +771,19 @@ TRACE_EVENT(ext4_free_blocks, ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, __entry->flags) ); -TRACE_EVENT(ext4_sync_file, +TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) @@ -763,39 +792,60 @@ TRACE_EVENT(ext4_sync_file, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - __entry->dev_major = MAJOR(dentry->d_inode->i_sb->s_dev); - __entry->dev_minor = MINOR(dentry->d_inode->i_sb->s_dev); + __entry->dev = dentry->d_inode->i_sb->s_dev; __entry->ino = dentry->d_inode->i_ino; __entry->datasync = datasync; __entry->parent = dentry->d_parent->d_inode->i_ino; ), TP_printk("dev %d,%d ino %ld parent %ld datasync %d ", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); +TRACE_EVENT(ext4_sync_file_exit, + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret), + + TP_STRUCT__entry( + __field( int, ret ) + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ret = ret; + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %ld ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->wait = wait; ), - TP_printk("dev %d,%d wait %d", __entry->dev_major, - __entry->dev_minor, __entry->wait) + TP_printk("dev %d,%d wait %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, @@ -804,23 +854,21 @@ TRACE_EVENT(ext4_alloc_da_blocks, TP_ARGS(inode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) __field( unsigned int, meta_blocks ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->data_blocks, __entry->meta_blocks) ); @@ -831,8 +879,7 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u16, found ) __field( __u16, groups ) @@ -855,8 +902,7 @@ TRACE_EVENT(ext4_mballoc_alloc, ), TP_fast_assign( - __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); - __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); + __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; @@ -881,7 +927,7 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " "tail %u broken %u", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, @@ -900,8 +946,7 @@ TRACE_EVENT(ext4_mballoc_prealloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) @@ -914,8 +959,7 @@ TRACE_EVENT(ext4_mballoc_prealloc, ), TP_fast_assign( - __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); - __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); + __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; @@ -928,7 +972,7 @@ TRACE_EVENT(ext4_mballoc_prealloc, ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, @@ -946,8 +990,7 @@ DECLARE_EVENT_CLASS(ext4__mballoc, TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) @@ -955,8 +998,7 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; @@ -964,7 +1006,7 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_printk("dev %d,%d inode %lu extent %u/%d/%u ", - __entry->dev_major, __entry->dev_minor, + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) @@ -998,8 +1040,7 @@ TRACE_EVENT(ext4_forget, TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( int, is_metadata ) @@ -1007,8 +1048,7 @@ TRACE_EVENT(ext4_forget, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->is_metadata = is_metadata; @@ -1016,9 +1056,9 @@ TRACE_EVENT(ext4_forget, ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->mode, - __entry->is_metadata, __entry->block) + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, @@ -1027,8 +1067,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_ARGS(inode, used_blocks), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -1039,8 +1078,7 @@ TRACE_EVENT(ext4_da_update_reserve_space, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -1050,10 +1088,12 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino, __entry->mode, - (unsigned long long) __entry->i_blocks, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " + "reserved_data_blocks %d reserved_meta_blocks %d " + "allocated_meta_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->mode, (unsigned long long) __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) ); @@ -1064,8 +1104,7 @@ TRACE_EVENT(ext4_da_reserve_space, TP_ARGS(inode, md_needed), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -1075,8 +1114,7 @@ TRACE_EVENT(ext4_da_reserve_space, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -1085,8 +1123,9 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", - __entry->dev_major, __entry->dev_minor, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d " + "reserved_data_blocks %d reserved_meta_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->md_needed, __entry->reserved_data_blocks, @@ -1099,8 +1138,7 @@ TRACE_EVENT(ext4_da_release_space, TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -1111,8 +1149,7 @@ TRACE_EVENT(ext4_da_release_space, ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -1122,8 +1159,10 @@ TRACE_EVENT(ext4_da_release_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - __entry->dev_major, __entry->dev_minor, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " + "reserved_data_blocks %d reserved_meta_blocks %d " + "allocated_meta_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks, @@ -1136,20 +1175,19 @@ DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_ARGS(sb, group), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( - __entry->dev_major = MAJOR(sb->s_dev); - __entry->dev_minor = MINOR(sb->s_dev); + __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", - __entry->dev_major, __entry->dev_minor, __entry->group) + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, @@ -1166,6 +1204,349 @@ DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_ARGS(sb, group) ); +DEFINE_EVENT(ext4__bitmap_load, ext4_read_block_bitmap_load, + + TP_PROTO(struct super_block *sb, unsigned long group), + + TP_ARGS(sb, group) +); + +DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, + + TP_PROTO(struct super_block *sb, unsigned long group), + + TP_ARGS(sb, group) +); + +TRACE_EVENT(ext4_direct_IO_enter, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), + + TP_ARGS(inode, offset, len, rw), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, __entry->rw) +); + +TRACE_EVENT(ext4_direct_IO_exit, + TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret), + + TP_ARGS(inode, offset, len, rw, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned long, len ) + __field( int, rw ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->rw = rw; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu pos %llu len %lu rw %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->len, + __entry->rw, __entry->ret) +); + +TRACE_EVENT(ext4_fallocate_enter, + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), + + TP_ARGS(inode, offset, len, mode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( loff_t, len ) + __field( int, mode ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->len = len; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d ino %ld pos %llu len %llu mode %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, + (unsigned long long) __entry->len, __entry->mode) +); + +TRACE_EVENT(ext4_fallocate_exit, + TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), + + TP_ARGS(inode, offset, max_blocks, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( loff_t, pos ) + __field( unsigned, blocks ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->pos = offset; + __entry->blocks = max_blocks; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %ld pos %llu blocks %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long long) __entry->pos, __entry->blocks, + __entry->ret) +); + +TRACE_EVENT(ext4_unlink_enter, + TP_PROTO(struct inode *parent, struct dentry *dentry), + + TP_ARGS(parent, dentry), + + TP_STRUCT__entry( + __field( ino_t, parent ) + __field( ino_t, ino ) + __field( loff_t, size ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->parent = parent->i_ino; + __entry->ino = dentry->d_inode->i_ino; + __entry->size = dentry->d_inode->i_size; + __entry->dev = dentry->d_inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %ld size %lld parent %ld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->size, + (unsigned long) __entry->parent) +); + +TRACE_EVENT(ext4_unlink_exit, + TP_PROTO(struct dentry *dentry, int ret), + + TP_ARGS(dentry, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = dentry->d_inode->i_ino; + __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %ld ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->ret) +); + +DECLARE_EVENT_CLASS(ext4__truncate, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( blkcnt_t, blocks ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu blocks %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, (unsigned long) __entry->blocks) +); + +DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DECLARE_EVENT_CLASS(ext4__map_blocks_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + unsigned len, unsigned flags), + + TP_ARGS(inode, lblk, len, flags), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( unsigned, len ) + __field( unsigned, flags ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->len = len; + __entry->flags = flags; + ), + + TP_printk("dev %d,%d ino %lu lblk %u len %u flags %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, __entry->len, __entry->flags) +); + +DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + unsigned len, unsigned flags), + + TP_ARGS(inode, lblk, len, flags) +); + +DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + unsigned len, unsigned flags), + + TP_ARGS(inode, lblk, len, flags) +); + +DECLARE_EVENT_CLASS(ext4__map_blocks_exit, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), + + TP_ARGS(inode, lblk, pblk, len, ret), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned, len ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, + __entry->len, __entry->ret) +); + +DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), + + TP_ARGS(inode, lblk, pblk, len, ret) +); + +DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, unsigned len, int ret), + + TP_ARGS(inode, lblk, pblk, len, ret) +); + +TRACE_EVENT(ext4_ext_load_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), + + TP_ARGS(inode, lblk, pblk), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + __field( ext4_lblk_t, lblk ) + __field( ext4_fsblk_t, pblk ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->lblk = lblk; + __entry->pblk = pblk; + ), + + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned) __entry->lblk, (unsigned long long) __entry->pblk) +); + +TRACE_EVENT(ext4_load_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( ino_t, ino ) + __field( dev_t, dev ) + ), + + TP_fast_assign( + __entry->ino = inode->i_ino; + __entry->dev = inode->i_sb->s_dev; + ), + + TP_printk("dev %d,%d ino %ld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 7447ea9305b5..bf16545cc977 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -17,19 +17,17 @@ TRACE_EVENT(jbd2_checkpoint, TP_ARGS(journal, result), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( int, result ) ), TP_fast_assign( - __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); - __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); + __entry->dev = journal->j_fs_dev->bd_dev; __entry->result = result; ), - TP_printk("dev %d,%d result %d", - __entry->dev_major, __entry->dev_minor, __entry->result) + TP_printk("dev %s result %d", + jbd2_dev_to_name(__entry->dev), __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -39,22 +37,20 @@ DECLARE_EVENT_CLASS(jbd2_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( - __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); - __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); + __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %d,%d transaction %d sync %d", - __entry->dev_major, __entry->dev_minor, - __entry->transaction, __entry->sync_commit) + TP_printk("dev %s transaction %d sync %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -91,24 +87,22 @@ TRACE_EVENT(jbd2_end_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( char, sync_commit ) __field( int, transaction ) __field( int, head ) ), TP_fast_assign( - __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); - __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); + __entry->dev = journal->j_fs_dev->bd_dev; __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %d,%d transaction %d sync %d head %d", - __entry->dev_major, __entry->dev_minor, - __entry->transaction, __entry->sync_commit, __entry->head) + TP_printk("dev %s transaction %d sync %d head %d", + jbd2_dev_to_name(__entry->dev), __entry->transaction, + __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -117,20 +111,17 @@ TRACE_EVENT(jbd2_submit_inode_data, TP_ARGS(inode), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev_major = MAJOR(inode->i_sb->s_dev); - __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), - TP_printk("dev %d,%d ino %lu", - __entry->dev_major, __entry->dev_minor, - (unsigned long) __entry->ino) + TP_printk("dev %s ino %lu", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -140,8 +131,7 @@ TRACE_EVENT(jbd2_run_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( unsigned long, tid ) __field( unsigned long, wait ) __field( unsigned long, running ) @@ -154,8 +144,7 @@ TRACE_EVENT(jbd2_run_stats, ), TP_fast_assign( - __entry->dev_major = MAJOR(dev); - __entry->dev_minor = MINOR(dev); + __entry->dev = dev; __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->running = stats->rs_running; @@ -167,9 +156,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - __entry->dev_major, __entry->dev_minor, __entry->tid, + jbd2_dev_to_name(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -186,8 +175,7 @@ TRACE_EVENT(jbd2_checkpoint_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( unsigned long, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) @@ -196,8 +184,7 @@ TRACE_EVENT(jbd2_checkpoint_stats, ), TP_fast_assign( - __entry->dev_major = MAJOR(dev); - __entry->dev_minor = MINOR(dev); + __entry->dev = dev; __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; @@ -205,9 +192,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - __entry->dev_major, __entry->dev_minor, __entry->tid, + jbd2_dev_to_name(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -220,8 +207,7 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( - __field( int, dev_major ) - __field( int, dev_minor ) + __field( dev_t, dev ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) @@ -229,18 +215,16 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, ), TP_fast_assign( - __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); - __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); + __entry->dev = journal->j_fs_dev->bd_dev; __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), - TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", - __entry->dev_major, __entry->dev_minor, - __entry->tail_sequence, __entry->first_tid, - __entry->block_nr, __entry->freed) + TP_printk("dev %s from %u to %u offset %lu freed %lu", + jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, + __entry->first_tid, __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From 27660515a21bf913e3208ded3f27abd0529fae0e Mon Sep 17 00:00:00 2001 From: MichaÅ‚ MirosÅ‚aw Date: Fri, 18 Mar 2011 16:56:34 +0000 Subject: net: implement dev_disable_lro() hw_features compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement compatibility with new hw_features for dev_disable_lro(). This is a transition path - dev_disable_lro() should be later integrated into netdev_fix_features() after all drivers are converted. Signed-off-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: David S. Miller --- include/linux/ethtool.h | 3 +++ net/core/dev.c | 19 +++++++++++-------- net/core/ethtool.c | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b297f288f6eb..ae757bcf1280 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -648,6 +648,9 @@ enum ethtool_sfeatures_retval_bits { #include +/* needed by dev_disable_lro() */ +extern int __ethtool_set_flags(struct net_device *dev, u32 flags); + struct ethtool_rx_ntuple_flow_spec_container { struct ethtool_rx_ntuple_flow_spec fs; struct list_head list; diff --git a/net/core/dev.c b/net/core/dev.c index 0b88eba97dab..f453370131a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1353,14 +1353,17 @@ EXPORT_SYMBOL(dev_close); */ void dev_disable_lro(struct net_device *dev) { - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } + u32 flags; + + if (dev->ethtool_ops && dev->ethtool_ops->get_flags) + flags = dev->ethtool_ops->get_flags(dev); + else + flags = ethtool_op_get_flags(dev); + + if (!(flags & ETH_FLAG_LRO)) + return; + + __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO); WARN_ON(dev->features & NETIF_F_LRO); } EXPORT_SYMBOL(dev_disable_lro); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index a1086fb0c0c7..24bd57493c0d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -513,7 +513,7 @@ static int ethtool_set_one_feature(struct net_device *dev, } } -static int __ethtool_set_flags(struct net_device *dev, u32 data) +int __ethtool_set_flags(struct net_device *dev, u32 data) { u32 changed; -- cgit v1.2.3 From e6abbaa2725a43cf5d26c4c2a5dc6c0f6029ea19 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 19 Mar 2011 12:13:49 +0000 Subject: ipv4: fix route deletion for IPs on many subnets Alex Sidorenko reported for problems with local routes left after IP addresses are deleted. It happens when same IPs are used in more than one subnet for the device. Fix fib_del_ifaddr to restrict the checks for duplicate local and broadcast addresses only to the IFAs that use our primary IFA or another primary IFA with same address. And we expect the prefsrc to be matched when the routes are deleted because it is possible they to differ only by prefsrc. This patch prevents local and broadcast routes to be leaked until their primary IP is deleted finally from the box. As the secondary address promotion needs to delete the routes for all secondaries that used the old primary IFA, add option to ignore these secondaries from the checks and to assume they are already deleted, so that we can safely delete the route while these IFAs are still on the device list. Reported-by: Alex Sidorenko Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/route.h | 1 + net/ipv4/fib_frontend.c | 101 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 89 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 30d6cae3841a..dc102445ec47 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -207,6 +207,7 @@ extern int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb); struct in_ifaddr; extern void fib_add_ifaddr(struct in_ifaddr *); +extern void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *); static inline void ip_rt_put(struct rtable * rt) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a373a259253c..02c3ba61884a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -722,12 +722,17 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) } } -static void fib_del_ifaddr(struct in_ifaddr *ifa) +/* Delete primary or secondary address. + * Optionally, on secondary address promotion consider the addresses + * from subnet iprim as deleted, even if they are in device list. + * In this case the secondary ifa can be in device list. + */ +void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; - struct in_ifaddr *prim = ifa; + struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 @@ -735,17 +740,26 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + int subnet = 0; /* Primary network */ + int gone = 1; /* Address is missing */ + int same_prefsrc = 0; /* Another primary with same IP */ - if (!(ifa->ifa_flags & IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, - dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, - any, ifa->ifa_prefixlen, prim); - else { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n"); return; } + if (iprim && iprim != prim) { + printk(KERN_WARNING "fib_del_ifaddr: bug: iprim != prim\n"); + return; + } + } else if (!ipv4_is_zeronet(any) && + (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); + subnet = 1; } /* Deletion is more complicated than add. @@ -755,6 +769,49 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1 == ifa) { + /* promotion, keep the IP */ + gone = 0; + continue; + } + /* Ignore IFAs from our subnet */ + if (iprim && ifa1->ifa_mask == iprim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, iprim)) + continue; + + /* Ignore ifa1 if it uses different primary IP (prefsrc) */ + if (ifa1->ifa_flags & IFA_F_SECONDARY) { + /* Another address from our subnet? */ + if (ifa1->ifa_mask == prim->ifa_mask && + inet_ifa_match(ifa1->ifa_address, prim)) + prim1 = prim; + else { + /* We reached the secondaries, so + * same_prefsrc should be determined. + */ + if (!same_prefsrc) + continue; + /* Search new prim1 if ifa1 is not + * using the current prim1 + */ + if (!prim1 || + ifa1->ifa_mask != prim1->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, prim1)) + prim1 = inet_ifa_byprefix(in_dev, + ifa1->ifa_address, + ifa1->ifa_mask); + if (!prim1) + continue; + if (prim1->ifa_local != prim->ifa_local) + continue; + } + } else { + if (prim->ifa_local != ifa1->ifa_local) + continue; + prim1 = ifa1; + if (prim != prim1) + same_prefsrc = 1; + } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) @@ -763,19 +820,37 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; + /* primary has network specific broadcasts */ + if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { + __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; + __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; + + if (!ipv4_is_zeronet(any1)) { + if (ifa->ifa_broadcast == brd1 || + ifa->ifa_broadcast == any1) + ok |= BRD_OK; + if (brd == brd1 || brd == any1) + ok |= BRD1_OK; + if (any == brd1 || any == any1) + ok |= BRD0_OK; + } + } } if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok & BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok & BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (subnet && ifa->ifa_prefixlen < 31) { + if (!(ok & BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok & BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + } if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + if (gone && + inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -896,7 +971,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_del_ifaddr(ifa); + fib_del_ifaddr(ifa, NULL); fib_update_nh_saddrs(dev); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. -- cgit v1.2.3 From 9c7a4f9ce651383c73dfdff3d7e21d5f9572c4ec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Mar 2011 19:17:36 -0700 Subject: ipv6: ip6_route_output does not modify sk parameter, so make it const This avoids explicit cast to avoid 'discards qualifiers' compiler warning in a netfilter patch that i've been working on. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- net/ipv6/route.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 642a80bb42cf..c850e5fb967c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -70,7 +70,7 @@ static inline struct inet_peer *rt6_get_peer(struct rt6_info *rt) extern void ip6_route_input(struct sk_buff *skb); extern struct dst_entry * ip6_route_output(struct net *net, - struct sock *sk, + const struct sock *sk, struct flowi6 *fl6); extern int ip6_route_init(void); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6814c8722fa7..843406f14d7b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -854,7 +854,7 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); } -struct dst_entry * ip6_route_output(struct net *net, struct sock *sk, +struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { int flags = 0; -- cgit v1.2.3 From 6a1fef6d000944911df0f160f366111daa10740a Mon Sep 17 00:00:00 2001 From: Sriram Date: Tue, 22 Mar 2011 02:31:03 +0000 Subject: net: davinci_emac:Fix translation logic for buffer descriptor With recent changes to the driver(switch to new cpdma layer), the support for buffer descriptor address translation logic is broken. This affects platforms where the physical address of the descriptors as seen by the DMA engine is different from the physical address. Original Patch adding translation logic support: Commit: ad021ae8862209864dc8ebd3b7d3a55ce84b9ea2 Signed-off-by: Sriramakrishnan A G Tested-By: Sekhar Nori Signed-off-by: David S. Miller --- drivers/net/davinci_cpdma.c | 11 ++++++++--- drivers/net/davinci_cpdma.h | 1 + drivers/net/davinci_emac.c | 5 ++++- include/linux/davinci_emac.h | 1 + 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/davinci_cpdma.c b/drivers/net/davinci_cpdma.c index e92b2b6cd8c4..ae47f23ba930 100644 --- a/drivers/net/davinci_cpdma.c +++ b/drivers/net/davinci_cpdma.c @@ -76,6 +76,7 @@ struct cpdma_desc { struct cpdma_desc_pool { u32 phys; + u32 hw_addr; void __iomem *iomap; /* ioremap map */ void *cpumap; /* dma_alloc map */ int desc_size, mem_size; @@ -137,7 +138,8 @@ struct cpdma_chan { * abstract out these details */ static struct cpdma_desc_pool * -cpdma_desc_pool_create(struct device *dev, u32 phys, int size, int align) +cpdma_desc_pool_create(struct device *dev, u32 phys, u32 hw_addr, + int size, int align) { int bitmap_size; struct cpdma_desc_pool *pool; @@ -161,10 +163,12 @@ cpdma_desc_pool_create(struct device *dev, u32 phys, int size, int align) if (phys) { pool->phys = phys; pool->iomap = ioremap(phys, size); + pool->hw_addr = hw_addr; } else { pool->cpumap = dma_alloc_coherent(dev, size, &pool->phys, GFP_KERNEL); pool->iomap = (void __force __iomem *)pool->cpumap; + pool->hw_addr = pool->phys; } if (pool->iomap) @@ -201,14 +205,14 @@ static inline dma_addr_t desc_phys(struct cpdma_desc_pool *pool, { if (!desc) return 0; - return pool->phys + (__force dma_addr_t)desc - + return pool->hw_addr + (__force dma_addr_t)desc - (__force dma_addr_t)pool->iomap; } static inline struct cpdma_desc __iomem * desc_from_phys(struct cpdma_desc_pool *pool, dma_addr_t dma) { - return dma ? pool->iomap + dma - pool->phys : NULL; + return dma ? pool->iomap + dma - pool->hw_addr : NULL; } static struct cpdma_desc __iomem * @@ -260,6 +264,7 @@ struct cpdma_ctlr *cpdma_ctlr_create(struct cpdma_params *params) ctlr->pool = cpdma_desc_pool_create(ctlr->dev, ctlr->params.desc_mem_phys, + ctlr->params.desc_hw_addr, ctlr->params.desc_mem_size, ctlr->params.desc_align); if (!ctlr->pool) { diff --git a/drivers/net/davinci_cpdma.h b/drivers/net/davinci_cpdma.h index 868e50ebde45..afa19a0c0d81 100644 --- a/drivers/net/davinci_cpdma.h +++ b/drivers/net/davinci_cpdma.h @@ -33,6 +33,7 @@ struct cpdma_params { bool has_soft_reset; int min_packet_size; u32 desc_mem_phys; + u32 desc_hw_addr; int desc_mem_size; int desc_align; diff --git a/drivers/net/davinci_emac.c b/drivers/net/davinci_emac.c index 082d6ea69920..baca6bfcb089 100644 --- a/drivers/net/davinci_emac.c +++ b/drivers/net/davinci_emac.c @@ -1854,10 +1854,13 @@ static int __devinit davinci_emac_probe(struct platform_device *pdev) dma_params.rxcp = priv->emac_base + 0x660; dma_params.num_chan = EMAC_MAX_TXRX_CHANNELS; dma_params.min_packet_size = EMAC_DEF_MIN_ETHPKTSIZE; - dma_params.desc_mem_phys = hw_ram_addr; + dma_params.desc_hw_addr = hw_ram_addr; dma_params.desc_mem_size = pdata->ctrl_ram_size; dma_params.desc_align = 16; + dma_params.desc_mem_phys = pdata->no_bd_ram ? 0 : + (u32 __force)res->start + pdata->ctrl_ram_offset; + priv->dma = cpdma_ctlr_create(&dma_params); if (!priv->dma) { dev_err(emac_dev, "DaVinci EMAC: Error initializing DMA\n"); diff --git a/include/linux/davinci_emac.h b/include/linux/davinci_emac.h index 5dd428532f79..542888504994 100644 --- a/include/linux/davinci_emac.h +++ b/include/linux/davinci_emac.h @@ -36,6 +36,7 @@ struct emac_platform_data { u8 rmii_en; u8 version; + bool no_bd_ram; void (*interrupt_enable) (void); void (*interrupt_disable) (void); }; -- cgit v1.2.3 From e815f0a84fc9a98e5cc3ef0b520122e5e18520e7 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Mon, 21 Mar 2011 20:24:47 +0100 Subject: sched.h: Fix a typo ("its") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sentence uses the possessive pronoun, which is spelled without an apostrophe. Signed-off-by: Jonathan Neuschäfer Cc: Jiri Kosina Cc: Peter Zijlstra LKML-Reference: <1300735487-2406-1-git-send-email-j.neuschaefer@gmx.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c15936fe998b..e89f1292c301 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -516,7 +516,7 @@ struct thread_group_cputimer { struct autogroup; /* - * NOTE! "signal_struct" does not have it's own + * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of -- cgit v1.2.3 From 68cacd29167b1926d237bd1b153aa2a990201729 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 23 Mar 2011 16:03:06 +0100 Subject: perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx() This patch solves a stale pointer problem in update_cgrp_time_from_cpuctx(). The cpuctx->cgrp was not cleared on all possible event exit paths, including: close() perf_release() perf_release_kernel() list_del_event() This patch fixes list_del_event() to clear cpuctx->cgrp when there are no cgroup events left in the context. [ This second version makes the code compile when CONFIG_CGROUP_PERF is not enabled. We unconditionally define perf_cpu_context->cgrp. ] Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: perfmon2-devel@lists.sf.net Cc: paulus@samba.org Cc: davem@davemloft.net LKML-Reference: <20110323150306.GA1580@quad> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 -- kernel/perf_event.c | 12 +++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f495c0147240..311b4dc785a1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -938,9 +938,7 @@ struct perf_cpu_context { struct list_head rotation_list; int jiffies_interval; struct pmu *active_pmu; -#ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; -#endif }; struct perf_output_handle { diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1a070c..0c714226ae0c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -941,6 +941,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +950,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.3 From d1e12de804f9d8ad114786ca7c2ce593cba79891 Mon Sep 17 00:00:00 2001 From: "Krishnasamy, Somasundaram" Date: Mon, 28 Feb 2011 18:13:22 -0500 Subject: [SCSI] ses: Avoid kernel panic when lun 0 is not mapped During device discovery, scsi mid layer sends INQUIRY command to LUN 0. If the LUN 0 is not mapped to host, it creates a temporary scsi_device with LUN id 0 and sends REPORT_LUNS command to it. After the REPORT_LUNS succeeds, it walks through the LUN table and adds each LUN found to sysfs. At the end of REPORT_LUNS lun table scan, it will delete the temporary scsi_device of LUN 0. When scsi devices are added to sysfs, it calls add_dev function of all the registered class interfaces. If ses driver has been registered, ses_intf_add() of ses module will be called. This function calls scsi_device_enclosure() to check the inquiry data for EncServ bit. Since inquiry was not allocated for temporary LUN 0 scsi_device, it will cause NULL pointer exception. To fix the problem, sdev->inquiry is checked for NULL before reading it. Signed-off-by: Somasundaram Krishnasamy Signed-off-by: Babu Moger Cc: stable@kernel.org Signed-off-by: James Bottomley --- include/scsi/scsi_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index f171c65dc5a8..2d3ec5094685 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -462,7 +462,7 @@ static inline int scsi_device_qas(struct scsi_device *sdev) } static inline int scsi_device_enclosure(struct scsi_device *sdev) { - return sdev->inquiry[6] & (1<<6); + return sdev->inquiry ? (sdev->inquiry[6] & (1<<6)) : 1; } static inline int scsi_device_protection(struct scsi_device *sdev) -- cgit v1.2.3 From 5dd7ed2e811d5cd12f31fb7f0c5ad0107d494a12 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Mar 2011 04:06:01 -0700 Subject: [SCSI] target: Minor sparse warning fixes and annotations This patch addresses the majority of sparse warnings and adds proper locking annotations. It also fixes the dubious one-bit signed bitfield, for which the signed one-bit types can be 0 or -1 which can cause a problem if someone ever checks if (foo->lu_gp_assoc == 1). The current code is fine because everyone just checks zero vs non-zero. But Sparse complains about it so lets change it. The warnings look like this: include/target/target_core_base.h:228:26: error: dubious one-bit signed bitfield Signed-off-by: Dan Carpenter Signed-off-by: Fubo Chen Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- drivers/target/target_core_device.c | 1 + drivers/target/target_core_fabric_lib.c | 1 + drivers/target/target_core_pscsi.c | 3 +++ drivers/target/target_core_rd.h | 2 -- drivers/target/target_core_transport.c | 4 +--- include/target/target_core_base.h | 14 +++++++------- include/target/target_core_fabric_ops.h | 2 +- include/target/target_core_transport.h | 4 ++++ 8 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 9e182bd245f0..3fb8e32506ed 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -589,6 +589,7 @@ static void core_export_port( * Called with struct se_device->se_port_lock spinlock held. */ static void core_release_port(struct se_device *dev, struct se_port *port) + __releases(&dev->se_port_lock) __acquires(&dev->se_port_lock) { /* * Wait for any port reference for PR ALL_TG_PT=1 operation diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c index a3c695adabec..d57ad672677f 100644 --- a/drivers/target/target_core_fabric_lib.c +++ b/drivers/target/target_core_fabric_lib.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 51fd309388fc..7ff6a35f26ac 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -441,6 +441,7 @@ static struct se_device *pscsi_create_type_disk( struct pscsi_dev_virt *pdv, struct se_subsystem_dev *se_dev, struct se_hba *hba) + __releases(sh->host_lock) { struct se_device *dev; struct pscsi_hba_virt *phv = (struct pscsi_hba_virt *)pdv->pdv_se_hba->hba_ptr; @@ -488,6 +489,7 @@ static struct se_device *pscsi_create_type_rom( struct pscsi_dev_virt *pdv, struct se_subsystem_dev *se_dev, struct se_hba *hba) + __releases(sh->host_lock) { struct se_device *dev; struct pscsi_hba_virt *phv = (struct pscsi_hba_virt *)pdv->pdv_se_hba->hba_ptr; @@ -522,6 +524,7 @@ static struct se_device *pscsi_create_type_other( struct pscsi_dev_virt *pdv, struct se_subsystem_dev *se_dev, struct se_hba *hba) + __releases(sh->host_lock) { struct se_device *dev; struct pscsi_hba_virt *phv = (struct pscsi_hba_virt *)pdv->pdv_se_hba->hba_ptr; diff --git a/drivers/target/target_core_rd.h b/drivers/target/target_core_rd.h index 13badfbaf9c0..3ea19e29d8ec 100644 --- a/drivers/target/target_core_rd.h +++ b/drivers/target/target_core_rd.h @@ -14,8 +14,6 @@ #define RD_BLOCKSIZE 512 #define RD_MAX_SECTORS 1024 -extern struct kmem_cache *se_mem_cache; - /* Used in target_core_init_configfs() for virtual LUN 0 access */ int __init rd_module_init(void); void rd_module_exit(void); diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ff9ace01e27a..80df4056f9f1 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -227,8 +227,6 @@ static void transport_remove_cmd_from_queue(struct se_cmd *cmd, static int transport_set_sense_codes(struct se_cmd *cmd, u8 asc, u8 ascq); static void transport_stop_all_task_timers(struct se_cmd *cmd); -int transport_emulate_control_cdb(struct se_task *task); - int init_se_global(void) { struct se_global *global; @@ -4395,7 +4393,7 @@ out: return -1; } -extern u32 transport_calc_sg_num( +u32 transport_calc_sg_num( struct se_task *task, struct se_mem *in_se_mem, u32 task_offset) diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 0828b6c8610a..bc93b7819eba 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -239,7 +239,7 @@ struct t10_alua_lu_gp { } ____cacheline_aligned; struct t10_alua_lu_gp_member { - int lu_gp_assoc:1; + bool lu_gp_assoc; atomic_t lu_gp_mem_ref_cnt; spinlock_t lu_gp_mem_lock; struct t10_alua_lu_gp *lu_gp; @@ -271,7 +271,7 @@ struct t10_alua_tg_pt_gp { } ____cacheline_aligned; struct t10_alua_tg_pt_gp_member { - int tg_pt_gp_assoc:1; + bool tg_pt_gp_assoc; atomic_t tg_pt_gp_mem_ref_cnt; spinlock_t tg_pt_gp_mem_lock; struct t10_alua_tg_pt_gp *tg_pt_gp; @@ -336,7 +336,7 @@ struct t10_pr_registration { int pr_res_type; int pr_res_scope; /* Used for fabric initiator WWPNs using a ISID */ - int isid_present_at_reg:1; + bool isid_present_at_reg; u32 pr_res_mapped_lun; u32 pr_aptpl_target_lun; u32 pr_res_generation; @@ -418,7 +418,7 @@ struct se_transport_task { unsigned long long t_task_lba; int t_tasks_failed; int t_tasks_fua; - int t_tasks_bidi:1; + bool t_tasks_bidi; u32 t_task_cdbs; u32 t_tasks_check; u32 t_tasks_no; @@ -470,7 +470,7 @@ struct se_task { u8 task_flags; int task_error_status; int task_state_flags; - int task_padded_sg:1; + bool task_padded_sg; unsigned long long task_lba; u32 task_no; u32 task_sectors; @@ -583,7 +583,7 @@ struct se_ua { struct se_node_acl { char initiatorname[TRANSPORT_IQN_LEN]; /* Used to signal demo mode created ACL, disabled by default */ - int dynamic_node_acl:1; + bool dynamic_node_acl; u32 queue_depth; u32 acl_index; u64 num_cmds; @@ -632,7 +632,7 @@ struct se_lun_acl { } ____cacheline_aligned; struct se_dev_entry { - int def_pr_registered:1; + bool def_pr_registered; /* See transport_lunflags_table */ u32 lun_flags; u32 deve_cmds; diff --git a/include/target/target_core_fabric_ops.h b/include/target/target_core_fabric_ops.h index f3ac12b019c2..5eb8b1ae59d1 100644 --- a/include/target/target_core_fabric_ops.h +++ b/include/target/target_core_fabric_ops.h @@ -8,7 +8,7 @@ struct target_core_fabric_ops { * for scatterlist chaining using transport_do_task_sg_link(), * disabled by default */ - int task_sg_chaining:1; + bool task_sg_chaining; char *(*get_fabric_name)(void); u8 (*get_fabric_proto_ident)(struct se_portal_group *); char *(*tpg_get_wwn)(struct se_portal_group *); diff --git a/include/target/target_core_transport.h b/include/target/target_core_transport.h index 2e8ec51f0615..59aa464f6ee2 100644 --- a/include/target/target_core_transport.h +++ b/include/target/target_core_transport.h @@ -109,6 +109,8 @@ struct se_mem; struct se_subsystem_api; +extern struct kmem_cache *se_mem_cache; + extern int init_se_global(void); extern void release_se_global(void); extern void init_scsi_index_table(void); @@ -190,6 +192,8 @@ extern void transport_generic_process_write(struct se_cmd *); extern int transport_generic_do_tmr(struct se_cmd *); /* From target_core_alua.c */ extern int core_alua_check_nonop_delay(struct se_cmd *); +/* From target_core_cdb.c */ +extern int transport_emulate_control_cdb(struct se_task *); /* * Each se_transport_task_t can have N number of possible struct se_task's -- cgit v1.2.3 From 35ce9e26d7e0674892f77a9172c898dfcba50064 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 14 Mar 2011 04:06:02 -0700 Subject: [SCSI] target: Remove spurious double cast from structure macro accessors Reported-by: Fubo Chen Cc: James Bottomley Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- include/target/target_core_base.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index bc93b7819eba..b3a62e451313 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -494,8 +494,8 @@ struct se_task { struct list_head t_state_list; } ____cacheline_aligned; -#define TASK_CMD(task) ((struct se_cmd *)task->task_se_cmd) -#define TASK_DEV(task) ((struct se_device *)task->se_dev) +#define TASK_CMD(task) ((task)->task_se_cmd) +#define TASK_DEV(task) ((task)->se_dev) struct se_cmd { /* SAM response code being sent to initiator */ @@ -551,8 +551,8 @@ struct se_cmd { void (*transport_complete_callback)(struct se_cmd *); } ____cacheline_aligned; -#define T_TASK(cmd) ((struct se_transport_task *)(cmd->t_task)) -#define CMD_TFO(cmd) ((struct target_core_fabric_ops *)cmd->se_tfo) +#define T_TASK(cmd) ((cmd)->t_task) +#define CMD_TFO(cmd) ((cmd)->se_tfo) struct se_tmr_req { /* Task Management function to be preformed */ @@ -615,8 +615,8 @@ struct se_session { struct list_head sess_acl_list; } ____cacheline_aligned; -#define SE_SESS(cmd) ((struct se_session *)(cmd)->se_sess) -#define SE_NODE_ACL(sess) ((struct se_node_acl *)(sess)->se_node_acl) +#define SE_SESS(cmd) ((cmd)->se_sess) +#define SE_NODE_ACL(sess) ((sess)->se_node_acl) struct se_device; struct se_transform_info; @@ -803,8 +803,8 @@ struct se_device { struct list_head g_se_dev_list; } ____cacheline_aligned; -#define SE_DEV(cmd) ((struct se_device *)(cmd)->se_lun->lun_se_dev) -#define SU_DEV(dev) ((struct se_subsystem_dev *)(dev)->se_sub_dev) +#define SE_DEV(cmd) ((cmd)->se_lun->lun_se_dev) +#define SU_DEV(dev) ((dev)->se_sub_dev) #define DEV_ATTRIB(dev) (&(dev)->se_sub_dev->se_dev_attrib) #define DEV_T10_WWN(dev) (&(dev)->se_sub_dev->t10_wwn) @@ -832,7 +832,7 @@ struct se_hba { struct se_subsystem_api *transport; } ____cacheline_aligned; -#define SE_HBA(d) ((struct se_hba *)(d)->se_hba) +#define SE_HBA(dev) ((dev)->se_hba) struct se_lun { /* See transport_lun_status_table */ @@ -852,7 +852,7 @@ struct se_lun { struct se_port *lun_sep; } ____cacheline_aligned; -#define SE_LUN(c) ((struct se_lun *)(c)->se_lun) +#define SE_LUN(cmd) ((cmd)->se_lun) struct scsi_port_stats { u64 cmd_pdus; @@ -919,7 +919,7 @@ struct se_portal_group { struct config_group tpg_param_group; } ____cacheline_aligned; -#define TPG_TFO(se_tpg) ((struct target_core_fabric_ops *)(se_tpg)->se_tpg_tfo) +#define TPG_TFO(se_tpg) ((se_tpg)->se_tpg_tfo) struct se_wwn { struct target_fabric_configfs *wwn_tf; -- cgit v1.2.3 From 5c6cd613196558ba50ba97268b6d225c8d2f56d6 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 14 Mar 2011 04:06:04 -0700 Subject: [SCSI] target: Convert TMR REQ/RSP definitions to target namespace This patch changes include/target/target_core_tmr.h code to use target specific 'TMR_*' prefixed definitions for fabric independent SCSI Task Management Request/Request naming in include/scsi/scsi.h definitions for mainline target code. Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- drivers/target/target_core_transport.c | 19 +++++-------- include/target/target_core_tmr.h | 52 ++++++++++++++-------------------- 2 files changed, 29 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 80df4056f9f1..20ab27cb3e45 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -5832,31 +5832,26 @@ int transport_generic_do_tmr(struct se_cmd *cmd) int ret; switch (tmr->function) { - case ABORT_TASK: + case TMR_ABORT_TASK: ref_cmd = tmr->ref_cmd; tmr->response = TMR_FUNCTION_REJECTED; break; - case ABORT_TASK_SET: - case CLEAR_ACA: - case CLEAR_TASK_SET: + case TMR_ABORT_TASK_SET: + case TMR_CLEAR_ACA: + case TMR_CLEAR_TASK_SET: tmr->response = TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED; break; - case LUN_RESET: + case TMR_LUN_RESET: ret = core_tmr_lun_reset(dev, tmr, NULL, NULL); tmr->response = (!ret) ? TMR_FUNCTION_COMPLETE : TMR_FUNCTION_REJECTED; break; -#if 0 - case TARGET_WARM_RESET: - transport_generic_host_reset(dev->se_hba); + case TMR_TARGET_WARM_RESET: tmr->response = TMR_FUNCTION_REJECTED; break; - case TARGET_COLD_RESET: - transport_generic_host_reset(dev->se_hba); - transport_generic_cold_reset(dev->se_hba); + case TMR_TARGET_COLD_RESET: tmr->response = TMR_FUNCTION_REJECTED; break; -#endif default: printk(KERN_ERR "Uknown TMR function: 0x%02x.\n", tmr->function); diff --git a/include/target/target_core_tmr.h b/include/target/target_core_tmr.h index 6c8248bc2c66..bd5596807478 100644 --- a/include/target/target_core_tmr.h +++ b/include/target/target_core_tmr.h @@ -1,37 +1,29 @@ #ifndef TARGET_CORE_TMR_H #define TARGET_CORE_TMR_H -/* task management function values */ -#ifdef ABORT_TASK -#undef ABORT_TASK -#endif /* ABORT_TASK */ -#define ABORT_TASK 1 -#ifdef ABORT_TASK_SET -#undef ABORT_TASK_SET -#endif /* ABORT_TASK_SET */ -#define ABORT_TASK_SET 2 -#ifdef CLEAR_ACA -#undef CLEAR_ACA -#endif /* CLEAR_ACA */ -#define CLEAR_ACA 3 -#ifdef CLEAR_TASK_SET -#undef CLEAR_TASK_SET -#endif /* CLEAR_TASK_SET */ -#define CLEAR_TASK_SET 4 -#define LUN_RESET 5 -#define TARGET_WARM_RESET 6 -#define TARGET_COLD_RESET 7 -#define TASK_REASSIGN 8 +/* fabric independent task management function values */ +enum tcm_tmreq_table { + TMR_ABORT_TASK = 1, + TMR_ABORT_TASK_SET = 2, + TMR_CLEAR_ACA = 3, + TMR_CLEAR_TASK_SET = 4, + TMR_LUN_RESET = 5, + TMR_TARGET_WARM_RESET = 6, + TMR_TARGET_COLD_RESET = 7, + TMR_FABRIC_TMR = 255, +}; -/* task management response values */ -#define TMR_FUNCTION_COMPLETE 0 -#define TMR_TASK_DOES_NOT_EXIST 1 -#define TMR_LUN_DOES_NOT_EXIST 2 -#define TMR_TASK_STILL_ALLEGIANT 3 -#define TMR_TASK_FAILOVER_NOT_SUPPORTED 4 -#define TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED 5 -#define TMR_FUNCTION_AUTHORIZATION_FAILED 6 -#define TMR_FUNCTION_REJECTED 255 +/* fabric independent task management response values */ +enum tcm_tmrsp_table { + TMR_FUNCTION_COMPLETE = 0, + TMR_TASK_DOES_NOT_EXIST = 1, + TMR_LUN_DOES_NOT_EXIST = 2, + TMR_TASK_STILL_ALLEGIANT = 3, + TMR_TASK_FAILOVER_NOT_SUPPORTED = 4, + TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED = 5, + TMR_FUNCTION_AUTHORIZATION_FAILED = 6, + TMR_FUNCTION_REJECTED = 255, +}; extern struct kmem_cache *se_tmr_req_cache; -- cgit v1.2.3 From 15fb48cc40be170423fe8ddd17666aa6175315e3 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 14 Mar 2011 04:06:10 -0700 Subject: [SCSI] target: update version to v4.0.0-rc7-ml Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- include/target/target_core_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index b3a62e451313..79f2e0a30dc8 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -9,7 +9,7 @@ #include #include -#define TARGET_CORE_MOD_VERSION "v4.0.0-rc6" +#define TARGET_CORE_MOD_VERSION "v4.0.0-rc7-ml" #define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGABRT)) /* Used by transport_generic_allocate_iovecs() */ -- cgit v1.2.3 From 12d233842987d9972957419e427987b94f7bd7b4 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 14 Mar 2011 04:06:11 -0700 Subject: [SCSI] target: add initial statistics This patch adds a target_core_mib.c statistics conversion for backend context struct se_subsystem_dev + struct se_device config_group based statistics in target_core_device.c using CONFIGFS_EATTR() based struct config_item_types from target_core_stat.c code. The conversion from backend /proc/scsi_target/mib/ context output to configfs default groups+attributes include scsi_dev, scsi_lu, and scsi_tgt_dev output from within individual: /sys/kernel/config/target/core/$HBA/DEV/ The legacy procfs output now appear as individual configfs attributes under: *) $HBA/$DEV/statistics/scsi_dev: |-- indx |-- inst |-- ports `-- role *) $HBA/$DEV/statistics/scsi_lu: |-- creation_time |-- dev |-- dev_type |-- full_stat |-- hs_num_cmds |-- indx |-- inst |-- lu_name |-- lun |-- num_cmds |-- prod |-- read_mbytes |-- resets |-- rev |-- state_bit |-- status |-- vend `-- write_mbytes *) $HBA/$DEV/statistics/scsi_tgt_dev: |-- indx |-- inst |-- non_access_lus |-- num_lus |-- resets `-- status The conversion from backend /proc/scsi_target/mib/ context output to configfs default groups+attributes include scsi_port, scsi_tgt_port and scsi_transport output from within individual: /sys/kernel/config/target/fabric/$WWN/tpgt_$TPGT/lun/lun_$LUN_ID/statistics/ The legacy procfs output now appear as individual configfs attributes under: *) fabric/$WWN/tpgt_$TPGT/lun/lun_$LUN_ID/statistics/scsi_port |-- busy_count |-- dev |-- indx |-- inst `-- role *) fabric/$WWN/tpgt_$TPGT/lun/lun_$LUN_ID/statistics/scsi_tgt_port |-- dev |-- hs_in_cmds |-- in_cmds |-- indx |-- inst |-- name |-- port_index |-- read_mbytes `-- write_mbytes *) fabric/$WWN/tpgt_$TPGT/lun/lun_$LUN_ID/statistics/scsi_transport |-- dev_name |-- device |-- indx `-- inst The conversion from backend /proc/scsi_target/mib/ context output to configfs default groups+attributes include scsi_att_intr_port and scsi_auth_intr output from within individual: /sys/kernel/config/target/fabric/$WWN/tpgt_$TPGT/acls/$INITIATOR_WWN/lun_$LUN_ID/statistics/ The legacy procfs output now appear as individual configfs attributes under: *) acls/$INITIATOR_WWN/lun_$LUN_ID/statistics/scsi_att_intr_port |-- dev |-- indx |-- inst |-- port |-- port_auth_indx `-- port_ident *) acls/$INITIATOR_WWN/lun_$LUN_ID/statistics/scsi_auth_intr |-- att_count |-- creation_time |-- dev |-- dev_or_port |-- hs_num_cmds |-- indx |-- inst |-- intr_name |-- map_indx |-- num_cmds |-- port |-- read_mbytes |-- row_status `-- write_mbytes Also, this includes adding struct target_fabric_configfs_template-> tfc_wwn_fabric_stats_cit and ->tfc_tpg_nacl_stat_cit respectively for use during target_core_fabric_configfs.c:target_fabric_setup_cits() Signed-off-by: Nicholas A. Bellinger Signed-off-by: James Bottomley --- drivers/target/Makefile | 3 +- drivers/target/target_core_configfs.c | 76 +- drivers/target/target_core_fabric_configfs.c | 209 ++- drivers/target/target_core_stat.c | 1810 ++++++++++++++++++++++++++ drivers/target/target_core_stat.h | 8 + include/target/target_core_base.h | 35 +- include/target/target_core_configfs.h | 4 + 7 files changed, 2127 insertions(+), 18 deletions(-) create mode 100644 drivers/target/target_core_stat.c create mode 100644 drivers/target/target_core_stat.h (limited to 'include') diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 050ee6569b2b..4f3cb788f06a 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -12,7 +12,8 @@ target_core_mod-y := target_core_configfs.o \ target_core_transport.o \ target_core_cdb.o \ target_core_ua.o \ - target_core_rd.o + target_core_rd.o \ + target_core_stat.o obj-$(CONFIG_TARGET_CORE) += target_core_mod.o diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index a38fec4acceb..a5f44a6e6e1d 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -3,8 +3,8 @@ * * This file contains ConfigFS logic for the Generic Target Engine project. * - * Copyright (c) 2008-2010 Rising Tide Systems - * Copyright (c) 2008-2010 Linux-iSCSI.org + * Copyright (c) 2008-2011 Rising Tide Systems + * Copyright (c) 2008-2011 Linux-iSCSI.org * * Nicholas A. Bellinger * @@ -50,6 +50,7 @@ #include "target_core_hba.h" #include "target_core_pr.h" #include "target_core_rd.h" +#include "target_core_stat.h" static struct list_head g_tf_list; static struct mutex g_tf_lock; @@ -2709,6 +2710,34 @@ static struct config_item_type target_core_alua_cit = { /* End functions for struct config_item_type target_core_alua_cit */ +/* Start functions for struct config_item_type target_core_stat_cit */ + +static struct config_group *target_core_stat_mkdir( + struct config_group *group, + const char *name) +{ + return ERR_PTR(-ENOSYS); +} + +static void target_core_stat_rmdir( + struct config_group *group, + struct config_item *item) +{ + return; +} + +static struct configfs_group_operations target_core_stat_group_ops = { + .make_group = &target_core_stat_mkdir, + .drop_item = &target_core_stat_rmdir, +}; + +static struct config_item_type target_core_stat_cit = { + .ct_group_ops = &target_core_stat_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* End functions for struct config_item_type target_core_stat_cit */ + /* Start functions for struct config_item_type target_core_hba_cit */ static struct config_group *target_core_make_subdev( @@ -2721,10 +2750,12 @@ static struct config_group *target_core_make_subdev( struct config_item *hba_ci = &group->cg_item; struct se_hba *hba = item_to_hba(hba_ci); struct config_group *dev_cg = NULL, *tg_pt_gp_cg = NULL; + struct config_group *dev_stat_grp = NULL; + int errno = -ENOMEM, ret; - if (mutex_lock_interruptible(&hba->hba_access_mutex)) - return NULL; - + ret = mutex_lock_interruptible(&hba->hba_access_mutex); + if (ret) + return ERR_PTR(ret); /* * Locate the struct se_subsystem_api from parent's struct se_hba. */ @@ -2754,7 +2785,7 @@ static struct config_group *target_core_make_subdev( se_dev->se_dev_hba = hba; dev_cg = &se_dev->se_dev_group; - dev_cg->default_groups = kzalloc(sizeof(struct config_group) * 6, + dev_cg->default_groups = kzalloc(sizeof(struct config_group) * 7, GFP_KERNEL); if (!(dev_cg->default_groups)) goto out; @@ -2786,13 +2817,17 @@ static struct config_group *target_core_make_subdev( &target_core_dev_wwn_cit); config_group_init_type_name(&se_dev->t10_alua.alua_tg_pt_gps_group, "alua", &target_core_alua_tg_pt_gps_cit); + config_group_init_type_name(&se_dev->dev_stat_grps.stat_group, + "statistics", &target_core_stat_cit); + dev_cg->default_groups[0] = &se_dev->se_dev_attrib.da_group; dev_cg->default_groups[1] = &se_dev->se_dev_pr_group; dev_cg->default_groups[2] = &se_dev->t10_wwn.t10_wwn_group; dev_cg->default_groups[3] = &se_dev->t10_alua.alua_tg_pt_gps_group; - dev_cg->default_groups[4] = NULL; + dev_cg->default_groups[4] = &se_dev->dev_stat_grps.stat_group; + dev_cg->default_groups[5] = NULL; /* - * Add core/$HBA/$DEV/alua/tg_pt_gps/default_tg_pt_gp + * Add core/$HBA/$DEV/alua/default_tg_pt_gp */ tg_pt_gp = core_alua_allocate_tg_pt_gp(se_dev, "default_tg_pt_gp", 1); if (!(tg_pt_gp)) @@ -2812,6 +2847,17 @@ static struct config_group *target_core_make_subdev( tg_pt_gp_cg->default_groups[0] = &tg_pt_gp->tg_pt_gp_group; tg_pt_gp_cg->default_groups[1] = NULL; T10_ALUA(se_dev)->default_tg_pt_gp = tg_pt_gp; + /* + * Add core/$HBA/$DEV/statistics/ default groups + */ + dev_stat_grp = &DEV_STAT_GRP(se_dev)->stat_group; + dev_stat_grp->default_groups = kzalloc(sizeof(struct config_group) * 4, + GFP_KERNEL); + if (!dev_stat_grp->default_groups) { + printk(KERN_ERR "Unable to allocate dev_stat_grp->default_groups\n"); + goto out; + } + target_stat_setup_dev_default_groups(se_dev); printk(KERN_INFO "Target_Core_ConfigFS: Allocated struct se_subsystem_dev:" " %p se_dev_su_ptr: %p\n", se_dev, se_dev->se_dev_su_ptr); @@ -2823,6 +2869,8 @@ out: core_alua_free_tg_pt_gp(T10_ALUA(se_dev)->default_tg_pt_gp); T10_ALUA(se_dev)->default_tg_pt_gp = NULL; } + if (dev_stat_grp) + kfree(dev_stat_grp->default_groups); if (tg_pt_gp_cg) kfree(tg_pt_gp_cg->default_groups); if (dev_cg) @@ -2832,7 +2880,7 @@ out: kfree(se_dev); unlock: mutex_unlock(&hba->hba_access_mutex); - return NULL; + return ERR_PTR(errno); } static void target_core_drop_subdev( @@ -2844,7 +2892,7 @@ static void target_core_drop_subdev( struct se_hba *hba; struct se_subsystem_api *t; struct config_item *df_item; - struct config_group *dev_cg, *tg_pt_gp_cg; + struct config_group *dev_cg, *tg_pt_gp_cg, *dev_stat_grp; int i; hba = item_to_hba(&se_dev->se_dev_hba->hba_group.cg_item); @@ -2856,6 +2904,14 @@ static void target_core_drop_subdev( list_del(&se_dev->g_se_dev_list); spin_unlock(&se_global->g_device_lock); + dev_stat_grp = &DEV_STAT_GRP(se_dev)->stat_group; + for (i = 0; dev_stat_grp->default_groups[i]; i++) { + df_item = &dev_stat_grp->default_groups[i]->cg_item; + dev_stat_grp->default_groups[i] = NULL; + config_item_put(df_item); + } + kfree(dev_stat_grp->default_groups); + tg_pt_gp_cg = &T10_ALUA(se_dev)->alua_tg_pt_gps_group; for (i = 0; tg_pt_gp_cg->default_groups[i]; i++) { df_item = &tg_pt_gp_cg->default_groups[i]->cg_item; diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index b65d1c8e7740..07ab5a3bb8e8 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -4,10 +4,10 @@ * This file contains generic fabric module configfs infrastructure for * TCM v4.x code * - * Copyright (c) 2010 Rising Tide Systems - * Copyright (c) 2010 Linux-iSCSI.org + * Copyright (c) 2010,2011 Rising Tide Systems + * Copyright (c) 2010,2011 Linux-iSCSI.org * - * Copyright (c) 2010 Nicholas A. Bellinger + * Copyright (c) Nicholas A. Bellinger * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -48,6 +48,7 @@ #include "target_core_alua.h" #include "target_core_hba.h" #include "target_core_pr.h" +#include "target_core_stat.h" #define TF_CIT_SETUP(_name, _item_ops, _group_ops, _attrs) \ static void target_fabric_setup_##_name##_cit(struct target_fabric_configfs *tf) \ @@ -241,6 +242,32 @@ TF_CIT_SETUP(tpg_mappedlun, &target_fabric_mappedlun_item_ops, NULL, /* End of tfc_tpg_mappedlun_cit */ +/* Start of tfc_tpg_mappedlun_port_cit */ + +static struct config_group *target_core_mappedlun_stat_mkdir( + struct config_group *group, + const char *name) +{ + return ERR_PTR(-ENOSYS); +} + +static void target_core_mappedlun_stat_rmdir( + struct config_group *group, + struct config_item *item) +{ + return; +} + +static struct configfs_group_operations target_fabric_mappedlun_stat_group_ops = { + .make_group = target_core_mappedlun_stat_mkdir, + .drop_item = target_core_mappedlun_stat_rmdir, +}; + +TF_CIT_SETUP(tpg_mappedlun_stat, NULL, &target_fabric_mappedlun_stat_group_ops, + NULL); + +/* End of tfc_tpg_mappedlun_port_cit */ + /* Start of tfc_tpg_nacl_attrib_cit */ CONFIGFS_EATTR_OPS(target_fabric_nacl_attrib, se_node_acl, acl_attrib_group); @@ -294,6 +321,7 @@ static struct config_group *target_fabric_make_mappedlun( struct target_fabric_configfs *tf = se_tpg->se_tpg_wwn->wwn_tf; struct se_lun_acl *lacl; struct config_item *acl_ci; + struct config_group *lacl_cg = NULL, *ml_stat_grp = NULL; char *buf; unsigned long mapped_lun; int ret = 0; @@ -330,15 +358,42 @@ static struct config_group *target_fabric_make_mappedlun( lacl = core_dev_init_initiator_node_lun_acl(se_tpg, mapped_lun, config_item_name(acl_ci), &ret); - if (!(lacl)) + if (!(lacl)) { + ret = -EINVAL; goto out; + } + + lacl_cg = &lacl->se_lun_group; + lacl_cg->default_groups = kzalloc(sizeof(struct config_group) * 2, + GFP_KERNEL); + if (!lacl_cg->default_groups) { + printk(KERN_ERR "Unable to allocate lacl_cg->default_groups\n"); + ret = -ENOMEM; + goto out; + } config_group_init_type_name(&lacl->se_lun_group, name, &TF_CIT_TMPL(tf)->tfc_tpg_mappedlun_cit); + config_group_init_type_name(&lacl->ml_stat_grps.stat_group, + "statistics", &TF_CIT_TMPL(tf)->tfc_tpg_mappedlun_stat_cit); + lacl_cg->default_groups[0] = &lacl->ml_stat_grps.stat_group; + lacl_cg->default_groups[1] = NULL; + + ml_stat_grp = &ML_STAT_GRPS(lacl)->stat_group; + ml_stat_grp->default_groups = kzalloc(sizeof(struct config_group) * 3, + GFP_KERNEL); + if (!ml_stat_grp->default_groups) { + printk(KERN_ERR "Unable to allocate ml_stat_grp->default_groups\n"); + ret = -ENOMEM; + goto out; + } + target_stat_setup_mappedlun_default_groups(lacl); kfree(buf); return &lacl->se_lun_group; out: + if (lacl_cg) + kfree(lacl_cg->default_groups); kfree(buf); return ERR_PTR(ret); } @@ -347,6 +402,28 @@ static void target_fabric_drop_mappedlun( struct config_group *group, struct config_item *item) { + struct se_lun_acl *lacl = container_of(to_config_group(item), + struct se_lun_acl, se_lun_group); + struct config_item *df_item; + struct config_group *lacl_cg = NULL, *ml_stat_grp = NULL; + int i; + + ml_stat_grp = &ML_STAT_GRPS(lacl)->stat_group; + for (i = 0; ml_stat_grp->default_groups[i]; i++) { + df_item = &ml_stat_grp->default_groups[i]->cg_item; + ml_stat_grp->default_groups[i] = NULL; + config_item_put(df_item); + } + kfree(ml_stat_grp->default_groups); + + lacl_cg = &lacl->se_lun_group; + for (i = 0; lacl_cg->default_groups[i]; i++) { + df_item = &lacl_cg->default_groups[i]->cg_item; + lacl_cg->default_groups[i] = NULL; + config_item_put(df_item); + } + kfree(lacl_cg->default_groups); + config_item_put(item); } @@ -376,6 +453,15 @@ TF_CIT_SETUP(tpg_nacl_base, &target_fabric_nacl_base_item_ops, /* End of tfc_tpg_nacl_base_cit */ +/* Start of tfc_node_fabric_stats_cit */ +/* + * This is used as a placeholder for struct se_node_acl->acl_fabric_stat_group + * to allow fabrics access to ->acl_fabric_stat_group->default_groups[] + */ +TF_CIT_SETUP(tpg_nacl_stat, NULL, NULL, NULL); + +/* End of tfc_wwn_fabric_stats_cit */ + /* Start of tfc_tpg_nacl_cit */ static struct config_group *target_fabric_make_nodeacl( @@ -402,7 +488,8 @@ static struct config_group *target_fabric_make_nodeacl( nacl_cg->default_groups[0] = &se_nacl->acl_attrib_group; nacl_cg->default_groups[1] = &se_nacl->acl_auth_group; nacl_cg->default_groups[2] = &se_nacl->acl_param_group; - nacl_cg->default_groups[3] = NULL; + nacl_cg->default_groups[3] = &se_nacl->acl_fabric_stat_group; + nacl_cg->default_groups[4] = NULL; config_group_init_type_name(&se_nacl->acl_group, name, &TF_CIT_TMPL(tf)->tfc_tpg_nacl_base_cit); @@ -412,6 +499,9 @@ static struct config_group *target_fabric_make_nodeacl( &TF_CIT_TMPL(tf)->tfc_tpg_nacl_auth_cit); config_group_init_type_name(&se_nacl->acl_param_group, "param", &TF_CIT_TMPL(tf)->tfc_tpg_nacl_param_cit); + config_group_init_type_name(&se_nacl->acl_fabric_stat_group, + "fabric_statistics", + &TF_CIT_TMPL(tf)->tfc_tpg_nacl_stat_cit); return &se_nacl->acl_group; } @@ -758,6 +848,31 @@ TF_CIT_SETUP(tpg_port, &target_fabric_port_item_ops, NULL, target_fabric_port_at /* End of tfc_tpg_port_cit */ +/* Start of tfc_tpg_port_stat_cit */ + +static struct config_group *target_core_port_stat_mkdir( + struct config_group *group, + const char *name) +{ + return ERR_PTR(-ENOSYS); +} + +static void target_core_port_stat_rmdir( + struct config_group *group, + struct config_item *item) +{ + return; +} + +static struct configfs_group_operations target_fabric_port_stat_group_ops = { + .make_group = target_core_port_stat_mkdir, + .drop_item = target_core_port_stat_rmdir, +}; + +TF_CIT_SETUP(tpg_port_stat, NULL, &target_fabric_port_stat_group_ops, NULL); + +/* End of tfc_tpg_port_stat_cit */ + /* Start of tfc_tpg_lun_cit */ static struct config_group *target_fabric_make_lun( @@ -768,7 +883,9 @@ static struct config_group *target_fabric_make_lun( struct se_portal_group *se_tpg = container_of(group, struct se_portal_group, tpg_lun_group); struct target_fabric_configfs *tf = se_tpg->se_tpg_wwn->wwn_tf; + struct config_group *lun_cg = NULL, *port_stat_grp = NULL; unsigned long unpacked_lun; + int errno; if (strstr(name, "lun_") != name) { printk(KERN_ERR "Unable to locate \'_\" in" @@ -782,16 +899,64 @@ static struct config_group *target_fabric_make_lun( if (!(lun)) return ERR_PTR(-EINVAL); + lun_cg = &lun->lun_group; + lun_cg->default_groups = kzalloc(sizeof(struct config_group) * 2, + GFP_KERNEL); + if (!lun_cg->default_groups) { + printk(KERN_ERR "Unable to allocate lun_cg->default_groups\n"); + return ERR_PTR(-ENOMEM); + } + config_group_init_type_name(&lun->lun_group, name, &TF_CIT_TMPL(tf)->tfc_tpg_port_cit); + config_group_init_type_name(&lun->port_stat_grps.stat_group, + "statistics", &TF_CIT_TMPL(tf)->tfc_tpg_port_stat_cit); + lun_cg->default_groups[0] = &lun->port_stat_grps.stat_group; + lun_cg->default_groups[1] = NULL; + + port_stat_grp = &PORT_STAT_GRP(lun)->stat_group; + port_stat_grp->default_groups = kzalloc(sizeof(struct config_group) * 3, + GFP_KERNEL); + if (!port_stat_grp->default_groups) { + printk(KERN_ERR "Unable to allocate port_stat_grp->default_groups\n"); + errno = -ENOMEM; + goto out; + } + target_stat_setup_port_default_groups(lun); return &lun->lun_group; +out: + if (lun_cg) + kfree(lun_cg->default_groups); + return ERR_PTR(errno); } static void target_fabric_drop_lun( struct config_group *group, struct config_item *item) { + struct se_lun *lun = container_of(to_config_group(item), + struct se_lun, lun_group); + struct config_item *df_item; + struct config_group *lun_cg, *port_stat_grp; + int i; + + port_stat_grp = &PORT_STAT_GRP(lun)->stat_group; + for (i = 0; port_stat_grp->default_groups[i]; i++) { + df_item = &port_stat_grp->default_groups[i]->cg_item; + port_stat_grp->default_groups[i] = NULL; + config_item_put(df_item); + } + kfree(port_stat_grp->default_groups); + + lun_cg = &lun->lun_group; + for (i = 0; lun_cg->default_groups[i]; i++) { + df_item = &lun_cg->default_groups[i]->cg_item; + lun_cg->default_groups[i] = NULL; + config_item_put(df_item); + } + kfree(lun_cg->default_groups); + config_item_put(item); } @@ -946,6 +1111,15 @@ TF_CIT_SETUP(tpg, &target_fabric_tpg_item_ops, &target_fabric_tpg_group_ops, /* End of tfc_tpg_cit */ +/* Start of tfc_wwn_fabric_stats_cit */ +/* + * This is used as a placeholder for struct se_wwn->fabric_stat_group + * to allow fabrics access to ->fabric_stat_group->default_groups[] + */ +TF_CIT_SETUP(wwn_fabric_stats, NULL, NULL, NULL); + +/* End of tfc_wwn_fabric_stats_cit */ + /* Start of tfc_wwn_cit */ static struct config_group *target_fabric_make_wwn( @@ -966,8 +1140,17 @@ static struct config_group *target_fabric_make_wwn( return ERR_PTR(-EINVAL); wwn->wwn_tf = tf; + /* + * Setup default groups from pre-allocated wwn->wwn_default_groups + */ + wwn->wwn_group.default_groups = wwn->wwn_default_groups; + wwn->wwn_group.default_groups[0] = &wwn->fabric_stat_group; + wwn->wwn_group.default_groups[1] = NULL; + config_group_init_type_name(&wwn->wwn_group, name, &TF_CIT_TMPL(tf)->tfc_tpg_cit); + config_group_init_type_name(&wwn->fabric_stat_group, "fabric_statistics", + &TF_CIT_TMPL(tf)->tfc_wwn_fabric_stats_cit); return &wwn->wwn_group; } @@ -976,6 +1159,18 @@ static void target_fabric_drop_wwn( struct config_group *group, struct config_item *item) { + struct se_wwn *wwn = container_of(to_config_group(item), + struct se_wwn, wwn_group); + struct config_item *df_item; + struct config_group *cg = &wwn->wwn_group; + int i; + + for (i = 0; cg->default_groups[i]; i++) { + df_item = &cg->default_groups[i]->cg_item; + cg->default_groups[i] = NULL; + config_item_put(df_item); + } + config_item_put(item); } @@ -1015,9 +1210,11 @@ int target_fabric_setup_cits(struct target_fabric_configfs *tf) { target_fabric_setup_discovery_cit(tf); target_fabric_setup_wwn_cit(tf); + target_fabric_setup_wwn_fabric_stats_cit(tf); target_fabric_setup_tpg_cit(tf); target_fabric_setup_tpg_base_cit(tf); target_fabric_setup_tpg_port_cit(tf); + target_fabric_setup_tpg_port_stat_cit(tf); target_fabric_setup_tpg_lun_cit(tf); target_fabric_setup_tpg_np_cit(tf); target_fabric_setup_tpg_np_base_cit(tf); @@ -1028,7 +1225,9 @@ int target_fabric_setup_cits(struct target_fabric_configfs *tf) target_fabric_setup_tpg_nacl_attrib_cit(tf); target_fabric_setup_tpg_nacl_auth_cit(tf); target_fabric_setup_tpg_nacl_param_cit(tf); + target_fabric_setup_tpg_nacl_stat_cit(tf); target_fabric_setup_tpg_mappedlun_cit(tf); + target_fabric_setup_tpg_mappedlun_stat_cit(tf); return 0; } diff --git a/drivers/target/target_core_stat.c b/drivers/target/target_core_stat.c new file mode 100644 index 000000000000..5e3a067a7475 --- /dev/null +++ b/drivers/target/target_core_stat.c @@ -0,0 +1,1810 @@ +/******************************************************************************* + * Filename: target_core_stat.c + * + * Copyright (c) 2011 Rising Tide Systems + * Copyright (c) 2011 Linux-iSCSI.org + * + * Modern ConfigFS group context specific statistics based on original + * target_core_mib.c code + * + * Copyright (c) 2006-2007 SBE, Inc. All Rights Reserved. + * + * Nicholas A. Bellinger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "target_core_hba.h" + +#ifndef INITIAL_JIFFIES +#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) +#endif + +#define NONE "None" +#define ISPRINT(a) ((a >= ' ') && (a <= '~')) + +#define SCSI_LU_INDEX 1 +#define LU_COUNT 1 + +/* + * SCSI Device Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_dev, se_dev_stat_grps); +#define DEV_STAT_SCSI_DEV_ATTR(_name, _mode) \ +static struct target_stat_scsi_dev_attribute \ + target_stat_scsi_dev_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_dev_show_attr_##_name, \ + target_stat_scsi_dev_store_attr_##_name); + +#define DEV_STAT_SCSI_DEV_ATTR_RO(_name) \ +static struct target_stat_scsi_dev_attribute \ + target_stat_scsi_dev_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_dev_show_attr_##_name); + +static ssize_t target_stat_scsi_dev_show_attr_inst( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_hba *hba = se_subdev->se_dev_hba; + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); +} +DEV_STAT_SCSI_DEV_ATTR_RO(inst); + +static ssize_t target_stat_scsi_dev_show_attr_indx( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", dev->dev_index); +} +DEV_STAT_SCSI_DEV_ATTR_RO(indx); + +static ssize_t target_stat_scsi_dev_show_attr_role( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "Target\n"); +} +DEV_STAT_SCSI_DEV_ATTR_RO(role); + +static ssize_t target_stat_scsi_dev_show_attr_ports( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", dev->dev_port_count); +} +DEV_STAT_SCSI_DEV_ATTR_RO(ports); + +CONFIGFS_EATTR_OPS(target_stat_scsi_dev, se_dev_stat_grps, scsi_dev_group); + +static struct configfs_attribute *target_stat_scsi_dev_attrs[] = { + &target_stat_scsi_dev_inst.attr, + &target_stat_scsi_dev_indx.attr, + &target_stat_scsi_dev_role.attr, + &target_stat_scsi_dev_ports.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_dev_attrib_ops = { + .show_attribute = target_stat_scsi_dev_attr_show, + .store_attribute = target_stat_scsi_dev_attr_store, +}; + +static struct config_item_type target_stat_scsi_dev_cit = { + .ct_item_ops = &target_stat_scsi_dev_attrib_ops, + .ct_attrs = target_stat_scsi_dev_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * SCSI Target Device Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_tgt_dev, se_dev_stat_grps); +#define DEV_STAT_SCSI_TGT_DEV_ATTR(_name, _mode) \ +static struct target_stat_scsi_tgt_dev_attribute \ + target_stat_scsi_tgt_dev_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_tgt_dev_show_attr_##_name, \ + target_stat_scsi_tgt_dev_store_attr_##_name); + +#define DEV_STAT_SCSI_TGT_DEV_ATTR_RO(_name) \ +static struct target_stat_scsi_tgt_dev_attribute \ + target_stat_scsi_tgt_dev_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_tgt_dev_show_attr_##_name); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_inst( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_hba *hba = se_subdev->se_dev_hba; + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(inst); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_indx( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", dev->dev_index); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(indx); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_num_lus( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", LU_COUNT); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(num_lus); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_status( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + char status[16]; + + if (!dev) + return -ENODEV; + + switch (dev->dev_status) { + case TRANSPORT_DEVICE_ACTIVATED: + strcpy(status, "activated"); + break; + case TRANSPORT_DEVICE_DEACTIVATED: + strcpy(status, "deactivated"); + break; + case TRANSPORT_DEVICE_SHUTDOWN: + strcpy(status, "shutdown"); + break; + case TRANSPORT_DEVICE_OFFLINE_ACTIVATED: + case TRANSPORT_DEVICE_OFFLINE_DEACTIVATED: + strcpy(status, "offline"); + break; + default: + sprintf(status, "unknown(%d)", dev->dev_status); + break; + } + + return snprintf(page, PAGE_SIZE, "%s\n", status); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(status); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_non_access_lus( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + int non_accessible_lus; + + if (!dev) + return -ENODEV; + + switch (dev->dev_status) { + case TRANSPORT_DEVICE_ACTIVATED: + non_accessible_lus = 0; + break; + case TRANSPORT_DEVICE_DEACTIVATED: + case TRANSPORT_DEVICE_SHUTDOWN: + case TRANSPORT_DEVICE_OFFLINE_ACTIVATED: + case TRANSPORT_DEVICE_OFFLINE_DEACTIVATED: + default: + non_accessible_lus = 1; + break; + } + + return snprintf(page, PAGE_SIZE, "%u\n", non_accessible_lus); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(non_access_lus); + +static ssize_t target_stat_scsi_tgt_dev_show_attr_resets( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", dev->num_resets); +} +DEV_STAT_SCSI_TGT_DEV_ATTR_RO(resets); + + +CONFIGFS_EATTR_OPS(target_stat_scsi_tgt_dev, se_dev_stat_grps, scsi_tgt_dev_group); + +static struct configfs_attribute *target_stat_scsi_tgt_dev_attrs[] = { + &target_stat_scsi_tgt_dev_inst.attr, + &target_stat_scsi_tgt_dev_indx.attr, + &target_stat_scsi_tgt_dev_num_lus.attr, + &target_stat_scsi_tgt_dev_status.attr, + &target_stat_scsi_tgt_dev_non_access_lus.attr, + &target_stat_scsi_tgt_dev_resets.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_tgt_dev_attrib_ops = { + .show_attribute = target_stat_scsi_tgt_dev_attr_show, + .store_attribute = target_stat_scsi_tgt_dev_attr_store, +}; + +static struct config_item_type target_stat_scsi_tgt_dev_cit = { + .ct_item_ops = &target_stat_scsi_tgt_dev_attrib_ops, + .ct_attrs = target_stat_scsi_tgt_dev_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * SCSI Logical Unit Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_lu, se_dev_stat_grps); +#define DEV_STAT_SCSI_LU_ATTR(_name, _mode) \ +static struct target_stat_scsi_lu_attribute target_stat_scsi_lu_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_lu_show_attr_##_name, \ + target_stat_scsi_lu_store_attr_##_name); + +#define DEV_STAT_SCSI_LU_ATTR_RO(_name) \ +static struct target_stat_scsi_lu_attribute target_stat_scsi_lu_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_lu_show_attr_##_name); + +static ssize_t target_stat_scsi_lu_show_attr_inst( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_hba *hba = se_subdev->se_dev_hba; + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); +} +DEV_STAT_SCSI_LU_ATTR_RO(inst); + +static ssize_t target_stat_scsi_lu_show_attr_dev( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", dev->dev_index); +} +DEV_STAT_SCSI_LU_ATTR_RO(dev); + +static ssize_t target_stat_scsi_lu_show_attr_indx( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + return snprintf(page, PAGE_SIZE, "%u\n", SCSI_LU_INDEX); +} +DEV_STAT_SCSI_LU_ATTR_RO(indx); + +static ssize_t target_stat_scsi_lu_show_attr_lun( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + /* FIXME: scsiLuDefaultLun */ + return snprintf(page, PAGE_SIZE, "%llu\n", (unsigned long long)0); +} +DEV_STAT_SCSI_LU_ATTR_RO(lun); + +static ssize_t target_stat_scsi_lu_show_attr_lu_name( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + /* scsiLuWwnName */ + return snprintf(page, PAGE_SIZE, "%s\n", + (strlen(DEV_T10_WWN(dev)->unit_serial)) ? + (char *)&DEV_T10_WWN(dev)->unit_serial[0] : "None"); +} +DEV_STAT_SCSI_LU_ATTR_RO(lu_name); + +static ssize_t target_stat_scsi_lu_show_attr_vend( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + int j; + char str[28]; + + if (!dev) + return -ENODEV; + /* scsiLuVendorId */ + memcpy(&str[0], (void *)DEV_T10_WWN(dev), 28); + for (j = 0; j < 8; j++) + str[j] = ISPRINT(DEV_T10_WWN(dev)->vendor[j]) ? + DEV_T10_WWN(dev)->vendor[j] : 0x20; + str[8] = 0; + return snprintf(page, PAGE_SIZE, "%s\n", str); +} +DEV_STAT_SCSI_LU_ATTR_RO(vend); + +static ssize_t target_stat_scsi_lu_show_attr_prod( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + int j; + char str[28]; + + if (!dev) + return -ENODEV; + + /* scsiLuProductId */ + memcpy(&str[0], (void *)DEV_T10_WWN(dev), 28); + for (j = 0; j < 16; j++) + str[j] = ISPRINT(DEV_T10_WWN(dev)->model[j]) ? + DEV_T10_WWN(dev)->model[j] : 0x20; + str[16] = 0; + return snprintf(page, PAGE_SIZE, "%s\n", str); +} +DEV_STAT_SCSI_LU_ATTR_RO(prod); + +static ssize_t target_stat_scsi_lu_show_attr_rev( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + int j; + char str[28]; + + if (!dev) + return -ENODEV; + + /* scsiLuRevisionId */ + memcpy(&str[0], (void *)DEV_T10_WWN(dev), 28); + for (j = 0; j < 4; j++) + str[j] = ISPRINT(DEV_T10_WWN(dev)->revision[j]) ? + DEV_T10_WWN(dev)->revision[j] : 0x20; + str[4] = 0; + return snprintf(page, PAGE_SIZE, "%s\n", str); +} +DEV_STAT_SCSI_LU_ATTR_RO(rev); + +static ssize_t target_stat_scsi_lu_show_attr_dev_type( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuPeripheralType */ + return snprintf(page, PAGE_SIZE, "%u\n", + TRANSPORT(dev)->get_device_type(dev)); +} +DEV_STAT_SCSI_LU_ATTR_RO(dev_type); + +static ssize_t target_stat_scsi_lu_show_attr_status( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuStatus */ + return snprintf(page, PAGE_SIZE, "%s\n", + (dev->dev_status == TRANSPORT_DEVICE_ACTIVATED) ? + "available" : "notavailable"); +} +DEV_STAT_SCSI_LU_ATTR_RO(status); + +static ssize_t target_stat_scsi_lu_show_attr_state_bit( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuState */ + return snprintf(page, PAGE_SIZE, "exposed\n"); +} +DEV_STAT_SCSI_LU_ATTR_RO(state_bit); + +static ssize_t target_stat_scsi_lu_show_attr_num_cmds( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuNumCommands */ + return snprintf(page, PAGE_SIZE, "%llu\n", + (unsigned long long)dev->num_cmds); +} +DEV_STAT_SCSI_LU_ATTR_RO(num_cmds); + +static ssize_t target_stat_scsi_lu_show_attr_read_mbytes( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuReadMegaBytes */ + return snprintf(page, PAGE_SIZE, "%u\n", (u32)(dev->read_bytes >> 20)); +} +DEV_STAT_SCSI_LU_ATTR_RO(read_mbytes); + +static ssize_t target_stat_scsi_lu_show_attr_write_mbytes( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuWrittenMegaBytes */ + return snprintf(page, PAGE_SIZE, "%u\n", (u32)(dev->write_bytes >> 20)); +} +DEV_STAT_SCSI_LU_ATTR_RO(write_mbytes); + +static ssize_t target_stat_scsi_lu_show_attr_resets( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuInResets */ + return snprintf(page, PAGE_SIZE, "%u\n", dev->num_resets); +} +DEV_STAT_SCSI_LU_ATTR_RO(resets); + +static ssize_t target_stat_scsi_lu_show_attr_full_stat( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* FIXME: scsiLuOutTaskSetFullStatus */ + return snprintf(page, PAGE_SIZE, "%u\n", 0); +} +DEV_STAT_SCSI_LU_ATTR_RO(full_stat); + +static ssize_t target_stat_scsi_lu_show_attr_hs_num_cmds( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* FIXME: scsiLuHSInCommands */ + return snprintf(page, PAGE_SIZE, "%u\n", 0); +} +DEV_STAT_SCSI_LU_ATTR_RO(hs_num_cmds); + +static ssize_t target_stat_scsi_lu_show_attr_creation_time( + struct se_dev_stat_grps *sgrps, char *page) +{ + struct se_subsystem_dev *se_subdev = container_of(sgrps, + struct se_subsystem_dev, dev_stat_grps); + struct se_device *dev = se_subdev->se_dev_ptr; + + if (!dev) + return -ENODEV; + + /* scsiLuCreationTime */ + return snprintf(page, PAGE_SIZE, "%u\n", (u32)(((u32)dev->creation_time - + INITIAL_JIFFIES) * 100 / HZ)); +} +DEV_STAT_SCSI_LU_ATTR_RO(creation_time); + +CONFIGFS_EATTR_OPS(target_stat_scsi_lu, se_dev_stat_grps, scsi_lu_group); + +static struct configfs_attribute *target_stat_scsi_lu_attrs[] = { + &target_stat_scsi_lu_inst.attr, + &target_stat_scsi_lu_dev.attr, + &target_stat_scsi_lu_indx.attr, + &target_stat_scsi_lu_lun.attr, + &target_stat_scsi_lu_lu_name.attr, + &target_stat_scsi_lu_vend.attr, + &target_stat_scsi_lu_prod.attr, + &target_stat_scsi_lu_rev.attr, + &target_stat_scsi_lu_dev_type.attr, + &target_stat_scsi_lu_status.attr, + &target_stat_scsi_lu_state_bit.attr, + &target_stat_scsi_lu_num_cmds.attr, + &target_stat_scsi_lu_read_mbytes.attr, + &target_stat_scsi_lu_write_mbytes.attr, + &target_stat_scsi_lu_resets.attr, + &target_stat_scsi_lu_full_stat.attr, + &target_stat_scsi_lu_hs_num_cmds.attr, + &target_stat_scsi_lu_creation_time.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_lu_attrib_ops = { + .show_attribute = target_stat_scsi_lu_attr_show, + .store_attribute = target_stat_scsi_lu_attr_store, +}; + +static struct config_item_type target_stat_scsi_lu_cit = { + .ct_item_ops = &target_stat_scsi_lu_attrib_ops, + .ct_attrs = target_stat_scsi_lu_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * Called from target_core_configfs.c:target_core_make_subdev() to setup + * the target statistics groups + configfs CITs located in target_core_stat.c + */ +void target_stat_setup_dev_default_groups(struct se_subsystem_dev *se_subdev) +{ + struct config_group *dev_stat_grp = &DEV_STAT_GRP(se_subdev)->stat_group; + + config_group_init_type_name(&DEV_STAT_GRP(se_subdev)->scsi_dev_group, + "scsi_dev", &target_stat_scsi_dev_cit); + config_group_init_type_name(&DEV_STAT_GRP(se_subdev)->scsi_tgt_dev_group, + "scsi_tgt_dev", &target_stat_scsi_tgt_dev_cit); + config_group_init_type_name(&DEV_STAT_GRP(se_subdev)->scsi_lu_group, + "scsi_lu", &target_stat_scsi_lu_cit); + + dev_stat_grp->default_groups[0] = &DEV_STAT_GRP(se_subdev)->scsi_dev_group; + dev_stat_grp->default_groups[1] = &DEV_STAT_GRP(se_subdev)->scsi_tgt_dev_group; + dev_stat_grp->default_groups[2] = &DEV_STAT_GRP(se_subdev)->scsi_lu_group; + dev_stat_grp->default_groups[3] = NULL; +} + +/* + * SCSI Port Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_port, se_port_stat_grps); +#define DEV_STAT_SCSI_PORT_ATTR(_name, _mode) \ +static struct target_stat_scsi_port_attribute \ + target_stat_scsi_port_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_port_show_attr_##_name, \ + target_stat_scsi_port_store_attr_##_name); + +#define DEV_STAT_SCSI_PORT_ATTR_RO(_name) \ +static struct target_stat_scsi_port_attribute \ + target_stat_scsi_port_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_port_show_attr_##_name); + +static ssize_t target_stat_scsi_port_show_attr_inst( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_device *dev = lun->lun_se_dev; + struct se_hba *hba; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + hba = dev->se_hba; + ret = snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_PORT_ATTR_RO(inst); + +static ssize_t target_stat_scsi_port_show_attr_dev( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_device *dev = lun->lun_se_dev; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%u\n", dev->dev_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_PORT_ATTR_RO(dev); + +static ssize_t target_stat_scsi_port_show_attr_indx( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%u\n", sep->sep_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_PORT_ATTR_RO(indx); + +static ssize_t target_stat_scsi_port_show_attr_role( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_device *dev = lun->lun_se_dev; + struct se_port *sep; + ssize_t ret; + + if (!dev) + return -ENODEV; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s%u\n", "Device", dev->dev_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_PORT_ATTR_RO(role); + +static ssize_t target_stat_scsi_port_show_attr_busy_count( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + /* FIXME: scsiPortBusyStatuses */ + ret = snprintf(page, PAGE_SIZE, "%u\n", 0); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_PORT_ATTR_RO(busy_count); + +CONFIGFS_EATTR_OPS(target_stat_scsi_port, se_port_stat_grps, scsi_port_group); + +static struct configfs_attribute *target_stat_scsi_port_attrs[] = { + &target_stat_scsi_port_inst.attr, + &target_stat_scsi_port_dev.attr, + &target_stat_scsi_port_indx.attr, + &target_stat_scsi_port_role.attr, + &target_stat_scsi_port_busy_count.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_port_attrib_ops = { + .show_attribute = target_stat_scsi_port_attr_show, + .store_attribute = target_stat_scsi_port_attr_store, +}; + +static struct config_item_type target_stat_scsi_port_cit = { + .ct_item_ops = &target_stat_scsi_port_attrib_ops, + .ct_attrs = target_stat_scsi_port_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * SCSI Target Port Table + */ +CONFIGFS_EATTR_STRUCT(target_stat_scsi_tgt_port, se_port_stat_grps); +#define DEV_STAT_SCSI_TGT_PORT_ATTR(_name, _mode) \ +static struct target_stat_scsi_tgt_port_attribute \ + target_stat_scsi_tgt_port_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_tgt_port_show_attr_##_name, \ + target_stat_scsi_tgt_port_store_attr_##_name); + +#define DEV_STAT_SCSI_TGT_PORT_ATTR_RO(_name) \ +static struct target_stat_scsi_tgt_port_attribute \ + target_stat_scsi_tgt_port_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_tgt_port_show_attr_##_name); + +static ssize_t target_stat_scsi_tgt_port_show_attr_inst( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_device *dev = lun->lun_se_dev; + struct se_port *sep; + struct se_hba *hba; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + hba = dev->se_hba; + ret = snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(inst); + +static ssize_t target_stat_scsi_tgt_port_show_attr_dev( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_device *dev = lun->lun_se_dev; + struct se_port *sep; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%u\n", dev->dev_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(dev); + +static ssize_t target_stat_scsi_tgt_port_show_attr_indx( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%u\n", sep->sep_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(indx); + +static ssize_t target_stat_scsi_tgt_port_show_attr_name( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + ret = snprintf(page, PAGE_SIZE, "%sPort#%u\n", + TPG_TFO(tpg)->get_fabric_name(), sep->sep_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(name); + +static ssize_t target_stat_scsi_tgt_port_show_attr_port_index( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + ret = snprintf(page, PAGE_SIZE, "%s%s%d\n", + TPG_TFO(tpg)->tpg_get_wwn(tpg), "+t+", + TPG_TFO(tpg)->tpg_get_tag(tpg)); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(port_index); + +static ssize_t target_stat_scsi_tgt_port_show_attr_in_cmds( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + ret = snprintf(page, PAGE_SIZE, "%llu\n", sep->sep_stats.cmd_pdus); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(in_cmds); + +static ssize_t target_stat_scsi_tgt_port_show_attr_write_mbytes( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + ret = snprintf(page, PAGE_SIZE, "%u\n", + (u32)(sep->sep_stats.rx_data_octets >> 20)); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(write_mbytes); + +static ssize_t target_stat_scsi_tgt_port_show_attr_read_mbytes( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + ret = snprintf(page, PAGE_SIZE, "%u\n", + (u32)(sep->sep_stats.tx_data_octets >> 20)); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(read_mbytes); + +static ssize_t target_stat_scsi_tgt_port_show_attr_hs_in_cmds( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + + /* FIXME: scsiTgtPortHsInCommands */ + ret = snprintf(page, PAGE_SIZE, "%u\n", 0); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TGT_PORT_ATTR_RO(hs_in_cmds); + +CONFIGFS_EATTR_OPS(target_stat_scsi_tgt_port, se_port_stat_grps, + scsi_tgt_port_group); + +static struct configfs_attribute *target_stat_scsi_tgt_port_attrs[] = { + &target_stat_scsi_tgt_port_inst.attr, + &target_stat_scsi_tgt_port_dev.attr, + &target_stat_scsi_tgt_port_indx.attr, + &target_stat_scsi_tgt_port_name.attr, + &target_stat_scsi_tgt_port_port_index.attr, + &target_stat_scsi_tgt_port_in_cmds.attr, + &target_stat_scsi_tgt_port_write_mbytes.attr, + &target_stat_scsi_tgt_port_read_mbytes.attr, + &target_stat_scsi_tgt_port_hs_in_cmds.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_tgt_port_attrib_ops = { + .show_attribute = target_stat_scsi_tgt_port_attr_show, + .store_attribute = target_stat_scsi_tgt_port_attr_store, +}; + +static struct config_item_type target_stat_scsi_tgt_port_cit = { + .ct_item_ops = &target_stat_scsi_tgt_port_attrib_ops, + .ct_attrs = target_stat_scsi_tgt_port_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * SCSI Transport Table +o */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_transport, se_port_stat_grps); +#define DEV_STAT_SCSI_TRANSPORT_ATTR(_name, _mode) \ +static struct target_stat_scsi_transport_attribute \ + target_stat_scsi_transport_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_transport_show_attr_##_name, \ + target_stat_scsi_transport_store_attr_##_name); + +#define DEV_STAT_SCSI_TRANSPORT_ATTR_RO(_name) \ +static struct target_stat_scsi_transport_attribute \ + target_stat_scsi_transport_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_transport_show_attr_##_name); + +static ssize_t target_stat_scsi_transport_show_attr_inst( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_device *dev = lun->lun_se_dev; + struct se_port *sep; + struct se_hba *hba; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + + hba = dev->se_hba; + ret = snprintf(page, PAGE_SIZE, "%u\n", hba->hba_index); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TRANSPORT_ATTR_RO(inst); + +static ssize_t target_stat_scsi_transport_show_attr_device( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + /* scsiTransportType */ + ret = snprintf(page, PAGE_SIZE, "scsiTransport%s\n", + TPG_TFO(tpg)->get_fabric_name()); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TRANSPORT_ATTR_RO(device); + +static ssize_t target_stat_scsi_transport_show_attr_indx( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_port *sep; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + ret = snprintf(page, PAGE_SIZE, "%u\n", + TPG_TFO(tpg)->tpg_get_inst_index(tpg)); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TRANSPORT_ATTR_RO(indx); + +static ssize_t target_stat_scsi_transport_show_attr_dev_name( + struct se_port_stat_grps *pgrps, char *page) +{ + struct se_lun *lun = container_of(pgrps, struct se_lun, port_stat_grps); + struct se_device *dev = lun->lun_se_dev; + struct se_port *sep; + struct se_portal_group *tpg; + struct t10_wwn *wwn; + ssize_t ret; + + spin_lock(&lun->lun_sep_lock); + sep = lun->lun_sep; + if (!sep) { + spin_unlock(&lun->lun_sep_lock); + return -ENODEV; + } + tpg = sep->sep_tpg; + wwn = DEV_T10_WWN(dev); + /* scsiTransportDevName */ + ret = snprintf(page, PAGE_SIZE, "%s+%s\n", + TPG_TFO(tpg)->tpg_get_wwn(tpg), + (strlen(wwn->unit_serial)) ? wwn->unit_serial : + wwn->vendor); + spin_unlock(&lun->lun_sep_lock); + return ret; +} +DEV_STAT_SCSI_TRANSPORT_ATTR_RO(dev_name); + +CONFIGFS_EATTR_OPS(target_stat_scsi_transport, se_port_stat_grps, + scsi_transport_group); + +static struct configfs_attribute *target_stat_scsi_transport_attrs[] = { + &target_stat_scsi_transport_inst.attr, + &target_stat_scsi_transport_device.attr, + &target_stat_scsi_transport_indx.attr, + &target_stat_scsi_transport_dev_name.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_transport_attrib_ops = { + .show_attribute = target_stat_scsi_transport_attr_show, + .store_attribute = target_stat_scsi_transport_attr_store, +}; + +static struct config_item_type target_stat_scsi_transport_cit = { + .ct_item_ops = &target_stat_scsi_transport_attrib_ops, + .ct_attrs = target_stat_scsi_transport_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * Called from target_core_fabric_configfs.c:target_fabric_make_lun() to setup + * the target port statistics groups + configfs CITs located in target_core_stat.c + */ +void target_stat_setup_port_default_groups(struct se_lun *lun) +{ + struct config_group *port_stat_grp = &PORT_STAT_GRP(lun)->stat_group; + + config_group_init_type_name(&PORT_STAT_GRP(lun)->scsi_port_group, + "scsi_port", &target_stat_scsi_port_cit); + config_group_init_type_name(&PORT_STAT_GRP(lun)->scsi_tgt_port_group, + "scsi_tgt_port", &target_stat_scsi_tgt_port_cit); + config_group_init_type_name(&PORT_STAT_GRP(lun)->scsi_transport_group, + "scsi_transport", &target_stat_scsi_transport_cit); + + port_stat_grp->default_groups[0] = &PORT_STAT_GRP(lun)->scsi_port_group; + port_stat_grp->default_groups[1] = &PORT_STAT_GRP(lun)->scsi_tgt_port_group; + port_stat_grp->default_groups[2] = &PORT_STAT_GRP(lun)->scsi_transport_group; + port_stat_grp->default_groups[3] = NULL; +} + +/* + * SCSI Authorized Initiator Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_auth_intr, se_ml_stat_grps); +#define DEV_STAT_SCSI_AUTH_INTR_ATTR(_name, _mode) \ +static struct target_stat_scsi_auth_intr_attribute \ + target_stat_scsi_auth_intr_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_auth_intr_show_attr_##_name, \ + target_stat_scsi_auth_intr_store_attr_##_name); + +#define DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(_name) \ +static struct target_stat_scsi_auth_intr_attribute \ + target_stat_scsi_auth_intr_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_auth_intr_show_attr_##_name); + +static ssize_t target_stat_scsi_auth_intr_show_attr_inst( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + /* scsiInstIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", + TPG_TFO(tpg)->tpg_get_inst_index(tpg)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(inst); + +static ssize_t target_stat_scsi_auth_intr_show_attr_dev( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_lun *lun; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + lun = deve->se_lun; + /* scsiDeviceIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", lun->lun_se_dev->dev_index); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(dev); + +static ssize_t target_stat_scsi_auth_intr_show_attr_port( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + /* scsiAuthIntrTgtPortIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", TPG_TFO(tpg)->tpg_get_tag(tpg)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(port); + +static ssize_t target_stat_scsi_auth_intr_show_attr_indx( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", nacl->acl_index); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(indx); + +static ssize_t target_stat_scsi_auth_intr_show_attr_dev_or_port( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrDevOrPort */ + ret = snprintf(page, PAGE_SIZE, "%u\n", 1); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(dev_or_port); + +static ssize_t target_stat_scsi_auth_intr_show_attr_intr_name( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrName */ + ret = snprintf(page, PAGE_SIZE, "%s\n", nacl->initiatorname); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(intr_name); + +static ssize_t target_stat_scsi_auth_intr_show_attr_map_indx( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* FIXME: scsiAuthIntrLunMapIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", 0); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(map_indx); + +static ssize_t target_stat_scsi_auth_intr_show_attr_att_count( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrAttachedTimes */ + ret = snprintf(page, PAGE_SIZE, "%u\n", deve->attach_count); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(att_count); + +static ssize_t target_stat_scsi_auth_intr_show_attr_num_cmds( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrOutCommands */ + ret = snprintf(page, PAGE_SIZE, "%u\n", deve->total_cmds); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(num_cmds); + +static ssize_t target_stat_scsi_auth_intr_show_attr_read_mbytes( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrReadMegaBytes */ + ret = snprintf(page, PAGE_SIZE, "%u\n", (u32)(deve->read_bytes >> 20)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(read_mbytes); + +static ssize_t target_stat_scsi_auth_intr_show_attr_write_mbytes( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrWrittenMegaBytes */ + ret = snprintf(page, PAGE_SIZE, "%u\n", (u32)(deve->write_bytes >> 20)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(write_mbytes); + +static ssize_t target_stat_scsi_auth_intr_show_attr_hs_num_cmds( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* FIXME: scsiAuthIntrHSOutCommands */ + ret = snprintf(page, PAGE_SIZE, "%u\n", 0); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(hs_num_cmds); + +static ssize_t target_stat_scsi_auth_intr_show_attr_creation_time( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAuthIntrLastCreation */ + ret = snprintf(page, PAGE_SIZE, "%u\n", (u32)(((u32)deve->creation_time - + INITIAL_JIFFIES) * 100 / HZ)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(creation_time); + +static ssize_t target_stat_scsi_auth_intr_show_attr_row_status( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* FIXME: scsiAuthIntrRowStatus */ + ret = snprintf(page, PAGE_SIZE, "Ready\n"); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_AUTH_INTR_ATTR_RO(row_status); + +CONFIGFS_EATTR_OPS(target_stat_scsi_auth_intr, se_ml_stat_grps, + scsi_auth_intr_group); + +static struct configfs_attribute *target_stat_scsi_auth_intr_attrs[] = { + &target_stat_scsi_auth_intr_inst.attr, + &target_stat_scsi_auth_intr_dev.attr, + &target_stat_scsi_auth_intr_port.attr, + &target_stat_scsi_auth_intr_indx.attr, + &target_stat_scsi_auth_intr_dev_or_port.attr, + &target_stat_scsi_auth_intr_intr_name.attr, + &target_stat_scsi_auth_intr_map_indx.attr, + &target_stat_scsi_auth_intr_att_count.attr, + &target_stat_scsi_auth_intr_num_cmds.attr, + &target_stat_scsi_auth_intr_read_mbytes.attr, + &target_stat_scsi_auth_intr_write_mbytes.attr, + &target_stat_scsi_auth_intr_hs_num_cmds.attr, + &target_stat_scsi_auth_intr_creation_time.attr, + &target_stat_scsi_auth_intr_row_status.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_auth_intr_attrib_ops = { + .show_attribute = target_stat_scsi_auth_intr_attr_show, + .store_attribute = target_stat_scsi_auth_intr_attr_store, +}; + +static struct config_item_type target_stat_scsi_auth_intr_cit = { + .ct_item_ops = &target_stat_scsi_auth_intr_attrib_ops, + .ct_attrs = target_stat_scsi_auth_intr_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * SCSI Attached Initiator Port Table + */ + +CONFIGFS_EATTR_STRUCT(target_stat_scsi_att_intr_port, se_ml_stat_grps); +#define DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR(_name, _mode) \ +static struct target_stat_scsi_att_intr_port_attribute \ + target_stat_scsi_att_intr_port_##_name = \ + __CONFIGFS_EATTR(_name, _mode, \ + target_stat_scsi_att_intr_port_show_attr_##_name, \ + target_stat_scsi_att_intr_port_store_attr_##_name); + +#define DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(_name) \ +static struct target_stat_scsi_att_intr_port_attribute \ + target_stat_scsi_att_intr_port_##_name = \ + __CONFIGFS_EATTR_RO(_name, \ + target_stat_scsi_att_intr_port_show_attr_##_name); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_inst( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + /* scsiInstIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", + TPG_TFO(tpg)->tpg_get_inst_index(tpg)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(inst); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_dev( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_lun *lun; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + lun = deve->se_lun; + /* scsiDeviceIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", lun->lun_se_dev->dev_index); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(dev); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_port( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + tpg = nacl->se_tpg; + /* scsiPortIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", TPG_TFO(tpg)->tpg_get_tag(tpg)); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(port); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_indx( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_session *se_sess; + struct se_portal_group *tpg; + ssize_t ret; + + spin_lock_irq(&nacl->nacl_sess_lock); + se_sess = nacl->nacl_sess; + if (!se_sess) { + spin_unlock_irq(&nacl->nacl_sess_lock); + return -ENODEV; + } + + tpg = nacl->se_tpg; + /* scsiAttIntrPortIndex */ + ret = snprintf(page, PAGE_SIZE, "%u\n", + TPG_TFO(tpg)->sess_get_index(se_sess)); + spin_unlock_irq(&nacl->nacl_sess_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(indx); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_port_auth_indx( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_dev_entry *deve; + ssize_t ret; + + spin_lock_irq(&nacl->device_list_lock); + deve = &nacl->device_list[lacl->mapped_lun]; + if (!deve->se_lun || !deve->se_lun_acl) { + spin_unlock_irq(&nacl->device_list_lock); + return -ENODEV; + } + /* scsiAttIntrPortAuthIntrIdx */ + ret = snprintf(page, PAGE_SIZE, "%u\n", nacl->acl_index); + spin_unlock_irq(&nacl->device_list_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(port_auth_indx); + +static ssize_t target_stat_scsi_att_intr_port_show_attr_port_ident( + struct se_ml_stat_grps *lgrps, char *page) +{ + struct se_lun_acl *lacl = container_of(lgrps, + struct se_lun_acl, ml_stat_grps); + struct se_node_acl *nacl = lacl->se_lun_nacl; + struct se_session *se_sess; + struct se_portal_group *tpg; + ssize_t ret; + unsigned char buf[64]; + + spin_lock_irq(&nacl->nacl_sess_lock); + se_sess = nacl->nacl_sess; + if (!se_sess) { + spin_unlock_irq(&nacl->nacl_sess_lock); + return -ENODEV; + } + + tpg = nacl->se_tpg; + /* scsiAttIntrPortName+scsiAttIntrPortIdentifier */ + memset(buf, 0, 64); + if (TPG_TFO(tpg)->sess_get_initiator_sid != NULL) + TPG_TFO(tpg)->sess_get_initiator_sid(se_sess, + (unsigned char *)&buf[0], 64); + + ret = snprintf(page, PAGE_SIZE, "%s+i+%s\n", nacl->initiatorname, buf); + spin_unlock_irq(&nacl->nacl_sess_lock); + return ret; +} +DEV_STAT_SCSI_ATTR_INTR_PORT_ATTR_RO(port_ident); + +CONFIGFS_EATTR_OPS(target_stat_scsi_att_intr_port, se_ml_stat_grps, + scsi_att_intr_port_group); + +static struct configfs_attribute *target_stat_scsi_ath_intr_port_attrs[] = { + &target_stat_scsi_att_intr_port_inst.attr, + &target_stat_scsi_att_intr_port_dev.attr, + &target_stat_scsi_att_intr_port_port.attr, + &target_stat_scsi_att_intr_port_indx.attr, + &target_stat_scsi_att_intr_port_port_auth_indx.attr, + &target_stat_scsi_att_intr_port_port_ident.attr, + NULL, +}; + +static struct configfs_item_operations target_stat_scsi_att_intr_port_attrib_ops = { + .show_attribute = target_stat_scsi_att_intr_port_attr_show, + .store_attribute = target_stat_scsi_att_intr_port_attr_store, +}; + +static struct config_item_type target_stat_scsi_att_intr_port_cit = { + .ct_item_ops = &target_stat_scsi_att_intr_port_attrib_ops, + .ct_attrs = target_stat_scsi_ath_intr_port_attrs, + .ct_owner = THIS_MODULE, +}; + +/* + * Called from target_core_fabric_configfs.c:target_fabric_make_mappedlun() to setup + * the target MappedLUN statistics groups + configfs CITs located in target_core_stat.c + */ +void target_stat_setup_mappedlun_default_groups(struct se_lun_acl *lacl) +{ + struct config_group *ml_stat_grp = &ML_STAT_GRPS(lacl)->stat_group; + + config_group_init_type_name(&ML_STAT_GRPS(lacl)->scsi_auth_intr_group, + "scsi_auth_intr", &target_stat_scsi_auth_intr_cit); + config_group_init_type_name(&ML_STAT_GRPS(lacl)->scsi_att_intr_port_group, + "scsi_att_intr_port", &target_stat_scsi_att_intr_port_cit); + + ml_stat_grp->default_groups[0] = &ML_STAT_GRPS(lacl)->scsi_auth_intr_group; + ml_stat_grp->default_groups[1] = &ML_STAT_GRPS(lacl)->scsi_att_intr_port_group; + ml_stat_grp->default_groups[2] = NULL; +} diff --git a/drivers/target/target_core_stat.h b/drivers/target/target_core_stat.h new file mode 100644 index 000000000000..86c252f9ea47 --- /dev/null +++ b/drivers/target/target_core_stat.h @@ -0,0 +1,8 @@ +#ifndef TARGET_CORE_STAT_H +#define TARGET_CORE_STAT_H + +extern void target_stat_setup_dev_default_groups(struct se_subsystem_dev *); +extern void target_stat_setup_port_default_groups(struct se_lun *); +extern void target_stat_setup_mappedlun_default_groups(struct se_lun_acl *); + +#endif /*** TARGET_CORE_STAT_H ***/ diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 79f2e0a30dc8..c15ed5026fb5 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -601,7 +601,8 @@ struct se_node_acl { struct config_group acl_attrib_group; struct config_group acl_auth_group; struct config_group acl_param_group; - struct config_group *acl_default_groups[4]; + struct config_group acl_fabric_stat_group; + struct config_group *acl_default_groups[5]; struct list_head acl_list; struct list_head acl_sess_list; } ____cacheline_aligned; @@ -622,6 +623,12 @@ struct se_device; struct se_transform_info; struct scatterlist; +struct se_ml_stat_grps { + struct config_group stat_group; + struct config_group scsi_auth_intr_group; + struct config_group scsi_att_intr_port_group; +}; + struct se_lun_acl { char initiatorname[TRANSPORT_IQN_LEN]; u32 mapped_lun; @@ -629,8 +636,11 @@ struct se_lun_acl { struct se_lun *se_lun; struct list_head lacl_list; struct config_group se_lun_group; + struct se_ml_stat_grps ml_stat_grps; } ____cacheline_aligned; +#define ML_STAT_GRPS(lacl) (&(lacl)->ml_stat_grps) + struct se_dev_entry { bool def_pr_registered; /* See transport_lunflags_table */ @@ -693,6 +703,13 @@ struct se_dev_attrib { struct config_group da_group; } ____cacheline_aligned; +struct se_dev_stat_grps { + struct config_group stat_group; + struct config_group scsi_dev_group; + struct config_group scsi_tgt_dev_group; + struct config_group scsi_lu_group; +}; + struct se_subsystem_dev { /* Used for struct se_subsystem_dev-->se_dev_alias, must be less than PAGE_SIZE */ #define SE_DEV_ALIAS_LEN 512 @@ -716,11 +733,14 @@ struct se_subsystem_dev { struct config_group se_dev_group; /* For T10 Reservations */ struct config_group se_dev_pr_group; + /* For target_core_stat.c groups */ + struct se_dev_stat_grps dev_stat_grps; } ____cacheline_aligned; #define T10_ALUA(su_dev) (&(su_dev)->t10_alua) #define T10_RES(su_dev) (&(su_dev)->t10_reservation) #define T10_PR_OPS(su_dev) (&(su_dev)->t10_reservation.pr_ops) +#define DEV_STAT_GRP(dev) (&(dev)->dev_stat_grps) struct se_device { /* Set to 1 if thread is NOT sleeping on thread_sem */ @@ -834,6 +854,13 @@ struct se_hba { #define SE_HBA(dev) ((dev)->se_hba) +struct se_port_stat_grps { + struct config_group stat_group; + struct config_group scsi_port_group; + struct config_group scsi_tgt_port_group; + struct config_group scsi_transport_group; +}; + struct se_lun { /* See transport_lun_status_table */ enum transport_lun_status_table lun_status; @@ -848,11 +875,13 @@ struct se_lun { struct list_head lun_cmd_list; struct list_head lun_acl_list; struct se_device *lun_se_dev; + struct se_port *lun_sep; struct config_group lun_group; - struct se_port *lun_sep; + struct se_port_stat_grps port_stat_grps; } ____cacheline_aligned; #define SE_LUN(cmd) ((cmd)->se_lun) +#define PORT_STAT_GRP(lun) (&(lun)->port_stat_grps) struct scsi_port_stats { u64 cmd_pdus; @@ -924,6 +953,8 @@ struct se_portal_group { struct se_wwn { struct target_fabric_configfs *wwn_tf; struct config_group wwn_group; + struct config_group *wwn_default_groups[2]; + struct config_group fabric_stat_group; } ____cacheline_aligned; struct se_global { diff --git a/include/target/target_core_configfs.h b/include/target/target_core_configfs.h index 40e6e740527c..612509592ffd 100644 --- a/include/target/target_core_configfs.h +++ b/include/target/target_core_configfs.h @@ -14,10 +14,12 @@ extern void target_fabric_configfs_deregister(struct target_fabric_configfs *); struct target_fabric_configfs_template { struct config_item_type tfc_discovery_cit; struct config_item_type tfc_wwn_cit; + struct config_item_type tfc_wwn_fabric_stats_cit; struct config_item_type tfc_tpg_cit; struct config_item_type tfc_tpg_base_cit; struct config_item_type tfc_tpg_lun_cit; struct config_item_type tfc_tpg_port_cit; + struct config_item_type tfc_tpg_port_stat_cit; struct config_item_type tfc_tpg_np_cit; struct config_item_type tfc_tpg_np_base_cit; struct config_item_type tfc_tpg_attrib_cit; @@ -27,7 +29,9 @@ struct target_fabric_configfs_template { struct config_item_type tfc_tpg_nacl_attrib_cit; struct config_item_type tfc_tpg_nacl_auth_cit; struct config_item_type tfc_tpg_nacl_param_cit; + struct config_item_type tfc_tpg_nacl_stat_cit; struct config_item_type tfc_tpg_mappedlun_cit; + struct config_item_type tfc_tpg_mappedlun_stat_cit; }; struct target_fabric_configfs { -- cgit v1.2.3 From 70c7c88a1a65ca683eb7f3fe3ce79c72f29d845e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 17 Mar 2011 16:22:17 -0500 Subject: [SCSI] libiscsi_tcp: use kmap in xmit path The xmit path can sleep with a page kmapped in the network xmit code while it waits for space to open up, so we have to use kmap instead of kmap atomic in that path. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/libiscsi_tcp.c | 15 +++++++++++++-- include/scsi/libiscsi_tcp.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 8eeb39ffa37f..e98ae33f1295 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -132,14 +132,25 @@ static void iscsi_tcp_segment_map(struct iscsi_segment *segment, int recv) if (page_count(sg_page(sg)) >= 1 && !recv) return; - segment->sg_mapped = kmap_atomic(sg_page(sg), KM_SOFTIRQ0); + if (recv) { + segment->atomic_mapped = true; + segment->sg_mapped = kmap_atomic(sg_page(sg), KM_SOFTIRQ0); + } else { + segment->atomic_mapped = false; + /* the xmit path can sleep with the page mapped so use kmap */ + segment->sg_mapped = kmap(sg_page(sg)); + } + segment->data = segment->sg_mapped + sg->offset + segment->sg_offset; } void iscsi_tcp_segment_unmap(struct iscsi_segment *segment) { if (segment->sg_mapped) { - kunmap_atomic(segment->sg_mapped, KM_SOFTIRQ0); + if (segment->atomic_mapped) + kunmap_atomic(segment->sg_mapped, KM_SOFTIRQ0); + else + kunmap(sg_page(segment->sg)); segment->sg_mapped = NULL; segment->data = NULL; } diff --git a/include/scsi/libiscsi_tcp.h b/include/scsi/libiscsi_tcp.h index 741ae7ed4394..e6b9fd2eea34 100644 --- a/include/scsi/libiscsi_tcp.h +++ b/include/scsi/libiscsi_tcp.h @@ -47,6 +47,7 @@ struct iscsi_segment { struct scatterlist *sg; void *sg_mapped; unsigned int sg_offset; + bool atomic_mapped; iscsi_segment_done_fn_t *done; }; -- cgit v1.2.3 From 480c2006ebb44ae03165695db7b3e38c04e0d102 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 23 Mar 2011 14:48:29 -0400 Subject: NFS: Create nfs_open_dir_context nfs_opendir() created a context that held much more information than we need for a readdir. This patch introduces a slimmed-down nfs_open_dir_context that contains only the cookie and the cred used for RPC operations. The new context will eventually be used to help detect readdir loops. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 54 +++++++++++++++++++++++++++++++++++++++++++------- fs/nfs/inode.c | 1 - include/linux/nfs_fs.h | 3 +++ 3 files changed, 50 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c5c71cb62fbd..cda73814f666 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -44,6 +44,7 @@ /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); @@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = { .read = generic_read_dir, .readdir = nfs_readdir, .open = nfs_opendir, - .release = nfs_release, + .release = nfs_closedir, .fsync = nfs_fsync_dir, }; @@ -133,13 +134,33 @@ const struct inode_operations nfs4_dir_inode_operations = { #endif /* CONFIG_NFS_V4 */ +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->dir_cookie = 0; + ctx->cred = get_rpccred(cred); + } else + ctx = ERR_PTR(-ENOMEM); + return ctx; +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { - int res; + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; dfprintk(FILE, "NFS: open dir(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -147,8 +168,15 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - /* Call generic open code in order to cache credentials */ - res = nfs_open(inode, filp); + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { /* This is a mountpoint, so d_revalidate will never * have been called, so we need to refresh the @@ -156,9 +184,18 @@ nfs_opendir(struct inode *inode, struct file *filp) */ __nfs_revalidate_inode(NFS_SERVER(inode), inode); } +out: + put_rpccred(cred); return res; } +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + struct nfs_cache_array_entry { u64 cookie; u64 ino; @@ -355,7 +392,8 @@ static int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct file *file, struct inode *inode) { - struct rpc_cred *cred = nfs_file_cred(file); + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; unsigned long timestamp, gencount; int error; @@ -786,6 +824,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", @@ -802,7 +841,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) memset(desc, 0, sizeof(*desc)); desc->file = filp; - desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; + desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); @@ -854,6 +893,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, @@ -873,7 +913,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) } if (offset != filp->f_pos) { filp->f_pos = offset; - nfs_file_open_context(filp)->dir_cookie = 0; + dir_ctx->dir_cookie = 0; } out: mutex_unlock(&inode->i_mutex); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f8e61816d75..477a2e512b39 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -639,7 +639,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; - ctx->dir_cookie = 0; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index cb2add401f25..4b851a02f7f3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -95,7 +95,10 @@ struct nfs_open_context { int error; struct list_head list; +}; +struct nfs_open_dir_context { + struct rpc_cred *cred; __u64 dir_cookie; }; -- cgit v1.2.3 From 8ef2ce3e16d9bec6cf015207c1c82a5b864046ac Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 23 Mar 2011 15:04:31 -0400 Subject: NFS: Detect loops in a readdir due to bad cookies Some filesystems (such as ext4) can return the same cookie value for multiple files. If we try to start a readdir with one of these cookies, the server will return the first file found with a cookie of the same value. This can cause the client to enter an infinite loop. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 28 +++++++++++++++++++++++++++- include/linux/nfs_fs.h | 2 ++ 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index cda73814f666..db87a7d1109b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,7 +139,9 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred * struct nfs_open_dir_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { + ctx->duped = 0; ctx->dir_cookie = 0; + ctx->dup_cookie = 0; ctx->cred = get_rpccred(cred); } else ctx = ERR_PTR(-ENOMEM); @@ -321,6 +323,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri { loff_t diff = desc->file->f_pos - desc->current_index; unsigned int index; + struct nfs_open_dir_context *ctx = desc->file->private_data; if (diff < 0) goto out_eof; @@ -333,6 +336,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; + ctx->duped = 0; return 0; out_eof: desc->eof = 1; @@ -343,11 +347,18 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { int i; + loff_t new_pos; int status = -EAGAIN; + struct nfs_open_dir_context *ctx = desc->file->private_data; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { - desc->file->f_pos = desc->current_index + i; + new_pos = desc->current_index + i; + if (new_pos < desc->file->f_pos) { + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = 1; + } + desc->file->f_pos = new_pos; desc->cache_entry_index = i; return 0; } @@ -732,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop. " + "Please contact your server vendor. " + "Offending cookie: %llu\n", + file->f_dentry->d_parent->d_name.name, + file->f_dentry->d_name.name, + *desc->dir_cookie); + } + res = -ELOOP; + goto out; + } array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -914,6 +939,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; } out: mutex_unlock(&inode->i_mutex); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4b851a02f7f3..4179c368844b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -100,6 +100,8 @@ struct nfs_open_context { struct nfs_open_dir_context { struct rpc_cred *cred; __u64 dir_cookie; + __u64 dup_cookie; + int duped; }; /* -- cgit v1.2.3 From 3b9038912828384e38d82409c281124631c8533b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Mar 2011 00:24:11 +0100 Subject: genirq; Remove the last leftovers of the old sparse irq code All users converted. Get rid of it. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 7 ------- kernel/irq/irqdesc.c | 14 -------------- 2 files changed, 21 deletions(-) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 00218371518b..0b30662bc36c 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -100,13 +100,6 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif -/* Will be removed once the last users in power and sh are gone */ -extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); -static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - return desc; -} - #ifdef CONFIG_GENERIC_HARDIRQS static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc799407f..6fb014f172f7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); -- cgit v1.2.3 From d3e17deb1790ee2123e9d11420be6411d1768b47 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Mar 2011 17:08:15 +0100 Subject: genirq: Provide a lockdep helper Some irq chips need to call genirq functions for nested chips from their callbacks. That upsets lockdep. So they need to set a different lock class for those nested chips. Provide a helper function to avoid open access to irq_desc. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 0b30662bc36c..1595f9176b43 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -191,6 +191,15 @@ static inline void __set_irq_handler_unlocked(int irq, desc->handle_irq = handler; } +static inline void +irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + lockdep_set_class(&desc->lock, class); +} + #ifdef CONFIG_IRQ_PREFLOW_FASTEOI static inline void __irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler) -- cgit v1.2.3 From a2e8461a2ce5e8140b7374eb68af0d09e36e07ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Mar 2011 13:10:31 +0100 Subject: genirq: Provide locked setter for chip, handler, name Some irq_set_type() callbacks need to change the chip and the handler when the trigger mode changes. We have already a (misnomed) setter function for the handler which can be called from irq_set_type(). Provide one which allows to set chip and name as well. Put the misnomed function under the COMPAT switch and provide a replacement. Signed-off-by: Thomas Gleixner --- include/linux/irqdesc.h | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 1595f9176b43..15e6c3905f41 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -171,25 +171,44 @@ static inline int irq_has_action(unsigned int irq) return desc->action != NULL; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline int irq_balancing_disabled(unsigned int irq) +/* caller has locked the irq_desc and both params are valid */ +static inline void __irq_set_handler_locked(unsigned int irq, + irq_flow_handler_t handler) { struct irq_desc *desc; desc = irq_to_desc(irq); - return desc->status & IRQ_NO_BALANCING_MASK; + desc->handle_irq = handler; } -#endif /* caller has locked the irq_desc and both params are valid */ +static inline void +__irq_set_chip_handler_name_locked(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handler, const char *name) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + irq_desc_get_irq_data(desc)->chip = chip; + desc->handle_irq = handler; + desc->name = name; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT static inline void __set_irq_handler_unlocked(int irq, irq_flow_handler_t handler) +{ + __irq_set_handler_locked(irq, handler); +} + +static inline int irq_balancing_disabled(unsigned int irq) { struct irq_desc *desc; desc = irq_to_desc(irq); - desc->handle_irq = handler; + return desc->status & IRQ_NO_BALANCING_MASK; } +#endif static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *class) -- cgit v1.2.3 From 0b7ca5a928e2271bbc225e9e1ac1f22e9fbee54f Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Tue, 22 Mar 2011 22:37:47 +0000 Subject: mlx4: Changing interrupt scheme Adding a pool of MSI-X vectors and EQs that can be used explicitly by mlx4_core customers (mlx4_ib, mlx4_en). The consumers will assign their own names to the interrupt vectors. Those vectors are not opened at mlx4 device initialization, opened by demand. Changed the max number of possible EQs according to the new scheme, no longer relies on on number of cores. The new functionality is exposed through mlx4_assign_eq() and mlx4_release_eq(). Customers that do not use the new API will get completion vectors as before. Signed-off-by: Markuze Alex Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/cq.c | 2 +- drivers/net/mlx4/eq.c | 107 +++++++++++++++++++++++++++++++++++++++++--- drivers/net/mlx4/main.c | 20 +++++++-- drivers/net/mlx4/mlx4.h | 6 +++ drivers/net/mlx4/profile.c | 4 +- include/linux/mlx4/device.h | 8 ++++ 6 files changed, 134 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index 7cd34e9c7c7e..bd8ef9f2fa71 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -198,7 +198,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, u64 mtt_addr; int err; - if (vector >= dev->caps.num_comp_vectors) + if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool) return -EINVAL; cq->vector = vector; diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 552d0fce6f67..506cfd0372ec 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -42,7 +42,7 @@ #include "fw.h" enum { - MLX4_IRQNAME_SIZE = 64 + MLX4_IRQNAME_SIZE = 32 }; enum { @@ -317,8 +317,8 @@ static int mlx4_num_eq_uar(struct mlx4_dev *dev) * we need to map, take the difference of highest index and * the lowest index we'll use and add 1. */ - return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - - dev->caps.reserved_eqs / 4 + 1; + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs + + dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1; } static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) @@ -496,16 +496,32 @@ static void mlx4_free_eq(struct mlx4_dev *dev, static void mlx4_free_irqs(struct mlx4_dev *dev) { struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; - int i; + struct mlx4_priv *priv = mlx4_priv(dev); + int i, vec; if (eq_table->have_irq) free_irq(dev->pdev->irq, dev); + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { free_irq(eq_table->eq[i].irq, eq_table->eq + i); eq_table->eq[i].have_irq = 0; } + for (i = 0; i < dev->caps.comp_pool; i++) { + /* + * Freeing the assigned irq's + * all bits should be 0, but we need to validate + */ + if (priv->msix_ctl.pool_bm & 1ULL << i) { + /* NO need protecting*/ + vec = dev->caps.num_comp_vectors + 1 + i; + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + } + } + + kfree(eq_table->irq_names); } @@ -578,7 +594,8 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) (priv->eq_table.inta_pin < 32 ? 4 : 0); priv->eq_table.irq_names = - kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1), + kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 + + dev->caps.comp_pool), GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; @@ -601,6 +618,22 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) if (err) goto err_out_comp; + /*if additional completion vectors poolsize is 0 this loop will not run*/ + for (i = dev->caps.num_comp_vectors + 1; + i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) { + + err = mlx4_create_eq(dev, dev->caps.num_cqs - + dev->caps.reserved_cqs + + MLX4_NUM_SPARE_EQE, + (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, + &priv->eq_table.eq[i]); + if (err) { + --i; + goto err_out_unmap; + } + } + + if (dev->flags & MLX4_FLAG_MSI_X) { const char *eq_name; @@ -686,7 +719,7 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) mlx4_free_irqs(dev); - for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) mlx4_free_eq(dev, &priv->eq_table.eq[i]); mlx4_unmap_clr_int(dev); @@ -743,3 +776,65 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) return err; } EXPORT_SYMBOL(mlx4_test_interrupts); + +int mlx4_assign_eq(struct mlx4_dev *dev, char* name, int * vector) +{ + + struct mlx4_priv *priv = mlx4_priv(dev); + int vec = 0, err = 0, i; + + spin_lock(&priv->msix_ctl.pool_lock); + for (i = 0; !vec && i < dev->caps.comp_pool; i++) { + if (~priv->msix_ctl.pool_bm & 1ULL << i) { + priv->msix_ctl.pool_bm |= 1ULL << i; + vec = dev->caps.num_comp_vectors + 1 + i; + snprintf(priv->eq_table.irq_names + + vec * MLX4_IRQNAME_SIZE, + MLX4_IRQNAME_SIZE, "%s", name); + err = request_irq(priv->eq_table.eq[vec].irq, + mlx4_msi_x_interrupt, 0, + &priv->eq_table.irq_names[vec<<5], + priv->eq_table.eq + vec); + if (err) { + /*zero out bit by fliping it*/ + priv->msix_ctl.pool_bm ^= 1 << i; + vec = 0; + continue; + /*we dont want to break here*/ + } + eq_set_ci(&priv->eq_table.eq[vec], 1); + } + } + spin_unlock(&priv->msix_ctl.pool_lock); + + if (vec) { + *vector = vec; + } else { + *vector = 0; + err = (i == dev->caps.comp_pool) ? -ENOSPC : err; + } + return err; +} +EXPORT_SYMBOL(mlx4_assign_eq); + +void mlx4_release_eq(struct mlx4_dev *dev, int vec) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /*bm index*/ + int i = vec - dev->caps.num_comp_vectors - 1; + + if (likely(i >= 0)) { + /*sanity check , making sure were not trying to free irq's + Belonging to a legacy EQ*/ + spin_lock(&priv->msix_ctl.pool_lock); + if (priv->msix_ctl.pool_bm & 1ULL << i) { + free_irq(priv->eq_table.eq[vec].irq, + &priv->eq_table.eq[vec]); + priv->msix_ctl.pool_bm &= ~(1ULL << i); + } + spin_unlock(&priv->msix_ctl.pool_lock); + } + +} +EXPORT_SYMBOL(mlx4_release_eq); + diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 2765a3ce9c24..517ca34f5b37 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -969,13 +969,15 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; - int nreq; + int nreq = min_t(int, dev->caps.num_ports * + min_t(int, num_online_cpus() + 1, MAX_MSIX_P_PORT) + + MSIX_LEGACY_SZ, MAX_MSIX); int err; int i; if (msi_x) { nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, - num_possible_cpus() + 1); + nreq); entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; @@ -998,7 +1000,15 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) goto no_msi; } - dev->caps.num_comp_vectors = nreq - 1; + if (nreq < + MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { + /*Working in legacy mode , all EQ's shared*/ + dev->caps.comp_pool = 0; + dev->caps.num_comp_vectors = nreq - 1; + } else { + dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; + dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; + } for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; @@ -1010,6 +1020,7 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) no_msi: dev->caps.num_comp_vectors = 1; + dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; @@ -1151,6 +1162,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_close; + priv->msix_ctl.pool_bm = 0; + spin_lock_init(&priv->msix_ctl.pool_lock); + mlx4_enable_msi_x(dev); err = mlx4_setup_hca(dev); diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 0da5bb7285b4..67ee8dacae79 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -282,6 +282,11 @@ struct mlx4_sense { struct delayed_work sense_poll; }; +struct mlx4_msix_ctl { + u64 pool_bm; + spinlock_t pool_lock; +}; + struct mlx4_priv { struct mlx4_dev dev; @@ -313,6 +318,7 @@ struct mlx4_priv { struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; struct mlx4_sense sense; struct mutex port_mutex; + struct mlx4_msix_ctl msix_ctl; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index e749f82865fe..b967647d0c76 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -107,9 +107,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, - dev_cap->reserved_eqs + - num_possible_cpus() + 1); + profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; profile[MLX4_RES_MTT].num = request->num_mtt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 049214642036..78380823d827 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -39,6 +39,11 @@ #include +#define MAX_MSIX_P_PORT 17 +#define MAX_MSIX 64 +#define MSIX_LEGACY_SZ 4 +#define MIN_MSIX_P_PORT 5 + enum { MLX4_FLAG_MSI_X = 1 << 0, MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, @@ -223,6 +228,7 @@ struct mlx4_caps { int num_eqs; int reserved_eqs; int num_comp_vectors; + int comp_pool; int num_mpts; int num_mtt_segs; int mtts_per_seg; @@ -526,5 +532,7 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupts(struct mlx4_dev *dev); +int mlx4_assign_eq(struct mlx4_dev *dev, char* name , int* vector); +void mlx4_release_eq(struct mlx4_dev *dev, int vec); #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3 From 14c07b1358ede1664652bb9b28d9ace5fe6f7f92 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Tue, 22 Mar 2011 22:37:59 +0000 Subject: mlx4: Wake on LAN support The driver queries the FW for WOL support. Ethtool get/set_wol is implemented accordingly. Only magic packets are supported at the time. Signed-off-by: Igor Yarovinsky Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_ethtool.c | 62 +++++++++++++++++++++++++++++++++++++++++-- drivers/net/mlx4/fw.c | 20 ++++++++++++++ drivers/net/mlx4/fw.h | 1 + drivers/net/mlx4/main.c | 1 + drivers/net/mlx4/mlx4_en.h | 7 +++++ include/linux/mlx4/device.h | 4 +++ 6 files changed, 93 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/mlx4/en_ethtool.c b/drivers/net/mlx4/en_ethtool.c index 8cfe8586ed2d..c1f351f6ae57 100644 --- a/drivers/net/mlx4/en_ethtool.c +++ b/drivers/net/mlx4/en_ethtool.c @@ -131,8 +131,65 @@ static void mlx4_en_set_msglevel(struct net_device *dev, u32 val) static void mlx4_en_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { - wol->supported = 0; - wol->wolopts = 0; + struct mlx4_en_priv *priv = netdev_priv(netdev); + int err = 0; + u64 config = 0; + + if (!priv->mdev->dev->caps.wol) { + wol->supported = 0; + wol->wolopts = 0; + return; + } + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL information\n"); + return; + } + + if (config & MLX4_EN_WOL_MAGIC) + wol->supported = WAKE_MAGIC; + else + wol->supported = 0; + + if (config & MLX4_EN_WOL_ENABLED) + wol->wolopts = WAKE_MAGIC; + else + wol->wolopts = 0; +} + +static int mlx4_en_set_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + struct mlx4_en_priv *priv = netdev_priv(netdev); + u64 config = 0; + int err = 0; + + if (!priv->mdev->dev->caps.wol) + return -EOPNOTSUPP; + + if (wol->supported & ~WAKE_MAGIC) + return -EINVAL; + + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + if (err) { + en_err(priv, "Failed to get WoL info, unable to modify\n"); + return err; + } + + if (wol->wolopts & WAKE_MAGIC) { + config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED | + MLX4_EN_WOL_MAGIC; + } else { + config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC); + config |= MLX4_EN_WOL_DO_MODIFY; + } + + err = mlx4_wol_write(priv->mdev->dev, config, priv->port); + if (err) + en_err(priv, "Failed to set WoL information\n"); + + return err; } static int mlx4_en_get_sset_count(struct net_device *dev, int sset) @@ -442,6 +499,7 @@ const struct ethtool_ops mlx4_en_ethtool_ops = { .get_ethtool_stats = mlx4_en_get_ethtool_stats, .self_test = mlx4_en_self_test, .get_wol = mlx4_en_get_wol, + .set_wol = mlx4_en_set_wol, .get_msglevel = mlx4_en_get_msglevel, .set_msglevel = mlx4_en_set_msglevel, .get_coalesce = mlx4_en_get_coalesce, diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 5de1db897835..fd1c51b6197a 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -276,6 +276,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->udp_rss = field & 0x1; MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET); dev_cap->loopback_support = field & 0x1; + dev_cap->wol = field & 0x40; MLX4_GET(dev_cap->flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; @@ -908,3 +909,22 @@ int mlx4_NOP(struct mlx4_dev *dev) /* Input modifier of 0x1f means "finish as soon as possible." */ return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100); } + +#define MLX4_WOL_SETUP_MODE (5 << 28) +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A); +} +EXPORT_SYMBOL_GPL(mlx4_wol_read); + +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) +{ + u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + + return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A); +} +EXPORT_SYMBOL_GPL(mlx4_wol_write); diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index 65cc72eb899d..f7b9cc2d1b2a 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -80,6 +80,7 @@ struct mlx4_dev_cap { u16 stat_rate_support; int udp_rss; int loopback_support; + int wol; u32 flags; int reserved_uars; int uar_size; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 517ca34f5b37..42d4fb477870 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -227,6 +227,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.udp_rss = dev_cap->udp_rss; dev->caps.loopback_support = dev_cap->loopback_support; + dev->caps.wol = dev_cap->wol; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.log_num_macs = log_num_mac; diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index 2db245fcd84b..07aea8d0beeb 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ b/drivers/net/mlx4/mlx4_en.h @@ -479,6 +479,13 @@ struct mlx4_en_priv { int mc_addrs_cnt; struct mlx4_en_stat_out_mbox hw_stats; int vids[128]; + bool wol; +}; + +enum mlx4_en_wol { + MLX4_EN_WOL_MAGIC = (1ULL << 61), + MLX4_EN_WOL_ENABLED = (1ULL << 62), + MLX4_EN_WOL_DO_MODIFY = (1ULL << 63), }; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 78380823d827..2460356d2c72 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -251,6 +251,7 @@ struct mlx4_caps { u16 stat_rate_support; int udp_rss; int loopback_support; + int wol; u8 port_width_cap[MLX4_MAX_PORTS + 1]; int max_gso_sz; int reserved_qps_cnt[MLX4_NUM_QP_REGION]; @@ -535,4 +536,7 @@ int mlx4_test_interrupts(struct mlx4_dev *dev); int mlx4_assign_eq(struct mlx4_dev *dev, char* name , int* vector); void mlx4_release_eq(struct mlx4_dev *dev, int vec); +int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); +int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); + #endif /* MLX4_DEVICE_H */ -- cgit v1.2.3 From 725c89997e03d71b09ea3c17c997da0712b9d835 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Tue, 22 Mar 2011 22:38:07 +0000 Subject: mlx4_en: Reporting HW revision in ethtool -i HW revision is derived from device ID and rev id. Signed-off-by: Eugenia Emantayev Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 1 - drivers/net/mlx4/en_ethtool.c | 15 ++++++++++++++- drivers/net/mlx4/main.c | 2 ++ drivers/net/mlx4/mlx4_en.h | 3 +++ include/linux/mlx4/device.h | 2 +- 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c7a6213c6996..66e3eecfd0db 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -721,7 +721,6 @@ static int init_node_data(struct mlx4_ib_dev *dev) if (err) goto out; - dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: diff --git a/drivers/net/mlx4/en_ethtool.c b/drivers/net/mlx4/en_ethtool.c index c1f351f6ae57..62ace6c72b42 100644 --- a/drivers/net/mlx4/en_ethtool.c +++ b/drivers/net/mlx4/en_ethtool.c @@ -45,7 +45,20 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; - sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id); + switch (mdev->dev->rev_id) { + case 0xa0: + if (dev->dev_id >= MLX4_EN_CX3_LOW_ID && dev->dev_id <= MLX4_EN_CX3_HIGH_ID) + sprintf(drvinfo->driver, DRV_NAME " (%s_CX-3)", mdev->dev->board_id); + else + sprintf(drvinfo->driver, DRV_NAME " (%s_CX)", mdev->dev->board_id); + break; + case 0xb0: + sprintf(drvinfo->driver, DRV_NAME " (%s_CX-2)", mdev->dev->board_id); + break; + default: + sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id); + break; + } strncpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", 32); sprintf(drvinfo->fw_version, "%d.%d.%d", (u16) (mdev->dev->caps.fw_ver >> 32), diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 42d4fb477870..5bebb8800ab2 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -1139,6 +1139,8 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); + pci_read_config_byte(pdev, PCI_REVISION_ID, &dev->rev_id); + /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index 07aea8d0beeb..ad4df66750bc 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ b/drivers/net/mlx4/mlx4_en.h @@ -216,6 +216,9 @@ struct mlx4_en_tx_desc { #define MLX4_EN_USE_SRQ 0x01000000 +#define MLX4_EN_CX3_LOW_ID 0x1000 +#define MLX4_EN_CX3_HIGH_ID 0x1005 + struct mlx4_en_rx_alloc { struct page *page; u16 offset; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 2460356d2c72..fe2a3a3f046c 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -422,7 +422,7 @@ struct mlx4_dev { unsigned long flags; struct mlx4_caps caps; struct radix_tree_root qp_table_tree; - u32 rev_id; + u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; }; -- cgit v1.2.3 From 0345584e0b8be3735a950d17c7e463db20c6ce27 Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Tue, 22 Mar 2011 22:38:17 +0000 Subject: mlx4: generalization of multicast steering. The same packet steering mechanism would be used both for IB and Ethernet, Both multicasts and unicasts. This commit prepares the general infrastructure for this. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 10 ++-- drivers/net/mlx4/en_main.c | 2 +- drivers/net/mlx4/fw.c | 2 + drivers/net/mlx4/fw.h | 2 + drivers/net/mlx4/main.c | 2 + drivers/net/mlx4/mcg.c | 113 ++++++++++++++++++++++++++------------ include/linux/mlx4/device.h | 14 ++++- 7 files changed, 102 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 66e3eecfd0db..fbe1973f77b0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -625,7 +625,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - MLX4_PROTOCOL_IB); + MLX4_PROT_IB_IPV6); if (err) return err; @@ -636,7 +636,7 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return 0; err_add: - mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROTOCOL_IB); + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6); return err; } @@ -666,7 +666,7 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_gid_entry *ge; err = mlx4_multicast_detach(mdev->dev, - &mqp->mqp, gid->raw, MLX4_PROTOCOL_IB); + &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6); if (err) return err; @@ -953,7 +953,7 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event mlx4_foreach_ib_transport_port(port, ibdev->dev) { oldnd = iboe->netdevs[port - 1]; iboe->netdevs[port - 1] = - mlx4_get_protocol_dev(ibdev->dev, MLX4_PROTOCOL_EN, port); + mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (oldnd != iboe->netdevs[port - 1]) { if (iboe->netdevs[port - 1]) netdev_added(ibdev, port); @@ -1206,7 +1206,7 @@ static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, - .protocol = MLX4_PROTOCOL_IB + .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c index 29aaa4303991..9317b61a75b8 100644 --- a/drivers/net/mlx4/en_main.c +++ b/drivers/net/mlx4/en_main.c @@ -296,7 +296,7 @@ static struct mlx4_interface mlx4_en_interface = { .remove = mlx4_en_remove, .event = mlx4_en_event, .get_dev = mlx4_en_get_netdev, - .protocol = MLX4_PROTOCOL_EN, + .protocol = MLX4_PROT_ETH, }; static int __init mlx4_en_init(void) diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index fd1c51b6197a..7813913b6d0f 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -274,6 +274,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->stat_rate_support = stat_rate; MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET); dev_cap->udp_rss = field & 0x1; + dev_cap->vep_uc_steering = field & 0x2; + dev_cap->vep_mc_steering = field & 0x4; MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET); dev_cap->loopback_support = field & 0x1; dev_cap->wol = field & 0x40; diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index f7b9cc2d1b2a..88003ebc6185 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -80,6 +80,8 @@ struct mlx4_dev_cap { u16 stat_rate_support; int udp_rss; int loopback_support; + int vep_uc_steering; + int vep_mc_steering; int wol; u32 flags; int reserved_uars; diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 5bebb8800ab2..da0da281d830 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -227,6 +227,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.udp_rss = dev_cap->udp_rss; dev->caps.loopback_support = dev_cap->loopback_support; + dev->caps.vep_uc_steering = dev_cap->vep_uc_steering; + dev->caps.vep_mc_steering = dev_cap->vep_mc_steering; dev->caps.wol = dev_cap->wol; dev->caps.max_gso_sz = dev_cap->max_gso_sz; diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 79cf42db2ea9..b103ad23bd84 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -32,6 +32,7 @@ */ #include +#include #include @@ -50,28 +51,28 @@ struct mlx4_mgm { static const u8 zero_gid[16]; /* automatically initialized to 0 */ -static int mlx4_READ_MCG(struct mlx4_dev *dev, int index, - struct mlx4_cmd_mailbox *mailbox) +static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) { return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG, MLX4_CMD_TIME_CLASS_A); } -static int mlx4_WRITE_MCG(struct mlx4_dev *dev, int index, - struct mlx4_cmd_mailbox *mailbox) +static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index, + struct mlx4_cmd_mailbox *mailbox) { return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A); } -static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, - u16 *hash) +static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + u16 *hash, u8 op_mod) { u64 imm; int err; - err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, 0, MLX4_CMD_MGID_HASH, - MLX4_CMD_TIME_CLASS_A); + err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod, + MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A); if (!err) *hash = imm; @@ -94,15 +95,17 @@ static int mlx4_MGID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox * If no AMGM exists for given gid, *index = -1, *prev = index of last * entry in hash chain and *mgm holds end of hash chain. */ -static int find_mgm(struct mlx4_dev *dev, - u8 *gid, enum mlx4_protocol protocol, - struct mlx4_cmd_mailbox *mgm_mailbox, - u16 *hash, int *prev, int *index) +static int find_entry(struct mlx4_dev *dev, u8 port, + u8 *gid, enum mlx4_protocol prot, + enum mlx4_steer_type steer, + struct mlx4_cmd_mailbox *mgm_mailbox, + u16 *hash, int *prev, int *index) { struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm = mgm_mailbox->buf; u8 *mgid; int err; + u8 op_mod = (prot == MLX4_PROT_ETH) ? !!(dev->caps.vep_mc_steering) : 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -111,7 +114,7 @@ static int find_mgm(struct mlx4_dev *dev, memcpy(mgid, gid, 16); - err = mlx4_MGID_HASH(dev, mailbox, hash); + err = mlx4_GID_HASH(dev, mailbox, hash, op_mod); mlx4_free_cmd_mailbox(dev, mailbox); if (err) return err; @@ -123,11 +126,11 @@ static int find_mgm(struct mlx4_dev *dev, *prev = -1; do { - err = mlx4_READ_MCG(dev, *index, mgm_mailbox); + err = mlx4_READ_ENTRY(dev, *index, mgm_mailbox); if (err) return err; - if (!memcmp(mgm->gid, zero_gid, 16)) { + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) { if (*index != *hash) { mlx4_err(dev, "Found zero MGID in AMGM.\n"); err = -EINVAL; @@ -136,7 +139,7 @@ static int find_mgm(struct mlx4_dev *dev, } if (!memcmp(mgm->gid, gid, 16) && - be32_to_cpu(mgm->members_count) >> 30 == protocol) + be32_to_cpu(mgm->members_count) >> 30 == prot) return err; *prev = *index; @@ -147,8 +150,9 @@ static int find_mgm(struct mlx4_dev *dev, return err; } -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback, enum mlx4_protocol protocol) +int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot, + enum mlx4_steer_type steer) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -159,6 +163,7 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int link = 0; int i; int err; + u8 port = gid[5]; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -166,13 +171,13 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mgm = mailbox->buf; mutex_lock(&priv->mcg_table.mutex); - - err = find_mgm(dev, gid, protocol, mailbox, &hash, &prev, &index); + err = find_entry(dev, port, gid, prot, steer, + mailbox, &hash, &prev, &index); if (err) goto out; if (index != -1) { - if (!memcmp(mgm->gid, zero_gid, 16)) + if (!(be32_to_cpu(mgm->members_count) & 0xffffff)) memcpy(mgm->gid, gid, 16); } else { link = 1; @@ -209,22 +214,22 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], else mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); - mgm->members_count = cpu_to_be32(members_count | (u32) protocol << 30); + mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); - err = mlx4_WRITE_MCG(dev, index, mailbox); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); if (err) goto out; if (!link) goto out; - err = mlx4_READ_MCG(dev, prev, mailbox); + err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) goto out; mgm->next_gid_index = cpu_to_be32(index << 6); - err = mlx4_WRITE_MCG(dev, prev, mailbox); + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) goto out; @@ -242,10 +247,9 @@ out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } -EXPORT_SYMBOL_GPL(mlx4_multicast_attach); -int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_protocol protocol) +int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot, enum mlx4_steer_type steer) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -255,6 +259,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int prev, index; int i, loc; int err; + u8 port = gid[5]; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -263,7 +268,8 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mutex_lock(&priv->mcg_table.mutex); - err = find_mgm(dev, gid, protocol, mailbox, &hash, &prev, &index); + err = find_entry(dev, port, gid, prot, steer, + mailbox, &hash, &prev, &index); if (err) goto out; @@ -285,12 +291,12 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } - mgm->members_count = cpu_to_be32(--members_count | (u32) protocol << 30); + mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30); mgm->qp[loc] = mgm->qp[i - 1]; mgm->qp[i - 1] = 0; if (i != 1) { - err = mlx4_WRITE_MCG(dev, index, mailbox); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); goto out; } @@ -298,13 +304,13 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], /* Remove entry from MGM */ int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6; if (amgm_index) { - err = mlx4_READ_MCG(dev, amgm_index, mailbox); + err = mlx4_READ_ENTRY(dev, amgm_index, mailbox); if (err) goto out; } else memset(mgm->gid, 0, 16); - err = mlx4_WRITE_MCG(dev, index, mailbox); + err = mlx4_WRITE_ENTRY(dev, index, mailbox); if (err) goto out; @@ -319,13 +325,13 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } else { /* Remove entry from AMGM */ int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; - err = mlx4_READ_MCG(dev, prev, mailbox); + err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) goto out; mgm->next_gid_index = cpu_to_be32(cur_next_index << 6); - err = mlx4_WRITE_MCG(dev, prev, mailbox); + err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) goto out; @@ -343,6 +349,43 @@ out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } + + +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback, enum mlx4_protocol prot) +{ + enum mlx4_steer_type steer; + + steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER; + + if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering) + return 0; + + if (prot == MLX4_PROT_ETH) + gid[7] |= (steer << 1); + + return mlx4_qp_attach_common(dev, qp, gid, + block_mcast_loopback, prot, + steer); +} +EXPORT_SYMBOL_GPL(mlx4_multicast_attach); + +int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + enum mlx4_protocol prot) +{ + enum mlx4_steer_type steer; + + steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER; + + if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering) + return 0; + + if (prot == MLX4_PROT_ETH) { + gid[7] |= (steer << 1); + } + + return mlx4_qp_detach_common(dev, qp, gid, prot, steer); +} EXPORT_SYMBOL_GPL(mlx4_multicast_detach); int mlx4_init_mcg_table(struct mlx4_dev *dev) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index fe2a3a3f046c..ebda939b9fc3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -150,8 +150,10 @@ enum { }; enum mlx4_protocol { - MLX4_PROTOCOL_IB, - MLX4_PROTOCOL_EN, + MLX4_PROT_IB_IPV6 = 0, + MLX4_PROT_ETH, + MLX4_PROT_IB_IPV4, + MLX4_PROT_FCOE }; enum { @@ -178,6 +180,12 @@ enum mlx4_special_vlan_idx { MLX4_VLAN_REGULAR }; +enum mlx4_steer_type { + MLX4_MC_STEER = 0, + MLX4_UC_STEER, + MLX4_NUM_STEERS +}; + enum { MLX4_NUM_FEXCH = 64 * 1024, }; @@ -251,6 +259,8 @@ struct mlx4_caps { u16 stat_rate_support; int udp_rss; int loopback_support; + int vep_uc_steering; + int vep_mc_steering; int wol; u8 port_width_cap[MLX4_MAX_PORTS + 1]; int max_gso_sz; -- cgit v1.2.3 From 1679200f91da6a054b06954c9bd3eeed29b6731f Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Tue, 22 Mar 2011 22:38:31 +0000 Subject: mlx4_en: Enabling new steering The mlx4_en module now uses the new steering mechanism. The RX packets are now steered through the MCG table instead of Mac table for unicast, and default entry for multicast. The feature is enabled through INIT_HCA Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_netdev.c | 133 +++++++++++++++++++++++++++------- drivers/net/mlx4/en_port.c | 11 ++- drivers/net/mlx4/en_port.h | 19 +++-- drivers/net/mlx4/en_rx.c | 11 +-- drivers/net/mlx4/fw.c | 3 + drivers/net/mlx4/mlx4.h | 6 ++ drivers/net/mlx4/mlx4_en.h | 1 + drivers/net/mlx4/port.c | 165 +++++++++++++++++++++++++++++++++++++++---- include/linux/mlx4/device.h | 12 +++- 9 files changed, 300 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index f6ed315b4b89..08e680100db8 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -156,9 +156,8 @@ static void mlx4_en_do_set_mac(struct work_struct *work) mutex_lock(&mdev->state_lock); if (priv->port_up) { /* Remove old MAC and insert the new one */ - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); - err = mlx4_register_mac(mdev->dev, priv->port, - priv->mac, &priv->mac_index); + err = mlx4_replace_mac(mdev->dev, priv->port, + priv->base_qpn, priv->mac, 0); if (err) en_err(priv, "Failed changing HW MAC address\n"); } else @@ -214,6 +213,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) struct mlx4_en_dev *mdev = priv->mdev; struct net_device *dev = priv->dev; u64 mcast_addr = 0; + u8 mc_list[16] = {0}; int err; mutex_lock(&mdev->state_lock); @@ -239,8 +239,12 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv->flags |= MLX4_EN_FLAG_PROMISC; /* Enable promiscouos mode */ - err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, - priv->base_qpn, 1); + if (!mdev->dev->caps.vep_uc_steering) + err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, + priv->base_qpn, 1); + else + err = mlx4_unicast_promisc_add(mdev->dev, priv->base_qpn, + priv->port); if (err) en_err(priv, "Failed enabling " "promiscous mode\n"); @@ -252,10 +256,21 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) en_err(priv, "Failed disabling " "multicast filter\n"); - /* Disable port VLAN filter */ - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL); - if (err) - en_err(priv, "Failed disabling VLAN filter\n"); + /* Add the default qp number as multicast promisc */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed entering multicast promisc mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } + + if (priv->vlgrp) { + /* Disable port VLAN filter */ + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL); + if (err) + en_err(priv, "Failed disabling VLAN filter\n"); + } } goto out; } @@ -270,11 +285,24 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ - err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, - priv->base_qpn, 0); + if (!mdev->dev->caps.vep_uc_steering) + err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, + priv->base_qpn, 0); + else + err = mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); if (err) en_err(priv, "Failed disabling promiscous mode\n"); + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling multicast promiscous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } + /* Enable port VLAN filter */ err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); if (err) @@ -287,14 +315,38 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); + + /* Add the default qp number as multicast promisc */ + if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { + err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed entering multicast promisc mode\n"); + priv->flags |= MLX4_EN_FLAG_MC_PROMISC; + } } else { int i; + /* Disable Multicast promisc */ + if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { + err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, + priv->port); + if (err) + en_err(priv, "Failed disabling multicast promiscous mode\n"); + priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; + } err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_DISABLE); if (err) en_err(priv, "Failed disabling multicast filter\n"); + /* Detach our qp from all the multicast addresses */ + for (i = 0; i < priv->mc_addrs_cnt; i++) { + memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, + mc_list, MLX4_PROT_ETH); + } /* Flush mcast filter and init it with broadcast address */ mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, ETH_BCAST, 1, MLX4_MCAST_CONFIG); @@ -307,6 +359,10 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) for (i = 0; i < priv->mc_addrs_cnt; i++) { mcast_addr = mlx4_en_mac_to_u64(priv->mc_addrs + i * ETH_ALEN); + memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, + mc_list, 0, MLX4_PROT_ETH); mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, mcast_addr, 0, MLX4_MCAST_CONFIG); } @@ -314,8 +370,6 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) 0, MLX4_MCAST_ENABLE); if (err) en_err(priv, "Failed enabling multicast filter\n"); - - mlx4_en_clear_list(dev); } out: mutex_unlock(&mdev->state_lock); @@ -557,6 +611,7 @@ int mlx4_en_start_port(struct net_device *dev) int err = 0; int i; int j; + u8 mc_list[16] = {0}; char name[32]; if (priv->port_up) { @@ -596,10 +651,20 @@ int mlx4_en_start_port(struct net_device *dev) ++rx_index; } + /* Set port mac number */ + en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port); + err = mlx4_register_mac(mdev->dev, priv->port, + priv->mac, &priv->base_qpn, 0); + if (err) { + en_err(priv, "Failed setting port mac\n"); + goto cq_err; + } + mdev->mac_removed[priv->port] = 0; + err = mlx4_en_config_rss_steer(priv); if (err) { en_err(priv, "Failed configuring rss steering\n"); - goto cq_err; + goto mac_err; } if (mdev->dev->caps.comp_pool && !priv->tx_vector) { @@ -661,24 +726,22 @@ int mlx4_en_start_port(struct net_device *dev) en_err(priv, "Failed setting default qp numbers\n"); goto tx_err; } - /* Set port mac number */ - en_dbg(DRV, priv, "Setting mac for port %d\n", priv->port); - err = mlx4_register_mac(mdev->dev, priv->port, - priv->mac, &priv->mac_index); - if (err) { - en_err(priv, "Failed setting port mac\n"); - goto tx_err; - } - mdev->mac_removed[priv->port] = 0; /* Init port */ en_dbg(HW, priv, "Initializing port\n"); err = mlx4_INIT_PORT(mdev->dev, priv->port); if (err) { en_err(priv, "Failed Initializing port\n"); - goto mac_err; + goto tx_err; } + /* Attach rx QP to bradcast address */ + memset(&mc_list[10], 0xff, ETH_ALEN); + mc_list[5] = priv->port; + if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list, + 0, MLX4_PROT_ETH)) + mlx4_warn(mdev, "Failed Attaching Broadcast\n"); + /* Schedule multicast task to populate multicast list */ queue_work(mdev->workqueue, &priv->mcast_task); @@ -686,8 +749,6 @@ int mlx4_en_start_port(struct net_device *dev) netif_tx_start_all_queues(dev); return 0; -mac_err: - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); tx_err: while (tx_index--) { mlx4_en_deactivate_tx_ring(priv, &priv->tx_ring[tx_index]); @@ -695,6 +756,8 @@ tx_err: } mlx4_en_release_rss_steer(priv); +mac_err: + mlx4_unregister_mac(mdev->dev, priv->port, priv->base_qpn); cq_err: while (rx_index--) mlx4_en_deactivate_cq(priv, &priv->rx_cq[rx_index]); @@ -710,6 +773,7 @@ void mlx4_en_stop_port(struct net_device *dev) struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int i; + u8 mc_list[16] = {0}; if (!priv->port_up) { en_dbg(DRV, priv, "stop port called while port already down\n"); @@ -724,8 +788,23 @@ void mlx4_en_stop_port(struct net_device *dev) /* Set port as not active */ priv->port_up = false; + /* Detach All multicasts */ + memset(&mc_list[10], 0xff, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, + MLX4_PROT_ETH); + for (i = 0; i < priv->mc_addrs_cnt; i++) { + memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN); + mc_list[5] = priv->port; + mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, + mc_list, MLX4_PROT_ETH); + } + mlx4_en_clear_list(dev); + /* Flush multicast filter */ + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 1, MLX4_MCAST_CONFIG); + /* Unregister Mac address for the port */ - mlx4_unregister_mac(mdev->dev, priv->port, priv->mac_index); + mlx4_unregister_mac(mdev->dev, priv->port, priv->base_qpn); mdev->mac_removed[priv->port] = 1; /* Free TX Rings */ diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c index a2b0bc4ada5d..f2a4f5dd313d 100644 --- a/drivers/net/mlx4/en_port.c +++ b/drivers/net/mlx4/en_port.c @@ -119,6 +119,10 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, struct mlx4_set_port_rqp_calc_context *context; int err; u32 in_mod; + u32 m_promisc = (dev->caps.vep_mc_steering) ? MCAST_DIRECT : MCAST_DEFAULT; + + if (dev->caps.vep_mc_steering && dev->caps.vep_uc_steering) + return 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -127,8 +131,11 @@ int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, memset(context, 0, sizeof *context); context->base_qpn = cpu_to_be32(base_qpn); - context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_EN_SHIFT | base_qpn); - context->mcast = cpu_to_be32(1 << SET_PORT_PROMISC_MODE_SHIFT | base_qpn); + context->n_mac = 0x7; + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | + base_qpn); + context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | + base_qpn); context->intra_no_vlan = 0; context->no_vlan = MLX4_NO_VLAN_IDX; context->intra_vlan_miss = 0; diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h index 092e814b1981..e3d73e41c567 100644 --- a/drivers/net/mlx4/en_port.h +++ b/drivers/net/mlx4/en_port.h @@ -36,8 +36,8 @@ #define SET_PORT_GEN_ALL_VALID 0x7 -#define SET_PORT_PROMISC_EN_SHIFT 31 -#define SET_PORT_PROMISC_MODE_SHIFT 30 +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 enum { MLX4_CMD_SET_VLAN_FLTR = 0x47, @@ -45,6 +45,12 @@ enum { MLX4_CMD_DUMP_ETH_STATS = 0x49, }; +enum { + MCAST_DIRECT_ONLY = 0, + MCAST_DIRECT = 1, + MCAST_DEFAULT = 2 +}; + struct mlx4_set_port_general_context { u8 reserved[3]; u8 flags; @@ -60,14 +66,17 @@ struct mlx4_set_port_general_context { struct mlx4_set_port_rqp_calc_context { __be32 base_qpn; - __be32 flags; - u8 reserved[3]; + u8 rererved; + u8 n_mac; + u8 n_vlan; + u8 n_prio; + u8 reserved2[3]; u8 mac_miss; u8 intra_no_vlan; u8 no_vlan; u8 intra_vlan_miss; u8 vlan_miss; - u8 reserved2[3]; + u8 reserved3[3]; u8 no_vlan_prio; __be32 promisc; __be32 mcast; diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index 570f2508fb30..05998ee297c9 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -845,16 +845,10 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) } /* Configure RSS indirection qp */ - err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &priv->base_qpn); - if (err) { - en_err(priv, "Failed to reserve range for RSS " - "indirection qp\n"); - goto rss_err; - } err = mlx4_qp_alloc(mdev->dev, priv->base_qpn, &rss_map->indir_qp); if (err) { en_err(priv, "Failed to allocate RSS indirection QP\n"); - goto reserve_err; + goto rss_err; } rss_map->indir_qp.event = mlx4_en_sqp_event; mlx4_en_fill_qp_context(priv, 0, 0, 0, 1, priv->base_qpn, @@ -881,8 +875,6 @@ indir_err: MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); -reserve_err: - mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1); rss_err: for (i = 0; i < good_qps; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], @@ -904,7 +896,6 @@ void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv) MLX4_QP_STATE_RST, NULL, 0, 0, &rss_map->indir_qp); mlx4_qp_remove(mdev->dev, &rss_map->indir_qp); mlx4_qp_free(mdev->dev, &rss_map->indir_qp); - mlx4_qp_release_range(mdev->dev, priv->base_qpn, 1); for (i = 0; i < priv->rx_ring_num; i++) { mlx4_qp_modify(mdev->dev, NULL, rss_map->state[i], diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 7813913b6d0f..67a209ba939d 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -740,6 +740,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00) #define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12) #define INIT_HCA_LOG_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16) +#define INIT_HCA_UC_STEERING_OFFSET (INIT_HCA_MCAST_OFFSET + 0x18) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) @@ -800,6 +801,8 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); MLX4_PUT(inbox, param->log_mc_hash_sz, INIT_HCA_LOG_MC_HASH_SZ_OFFSET); + if (dev->caps.vep_mc_steering) + MLX4_PUT(inbox, (u8) (1 << 3), INIT_HCA_UC_STEERING_OFFSET); MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); /* TPT attributes */ diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index 1513a91b4bd4..d8bb4418581b 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -286,6 +286,10 @@ struct mlx4_vlan_table { int max; }; +struct mlx4_mac_entry { + u64 mac; +}; + struct mlx4_port_info { struct mlx4_dev *dev; int port; @@ -293,7 +297,9 @@ struct mlx4_port_info { struct device_attribute port_attr; enum mlx4_port_type tmp_type; struct mlx4_mac_table mac_table; + struct radix_tree_root mac_tree; struct mlx4_vlan_table vlan_table; + int base_qpn; }; struct mlx4_sense { diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index ad4df66750bc..5a2c56023668 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ b/drivers/net/mlx4/mlx4_en.h @@ -458,6 +458,7 @@ struct mlx4_en_priv { struct mlx4_en_rss_map rss_map; u32 flags; #define MLX4_EN_FLAG_PROMISC 0x1 +#define MLX4_EN_FLAG_MC_PROMISC 0x2 u32 tx_ring_num; u32 rx_ring_num; u32 rx_skb_size; diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 451339559bdc..eca7d8596f87 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -90,12 +90,79 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, return err; } -int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index) +static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port, + u64 mac, int *qpn, u8 reserve) { - struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table; + struct mlx4_qp qp; + u8 gid[16] = {0}; + int err; + + if (reserve) { + err = mlx4_qp_reserve_range(dev, 1, 1, qpn); + if (err) { + mlx4_err(dev, "Failed to reserve qp for mac registration\n"); + return err; + } + } + qp.qpn = *qpn; + + mac &= 0xffffffffffffULL; + mac = cpu_to_be64(mac << 16); + memcpy(&gid[10], &mac, ETH_ALEN); + gid[5] = port; + gid[7] = MLX4_UC_STEER << 1; + + err = mlx4_qp_attach_common(dev, &qp, gid, 0, + MLX4_PROT_ETH, MLX4_UC_STEER); + if (err && reserve) + mlx4_qp_release_range(dev, *qpn, 1); + + return err; +} + +static void mlx4_uc_steer_release(struct mlx4_dev *dev, u8 port, + u64 mac, int qpn, u8 free) +{ + struct mlx4_qp qp; + u8 gid[16] = {0}; + + qp.qpn = qpn; + mac &= 0xffffffffffffULL; + mac = cpu_to_be64(mac << 16); + memcpy(&gid[10], &mac, ETH_ALEN); + gid[5] = port; + gid[7] = MLX4_UC_STEER << 1; + + mlx4_qp_detach_common(dev, &qp, gid, MLX4_PROT_ETH, MLX4_UC_STEER); + if (free) + mlx4_qp_release_range(dev, qpn, 1); +} + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + struct mlx4_mac_entry *entry; int i, err = 0; int free = -1; + if (dev->caps.vep_uc_steering) { + err = mlx4_uc_steer_add(dev, port, mac, qpn, 1); + if (!err) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) { + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); + return -ENOMEM; + } + entry->mac = mac; + err = radix_tree_insert(&info->mac_tree, *qpn, entry); + if (err) { + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); + return err; + } + } else + return err; + } mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac); mutex_lock(&table->mutex); for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) { @@ -106,7 +173,6 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index) if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { /* MAC already registered, increase refernce count */ - *index = i; ++table->refs[i]; goto out; } @@ -137,7 +203,8 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index) goto out; } - *index = free; + if (!dev->caps.vep_uc_steering) + *qpn = info->base_qpn + free; ++table->total; out: mutex_unlock(&table->mutex); @@ -145,20 +212,52 @@ out: } EXPORT_SYMBOL_GPL(mlx4_register_mac); -void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index) +static int validate_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, int index) { - struct mlx4_mac_table *table = &mlx4_priv(dev)->port[port].mac_table; + int err = 0; - mutex_lock(&table->mutex); - if (!table->refs[index]) { - mlx4_warn(dev, "No MAC entry for index %d\n", index); - goto out; + if (index < 0 || index >= table->max || !table->entries[index]) { + mlx4_warn(dev, "No valid Mac entry for the given index\n"); + err = -EINVAL; } - if (--table->refs[index]) { - mlx4_warn(dev, "Have more references for index %d," - "no need to modify MAC table\n", index); - goto out; + return err; +} + +static int find_index(struct mlx4_dev *dev, + struct mlx4_mac_table *table, u64 mac) +{ + int i; + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) + return i; } + /* Mac not found */ + return -EINVAL; +} + +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int index = qpn - info->base_qpn; + struct mlx4_mac_entry *entry; + + if (dev->caps.vep_uc_steering) { + entry = radix_tree_lookup(&info->mac_tree, qpn); + if (entry) { + mlx4_uc_steer_release(dev, port, entry->mac, qpn, 1); + radix_tree_delete(&info->mac_tree, qpn); + index = find_index(dev, table, entry->mac); + kfree(entry); + } + } + + mutex_lock(&table->mutex); + + if (validate_index(dev, table, index)) + goto out; + table->entries[index] = 0; mlx4_set_port_mac_table(dev, port, table->entries); --table->total; @@ -167,6 +266,44 @@ out: } EXPORT_SYMBOL_GPL(mlx4_unregister_mac); +int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac, u8 wrap) +{ + struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; + struct mlx4_mac_table *table = &info->mac_table; + int index = qpn - info->base_qpn; + struct mlx4_mac_entry *entry; + int err; + + if (dev->caps.vep_uc_steering) { + entry = radix_tree_lookup(&info->mac_tree, qpn); + if (!entry) + return -EINVAL; + index = find_index(dev, table, entry->mac); + mlx4_uc_steer_release(dev, port, entry->mac, qpn, 0); + entry->mac = new_mac; + err = mlx4_uc_steer_add(dev, port, entry->mac, &qpn, 0); + if (err || index < 0) + return err; + } + + mutex_lock(&table->mutex); + + err = validate_index(dev, table, index); + if (err) + goto out; + + table->entries[index] = cpu_to_be64(new_mac | MLX4_MAC_VALID); + + err = mlx4_set_port_mac_table(dev, port, table->entries); + if (unlikely(err)) { + mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) new_mac); + table->entries[index] = 0; + } +out: + mutex_unlock(&table->mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_replace_mac); static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, __be32 *entries) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index ebda939b9fc3..56fa5e1cd6d4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -525,9 +525,15 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int block_mcast_loopback, enum mlx4_protocol protocol); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], enum mlx4_protocol protocol); - -int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *index); -void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int index); +int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap); +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn); +int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac, u8 wrap); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); -- cgit v1.2.3 From c1b43dca137f2154845122417fba86d4bae67182 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 22 Mar 2011 22:38:41 +0000 Subject: mlx4: Add blue flame support for kernel consumers Using blue flame can improve latency by allowing the HW to more efficiently access the WQE. This patch presents two functions that are used to allocate or release HW resources for using blue flame; the caller need to supply a struct mlx4_bf object when allocating resources. Consumers that make use of this API should post doorbells to the UAR object pointed by the initialized struct mlx4_bf; Signed-off-by: Eli Cohen Signed-off-by: David S. Miller --- drivers/net/mlx4/main.c | 31 +++++++++++++++ drivers/net/mlx4/mlx4.h | 3 ++ drivers/net/mlx4/pd.c | 94 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mlx4/device.h | 13 +++++++ include/linux/mlx4/qp.h | 1 + 5 files changed, 142 insertions(+) (limited to 'include') diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index c8e276138f81..05c5671749aa 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -721,8 +722,31 @@ static void mlx4_free_icms(struct mlx4_dev *dev) mlx4_free_icm(dev, priv->fw.aux_icm, 0); } +static int map_bf_area(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + resource_size_t bf_start; + resource_size_t bf_len; + int err = 0; + + bf_start = pci_resource_start(dev->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); + bf_len = pci_resource_len(dev->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); + priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); + if (!priv->bf_mapping) + err = -ENOMEM; + + return err; +} + +static void unmap_bf_area(struct mlx4_dev *dev) +{ + if (mlx4_priv(dev)->bf_mapping) + io_mapping_free(mlx4_priv(dev)->bf_mapping); +} + static void mlx4_close_hca(struct mlx4_dev *dev) { + unmap_bf_area(dev); mlx4_CLOSE_HCA(dev, 0); mlx4_free_icms(dev); mlx4_UNMAP_FA(dev); @@ -775,6 +799,9 @@ static int mlx4_init_hca(struct mlx4_dev *dev) goto err_stop_fw; } + if (map_bf_area(dev)) + mlx4_dbg(dev, "Failed to map blue flame area\n"); + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); @@ -805,6 +832,7 @@ err_free_icm: mlx4_free_icms(dev); err_stop_fw: + unmap_bf_area(dev); mlx4_UNMAP_FA(dev); mlx4_free_icm(dev, priv->fw.fw_icm, 0); @@ -1196,6 +1224,9 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_read_config_byte(pdev, PCI_REVISION_ID, &dev->rev_id); + INIT_LIST_HEAD(&priv->bf_list); + mutex_init(&priv->bf_mutex); + /* * Now reset the HCA before we touch the PCI capabilities or * attempt a firmware command, since a boot ROM may have left diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index d8bb4418581b..bc9a21657a4f 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -353,6 +353,9 @@ struct mlx4_priv { struct mutex port_mutex; struct mlx4_msix_ctl msix_ctl; struct mlx4_steer *steer; + struct list_head bf_list; + struct mutex bf_mutex; + struct io_mapping *bf_mapping; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index c4988d6bd5b2..5210a0f31413 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -32,6 +32,7 @@ */ #include +#include #include @@ -77,6 +78,7 @@ int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) return -ENOMEM; uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + uar->map = NULL; return 0; } @@ -88,6 +90,98 @@ void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar) } EXPORT_SYMBOL_GPL(mlx4_uar_free); +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_uar *uar; + int err = 0; + int idx; + + if (!priv->bf_mapping) + return -ENOMEM; + + mutex_lock(&priv->bf_mutex); + if (!list_empty(&priv->bf_list)) + uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list); + else { + uar = kmalloc(sizeof *uar, GFP_KERNEL); + if (!uar) { + err = -ENOMEM; + goto out; + } + err = mlx4_uar_alloc(dev, uar); + if (err) + goto free_kmalloc; + + uar->map = ioremap(uar->pfn << PAGE_SHIFT, PAGE_SIZE); + if (!uar->map) { + err = -ENOMEM; + goto free_uar; + } + + uar->bf_map = io_mapping_map_wc(priv->bf_mapping, uar->index << PAGE_SHIFT); + if (!uar->bf_map) { + err = -ENOMEM; + goto unamp_uar; + } + uar->free_bf_bmap = 0; + list_add(&uar->bf_list, &priv->bf_list); + } + + bf->uar = uar; + idx = ffz(uar->free_bf_bmap); + uar->free_bf_bmap |= 1 << idx; + bf->uar = uar; + bf->offset = 0; + bf->buf_size = dev->caps.bf_reg_size / 2; + bf->reg = uar->bf_map + idx * dev->caps.bf_reg_size; + if (uar->free_bf_bmap == (1 << dev->caps.bf_regs_per_page) - 1) + list_del_init(&uar->bf_list); + + goto out; + +unamp_uar: + bf->uar = NULL; + iounmap(uar->map); + +free_uar: + mlx4_uar_free(dev, uar); + +free_kmalloc: + kfree(uar); + +out: + mutex_unlock(&priv->bf_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_bf_alloc); + +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int idx; + + if (!bf->uar || !bf->uar->bf_map) + return; + + mutex_lock(&priv->bf_mutex); + idx = (bf->reg - bf->uar->bf_map) / dev->caps.bf_reg_size; + bf->uar->free_bf_bmap &= ~(1 << idx); + if (!bf->uar->free_bf_bmap) { + if (!list_empty(&bf->uar->bf_list)) + list_del(&bf->uar->bf_list); + + io_mapping_unmap(bf->uar->bf_map); + iounmap(bf->uar->map); + mlx4_uar_free(dev, bf->uar); + kfree(bf->uar); + } else if (list_empty(&bf->uar->bf_list)) + list_add(&bf->uar->bf_list, &priv->bf_list); + + mutex_unlock(&priv->bf_mutex); +} +EXPORT_SYMBOL_GPL(mlx4_bf_free); + int mlx4_init_uar_table(struct mlx4_dev *dev) { if (dev->caps.num_uars <= 128) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 56fa5e1cd6d4..8985768e2c0d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -351,6 +351,17 @@ struct mlx4_fmr { struct mlx4_uar { unsigned long pfn; int index; + struct list_head bf_list; + unsigned free_bf_bmap; + void __iomem *map; + void __iomem *bf_map; +}; + +struct mlx4_bf { + unsigned long offset; + int buf_size; + struct mlx4_uar *uar; + void __iomem *reg; }; struct mlx4_cq { @@ -478,6 +489,8 @@ void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); +int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf); +void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf); int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, struct mlx4_mtt *mtt); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 0eeb2a1a867c..9e9eb21056ca 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -303,6 +303,7 @@ struct mlx4_wqe_data_seg { enum { MLX4_INLINE_ALIGN = 64, + MLX4_INLINE_SEG = 1 << 31, }; struct mlx4_wqe_inline_seg { -- cgit v1.2.3 From a861a1e1c398fe34701569fd8ac9225dfe0a9a7e Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:51 +0000 Subject: NFSv4.1: add generic layer hooks for pnfs COMMIT We create three major hooks for the pnfs code. pnfs_mark_request_commit() is called during writeback_done from nfs_mark_request_commit, which gives the driver an opportunity to claim it wants control over commiting a particular req. pnfs_choose_commit_list() is called from nfs_scan_list to choose which list a given req should be added to, based on where we intend to send it for COMMIT. It is up to the driver to have preallocated list headers for each destination it may need. pnfs_commit_list() is how the driver actually takes control, it is used instead of nfs_commit_list(). In order to pass information between the above functions, we create a union in nfs_page to hold a lseg (which is possible because the req is not on any list while in transition), and add some flags to indicate if we need to use the pnfs code. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 5 ++-- fs/nfs/pnfs.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/write.c | 41 +++++++++++++++++---------- include/linux/nfs_fs.h | 1 + include/linux/nfs_page.h | 6 +++- 5 files changed, 108 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fd85618149a1..87a593c2b055 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -398,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, pgoff_t idx_end; int found, i; int res; + struct list_head *list; res = 0; if (npages == 0) @@ -418,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, idx_start = req->wb_index + 1; if (nfs_set_page_tag_locked(req)) { kref_get(&req->wb_kref); - nfs_list_remove_request(req); radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, tag); - nfs_list_add_request(req, dst); + list = pnfs_choose_commit_list(req, dst); + nfs_list_add_request(req, list); res++; if (res == INT_MAX) goto out; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6380b9405bcd..5370f1b9aa43 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -74,6 +74,13 @@ struct pnfs_layoutdriver_type { /* test for nfs page cache coalescing */ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + /* Returns true if layoutdriver wants to divert this request to + * driver's commit routine. + */ + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); + struct list_head * (*choose_commit_list) (struct nfs_page *req); + int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -169,6 +176,51 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) return nfss->pnfs_curr_ld != NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + if (lseg) { + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; + if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { + set_bit(PG_PNFS_COMMIT, &req->wb_flags); + req->wb_commit_lseg = get_lseg(lseg); + } + } +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + struct list_head *rv; + + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { + struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); + /* matched by ref taken when PG_PNFS_COMMIT is set */ + put_lseg(req->wb_commit_lseg); + } else + rv = mds; + return rv; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) + put_lseg(req->wb_commit_lseg); +} + #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -252,6 +304,27 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) pgio->pg_test = NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + return mds; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f5f005e9db48..6927a18b6891 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_dirty(struct nfs_page *req) * Add a request to the inode's commit list. */ static void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -453,6 +453,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); nfsi->ncommit++; spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -481,10 +482,11 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { @@ -495,7 +497,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #else static inline void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { } @@ -512,7 +514,8 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { return 0; } @@ -615,9 +618,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, } if (nfs_clear_request_commit(req) && - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { @@ -765,11 +770,12 @@ int nfs_updatepage(struct file *file, struct page *page, return status; } -static void nfs_writepage_release(struct nfs_page *req) +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) { struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) nfs_inode_remove_request(req); nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); @@ -1087,7 +1093,7 @@ static void nfs_writeback_release_partial(void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req); + nfs_writepage_release(req, data); nfs_writedata_release(calldata); } @@ -1154,7 +1160,7 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); dprintk(" marked for commit\n"); goto next; } @@ -1357,14 +1363,15 @@ static void nfs_init_commit(struct nfs_write_data *data, nfs_fattr_init(&data->fattr); } -static void nfs_retry_commit(struct list_head *page_list) +static void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) { struct nfs_page *req; while (!list_empty(page_list)) { req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, lseg); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); @@ -1389,7 +1396,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_init_commit(data, head); return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: - nfs_retry_commit(head); + nfs_retry_commit(head, NULL); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1477,7 +1484,11 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; if (!may_wait) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4179c368844b..eddda6ce7c42 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -226,6 +226,7 @@ struct nfs_inode { #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ +#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 92d54c81f51e..8023e4e25133 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -33,11 +33,15 @@ enum { PG_CLEAN, PG_NEED_COMMIT, PG_NEED_RESCHED, + PG_PNFS_COMMIT, }; struct nfs_inode; struct nfs_page { - struct list_head wb_list; /* Defines state of page: */ + union { + struct list_head wb_list; /* Defines state of page: */ + struct pnfs_layout_segment *wb_commit_lseg; /* Used when PG_PNFS_COMMIT set */ + }; struct page *wb_page; /* page to read in/write out */ struct nfs_open_context *wb_context; /* File state context info */ struct nfs_lock_context *wb_lock_context; /* lock context info */ -- cgit v1.2.3 From e0c2b3801828aadb65dec9f67f7c6b7a675ad007 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Wed, 23 Mar 2011 13:27:53 +0000 Subject: NFSv4.1: filelayout driver specific code for COMMIT Implement all the hooks created in the previous patches. This requires exporting quite a few functions and adding a few structure fields. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 + fs/nfs/internal.h | 14 +++ fs/nfs/nfs4filelayout.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/pnfs.c | 1 + fs/nfs/write.c | 21 +++-- include/linux/nfs_fs.h | 1 + include/linux/nfs_xdr.h | 1 + 7 files changed, 267 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 477a2e512b39..229e586b1a20 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1470,6 +1470,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); #endif } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d1ddc23c404d..708705062216 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -276,11 +276,25 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 03ff80c67c6e..97e75a22af72 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -213,6 +213,37 @@ static int filelayout_write_done_cb(struct rpc_task *task, return 0; } +/* Fake up some data that will cause nfs_commit_release to retry the writes. */ +static void prepare_to_resend_writes(struct nfs_write_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(data->verf.verifier, first->wb_verf.verifier, + sizeof(first->wb_verf.verifier)); + data->verf.verifier[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + prepare_to_resend_writes(data); + filelayout_set_lo_fail(data->lseg); + } else + nfs_restart_rpc(task, data->ds_clp); + return -EAGAIN; + } + + return 0; +} + static void filelayout_write_prepare(struct rpc_task *task, void *data) { struct nfs_write_data *wdata = (struct nfs_write_data *)data; @@ -240,6 +271,16 @@ static void filelayout_write_release(void *data) wdata->mds_ops->rpc_release(data); } +static void filelayout_commit_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + nfs_commit_release_pages(wdata); + if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) + nfs_commit_clear_lock(NFS_I(wdata->inode)); + nfs_commitdata_release(wdata); +} + struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, @@ -252,6 +293,12 @@ struct rpc_call_ops filelayout_write_call_ops = { .rpc_release = filelayout_write_release, }; +struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_commit_release, +}; + static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { @@ -574,6 +621,191 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) +{ + return !FILELAYOUT_LSEG(lseg)->commit_through_mds; +} + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_commit_lseg; + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, + (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + i = select_bucket_index(fl, j); + list = &fl->commit_buckets[i]; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg */ + get_lseg(lseg); + } + return list; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + prepare_to_resend_writes(data); + data->mds_ops->rpc_release(data); + return -EAGAIN; + } + dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); + data->write_done_cb = filelayout_commit_done_cb; + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, + &filelayout_commit_call_ops, how); +} + +/* + * This is only useful while we are using whole file layouts. + */ +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = get_lseg(lseg); + spin_unlock(&inode->i_lock); + return rv; +} + +static int alloc_ds_commits(struct inode *inode, struct list_head *list) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + struct nfs_write_data *data; + int i, j; + + /* Won't need this when non-whole file layout segments are supported + * instead we will use a pnfs_layout_hdr structure */ + lseg = find_only_write_lseg(inode); + if (!lseg) + return 0; + fl = FILELAYOUT_LSEG(lseg); + for (i = 0; i < fl->number_of_buckets; i++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->ds_commit_index = i; + data->lseg = lseg; + list_add(&data->pages, list); + } + put_lseg(lseg); + return 0; + +out_bad: + for (j = i; j < fl->number_of_buckets; j++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + nfs_retry_commit(&fl->commit_buckets[i], lseg); + put_lseg(lseg); /* associated with emptying bucket */ + } + put_lseg(lseg); + /* Caller will clean up entries put on list */ + return -ENOMEM; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how) +{ + struct nfs_write_data *data, *tmp; + LIST_HEAD(list); + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->lseg = NULL; + list_add(&data->pages, &list); + } + + if (alloc_ds_commits(inode, &list)) + goto out_bad; + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + atomic_inc(&NFS_I(inode)->commits_outstanding); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL); + nfs_initiate_commit(data, NFS_CLIENT(inode), + data->mds_ops, how); + } else { + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + filelayout_initiate_commit(data, how); + } + } + return 0; + out_bad: + list_for_each_entry_safe(data, tmp, &list, pages) { + nfs_retry_commit(&data->pages, data->lseg); + list_del_init(&data->pages); + nfs_commit_free(data); + } + nfs_retry_commit(mds_pages, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -581,6 +813,9 @@ static struct pnfs_layoutdriver_type filelayout_type = { .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, + .mark_pnfs_commit = filelayout_mark_pnfs_commit, + .choose_commit_list = filelayout_choose_commit_list, + .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f38813a0a295..c67565965f2a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,6 +259,7 @@ put_lseg(struct pnfs_layout_segment *lseg) pnfs_free_lseg_list(&free_me); } } +EXPORT_SYMBOL_GPL(put_lseg); static bool should_free_lseg(u32 lseg_iomode, u32 recall_iomode) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cae5d160d835..e7aeda0663c5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); void nfs_commit_free(struct nfs_write_data *p) { @@ -66,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { @@ -1283,15 +1285,15 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) return (ret < 0) ? ret : 1; } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); smp_mb__after_clear_bit(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); - -static void nfs_commitdata_release(void *data) +void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1299,8 +1301,9 @@ static void nfs_commitdata_release(void *data) put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } +EXPORT_SYMBOL_GPL(nfs_commitdata_release); -static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how) { @@ -1334,11 +1337,12 @@ static int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *cln rpc_put_task(task); return 0; } +EXPORT_SYMBOL_GPL(nfs_initiate_commit); /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_init_commit(struct nfs_write_data *data, +void nfs_init_commit(struct nfs_write_data *data, struct list_head *head, struct pnfs_layout_segment *lseg) { @@ -1365,8 +1369,9 @@ static void nfs_init_commit(struct nfs_write_data *data, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); } +EXPORT_SYMBOL_GPL(nfs_init_commit); -static void nfs_retry_commit(struct list_head *page_list, +void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg) { struct nfs_page *req; @@ -1381,6 +1386,7 @@ static void nfs_retry_commit(struct list_head *page_list, nfs_clear_page_tag_locked(req); } } +EXPORT_SYMBOL_GPL(nfs_retry_commit); /* * Commit dirty pages @@ -1419,7 +1425,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) return; } -static void nfs_commit_release_pages(struct nfs_write_data *data) +void nfs_commit_release_pages(struct nfs_write_data *data) { struct nfs_page *req; int status = data->task.tk_status; @@ -1456,6 +1462,7 @@ static void nfs_commit_release_pages(struct nfs_write_data *data) nfs_clear_page_tag_locked(req); } } +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); static void nfs_commit_release(void *calldata) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index eddda6ce7c42..807e07c86b26 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -198,6 +198,7 @@ struct nfs_inode { /* pNFS layout information */ struct pnfs_layout_hdr *layout; + atomic_t commits_outstanding; #endif /* CONFIG_NFS_V4*/ #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2c2c67d2eb42..ac0c0e51786e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1040,6 +1040,7 @@ struct nfs_write_data { struct nfs_writeres res; /* result struct */ struct pnfs_layout_segment *lseg; struct nfs_client *ds_clp; /* pNFS data server */ + int ds_commit_index; const struct rpc_call_ops *mds_ops; int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); #ifdef CONFIG_NFS_V4 -- cgit v1.2.3 From 863a3c6c686d5773f7192a4818769e15db12ce08 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 23 Mar 2011 13:27:54 +0000 Subject: NFSv4.1: layoutcommit The filelayout driver sends LAYOUTCOMMIT only when COMMIT goes to the data server (as opposed to the MDS) and the data server WRITE is not NFS_FILE_SYNC. Only whole file layout support means that there is only one IOMODE_RW layout segment. Signed-off-by: Andy Adamson Signed-off-by: Alexandros Batsakis Signed-off-by: Boaz Harrosh Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Mingyang Guo Signed-off-by: Tao Guo Signed-off-by: Zhang Jingwang Tested-by: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 ++ fs/nfs/nfs4_fs.h | 2 + fs/nfs/nfs4filelayout.c | 18 +++++++ fs/nfs/nfs4proc.c | 94 +++++++++++++++++++++++++++++++++++ fs/nfs/nfs4xdr.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/pnfs.c | 94 +++++++++++++++++++++++++++++++++++ fs/nfs/pnfs.h | 9 +++- fs/nfs/write.c | 15 +++++- include/linux/nfs4.h | 1 + include/linux/nfs_fs.h | 1 + include/linux/nfs_xdr.h | 23 +++++++++ 11 files changed, 387 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d85a534b15cd..85cb95de5df5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync) ret = xchg(&ctx->error, 0); if (!ret && status < 0) ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, 1); return ret; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c64be1cff080..1e612d159b71 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -262,6 +262,8 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + int sync); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 97e75a22af72..fc1a0e9c1270 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -153,6 +153,23 @@ static int filelayout_read_done_cb(struct rpc_task *task, return 0; } +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + */ +static void +filelayout_set_layoutcommit(struct nfs_write_data *wdata) +{ + if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, + (unsigned long) wdata->lseg->pls_end_pos); +} + /* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its @@ -210,6 +227,7 @@ static int filelayout_write_done_cb(struct rpc_task *task, return -EAGAIN; } + filelayout_set_layoutcommit(data); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d61cccc8d4d..6f2f40239d10 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5616,6 +5616,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) } EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case NFS4ERR_BADLAYOUT: /* no layout */ + case NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + } + + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return; + } + + if (task->tk_status == 0) + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + + /* Matched by references in pnfs_set_layoutcommit */ + put_lseg(data->lseg); + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (!sync) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 07cdf925c524..207d399c8dee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -324,6 +324,18 @@ static int nfs4_stat_to_errno(int); #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) + #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -727,6 +739,17 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1816,6 +1839,34 @@ encode_layoutget(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutget_maxsz; } + +static int +encode_layoutcommit(struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + *p++ = cpu_to_be32(0); /* no file layout payload */ + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2607,6 +2658,26 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_layoutget(xdr, args, &hdr); encode_nops(&hdr); } + +/* + * Encode LAYOUTCOMMIT request + */ +static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -5007,6 +5078,35 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6068,6 +6168,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6269,6 +6397,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c67565965f2a..2a08ca0dddc1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -946,3 +946,97 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } + +/* + * Currently there is only one (whole file) write lseg. + */ +static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = lseg; + return rv; +} + +void +pnfs_set_layoutcommit(struct nfs_write_data *wdata) +{ + struct nfs_inode *nfsi = NFS_I(wdata->inode); + loff_t end_pos = wdata->args.offset + wdata->res.count; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + wdata->lseg->pls_lc_cred = + get_rpccred(wdata->args.context->state->owner->so_cred); + mark_inode_dirty_sync(wdata->inode); + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, wdata->inode->i_ino); + } + if (end_pos > wdata->lseg->pls_end_pos) + wdata->lseg->pls_end_pos = end_pos; + spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +int +pnfs_layoutcommit_inode(struct inode *inode, int sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg; + struct rpc_cred *cred; + loff_t end_pos; + int status = 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + spin_lock(&inode->i_lock); + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + spin_unlock(&inode->i_lock); + kfree(data); + goto out; + } + /* + * Currently only one (whole file) write lseg which is referenced + * in pnfs_set_layoutcommit and will be found. + */ + lseg = pnfs_list_write_lseg(inode); + + end_pos = lseg->pls_end_pos; + cred = lseg->pls_lc_cred; + lseg->pls_end_pos = 0; + lseg->pls_lc_cred = NULL; + + if (!data) { + put_lseg(lseg); + spin_unlock(&inode->i_lock); + put_rpccred(cred); + status = -ENOMEM; + goto out; + } else { + memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, + sizeof(nfsi->layout->plh_stateid.data)); + } + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->lseg = lseg; + data->cred = cred; + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("<-- %s status %d\n", __func__, status); + return status; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5370f1b9aa43..0806c77862b6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -43,6 +43,8 @@ struct pnfs_layout_segment { atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; + struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ + loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ }; enum pnfs_try_status { @@ -152,7 +154,8 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); - +void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +int pnfs_layoutcommit_inode(struct inode *inode, int sync); static inline int lo_fail_bit(u32 iomode) { @@ -325,6 +328,10 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req) { } +static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e7aeda0663c5..a03c11f9081e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1562,7 +1562,20 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return nfs_commit_unstable_pages(inode, wbc); + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status, sync = wbc->sync_mode; + + if (wbc->nonblocking || wbc->for_background) + sync = 0; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; } /* diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 134716e5e350..bf22cfaaeee5 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -560,6 +560,7 @@ enum { NFSPROC4_CLNT_RECLAIM_COMPLETE, NFSPROC4_CLNT_LAYOUTGET, NFSPROC4_CLNT_GETDEVICEINFO, + NFSPROC4_CLNT_LAYOUTCOMMIT, }; /* nfs41 types */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 807e07c86b26..1b93b9c60e55 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -228,6 +228,7 @@ struct nfs_inode { #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ +#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ac0c0e51786e..84f3585c5728 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -236,6 +236,29 @@ struct nfs4_getdeviceinfo_res { struct nfs4_sequence_res seq_res; }; +struct nfs4_layoutcommit_args { + nfs4_stateid stateid; + __u64 lastbytewritten; + struct inode *inode; + const u32 *bitmask; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_layoutcommit_res { + struct nfs_fattr *fattr; + const struct nfs_server *server; + struct nfs4_sequence_res seq_res; +}; + +struct nfs4_layoutcommit_data { + struct rpc_task task; + struct nfs_fattr fattr; + struct pnfs_layout_segment *lseg; + struct rpc_cred *cred; + struct nfs4_layoutcommit_args args; + struct nfs4_layoutcommit_res res; +}; + /* * Arguments to the open call. */ -- cgit v1.2.3 From d47d81c0e9abdc3c88653fabff5beae82c949b09 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:16:41 +0100 Subject: Introduce ARCH_NO_SYSDEV_OPS config option (v2) Introduce Kconfig option allowing architectures where sysdev operations used during system suspend, resume and shutdown have been completely replaced with struct sycore_ops operations to avoid building sysdev code that will never be used. Make callbacks in struct sys_device and struct sysdev_driver depend on ARCH_NO_SYSDEV_OPS to allows us to verify if all of the references have been actually removed from the code the given architecture depends on. Make x86 select ARCH_NO_SYSDEV_OPS. Signed-off-by: Rafael J. Wysocki --- arch/x86/Kconfig | 1 + drivers/base/Kconfig | 7 +++++++ drivers/base/sys.c | 3 ++- include/linux/device.h | 4 ++++ include/linux/pm.h | 10 ++++++++-- include/linux/sysdev.h | 7 +++++-- 6 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d57ddd7573cc..b1cd5a96a511 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,6 +71,7 @@ config X86 select GENERIC_IRQ_SHOW select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP + select ARCH_NO_SYSDEV_OPS config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index d57e8d0fb823..e9e5238f3106 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -168,4 +168,11 @@ config SYS_HYPERVISOR bool default n +config ARCH_NO_SYSDEV_OPS + bool + ---help--- + To be selected by architectures that don't use sysdev class or + sysdev driver power management (suspend/resume) and shutdown + operations. + endmenu diff --git a/drivers/base/sys.c b/drivers/base/sys.c index f6fb54741602..fbe72da6c414 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -329,7 +329,7 @@ void sysdev_unregister(struct sys_device *sysdev) } - +#ifndef CONFIG_ARCH_NO_SYSDEV_OPS /** * sysdev_shutdown - Shut down all system devices. * @@ -524,6 +524,7 @@ int sysdev_resume(void) return 0; } EXPORT_SYMBOL_GPL(sysdev_resume); +#endif /* CONFIG_ARCH_NO_SYSDEV_OPS */ int __init system_bus_init(void) { diff --git a/include/linux/device.h b/include/linux/device.h index 144ec135875f..ab8dfc095709 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -633,8 +633,12 @@ static inline int devtmpfs_mount(const char *mountpoint) { return 0; } /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); +#ifndef CONFIG_ARCH_NO_SYSDEV_OPS /* drivers/base/sys.c */ extern void sysdev_shutdown(void); +#else +static inline void sysdev_shutdown(void) { } +#endif /* debugging and troubleshooting/diagnostic helpers. */ extern const char *dev_driver_string(const struct device *dev); diff --git a/include/linux/pm.h b/include/linux/pm.h index 6618216bb973..512e09177e57 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -529,13 +529,19 @@ struct dev_power_domain { */ #ifdef CONFIG_PM_SLEEP -extern void device_pm_lock(void); +#ifndef CONFIG_ARCH_NO_SYSDEV_OPS +extern int sysdev_suspend(pm_message_t state); extern int sysdev_resume(void); +#else +static inline int sysdev_suspend(pm_message_t state) { return 0; } +static inline int sysdev_resume(void) { return 0; } +#endif + +extern void device_pm_lock(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_end(pm_message_t state); extern void device_pm_unlock(void); -extern int sysdev_suspend(pm_message_t state); extern int dpm_suspend_noirq(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index 1154c29f4101..8a75da551e4e 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -33,12 +33,13 @@ struct sysdev_class { const char *name; struct list_head drivers; struct sysdev_class_attribute **attrs; - + struct kset kset; +#ifndef CONFIG_ARCH_NO_SYSDEV_OPS /* Default operations for these types of devices */ int (*shutdown)(struct sys_device *); int (*suspend)(struct sys_device *, pm_message_t state); int (*resume)(struct sys_device *); - struct kset kset; +#endif }; struct sysdev_class_attribute { @@ -76,9 +77,11 @@ struct sysdev_driver { struct list_head entry; int (*add)(struct sys_device *); int (*remove)(struct sys_device *); +#ifndef CONFIG_ARCH_NO_SYSDEV_OPS int (*shutdown)(struct sys_device *); int (*suspend)(struct sys_device *, pm_message_t state); int (*resume)(struct sys_device *); +#endif }; -- cgit v1.2.3 From ef352e7cdf714596f51ad18809404edeaa50e8fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Mar 2011 00:13:14 -0700 Subject: net_sched: fix THROTTLED/RUNNING race commit fd245a4adb52 (net_sched: move TCQ_F_THROTTLED flag) added a race. qdisc_watchdog() is run from softirq, so special care should be taken or we can lose one state transition (THROTTLED/RUNNING) Prior to fd245a4adb52, we were manipulating q->flags (qdisc->flags &= ~TCQ_F_THROTTLED;) and this manipulation could only race with qdisc_warn_nonwc(). Since we want to avoid atomic ops in qdisc fast path - it was the meaning of commit 371121057607e (QDISC_STATE_RUNNING dont need atomic bit ops) - fix is to move THROTTLE bit into 'state' field, this one being manipulated with SMP and IRQ safe operations. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a9505b6a18e3..b931f021d7ab 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -25,6 +25,7 @@ struct qdisc_rate_table { enum qdisc_state_t { __QDISC_STATE_SCHED, __QDISC_STATE_DEACTIVATED, + __QDISC_STATE_THROTTLED, }; /* @@ -32,7 +33,6 @@ enum qdisc_state_t { */ enum qdisc___state_t { __QDISC___STATE_RUNNING = 1, - __QDISC___STATE_THROTTLED = 2, }; struct qdisc_size_table { @@ -106,17 +106,17 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) static inline bool qdisc_is_throttled(const struct Qdisc *qdisc) { - return (qdisc->__state & __QDISC___STATE_THROTTLED) ? true : false; + return test_bit(__QDISC_STATE_THROTTLED, &qdisc->state) ? true : false; } static inline void qdisc_throttled(struct Qdisc *qdisc) { - qdisc->__state |= __QDISC___STATE_THROTTLED; + set_bit(__QDISC_STATE_THROTTLED, &qdisc->state); } static inline void qdisc_unthrottled(struct Qdisc *qdisc) { - qdisc->__state &= ~__QDISC___STATE_THROTTLED; + clear_bit(__QDISC_STATE_THROTTLED, &qdisc->state); } struct Qdisc_class_ops { -- cgit v1.2.3 From 0f77a8d378254f27df4a114a5da67223af1fe93f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:29 +0900 Subject: vsprintf: Introduce %pB format specifier The %pB format specifier is for stack backtrace. Its handler sprint_backtrace() does symbol lookup using (address-1) to ensure the address will not point outside of the function. If there is a tail-call to the function marked "noreturn", gcc optimized out the code after the call then causes saved return address points outside of the function (i.e. the start of the next function), so pollutes call trace somewhat. This patch adds the %pB printk mechanism that allows architecture call-trace printout functions to improve backtrace printouts. Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Cc: linux-arch@vger.kernel.org LKML-Reference: <1300934550-21394-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kallsyms.h | 7 +++++++ kernel/kallsyms.c | 44 +++++++++++++++++++++++++++++++++++++++++--- lib/vsprintf.c | 7 ++++++- 3 files changed, 54 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index d8e9b3d1c23c..0df513b7a9f8 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -36,6 +36,7 @@ const char *kallsyms_lookup(unsigned long addr, /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); +extern int sprint_backtrace(char *buffer, unsigned long address); /* Look up a kernel symbol and print it to the kernel messages. */ extern void __print_symbol(const char *fmt, unsigned long address); @@ -79,6 +80,12 @@ static inline int sprint_symbol(char *buffer, unsigned long addr) return 0; } +static inline int sprint_backtrace(char *buffer, unsigned long addr) +{ + *buffer = '\0'; + return 0; +} + static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..59e879929b17 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. */ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d3023df8477f..d9e01fc3168e 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -574,7 +574,9 @@ char *symbol_string(char *buf, char *end, void *ptr, unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext != 'f' && ext != 's') + if (ext == 'B') + sprint_backtrace(sym, value); + else if (ext != 'f' && ext != 's') sprint_symbol(sym, value); else kallsyms_lookup(value, NULL, NULL, NULL, sym); @@ -949,6 +951,7 @@ int kptr_restrict = 1; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'M' For a 6-byte MAC address, it prints the address in the @@ -1008,6 +1011,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, /* Fallthrough */ case 'S': case 's': + case 'B': return symbol_string(buf, end, ptr, spec, *fmt); case 'R': case 'r': @@ -1279,6 +1283,7 @@ qualifier: * %ps output the name of a text symbol without offset * %pF output the name of a function pointer with its offset * %pf output the name of a function pointer without its offset + * %pB output the name of a backtrace symbol with its offset * %pR output the address range in a struct resource with decoded flags * %pr output the address range in a struct resource with raw flags * %pM output a 6-byte MAC address with colons -- cgit v1.2.3 From 51eab416c9b4b3ed16553d405ec4a5f67daa34cf Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 24 Mar 2011 20:54:35 +1000 Subject: drm/vblank: update recently added vbl interface to be more future proof. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the interface a bit cleaner by leaving a single gap in the vblank bit space instead of creating two gaps. Suggestions from Michel on mailing list/irc. Reviewed-by: Michel Dänzer Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_ioctl.c | 2 +- include/drm/drm.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c index 3617b4c4bb57..904d7e9c8e47 100644 --- a/drivers/gpu/drm/drm_ioctl.c +++ b/drivers/gpu/drm/drm_ioctl.c @@ -280,7 +280,7 @@ int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv) if (dev->driver->dumb_create) req->value = 1; break; - case DRM_CAP_HIGH_CRTC: + case DRM_CAP_VBLANK_HIGH_CRTC: req->value = 1; break; default: diff --git a/include/drm/drm.h b/include/drm/drm.h index 99cd07433fab..4be33b4ca2f8 100644 --- a/include/drm/drm.h +++ b/include/drm/drm.h @@ -463,14 +463,15 @@ struct drm_irq_busid { enum drm_vblank_seq_type { _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ }; -#define _DRM_VBLANK_HIGH_CRTC_SHIFT 16 -#define _DRM_VBLANK_HIGH_CRTC_MASK 0x001F0000 +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 #define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) #define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ @@ -755,7 +756,7 @@ struct drm_event_vblank { }; #define DRM_CAP_DUMB_BUFFER 0x1 -#define DRM_CAP_HIGH_CRTC 0x2 +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 /* typedef area */ #ifndef __KERNEL__ -- cgit v1.2.3 From f868120549fc1664b2c451d4b9882a363928c698 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Thu, 24 Mar 2011 13:54:30 +0000 Subject: dm ioctl: add flag to wipe buffers for secure data Add DM_SECURE_DATA_FLAG which userspace can use to ensure that all buffers allocated for dm-ioctl are wiped immediately after use. The user buffer is wiped as well (we do not want to keep and return sensitive data back to userspace if the flag is set). Wiping is useful for cryptsetup to ensure that the key is present in memory only in defined places and only for the time needed. (For crypt, key can be present in table during load or table status, wait and message commands). Signed-off-by: Milan Broz Signed-off-by: Alasdair G Kergon --- drivers/md/dm-ioctl.c | 23 +++++++++++++++++++++-- include/linux/dm-ioctl.h | 12 +++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 3a8f6a0e6eb5..4cacdad2270a 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1504,6 +1504,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) { struct dm_ioctl tmp, *dmi; + int secure_data; if (copy_from_user(&tmp, user, sizeof(tmp) - sizeof(tmp.data))) return -EFAULT; @@ -1511,17 +1512,28 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param) if (tmp.data_size < (sizeof(tmp) - sizeof(tmp.data))) return -EINVAL; + secure_data = tmp.flags & DM_SECURE_DATA_FLAG; + dmi = vmalloc(tmp.data_size); - if (!dmi) + if (!dmi) { + if (secure_data && clear_user(user, tmp.data_size)) + return -EFAULT; return -ENOMEM; + } if (copy_from_user(dmi, user, tmp.data_size)) goto bad; + /* Wipe the user buffer so we do not return it to userspace */ + if (secure_data && clear_user(user, tmp.data_size)) + goto bad; + *param = dmi; return 0; bad: + if (secure_data) + memset(dmi, 0, tmp.data_size); vfree(dmi); return -EFAULT; } @@ -1531,6 +1543,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) /* Always clear this flag */ param->flags &= ~DM_BUFFER_FULL_FLAG; param->flags &= ~DM_UEVENT_GENERATED_FLAG; + param->flags &= ~DM_SECURE_DATA_FLAG; /* Ignores parameters */ if (cmd == DM_REMOVE_ALL_CMD || @@ -1558,6 +1571,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param) static int ctl_ioctl(uint command, struct dm_ioctl __user *user) { int r = 0; + int wipe_buffer; unsigned int cmd; struct dm_ioctl *uninitialized_var(param); ioctl_fn fn = NULL; @@ -1602,13 +1616,15 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) * Copy the parameters into kernel space. */ r = copy_params(user, ¶m); - input_param_size = param->data_size; current->flags &= ~PF_MEMALLOC; if (r) return r; + input_param_size = param->data_size; + wipe_buffer = param->flags & DM_SECURE_DATA_FLAG; + r = validate_params(cmd, param); if (r) goto out; @@ -1623,6 +1639,9 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user) r = -EFAULT; out: + if (wipe_buffer) + memset(param, 0, input_param_size); + vfree(param); return r; } diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 78bbf47bbb96..3708455ee6c3 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 19 -#define DM_VERSION_PATCHLEVEL 1 -#define DM_VERSION_EXTRA "-ioctl (2011-01-07)" +#define DM_VERSION_MINOR 20 +#define DM_VERSION_PATCHLEVEL 0 +#define DM_VERSION_EXTRA "-ioctl (2011-02-02)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ @@ -328,4 +328,10 @@ enum { */ #define DM_UUID_FLAG (1 << 14) /* In */ +/* + * If set, all buffers are wiped after use. Use when sending + * or requesting sensitive data such as an encryption key. + */ +#define DM_SECURE_DATA_FLAG (1 << 15) /* In */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.2.3 From 7c5130588d691a3b34d02312f1bd1b6d56fe0100 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:24 +0000 Subject: NFS: lookup supports alternate client A later patch will need to perform a lookup using an alternate client with a different security flavor. This patch adds support for doing that on NFS v4. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 6 ++-- fs/nfs/internal.h | 6 ++-- fs/nfs/namespace.c | 2 +- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4_fs.h | 3 +- fs/nfs/nfs4proc.c | 86 +++++++++++++++++++++++++++---------------------- fs/nfs/proc.c | 2 +- include/linux/nfs_xdr.h | 2 +- 8 files changed, 60 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index abdf38d5971d..a83cd0d9dfab 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1068,7 +1068,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1224,7 +1224,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) goto no_entry; if (error < 0) { @@ -1562,7 +1562,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 72e0bddf7a2f..1ec5d0662ede 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,12 +296,14 @@ extern int nfs4_init_client(struct nfs_client *clp, rpc_authflavor_t authflavour, int noresvport); extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); -extern int _nfs4_call_sync(struct nfs_server *server, +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); -extern int _nfs4_call_sync_session(struct nfs_server *server, +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index bf1c68009ffd..b02720864ded 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -153,7 +153,7 @@ struct vfsmount *nfs_d_automount(struct path *path) /* Look it up again to get its attributes */ parent = dget_parent(path->dentry); - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, &path->dentry->d_name, fh, fattr); dput(parent); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0c80d8b3f96..38053d823eb0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs3_diropargs arg = { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c64be1cff080..72e5f1a2883e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -57,7 +57,8 @@ enum nfs4_session_state { struct nfs4_minor_version_ops { u32 minor_version; - int (*call_sync)(struct nfs_server *server, + int (*call_sync)(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e403d7a84466..448657456b68 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,7 +71,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -657,7 +659,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_server *server, +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -673,7 +676,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = server->client, + .rpc_client = clnt, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -692,13 +695,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, return ret; } -int _nfs4_call_sync_session(struct nfs_server *server, +int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); } #else @@ -709,25 +713,27 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ -int _nfs4_call_sync(struct nfs_server *server, +int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { args->sa_session = res->sr_session = NULL; - return rpc_call_sync(server->client, msg, 0); + return rpc_call_sync(clnt, msg, 0); } static inline -int nfs4_call_sync(struct nfs_server *server, +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - return server->nfs_client->cl_mvops->call_sync(server, msg, args, - res, cache_reply); + return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, + args, res, cache_reply); } static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) @@ -1838,7 +1844,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - status = nfs4_call_sync(server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); return status; @@ -2097,7 +2103,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| @@ -2167,7 +2173,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(info->fattr); - return nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2256,7 +2262,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fattr); - return nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) @@ -2316,9 +2322,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, + const struct nfs_fh *dirfh, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct nfs4_lookup_arg args = { @@ -2340,7 +2346,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d nfs_fattr_init(fattr); dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookupfh: %d\n", status); return status; } @@ -2352,7 +2358,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); + err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); /* FIXME: !!!! */ if (err == -NFS4ERR_MOVED) { err = -EREMOTE; @@ -2363,26 +2369,28 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, return err; } -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { int status; dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); + status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); if (status == -NFS4ERR_MOVED) status = nfs4_get_referral(dir, name, fattr, fhandle); dprintk("NFS reply lookup: %d\n", status); return status; } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(dir, name, fhandle, fattr), + _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), &exception); } while (exception.retry); return err; @@ -2428,7 +2436,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (res.fattr == NULL) return -ENOMEM; - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { entry->mask = 0; if (res.access & NFS4_ACCESS_READ) @@ -2495,7 +2503,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, .rpc_resp = &res, }; - return nfs4_call_sync(NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -2584,7 +2592,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) if (res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2685,7 +2693,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, if (res.old_fattr == NULL || res.new_fattr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); nfs_post_op_update_inode(old_dir, res.old_fattr); @@ -2736,7 +2744,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (res.fattr == NULL || res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2799,7 +2807,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) { - int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); @@ -2912,7 +2920,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, (unsigned long long)cookie); nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; @@ -3004,7 +3012,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fsstat->fattr); - return nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -3035,7 +3043,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - return nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) @@ -3080,7 +3088,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle } nfs_fattr_init(pathconf->fattr); - return nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3459,7 +3467,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); } - ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; if (res.acl_len > args.acl_len) @@ -3534,7 +3542,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl if (i < 0) return i; nfs_inode_return_delegation(inode); - ret = nfs4_call_sync(server, &msg, &arg.seq_args, &res.seq_res, 1); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* * Free each page after tx, so the only ref left is @@ -3897,7 +3905,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; arg.lock_owner.s_dev = server->s_dev; - status = nfs4_call_sync(server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: request->fl_type = F_UNLCK; @@ -4625,7 +4633,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; @@ -5593,7 +5601,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) int status; dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server, &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b8ec170f2a0f..ac40b8535d7e 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2c2c67d2eb42..71ee6799db9b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1071,7 +1071,7 @@ struct nfs_rpc_ops { struct nfs_fattr *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); - int (*lookup) (struct inode *, struct qstr *, + int (*lookup) (struct rpc_clnt *clnt, struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); int (*access) (struct inode *, struct nfs_access_entry *); int (*readlink)(struct inode *, struct page *, unsigned int, -- cgit v1.2.3 From 5a5ea0d485c9715c86bf858bbdc5f6d373b3db88 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:29 +0000 Subject: NFS: Add secinfo procedure This patch adds the nfs4 operation secinfo as a valid nfs rpc operation. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 35 +++++++++++++ fs/nfs/nfs4xdr.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 1 + include/linux/nfs_xdr.h | 37 +++++++++++++ 4 files changed, 208 insertions(+) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 448657456b68..0b8bae119976 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4639,6 +4639,40 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, return status; } +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + dprintk("NFS call secinfo %s\n", name->name); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_secinfo(dir, name, flavors), + &exception); + } while (exception.retry); + return err; +} + #ifdef CONFIG_NFS_V4_1 /* * Check the exchange flags returned by the server for invalid flags, having @@ -5756,6 +5790,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .init_client = nfs4_init_client, + .secinfo = nfs4_proc_secinfo, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0cf560f77884..98afcf947aa4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int); (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) @@ -676,6 +679,14 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_lookup_maxsz + \ decode_fs_locations_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_exchange_id_sz \ (compound_encode_hdr_maxsz + \ @@ -1620,6 +1631,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state hdr->replen += decode_delegreturn_maxsz; } +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_SECINFO); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_secinfo_maxsz; +} + #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_exchange_id(struct xdr_stream *xdr, @@ -2465,6 +2488,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * EXCHANGE_ID request @@ -4680,6 +4721,73 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } +static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.sec_oid4.len = be32_to_cpup(p); + if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.qop4 = be32_to_cpup(p++); + flavor->gss.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo_flavor *sec_flavor; + int status; + __be32 *p; + int i; + + status = decode_op_hdr(xdr, OP_SECINFO); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->flavors->num_flavors = be32_to_cpup(p); + + for (i = 0; i < res->flavors->num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + if (decode_secinfo_gss(xdr, sec_flavor)) + break; + } + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + #if defined(CONFIG_NFS_V4_1) static int decode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_res *res) @@ -5919,6 +6027,32 @@ out: return status; } +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); + if (status) + goto out; +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode EXCHANGE_ID response @@ -6258,6 +6392,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 134716e5e350..7e7f6b7e46b1 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -550,6 +550,7 @@ enum { NFSPROC4_CLNT_SETACL, NFSPROC4_CLNT_FS_LOCATIONS, NFSPROC4_CLNT_RELEASE_LOCKOWNER, + NFSPROC4_CLNT_SECINFO, /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 71ee6799db9b..3f32bf175840 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -3,6 +3,7 @@ #include #include +#include /* * To change the maximum rsize and wsize supported by the NFS client, adjust @@ -14,6 +15,9 @@ #define NFS_DEF_FILE_IO_SIZE (4096U) #define NFS_MIN_FILE_IO_SIZE (1024U) +/* Forward declaration for NFS v3 */ +struct nfs4_secinfo_flavors; + struct nfs_fsid { uint64_t major; uint64_t minor; @@ -936,6 +940,38 @@ struct nfs4_fs_locations_res { struct nfs4_sequence_res seq_res; }; +struct nfs4_secinfo_oid { + unsigned int len; + char data[GSS_OID_MAX_LEN]; +}; + +struct nfs4_secinfo_gss { + struct nfs4_secinfo_oid sec_oid4; + unsigned int qop4; + unsigned int service; +}; + +struct nfs4_secinfo_flavor { + unsigned int flavor; + struct nfs4_secinfo_gss gss; +}; + +struct nfs4_secinfo_flavors { + unsigned int num_flavors; + struct nfs4_secinfo_flavor flavors[0]; +}; + +struct nfs4_secinfo_arg { + const struct nfs_fh *dir_fh; + const struct qstr *name; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_secinfo_res { + struct nfs4_secinfo_flavors *flavors; + struct nfs4_sequence_res seq_res; +}; + #endif /* CONFIG_NFS_V4 */ struct nfstime4 { @@ -1118,6 +1154,7 @@ struct nfs_rpc_ops { struct iattr *iattr); int (*init_client) (struct nfs_client *, const struct rpc_timeout *, const char *, rpc_authflavor_t, int); + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); }; /* -- cgit v1.2.3 From 7ebb931598cd95cccea10d4bc4c0123a464ea565 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:30 +0000 Subject: NFS: use secinfo when crossing mountpoints A submount may use different security than the parent mount does. We should figure out what sec flavor the submount uses at mount time. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 ++- fs/nfs/internal.h | 7 +++ fs/nfs/namespace.c | 113 +++++++++++++++++++++++++++++++++- fs/nfs/nfs4proc.c | 14 +++++ fs/nfs/nfs4xdr.c | 11 ++-- include/linux/nfs_xdr.h | 1 + net/sunrpc/auth_gss/gss_mech_switch.c | 22 +++++++ 7 files changed, 165 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 01768e5e2c9b..058d7d63e566 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -254,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + nfs_attr_check_mountpoint(sb, fattr); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -298,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if ((fattr->valid & NFS_ATTR_FATTR_FSID) - && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1ec5d0662ede..345d86b90e26 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -214,6 +220,7 @@ extern const u32 nfs41_maxwrite_overhead; /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); #endif extern int nfs4_init_ds_session(struct nfs_client *clp); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b02720864ded..ad92bf731ff5 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internal.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -27,7 +28,8 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr); + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); /* * nfs_path - reconstruct the path given an arbitrary dentry @@ -116,6 +118,100 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +#ifdef CONFIG_NFS_V4 +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static rpc_authflavor_t nfs_negotiate_security(const struct dentry *parent, const struct dentry *dentry) +{ + int status = 0; + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + status = -ENOMEM; + goto out; + } + flavors = page_address(page); + status = secinfo(parent->d_inode, &dentry->d_name, flavors); + flavor = nfs_find_best_sec(flavors, dentry->d_inode); + put_page(page); + } + + return flavor; + +out: + status = -ENOMEM; + return status; +} + +static rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + rpc_authflavor_t flavor; + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + flavor = nfs_negotiate_security(parent, path->dentry); + if (flavor < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(flavor, clone); + if (!auth) { + flavor = -EIO; + goto out; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err < 0) + flavor = err; +out: + return flavor; +} +#else /* CONFIG_NFS_V4 */ +static inline rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + /* * nfs_d_automount - Handle crossing a mountpoint on the server * @path - The mountpoint @@ -136,6 +232,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; + rpc_authflavor_t flavor = 1; dprintk("--> nfs_d_automount()\n"); @@ -156,6 +253,13 @@ struct vfsmount *nfs_d_automount(struct path *path) err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, &path->dentry->d_name, fh, fattr); + if (err == -EPERM) { + flavor = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr); + if (flavor < 0) + err = flavor; + else + err = 0; + } dput(parent); if (err != 0) { mnt = ERR_PTR(err); @@ -165,7 +269,7 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) mnt = nfs_do_refmount(path->dentry); else - mnt = nfs_do_submount(path->dentry, fh, fattr); + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); if (IS_ERR(mnt)) goto out; @@ -232,17 +336,20 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, * @dentry - parent directory * @fh - filehandle for new root dentry * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount * */ static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) { struct nfs_clone_mount mountdata = { .sb = dentry->d_sb, .dentry = dentry, .fh = fh, .fattr = fattr, + .authflavor = authflavor, }; struct vfsmount *mnt = ERR_PTR(-ENOMEM); char *page = (char *) __get_free_page(GFP_USER); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0b8bae119976..563463777d9d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -87,6 +87,8 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + return -EPERM; case -NFS4ERR_BADOWNER: case -NFS4ERR_BADNAME: return -EINVAL; @@ -2383,6 +2385,16 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, return status; } +void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(struct nfs_fh)); + fattr->fsid.major = 1; + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { @@ -2392,6 +2404,8 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst err = nfs4_handle_exception(NFS_SERVER(dir), _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), &exception); + if (err == -EPERM) + nfs_fixup_secinfo_attributes(fattr, fhandle); } while (exception.retry); return err; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 98afcf947aa4..21c3004b72d5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -113,7 +113,7 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -2966,6 +2966,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + return -be32_to_cpup(p); } return 0; out_overflow: @@ -3953,6 +3954,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, fattr->valid |= status; status = decode_attr_error(xdr, bitmap); + if (status == -NFS4ERR_WRONGSEC) { + nfs_fixup_secinfo_attributes(fattr, fh); + status = 0; + } if (status < 0) goto xdr_error; @@ -6314,10 +6319,6 @@ static struct { { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, - { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs - * to be handled by a - * middle-layer. - */ { -1, -EIO } }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3f32bf175840..fa1ba78ca2c8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -82,6 +82,7 @@ struct nfs_fattr { #define NFS_ATTR_FATTR_CHANGE (1U << 17) #define NFS_ATTR_FATTR_PRECHANGE (1U << 18) #define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */ +#define NFS_ATTR_FATTR_MOUNTPOINT (1U << 20) /* Treat as mountpoint */ #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \ | NFS_ATTR_FATTR_MODE \ diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 8b4061049d76..6c844b01a1d1 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -160,6 +160,28 @@ gss_mech_get_by_name(const char *name) EXPORT_SYMBOL_GPL(gss_mech_get_by_name); +struct gss_api_mech * +gss_mech_get_by_OID(struct xdr_netobj *obj) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (obj->len == pos->gm_oid.len) { + if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); + static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) { -- cgit v1.2.3 From 8f70e95f9f4159184f557a1db60c909d7c1bd2e3 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Thu, 24 Mar 2011 17:12:31 +0000 Subject: NFS: Determine initial mount security When sec= is not presented as a mount option, we should attempt to determine what security flavor the server is using. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 33 +++++++++++++++++++++++++++++++-- include/linux/sunrpc/gss_api.h | 3 +++ net/sunrpc/auth_gss/gss_mech_switch.c | 16 ++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 563463777d9d..f9150f03d640 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -2191,15 +2192,43 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(flavor, server->client); + if (!auth) { + ret = -EIO; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); + if (ret < 0) + ret = -EAGAIN; +out: + return ret; +} + /* * get the file handle for the "/" directory on the server */ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int status; + int i, len, status = 0; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2]; - status = nfs4_lookup_root(server, fhandle, info); + flav_array[0] = RPC_AUTH_UNIX; + len = gss_mech_list_pseudoflavors(&flav_array[1]); + flav_array[1+len] = RPC_AUTH_NULL; + len += 2; + + for (i = 0; i < len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); + if (status == 0) + break; + } if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 5d8048beb051..332da61cf8b7 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -126,6 +126,9 @@ struct gss_api_mech *gss_mech_get_by_name(const char *); /* Similar, but get by pseudoflavor. */ struct gss_api_mech *gss_mech_get_by_pseudoflavor(u32); +/* Fill in an array with a list of supported pseudoflavors */ +int gss_mech_list_pseudoflavors(u32 *); + /* Just increments the mechanism's reference count and returns its input: */ struct gss_api_mech * gss_mech_get(struct gss_api_mech *); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 6c844b01a1d1..e3c36a274412 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -215,6 +215,22 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); +int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) +{ + struct gss_api_mech *pos = NULL; + int i = 0; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + array_ptr[i] = pos->gm_pfs->pseudoflavor; + i++; + } + spin_unlock(®istered_mechs_lock); + return i; +} + +EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors); + u32 gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) { -- cgit v1.2.3 From 35124a0994fc02545b14b9fa3aad000b3331f1c0 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 24 Mar 2011 16:48:21 -0400 Subject: Cleanup XDR parsing for LAYOUTGET, GETDEVICEINFO changes LAYOUTGET and GETDEVICEINFO XDR parsing to: - not use vmap, which doesn't work on incoherent archs - use xdr_stream parsing for all xdr Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 53 +++++++++++--- fs/nfs/nfs4filelayoutdev.c | 178 +++++++++++++++++++++++++++++++-------------- fs/nfs/nfs4proc.c | 9 +-- fs/nfs/nfs4xdr.c | 30 +++++--- fs/nfs/pnfs.c | 39 ++++++++++ fs/nfs/pnfs.h | 1 - include/linux/nfs_xdr.h | 6 +- 7 files changed, 230 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index ffb54a082f3a..6f8192f4cfc7 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -502,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, struct nfs4_layoutget_res *lgr, struct nfs4_deviceid *id) { - uint32_t *p = (uint32_t *)lgr->layout.buf; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + memcpy(id, p, sizeof(*id)); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); print_deviceid(id); @@ -529,32 +550,46 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); + if (!fl->num_fh) + goto out_err; + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), GFP_KERNEL); if (!fl->fh_array) - return -ENOMEM; + goto out_err; for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); - if (!fl->fh_array[i]) { - filelayout_free_fh_array(fl); - return -ENOMEM; - } + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { printk(KERN_ERR "Too big fh %d received %d\n", i, fl->fh_array[i]->size); - filelayout_free_fh_array(fl); - return -EIO; + goto out_err_free; } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); - p += XDR_QUADLEN(fl->fh_array[i]->size); dprintk("DEBUG: %s: fh len %d\n", __func__, fl->fh_array[i]->size); } + __free_page(scratch); return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; } static void diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 68143c162e3b..de5350f2b249 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -261,7 +261,7 @@ out: * Currently only support ipv4, and one multi-path address. */ static struct nfs4_pnfs_ds * -decode_and_add_ds(__be32 **pp, struct inode *inode) +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) { struct nfs4_pnfs_ds *ds = NULL; char *buf; @@ -269,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) u32 ip_addr, port; int nlen, rlen, i; int tmp[2]; - __be32 *r_netid, *r_addr, *p = *pp; + __be32 *p; /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; nlen = be32_to_cpup(p++); - r_netid = p; - p += XDR_QUADLEN(nlen); - /* r_addr */ - rlen = be32_to_cpup(p++); - r_addr = p; - p += XDR_QUADLEN(rlen); - *pp = p; + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); goto out_err; } + /* r_addr */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_err; + /* ipv6 length plus port is legal */ if (rlen > INET6_ADDRSTRLEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, @@ -300,7 +309,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf[rlen] = '\0'; - memcpy(buf, r_addr, rlen); + memcpy(buf, p, rlen); /* replace the port dots with dashes for the in4_pton() delimiter*/ for (i = 0; i < 2; i++) { @@ -336,90 +345,154 @@ out_err: static struct nfs4_file_layout_dsaddr* decode_device(struct inode *ino, struct pnfs_device *pdev) { - int i, dummy; + int i; u32 cnt, num; u8 *indexp; - __be32 *p = (__be32 *)pdev->area, *indicesp; - struct nfs4_file_layout_dsaddr *dsaddr; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = pdev->pages, + .page_len = pdev->pglen, + .buflen = pdev->pglen, + .len = pdev->pglen, + }; + struct page *scratch; + + /* set up xdr stream */ + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + goto out_err; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); /* Get the stripe count (number of stripe index) */ - cnt = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); dprintk("%s stripe count %d\n", __func__, cnt); if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { printk(KERN_WARNING "%s: stripe count %d greater than " "supported maximum %d\n", __func__, cnt, NFS4_PNFS_MAX_STRIPE_CNT); - goto out_err; + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; } /* Check the multipath list count */ - indicesp = p; - p += XDR_QUADLEN(cnt << 2); - num = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); dprintk("%s ds_num %u\n", __func__, num); if (num > NFS4_PNFS_MAX_MULTI_CNT) { printk(KERN_WARNING "%s: multipath count %d greater than " "supported maximum %d\n", __func__, num, NFS4_PNFS_MAX_MULTI_CNT); - goto out_err; + goto out_err_free_stripe_indices; } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + dsaddr = kzalloc(sizeof(*dsaddr) + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), GFP_KERNEL); if (!dsaddr) - goto out_err; - - dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); - if (!dsaddr->stripe_indices) - goto out_err_free; + goto out_err_free_stripe_indices; dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; dsaddr->ds_num = num; memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); - /* Go back an read stripe indices */ - p = indicesp; - indexp = &dsaddr->stripe_indices[0]; - for (i = 0; i < dsaddr->stripe_count; i++) { - *indexp = be32_to_cpup(p++); - if (*indexp >= num) - goto out_err_free; - indexp++; - } - /* Skip already read multipath list count */ - p++; - for (i = 0; i < dsaddr->ds_num; i++) { int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; - dummy = be32_to_cpup(p++); /* multipath count */ - if (dummy > 1) { + mp_count = be32_to_cpup(p); /* multipath count */ + if (mp_count > 1) { printk(KERN_WARNING "%s: Multipath count %d not supported, " "skipping all greater than 1\n", __func__, - dummy); + mp_count); } - for (j = 0; j < dummy; j++) { + for (j = 0; j < mp_count; j++) { if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + dsaddr->ds_list[i] = decode_and_add_ds(&stream, + ino); if (dsaddr->ds_list[i] == NULL) - goto out_err_free; + goto out_err_free_deviceid; } else { u32 len; /* skip extra multipath */ - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - continue; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; } } } + + __free_page(scratch); return dsaddr; -out_err_free: +out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; @@ -498,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) goto out_free; } - /* set pdev->area */ - pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); - if (!pdev->area) - goto out_free; - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); pdev->layout_type = LAYOUT_NFSV4_1_FILES; pdev->pages = pages; @@ -521,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) */ dsaddr = decode_and_add_device(inode, pdev); out_free: - if (pdev->area != NULL) - vunmap(pdev->area); for (i = 0; i < max_pages; i++) __free_page(pages[i]); kfree(pages); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 43045fa44710..8f071314e94b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5526,8 +5526,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - if (lgp->res.layout.buf != NULL) - free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -5559,12 +5557,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) dprintk("--> %s\n", __func__); - lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); - if (lgp->res.layout.buf == NULL) { - nfs4_layoutget_release(lgp); - return -ENOMEM; - } - + lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 207d399c8dee..40da65e8fa2a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2656,6 +2656,10 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + encode_nops(&hdr); } @@ -5022,6 +5026,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __be32 *p; int status; u32 layout_count; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + u32 hdrlen, recvd; status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) @@ -5038,17 +5045,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, return -EINVAL; } - p = xdr_inline_decode(xdr, 24); + p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) goto out_overflow; p = xdr_decode_hyper(p, &res->range.offset); p = xdr_decode_hyper(p, &res->range.length); res->range.iomode = be32_to_cpup(p++); res->type = be32_to_cpup(p++); - - status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); - if (unlikely(status)) - return status; + res->layoutp->len = be32_to_cpup(p); dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", __func__, @@ -5056,12 +5060,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, (unsigned long)res->range.length, res->range.iomode, res->type, - res->layout.len); + res->layoutp->len); + + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } - /* nfs4_proc_layoutget allocated a single page */ - if (res->layout.len > PAGE_SIZE) - return -ENOMEM; - memcpy(res->layout.buf, p, res->layout.len); + xdr_read_pages(xdr, res->layoutp->len); if (layout_count > 1) { /* We only handle a length one array at the moment. Any diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 22c2ddbef420..d9ab97269ce6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -472,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg = NULL; + struct page **pages = NULL; + int i; + u32 max_resp_sz, max_pages; dprintk("--> %s\n", __func__); @@ -479,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); if (lgp == NULL) return NULL; + + /* allocate pages for xdr post processing */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_err_free; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_err_free; + } + lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -487,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); + lgp->args.layout.pages = pages; + lgp->args.layout.pglen = max_pages * PAGE_SIZE; lgp->lsegpp = &lseg; /* Synchronously retrieve layout information from server and @@ -497,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo, /* remember that LAYOUTGET failed and suspend trying */ set_bit(lo_fail_bit(iomode), &lo->plh_flags); } + + /* free xdr pages */ + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + return lseg; + +out_err_free: + /* free any allocated xdr pages, lgp as it's not used */ + if (pages) { + for (i = 0; i < max_pages; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); + } + kfree(lgp); + return NULL; } bool pnfs_roc(struct inode *ino) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 33b9ae90c6f7..bc4827202e7a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -109,7 +109,6 @@ struct pnfs_device { unsigned int layout_type; unsigned int mincount; struct page **pages; - void *area; unsigned int pgbase; unsigned int pglen; }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 84f3585c5728..a6e21b10f43d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -190,8 +190,9 @@ struct nfs4_get_lease_time_res { #define PNFS_LAYOUT_MAXSIZE 4096 struct nfs4_layoutdriver_data { + struct page **pages; + __u32 pglen; __u32 len; - void *buf; }; struct pnfs_layout_range { @@ -209,6 +210,7 @@ struct nfs4_layoutget_args { struct nfs_open_context *ctx; struct nfs4_sequence_args seq_args; nfs4_stateid stateid; + struct nfs4_layoutdriver_data layout; }; struct nfs4_layoutget_res { @@ -216,8 +218,8 @@ struct nfs4_layoutget_res { struct pnfs_layout_range range; __u32 type; nfs4_stateid stateid; - struct nfs4_layoutdriver_data layout; struct nfs4_sequence_res seq_res; + struct nfs4_layoutdriver_data *layoutp; }; struct nfs4_layoutget { -- cgit v1.2.3 From 436c3b66ec9824a633724ae42de1c416af4f2063 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 24 Mar 2011 17:42:21 -0700 Subject: ipv4: Invalidate nexthop cache nh_saddr more correctly. Any operation that: 1) Brings up an interface 2) Adds an IP address to an interface 3) Deletes an IP address from an interface can potentially invalidate the nh_saddr value, requiring it to be recomputed. Perform the recomputation lazily using a generation ID. Reported-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 12 ++++++++++-- include/net/netns/ipv4.h | 1 + net/ipv4/fib_frontend.c | 11 +++++++---- net/ipv4/fib_semantics.c | 32 +++++++++++--------------------- net/ipv4/route.c | 6 ++++-- 5 files changed, 33 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a1a858035913..cd92b923a578 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -62,6 +62,7 @@ struct fib_nh { int nh_oif; __be32 nh_gw; __be32 nh_saddr; + int nh_saddr_genid; }; /* @@ -141,12 +142,19 @@ struct fib_result_nl { #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -#define FIB_RES_SADDR(res) (FIB_RES_NH(res).nh_saddr) +extern __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); + +#define FIB_RES_SADDR(net, res) \ + ((FIB_RES_NH(res).nh_saddr_genid == \ + atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ + FIB_RES_NH(res).nh_saddr : \ + fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) #define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) #define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) #define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) -#define FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? : FIB_RES_SADDR(res)) +#define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ + FIB_RES_SADDR(net, res)) struct fib_table { struct hlist_node tb_hlist; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e2e2ef57eca2..542195d9469e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -55,6 +55,7 @@ struct netns_ipv4 { int current_rt_cache_rebuild_count; atomic_t rt_genid; + atomic_t dev_addr_genid; #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 02c3ba61884a..f116ce8f1b46 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -228,7 +228,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (res.type != RTN_LOCAL || !accept_local) goto e_inval; } - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); fib_combine_itag(itag, &res); dev_match = false; @@ -258,7 +258,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, ret = 0; if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { - *spec_dst = FIB_RES_PREFSRC(res); + *spec_dst = FIB_RES_PREFSRC(net, res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } } @@ -960,6 +960,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: @@ -967,12 +968,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - fib_update_nh_saddrs(dev); + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); - fib_update_nh_saddrs(dev); + atomic_inc(&net->ipv4.dev_addr_genid); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -990,6 +991,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo { struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); @@ -1007,6 +1009,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 75b9fb5d5d37..2d4bebca671a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -695,6 +695,16 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, fib_info_hash_free(old_laddrhash, bytes); } +__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) +{ + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_cfg_scope); + nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); + + return nh->nh_saddr; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -855,9 +865,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { nexthop_nh->nh_cfg_scope = cfg->fc_scope; - nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev, - nexthop_nh->nh_gw, - nexthop_nh->nh_cfg_scope); + fib_info_update_nh_saddr(net, nexthop_nh); } endfor_nexthops(fi) link_it: @@ -1128,24 +1136,6 @@ out: return; } -void fib_update_nh_saddrs(struct net_device *dev) -{ - struct hlist_head *head; - struct hlist_node *node; - struct fib_nh *nh; - unsigned int hash; - - hash = fib_devindex_hashfn(dev->ifindex); - head = &fib_info_devhash[hash]; - hlist_for_each_entry(nh, node, head, nh_hash) { - if (nh->nh_dev != dev) - continue; - nh->nh_saddr = inet_select_addr(nh->nh_dev, - nh->nh_gw, - nh->nh_cfg_scope); - } -} - #ifdef CONFIG_IP_ROUTE_MULTIPATH /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 34921b0d3f8e..4b0c81180804 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1718,7 +1718,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) - src = FIB_RES_PREFSRC(res); + src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); @@ -2615,7 +2615,7 @@ static struct rtable *ip_route_output_slow(struct net *net, fib_select_default(&res); if (!fl4.saddr) - fl4.saddr = FIB_RES_PREFSRC(res); + fl4.saddr = FIB_RES_PREFSRC(net, res); dev_out = FIB_RES_DEV(res); fl4.flowi4_oif = dev_out->ifindex; @@ -3219,6 +3219,8 @@ static __net_init int rt_genid_init(struct net *net) { get_random_bytes(&net->ipv4.rt_genid, sizeof(net->ipv4.rt_genid)); + get_random_bytes(&net->ipv4.dev_addr_genid, + sizeof(net->ipv4.dev_addr_genid)); return 0; } -- cgit v1.2.3 From b2b755b5f10eb32fbdc73a9907c07006b17f714b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 24 Mar 2011 15:18:15 -0700 Subject: lib, arch: add filter argument to show_mem and fix private implementations Commit ddd588b5dd55 ("oom: suppress nodes that are not allowed from meminfo on oom kill") moved lib/show_mem.o out of lib/lib.a, which resulted in build warnings on all architectures that implement their own versions of show_mem(): lib/lib.a(show_mem.o): In function `show_mem': show_mem.c:(.text+0x1f4): multiple definition of `show_mem' arch/sparc/mm/built-in.o:(.text+0xd70): first defined here The fix is to remove __show_mem() and add its argument to show_mem() in all implementations to prevent this breakage. Architectures that implement their own show_mem() actually don't do anything with the argument yet, but they could be made to filter nodes that aren't allowed in the current context in the future just like the generic implementation. Reported-by: Stephen Rothwell Reported-by: James Bottomley Suggested-by: Andrew Morton Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- arch/arm/mm/init.c | 2 +- arch/ia64/mm/contig.c | 2 +- arch/ia64/mm/discontig.c | 2 +- arch/parisc/mm/init.c | 2 +- arch/powerpc/xmon/xmon.c | 2 +- arch/sparc/mm/init_32.c | 2 +- arch/tile/mm/pgtable.c | 2 +- arch/unicore32/mm/init.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- include/linux/mm.h | 5 ++--- lib/show_mem.c | 7 +------ mm/oom_kill.c | 2 +- mm/page_alloc.c | 2 +- 14 files changed, 15 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index b3b0f0f5053d..e5f6fc428348 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -78,7 +78,7 @@ __tagtable(ATAG_INITRD2, parse_tag_initrd2); */ struct meminfo meminfo; -void show_mem(void) +void show_mem(unsigned int filter) { int free = 0, total = 0, reserved = 0; int shared = 0, cached = 0, slab = 0, i; diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 54bf54059811..9a018cde5d84 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -36,7 +36,7 @@ static unsigned long max_gap; * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. */ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 61620323bb60..82ab1bc6afb1 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -614,7 +614,7 @@ void __cpuinit *per_cpu_init(void) * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. */ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index f4f4d700833a..b7ed8d7a9b33 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -544,7 +544,7 @@ void __init mem_init(void) unsigned long *empty_zero_page __read_mostly; EXPORT_SYMBOL(empty_zero_page); -void show_mem(void) +void show_mem(unsigned int filter) { int i,free = 0,total = 0,reserved = 0; int shared = 0, cached = 0; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d17d04cfb2cd..33794c1d92c3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -821,7 +821,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(); + show_mem(0); break; default: termch = cmd; diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 6d0e02c4fe09..4c31e2b6e71b 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -75,7 +75,7 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void show_mem(void) +void show_mem(unsigned int filter) { printk("Mem-info:\n"); show_free_areas(); diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 1a2b36f8866d..de7d8e21e01d 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -41,7 +41,7 @@ * The normal show_free_areas() is too verbose on Tile, with dozens * of processors and often four NUMA zones each with high and lowmem. */ -void show_mem(void) +void show_mem(unsigned int filter) { struct zone *zone; diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 3dbe3709b69d..1fc02633f700 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -55,7 +55,7 @@ early_param("initrd", early_initrd); */ struct meminfo meminfo; -void show_mem(void) +void show_mem(unsigned int filter) { int free = 0, total = 0, reserved = 0; int shared = 0, cached = 0, slab = 0, i; diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 81f13958e751..43db715f1502 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -306,7 +306,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(); + show_mem(0); } static struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 6dd3c68c13ad..d6b342b5b423 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -600,7 +600,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(); + show_mem(0); } static void fn_show_state(struct vc_data *vc) diff --git a/include/linux/mm.h b/include/linux/mm.h index f9535b2c9558..7606d7db96c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -861,7 +861,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) /* - * Flags passed to __show_mem() and __show_free_areas() to suppress output in + * Flags passed to show_mem() and __show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ @@ -1360,8 +1360,7 @@ extern void setup_per_zone_wmarks(void); extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(void); -extern void __show_mem(unsigned int flags); +extern void show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; diff --git a/lib/show_mem.c b/lib/show_mem.c index d8d602b58c31..90cbe4bb5960 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,7 +9,7 @@ #include #include -void __show_mem(unsigned int filter) +void show_mem(unsigned int filter) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, shared = 0, @@ -61,8 +61,3 @@ void __show_mem(unsigned int filter) quicklist_total_size()); #endif } - -void show_mem(void) -{ - __show_mem(0); -} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 62a5cec08a17..6a819d1b2c7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - __show_mem(SHOW_MEM_FILTER_NODES); + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e5726ab0d85..d6e7ba7373be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2195,7 +2195,7 @@ nopage: current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) - __show_mem(filter); + show_mem(filter); } return page; got_pg: -- cgit v1.2.3 From 37e826c513883099c298317bad1b3b677b2905fb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 24 Mar 2011 18:06:47 -0700 Subject: ipv4: Fix nexthop caching wrt. scoping. Move the scope value out of the fib alias entries and into fib_info, so that we always use the correct scope when recomputing the nexthop cached source address. Reported-by: Julian Anastasov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 6 +++--- net/ipv4/fib_lookup.h | 3 +-- net/ipv4/fib_semantics.c | 15 ++++++++------- net/ipv4/fib_trie.c | 12 ++++-------- 4 files changed, 16 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index cd92b923a578..e5d66ec88cf6 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -51,7 +51,6 @@ struct fib_nh { struct fib_info *nh_parent; unsigned nh_flags; unsigned char nh_scope; - unsigned char nh_cfg_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH int nh_weight; int nh_power; @@ -75,9 +74,10 @@ struct fib_info { struct net *fib_net; int fib_treeref; atomic_t fib_clntref; - int fib_dead; unsigned fib_flags; - int fib_protocol; + unsigned char fib_dead; + unsigned char fib_protocol; + unsigned char fib_scope; __be32 fib_prefsrc; u32 fib_priority; u32 *fib_metrics; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 4ec323875a02..af0f14aba169 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -10,7 +10,6 @@ struct fib_alias { struct fib_info *fa_info; u8 fa_tos; u8 fa_type; - u8 fa_scope; u8 fa_state; struct rcu_head rcu; }; @@ -29,7 +28,7 @@ extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); extern void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2d4bebca671a..641a5a2a9f9c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -222,7 +222,7 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi) unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; - val ^= fi->fib_protocol; + val ^= (fi->fib_protocol << 8) | fi->fib_scope; val ^= (__force u32)fi->fib_prefsrc; val ^= fi->fib_priority; for_nexthops(fi) { @@ -248,6 +248,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, @@ -328,7 +329,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, goto errout; err = fib_dump_info(skb, info->pid, seq, event, tb_id, - fa->fa_type, fa->fa_scope, key, dst_len, + fa->fa_type, key, dst_len, fa->fa_tos, fa->fa_info, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ @@ -699,7 +700,7 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) { nh->nh_saddr = inet_select_addr(nh->nh_dev, nh->nh_gw, - nh->nh_cfg_scope); + nh->nh_parent->fib_scope); nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid); return nh->nh_saddr; @@ -763,6 +764,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_net = hold_net(net); fi->fib_protocol = cfg->fc_protocol; + fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; @@ -864,7 +866,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } change_nexthops(fi) { - nexthop_nh->nh_cfg_scope = cfg->fc_scope; fib_info_update_nh_saddr(net, nexthop_nh); } endfor_nexthops(fi) @@ -914,7 +915,7 @@ failure: } int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, - u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, + u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) { struct nlmsghdr *nlh; @@ -936,7 +937,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, NLA_PUT_U32(skb, RTA_TABLE, tb_id); rtm->rtm_type = type; rtm->rtm_flags = fi->fib_flags; - rtm->rtm_scope = scope; + rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len) @@ -1092,7 +1093,7 @@ void fib_select_default(struct fib_result *res) list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - if (fa->fa_scope != res->scope || + if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ac87a49ad50b..90a3ff605591 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1245,7 +1245,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && fa->fa_info == fi) { fa_match = fa; break; @@ -1271,7 +1270,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = fa->fa_tos; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; @@ -1308,7 +1306,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; new_fa->fa_state = 0; /* * Insert new entry to the list. @@ -1362,7 +1359,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) continue; - if (fa->fa_scope < flp->flowi4_scope) + if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; @@ -1388,7 +1385,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, res->prefixlen = plen; res->nh_sel = nhsel; res->type = fa->fa_type; - res->scope = fa->fa_scope; + res->scope = fa->fa_info->fib_scope; res->fi = fi; res->table = tb; res->fa_head = &li->falh; @@ -1664,7 +1661,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && + fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || @@ -1863,7 +1860,6 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, RTM_NEWROUTE, tb->tb_id, fa->fa_type, - fa->fa_scope, xkey, plen, fa->fa_tos, @@ -2384,7 +2380,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth+1); seq_printf(seq, " /%d %s %s", li->plen, rtn_scope(buf1, sizeof(buf1), - fa->fa_scope), + fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_tos) -- cgit v1.2.3 From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:36 +1100 Subject: fs: protect inode->i_state with inode->i_lock Protect inode state transitions and validity checks with the inode->i_lock. This enables us to make inode state transitions independently of the inode_lock and is the first step to peeling away the inode_lock from the code. This requires that __iget() is done atomically with i_state checks during list traversals so that we don't race with another thread marking the inode I_FREEING between the state check and grabbing the reference. Also remove the unlock_new_inode() memory barrier optimisation required to avoid taking the inode_lock when clearing I_NEW. Simplify the code by simply taking the inode->i_lock around the state change and wakeup. Because the wakeup is no longer tricky, remove the wake_up_inode() function and open code the wakeup where necessary. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/block_dev.c | 2 + fs/buffer.c | 2 +- fs/drop_caches.c | 9 ++- fs/fs-writeback.c | 44 ++++++++++---- fs/inode.c | 150 ++++++++++++++++++++++++++++++++--------------- fs/notify/inode_mark.c | 21 +++++-- fs/quota/dquot.c | 13 ++-- include/linux/fs.h | 2 +- include/linux/quotaops.h | 2 +- mm/filemap.c | 2 + mm/rmap.c | 1 + 11 files changed, 174 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/fs/block_dev.c b/fs/block_dev.c index 889287019599..bc39b18cf3d0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -56,9 +56,11 @@ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } diff --git a/fs/buffer.c b/fs/buffer.c index 2219a76e2caf..da666f3148f9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1144,7 +1144,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. + * mapping->tree_lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 816f88e6b9ce..6c6f73ba0868 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,11 +18,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c6e4956786..efd1ebe879cc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -306,10 +306,12 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); } } @@ -333,6 +335,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; + spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -348,6 +351,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of b_io. */ if (wbc->sync_mode != WB_SYNC_ALL) { + spin_unlock(&inode->i_lock); requeue_io(inode); return 0; } @@ -363,6 +367,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -384,8 +389,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { @@ -395,6 +402,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -436,6 +444,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } } inode_sync_complete(inode); + spin_unlock(&inode->i_lock); return ret; } @@ -506,7 +515,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * kind does not need peridic writeout yet, and for the latter * kind writeout is handled by the freer. */ + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -515,10 +526,14 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } __iget(inode); + spin_unlock(&inode->i_lock); + pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); if (wbc->pages_skipped != pages_skipped) { @@ -724,7 +739,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } spin_unlock(&inode_lock); } @@ -1017,6 +1034,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) block_dump___mark_inode_dirty(inode); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1028,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_SYNC) - goto out; + goto out_unlock_inode; /* * Only add valid (hashed) inodes to the superblock's @@ -1036,11 +1054,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out; + goto out_unlock_inode; } if (inode->i_state & I_FREEING) - goto out; + goto out_unlock_inode; + spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). @@ -1065,7 +1084,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); } + goto out; } +out_unlock_inode: + spin_unlock(&inode->i_lock); out: spin_unlock(&inode_lock); @@ -1111,14 +1133,16 @@ static void wait_sb_inodes(struct super_block *sb) * we still have to wait for that writeout. */ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + struct address_space *mapping = inode->i_mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); /* * We hold a reference to 'inode' so it couldn't have diff --git a/fs/inode.c b/fs/inode.c index 0b3da4a77704..14b12c4ee026 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -27,6 +27,17 @@ #include #include +/* + * inode locking rules. + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * + * Lock ordering: + * inode_lock + * inode->i_lock + */ + /* * This is needed for the following functions: * - inode_has_buffers @@ -137,15 +148,6 @@ int proc_nr_inodes(ctl_table *table, int write, } #endif -static void wake_up_inode(struct inode *inode) -{ - /* - * Prevent speculative execution through spin_unlock(&inode_lock); - */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); -} - /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -336,7 +338,7 @@ static void init_once(void *foo) } /* - * inode_lock must be held + * inode->i_lock must be held */ void __iget(struct inode *inode) { @@ -413,7 +415,9 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } EXPORT_SYMBOL(__insert_inode_hash); @@ -438,7 +442,9 @@ static void __remove_inode_hash(struct inode *inode) void remove_inode_hash(struct inode *inode) { spin_lock(&inode_lock); + spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -495,7 +501,9 @@ static void dispose_list(struct list_head *head) __inode_sb_list_del(inode); spin_unlock(&inode_lock); - wake_up_inode(inode); + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); destroy_inode(inode); } } @@ -518,10 +526,17 @@ void evict_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } inode->i_state |= I_FREEING; + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + inodes_stat.nr_unused--; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -529,8 +544,6 @@ void evict_inodes(struct super_block *sb) */ list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -563,18 +576,26 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_lock(&inode_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); busy = 1; continue; } if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); busy = 1; continue; } inode->i_state |= I_FREEING; + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + inodes_stat.nr_unused--; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -582,8 +603,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) */ list_move(&inode->i_lru, &dispose); list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; } spin_unlock(&inode_lock); @@ -641,8 +660,10 @@ static void prune_icache(int nr_to_scan) * Referenced or dirty inodes are still in use. Give them * another pass through the LRU as we canot reclaim them now. */ + spin_lock(&inode->i_lock); if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { + spin_unlock(&inode->i_lock); list_del_init(&inode->i_lru); inodes_stat.nr_unused--; continue; @@ -650,12 +671,14 @@ static void prune_icache(int nr_to_scan) /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { - list_move(&inode->i_lru, &inode_lru); inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + list_move(&inode->i_lru, &inode_lru); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, @@ -666,11 +689,15 @@ static void prune_icache(int nr_to_scan) if (inode != list_entry(inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + spin_lock(&inode->i_lock); + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -737,11 +764,13 @@ repeat: continue; if (!test(inode, data)) continue; + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -763,11 +792,13 @@ repeat: continue; if (inode->i_sb != sb) continue; + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -832,14 +863,23 @@ struct inode *new_inode(struct super_block *sb) inode = alloc_inode(sb); if (inode) { spin_lock(&inode_lock); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = 0; + spin_unlock(&inode->i_lock); + __inode_sb_list_add(inode); spin_unlock(&inode_lock); } return inode; } EXPORT_SYMBOL(new_inode); +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -859,19 +899,11 @@ void unlock_new_inode(struct inode *inode) } } #endif - /* - * This is special! We do not need the spinlock when clearing I_NEW, - * because we're guaranteed that nobody else tries to do anything about - * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_NEW). - * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_NEW after the other inode initialisation has - * completed. - */ - smp_mb(); + spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; - wake_up_inode(inode); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); @@ -900,9 +932,11 @@ static struct inode *get_new_inode(struct super_block *sb, if (set(inode, data)) goto set_failed; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); __inode_sb_list_add(inode); - inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -947,9 +981,11 @@ static struct inode *get_new_inode_fast(struct super_block *sb, old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); __inode_sb_list_add(inode); - inode->i_state = I_NEW; spin_unlock(&inode_lock); /* Return the locked inode with I_NEW set, the @@ -1034,15 +1070,19 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); - else + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; + } spin_unlock(&inode_lock); return inode; } @@ -1271,7 +1311,6 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1281,16 +1320,23 @@ int insert_inode_locked(struct inode *inode) continue; if (old->i_sb != sb) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return 0; } __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { @@ -1308,8 +1354,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_NEW; - while (1) { struct hlist_node *node; struct inode *old = NULL; @@ -1320,16 +1364,23 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, continue; if (!test(old, data)) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return 0; } __iget(old); + spin_unlock(&old->i_lock); spin_unlock(&inode_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { @@ -1375,6 +1426,9 @@ static void iput_final(struct inode *inode) const struct super_operations *op = inode->i_sb->s_op; int drop; + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + if (op && op->drop_inode) drop = op->drop_inode(inode); else @@ -1386,21 +1440,23 @@ static void iput_final(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_SYNC))) { inode_lru_list_add(inode); } + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); return; } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; __remove_inode_hash(inode); } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); /* * Move the inode off the IO lists and LRU once I_FREEING is @@ -1413,8 +1469,10 @@ static void iput_final(struct inode *inode) spin_unlock(&inode_lock); evict(inode); remove_inode_hash(inode); - wake_up_inode(inode); + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -1611,9 +1669,8 @@ EXPORT_SYMBOL(inode_wait); * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to - * wake_up_inode() after removing from the hash list will DTRT. - * - * This is called with inode_lock held. + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1621,6 +1678,7 @@ static void __wait_on_freeing_inode(struct inode *inode) DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); schedule(); finish_wait(wq, &wait.wait); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4c29fcf557d1..4dd53fb44124 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list) * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } /* * If i_count is zero, the inode cannot have any watches and @@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list) * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ - if (!atomic_read(&inode->i_count)) + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); continue; + } need_iput_tmp = need_iput; need_iput = NULL; @@ -274,13 +279,17 @@ void fsnotify_unmount_inodes(struct list_head *list) __iget(inode); else need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; + atomic_read(&next_i->i_count)) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); } /* diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a2a622e079f0..a1470fda366c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -902,18 +902,19 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); continue; + } #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - if (!atomic_read(&inode->i_writecount)) - continue; - if (!dqinit_needed(inode, type)) - continue; - __iget(inode); + spin_unlock(&inode->i_lock); spin_unlock(&inode_lock); iput(old_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4dda076c24a1..ed6fdcc1484c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1647,7 +1647,7 @@ struct super_operations { }; /* - * Inode state bits. Protected by inode_lock. + * Inode state bits. Protected by inode->i_lock * * Three bits determine the dirty state of the inode, I_DIRTY_SYNC, * I_DIRTY_DATASYNC and I_DIRTY_PAGES. diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index eb354f6f26b3..26f9e3612e0f 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -277,7 +277,7 @@ static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly - * reduces inode_lock contention. + * reduces lock contention. */ mark_inode_dirty(inode); } diff --git a/mm/filemap.c b/mm/filemap.c index f807afda86f2..499e9aa91450 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -99,7 +99,9 @@ * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->inode_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * * (code doesn't rely on that order, so you could switch it around) diff --git a/mm/rmap.c b/mm/rmap.c index 4a8e99a0fb97..7dada0456448 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,6 +32,7 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, -- cgit v1.2.3 From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:41 +1100 Subject: fs: move i_wb_list out from under inode_lock Protect the inode writeback list with a new global lock inode_wb_list_lock and use it to protect the list manipulations and traversals. This lock replaces the inode_lock as the inodes on the list can be validity checked while holding the inode->i_lock and hence the inode_lock is no longer needed to protect the list. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/block_dev.c | 4 +-- fs/fs-writeback.c | 76 +++++++++++++++++++++++++++-------------------- fs/inode.c | 12 +++++--- fs/internal.h | 5 ++++ include/linux/writeback.h | 1 + mm/backing-dev.c | 8 ++--- mm/filemap.c | 8 ++--- mm/rmap.c | 4 +-- 8 files changed, 70 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/fs/block_dev.c b/fs/block_dev.c index bc39b18cf3d0..2bbc0e62102f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV); static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5de56a2182bb..ed800656356b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) spin_unlock_bh(&bdi->wb_lock); } +/* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode) wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock. Either + * the caller has an active reference on the inode or the inode has I_WILL_FREE + * set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { @@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); @@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb, inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (inode->i_state & I_FREEING) goto out_unlock_inode; - spin_unlock(&inode->i_lock); /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } - goto out; } out_unlock_inode: spin_unlock(&inode->i_lock); -out: - spin_unlock(&inode_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 785b1ab23ff0..239fdc08719e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include "internal.h" /* * inode locking rules. @@ -36,6 +37,8 @@ * inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list * * Lock ordering: * inode_lock @@ -44,6 +47,9 @@ * inode_sb_list_lock * inode->i_lock * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock */ /* @@ -105,6 +111,7 @@ static struct hlist_head *inode_hashtable __read_mostly; DEFINE_SPINLOCK(inode_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the @@ -483,10 +490,7 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - spin_lock(&inode_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&inode_lock); - + inode_wb_list_del(inode); inode_sb_list_del(inode); if (op->evict_inode) { diff --git a/fs/internal.h b/fs/internal.h index 7013ae0c88c1..b29c46e4e32f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd, */ extern spinlock_t inode_sb_list_lock; +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 0ead399e08b5..3f5fee718329 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,6 +10,7 @@ struct backing_dev_info; extern spinlock_t inode_lock; +extern spinlock_t inode_wb_list_lock; /* * fs/fs-writeback.c diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d30227..4b3e9f17ee21 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -73,14 +73,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -682,11 +682,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); diff --git a/mm/filemap.c b/mm/filemap.c index 499e9aa91450..d8b34d1a1071 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,9 +98,9 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * diff --git a/mm/rmap.c b/mm/rmap.c index 7dada0456448..8da044a1db0f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,12 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock -- cgit v1.2.3 From 67a23c494621ff1d5431c3bc320947865b224625 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 22 Mar 2011 22:23:42 +1100 Subject: fs: rename inode_lock to inode_hash_lock All that remains of the inode_lock is protecting the inode hash list manipulation and traversals. Rename the inode_lock to inode_hash_lock to reflect it's actual function. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/inode.c | 111 +++++++++++++++++++++++++--------------------- fs/notify/inode_mark.c | 1 - fs/notify/mark.c | 1 - fs/notify/vfsmount_mark.c | 1 - fs/ntfs/inode.c | 4 +- include/linux/writeback.h | 1 - 6 files changed, 63 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index 239fdc08719e..f9ee4928358f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -39,10 +39,10 @@ * sb->s_inodes, inode->i_sb_list * inode_wb_list_lock protects: * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash * * Lock ordering: - * inode_lock - * inode->i_lock * * inode_sb_list_lock * inode->i_lock @@ -50,6 +50,13 @@ * * inode_wb_list_lock * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock */ /* @@ -85,6 +92,8 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Each inode can be on two separate lists. One is @@ -100,15 +109,6 @@ static unsigned int i_hash_shift __read_mostly; static LIST_HEAD(inode_lru); static DEFINE_SPINLOCK(inode_lru_lock); -static struct hlist_head *inode_hashtable __read_mostly; - -/* - * A simple spinlock to protect the list manipulations. - * - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ -DEFINE_SPINLOCK(inode_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); @@ -433,11 +433,11 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); @@ -449,11 +449,11 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void remove_inode_hash(struct inode *inode) { - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -778,11 +778,15 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_sb != sb) + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - if (!test(inode, data)) + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; - spin_lock(&inode->i_lock); + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -806,11 +810,15 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_ino != ino) + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); continue; - if (inode->i_sb != sb) + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - spin_lock(&inode->i_lock); + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -924,7 +932,7 @@ void unlock_new_inode(struct inode *inode) EXPORT_SYMBOL(unlock_new_inode); /* - * This is called without the inode lock held.. Be careful. + * This is called without the inode hash lock held.. Be careful. * * We no longer cache the sb_flags in i_flags - see fs.h * -- rmk@arm.uk.linux.org @@ -941,7 +949,7 @@ static struct inode *get_new_inode(struct super_block *sb, if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode(sb, head, test, data); if (!old) { @@ -953,7 +961,7 @@ static struct inode *get_new_inode(struct super_block *sb, hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -966,7 +974,7 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -974,7 +982,7 @@ static struct inode *get_new_inode(struct super_block *sb, return inode; set_failed: - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); return NULL; } @@ -992,7 +1000,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino); if (!old) { @@ -1002,7 +1010,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -1015,7 +1023,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -1036,10 +1044,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_node *node; struct inode *inode; + spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, node, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); return 0; + } } + spin_unlock(&inode_hash_lock); return 1; } @@ -1069,7 +1081,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; - spin_lock(&inode_lock); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1077,7 +1088,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); - spin_unlock(&inode_lock); return res; } @@ -1119,7 +1129,7 @@ EXPORT_SYMBOL(igrab); * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ static struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), @@ -1127,15 +1137,15 @@ static struct inode *ifind(struct super_block *sb, { struct inode *inode; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); inode = find_inode(sb, head, test, data); if (inode) { - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); if (likely(wait)) wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return NULL; } @@ -1159,14 +1169,14 @@ static struct inode *ifind_fast(struct super_block *sb, { struct inode *inode; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); inode = find_inode_fast(sb, head, ino); if (inode) { - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(inode); return inode; } - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return NULL; } @@ -1189,7 +1199,7 @@ static struct inode *ifind_fast(struct super_block *sb, * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) @@ -1217,7 +1227,7 @@ EXPORT_SYMBOL(ilookup5_nowait); * * Otherwise NULL is returned. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note, @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) @@ -1268,7 +1278,8 @@ EXPORT_SYMBOL(ilookup); * inode and this is returned locked, hashed, and with the I_NEW flag set. The * file system gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_lock held, so can't sleep. + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1328,7 +1339,7 @@ int insert_inode_locked(struct inode *inode) while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_ino != ino) continue; @@ -1346,12 +1357,12 @@ int insert_inode_locked(struct inode *inode) inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1372,7 +1383,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_sb != sb) continue; @@ -1390,12 +1401,12 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1674,10 +1685,10 @@ static void __wait_on_freeing_inode(struct inode *inode) wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wait); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); } static __initdata unsigned long ihash_entries; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index fb3b3c5ef0ee..07ea8d3e6ea2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -22,7 +22,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 325185e514bb..50c00856f730 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,7 +91,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 85eebff6d0d7..e86577d6c5c3 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -23,7 +23,6 @@ #include #include #include -#include /* for inode_lock */ #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index a627ed82c0a3..0b56c6b7ec01 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -54,7 +54,7 @@ * * Return 1 if the attributes match and 0 if not. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. */ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) @@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) * * Return 0 on success and -errno on error. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. (Hence the GFP_ATOMIC allocation.) */ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f5fee718329..17e7ccc322a5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,7 +9,6 @@ struct backing_dev_info; -extern spinlock_t inode_lock; extern spinlock_t inode_wb_list_lock; /* -- cgit v1.2.3 From 6df59a84eccd4cad7fcefda3e0c5e55239a3b2dd Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Mar 2011 01:28:45 -0700 Subject: route: Take the right src and dst addresses in ip_route_newports When we set up the flow informations in ip_route_newports(), we take the address informations from the the rt_key_src and rt_key_dst fields of the rtable. They appear to be empty. So take the address informations from rt_src and rt_dst instead. This issue was introduced by commit 5e2b61f78411be25f0b84f97d5b5d312f184dfd1 ("ipv4: Remove flowi from struct rtable.") Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/route.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index dc102445ec47..f88429cad52a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -270,8 +270,8 @@ static inline struct rtable *ip_route_newports(struct rtable *rt, struct flowi4 fl4 = { .flowi4_oif = rt->rt_oif, .flowi4_mark = rt->rt_mark, - .daddr = rt->rt_key_dst, - .saddr = rt->rt_key_src, + .daddr = rt->rt_dst, + .saddr = rt->rt_src, .flowi4_tos = rt->rt_tos, .flowi4_proto = protocol, .fl4_sport = sport, -- cgit v1.2.3 From 3674f19dabd15f9541079a588149a370d888f4e6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 25 Mar 2011 17:51:54 +1100 Subject: ALSA: vmalloc buffers should use normal mmap It's a big no-no to use pgprot_noncached() when mmap'ing such buffers into userspace since they are mapped cachable in kernel space. This can cause all sort of interesting things ranging from to garbled sound to lockups on various architectures. I've observed that usb-audio is broken on powerpc 4xx for example because of that. Also remove the now unused snd_pcm_lib_mmap_noncached(). It's an arch business to know when to use uncached mappings, there's already hacks for MIPS inside snd_pcm_default_mmap() and other archs are supposed to use dma_mmap_coherent(). (See my separate patch that adds dma_mmap_coherent() to powerpc) Signed-off-by: Benjamin Herrenschmidt CC: Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 4 +--- sound/core/pcm_native.c | 9 --------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 430a9cc045e2..e1bad1130616 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1031,9 +1031,7 @@ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_s #define snd_pcm_lib_mmap_iomem NULL #endif -int snd_pcm_lib_mmap_noncached(struct snd_pcm_substream *substream, - struct vm_area_struct *area); -#define snd_pcm_lib_mmap_vmalloc snd_pcm_lib_mmap_noncached +#define snd_pcm_lib_mmap_vmalloc NULL static inline void snd_pcm_limit_isa_dma_size(int dma, size_t *max) { diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index ae42b6509ce4..fe5c8036beba 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3201,15 +3201,6 @@ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem); #endif /* SNDRV_PCM_INFO_MMAP */ -/* mmap callback with pgprot_noncached */ -int snd_pcm_lib_mmap_noncached(struct snd_pcm_substream *substream, - struct vm_area_struct *area) -{ - area->vm_page_prot = pgprot_noncached(area->vm_page_prot); - return snd_pcm_default_mmap(substream, area); -} -EXPORT_SYMBOL(snd_pcm_lib_mmap_noncached); - /* * mmap DMA buffer */ -- cgit v1.2.3 From 2092e6be82ec71ecbf5a8ceeef004bbcbdb78812 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Mar 2011 15:21:06 -0400 Subject: WARN_ON_SMP(): Allow use in if() statements on UP Both WARN_ON() and WARN_ON_SMP() should be able to be used in an if statement. if (WARN_ON_SMP(foo)) { ... } Because WARN_ON_SMP() is defined as a do { } while (0) on UP, it can not be used this way. Convert it to the same form that WARN_ON() is, even when CONFIG_SMP is off. Signed-off-by: Steven Rostedt Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20110317192208.444147791@goodmis.org> Signed-off-by: Ingo Molnar --- include/asm-generic/bug.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c2c9ba032d46..f2d2faf4d9ae 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -165,10 +165,36 @@ extern void warn_slowpath_null(const char *file, const int line); #define WARN_ON_RATELIMIT(condition, state) \ WARN_ON((condition) && __ratelimit(state)) +/* + * WARN_ON_SMP() is for cases that the warning is either + * meaningless for !SMP or may even cause failures. + * This is usually used for cases that we have + * WARN_ON(!spin_is_locked(&lock)) checks, as spin_is_locked() + * returns 0 for uniprocessor settings. + * It can also be used with values that are only defined + * on SMP: + * + * struct foo { + * [...] + * #ifdef CONFIG_SMP + * int bar; + * #endif + * }; + * + * void func(struct foo *zoot) + * { + * WARN_ON_SMP(!zoot->bar); + * + * For CONFIG_SMP, WARN_ON_SMP() should act the same as WARN_ON(), + * and should be a nop and return false for uniprocessor. + * + * if (WARN_ON_SMP(x)) returns true only when CONFIG_SMP is set + * and x is true. + */ #ifdef CONFIG_SMP # define WARN_ON_SMP(x) WARN_ON(x) #else -# define WARN_ON_SMP(x) do { } while (0) +# define WARN_ON_SMP(x) ({0;}) #endif #endif -- cgit v1.2.3 From ab7798ffcf98b11a9525cf65bacdae3fd58d357f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 16:48:50 +0100 Subject: genirq: Expand generic show_interrupts() Some archs want to print extra information for certain irq_chips which is per irq and not per chip. Allow them to provide a chip callback to print the chip name and the extra information. PowerPC wants to print the LEVEL/EDGE type information. Make it configurable. Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 4 ++++ kernel/irq/Kconfig | 4 ++++ kernel/irq/proc.c | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d3577f30d45..5d876c9b3a3d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -28,6 +28,7 @@ #include #include +struct seq_file; struct irq_desc; struct irq_data; typedef void (*irq_flow_handler_t)(unsigned int irq, @@ -270,6 +271,7 @@ static inline bool irqd_can_move_in_process_context(struct irq_data *d) * @irq_set_wake: enable/disable power-management wake-on of an IRQ * @irq_bus_lock: function to lock access to slow bus (i2c) chips * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips + * @irq_print_chip: optional to print special chip info in show_interrupts * @flags: chip specific flags * * @release: release function solely used by UML @@ -317,6 +319,8 @@ struct irq_chip { void (*irq_bus_lock)(struct irq_data *data); void (*irq_bus_sync_unlock)(struct irq_data *data); + void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); + unsigned long flags; /* Currently used only by UML, might disappear one day.*/ diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82d74cb..00f2c037267a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,6 +31,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248de109d..626d092eed9a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -404,7 +404,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); -- cgit v1.2.3 From 7a32b589a9c856493bccb02db55047edc04eee7b Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 11 Mar 2011 10:13:59 +0900 Subject: Regulator: add suspend-finish API for regulator core. The regulator core had suspend-prepare that turns off the regulators when entering a system-wide suspend. However, it did not have suspend-finish that pairs with suspend-prepare and the regulator core has assumed that the regulator devices and their drivers support autonomous recover at resume. This patch adds regulator_suspend_finish that pairs with the previously-existed regulator_suspend_prepare. The function regulator_suspend_finish turns on the regulators that have always_on set or positive use_count so that we can reset the regulator states appropriately at resume. In regulator_suspend_finish, if has_full_constraints, it disables unnecessary regulators. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Mark Brown -- Updates v3 comments corrected (Thanks to Igor) v2 disable unnecessary regulators (Thanks to Mark) Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 41 +++++++++++++++++++++++++++++++++++++++ include/linux/regulator/machine.h | 1 + 2 files changed, 42 insertions(+) (limited to 'include') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index a2dc6223e8d2..e611f6797e6a 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2655,6 +2655,47 @@ out: } EXPORT_SYMBOL_GPL(regulator_suspend_prepare); +/** + * regulator_suspend_finish - resume regulators from system wide suspend + * + * Turn on regulators that might be turned off by regulator_suspend_prepare + * and that should be turned on according to the regulators properties. + */ +int regulator_suspend_finish(void) +{ + struct regulator_dev *rdev; + int ret = 0, error; + + mutex_lock(®ulator_list_mutex); + list_for_each_entry(rdev, ®ulator_list, list) { + struct regulator_ops *ops = rdev->desc->ops; + + mutex_lock(&rdev->mutex); + if ((rdev->use_count > 0 || rdev->constraints->always_on) && + ops->enable) { + error = ops->enable(rdev); + if (error) + ret = error; + } else { + if (!has_full_constraints) + goto unlock; + if (!ops->disable) + goto unlock; + if (ops->is_enabled && !ops->is_enabled(rdev)) + goto unlock; + + error = ops->disable(rdev); + if (error) + ret = error; + } +unlock: + mutex_unlock(&rdev->mutex); + } + mutex_unlock(®ulator_list_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(regulator_suspend_finish); + /** * regulator_has_full_constraints - the system has fully specified constraints * diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 761c745b9c24..c4c4fc45f856 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -186,6 +186,7 @@ struct regulator_init_data { }; int regulator_suspend_prepare(suspend_state_t state); +int regulator_suspend_finish(void); #ifdef CONFIG_REGULATOR void regulator_has_full_constraints(void); -- cgit v1.2.3 From ea05ef31f2aa98b25d14222300dc9c1d1eb59e41 Mon Sep 17 00:00:00 2001 From: Bengt Jonsson Date: Thu, 10 Mar 2011 14:43:31 +0100 Subject: regulator: add support for USB voltage regulator Signed-off-by: Bengt Jonsson Signed-off-by: Linus Walleij Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/ab8500.c | 17 ++++++++++++++++- include/linux/regulator/ab8500.h | 1 + 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index d9a052c53aec..5a77630095d9 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -9,7 +9,7 @@ * AB8500 peripheral regulators * * AB8500 supports the following regulators: - * VAUX1/2/3, VINTCORE, VTVOUT, VAUDIO, VAMIC1/2, VDMIC, VANA + * VAUX1/2/3, VINTCORE, VTVOUT, VUSB, VAUDIO, VAMIC1/2, VDMIC, VANA */ #include #include @@ -432,6 +432,21 @@ static struct ab8500_regulator_info .update_mask = 0x82, .update_val_enable = 0x02, }, + [AB8500_LDO_USB] = { + .desc = { + .name = "LDO-USB", + .ops = &ab8500_regulator_fixed_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_USB, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 3300000, + .update_bank = 0x03, + .update_reg = 0x82, + .update_mask = 0x03, + .update_val_enable = 0x01, + }, [AB8500_LDO_AUDIO] = { .desc = { .name = "LDO-AUDIO", diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 6a210f1511fc..d4eacdef2017 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -17,6 +17,7 @@ enum ab8500_regulator_id { AB8500_LDO_AUX3, AB8500_LDO_INTCORE, AB8500_LDO_TVOUT, + AB8500_LDO_USB, AB8500_LDO_AUDIO, AB8500_LDO_ANAMIC1, AB8500_LDO_ANAMIC2, -- cgit v1.2.3 From 79568b941277b5986a8a7f0fb8578b2ccfc3e87e Mon Sep 17 00:00:00 2001 From: Bengt Jonsson Date: Fri, 11 Mar 2011 11:54:46 +0100 Subject: regulator: initialization for ab8500 regulators The regulators on the AB8500 have a lot of custom hardware control settings pertaining to 8 external signals, settings which are board-specific and need be provided from the platform at startup. Initialization added for regulators Vana, VextSupply1, VextSupply2, VextSupply3, Vaux1, Vaux2, Vaux3, VTVout, Vintcore12, Vaudio, Vdmic, Vamic1, Vamic2, VrefDDR. Signed-off-by: Bengt Jonsson Reviewed-by: Rickard Andersson Reviewed-by: Jonas Aberg Signed-off-by: Linus Walleij Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- arch/arm/mach-ux500/board-mop500-regulators.h | 2 + arch/arm/mach-ux500/board-mop500.c | 1 + drivers/regulator/ab8500.c | 223 +++++++++++++++++++++++++- include/linux/mfd/ab8500.h | 6 + include/linux/regulator/ab8500.h | 50 +++++- 5 files changed, 279 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.h b/arch/arm/mach-ux500/board-mop500-regulators.h index 2675fae52537..f979b892e4fa 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.h +++ b/arch/arm/mach-ux500/board-mop500-regulators.h @@ -14,6 +14,8 @@ #include #include +extern struct ab8500_regulator_reg_init +ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS]; extern struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS]; #endif diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c index 8790d984cac8..d0076453d7ff 100644 --- a/arch/arm/mach-ux500/board-mop500.c +++ b/arch/arm/mach-ux500/board-mop500.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 5a77630095d9..d157146c8655 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -526,6 +526,186 @@ static struct ab8500_regulator_info }; +struct ab8500_reg_init { + u8 bank; + u8 addr; + u8 mask; +}; + +#define REG_INIT(_id, _bank, _addr, _mask) \ + [_id] = { \ + .bank = _bank, \ + .addr = _addr, \ + .mask = _mask, \ + } + +static struct ab8500_reg_init ab8500_reg_init[] = { + /* + * 0x30, VanaRequestCtrl + * 0x0C, VpllRequestCtrl + * 0xc0, VextSupply1RequestCtrl + */ + REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xfc), + /* + * 0x03, VextSupply2RequestCtrl + * 0x0c, VextSupply3RequestCtrl + * 0x30, Vaux1RequestCtrl + * 0xc0, Vaux2RequestCtrl + */ + REG_INIT(AB8500_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + /* + * 0x03, Vaux3RequestCtrl + * 0x04, SwHPReq + */ + REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + /* + * 0x08, VanaSysClkReq1HPValid + * 0x20, Vaux1SysClkReq1HPValid + * 0x40, Vaux2SysClkReq1HPValid + * 0x80, Vaux3SysClkReq1HPValid + */ + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), + /* + * 0x10, VextSupply1SysClkReq1HPValid + * 0x20, VextSupply2SysClkReq1HPValid + * 0x40, VextSupply3SysClkReq1HPValid + */ + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), + /* + * 0x08, VanaHwHPReq1Valid + * 0x20, Vaux1HwHPReq1Valid + * 0x40, Vaux2HwHPReq1Valid + * 0x80, Vaux3HwHPReq1Valid + */ + REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), + /* + * 0x01, VextSupply1HwHPReq1Valid + * 0x02, VextSupply2HwHPReq1Valid + * 0x04, VextSupply3HwHPReq1Valid + */ + REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), + /* + * 0x08, VanaHwHPReq2Valid + * 0x20, Vaux1HwHPReq2Valid + * 0x40, Vaux2HwHPReq2Valid + * 0x80, Vaux3HwHPReq2Valid + */ + REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), + /* + * 0x01, VextSupply1HwHPReq2Valid + * 0x02, VextSupply2HwHPReq2Valid + * 0x04, VextSupply3HwHPReq2Valid + */ + REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), + /* + * 0x20, VanaSwHPReqValid + * 0x80, Vaux1SwHPReqValid + */ + REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), + /* + * 0x01, Vaux2SwHPReqValid + * 0x02, Vaux3SwHPReqValid + * 0x04, VextSupply1SwHPReqValid + * 0x08, VextSupply2SwHPReqValid + * 0x10, VextSupply3SwHPReqValid + */ + REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), + /* + * 0x02, SysClkReq2Valid1 + * ... + * 0x80, SysClkReq8Valid1 + */ + REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), + /* + * 0x02, SysClkReq2Valid2 + * ... + * 0x80, SysClkReq8Valid2 + */ + REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), + /* + * 0x02, VTVoutEna + * 0x04, Vintcore12Ena + * 0x38, Vintcore12Sel + * 0x40, Vintcore12LP + * 0x80, VTVoutLP + */ + REG_INIT(AB8500_REGUMISC1, 0x03, 0x80, 0xfe), + /* + * 0x02, VaudioEna + * 0x04, VdmicEna + * 0x08, Vamic1Ena + * 0x10, Vamic2Ena + */ + REG_INIT(AB8500_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), + /* + * 0x01, Vamic1_dzout + * 0x02, Vamic2_dzout + */ + REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), + /* + * 0x0c, VanaRegu + * 0x03, VpllRegu + */ + REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), + /* + * 0x01, VrefDDREna + * 0x02, VrefDDRSleepMode + */ + REG_INIT(AB8500_VREFDDR, 0x04, 0x07, 0x03), + /* + * 0x03, VextSupply1Regu + * 0x0c, VextSupply2Regu + * 0x30, VextSupply3Regu + * 0x40, ExtSupply2Bypass + * 0x80, ExtSupply3Bypass + */ + REG_INIT(AB8500_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + /* + * 0x03, Vaux1Regu + * 0x0c, Vaux2Regu + */ + REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), + /* + * 0x03, Vaux3Regu + */ + REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), + /* + * 0x3f, Vsmps1Sel1 + */ + REG_INIT(AB8500_VSMPS1SEL1, 0x04, 0x13, 0x3f), + /* + * 0x0f, Vaux1Sel + */ + REG_INIT(AB8500_VAUX1SEL, 0x04, 0x1f, 0x0f), + /* + * 0x0f, Vaux2Sel + */ + REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), + /* + * 0x07, Vaux3Sel + */ + REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), + /* + * 0x01, VextSupply12LP + */ + REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), + /* + * 0x04, Vaux1Disch + * 0x08, Vaux2Disch + * 0x10, Vaux3Disch + * 0x20, Vintcore12Disch + * 0x40, VTVoutDisch + * 0x80, VaudioDisch + */ + REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), + /* + * 0x02, VanaDisch + * 0x04, VdmicPullDownEna + * 0x10, VdmicDisch + */ + REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), +}; + static __devinit int ab8500_regulator_probe(struct platform_device *pdev) { struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent); @@ -544,10 +724,51 @@ static __devinit int ab8500_regulator_probe(struct platform_device *pdev) /* make sure the platform data has the correct size */ if (pdata->num_regulator != ARRAY_SIZE(ab8500_regulator_info)) { - dev_err(&pdev->dev, "platform configuration error\n"); + dev_err(&pdev->dev, "Configuration error: size mismatch.\n"); return -EINVAL; } + /* initialize registers */ + for (i = 0; i < pdata->num_regulator_reg_init; i++) { + int id; + u8 value; + + id = pdata->regulator_reg_init[i].id; + value = pdata->regulator_reg_init[i].value; + + /* check for configuration errors */ + if (id >= AB8500_NUM_REGULATOR_REGISTERS) { + dev_err(&pdev->dev, + "Configuration error: id outside range.\n"); + return -EINVAL; + } + if (value & ~ab8500_reg_init[id].mask) { + dev_err(&pdev->dev, + "Configuration error: value outside mask.\n"); + return -EINVAL; + } + + /* initialize register */ + err = abx500_mask_and_set_register_interruptible(&pdev->dev, + ab8500_reg_init[id].bank, + ab8500_reg_init[id].addr, + ab8500_reg_init[id].mask, + value); + if (err < 0) { + dev_err(&pdev->dev, + "Failed to initialize 0x%02x, 0x%02x.\n", + ab8500_reg_init[id].bank, + ab8500_reg_init[id].addr); + return err; + } + dev_vdbg(&pdev->dev, + " init: 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + ab8500_reg_init[id].bank, + ab8500_reg_init[id].addr, + ab8500_reg_init[id].mask, + value); + } + /* register all regulators */ for (i = 0; i < ARRAY_SIZE(ab8500_regulator_info); i++) { struct ab8500_regulator_info *info = NULL; diff --git a/include/linux/mfd/ab8500.h b/include/linux/mfd/ab8500.h index 56f8dea72152..6e4f77ef4d20 100644 --- a/include/linux/mfd/ab8500.h +++ b/include/linux/mfd/ab8500.h @@ -139,17 +139,23 @@ struct ab8500 { u8 oldmask[AB8500_NUM_IRQ_REGS]; }; +struct regulator_reg_init; struct regulator_init_data; /** * struct ab8500_platform_data - AB8500 platform data * @irq_base: start of AB8500 IRQs, AB8500_NR_IRQS will be used * @init: board-specific initialization after detection of ab8500 + * @num_regulator_reg_init: number of regulator init registers + * @regulator_reg_init: regulator init registers + * @num_regulator: number of regulators * @regulator: machine-specific constraints for regulators */ struct ab8500_platform_data { int irq_base; void (*init) (struct ab8500 *); + int num_regulator_reg_init; + struct ab8500_regulator_reg_init *regulator_reg_init; int num_regulator; struct regulator_init_data *regulator; }; diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index d4eacdef2017..76579f964a29 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -3,8 +3,8 @@ * * License Terms: GNU General Public License v2 * - * Author: Sundar Iyer for ST-Ericsson - * + * Authors: Sundar Iyer for ST-Ericsson + * Bengt Jonsson for ST-Ericsson */ #ifndef __LINUX_MFD_AB8500_REGULATOR_H @@ -25,4 +25,50 @@ enum ab8500_regulator_id { AB8500_LDO_ANA, AB8500_NUM_REGULATORS, }; + +/* AB8500 register initialization */ +struct ab8500_regulator_reg_init { + int id; + u8 value; +}; + +#define INIT_REGULATOR_REGISTER(_id, _value) \ + { \ + .id = _id, \ + .value = _value, \ + } + +/* AB8500 registers */ +enum ab8500_regulator_reg { + AB8500_REGUREQUESTCTRL2, + AB8500_REGUREQUESTCTRL3, + AB8500_REGUREQUESTCTRL4, + AB8500_REGUSYSCLKREQ1HPVALID1, + AB8500_REGUSYSCLKREQ1HPVALID2, + AB8500_REGUHWHPREQ1VALID1, + AB8500_REGUHWHPREQ1VALID2, + AB8500_REGUHWHPREQ2VALID1, + AB8500_REGUHWHPREQ2VALID2, + AB8500_REGUSWHPREQVALID1, + AB8500_REGUSWHPREQVALID2, + AB8500_REGUSYSCLKREQVALID1, + AB8500_REGUSYSCLKREQVALID2, + AB8500_REGUMISC1, + AB8500_VAUDIOSUPPLY, + AB8500_REGUCTRL1VAMIC, + AB8500_VPLLVANAREGU, + AB8500_VREFDDR, + AB8500_EXTSUPPLYREGU, + AB8500_VAUX12REGU, + AB8500_VRF1VAUX3REGU, + AB8500_VAUX1SEL, + AB8500_VAUX2SEL, + AB8500_VRF1VAUX3SEL, + AB8500_REGUCTRL2SPARE, + AB8500_REGUCTRLDISCH, + AB8500_REGUCTRLDISCH2, + AB8500_VSMPS1SEL1, + AB8500_NUM_REGULATOR_REGISTERS, +}; + #endif -- cgit v1.2.3 From 77af1b2641faf45788a0d480db94082ebee931dc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 17 Mar 2011 13:24:36 +0100 Subject: regulator: add set_voltage_time_sel infrastructure This makes it possible to set the stabilization time for voltage regulators in the same manner as enable_time(). The interface only supports regulators that implements fixed selectors. Cc: Bengt Jonsson Signed-off-by: Linus Walleij Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 25 +++++++++++++++++++++++++ include/linux/regulator/driver.h | 11 +++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index e611f6797e6a..e7e4460dcb92 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1629,6 +1629,7 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev, int min_uV, int max_uV) { int ret; + int delay = 0; unsigned int selector; trace_regulator_set_voltage(rdev_get_name(rdev), min_uV, max_uV); @@ -1662,6 +1663,22 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev, } } + /* + * If we can't obtain the old selector there is not enough + * info to call set_voltage_time_sel(). + */ + if (rdev->desc->ops->set_voltage_time_sel && + rdev->desc->ops->get_voltage_sel) { + unsigned int old_selector = 0; + + ret = rdev->desc->ops->get_voltage_sel(rdev); + if (ret < 0) + return ret; + old_selector = ret; + delay = rdev->desc->ops->set_voltage_time_sel(rdev, + old_selector, selector); + } + if (best_val != INT_MAX) { ret = rdev->desc->ops->set_voltage_sel(rdev, selector); selector = best_val; @@ -1672,6 +1689,14 @@ static int _regulator_do_set_voltage(struct regulator_dev *rdev, ret = -EINVAL; } + /* Insert any necessary delays */ + if (delay >= 1000) { + mdelay(delay / 1000); + udelay(delay % 1000); + } else if (delay) { + udelay(delay); + } + if (ret == 0) _notifier_call_chain(rdev, REGULATOR_EVENT_VOLTAGE_CHANGE, NULL); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index b8ed16a33c47..6c433b89c80d 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -63,7 +63,11 @@ enum regulator_status { * when running with the specified parameters. * * @enable_time: Time taken for the regulator voltage output voltage to - * stabalise after being enabled, in microseconds. + * stabilise after being enabled, in microseconds. + * @set_voltage_time_sel: Time taken for the regulator voltage output voltage + * to stabilise after being set to a new value, in microseconds. + * The function provides the from and to voltage selector, the + * function should return the worst case. * * @set_suspend_voltage: Set the voltage for the regulator when the system * is suspended. @@ -103,8 +107,11 @@ struct regulator_ops { int (*set_mode) (struct regulator_dev *, unsigned int mode); unsigned int (*get_mode) (struct regulator_dev *); - /* Time taken to enable the regulator */ + /* Time taken to enable or set voltage on the regulator */ int (*enable_time) (struct regulator_dev *); + int (*set_voltage_time_sel) (struct regulator_dev *, + unsigned int old_selector, + unsigned int new_selector); /* report regulator status ... most other accessors report * control inputs, this reports results of combining inputs -- cgit v1.2.3 From 88cd222b259d62148ab8c82398498b1a01314476 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 17 Mar 2011 13:24:52 +0100 Subject: regulator: provide consumer interface for fall/rise time This exposes the functionality for rise/fall fime when setting voltage to the consumers. Cc: Bengt Jonsson Signed-off-by: Linus Walleij Acked-by: Mark Brown Signed-off-by: Liam Girdwood --- drivers/regulator/core.c | 45 ++++++++++++++++++++++++++++++++++++++ include/linux/regulator/consumer.h | 2 ++ 2 files changed, 47 insertions(+) (limited to 'include') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index e7e4460dcb92..3ffc6979d164 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1764,6 +1764,51 @@ out: } EXPORT_SYMBOL_GPL(regulator_set_voltage); +/** + * regulator_set_voltage_time - get raise/fall time + * @regulator: regulator source + * @old_uV: starting voltage in microvolts + * @new_uV: target voltage in microvolts + * + * Provided with the starting and ending voltage, this function attempts to + * calculate the time in microseconds required to rise or fall to this new + * voltage. + */ +int regulator_set_voltage_time(struct regulator *regulator, + int old_uV, int new_uV) +{ + struct regulator_dev *rdev = regulator->rdev; + struct regulator_ops *ops = rdev->desc->ops; + int old_sel = -1; + int new_sel = -1; + int voltage; + int i; + + /* Currently requires operations to do this */ + if (!ops->list_voltage || !ops->set_voltage_time_sel + || !rdev->desc->n_voltages) + return -EINVAL; + + for (i = 0; i < rdev->desc->n_voltages; i++) { + /* We only look for exact voltage matches here */ + voltage = regulator_list_voltage(regulator, i); + if (voltage < 0) + return -EINVAL; + if (voltage == 0) + continue; + if (voltage == old_uV) + old_sel = i; + if (voltage == new_uV) + new_sel = i; + } + + if (old_sel < 0 || new_sel < 0) + return -EINVAL; + + return ops->set_voltage_time_sel(rdev, old_sel, new_sel); +} +EXPORT_SYMBOL_GPL(regulator_set_voltage_time); + /** * regulator_sync_voltage - re-apply last regulator output voltage * @regulator: regulator source diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 7954f6bd7edb..9e87c1cb7270 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -153,6 +153,8 @@ int regulator_list_voltage(struct regulator *regulator, unsigned selector); int regulator_is_supported_voltage(struct regulator *regulator, int min_uV, int max_uV); int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV); +int regulator_set_voltage_time(struct regulator *regulator, + int old_uV, int new_uV); int regulator_get_voltage(struct regulator *regulator); int regulator_sync_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, -- cgit v1.2.3 From fa1df691688f34cbcd5bf77bd084bbe47e9d6bfe Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 21 Mar 2011 19:19:35 -0700 Subject: mfd: Add mfd_clone_cell(), convert cs5535-mfd/olpc-xo1 to it Replace mfd_shared_platform_driver_register with mfd_clone_cell. The former was called by an mfd client, and registered both a platform driver and device. The latter is called by an mfd driver, and registers only a platform device. The downside of this is that mfd drivers need to be modified whenever new clients are added that share a cell; the upside is that it fits Linux's driver model better. It's also simpler. This also converts cs5535-mfd/olpc-xo1 from the old API. cs5535-mfd now creates the olpc-xo1-{acpi,pms} devices, while olpc-xo1 binds to them via platform drivers. Signed-off-by: Andres Salomon Signed-off-by: Samuel Ortiz --- arch/x86/platform/olpc/olpc-xo1.c | 11 ++++---- drivers/mfd/cs5535-mfd.c | 18 +++++++++++++ drivers/mfd/mfd-core.c | 53 ++++++++------------------------------- include/linux/mfd/core.h | 27 +++++++++++++------- 4 files changed, 52 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 99513642a0e6..386e3a159cca 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -121,22 +121,21 @@ static int __init olpc_xo1_init(void) { int r; - r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms"); + r = platform_driver_register(&cs5535_pms_drv); if (r) return r; - r = mfd_shared_platform_driver_register(&cs5535_acpi_drv, - "cs5535-acpi"); + r = platform_driver_register(&cs5535_acpi_drv); if (r) - mfd_shared_platform_driver_unregister(&cs5535_pms_drv); + platform_driver_unregister(&cs5535_pms_drv); return r; } static void __exit olpc_xo1_exit(void) { - mfd_shared_platform_driver_unregister(&cs5535_acpi_drv); - mfd_shared_platform_driver_unregister(&cs5535_pms_drv); + platform_driver_unregister(&cs5535_acpi_drv); + platform_driver_unregister(&cs5535_pms_drv); } MODULE_AUTHOR("Daniel Drake "); diff --git a/drivers/mfd/cs5535-mfd.c b/drivers/mfd/cs5535-mfd.c index 886a06871065..24959ddd9324 100644 --- a/drivers/mfd/cs5535-mfd.c +++ b/drivers/mfd/cs5535-mfd.c @@ -27,6 +27,7 @@ #include #include #include +#include #define DRV_NAME "cs5535-mfd" @@ -111,6 +112,22 @@ static __devinitdata struct mfd_cell cs5535_mfd_cells[] = { }, }; +#ifdef CONFIG_OLPC +static void __devinit cs5535_clone_olpc_cells(void) +{ + const char *acpi_clones[] = { "olpc-xo1-acpi" }; + const char *pms_clones[] = { "olpc-xo1-pms" }; + + if (!machine_is_olpc()) + return; + + mfd_clone_cell("cs5535-acpi", acpi_clones, ARRAY_SIZE(acpi_clones)); + mfd_clone_cell("cs5535-pms", pms_clones, ARRAY_SIZE(pms_clones)); +} +#else +static void cs5535_clone_olpc_cells(void) { } +#endif + static int __devinit cs5535_mfd_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -139,6 +156,7 @@ static int __devinit cs5535_mfd_probe(struct pci_dev *pdev, dev_err(&pdev->dev, "MFD add devices failed: %d\n", err); goto err_disable; } + cs5535_clone_olpc_cells(); dev_info(&pdev->dev, "%zu devices registered.\n", ARRAY_SIZE(cs5535_mfd_cells)); diff --git a/drivers/mfd/mfd-core.c b/drivers/mfd/mfd-core.c index 79eda0264fb2..d01574d98870 100644 --- a/drivers/mfd/mfd-core.c +++ b/drivers/mfd/mfd-core.c @@ -184,16 +184,12 @@ void mfd_remove_devices(struct device *parent) } EXPORT_SYMBOL(mfd_remove_devices); -static int add_shared_platform_device(const char *cell, const char *name) +int mfd_clone_cell(const char *cell, const char **clones, size_t n_clones) { struct mfd_cell cell_entry; struct device *dev; struct platform_device *pdev; - int err; - - /* check if we've already registered a device (don't fail if we have) */ - if (bus_find_device_by_name(&platform_bus_type, NULL, name)) - return 0; + int i; /* fetch the parent cell's device (should already be registered!) */ dev = bus_find_device_by_name(&platform_bus_type, NULL, cell); @@ -206,44 +202,17 @@ static int add_shared_platform_device(const char *cell, const char *name) WARN_ON(!cell_entry.enable); - cell_entry.name = name; - err = mfd_add_device(pdev->dev.parent, -1, &cell_entry, NULL, 0); - if (err) - dev_err(dev, "MFD add devices failed: %d\n", err); - return err; -} - -int mfd_shared_platform_driver_register(struct platform_driver *drv, - const char *cellname) -{ - int err; - - err = add_shared_platform_device(cellname, drv->driver.name); - if (err) - printk(KERN_ERR "failed to add platform device %s\n", - drv->driver.name); - - err = platform_driver_register(drv); - if (err) - printk(KERN_ERR "failed to add platform driver %s\n", - drv->driver.name); - - return err; -} -EXPORT_SYMBOL(mfd_shared_platform_driver_register); - -void mfd_shared_platform_driver_unregister(struct platform_driver *drv) -{ - struct device *dev; - - dev = bus_find_device_by_name(&platform_bus_type, NULL, - drv->driver.name); - if (dev) - platform_device_unregister(to_platform_device(dev)); + for (i = 0; i < n_clones; i++) { + cell_entry.name = clones[i]; + /* don't give up if a single call fails; just report error */ + if (mfd_add_device(pdev->dev.parent, -1, &cell_entry, NULL, 0)) + dev_err(dev, "failed to create platform device '%s'\n", + clones[i]); + } - platform_driver_unregister(drv); + return 0; } -EXPORT_SYMBOL(mfd_shared_platform_driver_unregister); +EXPORT_SYMBOL(mfd_clone_cell); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ian Molton, Dmitry Baryshkov"); diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index 1408bf8eed5f..ad1b19aa6508 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -62,6 +62,24 @@ struct mfd_cell { extern int mfd_cell_enable(struct platform_device *pdev); extern int mfd_cell_disable(struct platform_device *pdev); +/* + * "Clone" multiple platform devices for a single cell. This is to be used + * for devices that have multiple users of a cell. For example, if an mfd + * driver wants the cell "foo" to be used by a GPIO driver, an MTD driver, + * and a platform driver, the following bit of code would be use after first + * calling mfd_add_devices(): + * + * const char *fclones[] = { "foo-gpio", "foo-mtd" }; + * err = mfd_clone_cells("foo", fclones, ARRAY_SIZE(fclones)); + * + * Each driver (MTD, GPIO, and platform driver) would then register + * platform_drivers for "foo-mtd", "foo-gpio", and "foo", respectively. + * The cell's .enable/.disable hooks should be used to deal with hardware + * resource contention. + */ +extern int mfd_clone_cell(const char *cell, const char **clones, + size_t n_clones); + /* * Given a platform device that's been created by mfd_add_devices(), fetch * the mfd_cell that created it. @@ -87,13 +105,4 @@ extern int mfd_add_devices(struct device *parent, int id, extern void mfd_remove_devices(struct device *parent); -/* - * For MFD drivers with clients sharing access to resources, these create - * multiple platform devices per cell. Contention handling must still be - * handled via drivers (ie, with enable/disable hooks). - */ -extern int mfd_shared_platform_driver_register(struct platform_driver *drv, - const char *cellname); -extern void mfd_shared_platform_driver_unregister(struct platform_driver *drv); - #endif -- cgit v1.2.3 From 8de6bc7f6ba58dd717e4a65e3bf4a746116fb874 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Thu, 24 Mar 2011 15:54:45 +0900 Subject: mfd: Add MAX8997/8966 IRQ control This patch enables IRQ handling for MAX8997/8966 chips. Please note that Fuel-Gauge-related IRQs are not implemented in this initial release. The fuel gauge module in MAX8997 is identical to MAX17042, which is already in Linux kernel. In order to use the already-existing MAX17042 driver for fuel gauge module in MAX8997, the main interrupt handler of MAX8997 should relay related interrupts to MAX17042 driver. However, in order to do this, we need to modify MAX17042 driver as well because MAX17042 driver does not have any interrupt handlers for now. We are not going to implement this in this initial release as it is not crucial in basic operations of MAX8997. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Samuel Ortiz --- drivers/mfd/Makefile | 2 +- drivers/mfd/max8997-irq.c | 377 ++++++++++++++++++++++++++++++++++++ include/linux/mfd/max8997-private.h | 21 ++ include/linux/mfd/max8997.h | 7 +- 4 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 drivers/mfd/max8997-irq.c (limited to 'include') diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 47f5709f3828..ef489f253402 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -63,7 +63,7 @@ obj-$(CONFIG_UCB1400_CORE) += ucb1400_core.o obj-$(CONFIG_PMIC_DA903X) += da903x.o max8925-objs := max8925-core.o max8925-i2c.o obj-$(CONFIG_MFD_MAX8925) += max8925.o -obj-$(CONFIG_MFD_MAX8997) += max8997.o +obj-$(CONFIG_MFD_MAX8997) += max8997.o max8997-irq.o obj-$(CONFIG_MFD_MAX8998) += max8998.o max8998-irq.o pcf50633-objs := pcf50633-core.o pcf50633-irq.o diff --git a/drivers/mfd/max8997-irq.c b/drivers/mfd/max8997-irq.c new file mode 100644 index 000000000000..e85c874133c4 --- /dev/null +++ b/drivers/mfd/max8997-irq.c @@ -0,0 +1,377 @@ +/* + * max8997-irq.c - Interrupt controller support for MAX8997 + * + * Copyright (C) 2011 Samsung Electronics Co.Ltd + * MyungJoo Ham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This driver is based on max8998-irq.c + */ + +#include +#include +#include +#include +#include + +static const u8 max8997_mask_reg[] = { + [PMIC_INT1] = MAX8997_REG_INT1MSK, + [PMIC_INT2] = MAX8997_REG_INT2MSK, + [PMIC_INT3] = MAX8997_REG_INT3MSK, + [PMIC_INT4] = MAX8997_REG_INT4MSK, + [FUEL_GAUGE] = MAX8997_REG_INVALID, + [MUIC_INT1] = MAX8997_MUIC_REG_INTMASK1, + [MUIC_INT2] = MAX8997_MUIC_REG_INTMASK2, + [MUIC_INT3] = MAX8997_MUIC_REG_INTMASK3, + [GPIO_LOW] = MAX8997_REG_INVALID, + [GPIO_HI] = MAX8997_REG_INVALID, + [FLASH_STATUS] = MAX8997_REG_INVALID, +}; + +static struct i2c_client *get_i2c(struct max8997_dev *max8997, + enum max8997_irq_source src) +{ + switch (src) { + case PMIC_INT1 ... PMIC_INT4: + return max8997->i2c; + case FUEL_GAUGE: + return NULL; + case MUIC_INT1 ... MUIC_INT3: + return max8997->muic; + case GPIO_LOW ... GPIO_HI: + return max8997->i2c; + case FLASH_STATUS: + return max8997->i2c; + default: + return ERR_PTR(-EINVAL); + } + + return ERR_PTR(-EINVAL); +} + +struct max8997_irq_data { + int mask; + enum max8997_irq_source group; +}; + +#define DECLARE_IRQ(idx, _group, _mask) \ + [(idx)] = { .group = (_group), .mask = (_mask) } +static const struct max8997_irq_data max8997_irqs[] = { + DECLARE_IRQ(MAX8997_PMICIRQ_PWRONR, PMIC_INT1, 1 << 0), + DECLARE_IRQ(MAX8997_PMICIRQ_PWRONF, PMIC_INT1, 1 << 1), + DECLARE_IRQ(MAX8997_PMICIRQ_PWRON1SEC, PMIC_INT1, 1 << 3), + DECLARE_IRQ(MAX8997_PMICIRQ_JIGONR, PMIC_INT1, 1 << 4), + DECLARE_IRQ(MAX8997_PMICIRQ_JIGONF, PMIC_INT1, 1 << 5), + DECLARE_IRQ(MAX8997_PMICIRQ_LOWBAT2, PMIC_INT1, 1 << 6), + DECLARE_IRQ(MAX8997_PMICIRQ_LOWBAT1, PMIC_INT1, 1 << 7), + + DECLARE_IRQ(MAX8997_PMICIRQ_JIGR, PMIC_INT2, 1 << 0), + DECLARE_IRQ(MAX8997_PMICIRQ_JIGF, PMIC_INT2, 1 << 1), + DECLARE_IRQ(MAX8997_PMICIRQ_MR, PMIC_INT2, 1 << 2), + DECLARE_IRQ(MAX8997_PMICIRQ_DVS1OK, PMIC_INT2, 1 << 3), + DECLARE_IRQ(MAX8997_PMICIRQ_DVS2OK, PMIC_INT2, 1 << 4), + DECLARE_IRQ(MAX8997_PMICIRQ_DVS3OK, PMIC_INT2, 1 << 5), + DECLARE_IRQ(MAX8997_PMICIRQ_DVS4OK, PMIC_INT2, 1 << 6), + + DECLARE_IRQ(MAX8997_PMICIRQ_CHGINS, PMIC_INT3, 1 << 0), + DECLARE_IRQ(MAX8997_PMICIRQ_CHGRM, PMIC_INT3, 1 << 1), + DECLARE_IRQ(MAX8997_PMICIRQ_DCINOVP, PMIC_INT3, 1 << 2), + DECLARE_IRQ(MAX8997_PMICIRQ_TOPOFFR, PMIC_INT3, 1 << 3), + DECLARE_IRQ(MAX8997_PMICIRQ_CHGRSTF, PMIC_INT3, 1 << 5), + DECLARE_IRQ(MAX8997_PMICIRQ_MBCHGTMEXPD, PMIC_INT3, 1 << 7), + + DECLARE_IRQ(MAX8997_PMICIRQ_RTC60S, PMIC_INT4, 1 << 0), + DECLARE_IRQ(MAX8997_PMICIRQ_RTCA1, PMIC_INT4, 1 << 1), + DECLARE_IRQ(MAX8997_PMICIRQ_RTCA2, PMIC_INT4, 1 << 2), + DECLARE_IRQ(MAX8997_PMICIRQ_SMPL_INT, PMIC_INT4, 1 << 3), + DECLARE_IRQ(MAX8997_PMICIRQ_RTC1S, PMIC_INT4, 1 << 4), + DECLARE_IRQ(MAX8997_PMICIRQ_WTSR, PMIC_INT4, 1 << 5), + + DECLARE_IRQ(MAX8997_MUICIRQ_ADCError, MUIC_INT1, 1 << 2), + DECLARE_IRQ(MAX8997_MUICIRQ_ADCLow, MUIC_INT1, 1 << 1), + DECLARE_IRQ(MAX8997_MUICIRQ_ADC, MUIC_INT1, 1 << 0), + + DECLARE_IRQ(MAX8997_MUICIRQ_VBVolt, MUIC_INT2, 1 << 4), + DECLARE_IRQ(MAX8997_MUICIRQ_DBChg, MUIC_INT2, 1 << 3), + DECLARE_IRQ(MAX8997_MUICIRQ_DCDTmr, MUIC_INT2, 1 << 2), + DECLARE_IRQ(MAX8997_MUICIRQ_ChgDetRun, MUIC_INT2, 1 << 1), + DECLARE_IRQ(MAX8997_MUICIRQ_ChgTyp, MUIC_INT2, 1 << 0), + + DECLARE_IRQ(MAX8997_MUICIRQ_OVP, MUIC_INT3, 1 << 2), +}; + +static void max8997_irq_lock(struct irq_data *data) +{ + struct max8997_dev *max8997 = get_irq_chip_data(data->irq); + + mutex_lock(&max8997->irqlock); +} + +static void max8997_irq_sync_unlock(struct irq_data *data) +{ + struct max8997_dev *max8997 = get_irq_chip_data(data->irq); + int i; + + for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++) { + u8 mask_reg = max8997_mask_reg[i]; + struct i2c_client *i2c = get_i2c(max8997, i); + + if (mask_reg == MAX8997_REG_INVALID || + IS_ERR_OR_NULL(i2c)) + continue; + max8997->irq_masks_cache[i] = max8997->irq_masks_cur[i]; + + max8997_write_reg(i2c, max8997_mask_reg[i], + max8997->irq_masks_cur[i]); + } + + mutex_unlock(&max8997->irqlock); +} + +static const inline struct max8997_irq_data * +irq_to_max8997_irq(struct max8997_dev *max8997, int irq) +{ + return &max8997_irqs[irq - max8997->irq_base]; +} + +static void max8997_irq_mask(struct irq_data *data) +{ + struct max8997_dev *max8997 = get_irq_chip_data(data->irq); + const struct max8997_irq_data *irq_data = irq_to_max8997_irq(max8997, + data->irq); + + max8997->irq_masks_cur[irq_data->group] |= irq_data->mask; +} + +static void max8997_irq_unmask(struct irq_data *data) +{ + struct max8997_dev *max8997 = get_irq_chip_data(data->irq); + const struct max8997_irq_data *irq_data = irq_to_max8997_irq(max8997, + data->irq); + + max8997->irq_masks_cur[irq_data->group] &= ~irq_data->mask; +} + +static struct irq_chip max8997_irq_chip = { + .name = "max8997", + .irq_bus_lock = max8997_irq_lock, + .irq_bus_sync_unlock = max8997_irq_sync_unlock, + .irq_mask = max8997_irq_mask, + .irq_unmask = max8997_irq_unmask, +}; + +#define MAX8997_IRQSRC_PMIC (1 << 1) +#define MAX8997_IRQSRC_FUELGAUGE (1 << 2) +#define MAX8997_IRQSRC_MUIC (1 << 3) +#define MAX8997_IRQSRC_GPIO (1 << 4) +#define MAX8997_IRQSRC_FLASH (1 << 5) +static irqreturn_t max8997_irq_thread(int irq, void *data) +{ + struct max8997_dev *max8997 = data; + u8 irq_reg[MAX8997_IRQ_GROUP_NR] = {}; + u8 irq_src; + int ret; + int i; + + ret = max8997_read_reg(max8997->i2c, MAX8997_REG_INTSRC, &irq_src); + if (ret < 0) { + dev_err(max8997->dev, "Failed to read interrupt source: %d\n", + ret); + return IRQ_NONE; + } + + if (irq_src & MAX8997_IRQSRC_PMIC) { + /* PMIC INT1 ~ INT4 */ + max8997_bulk_read(max8997->i2c, MAX8997_REG_INT1, 4, + &irq_reg[PMIC_INT1]); + } + if (irq_src & MAX8997_IRQSRC_FUELGAUGE) { + /* + * TODO: FUEL GAUGE + * + * This is to be supported by Max17042 driver. When + * an interrupt incurs here, it should be relayed to a + * Max17042 device that is connected (probably by + * platform-data). However, we do not have interrupt + * handling in Max17042 driver currently. The Max17042 IRQ + * driver should be ready to be used as a stand-alone device and + * a Max8997-dependent device. Because it is not ready in + * Max17042-side and it is not too critical in operating + * Max8997, we do not implement this in initial releases. + */ + irq_reg[FUEL_GAUGE] = 0; + } + if (irq_src & MAX8997_IRQSRC_MUIC) { + /* MUIC INT1 ~ INT3 */ + max8997_bulk_read(max8997->muic, MAX8997_MUIC_REG_INT1, 3, + &irq_reg[MUIC_INT1]); + } + if (irq_src & MAX8997_IRQSRC_GPIO) { + /* GPIO Interrupt */ + u8 gpio_info[MAX8997_NUM_GPIO]; + + irq_reg[GPIO_LOW] = 0; + irq_reg[GPIO_HI] = 0; + + max8997_bulk_read(max8997->i2c, MAX8997_REG_GPIOCNTL1, + MAX8997_NUM_GPIO, gpio_info); + for (i = 0; i < MAX8997_NUM_GPIO; i++) { + bool interrupt = false; + + switch (gpio_info[i] & MAX8997_GPIO_INT_MASK) { + case MAX8997_GPIO_INT_BOTH: + if (max8997->gpio_status[i] != gpio_info[i]) + interrupt = true; + break; + case MAX8997_GPIO_INT_RISE: + if ((max8997->gpio_status[i] != gpio_info[i]) && + (gpio_info[i] & MAX8997_GPIO_DATA_MASK)) + interrupt = true; + break; + case MAX8997_GPIO_INT_FALL: + if ((max8997->gpio_status[i] != gpio_info[i]) && + !(gpio_info[i] & MAX8997_GPIO_DATA_MASK)) + interrupt = true; + break; + default: + break; + } + + if (interrupt) { + if (i < 8) + irq_reg[GPIO_LOW] |= (1 << i); + else + irq_reg[GPIO_HI] |= (1 << (i - 8)); + } + + } + } + if (irq_src & MAX8997_IRQSRC_FLASH) { + /* Flash Status Interrupt */ + ret = max8997_read_reg(max8997->i2c, MAX8997_REG_FLASHSTATUS, + &irq_reg[FLASH_STATUS]); + } + + /* Apply masking */ + for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++) + irq_reg[i] &= ~max8997->irq_masks_cur[i]; + + /* Report */ + for (i = 0; i < MAX8997_IRQ_NR; i++) { + if (irq_reg[max8997_irqs[i].group] & max8997_irqs[i].mask) + handle_nested_irq(max8997->irq_base + i); + } + + return IRQ_HANDLED; +} + +int max8997_irq_resume(struct max8997_dev *max8997) +{ + if (max8997->irq && max8997->irq_base) + max8997_irq_thread(max8997->irq_base, max8997); + return 0; +} + +int max8997_irq_init(struct max8997_dev *max8997) +{ + int i; + int cur_irq; + int ret; + u8 val; + + if (!max8997->irq) { + dev_warn(max8997->dev, "No interrupt specified.\n"); + max8997->irq_base = 0; + return 0; + } + + if (!max8997->irq_base) { + dev_err(max8997->dev, "No interrupt base specified.\n"); + return 0; + } + + mutex_init(&max8997->irqlock); + + /* Mask individual interrupt sources */ + for (i = 0; i < MAX8997_IRQ_GROUP_NR; i++) { + struct i2c_client *i2c; + + max8997->irq_masks_cur[i] = 0xff; + max8997->irq_masks_cache[i] = 0xff; + i2c = get_i2c(max8997, i); + + if (IS_ERR_OR_NULL(i2c)) + continue; + if (max8997_mask_reg[i] == MAX8997_REG_INVALID) + continue; + + max8997_write_reg(i2c, max8997_mask_reg[i], 0xff); + } + + for (i = 0; i < MAX8997_NUM_GPIO; i++) { + max8997->gpio_status[i] = (max8997_read_reg(max8997->i2c, + MAX8997_REG_GPIOCNTL1 + i, + &val) + & MAX8997_GPIO_DATA_MASK) ? + true : false; + } + + /* Register with genirq */ + for (i = 0; i < MAX8997_IRQ_NR; i++) { + cur_irq = i + max8997->irq_base; + set_irq_chip_data(cur_irq, max8997); + set_irq_chip_and_handler(cur_irq, &max8997_irq_chip, + handle_edge_irq); + set_irq_nested_thread(cur_irq, 1); +#ifdef CONFIG_ARM + set_irq_flags(cur_irq, IRQF_VALID); +#else + set_irq_noprobe(cur_irq); +#endif + } + + ret = request_threaded_irq(max8997->irq, NULL, max8997_irq_thread, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + "max8997-irq", max8997); + + if (ret) { + dev_err(max8997->dev, "Failed to request IRQ %d: %d\n", + max8997->irq, ret); + return ret; + } + + if (!max8997->ono) + return 0; + + ret = request_threaded_irq(max8997->ono, NULL, max8997_irq_thread, + IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING | + IRQF_ONESHOT, "max8997-ono", max8997); + + if (ret) + dev_err(max8997->dev, "Failed to request ono-IRQ %d: %d\n", + max8997->ono, ret); + + return 0; +} + +void max8997_irq_exit(struct max8997_dev *max8997) +{ + if (max8997->ono) + free_irq(max8997->ono, max8997); + + if (max8997->irq) + free_irq(max8997->irq, max8997); +} diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h index 93a9477e075f..69d1010e2e51 100644 --- a/include/linux/mfd/max8997-private.h +++ b/include/linux/mfd/max8997-private.h @@ -24,6 +24,8 @@ #include +#define MAX8997_REG_INVALID (0xff) + enum max8997_pmic_reg { MAX8997_REG_PMIC_ID0 = 0x00, MAX8997_REG_PMIC_ID1 = 0x01, @@ -313,6 +315,7 @@ enum max8997_irq { #define MAX8997_REG_BUCK2DVS(x) (MAX8997_REG_BUCK2DVS1 + (x) - 1) #define MAX8997_REG_BUCK5DVS(x) (MAX8997_REG_BUCK5DVS1 + (x) - 1) +#define MAX8997_NUM_GPIO 12 struct max8997_dev { struct device *dev; struct i2c_client *i2c; /* 0xcc / PMIC, Battery Control, and FLASH */ @@ -324,11 +327,19 @@ struct max8997_dev { int type; struct platform_device *battery; /* battery control (not fuel gauge) */ + int irq; + int ono; + int irq_base; bool wakeup; + struct mutex irqlock; + int irq_masks_cur[MAX8997_IRQ_GROUP_NR]; + int irq_masks_cache[MAX8997_IRQ_GROUP_NR]; /* For hibernation */ u8 reg_dump[MAX8997_REG_PMIC_END + MAX8997_MUIC_REG_END + MAX8997_HAPTIC_REG_END]; + + bool gpio_status[MAX8997_NUM_GPIO]; }; enum max8997_types { @@ -336,6 +347,10 @@ enum max8997_types { TYPE_MAX8966, }; +extern int max8997_irq_init(struct max8997_dev *max8997); +extern void max8997_irq_exit(struct max8997_dev *max8997); +extern int max8997_irq_resume(struct max8997_dev *max8997); + extern int max8997_read_reg(struct i2c_client *i2c, u8 reg, u8 *dest); extern int max8997_bulk_read(struct i2c_client *i2c, u8 reg, int count, u8 *buf); @@ -344,4 +359,10 @@ extern int max8997_bulk_write(struct i2c_client *i2c, u8 reg, int count, u8 *buf); extern int max8997_update_reg(struct i2c_client *i2c, u8 reg, u8 val, u8 mask); +#define MAX8997_GPIO_INT_BOTH (0x3 << 4) +#define MAX8997_GPIO_INT_RISE (0x2 << 4) +#define MAX8997_GPIO_INT_FALL (0x1 << 4) + +#define MAX8997_GPIO_INT_MASK (0x3 << 4) +#define MAX8997_GPIO_DATA_MASK (0x1 << 2) #endif /* __LINUX_MFD_MAX8997_PRIV_H */ diff --git a/include/linux/mfd/max8997.h b/include/linux/mfd/max8997.h index cb671b3451bf..60931d089422 100644 --- a/include/linux/mfd/max8997.h +++ b/include/linux/mfd/max8997.h @@ -78,8 +78,11 @@ struct max8997_regulator_data { }; struct max8997_platform_data { - bool wakeup; - /* IRQ: Not implemented */ + /* IRQ */ + int irq_base; + int ono; + int wakeup; + /* ---- PMIC ---- */ struct max8997_regulator_data *regulators; int num_regulators; -- cgit v1.2.3 From 6ea0c34dac89611126455537552cffe6c7e832ad Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 4 Apr 2011 01:41:32 +0200 Subject: percpu: Unify input section names The two percpu helper macros have the section names duplicated. So create a new define to merge the two. This also allows arches who need to link things more directly themselves to avoid duplicating the input sections in their linker script. Signed-off-by: Mike Frysinger Signed-off-by: Tejun Heo --- include/asm-generic/vmlinux.lds.h | 44 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 32c45e5fe0ab..bf90fbc6688b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -687,6 +687,28 @@ *(.discard.*) \ } +/** + * PERCPU_INPUT - the percpu input sections + * @cacheline: cacheline size + * + * The core percpu section names and core symbols which do not rely + * directly upon load addresses. + * + * @cacheline is used to align subsections to avoid false cacheline + * sharing between subsections for different purposes. + */ +#define PERCPU_INPUT(cacheline) \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ + *(.data..percpu..first) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..page_aligned) \ + . = ALIGN(cacheline); \ + *(.data..percpu..readmostly) \ + . = ALIGN(cacheline); \ + *(.data..percpu) \ + *(.data..percpu..shared_aligned) \ + VMLINUX_SYMBOL(__per_cpu_end) = .; + /** * PERCPU_VADDR - define output section for percpu area * @cacheline: cacheline size @@ -715,16 +737,7 @@ VMLINUX_SYMBOL(__per_cpu_load) = .; \ .data..percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \ - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__per_cpu_start) = .; \ - *(.data..percpu..first) \ - . = ALIGN(PAGE_SIZE); \ - *(.data..percpu..page_aligned) \ - . = ALIGN(cacheline); \ - *(.data..percpu..readmostly) \ - . = ALIGN(cacheline); \ - *(.data..percpu) \ - *(.data..percpu..shared_aligned) \ - VMLINUX_SYMBOL(__per_cpu_end) = .; \ + PERCPU_INPUT(cacheline) \ } phdr \ . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data..percpu); @@ -745,16 +758,7 @@ . = ALIGN(align); \ .data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ - VMLINUX_SYMBOL(__per_cpu_start) = .; \ - *(.data..percpu..first) \ - . = ALIGN(PAGE_SIZE); \ - *(.data..percpu..page_aligned) \ - . = ALIGN(cacheline); \ - *(.data..percpu..readmostly) \ - . = ALIGN(cacheline); \ - *(.data..percpu) \ - *(.data..percpu..shared_aligned) \ - VMLINUX_SYMBOL(__per_cpu_end) = .; \ + PERCPU_INPUT(cacheline) \ } -- cgit v1.2.3