summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2012-03-09 10:55:17 -0800
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2012-03-09 10:55:17 -0800
commitb675b3667f6729dcd1036a2a129b35445947f905 (patch)
tree0d58791e9063d3ca2c352da6f3e7df2bdb876f9d /block
parent104a5f3cad8f2f27cadbdf0029400ecd9e17ccc0 (diff)
parent192cfd58774b4d17b2fe8bdc77d89c2ef4e0591d (diff)
downloadlinux-b675b3667f6729dcd1036a2a129b35445947f905.tar.bz2
Merge commit 'v3.3-rc6' into next
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig6
-rw-r--r--block/Makefile3
-rw-r--r--block/blk-cgroup.c46
-rw-r--r--block/blk-core.c249
-rw-r--r--block/blk-exec.c8
-rw-r--r--block/blk-ioc.c426
-rw-r--r--block/blk-map.c2
-rw-r--r--block/blk-merge.c37
-rw-r--r--block/blk-settings.c32
-rw-r--r--block/blk-sysfs.c12
-rw-r--r--block/blk-tag.c13
-rw-r--r--block/blk-throttle.c4
-rw-r--r--block/blk.h60
-rw-r--r--block/bsg.c9
-rw-r--r--block/cfq-iosched.c608
-rw-r--r--block/compat_ioctl.c3
-rw-r--r--block/deadline-iosched.c4
-rw-r--r--block/elevator.c272
-rw-r--r--block/genhd.c7
-rw-r--r--block/ioctl.c30
-rw-r--r--block/noop-iosched.c4
-rw-r--r--block/partition-generic.c537
-rw-r--r--block/partitions/Kconfig251
-rw-r--r--block/partitions/Makefile20
-rw-r--r--block/partitions/acorn.c556
-rw-r--r--block/partitions/acorn.h14
-rw-r--r--block/partitions/amiga.c139
-rw-r--r--block/partitions/amiga.h6
-rw-r--r--block/partitions/atari.c149
-rw-r--r--block/partitions/atari.h34
-rw-r--r--block/partitions/check.c166
-rw-r--r--block/partitions/check.h52
-rw-r--r--block/partitions/efi.c675
-rw-r--r--block/partitions/efi.h134
-rw-r--r--block/partitions/ibm.c275
-rw-r--r--block/partitions/ibm.h1
-rw-r--r--block/partitions/karma.c57
-rw-r--r--block/partitions/karma.h8
-rw-r--r--block/partitions/ldm.c1567
-rw-r--r--block/partitions/ldm.h215
-rw-r--r--block/partitions/mac.c134
-rw-r--r--block/partitions/mac.h44
-rw-r--r--block/partitions/msdos.c552
-rw-r--r--block/partitions/msdos.h8
-rw-r--r--block/partitions/osf.c86
-rw-r--r--block/partitions/osf.h7
-rw-r--r--block/partitions/sgi.c82
-rw-r--r--block/partitions/sgi.h8
-rw-r--r--block/partitions/sun.c122
-rw-r--r--block/partitions/sun.h8
-rw-r--r--block/partitions/sysv68.c95
-rw-r--r--block/partitions/sysv68.h1
-rw-r--r--block/partitions/ultrix.c48
-rw-r--r--block/partitions/ultrix.h5
-rw-r--r--block/scsi_ioctl.c52
55 files changed, 7026 insertions, 917 deletions
diff --git a/block/Kconfig b/block/Kconfig
index e97934eececa..09acf1b39905 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING
See Documentation/cgroups/blkio-controller.txt for more information.
+menu "Partition Types"
+
+source "block/partitions/Kconfig"
+
+endmenu
+
endif # BLOCK
config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 514c6e4f427a..39b76ba66ffd 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,8 @@
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
+ partition-generic.o partitions/
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8f630cec906e..75642a352a8f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -30,8 +30,10 @@ EXPORT_SYMBOL_GPL(blkio_root_cgroup);
static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
-static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
+static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
+ struct cgroup_taskset *);
+static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
+ struct cgroup_taskset *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
@@ -44,8 +46,8 @@ static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach_task = blkiocg_can_attach_task,
- .attach_task = blkiocg_attach_task,
+ .can_attach = blkiocg_can_attach,
+ .attach = blkiocg_attach,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
@@ -1626,30 +1628,40 @@ done:
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
struct io_context *ioc;
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- task_lock(tsk);
- ioc = tsk->io_context;
- if (ioc && atomic_read(&ioc->nr_tasks) > 1)
- ret = -EINVAL;
- task_unlock(tsk);
-
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ task_lock(task);
+ ioc = task->io_context;
+ if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+ ret = -EINVAL;
+ task_unlock(task);
+ if (ret)
+ break;
+ }
return ret;
}
-static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
struct io_context *ioc;
- task_lock(tsk);
- ioc = tsk->io_context;
- if (ioc)
- ioc->cgroup_changed = 1;
- task_unlock(tsk);
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /* we don't lose anything even if ioc allocation fails */
+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+ if (ioc) {
+ ioc_cgroup_changed(ioc);
+ put_io_context(ioc);
+ }
+ }
}
void blkio_policy_register(struct blkio_policy_type *blkiop)
diff --git a/block/blk-core.c b/block/blk-core.c
index ea70e6c80cd3..3a78b00edd71 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
+DEFINE_IDA(blk_queue_ida);
+
/*
* For the allocated request tables
*/
@@ -358,7 +360,8 @@ EXPORT_SYMBOL(blk_put_queue);
void blk_drain_queue(struct request_queue *q, bool drain_all)
{
while (true) {
- int nr_rqs;
+ bool drain = false;
+ int i;
spin_lock_irq(q->queue_lock);
@@ -366,16 +369,34 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
if (drain_all)
blk_throtl_drain(q);
- __blk_run_queue(q);
+ /*
+ * This function might be called on a queue which failed
+ * driver init after queue creation. Some drivers
+ * (e.g. fd) get unhappy in such cases. Kick queue iff
+ * dispatch queue has something on it.
+ */
+ if (!list_empty(&q->queue_head))
+ __blk_run_queue(q);
- if (drain_all)
- nr_rqs = q->rq.count[0] + q->rq.count[1];
- else
- nr_rqs = q->rq.elvpriv;
+ drain |= q->rq.elvpriv;
+
+ /*
+ * Unfortunately, requests are queued at and tracked from
+ * multiple places and there's no single counter which can
+ * be drained. Check all the queues and counters.
+ */
+ if (drain_all) {
+ drain |= !list_empty(&q->queue_head);
+ for (i = 0; i < 2; i++) {
+ drain |= q->rq.count[i];
+ drain |= q->in_flight[i];
+ drain |= !list_empty(&q->flush_queue[i]);
+ }
+ }
spin_unlock_irq(q->queue_lock);
- if (!nr_rqs)
+ if (!drain)
break;
msleep(10);
}
@@ -462,27 +483,29 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ if (q->id < 0)
+ goto fail_q;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
q->backing_dev_info.name = "block";
+ q->node = node_id;
err = bdi_init(&q->backing_dev_info);
- if (err) {
- kmem_cache_free(blk_requestq_cachep, q);
- return NULL;
- }
+ if (err)
+ goto fail_id;
- if (blk_throtl_init(q)) {
- kmem_cache_free(blk_requestq_cachep, q);
- return NULL;
- }
+ if (blk_throtl_init(q))
+ goto fail_id;
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->icq_list);
INIT_LIST_HEAD(&q->flush_queue[0]);
INIT_LIST_HEAD(&q->flush_queue[1]);
INIT_LIST_HEAD(&q->flush_data_in_flight);
@@ -500,6 +523,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->queue_lock = &q->__queue_lock;
return q;
+
+fail_id:
+ ida_simple_remove(&blk_queue_ida, q->id);
+fail_q:
+ kmem_cache_free(blk_requestq_cachep, q);
+ return NULL;
}
EXPORT_SYMBOL(blk_alloc_queue_node);
@@ -551,7 +580,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
if (!uninit_q)
return NULL;
- q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+ q = blk_init_allocated_queue(uninit_q, rfn, lock);
if (!q)
blk_cleanup_queue(uninit_q);
@@ -563,18 +592,9 @@ struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
spinlock_t *lock)
{
- return blk_init_allocated_queue_node(q, rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
-struct request_queue *
-blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
- spinlock_t *lock, int node_id)
-{
if (!q)
return NULL;
- q->node = node_id;
if (blk_init_free_list(q))
return NULL;
@@ -604,28 +624,33 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
return NULL;
}
-EXPORT_SYMBOL(blk_init_allocated_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue);
-int blk_get_queue(struct request_queue *q)
+bool blk_get_queue(struct request_queue *q)
{
- if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
- kobject_get(&q->kobj);
- return 0;
+ if (likely(!blk_queue_dead(q))) {
+ __blk_get_queue(q);
+ return true;
}
- return 1;
+ return false;
}
EXPORT_SYMBOL(blk_get_queue);
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
- if (rq->cmd_flags & REQ_ELVPRIV)
+ if (rq->cmd_flags & REQ_ELVPRIV) {
elv_put_request(q, rq);
+ if (rq->elv.icq)
+ put_io_context(rq->elv.icq->ioc);
+ }
+
mempool_free(rq, q->rq.rq_pool);
}
static struct request *
-blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
+blk_alloc_request(struct request_queue *q, struct io_cq *icq,
+ unsigned int flags, gfp_t gfp_mask)
{
struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
@@ -636,10 +661,15 @@ blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
rq->cmd_flags = flags | REQ_ALLOCED;
- if ((flags & REQ_ELVPRIV) &&
- unlikely(elv_set_request(q, rq, gfp_mask))) {
- mempool_free(rq, q->rq.rq_pool);
- return NULL;
+ if (flags & REQ_ELVPRIV) {
+ rq->elv.icq = icq;
+ if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+ mempool_free(rq, q->rq.rq_pool);
+ return NULL;
+ }
+ /* @rq->elv.icq holds on to io_context until @rq is freed */
+ if (icq)
+ get_io_context(icq->ioc);
}
return rq;
@@ -751,11 +781,17 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
{
struct request *rq = NULL;
struct request_list *rl = &q->rq;
- struct io_context *ioc = NULL;
+ struct elevator_type *et;
+ struct io_context *ioc;
+ struct io_cq *icq = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
+ bool retried = false;
int may_queue;
+retry:
+ et = q->elevator->type;
+ ioc = current->io_context;
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ if (unlikely(blk_queue_dead(q)))
return NULL;
may_queue = elv_may_queue(q, rw_flags);
@@ -764,7 +800,20 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
if (rl->count[is_sync]+1 >= q->nr_requests) {
- ioc = current_io_context(GFP_ATOMIC, q->node);
+ /*
+ * We want ioc to record batching state. If it's
+ * not already there, creating a new one requires
+ * dropping queue_lock, which in turn requires
+ * retesting conditions to avoid queue hang.
+ */
+ if (!ioc && !retried) {
+ spin_unlock_irq(q->queue_lock);
+ create_io_context(current, gfp_mask, q->node);
+ spin_lock_irq(q->queue_lock);
+ retried = true;
+ goto retry;
+ }
+
/*
* The queue will fill after this allocation, so set
* it as full, and mark this process as "batching".
@@ -800,17 +849,38 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
+ /*
+ * Decide whether the new request will be managed by elevator. If
+ * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
+ * prevent the current elevator from being destroyed until the new
+ * request is freed. This guarantees icq's won't be destroyed and
+ * makes creating new ones safe.
+ *
+ * Also, lookup icq while holding queue_lock. If it doesn't exist,
+ * it will be created after releasing queue_lock.
+ */
if (blk_rq_should_init_elevator(bio) &&
!test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
rw_flags |= REQ_ELVPRIV;
rl->elvpriv++;
+ if (et->icq_cache && ioc)
+ icq = ioc_lookup_icq(ioc, q);
}
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
spin_unlock_irq(q->queue_lock);
- rq = blk_alloc_request(q, rw_flags, gfp_mask);
+ /* create icq if missing */
+ if ((rw_flags & REQ_ELVPRIV) && unlikely(et->icq_cache && !icq)) {
+ icq = ioc_create_icq(q, gfp_mask);
+ if (!icq)
+ goto fail_icq;
+ }
+
+ rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+
+fail_icq:
if (unlikely(!rq)) {
/*
* Allocation failed presumably due to memory. Undo anything
@@ -872,10 +942,9 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
rq = get_request(q, rw_flags, bio, GFP_NOIO);
while (!rq) {
DEFINE_WAIT(wait);
- struct io_context *ioc;
struct request_list *rl = &q->rq;
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ if (unlikely(blk_queue_dead(q)))
return NULL;
prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -892,8 +961,8 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
* up to a big batch of them for a small period time.
* See ioc_batching, ioc_set_batching
*/
- ioc = current_io_context(GFP_NOIO, q->node);
- ioc_set_batching(q, ioc);
+ create_io_context(current, GFP_NOIO, q->node);
+ ioc_set_batching(q, current->io_context);
spin_lock_irq(q->queue_lock);
finish_wait(&rl->wait[is_sync], &wait);
@@ -1010,54 +1079,6 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
__elv_add_request(q, rq, where);
}
-/**
- * blk_insert_request - insert a special request into a request queue
- * @q: request queue where request should be inserted
- * @rq: request to be inserted
- * @at_head: insert request at head or tail of queue
- * @data: private data
- *
- * Description:
- * Many block devices need to execute commands asynchronously, so they don't
- * block the whole kernel from preemption during request execution. This is
- * accomplished normally by inserting aritficial requests tagged as
- * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
- * be scheduled for actual execution by the request queue.
- *
- * We have the option of inserting the head or the tail of the queue.
- * Typically we use the tail for new ioctls and so forth. We use the head
- * of the queue for things like a QUEUE_FULL message from a device, or a
- * host that is unable to accept a particular command.
- */
-void blk_insert_request(struct request_queue *q, struct request *rq,
- int at_head, void *data)
-{
- int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
- unsigned long flags;
-
- /*
- * tell I/O scheduler that this isn't a regular read/write (ie it
- * must not attempt merges on this) and that it acts as a soft
- * barrier
- */
- rq->cmd_type = REQ_TYPE_SPECIAL;
-
- rq->special = data;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- /*
- * If command is tagged, release the tag
- */
- if (blk_rq_tagged(rq))
- blk_queue_end_tag(q, rq);
-
- add_acct_request(q, rq, where);
- __blk_run_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_insert_request);
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
@@ -1191,7 +1212,6 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
return true;
}
@@ -1222,7 +1242,6 @@ static bool bio_attempt_front_merge(struct request_queue *q,
req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
drive_stat_acct(req, 0);
- elv_bio_merged(q, req, bio);
return true;
}
@@ -1236,13 +1255,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
* on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
- * This function is called without @q->queue_lock; however, elevator is
- * accessed iff there already are requests on the plugged list which in
- * turn guarantees validity of the elevator.
- *
- * Note that, on successful merge, elevator operation
- * elevator_bio_merged_fn() will be called without queue lock. Elevator
- * must be ready for this.
+ * Plugging coalesces IOs from the same issuer for the same purpose without
+ * going through @q->queue_lock. As such it's more of an issuing mechanism
+ * than scheduling, and the request, while may have elvpriv data, is not
+ * added on the elevator at this point. In addition, we don't have
+ * reliable access to the elevator outside queue lock. Only check basic
+ * merging parameters without querying the elevator.
*/
static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count)
@@ -1261,10 +1279,10 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
(*request_count)++;
- if (rq->q != q)
+ if (rq->q != q || !blk_rq_merge_ok(rq, bio))
continue;
- el_ret = elv_try_merge(rq, bio);
+ el_ret = blk_try_merge(rq, bio);
if (el_ret == ELEVATOR_BACK_MERGE) {
ret = bio_attempt_back_merge(q, rq, bio);
if (ret)
@@ -1326,12 +1344,14 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
el_ret = elv_merge(q, &req, bio);
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, req, bio)) {
+ elv_bio_merged(q, req, bio);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
if (bio_attempt_front_merge(q, req, bio)) {
+ elv_bio_merged(q, req, bio);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
@@ -1767,6 +1787,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
return -EIO;
spin_lock_irqsave(q->queue_lock, flags);
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ return -ENODEV;
+ }
/*
* Submitting request must be dequeued before calling this function
@@ -2741,6 +2765,14 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth,
trace_block_unplug(q, depth, !from_schedule);
/*
+ * Don't mess with dead queue.
+ */
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock(q->queue_lock);
+ return;
+ }
+
+ /*
* If we are punting this to kblockd, then we can safely drop
* the queue_lock before waking kblockd (which needs to take
* this lock).
@@ -2816,6 +2848,15 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
depth = 0;
spin_lock(q->queue_lock);
}
+
+ /*
+ * Short-circuit if @q is dead
+ */
+ if (unlikely(blk_queue_dead(q))) {
+ __blk_end_request_all(rq, -ENODEV);
+ continue;
+ }
+
/*
* rq is already accounted, so use raw insert
*/
diff --git a/block/blk-exec.c b/block/blk-exec.c
index a1ebceb332f9..fb2cbd551621 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -50,7 +50,11 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
{
int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+ WARN_ON(irqs_disabled());
+ spin_lock_irq(q->queue_lock);
+
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock_irq(q->queue_lock);
rq->errors = -ENXIO;
if (rq->end_io)
rq->end_io(rq, rq->errors);
@@ -59,8 +63,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
- WARN_ON(irqs_disabled());
- spin_lock_irq(q->queue_lock);
__elv_add_request(q, rq, where);
__blk_run_queue(q);
/* the queue is stopped so it won't be run */
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 6f9bbd978653..8b782a63c297 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -16,53 +16,153 @@
*/
static struct kmem_cache *iocontext_cachep;
-static void cfq_dtor(struct io_context *ioc)
+/**
+ * get_io_context - increment reference count to io_context
+ * @ioc: io_context to get
+ *
+ * Increment reference count to @ioc.
+ */
+void get_io_context(struct io_context *ioc)
{
- if (!hlist_empty(&ioc->cic_list)) {
- struct cfq_io_context *cic;
+ BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+ atomic_long_inc(&ioc->refcount);
+}
+EXPORT_SYMBOL(get_io_context);
- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
- cic->dtor(ioc);
- }
+static void icq_free_icq_rcu(struct rcu_head *head)
+{
+ struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
+
+ kmem_cache_free(icq->__rcu_icq_cache, icq);
}
/*
- * IO Context helper functions. put_io_context() returns 1 if there are no
- * more users of this io context, 0 otherwise.
+ * Exit and free an icq. Called with both ioc and q locked.
*/
-int put_io_context(struct io_context *ioc)
+static void ioc_exit_icq(struct io_cq *icq)
{
- if (ioc == NULL)
- return 1;
+ struct io_context *ioc = icq->ioc;
+ struct request_queue *q = icq->q;
+ struct elevator_type *et = q->elevator->type;
- BUG_ON(atomic_long_read(&ioc->refcount) == 0);
+ lockdep_assert_held(&ioc->lock);
+ lockdep_assert_held(q->queue_lock);
- if (atomic_long_dec_and_test(&ioc->refcount)) {
- rcu_read_lock();
- cfq_dtor(ioc);
- rcu_read_unlock();
+ radix_tree_delete(&ioc->icq_tree, icq->q->id);
+ hlist_del_init(&icq->ioc_node);
+ list_del_init(&icq->q_node);
- kmem_cache_free(iocontext_cachep, ioc);
- return 1;
+ /*
+ * Both setting lookup hint to and clearing it from @icq are done
+ * under queue_lock. If it's not pointing to @icq now, it never
+ * will. Hint assignment itself can race safely.
+ */
+ if (rcu_dereference_raw(ioc->icq_hint) == icq)
+ rcu_assign_pointer(ioc->icq_hint, NULL);
+
+ if (et->ops.elevator_exit_icq_fn)
+ et->ops.elevator_exit_icq_fn(icq);
+
+ /*
+ * @icq->q might have gone away by the time RCU callback runs
+ * making it impossible to determine icq_cache. Record it in @icq.
+ */
+ icq->__rcu_icq_cache = et->icq_cache;
+ call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
+}
+
+/*
+ * Slow path for ioc release in put_io_context(). Performs double-lock
+ * dancing to unlink all icq's and then frees ioc.
+ */
+static void ioc_release_fn(struct work_struct *work)
+{
+ struct io_context *ioc = container_of(work, struct io_context,
+ release_work);
+ struct request_queue *last_q = NULL;
+ unsigned long flags;
+
+ /*
+ * Exiting icq may call into put_io_context() through elevator
+ * which will trigger lockdep warning. The ioc's are guaranteed to
+ * be different, use a different locking subclass here. Use
+ * irqsave variant as there's no spin_lock_irq_nested().
+ */
+ spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+
+ while (!hlist_empty(&ioc->icq_list)) {
+ struct io_cq *icq = hlist_entry(ioc->icq_list.first,
+ struct io_cq, ioc_node);
+ struct request_queue *this_q = icq->q;
+
+ if (this_q != last_q) {
+ /*
+ * Need to switch to @this_q. Once we release
+ * @ioc->lock, it can go away along with @cic.
+ * Hold on to it.
+ */
+ __blk_get_queue(this_q);
+
+ /*
+ * blk_put_queue() might sleep thanks to kobject
+ * idiocy. Always release both locks, put and
+ * restart.
+ */
+ if (last_q) {
+ spin_unlock(last_q->queue_lock);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ blk_put_queue(last_q);
+ } else {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ }
+
+ last_q = this_q;
+ spin_lock_irqsave(this_q->queue_lock, flags);
+ spin_lock_nested(&ioc->lock, 1);
+ continue;
+ }
+ ioc_exit_icq(icq);
}
- return 0;
+
+ if (last_q) {
+ spin_unlock(last_q->queue_lock);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ blk_put_queue(last_q);
+ } else {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ }
+
+ kmem_cache_free(iocontext_cachep, ioc);
}
-EXPORT_SYMBOL(put_io_context);
-static void cfq_exit(struct io_context *ioc)
+/**
+ * put_io_context - put a reference of io_context
+ * @ioc: io_context to put
+ *
+ * Decrement reference count of @ioc and release it if the count reaches
+ * zero.
+ */
+void put_io_context(struct io_context *ioc)
{
- rcu_read_lock();
+ unsigned long flags;
- if (!hlist_empty(&ioc->cic_list)) {
- struct cfq_io_context *cic;
+ if (ioc == NULL)
+ return;
+
+ BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
- cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
- cic_list);
- cic->exit(ioc);
+ /*
+ * Releasing ioc requires reverse order double locking and we may
+ * already be holding a queue_lock. Do it asynchronously from wq.
+ */
+ if (atomic_long_dec_and_test(&ioc->refcount)) {
+ spin_lock_irqsave(&ioc->lock, flags);
+ if (!hlist_empty(&ioc->icq_list))
+ schedule_work(&ioc->release_work);
+ spin_unlock_irqrestore(&ioc->lock, flags);
}
- rcu_read_unlock();
}
+EXPORT_SYMBOL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
@@ -74,86 +174,240 @@ void exit_io_context(struct task_struct *task)
task->io_context = NULL;
task_unlock(task);
- if (atomic_dec_and_test(&ioc->nr_tasks))
- cfq_exit(ioc);
-
+ atomic_dec(&ioc->nr_tasks);
put_io_context(ioc);
}
-struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
+/**
+ * ioc_clear_queue - break any ioc association with the specified queue
+ * @q: request_queue being cleared
+ *
+ * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
+ */
+void ioc_clear_queue(struct request_queue *q)
{
- struct io_context *ioc;
+ lockdep_assert_held(q->queue_lock);
+
+ while (!list_empty(&q->icq_list)) {
+ struct io_cq *icq = list_entry(q->icq_list.next,
+ struct io_cq, q_node);
+ struct io_context *ioc = icq->ioc;
- ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
- if (ioc) {
- atomic_long_set(&ioc->refcount, 1);
- atomic_set(&ioc->nr_tasks, 1);
- spin_lock_init(&ioc->lock);
- ioc->ioprio_changed = 0;
- ioc->ioprio = 0;
- ioc->last_waited = 0; /* doesn't matter... */
- ioc->nr_batch_requests = 0; /* because this is 0 */
- INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
- INIT_HLIST_HEAD(&ioc->cic_list);
- ioc->ioc_data = NULL;
-#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
- ioc->cgroup_changed = 0;
-#endif
+ spin_lock(&ioc->lock);
+ ioc_exit_icq(icq);
+ spin_unlock(&ioc->lock);
}
+}
+
+void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
+ int node)
+{
+ struct io_context *ioc;
- return ioc;
+ ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
+ node);
+ if (unlikely(!ioc))
+ return;
+
+ /* initialize */
+ atomic_long_set(&ioc->refcount, 1);
+ atomic_set(&ioc->nr_tasks, 1);
+ spin_lock_init(&ioc->lock);
+ INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+ INIT_HLIST_HEAD(&ioc->icq_list);
+ INIT_WORK(&ioc->release_work, ioc_release_fn);
+
+ /*
+ * Try to install. ioc shouldn't be installed if someone else
+ * already did or @task, which isn't %current, is exiting. Note
+ * that we need to allow ioc creation on exiting %current as exit
+ * path may issue IOs from e.g. exit_files(). The exit path is
+ * responsible for not issuing IO after exit_io_context().
+ */
+ task_lock(task);
+ if (!task->io_context &&
+ (task == current || !(task->flags & PF_EXITING)))
+ task->io_context = ioc;
+ else
+ kmem_cache_free(iocontext_cachep, ioc);
+ task_unlock(task);
}
-/*
- * If the current task has no IO context then create one and initialise it.
- * Otherwise, return its existing IO context.
+/**
+ * get_task_io_context - get io_context of a task
+ * @task: task of interest
+ * @gfp_flags: allocation flags, used if allocation is necessary
+ * @node: allocation node, used if allocation is necessary
+ *
+ * Return io_context of @task. If it doesn't exist, it is created with
+ * @gfp_flags and @node. The returned io_context has its reference count
+ * incremented.
*
- * This returned IO context doesn't have a specifically elevated refcount,
- * but since the current task itself holds a reference, the context can be
- * used in general code, so long as it stays within `current` context.
+ * This function always goes through task_lock() and it's better to use
+ * %current->io_context + get_io_context() for %current.
*/
-struct io_context *current_io_context(gfp_t gfp_flags, int node)
+struct io_context *get_task_io_context(struct task_struct *task,
+ gfp_t gfp_flags, int node)
{
- struct task_struct *tsk = current;
- struct io_context *ret;
-
- ret = tsk->io_context;
- if (likely(ret))
- return ret;
-
- ret = alloc_io_context(gfp_flags, node);
- if (ret) {
- /* make sure set_task_ioprio() sees the settings above */
- smp_wmb();
- tsk->io_context = ret;
- }
+ struct io_context *ioc;
+
+ might_sleep_if(gfp_flags & __GFP_WAIT);
+
+ do {
+ task_lock(task);
+ ioc = task->io_context;
+ if (likely(ioc)) {
+ get_io_context(ioc);
+ task_unlock(task);
+ return ioc;
+ }
+ task_unlock(task);
+ } while (create_io_context(task, gfp_flags, node));
- return ret;
+ return NULL;
}
+EXPORT_SYMBOL(get_task_io_context);
-/*
- * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
+/**
+ * ioc_lookup_icq - lookup io_cq from ioc
+ * @ioc: the associated io_context
+ * @q: the associated request_queue
*
- * This is always called in the context of the task which submitted the I/O.
+ * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
+ * with @q->queue_lock held.
*/
-struct io_context *get_io_context(gfp_t gfp_flags, int node)
+struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
- struct io_context *ioc = NULL;
+ struct io_cq *icq;
+
+ lockdep_assert_held(q->queue_lock);
/*
- * Check for unlikely race with exiting task. ioc ref count is
- * zero when ioc is being detached.
+ * icq's are indexed from @ioc using radix tree and hint pointer,
+ * both of which are protected with RCU. All removals are done
+ * holding both q and ioc locks, and we're holding q lock - if we
+ * find a icq which points to us, it's guaranteed to be valid.
*/
- do {
- ioc = current_io_context(gfp_flags, node);
- if (unlikely(!ioc))
- break;
- } while (!atomic_long_inc_not_zero(&ioc->refcount));
+ rcu_read_lock();
+ icq = rcu_dereference(ioc->icq_hint);
+ if (icq && icq->q == q)
+ goto out;
- return ioc;
+ icq = radix_tree_lookup(&ioc->icq_tree, q->id);
+ if (icq && icq->q == q)
+ rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
+ else
+ icq = NULL;
+out:
+ rcu_read_unlock();
+ return icq;
}
-EXPORT_SYMBOL(get_io_context);
+EXPORT_SYMBOL(ioc_lookup_icq);
+
+/**
+ * ioc_create_icq - create and link io_cq
+ * @q: request_queue of interest
+ * @gfp_mask: allocation mask
+ *
+ * Make sure io_cq linking %current->io_context and @q exists. If either
+ * io_context and/or icq don't exist, they will be created using @gfp_mask.
+ *
+ * The caller is responsible for ensuring @ioc won't go away and @q is
+ * alive and will stay alive until this function returns.
+ */
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
+{
+ struct elevator_type *et = q->elevator->type;
+ struct io_context *ioc;
+ struct io_cq *icq;
+
+ /* allocate stuff */
+ ioc = create_io_context(current, gfp_mask, q->node);
+ if (!ioc)
+ return NULL;
+
+ icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+ q->node);
+ if (!icq)
+ return NULL;
+
+ if (radix_tree_preload(gfp_mask) < 0) {
+ kmem_cache_free(et->icq_cache, icq);
+ return NULL;
+ }
+
+ icq->ioc = ioc;
+ icq->q = q;
+ INIT_LIST_HEAD(&icq->q_node);
+ INIT_HLIST_NODE(&icq->ioc_node);
+
+ /* lock both q and ioc and try to link @icq */
+ spin_lock_irq(q->queue_lock);
+ spin_lock(&ioc->lock);
+
+ if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
+ hlist_add_head(&icq->ioc_node, &ioc->icq_list);
+ list_add(&icq->q_node, &q->icq_list);
+ if (et->ops.elevator_init_icq_fn)
+ et->ops.elevator_init_icq_fn(icq);
+ } else {
+ kmem_cache_free(et->icq_cache, icq);
+ icq = ioc_lookup_icq(ioc, q);
+ if (!icq)
+ printk(KERN_ERR "cfq: icq link failed!\n");
+ }
+
+ spin_unlock(&ioc->lock);
+ spin_unlock_irq(q->queue_lock);
+ radix_tree_preload_end();
+ return icq;
+}
+
+void ioc_set_changed(struct io_context *ioc, int which)
+{
+ struct io_cq *icq;
+ struct hlist_node *n;
+
+ hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
+ set_bit(which, &icq->changed);
+}
+
+/**
+ * ioc_ioprio_changed - notify ioprio change
+ * @ioc: io_context of interest
+ * @ioprio: new ioprio
+ *
+ * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
+ * icq's. iosched is responsible for checking the bit and applying it on
+ * request issue path.
+ */
+void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioc->lock, flags);
+ ioc->ioprio = ioprio;
+ ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
+/**
+ * ioc_cgroup_changed - notify cgroup change
+ * @ioc: io_context of interest
+ *
+ * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
+ * iosched is responsible for checking the bit and applying it on request
+ * issue path.
+ */
+void ioc_cgroup_changed(struct io_context *ioc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioc->lock, flags);
+ ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+}
+EXPORT_SYMBOL(ioc_cgroup_changed);
static int __init blk_ioc_init(void)
{
diff --git a/block/blk-map.c b/block/blk-map.c
index 164cd0059706..623e1cd4cffe 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -311,7 +311,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (IS_ERR(bio))
return PTR_ERR(bio);
- if (rq_data_dir(rq) == WRITE)
+ if (!reading)
bio->bi_rw |= REQ_WRITE;
if (do_copy)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index cfcc37cb222b..160035f54882 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -471,3 +471,40 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
{
return attempt_merge(q, rq, next);
}
+
+bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
+{
+ if (!rq_mergeable(rq))
+ return false;
+
+ /* don't merge file system requests and discard requests */
+ if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
+ return false;
+
+ /* don't merge discard requests and secure discard requests */
+ if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
+ return false;
+
+ /* different data direction or already started, don't merge */
+ if (bio_data_dir(bio) != rq_data_dir(rq))
+ return false;
+
+ /* must be same device and not a special request */
+ if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
+ return false;
+
+ /* only merge integrity protected bio into ditto rq */
+ if (bio_integrity(bio) != blk_integrity_rq(rq))
+ return false;
+
+ return true;
+}
+
+int blk_try_merge(struct request *rq, struct bio *bio)
+{
+ if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector)
+ return ELEVATOR_BACK_MERGE;
+ else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector)
+ return ELEVATOR_FRONT_MERGE;
+ return ELEVATOR_NO_MERGE;
+}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index fa1eb0449a05..d3234fc494ad 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,9 +104,7 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
* @lim: the queue_limits structure to reset
*
* Description:
- * Returns a queue_limit struct to its default state. Can be used by
- * stacking drivers like DM that stage table swaps and reuse an
- * existing device queue.
+ * Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
@@ -114,13 +112,12 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
- lim->max_sectors = BLK_DEF_MAX_SECTORS;
- lim->max_hw_sectors = INT_MAX;
+ lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
- lim->discard_zeroes_data = 1;
+ lim->discard_zeroes_data = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -131,6 +128,27 @@ void blk_set_default_limits(struct queue_limits *lim)
EXPORT_SYMBOL(blk_set_default_limits);
/**
+ * blk_set_stacking_limits - set default limits for stacking devices
+ * @lim: the queue_limits structure to reset
+ *
+ * Description:
+ * Returns a queue_limit struct to its default state. Should be used
+ * by stacking drivers like DM that have no internal limits.
+ */
+void blk_set_stacking_limits(struct queue_limits *lim)
+{
+ blk_set_default_limits(lim);
+
+ /* Inherit limits from component devices */
+ lim->discard_zeroes_data = 1;
+ lim->max_segments = USHRT_MAX;
+ lim->max_hw_sectors = UINT_MAX;
+
+ lim->max_sectors = BLK_DEF_MAX_SECTORS;
+}
+EXPORT_SYMBOL(blk_set_stacking_limits);
+
+/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* @mfn: the alternate make_request function
@@ -165,8 +183,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
q->nr_batching = BLK_BATCH_REQ;
blk_set_default_limits(&q->limits);
- blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
- q->limits.discard_zeroes_data = 0;
/*
* by default assume old behaviour and bounce for any highmem page
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e7f9f657f105..cf150011d808 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -425,7 +425,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
mutex_lock(&q->sysfs_lock);
- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+ if (blk_queue_dead(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
@@ -447,7 +447,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
+ if (blk_queue_dead(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
@@ -479,8 +479,12 @@ static void blk_release_queue(struct kobject *kobj)
blk_sync_queue(q);
- if (q->elevator)
+ if (q->elevator) {
+ spin_lock_irq(q->queue_lock);
+ ioc_clear_queue(q);
+ spin_unlock_irq(q->queue_lock);
elevator_exit(q->elevator);
+ }
blk_throtl_exit(q);
@@ -494,6 +498,8 @@ static void blk_release_queue(struct kobject *kobj)
blk_trace_shutdown(q);
bdi_destroy(&q->backing_dev_info);
+
+ ida_simple_remove(&blk_queue_ida, q->id);
kmem_cache_free(blk_requestq_cachep, q);
}
diff --git a/block/blk-tag.c b/block/blk-tag.c
index e74d6d13838f..4af6f5cc1167 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -282,18 +282,9 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
void blk_queue_end_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
- int tag = rq->tag;
+ unsigned tag = rq->tag; /* negative tags invalid */
- BUG_ON(tag == -1);
-
- if (unlikely(tag >= bqt->max_depth)) {
- /*
- * This can happen after tag depth has been reduced.
- * But tag shouldn't be larger than real_max_depth.
- */
- WARN_ON(tag >= bqt->real_max_depth);
- return;
- }
+ BUG_ON(tag >= bqt->real_max_depth);
list_del_init(&rq->queuelist);
rq->cmd_flags &= ~REQ_QUEUED;
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 4553245d9317..5eed6a76721d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -310,7 +310,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
struct request_queue *q = td->queue;
/* no throttling for dead queue */
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ if (unlikely(blk_queue_dead(q)))
return NULL;
rcu_read_lock();
@@ -335,7 +335,7 @@ static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
spin_lock_irq(q->queue_lock);
/* Make sure @q is still alive */
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
+ if (unlikely(blk_queue_dead(q))) {
kfree(tg);
return NULL;
}
diff --git a/block/blk.h b/block/blk.h
index 3f6551b3c92d..9c12f80882b0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -1,6 +1,8 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
+#include <linux/idr.h>
+
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME (HZ/50UL)
@@ -9,6 +11,12 @@
extern struct kmem_cache *blk_requestq_cachep;
extern struct kobj_type blk_queue_ktype;
+extern struct ida blk_queue_ida;
+
+static inline void __blk_get_queue(struct request_queue *q)
+{
+ kobject_get(&q->kobj);
+}
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
@@ -85,8 +93,8 @@ static inline struct request *__elv_next_request(struct request_queue *q)
q->flush_queue_delayed = 1;
return NULL;
}
- if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
- !q->elevator->ops->elevator_dispatch_fn(q, 0))
+ if (unlikely(blk_queue_dead(q)) ||
+ !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
return NULL;
}
}
@@ -95,16 +103,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_activate_req_fn)
- e->ops->elevator_activate_req_fn(q, rq);
+ if (e->type->ops.elevator_activate_req_fn)
+ e->type->ops.elevator_activate_req_fn(q, rq);
}
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_deactivate_req_fn)
- e->ops->elevator_deactivate_req_fn(q, rq);
+ if (e->type->ops.elevator_deactivate_req_fn)
+ e->type->ops.elevator_deactivate_req_fn(q, rq);
}
#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -119,8 +127,6 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
}
#endif
-struct io_context *current_io_context(gfp_t gfp_flags, int node);
-
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int ll_front_merge_fn(struct request_queue *q, struct request *req,
@@ -131,6 +137,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
+bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
+int blk_try_merge(struct request *rq, struct bio *bio);
void blk_queue_congestion_threshold(struct request_queue *q);
@@ -189,6 +197,42 @@ static inline int blk_do_io_stat(struct request *rq)
(rq->cmd_flags & REQ_DISCARD));
}
+/*
+ * Internal io_context interface
+ */
+void get_io_context(struct io_context *ioc);
+struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
+struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
+void ioc_clear_queue(struct request_queue *q);
+
+void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
+ int node);
+
+/**
+ * create_io_context - try to create task->io_context
+ * @task: target task
+ * @gfp_mask: allocation mask
+ * @node: allocation node
+ *
+ * If @task->io_context is %NULL, allocate a new io_context and install it.
+ * Returns the current @task->io_context which may be %NULL if allocation
+ * failed.
+ *
+ * Note that this function can't be called with IRQ disabled because
+ * task_lock which protects @task->io_context is IRQ-unsafe.
+ */
+static inline struct io_context *create_io_context(struct task_struct *task,
+ gfp_t gfp_mask, int node)
+{
+ WARN_ON_ONCE(irqs_disabled());
+ if (unlikely(!task->io_context))
+ create_io_context_slowpath(task, gfp_mask, node);
+ return task->io_context;
+}
+
+/*
+ * Internal throttling interface
+ */
#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
diff --git a/block/bsg.c b/block/bsg.c
index 702f1316bb8f..ff64ae3bacee 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -769,12 +769,10 @@ static struct bsg_device *bsg_add_device(struct inode *inode,
struct file *file)
{
struct bsg_device *bd;
- int ret;
#ifdef BSG_DEBUG
unsigned char buf[32];
#endif
- ret = blk_get_queue(rq);
- if (ret)
+ if (!blk_get_queue(rq))
return ERR_PTR(-ENXIO);
bd = bsg_alloc_device();
@@ -985,7 +983,8 @@ void bsg_unregister_queue(struct request_queue *q)
mutex_lock(&bsg_mutex);
idr_remove(&bsg_minor_idr, bcd->minor);
- sysfs_remove_link(&q->kobj, "bsg");
+ if (q->kobj.sd)
+ sysfs_remove_link(&q->kobj, "bsg");
device_unregister(bcd->class_dev);
bcd->class_dev = NULL;
kref_put(&bcd->ref, bsg_kref_release_function);
@@ -1070,7 +1069,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue);
static struct cdev bsg_cdev;
-static char *bsg_devnode(struct device *dev, mode_t *mode)
+static char *bsg_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 16ace89613bc..d0ba50533668 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,7 @@
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
+#include "blk.h"
#include "cfq.h"
/*
@@ -53,20 +54,11 @@ static const int cfq_hist_divisor = 4;
#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
-#define RQ_CIC(rq) \
- ((struct cfq_io_context *) (rq)->elevator_private[0])
-#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
-#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
+#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
static struct kmem_cache *cfq_pool;
-static struct kmem_cache *cfq_ioc_pool;
-
-static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
-static struct completion *ioc_gone;
-static DEFINE_SPINLOCK(ioc_gone_lock);
-
-static DEFINE_SPINLOCK(cic_index_lock);
-static DEFINE_IDA(cic_index_ida);
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
@@ -75,6 +67,14 @@ static DEFINE_IDA(cic_index_ida);
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+struct cfq_ttime {
+ unsigned long last_end_request;
+
+ unsigned long ttime_total;
+ unsigned long ttime_samples;
+ unsigned long ttime_mean;
+};
+
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
@@ -216,6 +216,12 @@ struct cfq_group {
struct cfq_ttime ttime;
};
+struct cfq_io_cq {
+ struct io_cq icq; /* must be the first member */
+ struct cfq_queue *cfqq[2];
+ struct cfq_ttime ttime;
+};
+
/*
* Per block device queue structure
*/
@@ -267,7 +273,7 @@ struct cfq_data {
struct work_struct unplug_work;
struct cfq_queue *active_queue;
- struct cfq_io_context *active_cic;
+ struct cfq_io_cq *active_cic;
/*
* async queue for each priority case
@@ -290,9 +296,6 @@ struct cfq_data {
unsigned int cfq_group_idle;
unsigned int cfq_latency;
- unsigned int cic_index;
- struct list_head cic_list;
-
/*
* Fallback dummy cfqq for extreme OOM conditions
*/
@@ -464,37 +467,35 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
-static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
- struct io_context *);
-static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
- bool is_sync)
+static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
- return cic->cfqq[is_sync];
+ /* cic->icq is the first member, %NULL will convert to %NULL */
+ return container_of(icq, struct cfq_io_cq, icq);
}
-static inline void cic_set_cfqq(struct cfq_io_context *cic,
- struct cfq_queue *cfqq, bool is_sync)
+static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
+ struct io_context *ioc)
{
- cic->cfqq[is_sync] = cfqq;
+ if (ioc)
+ return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
+ return NULL;
}
-#define CIC_DEAD_KEY 1ul
-#define CIC_DEAD_INDEX_SHIFT 1
-
-static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
- return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+ return cic->cfqq[is_sync];
}
-static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
+ bool is_sync)
{
- struct cfq_data *cfqd = cic->key;
-
- if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
- return NULL;
+ cic->cfqq[is_sync] = cfqq;
+}
- return cfqd;
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
+{
+ return cic->icq.q->elevator->elevator_data;
}
/*
@@ -1561,7 +1562,7 @@ static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
struct task_struct *tsk = current;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
cic = cfq_cic_lookup(cfqd, tsk->io_context);
@@ -1655,6 +1656,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ struct cfq_data *cfqd = q->elevator->elevator_data;
+
/*
* reposition in fifo if next is older than rq
*/
@@ -1669,13 +1672,23 @@ cfq_merged_requests(struct request_queue *q, struct request *rq,
cfq_remove_request(next);
cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
rq_data_dir(next), rq_is_sync(next));
+
+ cfqq = RQ_CFQQ(next);
+ /*
+ * all requests of this queue are merged to other queues, delete it
+ * from the service tree. If it's the active_queue,
+ * cfq_dispatch_requests() will choose to expire it or do idle
+ */
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
+ cfqq != cfqd->active_queue)
+ cfq_del_cfqq_rr(cfqd, cfqq);
}
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
/*
@@ -1685,7 +1698,7 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq,
return false;
/*
- * Lookup the cfqq that this bio will be queued with. Allow
+ * Lookup the cfqq that this bio will be queued with and allow
* merge only if rq is queued there.
*/
cic = cfq_cic_lookup(cfqd, current->io_context);
@@ -1774,7 +1787,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
cfqd->active_queue = NULL;
if (cfqd->active_cic) {
- put_io_context(cfqd->active_cic->ioc);
+ put_io_context(cfqd->active_cic->icq.ioc);
cfqd->active_cic = NULL;
}
}
@@ -1994,7 +2007,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
unsigned long sl, group_idle = 0;
/*
@@ -2029,7 +2042,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
* task has exited, don't wait
*/
cic = cfqd->active_cic;
- if (!cic || !atomic_read(&cic->ioc->nr_tasks))
+ if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
return;
/*
@@ -2580,9 +2593,9 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_dispatch_insert(cfqd->queue, rq);
if (!cfqd->active_cic) {
- struct cfq_io_context *cic = RQ_CIC(rq);
+ struct cfq_io_cq *cic = RQ_CIC(rq);
- atomic_long_inc(&cic->ioc->refcount);
+ atomic_long_inc(&cic->icq.ioc->refcount);
cfqd->active_cic = cic;
}
@@ -2665,84 +2678,6 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
cfq_put_cfqg(cfqg);
}
-/*
- * Call func for each cic attached to this ioc.
- */
-static void
-call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
-{
- struct cfq_io_context *cic;
- struct hlist_node *n;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
- func(ioc, cic);
-
- rcu_read_unlock();
-}
-
-static void cfq_cic_free_rcu(struct rcu_head *head)
-{
- struct cfq_io_context *cic;
-
- cic = container_of(head, struct cfq_io_context, rcu_head);
-
- kmem_cache_free(cfq_ioc_pool, cic);
- elv_ioc_count_dec(cfq_ioc_count);
-
- if (ioc_gone) {
- /*
- * CFQ scheduler is exiting, grab exit lock and check
- * the pending io context count. If it hits zero,
- * complete ioc_gone and set it back to NULL
- */
- spin_lock(&ioc_gone_lock);
- if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
- complete(ioc_gone);
- ioc_gone = NULL;
- }
- spin_unlock(&ioc_gone_lock);
- }
-}
-
-static void cfq_cic_free(struct cfq_io_context *cic)
-{
- call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
-}
-
-static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
-{
- unsigned long flags;
- unsigned long dead_key = (unsigned long) cic->key;
-
- BUG_ON(!(dead_key & CIC_DEAD_KEY));
-
- spin_lock_irqsave(&ioc->lock, flags);
- radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
- hlist_del_rcu(&cic->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- cfq_cic_free(cic);
-}
-
-/*
- * Must be called with rcu_read_lock() held or preemption otherwise disabled.
- * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
- * and ->trim() which is called with the task lock held
- */
-static void cfq_free_io_context(struct io_context *ioc)
-{
- /*
- * ioc->refcount is zero here, or we are called from elv_unregister(),
- * so no more cic's are allowed to be linked into this ioc. So it
- * should be ok to iterate over the known list, we will see all cic's
- * since no new ones are added.
- */
- call_for_each_cic(ioc, cic_free_func);
-}
-
static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
struct cfq_queue *__cfqq, *next;
@@ -2776,27 +2711,17 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfq_put_queue(cfqq);
}
-static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
- struct cfq_io_context *cic)
+static void cfq_init_icq(struct io_cq *icq)
{
- struct io_context *ioc = cic->ioc;
-
- list_del_init(&cic->queue_list);
+ struct cfq_io_cq *cic = icq_to_cic(icq);
- /*
- * Make sure dead mark is seen for dead queues
- */
- smp_wmb();
- cic->key = cfqd_dead_key(cfqd);
+ cic->ttime.last_end_request = jiffies;
+}
- rcu_read_lock();
- if (rcu_dereference(ioc->ioc_data) == cic) {
- rcu_read_unlock();
- spin_lock(&ioc->lock);
- rcu_assign_pointer(ioc->ioc_data, NULL);
- spin_unlock(&ioc->lock);
- } else
- rcu_read_unlock();
+static void cfq_exit_icq(struct io_cq *icq)
+{
+ struct cfq_io_cq *cic = icq_to_cic(icq);
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
if (cic->cfqq[BLK_RW_ASYNC]) {
cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
@@ -2809,57 +2734,6 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
}
}
-static void cfq_exit_single_io_context(struct io_context *ioc,
- struct cfq_io_context *cic)
-{
- struct cfq_data *cfqd = cic_to_cfqd(cic);
-
- if (cfqd) {
- struct request_queue *q = cfqd->queue;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- /*
- * Ensure we get a fresh copy of the ->key to prevent
- * race between exiting task and queue
- */
- smp_read_barrier_depends();
- if (cic->key == cfqd)
- __cfq_exit_single_io_context(cfqd, cic);
-
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
-}
-
-/*
- * The process that ioc belongs to has exited, we need to clean up
- * and put the internal structures we have that belongs to that process.
- */
-static void cfq_exit_io_context(struct io_context *ioc)
-{
- call_for_each_cic(ioc, cfq_exit_single_io_context);
-}
-
-static struct cfq_io_context *
-cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
- struct cfq_io_context *cic;
-
- cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- if (cic) {
- cic->ttime.last_end_request = jiffies;
- INIT_LIST_HEAD(&cic->queue_list);
- INIT_HLIST_NODE(&cic->cic_list);
- cic->dtor = cfq_free_io_context;
- cic->exit = cfq_exit_io_context;
- elv_ioc_count_inc(cfq_ioc_count);
- }
-
- return cic;
-}
-
static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
struct task_struct *tsk = current;
@@ -2902,21 +2776,18 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
cfq_clear_cfqq_prio_changed(cfqq);
}
-static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_ioprio(struct cfq_io_cq *cic)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
- unsigned long flags;
if (unlikely(!cfqd))
return;
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
cfqq = cic->cfqq[BLK_RW_ASYNC];
if (cfqq) {
struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
+ new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
GFP_ATOMIC);
if (new_cfqq) {
cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
@@ -2927,14 +2798,6 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
cfqq = cic->cfqq[BLK_RW_SYNC];
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
-
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-}
-
-static void cfq_ioc_set_ioprio(struct io_context *ioc)
-{
- call_for_each_cic(ioc, changed_ioprio);
- ioc->ioprio_changed = 0;
}
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
@@ -2958,11 +2821,10 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_cgroup(struct cfq_io_cq *cic)
{
struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
struct cfq_data *cfqd = cic_to_cfqd(cic);
- unsigned long flags;
struct request_queue *q;
if (unlikely(!cfqd))
@@ -2970,8 +2832,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
q = cfqd->queue;
- spin_lock_irqsave(q->queue_lock, flags);
-
if (sync_cfqq) {
/*
* Drop reference to sync queue. A new sync queue will be
@@ -2981,14 +2841,6 @@ static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
cic_set_cfqq(cic, NULL, 1);
cfq_put_queue(sync_cfqq);
}
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void cfq_ioc_set_cgroup(struct io_context *ioc)
-{
- call_for_each_cic(ioc, changed_cgroup);
- ioc->cgroup_changed = 0;
}
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -2997,7 +2849,7 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
struct io_context *ioc, gfp_t gfp_mask)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_group *cfqg;
retry:
@@ -3088,153 +2940,6 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
return cfqq;
}
-/*
- * We drop cfq io contexts lazily, so we may find a dead one.
- */
-static void
-cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
- struct cfq_io_context *cic)
-{
- unsigned long flags;
-
- WARN_ON(!list_empty(&cic->queue_list));
- BUG_ON(cic->key != cfqd_dead_key(cfqd));
-
- spin_lock_irqsave(&ioc->lock, flags);
-
- BUG_ON(rcu_dereference_check(ioc->ioc_data,
- lockdep_is_held(&ioc->lock)) == cic);
-
- radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
- hlist_del_rcu(&cic->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- cfq_cic_free(cic);
-}
-
-static struct cfq_io_context *
-cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
-{
- struct cfq_io_context *cic;
- unsigned long flags;
-
- if (unlikely(!ioc))
- return NULL;
-
- rcu_read_lock();
-
- /*
- * we maintain a last-hit cache, to avoid browsing over the tree
- */
- cic = rcu_dereference(ioc->ioc_data);
- if (cic && cic->key == cfqd) {
- rcu_read_unlock();
- return cic;
- }
-
- do {
- cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
- rcu_read_unlock();
- if (!cic)
- break;
- if (unlikely(cic->key != cfqd)) {
- cfq_drop_dead_cic(cfqd, ioc, cic);
- rcu_read_lock();
- continue;
- }
-
- spin_lock_irqsave(&ioc->lock, flags);
- rcu_assign_pointer(ioc->ioc_data, cic);
- spin_unlock_irqrestore(&ioc->lock, flags);
- break;
- } while (1);
-
- return cic;
-}
-
-/*
- * Add cic into ioc, using cfqd as the search key. This enables us to lookup
- * the process specific cfq io context when entered from the block layer.
- * Also adds the cic to a per-cfqd list, used when this queue is removed.
- */
-static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
- struct cfq_io_context *cic, gfp_t gfp_mask)
-{
- unsigned long flags;
- int ret;
-
- ret = radix_tree_preload(gfp_mask);
- if (!ret) {
- cic->ioc = ioc;
- cic->key = cfqd;
-
- spin_lock_irqsave(&ioc->lock, flags);
- ret = radix_tree_insert(&ioc->radix_root,
- cfqd->cic_index, cic);
- if (!ret)
- hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- radix_tree_preload_end();
-
- if (!ret) {
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
- list_add(&cic->queue_list, &cfqd->cic_list);
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
- }
- }
-
- if (ret)
- printk(KERN_ERR "cfq: cic link failed!\n");
-
- return ret;
-}
-
-/*
- * Setup general io context and cfq io context. There can be several cfq
- * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq.
- */
-static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
- struct io_context *ioc = NULL;
- struct cfq_io_context *cic;
-
- might_sleep_if(gfp_mask & __GFP_WAIT);
-
- ioc = get_io_context(gfp_mask, cfqd->queue->node);
- if (!ioc)
- return NULL;
-
- cic = cfq_cic_lookup(cfqd, ioc);
- if (cic)
- goto out;
-
- cic = cfq_alloc_io_context(cfqd, gfp_mask);
- if (cic == NULL)
- goto err;
-
- if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
- goto err_free;
-
-out:
- smp_read_barrier_depends();
- if (unlikely(ioc->ioprio_changed))
- cfq_ioc_set_ioprio(ioc);
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
- if (unlikely(ioc->cgroup_changed))
- cfq_ioc_set_cgroup(ioc);
-#endif
- return cic;
-err_free:
- cfq_cic_free(cic);
-err:
- put_io_context(ioc);
- return NULL;
-}
-
static void
__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
{
@@ -3248,7 +2953,7 @@ __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
static void
cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- struct cfq_io_context *cic)
+ struct cfq_io_cq *cic)
{
if (cfq_cfqq_sync(cfqq)) {
__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
@@ -3286,7 +2991,7 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
*/
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- struct cfq_io_context *cic)
+ struct cfq_io_cq *cic)
{
int old_idle, enable_idle;
@@ -3303,8 +3008,9 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
enable_idle = 0;
- else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
- (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
+ else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+ !cfqd->cfq_slice_idle ||
+ (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
else if (sample_valid(cic->ttime.ttime_samples)) {
if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
@@ -3404,7 +3110,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
*/
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- struct cfq_queue *old_cfqq = cfqd->active_queue;
+ enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
cfq_log_cfqq(cfqd, cfqq, "preempt");
cfq_slice_expired(cfqd, 1);
@@ -3413,7 +3119,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
* workload type is changed, don't save slice, otherwise preempt
* doesn't happen
*/
- if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+ if (old_type != cfqq_type(cfqq))
cfqq->cfqg->saved_workload_slice = 0;
/*
@@ -3436,7 +3142,7 @@ static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *rq)
{
- struct cfq_io_context *cic = RQ_CIC(rq);
+ struct cfq_io_cq *cic = RQ_CIC(rq);
cfqd->rq_queued++;
if (rq->cmd_flags & REQ_PRIO)
@@ -3489,7 +3195,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
struct cfq_queue *cfqq = RQ_CFQQ(rq);
cfq_log_cfqq(cfqd, cfqq, "insert_request");
- cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
+ cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
list_add_tail(&rq->queuelist, &cfqq->fifo);
@@ -3539,7 +3245,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd)
static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- struct cfq_io_context *cic = cfqd->active_cic;
+ struct cfq_io_cq *cic = cfqd->active_cic;
/* If the queue already has requests, don't wait */
if (!RB_EMPTY_ROOT(&cfqq->sort_list))
@@ -3676,7 +3382,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct task_struct *tsk = current;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
/*
@@ -3691,7 +3397,7 @@ static int cfq_may_queue(struct request_queue *q, int rw)
cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
if (cfqq) {
- cfq_init_prio_data(cfqq, cic->ioc);
+ cfq_init_prio_data(cfqq, cic->icq.ioc);
return __cfq_may_queue(cfqq);
}
@@ -3712,21 +3418,17 @@ static void cfq_put_request(struct request *rq)
BUG_ON(!cfqq->allocated[rw]);
cfqq->allocated[rw]--;
- put_io_context(RQ_CIC(rq)->ioc);
-
- rq->elevator_private[0] = NULL;
- rq->elevator_private[1] = NULL;
-
/* Put down rq reference on cfqg */
cfq_put_cfqg(RQ_CFQG(rq));
- rq->elevator_private[2] = NULL;
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
cfq_put_queue(cfqq);
}
}
static struct cfq_queue *
-cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
struct cfq_queue *cfqq)
{
cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
@@ -3741,7 +3443,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
* was the last process referring to said cfqq.
*/
static struct cfq_queue *
-split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
+split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
{
if (cfqq_process_refs(cfqq) == 1) {
cfqq->pid = current->pid;
@@ -3764,25 +3466,29 @@ static int
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- unsigned long flags;
might_sleep_if(gfp_mask & __GFP_WAIT);
- cic = cfq_get_io_context(cfqd, gfp_mask);
-
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irq(q->queue_lock);
- if (!cic)
- goto queue_fail;
+ /* handle changed notifications */
+ if (unlikely(cic->icq.changed)) {
+ if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
+ changed_ioprio(cic);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
+ changed_cgroup(cic);
+#endif
+ }
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
+ cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
cic_set_cfqq(cic, cfqq, is_sync);
} else {
/*
@@ -3808,17 +3514,10 @@ new_queue:
cfqq->allocated[rw]++;
cfqq->ref++;
- rq->elevator_private[0] = cic;
- rq->elevator_private[1] = cfqq;
- rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ rq->elv.priv[0] = cfqq;
+ rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
+ spin_unlock_irq(q->queue_lock);
return 0;
-
-queue_fail:
- cfq_schedule_dispatch(cfqd);
- spin_unlock_irqrestore(q->queue_lock, flags);
- cfq_log(cfqd, "set_request fail");
- return 1;
}
static void cfq_kick_queue(struct work_struct *work)
@@ -3922,14 +3621,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
if (cfqd->active_queue)
__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
- while (!list_empty(&cfqd->cic_list)) {
- struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
- struct cfq_io_context,
- queue_list);
-
- __cfq_exit_single_io_context(cfqd, cic);
- }
-
cfq_put_async_queues(cfqd);
cfq_release_cfq_groups(cfqd);
@@ -3944,10 +3635,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
cfq_shutdown_timer_wq(cfqd);
- spin_lock(&cic_index_lock);
- ida_remove(&cic_index_ida, cfqd->cic_index);
- spin_unlock(&cic_index_lock);
-
/*
* Wait for cfqg->blkg->key accessors to exit their grace periods.
* Do this wait only if there are other unlinked groups out
@@ -3969,24 +3656,6 @@ static void cfq_exit_queue(struct elevator_queue *e)
kfree(cfqd);
}
-static int cfq_alloc_cic_index(void)
-{
- int index, error;
-
- do {
- if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(&cic_index_lock);
- error = ida_get_new(&cic_index_ida, &index);
- spin_unlock(&cic_index_lock);
- if (error && error != -EAGAIN)
- return error;
- } while (error);
-
- return index;
-}
-
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
@@ -3994,23 +3663,9 @@ static void *cfq_init_queue(struct request_queue *q)
struct cfq_group *cfqg;
struct cfq_rb_root *st;
- i = cfq_alloc_cic_index();
- if (i < 0)
- return NULL;
-
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd) {
- spin_lock(&cic_index_lock);
- ida_remove(&cic_index_ida, i);
- spin_unlock(&cic_index_lock);
+ if (!cfqd)
return NULL;
- }
-
- /*
- * Don't need take queue_lock in the routine, since we are
- * initializing the ioscheduler, and nobody is using cfqd
- */
- cfqd->cic_index = i;
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
@@ -4067,8 +3722,6 @@ static void *cfq_init_queue(struct request_queue *q)
cfqd->oom_cfqq.ref++;
cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
- INIT_LIST_HEAD(&cfqd->cic_list);
-
cfqd->queue = q;
init_timer(&cfqd->idle_slice_timer);
@@ -4097,34 +3750,6 @@ static void *cfq_init_queue(struct request_queue *q)
return cfqd;
}
-static void cfq_slab_kill(void)
-{
- /*
- * Caller already ensured that pending RCU callbacks are completed,
- * so we should have no busy allocations at this point.
- */
- if (cfq_pool)
- kmem_cache_destroy(cfq_pool);
- if (cfq_ioc_pool)
- kmem_cache_destroy(cfq_ioc_pool);
-}
-
-static int __init cfq_slab_setup(void)
-{
- cfq_pool = KMEM_CACHE(cfq_queue, 0);
- if (!cfq_pool)
- goto fail;
-
- cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
- if (!cfq_ioc_pool)
- goto fail;
-
- return 0;
-fail:
- cfq_slab_kill();
- return -ENOMEM;
-}
-
/*
* sysfs parts below -->
*/
@@ -4230,15 +3855,18 @@ static struct elevator_type iosched_cfq = {
.elevator_completed_req_fn = cfq_completed_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
+ .elevator_init_icq_fn = cfq_init_icq,
+ .elevator_exit_icq_fn = cfq_exit_icq,
.elevator_set_req_fn = cfq_set_request,
.elevator_put_req_fn = cfq_put_request,
.elevator_may_queue_fn = cfq_may_queue,
.elevator_init_fn = cfq_init_queue,
.elevator_exit_fn = cfq_exit_queue,
- .trim = cfq_free_io_context,
},
+ .icq_size = sizeof(struct cfq_io_cq),
+ .icq_align = __alignof__(struct cfq_io_cq),
.elevator_attrs = cfq_attrs,
- .elevator_name = "cfq",
+ .elevator_name = "cfq",
.elevator_owner = THIS_MODULE,
};
@@ -4256,6 +3884,8 @@ static struct blkio_policy_type blkio_policy_cfq;
static int __init cfq_init(void)
{
+ int ret;
+
/*
* could be 0 on HZ < 1000 setups
*/
@@ -4270,10 +3900,16 @@ static int __init cfq_init(void)
#else
cfq_group_idle = 0;
#endif
- if (cfq_slab_setup())
+ cfq_pool = KMEM_CACHE(cfq_queue, 0);
+ if (!cfq_pool)
return -ENOMEM;
- elv_register(&iosched_cfq);
+ ret = elv_register(&iosched_cfq);
+ if (ret) {
+ kmem_cache_destroy(cfq_pool);
+ return ret;
+ }
+
blkio_policy_register(&blkio_policy_cfq);
return 0;
@@ -4281,21 +3917,9 @@ static int __init cfq_init(void)
static void __exit cfq_exit(void)
{
- DECLARE_COMPLETION_ONSTACK(all_gone);
blkio_policy_unregister(&blkio_policy_cfq);
elv_unregister(&iosched_cfq);
- ioc_gone = &all_gone;
- /* ioc_gone's update must be visible before reading ioc_count */
- smp_wmb();
-
- /*
- * this also protects us from entering cfq_slab_kill() with
- * pending RCU callbacks
- */
- if (elv_ioc_count_read(cfq_ioc_count))
- wait_for_completion(&all_gone);
- ida_destroy(&cic_index_ida);
- cfq_slab_kill();
+ kmem_cache_destroy(cfq_pool);
}
module_init(cfq_init);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 7b725020823c..7c668c8a6f95 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -719,6 +719,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKSECTGET:
return compat_put_ushort(arg,
queue_max_sectors(bdev_get_queue(bdev)));
+ case BLKROTATIONAL:
+ return compat_put_ushort(arg,
+ !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET: /* compatible, but no compat_ptr (!) */
case BLKFRASET:
if (!capable(CAP_SYS_ADMIN))
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index c644137d9cd6..7bf12d793fcd 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -448,9 +448,7 @@ static struct elevator_type iosched_deadline = {
static int __init deadline_init(void)
{
- elv_register(&iosched_deadline);
-
- return 0;
+ return elv_register(&iosched_deadline);
}
static void __exit deadline_exit(void)
diff --git a/block/elevator.c b/block/elevator.c
index 66343d6917d0..f016855a46b0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,8 +61,8 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_allow_merge_fn)
- return e->ops->elevator_allow_merge_fn(q, rq, bio);
+ if (e->type->ops.elevator_allow_merge_fn)
+ return e->type->ops.elevator_allow_merge_fn(q, rq, bio);
return 1;
}
@@ -70,39 +70,9 @@ static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
/*
* can we safely merge with this request?
*/
-int elv_rq_merge_ok(struct request *rq, struct bio *bio)
+bool elv_rq_merge_ok(struct request *rq, struct bio *bio)
{
- if (!rq_mergeable(rq))
- return 0;
-
- /*
- * Don't merge file system requests and discard requests
- */
- if ((bio->bi_rw & REQ_DISCARD) != (rq->bio->bi_rw & REQ_DISCARD))
- return 0;
-
- /*
- * Don't merge discard requests and secure discard requests
- */
- if ((bio->bi_rw & REQ_SECURE) != (rq->bio->bi_rw & REQ_SECURE))
- return 0;
-
- /*
- * different data direction or already started, don't merge
- */
- if (bio_data_dir(bio) != rq_data_dir(rq))
- return 0;
-
- /*
- * must be same device and not a special request
- */
- if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
- return 0;
-
- /*
- * only merge integrity protected bio into ditto rq
- */
- if (bio_integrity(bio) != blk_integrity_rq(rq))
+ if (!blk_rq_merge_ok(rq, bio))
return 0;
if (!elv_iosched_allow_merge(rq, bio))
@@ -112,23 +82,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_rq_merge_ok);
-int elv_try_merge(struct request *__rq, struct bio *bio)
-{
- int ret = ELEVATOR_NO_MERGE;
-
- /*
- * we can merge and sequence is ok, check if it's possible
- */
- if (elv_rq_merge_ok(__rq, bio)) {
- if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
- ret = ELEVATOR_BACK_MERGE;
- else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
- ret = ELEVATOR_FRONT_MERGE;
- }
-
- return ret;
-}
-
static struct elevator_type *elevator_find(const char *name)
{
struct elevator_type *e;
@@ -168,17 +121,13 @@ static struct elevator_type *elevator_get(const char *name)
return e;
}
-static void *elevator_init_queue(struct request_queue *q,
- struct elevator_queue *eq)
-{
- return eq->ops->elevator_init_fn(q);
-}
-
-static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
- void *data)
+static int elevator_init_queue(struct request_queue *q,
+ struct elevator_queue *eq)
{
- q->elevator = eq;
- eq->elevator_data = data;
+ eq->elevator_data = eq->type->ops.elevator_init_fn(q);
+ if (eq->elevator_data)
+ return 0;
+ return -ENOMEM;
}
static char chosen_elevator[ELV_NAME_MAX];
@@ -207,8 +156,7 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
if (unlikely(!eq))
goto err;
- eq->ops = &e->ops;
- eq->elevator_type = e;
+ eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
@@ -232,7 +180,7 @@ static void elevator_release(struct kobject *kobj)
struct elevator_queue *e;
e = container_of(kobj, struct elevator_queue, kobj);
- elevator_put(e->elevator_type);
+ elevator_put(e->type);
kfree(e->hash);
kfree(e);
}
@@ -241,7 +189,7 @@ int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
struct elevator_queue *eq;
- void *data;
+ int err;
if (unlikely(q->elevator))
return 0;
@@ -278,13 +226,13 @@ int elevator_init(struct request_queue *q, char *name)
if (!eq)
return -ENOMEM;
- data = elevator_init_queue(q, eq);
- if (!data) {
+ err = elevator_init_queue(q, eq);
+ if (err) {
kobject_put(&eq->kobj);
- return -ENOMEM;
+ return err;
}
- elevator_attach(q, eq, data);
+ q->elevator = eq;
return 0;
}
EXPORT_SYMBOL(elevator_init);
@@ -292,9 +240,8 @@ EXPORT_SYMBOL(elevator_init);
void elevator_exit(struct elevator_queue *e)
{
mutex_lock(&e->sysfs_lock);
- if (e->ops->elevator_exit_fn)
- e->ops->elevator_exit_fn(e);
- e->ops = NULL;
+ if (e->type->ops.elevator_exit_fn)
+ e->type->ops.elevator_exit_fn(e);
mutex_unlock(&e->sysfs_lock);
kobject_put(&e->kobj);
@@ -484,8 +431,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
/*
* First try one-hit cache.
*/
- if (q->last_merge) {
- ret = elv_try_merge(q->last_merge, bio);
+ if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) {
+ ret = blk_try_merge(q->last_merge, bio);
if (ret != ELEVATOR_NO_MERGE) {
*req = q->last_merge;
return ret;
@@ -504,8 +451,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
return ELEVATOR_BACK_MERGE;
}
- if (e->ops->elevator_merge_fn)
- return e->ops->elevator_merge_fn(q, req, bio);
+ if (e->type->ops.elevator_merge_fn)
+ return e->type->ops.elevator_merge_fn(q, req, bio);
return ELEVATOR_NO_MERGE;
}
@@ -548,8 +495,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_merged_fn)
- e->ops->elevator_merged_fn(q, rq, type);
+ if (e->type->ops.elevator_merged_fn)
+ e->type->ops.elevator_merged_fn(q, rq, type);
if (type == ELEVATOR_BACK_MERGE)
elv_rqhash_reposition(q, rq);
@@ -563,8 +510,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
struct elevator_queue *e = q->elevator;
const int next_sorted = next->cmd_flags & REQ_SORTED;
- if (next_sorted && e->ops->elevator_merge_req_fn)
- e->ops->elevator_merge_req_fn(q, rq, next);
+ if (next_sorted && e->type->ops.elevator_merge_req_fn)
+ e->type->ops.elevator_merge_req_fn(q, rq, next);
elv_rqhash_reposition(q, rq);
@@ -581,8 +528,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_bio_merged_fn)
- e->ops->elevator_bio_merged_fn(q, rq, bio);
+ if (e->type->ops.elevator_bio_merged_fn)
+ e->type->ops.elevator_bio_merged_fn(q, rq, bio);
}
void elv_requeue_request(struct request_queue *q, struct request *rq)
@@ -608,12 +555,12 @@ void elv_drain_elevator(struct request_queue *q)
lockdep_assert_held(q->queue_lock);
- while (q->elevator->ops->elevator_dispatch_fn(q, 1))
+ while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
;
if (q->nr_sorted && printed++ < 10) {
printk(KERN_ERR "%s: forced dispatching is broken "
"(nr_sorted=%u), please report this\n",
- q->elevator->elevator_type->elevator_name, q->nr_sorted);
+ q->elevator->type->elevator_name, q->nr_sorted);
}
}
@@ -702,7 +649,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
* rq cannot be accessed after calling
* elevator_add_req_fn.
*/
- q->elevator->ops->elevator_add_req_fn(q, rq);
+ q->elevator->type->ops.elevator_add_req_fn(q, rq);
break;
case ELEVATOR_INSERT_FLUSH:
@@ -731,8 +678,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_latter_req_fn)
- return e->ops->elevator_latter_req_fn(q, rq);
+ if (e->type->ops.elevator_latter_req_fn)
+ return e->type->ops.elevator_latter_req_fn(q, rq);
return NULL;
}
@@ -740,8 +687,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_former_req_fn)
- return e->ops->elevator_former_req_fn(q, rq);
+ if (e->type->ops.elevator_former_req_fn)
+ return e->type->ops.elevator_former_req_fn(q, rq);
return NULL;
}
@@ -749,10 +696,8 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_set_req_fn)
- return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
-
- rq->elevator_private[0] = NULL;
+ if (e->type->ops.elevator_set_req_fn)
+ return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
return 0;
}
@@ -760,16 +705,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_put_req_fn)
- e->ops->elevator_put_req_fn(rq);
+ if (e->type->ops.elevator_put_req_fn)
+ e->type->ops.elevator_put_req_fn(rq);
}
int elv_may_queue(struct request_queue *q, int rw)
{
struct elevator_queue *e = q->elevator;
- if (e->ops->elevator_may_queue_fn)
- return e->ops->elevator_may_queue_fn(q, rw);
+ if (e->type->ops.elevator_may_queue_fn)
+ return e->type->ops.elevator_may_queue_fn(q, rw);
return ELV_MQUEUE_MAY;
}
@@ -804,8 +749,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]--;
if ((rq->cmd_flags & REQ_SORTED) &&
- e->ops->elevator_completed_req_fn)
- e->ops->elevator_completed_req_fn(q, rq);
+ e->type->ops.elevator_completed_req_fn)
+ e->type->ops.elevator_completed_req_fn(q, rq);
}
}
@@ -823,7 +768,7 @@ elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->ops ? entry->show(e, page) : -ENOENT;
+ error = e->type ? entry->show(e, page) : -ENOENT;
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -841,7 +786,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr,
e = container_of(kobj, struct elevator_queue, kobj);
mutex_lock(&e->sysfs_lock);
- error = e->ops ? entry->store(e, page, length) : -ENOENT;
+ error = e->type ? entry->store(e, page, length) : -ENOENT;
mutex_unlock(&e->sysfs_lock);
return error;
}
@@ -856,14 +801,13 @@ static struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q)
+int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
{
- struct elevator_queue *e = q->elevator;
int error;
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
- struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
+ struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
while (attr->attr.name) {
if (sysfs_create_file(&e->kobj, &attr->attr))
@@ -876,31 +820,55 @@ int elv_register_queue(struct request_queue *q)
}
return error;
}
-EXPORT_SYMBOL(elv_register_queue);
-static void __elv_unregister_queue(struct elevator_queue *e)
+int elv_register_queue(struct request_queue *q)
{
- kobject_uevent(&e->kobj, KOBJ_REMOVE);
- kobject_del(&e->kobj);
- e->registered = 0;
+ return __elv_register_queue(q, q->elevator);
}
+EXPORT_SYMBOL(elv_register_queue);
void elv_unregister_queue(struct request_queue *q)
{
- if (q)
- __elv_unregister_queue(q->elevator);
+ if (q) {
+ struct elevator_queue *e = q->elevator;
+
+ kobject_uevent(&e->kobj, KOBJ_REMOVE);
+ kobject_del(&e->kobj);
+ e->registered = 0;
+ }
}
EXPORT_SYMBOL(elv_unregister_queue);
-void elv_register(struct elevator_type *e)
+int elv_register(struct elevator_type *e)
{
char *def = "";
+ /* create icq_cache if requested */
+ if (e->icq_size) {
+ if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
+ WARN_ON(e->icq_align < __alignof__(struct io_cq)))
+ return -EINVAL;
+
+ snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
+ "%s_io_cq", e->elevator_name);
+ e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
+ e->icq_align, 0, NULL);
+ if (!e->icq_cache)
+ return -ENOMEM;
+ }
+
+ /* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- BUG_ON(elevator_find(e->elevator_name));
+ if (elevator_find(e->elevator_name)) {
+ spin_unlock(&elv_list_lock);
+ if (e->icq_cache)
+ kmem_cache_destroy(e->icq_cache);
+ return -EBUSY;
+ }
list_add_tail(&e->list, &elv_list);
spin_unlock(&elv_list_lock);
+ /* print pretty message */
if (!strcmp(e->elevator_name, chosen_elevator) ||
(!*chosen_elevator &&
!strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
@@ -908,30 +876,26 @@ void elv_register(struct elevator_type *e)
printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
def);
+ return 0;
}
EXPORT_SYMBOL_GPL(elv_register);
void elv_unregister(struct elevator_type *e)
{
- struct task_struct *g, *p;
+ /* unregister */
+ spin_lock(&elv_list_lock);
+ list_del_init(&e->list);
+ spin_unlock(&elv_list_lock);
/*
- * Iterate every thread in the process to remove the io contexts.
+ * Destroy icq_cache if it exists. icq's are RCU managed. Make
+ * sure all RCU operations are complete before proceeding.
*/
- if (e->ops.trim) {
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- task_lock(p);
- if (p->io_context)
- e->ops.trim(p->io_context);
- task_unlock(p);
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ if (e->icq_cache) {
+ rcu_barrier();
+ kmem_cache_destroy(e->icq_cache);
+ e->icq_cache = NULL;
}
-
- spin_lock(&elv_list_lock);
- list_del_init(&e->list);
- spin_unlock(&elv_list_lock);
}
EXPORT_SYMBOL_GPL(elv_unregister);
@@ -944,54 +908,41 @@ EXPORT_SYMBOL_GPL(elv_unregister);
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
struct elevator_queue *old_elevator, *e;
- void *data;
int err;
- /*
- * Allocate new elevator
- */
+ /* allocate new elevator */
e = elevator_alloc(q, new_e);
if (!e)
return -ENOMEM;
- data = elevator_init_queue(q, e);
- if (!data) {
+ err = elevator_init_queue(q, e);
+ if (err) {
kobject_put(&e->kobj);
- return -ENOMEM;
+ return err;
}
- /*
- * Turn on BYPASS and drain all requests w/ elevator private data
- */
+ /* turn on BYPASS and drain all requests w/ elevator private data */
elv_quiesce_start(q);
- /*
- * Remember old elevator.
- */
- old_elevator = q->elevator;
-
- /*
- * attach and start new elevator
- */
- spin_lock_irq(q->queue_lock);
- elevator_attach(q, e, data);
- spin_unlock_irq(q->queue_lock);
-
- if (old_elevator->registered) {
- __elv_unregister_queue(old_elevator);
-
- err = elv_register_queue(q);
+ /* unregister old queue, register new one and kill old elevator */
+ if (q->elevator->registered) {
+ elv_unregister_queue(q);
+ err = __elv_register_queue(q, e);
if (err)
goto fail_register;
}
- /*
- * finally exit old elevator and turn off BYPASS.
- */
+ /* done, clear io_cq's, switch elevators and turn off BYPASS */
+ spin_lock_irq(q->queue_lock);
+ ioc_clear_queue(q);
+ old_elevator = q->elevator;
+ q->elevator = e;
+ spin_unlock_irq(q->queue_lock);
+
elevator_exit(old_elevator);
elv_quiesce_end(q);
- blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
+ blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
return 0;
@@ -1001,7 +952,6 @@ fail_register:
* one again (along with re-adding the sysfs dir)
*/
elevator_exit(e);
- q->elevator = old_elevator;
elv_register_queue(q);
elv_quiesce_end(q);
@@ -1026,7 +976,7 @@ int elevator_change(struct request_queue *q, const char *name)
return -EINVAL;
}
- if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
+ if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
elevator_put(e);
return 0;
}
@@ -1061,7 +1011,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
if (!q->elevator || !blk_queue_stackable(q))
return sprintf(name, "none\n");
- elv = e->elevator_type;
+ elv = e->type;
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
diff --git a/block/genhd.c b/block/genhd.c
index 02e9fca80825..23b4f7063322 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/kobj_map.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
@@ -507,7 +506,7 @@ static int exact_lock(dev_t devt, void *data)
return 0;
}
-void register_disk(struct gendisk *disk)
+static void register_disk(struct gendisk *disk)
{
struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
@@ -615,7 +614,7 @@ void add_disk(struct gendisk *disk)
* Take an extra ref on queue which will be put on disk_release()
* so that it sticks around as long as @disk is there.
*/
- WARN_ON_ONCE(blk_get_queue(disk->queue));
+ WARN_ON_ONCE(!blk_get_queue(disk->queue));
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
@@ -1109,7 +1108,7 @@ struct class block_class = {
.name = "block",
};
-static char *block_devnode(struct device *dev, mode_t *mode)
+static char *block_devnode(struct device *dev, umode_t *mode)
{
struct gendisk *disk = dev_to_disk(dev);
diff --git a/block/ioctl.c b/block/ioctl.c
index ca939fc1030f..ba15b2dbfb98 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,7 +5,7 @@
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
@@ -180,6 +180,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
/*
+ * Is it an unrecognized ioctl? The correct returns are either
+ * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
+ * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
+ * code before returning.
+ *
+ * Confused drivers sometimes return EINVAL, which is wrong. It
+ * means "I understood the ioctl command, but the parameters to
+ * it were wrong".
+ *
+ * We should aim to just fix the broken drivers, the EINVAL case
+ * should go away.
+ */
+static inline int is_unrecognized_ioctl(int ret)
+{
+ return ret == -EINVAL ||
+ ret == -ENOTTY ||
+ ret == -ENOIOCTLCMD;
+}
+
+/*
* always keep this in sync with compat_blkdev_ioctl()
*/
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
@@ -196,8 +216,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -EACCES;
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- /* -EINVAL to handle old uncorrected drivers */
- if (ret != -EINVAL && ret != -ENOTTY)
+ if (!is_unrecognized_ioctl(ret))
return ret;
fsync_bdev(bdev);
@@ -206,8 +225,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKROSET:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- /* -EINVAL to handle old uncorrected drivers */
- if (ret != -EINVAL && ret != -ENOTTY)
+ if (!is_unrecognized_ioctl(ret))
return ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -278,6 +296,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+ case BLKROTATIONAL:
+ return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index 06389e9ef96d..413a0b1d788c 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -94,9 +94,7 @@ static struct elevator_type elevator_noop = {
static int __init noop_init(void)
{
- elv_register(&elevator_noop);
-
- return 0;
+ return elv_register(&elevator_noop);
}
static void __exit noop_exit(void)
diff --git a/block/partition-generic.c b/block/partition-generic.c
new file mode 100644
index 000000000000..d06ec1c829c2
--- /dev/null
+++ b/block/partition-generic.c
@@ -0,0 +1,537 @@
+/*
+ * Code extracted from drivers/block/genhd.c
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ *
+ * We now have independent partition support from the
+ * block drivers, which allows all the partition code to
+ * be grouped in one location, and it to be mostly self
+ * contained.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
+#include <linux/blktrace_api.h>
+
+#include "partitions/check.h"
+
+#ifdef CONFIG_BLK_DEV_MD
+extern void md_autodetect_dev(dev_t dev);
+#endif
+
+/*
+ * disk_name() is used by partition check code and the genhd driver.
+ * It formats the devicename of the indicated disk into
+ * the supplied buffer (of size at least 32), and returns
+ * a pointer to that same buffer (for convenience).
+ */
+
+char *disk_name(struct gendisk *hd, int partno, char *buf)
+{
+ if (!partno)
+ snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
+ else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
+ snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
+ else
+ snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
+
+ return buf;
+}
+
+const char *bdevname(struct block_device *bdev, char *buf)
+{
+ return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
+}
+
+EXPORT_SYMBOL(bdevname);
+
+/*
+ * There's very little reason to use this, you should really
+ * have a struct block_device just about everywhere and use
+ * bdevname() instead.
+ */
+const char *__bdevname(dev_t dev, char *buffer)
+{
+ scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
+ MAJOR(dev), MINOR(dev));
+ return buffer;
+}
+
+EXPORT_SYMBOL(__bdevname);
+
+static ssize_t part_partition_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%d\n", p->partno);
+}
+
+static ssize_t part_start_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
+}
+
+ssize_t part_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
+}
+
+static ssize_t part_ro_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%d\n", p->policy ? 1 : 0);
+}
+
+static ssize_t part_alignment_offset_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+}
+
+static ssize_t part_discard_alignment_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ return sprintf(buf, "%u\n", p->discard_alignment);
+}
+
+ssize_t part_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ int cpu;
+
+ cpu = part_stat_lock();
+ part_round_stats(cpu, p);
+ part_stat_unlock();
+ return sprintf(buf,
+ "%8lu %8lu %8llu %8u "
+ "%8lu %8lu %8llu %8u "
+ "%8u %8u %8u"
+ "\n",
+ part_stat_read(p, ios[READ]),
+ part_stat_read(p, merges[READ]),
+ (unsigned long long)part_stat_read(p, sectors[READ]),
+ jiffies_to_msecs(part_stat_read(p, ticks[READ])),
+ part_stat_read(p, ios[WRITE]),
+ part_stat_read(p, merges[WRITE]),
+ (unsigned long long)part_stat_read(p, sectors[WRITE]),
+ jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
+ part_in_flight(p),
+ jiffies_to_msecs(part_stat_read(p, io_ticks)),
+ jiffies_to_msecs(part_stat_read(p, time_in_queue)));
+}
+
+ssize_t part_inflight_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
+ atomic_read(&p->in_flight[1]));
+}
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+ssize_t part_fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%d\n", p->make_it_fail);
+}
+
+ssize_t part_fail_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ int i;
+
+ if (count > 0 && sscanf(buf, "%d", &i) > 0)
+ p->make_it_fail = (i == 0) ? 0 : 1;
+
+ return count;
+}
+#endif
+
+static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
+static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
+static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
+static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
+ NULL);
+static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+static struct device_attribute dev_attr_fail =
+ __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
+#endif
+
+static struct attribute *part_attrs[] = {
+ &dev_attr_partition.attr,
+ &dev_attr_start.attr,
+ &dev_attr_size.attr,
+ &dev_attr_ro.attr,
+ &dev_attr_alignment_offset.attr,
+ &dev_attr_discard_alignment.attr,
+ &dev_attr_stat.attr,
+ &dev_attr_inflight.attr,
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+ &dev_attr_fail.attr,
+#endif
+ NULL
+};
+
+static struct attribute_group part_attr_group = {
+ .attrs = part_attrs,
+};
+
+static const struct attribute_group *part_attr_groups[] = {
+ &part_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ &blk_trace_attr_group,
+#endif
+ NULL
+};
+
+static void part_release(struct device *dev)
+{
+ struct hd_struct *p = dev_to_part(dev);
+ free_part_stats(p);
+ free_part_info(p);
+ kfree(p);
+}
+
+struct device_type part_type = {
+ .name = "partition",
+ .groups = part_attr_groups,
+ .release = part_release,
+};
+
+static void delete_partition_rcu_cb(struct rcu_head *head)
+{
+ struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+
+ part->start_sect = 0;
+ part->nr_sects = 0;
+ part_stat_set_all(part, 0);
+ put_device(part_to_dev(part));
+}
+
+void __delete_partition(struct hd_struct *part)
+{
+ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+}
+
+void delete_partition(struct gendisk *disk, int partno)
+{
+ struct disk_part_tbl *ptbl = disk->part_tbl;
+ struct hd_struct *part;
+
+ if (partno >= ptbl->len)
+ return;
+
+ part = ptbl->part[partno];
+ if (!part)
+ return;
+
+ blk_free_devt(part_devt(part));
+ rcu_assign_pointer(ptbl->part[partno], NULL);
+ rcu_assign_pointer(ptbl->last_lookup, NULL);
+ kobject_put(part->holder_dir);
+ device_del(part_to_dev(part));
+
+ hd_struct_put(part);
+}
+
+static ssize_t whole_disk_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return 0;
+}
+static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
+ whole_disk_show, NULL);
+
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+ sector_t start, sector_t len, int flags,
+ struct partition_meta_info *info)
+{
+ struct hd_struct *p;
+ dev_t devt = MKDEV(0, 0);
+ struct device *ddev = disk_to_dev(disk);
+ struct device *pdev;
+ struct disk_part_tbl *ptbl;
+ const char *dname;
+ int err;
+
+ err = disk_expand_part_tbl(disk, partno);
+ if (err)
+ return ERR_PTR(err);
+ ptbl = disk->part_tbl;
+
+ if (ptbl->part[partno])
+ return ERR_PTR(-EBUSY);
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-EBUSY);
+
+ if (!init_part_stats(p)) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ pdev = part_to_dev(p);
+
+ p->start_sect = start;
+ p->alignment_offset =
+ queue_limit_alignment_offset(&disk->queue->limits, start);
+ p->discard_alignment =
+ queue_limit_discard_alignment(&disk->queue->limits, start);
+ p->nr_sects = len;
+ p->partno = partno;
+ p->policy = get_disk_ro(disk);
+
+ if (info) {
+ struct partition_meta_info *pinfo = alloc_part_info(disk);
+ if (!pinfo)
+ goto out_free_stats;
+ memcpy(pinfo, info, sizeof(*info));
+ p->info = pinfo;
+ }
+
+ dname = dev_name(ddev);
+ if (isdigit(dname[strlen(dname) - 1]))
+ dev_set_name(pdev, "%sp%d", dname, partno);
+ else
+ dev_set_name(pdev, "%s%d", dname, partno);
+
+ device_initialize(pdev);
+ pdev->class = &block_class;
+ pdev->type = &part_type;
+ pdev->parent = ddev;
+
+ err = blk_alloc_devt(p, &devt);
+ if (err)
+ goto out_free_info;
+ pdev->devt = devt;
+
+ /* delay uevent until 'holders' subdir is created */
+ dev_set_uevent_suppress(pdev, 1);
+ err = device_add(pdev);
+ if (err)
+ goto out_put;
+
+ err = -ENOMEM;
+ p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
+ if (!p->holder_dir)
+ goto out_del;
+
+ dev_set_uevent_suppress(pdev, 0);
+ if (flags & ADDPART_FLAG_WHOLEDISK) {
+ err = device_create_file(pdev, &dev_attr_whole_disk);
+ if (err)
+ goto out_del;
+ }
+
+ /* everything is up and running, commence */
+ rcu_assign_pointer(ptbl->part[partno], p);
+
+ /* suppress uevent if the disk suppresses it */
+ if (!dev_get_uevent_suppress(ddev))
+ kobject_uevent(&pdev->kobj, KOBJ_ADD);
+
+ hd_ref_init(p);
+ return p;
+
+out_free_info:
+ free_part_info(p);
+out_free_stats:
+ free_part_stats(p);
+out_free:
+ kfree(p);
+ return ERR_PTR(err);
+out_del:
+ kobject_put(p->holder_dir);
+ device_del(pdev);
+out_put:
+ put_device(pdev);
+ blk_free_devt(devt);
+ return ERR_PTR(err);
+}
+
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+ const struct block_device_operations *bdops = disk->fops;
+
+ if (bdops->unlock_native_capacity &&
+ !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+ printk(KERN_CONT "enabling native capacity\n");
+ bdops->unlock_native_capacity(disk);
+ disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+ return true;
+ } else {
+ printk(KERN_CONT "truncated\n");
+ return false;
+ }
+}
+
+int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
+{
+ struct parsed_partitions *state = NULL;
+ struct disk_part_iter piter;
+ struct hd_struct *part;
+ int p, highest, res;
+rescan:
+ if (state && !IS_ERR(state)) {
+ kfree(state);
+ state = NULL;
+ }
+
+ if (bdev->bd_part_count)
+ return -EBUSY;
+ res = invalidate_partition(disk, 0);
+ if (res)
+ return res;
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ while ((part = disk_part_iter_next(&piter)))
+ delete_partition(disk, part->partno);
+ disk_part_iter_exit(&piter);
+
+ if (disk->fops->revalidate_disk)
+ disk->fops->revalidate_disk(disk);
+ check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
+ if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
+ return 0;
+ if (IS_ERR(state)) {
+ /*
+ * I/O error reading the partition table. If any
+ * partition code tried to read beyond EOD, retry
+ * after unlocking native capacity.
+ */
+ if (PTR_ERR(state) == -ENOSPC) {
+ printk(KERN_WARNING "%s: partition table beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
+ return -EIO;
+ }
+ /*
+ * If any partition code tried to read beyond EOD, try
+ * unlocking native capacity even if partition table is
+ * successfully read as we could be missing some partitions.
+ */
+ if (state->access_beyond_eod) {
+ printk(KERN_WARNING
+ "%s: partition table partially beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
+
+ /* tell userspace that the media / partition table may have changed */
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+
+ /* Detect the highest partition number and preallocate
+ * disk->part_tbl. This is an optimization and not strictly
+ * necessary.
+ */
+ for (p = 1, highest = 0; p < state->limit; p++)
+ if (state->parts[p].size)
+ highest = p;
+
+ disk_expand_part_tbl(disk, highest);
+
+ /* add partitions */
+ for (p = 1; p < state->limit; p++) {
+ sector_t size, from;
+ struct partition_meta_info *info = NULL;
+
+ size = state->parts[p].size;
+ if (!size)
+ continue;
+
+ from = state->parts[p].from;
+ if (from >= get_capacity(disk)) {
+ printk(KERN_WARNING
+ "%s: p%d start %llu is beyond EOD, ",
+ disk->disk_name, p, (unsigned long long) from);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ continue;
+ }
+
+ if (from + size > get_capacity(disk)) {
+ printk(KERN_WARNING
+ "%s: p%d size %llu extends beyond EOD, ",
+ disk->disk_name, p, (unsigned long long) size);
+
+ if (disk_unlock_native_capacity(disk)) {
+ /* free state and restart */
+ goto rescan;
+ } else {
+ /*
+ * we can not ignore partitions of broken tables
+ * created by for example camera firmware, but
+ * we limit them to the end of the disk to avoid
+ * creating invalid block devices
+ */
+ size = get_capacity(disk) - from;
+ }
+ }
+
+ if (state->parts[p].has_info)
+ info = &state->parts[p].info;
+ part = add_partition(disk, p, from, size,
+ state->parts[p].flags,
+ &state->parts[p].info);
+ if (IS_ERR(part)) {
+ printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+ disk->disk_name, p, -PTR_ERR(part));
+ continue;
+ }
+#ifdef CONFIG_BLK_DEV_MD
+ if (state->parts[p].flags & ADDPART_FLAG_RAID)
+ md_autodetect_dev(part_to_dev(part)->devt);
+#endif
+ }
+ kfree(state);
+ return 0;
+}
+
+unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+ struct page *page;
+
+ page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+ NULL);
+ if (!IS_ERR(page)) {
+ if (PageError(page))
+ goto fail;
+ p->v = page;
+ return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
+fail:
+ page_cache_release(page);
+ }
+ p->v = NULL;
+ return NULL;
+}
+
+EXPORT_SYMBOL(read_dev_sector);
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
new file mode 100644
index 000000000000..cb5f0a3f1b03
--- /dev/null
+++ b/block/partitions/Kconfig
@@ -0,0 +1,251 @@
+#
+# Partition configuration
+#
+config PARTITION_ADVANCED
+ bool "Advanced partition selection"
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned under an operating system running on a different
+ architecture than your Linux system.
+
+ Note that the answer to this question won't directly affect the
+ kernel: saying N will just cause the configurator to skip all
+ the questions about foreign partitioning schemes.
+
+ If unsure, say N.
+
+config ACORN_PARTITION
+ bool "Acorn partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ help
+ Support hard disks partitioned under Acorn operating systems.
+
+config ACORN_PARTITION_CUMANA
+ bool "Cumana partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned using the Cumana interface on Acorn machines.
+
+config ACORN_PARTITION_EESOX
+ bool "EESOX partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+
+config ACORN_PARTITION_ICS
+ bool "ICS partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned using the ICS interface on Acorn machines.
+
+config ACORN_PARTITION_ADFS
+ bool "Native filecore partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+ help
+ The Acorn Disc Filing System is the standard file system of the
+ RiscOS operating system which runs on Acorn's ARM-based Risc PC
+ systems and the Acorn Archimedes range of machines. If you say
+ `Y' here, Linux will support disk partitions created under ADFS.
+
+config ACORN_PARTITION_POWERTEC
+ bool "PowerTec partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+ help
+ Support reading partition tables created on Acorn machines using
+ the PowerTec SCSI drive.
+
+config ACORN_PARTITION_RISCIX
+ bool "RISCiX partition support" if PARTITION_ADVANCED
+ default y if ARCH_ACORN
+ depends on ACORN_PARTITION
+ help
+ Once upon a time, there was a native Unix port for the Acorn series
+ of machines called RISCiX. If you say 'Y' here, Linux will be able
+ to read disks partitioned under RISCiX.
+
+config OSF_PARTITION
+ bool "Alpha OSF partition support" if PARTITION_ADVANCED
+ default y if ALPHA
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned on an Alpha machine.
+
+config AMIGA_PARTITION
+ bool "Amiga partition table support" if PARTITION_ADVANCED
+ default y if (AMIGA || AFFS_FS=y)
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned under AmigaOS.
+
+config ATARI_PARTITION
+ bool "Atari partition table support" if PARTITION_ADVANCED
+ default y if ATARI
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned under the Atari OS.
+
+config IBM_PARTITION
+ bool "IBM disk label and partition support"
+ depends on PARTITION_ADVANCED && S390
+ help
+ Say Y here if you would like to be able to read the hard disk
+ partition table format used by IBM DASD disks operating under CMS.
+ Otherwise, say N.
+
+config MAC_PARTITION
+ bool "Macintosh partition map support" if PARTITION_ADVANCED
+ default y if (MAC || PPC_PMAC)
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned on a Macintosh.
+
+config MSDOS_PARTITION
+ bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
+ default y
+ help
+ Say Y here.
+
+config BSD_DISKLABEL
+ bool "BSD disklabel (FreeBSD partition tables) support"
+ depends on PARTITION_ADVANCED && MSDOS_PARTITION
+ help
+ FreeBSD uses its own hard disk partition scheme on your PC. It
+ requires only one entry in the primary partition table of your disk
+ and manages it similarly to DOS extended partitions, putting in its
+ first sector a new partition table in BSD disklabel format. Saying Y
+ here allows you to read these disklabels and further mount FreeBSD
+ partitions from within Linux if you have also said Y to "UFS
+ file system support", above. If you don't know what all this is
+ about, say N.
+
+config MINIX_SUBPARTITION
+ bool "Minix subpartition support"
+ depends on PARTITION_ADVANCED && MSDOS_PARTITION
+ help
+ Minix 2.0.0/2.0.2 subpartition table support for Linux.
+ Say Y here if you want to mount and use Minix 2.0.0/2.0.2
+ subpartitions.
+
+config SOLARIS_X86_PARTITION
+ bool "Solaris (x86) partition table support"
+ depends on PARTITION_ADVANCED && MSDOS_PARTITION
+ help
+ Like most systems, Solaris x86 uses its own hard disk partition
+ table format, incompatible with all others. Saying Y here allows you
+ to read these partition tables and further mount Solaris x86
+ partitions from within Linux if you have also said Y to "UFS
+ file system support", above.
+
+config UNIXWARE_DISKLABEL
+ bool "Unixware slices support"
+ depends on PARTITION_ADVANCED && MSDOS_PARTITION
+ ---help---
+ Like some systems, UnixWare uses its own slice table inside a
+ partition (VTOC - Virtual Table of Contents). Its format is
+ incompatible with all other OSes. Saying Y here allows you to read
+ VTOC and further mount UnixWare partitions read-only from within
+ Linux if you have also said Y to "UFS file system support" or
+ "System V and Coherent file system support", above.
+
+ This is mainly used to carry data from a UnixWare box to your
+ Linux box via a removable medium like magneto-optical, ZIP or
+ removable IDE drives. Note, however, that a good portable way to
+ transport files and directories between unixes (and even other
+ operating systems) is given by the tar program ("man tar" or
+ preferably "info tar").
+
+ If you don't know what all this is about, say N.
+
+config LDM_PARTITION
+ bool "Windows Logical Disk Manager (Dynamic Disk) support"
+ depends on PARTITION_ADVANCED
+ ---help---
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned using Windows 2000's/XP's or Vista's Logical Disk
+ Manager. They are also known as "Dynamic Disks".
+
+ Note this driver only supports Dynamic Disks with a protective MBR
+ label, i.e. DOS partition table. It does not support GPT labelled
+ Dynamic Disks yet as can be created with Vista.
+
+ Windows 2000 introduced the concept of Dynamic Disks to get around
+ the limitations of the PC's partitioning scheme. The Logical Disk
+ Manager allows the user to repartition a disk and create spanned,
+ mirrored, striped or RAID volumes, all without the need for
+ rebooting.
+
+ Normal partitions are now called Basic Disks under Windows 2000, XP,
+ and Vista.
+
+ For a fuller description read <file:Documentation/ldm.txt>.
+
+ If unsure, say N.
+
+config LDM_DEBUG
+ bool "Windows LDM extra logging"
+ depends on LDM_PARTITION
+ help
+ Say Y here if you would like LDM to log verbosely. This could be
+ helpful if the driver doesn't work as expected and you'd like to
+ report a bug.
+
+ If unsure, say N.
+
+config SGI_PARTITION
+ bool "SGI partition support" if PARTITION_ADVANCED
+ default y if DEFAULT_SGI_PARTITION
+ help
+ Say Y here if you would like to be able to read the hard disk
+ partition table format used by SGI machines.
+
+config ULTRIX_PARTITION
+ bool "Ultrix partition table support" if PARTITION_ADVANCED
+ default y if MACH_DECSTATION
+ help
+ Say Y here if you would like to be able to read the hard disk
+ partition table format used by DEC (now Compaq) Ultrix machines.
+ Otherwise, say N.
+
+config SUN_PARTITION
+ bool "Sun partition tables support" if PARTITION_ADVANCED
+ default y if (SPARC || SUN3 || SUN3X)
+ ---help---
+ Like most systems, SunOS uses its own hard disk partition table
+ format, incompatible with all others. Saying Y here allows you to
+ read these partition tables and further mount SunOS partitions from
+ within Linux if you have also said Y to "UFS file system support",
+ above. This is mainly used to carry data from a SPARC under SunOS to
+ your Linux box via a removable medium like magneto-optical or ZIP
+ drives; note however that a good portable way to transport files and
+ directories between unixes (and even other operating systems) is
+ given by the tar program ("man tar" or preferably "info tar"). If
+ you don't know what all this is about, say N.
+
+config KARMA_PARTITION
+ bool "Karma Partition support"
+ depends on PARTITION_ADVANCED
+ help
+ Say Y here if you would like to mount the Rio Karma MP3 player, as it
+ uses a proprietary partition table.
+
+config EFI_PARTITION
+ bool "EFI GUID Partition support"
+ depends on PARTITION_ADVANCED
+ select CRC32
+ help
+ Say Y here if you would like to use hard disks under Linux which
+ were partitioned using EFI GPT.
+
+config SYSV68_PARTITION
+ bool "SYSV68 partition table support" if PARTITION_ADVANCED
+ default y if VME
+ help
+ Say Y here if you would like to be able to read the hard disk
+ partition table format used by Motorola Delta machines (using
+ sysv68).
+ Otherwise, say N.
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
new file mode 100644
index 000000000000..03af8eac51da
--- /dev/null
+++ b/block/partitions/Makefile
@@ -0,0 +1,20 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-$(CONFIG_BLOCK) := check.o
+
+obj-$(CONFIG_ACORN_PARTITION) += acorn.o
+obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
+obj-$(CONFIG_ATARI_PARTITION) += atari.o
+obj-$(CONFIG_MAC_PARTITION) += mac.o
+obj-$(CONFIG_LDM_PARTITION) += ldm.o
+obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
+obj-$(CONFIG_OSF_PARTITION) += osf.o
+obj-$(CONFIG_SGI_PARTITION) += sgi.o
+obj-$(CONFIG_SUN_PARTITION) += sun.o
+obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
+obj-$(CONFIG_IBM_PARTITION) += ibm.o
+obj-$(CONFIG_EFI_PARTITION) += efi.o
+obj-$(CONFIG_KARMA_PARTITION) += karma.o
+obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
new file mode 100644
index 000000000000..fbeb697374d5
--- /dev/null
+++ b/block/partitions/acorn.c
@@ -0,0 +1,556 @@
+/*
+ * linux/fs/partitions/acorn.c
+ *
+ * Copyright (c) 1996-2000 Russell King.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Scan ADFS partitions on hard disk drives. Unfortunately, there
+ * isn't a standard for partitioning drives on Acorn machines, so
+ * every single manufacturer of SCSI and IDE cards created their own
+ * method.
+ */
+#include <linux/buffer_head.h>
+#include <linux/adfs_fs.h>
+
+#include "check.h"
+#include "acorn.h"
+
+/*
+ * Partition types. (Oh for reusability)
+ */
+#define PARTITION_RISCIX_MFM 1
+#define PARTITION_RISCIX_SCSI 2
+#define PARTITION_LINUX 9
+
+#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
+ defined(CONFIG_ACORN_PARTITION_ADFS)
+static struct adfs_discrecord *
+adfs_partition(struct parsed_partitions *state, char *name, char *data,
+ unsigned long first_sector, int slot)
+{
+ struct adfs_discrecord *dr;
+ unsigned int nr_sects;
+
+ if (adfs_checkbblk(data))
+ return NULL;
+
+ dr = (struct adfs_discrecord *)(data + 0x1c0);
+
+ if (dr->disc_size == 0 && dr->disc_size_high == 0)
+ return NULL;
+
+ nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
+ (le32_to_cpu(dr->disc_size) >> 9);
+
+ if (name) {
+ strlcat(state->pp_buf, " [", PAGE_SIZE);
+ strlcat(state->pp_buf, name, PAGE_SIZE);
+ strlcat(state->pp_buf, "]", PAGE_SIZE);
+ }
+ put_partition(state, slot, first_sector, nr_sects);
+ return dr;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_RISCIX
+
+struct riscix_part {
+ __le32 start;
+ __le32 length;
+ __le32 one;
+ char name[16];
+};
+
+struct riscix_record {
+ __le32 magic;
+#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
+ __le32 date;
+ struct riscix_part part[8];
+};
+
+#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
+ defined(CONFIG_ACORN_PARTITION_ADFS)
+static int riscix_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
+{
+ Sector sect;
+ struct riscix_record *rr;
+
+ rr = read_part_sector(state, first_sect, &sect);
+ if (!rr)
+ return -1;
+
+ strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
+
+
+ if (rr->magic == RISCIX_MAGIC) {
+ unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+ int part;
+
+ strlcat(state->pp_buf, " <", PAGE_SIZE);
+
+ put_partition(state, slot++, first_sect, size);
+ for (part = 0; part < 8; part++) {
+ if (rr->part[part].one &&
+ memcmp(rr->part[part].name, "All\0", 4)) {
+ put_partition(state, slot++,
+ le32_to_cpu(rr->part[part].start),
+ le32_to_cpu(rr->part[part].length));
+ strlcat(state->pp_buf, "(", PAGE_SIZE);
+ strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
+ strlcat(state->pp_buf, ")", PAGE_SIZE);
+ }
+ }
+
+ strlcat(state->pp_buf, " >\n", PAGE_SIZE);
+ } else {
+ put_partition(state, slot++, first_sect, nr_sects);
+ }
+
+ put_dev_sector(sect);
+ return slot;
+}
+#endif
+#endif
+
+#define LINUX_NATIVE_MAGIC 0xdeafa1de
+#define LINUX_SWAP_MAGIC 0xdeafab1e
+
+struct linux_part {
+ __le32 magic;
+ __le32 start_sect;
+ __le32 nr_sects;
+};
+
+#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
+ defined(CONFIG_ACORN_PARTITION_ADFS)
+static int linux_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
+{
+ Sector sect;
+ struct linux_part *linuxp;
+ unsigned long size = nr_sects > 2 ? 2 : nr_sects;
+
+ strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
+
+ put_partition(state, slot++, first_sect, size);
+
+ linuxp = read_part_sector(state, first_sect, &sect);
+ if (!linuxp)
+ return -1;
+
+ strlcat(state->pp_buf, " <", PAGE_SIZE);
+ while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
+ linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
+ if (slot == state->limit)
+ break;
+ put_partition(state, slot++, first_sect +
+ le32_to_cpu(linuxp->start_sect),
+ le32_to_cpu(linuxp->nr_sects));
+ linuxp ++;
+ }
+ strlcat(state->pp_buf, " >", PAGE_SIZE);
+
+ put_dev_sector(sect);
+ return slot;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_CUMANA
+int adfspart_check_CUMANA(struct parsed_partitions *state)
+{
+ unsigned long first_sector = 0;
+ unsigned int start_blk = 0;
+ Sector sect;
+ unsigned char *data;
+ char *name = "CUMANA/ADFS";
+ int first = 1;
+ int slot = 1;
+
+ /*
+ * Try Cumana style partitions - sector 6 contains ADFS boot block
+ * with pointer to next 'drive'.
+ *
+ * There are unknowns in this code - is the 'cylinder number' of the
+ * next partition relative to the start of this one - I'm assuming
+ * it is.
+ *
+ * Also, which ID did Cumana use?
+ *
+ * This is totally unfinished, and will require more work to get it
+ * going. Hence it is totally untested.
+ */
+ do {
+ struct adfs_discrecord *dr;
+ unsigned int nr_sects;
+
+ data = read_part_sector(state, start_blk * 2 + 6, &sect);
+ if (!data)
+ return -1;
+
+ if (slot == state->limit)
+ break;
+
+ dr = adfs_partition(state, name, data, first_sector, slot++);
+ if (!dr)
+ break;
+
+ name = NULL;
+
+ nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
+ (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
+ dr->secspertrack;
+
+ if (!nr_sects)
+ break;
+
+ first = 0;
+ first_sector += nr_sects;
+ start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
+ nr_sects = 0; /* hmm - should be partition size */
+
+ switch (data[0x1fc] & 15) {
+ case 0: /* No partition / ADFS? */
+ break;
+
+#ifdef CONFIG_ACORN_PARTITION_RISCIX
+ case PARTITION_RISCIX_SCSI:
+ /* RISCiX - we don't know how to find the next one. */
+ slot = riscix_partition(state, first_sector, slot,
+ nr_sects);
+ break;
+#endif
+
+ case PARTITION_LINUX:
+ slot = linux_partition(state, first_sector, slot,
+ nr_sects);
+ break;
+ }
+ put_dev_sector(sect);
+ if (slot == -1)
+ return -1;
+ } while (1);
+ put_dev_sector(sect);
+ return first ? 0 : 1;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_ADFS
+/*
+ * Purpose: allocate ADFS partitions.
+ *
+ * Params : hd - pointer to gendisk structure to store partition info.
+ * dev - device number to access.
+ *
+ * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
+ *
+ * Alloc : hda = whole drive
+ * hda1 = ADFS partition on first drive.
+ * hda2 = non-ADFS partition.
+ */
+int adfspart_check_ADFS(struct parsed_partitions *state)
+{
+ unsigned long start_sect, nr_sects, sectscyl, heads;
+ Sector sect;
+ unsigned char *data;
+ struct adfs_discrecord *dr;
+ unsigned char id;
+ int slot = 1;
+
+ data = read_part_sector(state, 6, &sect);
+ if (!data)
+ return -1;
+
+ dr = adfs_partition(state, "ADFS", data, 0, slot++);
+ if (!dr) {
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ heads = dr->heads + ((dr->lowsector >> 6) & 1);
+ sectscyl = dr->secspertrack * heads;
+ start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
+ id = data[0x1fc] & 15;
+ put_dev_sector(sect);
+
+ /*
+ * Work out start of non-adfs partition.
+ */
+ nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
+
+ if (start_sect) {
+ switch (id) {
+#ifdef CONFIG_ACORN_PARTITION_RISCIX
+ case PARTITION_RISCIX_SCSI:
+ case PARTITION_RISCIX_MFM:
+ slot = riscix_partition(state, start_sect, slot,
+ nr_sects);
+ break;
+#endif
+
+ case PARTITION_LINUX:
+ slot = linux_partition(state, start_sect, slot,
+ nr_sects);
+ break;
+ }
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_ICS
+
+struct ics_part {
+ __le32 start;
+ __le32 size;
+};
+
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+ unsigned long block)
+{
+ Sector sect;
+ unsigned char *data = read_part_sector(state, block, &sect);
+ int result = 0;
+
+ if (data) {
+ if (memcmp(data, "LinuxPart", 9) == 0)
+ result = 1;
+ put_dev_sector(sect);
+ }
+
+ return result;
+}
+
+/*
+ * Check for a valid ICS partition using the checksum.
+ */
+static inline int valid_ics_sector(const unsigned char *data)
+{
+ unsigned long sum;
+ int i;
+
+ for (i = 0, sum = 0x50617274; i < 508; i++)
+ sum += data[i];
+
+ sum -= le32_to_cpu(*(__le32 *)(&data[508]));
+
+ return sum == 0;
+}
+
+/*
+ * Purpose: allocate ICS partitions.
+ * Params : hd - pointer to gendisk structure to store partition info.
+ * dev - device number to access.
+ * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
+ * Alloc : hda = whole drive
+ * hda1 = ADFS partition 0 on first drive.
+ * hda2 = ADFS partition 1 on first drive.
+ * ..etc..
+ */
+int adfspart_check_ICS(struct parsed_partitions *state)
+{
+ const unsigned char *data;
+ const struct ics_part *p;
+ int slot;
+ Sector sect;
+
+ /*
+ * Try ICS style partitions - sector 0 contains partition info.
+ */
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+
+ if (!valid_ics_sector(data)) {
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
+
+ for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
+ u32 start = le32_to_cpu(p->start);
+ s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
+
+ if (slot == state->limit)
+ break;
+
+ /*
+ * Negative sizes tell the RISC OS ICS driver to ignore
+ * this partition - in effect it says that this does not
+ * contain an ADFS filesystem.
+ */
+ if (size < 0) {
+ size = -size;
+
+ /*
+ * Our own extension - We use the first sector
+ * of the partition to identify what type this
+ * partition is. We must not make this visible
+ * to the filesystem.
+ */
+ if (size > 1 && adfspart_check_ICSLinux(state, start)) {
+ start += 1;
+ size -= 1;
+ }
+ }
+
+ if (size)
+ put_partition(state, slot++, start, size);
+ }
+
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_POWERTEC
+struct ptec_part {
+ __le32 unused1;
+ __le32 unused2;
+ __le32 start;
+ __le32 size;
+ __le32 unused5;
+ char type[8];
+};
+
+static inline int valid_ptec_sector(const unsigned char *data)
+{
+ unsigned char checksum = 0x2a;
+ int i;
+
+ /*
+ * If it looks like a PC/BIOS partition, then it
+ * probably isn't PowerTec.
+ */
+ if (data[510] == 0x55 && data[511] == 0xaa)
+ return 0;
+
+ for (i = 0; i < 511; i++)
+ checksum += data[i];
+
+ return checksum == data[511];
+}
+
+/*
+ * Purpose: allocate ICS partitions.
+ * Params : hd - pointer to gendisk structure to store partition info.
+ * dev - device number to access.
+ * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
+ * Alloc : hda = whole drive
+ * hda1 = ADFS partition 0 on first drive.
+ * hda2 = ADFS partition 1 on first drive.
+ * ..etc..
+ */
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
+{
+ Sector sect;
+ const unsigned char *data;
+ const struct ptec_part *p;
+ int slot = 1;
+ int i;
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+
+ if (!valid_ptec_sector(data)) {
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
+
+ for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
+ u32 start = le32_to_cpu(p->start);
+ u32 size = le32_to_cpu(p->size);
+
+ if (size)
+ put_partition(state, slot++, start, size);
+ }
+
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+}
+#endif
+
+#ifdef CONFIG_ACORN_PARTITION_EESOX
+struct eesox_part {
+ char magic[6];
+ char name[10];
+ __le32 start;
+ __le32 unused6;
+ __le32 unused7;
+ __le32 unused8;
+};
+
+/*
+ * Guess who created this format?
+ */
+static const char eesox_name[] = {
+ 'N', 'e', 'i', 'l', ' ',
+ 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
+};
+
+/*
+ * EESOX SCSI partition format.
+ *
+ * This is a goddamned awful partition format. We don't seem to store
+ * the size of the partition in this table, only the start addresses.
+ *
+ * There are two possibilities where the size comes from:
+ * 1. The individual ADFS boot block entries that are placed on the disk.
+ * 2. The start address of the next entry.
+ */
+int adfspart_check_EESOX(struct parsed_partitions *state)
+{
+ Sector sect;
+ const unsigned char *data;
+ unsigned char buffer[256];
+ struct eesox_part *p;
+ sector_t start = 0;
+ int i, slot = 1;
+
+ data = read_part_sector(state, 7, &sect);
+ if (!data)
+ return -1;
+
+ /*
+ * "Decrypt" the partition table. God knows why...
+ */
+ for (i = 0; i < 256; i++)
+ buffer[i] = data[i] ^ eesox_name[i & 15];
+
+ put_dev_sector(sect);
+
+ for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
+ sector_t next;
+
+ if (memcmp(p->magic, "Eesox", 6))
+ break;
+
+ next = le32_to_cpu(p->start);
+ if (i)
+ put_partition(state, slot++, start, next - start);
+ start = next;
+ }
+
+ if (i != 0) {
+ sector_t size;
+
+ size = get_capacity(state->bdev->bd_disk);
+ put_partition(state, slot++, start, size - start);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ }
+
+ return i ? 1 : 0;
+}
+#endif
diff --git a/block/partitions/acorn.h b/block/partitions/acorn.h
new file mode 100644
index 000000000000..ede828529692
--- /dev/null
+++ b/block/partitions/acorn.h
@@ -0,0 +1,14 @@
+/*
+ * linux/fs/partitions/acorn.h
+ *
+ * Copyright (C) 1996-2001 Russell King.
+ *
+ * I _hate_ this partitioning mess - why can't we have one defined
+ * format, and everyone stick to it?
+ */
+
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
new file mode 100644
index 000000000000..70cbf44a1560
--- /dev/null
+++ b/block/partitions/amiga.c
@@ -0,0 +1,139 @@
+/*
+ * fs/partitions/amiga.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ *
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ */
+
+#include <linux/types.h>
+#include <linux/affs_hardblocks.h>
+
+#include "check.h"
+#include "amiga.h"
+
+static __inline__ u32
+checksum_block(__be32 *m, int size)
+{
+ u32 sum = 0;
+
+ while (size--)
+ sum += be32_to_cpu(*m++);
+ return sum;
+}
+
+int amiga_partition(struct parsed_partitions *state)
+{
+ Sector sect;
+ unsigned char *data;
+ struct RigidDiskBlock *rdb;
+ struct PartitionBlock *pb;
+ int start_sect, nr_sects, blk, part, res = 0;
+ int blksize = 1; /* Multiplier for disk block size */
+ int slot = 1;
+ char b[BDEVNAME_SIZE];
+
+ for (blk = 0; ; blk++, put_dev_sector(sect)) {
+ if (blk == RDB_ALLOCATION_LIMIT)
+ goto rdb_done;
+ data = read_part_sector(state, blk, &sect);
+ if (!data) {
+ if (warn_no_part)
+ printk("Dev %s: unable to read RDB block %d\n",
+ bdevname(state->bdev, b), blk);
+ res = -1;
+ goto rdb_done;
+ }
+ if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
+ continue;
+
+ rdb = (struct RigidDiskBlock *)data;
+ if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
+ break;
+ /* Try again with 0xdc..0xdf zeroed, Windows might have
+ * trashed it.
+ */
+ *(__be32 *)(data+0xdc) = 0;
+ if (checksum_block((__be32 *)data,
+ be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
+ printk("Warning: Trashed word at 0xd0 in block %d "
+ "ignored in checksum calculation\n",blk);
+ break;
+ }
+
+ printk("Dev %s: RDB in block %d has bad checksum\n",
+ bdevname(state->bdev, b), blk);
+ }
+
+ /* blksize is blocks per 512 byte standard block */
+ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
+
+ {
+ char tmp[7 + 10 + 1 + 1];
+
+ /* Be more informative */
+ snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ blk = be32_to_cpu(rdb->rdb_PartitionList);
+ put_dev_sector(sect);
+ for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
+ blk *= blksize; /* Read in terms partition table understands */
+ data = read_part_sector(state, blk, &sect);
+ if (!data) {
+ if (warn_no_part)
+ printk("Dev %s: unable to read partition block %d\n",
+ bdevname(state->bdev, b), blk);
+ res = -1;
+ goto rdb_done;
+ }
+ pb = (struct PartitionBlock *)data;
+ blk = be32_to_cpu(pb->pb_Next);
+ if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
+ continue;
+ if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
+ continue;
+
+ /* Tell Kernel about it */
+
+ nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
+ be32_to_cpu(pb->pb_Environment[9])) *
+ be32_to_cpu(pb->pb_Environment[3]) *
+ be32_to_cpu(pb->pb_Environment[5]) *
+ blksize;
+ if (!nr_sects)
+ continue;
+ start_sect = be32_to_cpu(pb->pb_Environment[9]) *
+ be32_to_cpu(pb->pb_Environment[3]) *
+ be32_to_cpu(pb->pb_Environment[5]) *
+ blksize;
+ put_partition(state,slot++,start_sect,nr_sects);
+ {
+ /* Be even more informative to aid mounting */
+ char dostype[4];
+ char tmp[42];
+
+ __be32 *dt = (__be32 *)dostype;
+ *dt = pb->pb_Environment[16];
+ if (dostype[3] < ' ')
+ snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
+ dostype[0], dostype[1],
+ dostype[2], dostype[3] + '@' );
+ else
+ snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
+ dostype[0], dostype[1],
+ dostype[2], dostype[3]);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
+ be32_to_cpu(pb->pb_Environment[6]),
+ be32_to_cpu(pb->pb_Environment[4]));
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ res = 1;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+rdb_done:
+ return res;
+}
diff --git a/block/partitions/amiga.h b/block/partitions/amiga.h
new file mode 100644
index 000000000000..d094585cadaa
--- /dev/null
+++ b/block/partitions/amiga.h
@@ -0,0 +1,6 @@
+/*
+ * fs/partitions/amiga.h
+ */
+
+int amiga_partition(struct parsed_partitions *state);
+
diff --git a/block/partitions/atari.c b/block/partitions/atari.c
new file mode 100644
index 000000000000..9875b05e80a2
--- /dev/null
+++ b/block/partitions/atari.c
@@ -0,0 +1,149 @@
+/*
+ * fs/partitions/atari.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ *
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ */
+
+#include <linux/ctype.h>
+#include "check.h"
+#include "atari.h"
+
+/* ++guenther: this should be settable by the user ("make config")?.
+ */
+#define ICD_PARTS
+
+/* check if a partition entry looks valid -- Atari format is assumed if at
+ least one of the primary entries is ok this way */
+#define VALID_PARTITION(pi,hdsiz) \
+ (((pi)->flg & 1) && \
+ isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
+ be32_to_cpu((pi)->st) <= (hdsiz) && \
+ be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
+
+static inline int OK_id(char *s)
+{
+ return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
+ memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
+ memcmp (s, "RAW", 3) == 0 ;
+}
+
+int atari_partition(struct parsed_partitions *state)
+{
+ Sector sect;
+ struct rootsector *rs;
+ struct partition_info *pi;
+ u32 extensect;
+ u32 hd_size;
+ int slot;
+#ifdef ICD_PARTS
+ int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
+#endif
+
+ rs = read_part_sector(state, 0, &sect);
+ if (!rs)
+ return -1;
+
+ /* Verify this is an Atari rootsector: */
+ hd_size = state->bdev->bd_inode->i_size >> 9;
+ if (!VALID_PARTITION(&rs->part[0], hd_size) &&
+ !VALID_PARTITION(&rs->part[1], hd_size) &&
+ !VALID_PARTITION(&rs->part[2], hd_size) &&
+ !VALID_PARTITION(&rs->part[3], hd_size)) {
+ /*
+ * if there's no valid primary partition, assume that no Atari
+ * format partition table (there's no reliable magic or the like
+ * :-()
+ */
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ pi = &rs->part[0];
+ strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
+ for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
+ struct rootsector *xrs;
+ Sector sect2;
+ ulong partsect;
+
+ if ( !(pi->flg & 1) )
+ continue;
+ /* active partition */
+ if (memcmp (pi->id, "XGM", 3) != 0) {
+ /* we don't care about other id's */
+ put_partition (state, slot, be32_to_cpu(pi->st),
+ be32_to_cpu(pi->siz));
+ continue;
+ }
+ /* extension partition */
+#ifdef ICD_PARTS
+ part_fmt = 1;
+#endif
+ strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
+ partsect = extensect = be32_to_cpu(pi->st);
+ while (1) {
+ xrs = read_part_sector(state, partsect, &sect2);
+ if (!xrs) {
+ printk (" block %ld read failed\n", partsect);
+ put_dev_sector(sect);
+ return -1;
+ }
+
+ /* ++roman: sanity check: bit 0 of flg field must be set */
+ if (!(xrs->part[0].flg & 1)) {
+ printk( "\nFirst sub-partition in extended partition is not valid!\n" );
+ put_dev_sector(sect2);
+ break;
+ }
+
+ put_partition(state, slot,
+ partsect + be32_to_cpu(xrs->part[0].st),
+ be32_to_cpu(xrs->part[0].siz));
+
+ if (!(xrs->part[1].flg & 1)) {
+ /* end of linked partition list */
+ put_dev_sector(sect2);
+ break;
+ }
+ if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
+ printk("\nID of extended partition is not XGM!\n");
+ put_dev_sector(sect2);
+ break;
+ }
+
+ partsect = be32_to_cpu(xrs->part[1].st) + extensect;
+ put_dev_sector(sect2);
+ if (++slot == state->limit) {
+ printk( "\nMaximum number of partitions reached!\n" );
+ break;
+ }
+ }
+ strlcat(state->pp_buf, " >", PAGE_SIZE);
+ }
+#ifdef ICD_PARTS
+ if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
+ pi = &rs->icdpart[0];
+ /* sanity check: no ICD format if first partition invalid */
+ if (OK_id(pi->id)) {
+ strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
+ for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
+ /* accept only GEM,BGM,RAW,LNX,SWP partitions */
+ if (!((pi->flg & 1) && OK_id(pi->id)))
+ continue;
+ part_fmt = 2;
+ put_partition (state, slot,
+ be32_to_cpu(pi->st),
+ be32_to_cpu(pi->siz));
+ }
+ strlcat(state->pp_buf, " >", PAGE_SIZE);
+ }
+ }
+#endif
+ put_dev_sector(sect);
+
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+ return 1;
+}
diff --git a/block/partitions/atari.h b/block/partitions/atari.h
new file mode 100644
index 000000000000..fe2d32a89f36
--- /dev/null
+++ b/block/partitions/atari.h
@@ -0,0 +1,34 @@
+/*
+ * fs/partitions/atari.h
+ * Moved by Russell King from:
+ *
+ * linux/include/linux/atari_rootsec.h
+ * definitions for Atari Rootsector layout
+ * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
+ *
+ * modified for ICD/Supra partitioning scheme restricted to at most 12
+ * partitions
+ * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
+ */
+
+struct partition_info
+{
+ u8 flg; /* bit 0: active; bit 7: bootable */
+ char id[3]; /* "GEM", "BGM", "XGM", or other */
+ __be32 st; /* start of partition */
+ __be32 siz; /* length of partition */
+};
+
+struct rootsector
+{
+ char unused[0x156]; /* room for boot code */
+ struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
+ char unused2[0xc];
+ u32 hd_siz; /* size of disk in blocks */
+ struct partition_info part[4];
+ u32 bsl_st; /* start of bad sector list */
+ u32 bsl_cnt; /* length of bad sector list */
+ u16 checksum; /* checksum for bootable disks */
+} __attribute__((__packed__));
+
+int atari_partition(struct parsed_partitions *state);
diff --git a/block/partitions/check.c b/block/partitions/check.c
new file mode 100644
index 000000000000..bc908672c976
--- /dev/null
+++ b/block/partitions/check.c
@@ -0,0 +1,166 @@
+/*
+ * fs/partitions/check.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ *
+ * We now have independent partition support from the
+ * block drivers, which allows all the partition code to
+ * be grouped in one location, and it to be mostly self
+ * contained.
+ *
+ * Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
+ */
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
+
+#include "check.h"
+
+#include "acorn.h"
+#include "amiga.h"
+#include "atari.h"
+#include "ldm.h"
+#include "mac.h"
+#include "msdos.h"
+#include "osf.h"
+#include "sgi.h"
+#include "sun.h"
+#include "ibm.h"
+#include "ultrix.h"
+#include "efi.h"
+#include "karma.h"
+#include "sysv68.h"
+
+int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
+
+static int (*check_part[])(struct parsed_partitions *) = {
+ /*
+ * Probe partition formats with tables at disk address 0
+ * that also have an ADFS boot block at 0xdc0.
+ */
+#ifdef CONFIG_ACORN_PARTITION_ICS
+ adfspart_check_ICS,
+#endif
+#ifdef CONFIG_ACORN_PARTITION_POWERTEC
+ adfspart_check_POWERTEC,
+#endif
+#ifdef CONFIG_ACORN_PARTITION_EESOX
+ adfspart_check_EESOX,
+#endif
+
+ /*
+ * Now move on to formats that only have partition info at
+ * disk address 0xdc0. Since these may also have stale
+ * PC/BIOS partition tables, they need to come before
+ * the msdos entry.
+ */
+#ifdef CONFIG_ACORN_PARTITION_CUMANA
+ adfspart_check_CUMANA,
+#endif
+#ifdef CONFIG_ACORN_PARTITION_ADFS
+ adfspart_check_ADFS,
+#endif
+
+#ifdef CONFIG_EFI_PARTITION
+ efi_partition, /* this must come before msdos */
+#endif
+#ifdef CONFIG_SGI_PARTITION
+ sgi_partition,
+#endif
+#ifdef CONFIG_LDM_PARTITION
+ ldm_partition, /* this must come before msdos */
+#endif
+#ifdef CONFIG_MSDOS_PARTITION
+ msdos_partition,
+#endif
+#ifdef CONFIG_OSF_PARTITION
+ osf_partition,
+#endif
+#ifdef CONFIG_SUN_PARTITION
+ sun_partition,
+#endif
+#ifdef CONFIG_AMIGA_PARTITION
+ amiga_partition,
+#endif
+#ifdef CONFIG_ATARI_PARTITION
+ atari_partition,
+#endif
+#ifdef CONFIG_MAC_PARTITION
+ mac_partition,
+#endif
+#ifdef CONFIG_ULTRIX_PARTITION
+ ultrix_partition,
+#endif
+#ifdef CONFIG_IBM_PARTITION
+ ibm_partition,
+#endif
+#ifdef CONFIG_KARMA_PARTITION
+ karma_partition,
+#endif
+#ifdef CONFIG_SYSV68_PARTITION
+ sysv68_partition,
+#endif
+ NULL
+};
+
+struct parsed_partitions *
+check_partition(struct gendisk *hd, struct block_device *bdev)
+{
+ struct parsed_partitions *state;
+ int i, res, err;
+
+ state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+ if (!state)
+ return NULL;
+ state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
+ if (!state->pp_buf) {
+ kfree(state);
+ return NULL;
+ }
+ state->pp_buf[0] = '\0';
+
+ state->bdev = bdev;
+ disk_name(hd, 0, state->name);
+ snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
+ if (isdigit(state->name[strlen(state->name)-1]))
+ sprintf(state->name, "p");
+
+ state->limit = disk_max_parts(hd);
+ i = res = err = 0;
+ while (!res && check_part[i]) {
+ memset(&state->parts, 0, sizeof(state->parts));
+ res = check_part[i++](state);
+ if (res < 0) {
+ /* We have hit an I/O error which we don't report now.
+ * But record it, and let the others do their job.
+ */
+ err = res;
+ res = 0;
+ }
+
+ }
+ if (res > 0) {
+ printk(KERN_INFO "%s", state->pp_buf);
+
+ free_page((unsigned long)state->pp_buf);
+ return state;
+ }
+ if (state->access_beyond_eod)
+ err = -ENOSPC;
+ if (err)
+ /* The partition is unrecognized. So report I/O errors if there were any */
+ res = err;
+ if (!res)
+ strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
+ else if (warn_no_part)
+ strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
+
+ printk(KERN_INFO "%s", state->pp_buf);
+
+ free_page((unsigned long)state->pp_buf);
+ kfree(state);
+ return ERR_PTR(res);
+}
diff --git a/block/partitions/check.h b/block/partitions/check.h
new file mode 100644
index 000000000000..52b100311ec3
--- /dev/null
+++ b/block/partitions/check.h
@@ -0,0 +1,52 @@
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+
+/*
+ * add_gd_partition adds a partitions details to the devices partition
+ * description.
+ */
+struct parsed_partitions {
+ struct block_device *bdev;
+ char name[BDEVNAME_SIZE];
+ struct {
+ sector_t from;
+ sector_t size;
+ int flags;
+ bool has_info;
+ struct partition_meta_info info;
+ } parts[DISK_MAX_PARTS];
+ int next;
+ int limit;
+ bool access_beyond_eod;
+ char *pp_buf;
+};
+
+struct parsed_partitions *
+check_partition(struct gendisk *, struct block_device *);
+
+static inline void *read_part_sector(struct parsed_partitions *state,
+ sector_t n, Sector *p)
+{
+ if (n >= get_capacity(state->bdev->bd_disk)) {
+ state->access_beyond_eod = true;
+ return NULL;
+ }
+ return read_dev_sector(state->bdev, n, p);
+}
+
+static inline void
+put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
+{
+ if (n < p->limit) {
+ char tmp[1 + BDEVNAME_SIZE + 10 + 1];
+
+ p->parts[n].from = from;
+ p->parts[n].size = size;
+ snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
+ strlcat(p->pp_buf, tmp, PAGE_SIZE);
+ }
+}
+
+extern int warn_no_part;
+
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
new file mode 100644
index 000000000000..6296b403c67a
--- /dev/null
+++ b/block/partitions/efi.c
@@ -0,0 +1,675 @@
+/************************************************************
+ * EFI GUID Partition Table handling
+ *
+ * http://www.uefi.org/specs/
+ * http://www.intel.com/technology/efi/
+ *
+ * efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
+ * Copyright 2000,2001,2002,2004 Dell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * TODO:
+ *
+ * Changelog:
+ * Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
+ * - test for valid PMBR and valid PGPT before ever reading
+ * AGPT, allow override with 'gpt' kernel command line option.
+ * - check for first/last_usable_lba outside of size of disk
+ *
+ * Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
+ * - Ported to 2.5.7-pre1 and 2.5.7-dj2
+ * - Applied patch to avoid fault in alternate header handling
+ * - cleaned up find_valid_gpt
+ * - On-disk structure and copy in memory is *always* LE now -
+ * swab fields as needed
+ * - remove print_gpt_header()
+ * - only use first max_p partition entries, to keep the kernel minor number
+ * and partition numbers tied.
+ *
+ * Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
+ * - Removed __PRIPTR_PREFIX - not being used
+ *
+ * Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
+ * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
+ *
+ * Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
+ * - Added compare_gpts().
+ * - moved le_efi_guid_to_cpus() back into this file. GPT is the only
+ * thing that keeps EFI GUIDs on disk.
+ * - Changed gpt structure names and members to be simpler and more Linux-like.
+ *
+ * Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
+ * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
+ *
+ * Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
+ * - Changed function comments to DocBook style per Andreas Dilger suggestion.
+ *
+ * Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
+ * - Change read_lba() to use the page cache per Al Viro's work.
+ * - print u64s properly on all architectures
+ * - fixed debug_printk(), now Dprintk()
+ *
+ * Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
+ * - Style cleanups
+ * - made most functions static
+ * - Endianness addition
+ * - remove test for second alternate header, as it's not per spec,
+ * and is unnecessary. There's now a method to read/write the last
+ * sector of an odd-sized disk from user space. No tools have ever
+ * been released which used this code, so it's effectively dead.
+ * - Per Asit Mallick of Intel, added a test for a valid PMBR.
+ * - Added kernel command line option 'gpt' to override valid PMBR test.
+ *
+ * Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
+ * - added devfs volume UUID support (/dev/volumes/uuids) for
+ * mounting file systems by the partition GUID.
+ *
+ * Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * - Moved crc32() to linux/lib, added efi_crc32().
+ *
+ * Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * - Replaced Intel's CRC32 function with an equivalent
+ * non-license-restricted version.
+ *
+ * Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * - Fixed the last_lba() call to return the proper last block
+ *
+ * Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * - Thanks to Andries Brouwer for his debugging assistance.
+ * - Code works, detects all the partitions.
+ *
+ ************************************************************/
+#include <linux/crc32.h>
+#include <linux/ctype.h>
+#include <linux/math64.h>
+#include <linux/slab.h>
+#include "check.h"
+#include "efi.h"
+
+/* This allows a kernel command line option 'gpt' to override
+ * the test for invalid PMBR. Not __initdata because reloading
+ * the partition tables happens after init too.
+ */
+static int force_gpt;
+static int __init
+force_gpt_fn(char *str)
+{
+ force_gpt = 1;
+ return 1;
+}
+__setup("gpt", force_gpt_fn);
+
+
+/**
+ * efi_crc32() - EFI version of crc32 function
+ * @buf: buffer to calculate crc32 of
+ * @len - length of buf
+ *
+ * Description: Returns EFI-style CRC32 value for @buf
+ *
+ * This function uses the little endian Ethernet polynomial
+ * but seeds the function with ~0, and xor's with ~0 at the end.
+ * Note, the EFI Specification, v1.02, has a reference to
+ * Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
+ */
+static inline u32
+efi_crc32(const void *buf, unsigned long len)
+{
+ return (crc32(~0L, buf, len) ^ ~0L);
+}
+
+/**
+ * last_lba(): return number of last logical block of device
+ * @bdev: block device
+ *
+ * Description: Returns last LBA value on success, 0 on error.
+ * This is stored (by sd and ide-geometry) in
+ * the part[0] entry for this disk, and is the number of
+ * physical sectors available on the disk.
+ */
+static u64 last_lba(struct block_device *bdev)
+{
+ if (!bdev || !bdev->bd_inode)
+ return 0;
+ return div_u64(bdev->bd_inode->i_size,
+ bdev_logical_block_size(bdev)) - 1ULL;
+}
+
+static inline int
+pmbr_part_valid(struct partition *part)
+{
+ if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT &&
+ le32_to_cpu(part->start_sect) == 1UL)
+ return 1;
+ return 0;
+}
+
+/**
+ * is_pmbr_valid(): test Protective MBR for validity
+ * @mbr: pointer to a legacy mbr structure
+ *
+ * Description: Returns 1 if PMBR is valid, 0 otherwise.
+ * Validity depends on two things:
+ * 1) MSDOS signature is in the last two bytes of the MBR
+ * 2) One partition of type 0xEE is found
+ */
+static int
+is_pmbr_valid(legacy_mbr *mbr)
+{
+ int i;
+ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
+ return 0;
+ for (i = 0; i < 4; i++)
+ if (pmbr_part_valid(&mbr->partition_record[i]))
+ return 1;
+ return 0;
+}
+
+/**
+ * read_lba(): Read bytes from disk, starting at given LBA
+ * @state
+ * @lba
+ * @buffer
+ * @size_t
+ *
+ * Description: Reads @count bytes from @state->bdev into @buffer.
+ * Returns number of bytes read on success, 0 on error.
+ */
+static size_t read_lba(struct parsed_partitions *state,
+ u64 lba, u8 *buffer, size_t count)
+{
+ size_t totalreadcount = 0;
+ struct block_device *bdev = state->bdev;
+ sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
+
+ if (!buffer || lba > last_lba(bdev))
+ return 0;
+
+ while (count) {
+ int copied = 512;
+ Sector sect;
+ unsigned char *data = read_part_sector(state, n++, &sect);
+ if (!data)
+ break;
+ if (copied > count)
+ copied = count;
+ memcpy(buffer, data, copied);
+ put_dev_sector(sect);
+ buffer += copied;
+ totalreadcount +=copied;
+ count -= copied;
+ }
+ return totalreadcount;
+}
+
+/**
+ * alloc_read_gpt_entries(): reads partition entries from disk
+ * @state
+ * @gpt - GPT header
+ *
+ * Description: Returns ptes on success, NULL on error.
+ * Allocates space for PTEs based on information found in @gpt.
+ * Notes: remember to free pte when you're done!
+ */
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+ gpt_header *gpt)
+{
+ size_t count;
+ gpt_entry *pte;
+
+ if (!gpt)
+ return NULL;
+
+ count = le32_to_cpu(gpt->num_partition_entries) *
+ le32_to_cpu(gpt->sizeof_partition_entry);
+ if (!count)
+ return NULL;
+ pte = kzalloc(count, GFP_KERNEL);
+ if (!pte)
+ return NULL;
+
+ if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
+ (u8 *) pte,
+ count) < count) {
+ kfree(pte);
+ pte=NULL;
+ return NULL;
+ }
+ return pte;
+}
+
+/**
+ * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
+ * @state
+ * @lba is the Logical Block Address of the partition table
+ *
+ * Description: returns GPT header on success, NULL on error. Allocates
+ * and fills a GPT header starting at @ from @state->bdev.
+ * Note: remember to free gpt when finished with it.
+ */
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+ u64 lba)
+{
+ gpt_header *gpt;
+ unsigned ssz = bdev_logical_block_size(state->bdev);
+
+ gpt = kzalloc(ssz, GFP_KERNEL);
+ if (!gpt)
+ return NULL;
+
+ if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
+ kfree(gpt);
+ gpt=NULL;
+ return NULL;
+ }
+
+ return gpt;
+}
+
+/**
+ * is_gpt_valid() - tests one GPT header and PTEs for validity
+ * @state
+ * @lba is the logical block address of the GPT header to test
+ * @gpt is a GPT header ptr, filled on return.
+ * @ptes is a PTEs ptr, filled on return.
+ *
+ * Description: returns 1 if valid, 0 on error.
+ * If valid, returns pointers to newly allocated GPT header and PTEs.
+ */
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+ gpt_header **gpt, gpt_entry **ptes)
+{
+ u32 crc, origcrc;
+ u64 lastlba;
+
+ if (!ptes)
+ return 0;
+ if (!(*gpt = alloc_read_gpt_header(state, lba)))
+ return 0;
+
+ /* Check the GUID Partition Table signature */
+ if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
+ pr_debug("GUID Partition Table Header signature is wrong:"
+ "%lld != %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->signature),
+ (unsigned long long)GPT_HEADER_SIGNATURE);
+ goto fail;
+ }
+
+ /* Check the GUID Partition Table header size */
+ if (le32_to_cpu((*gpt)->header_size) >
+ bdev_logical_block_size(state->bdev)) {
+ pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
+ le32_to_cpu((*gpt)->header_size),
+ bdev_logical_block_size(state->bdev));
+ goto fail;
+ }
+
+ /* Check the GUID Partition Table CRC */
+ origcrc = le32_to_cpu((*gpt)->header_crc32);
+ (*gpt)->header_crc32 = 0;
+ crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
+
+ if (crc != origcrc) {
+ pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
+ crc, origcrc);
+ goto fail;
+ }
+ (*gpt)->header_crc32 = cpu_to_le32(origcrc);
+
+ /* Check that the my_lba entry points to the LBA that contains
+ * the GUID Partition Table */
+ if (le64_to_cpu((*gpt)->my_lba) != lba) {
+ pr_debug("GPT my_lba incorrect: %lld != %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->my_lba),
+ (unsigned long long)lba);
+ goto fail;
+ }
+
+ /* Check the first_usable_lba and last_usable_lba are
+ * within the disk.
+ */
+ lastlba = last_lba(state->bdev);
+ if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
+ pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
+ (unsigned long long)lastlba);
+ goto fail;
+ }
+ if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
+ pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
+ (unsigned long long)lastlba);
+ goto fail;
+ }
+
+ /* Check that sizeof_partition_entry has the correct value */
+ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
+ pr_debug("GUID Partitition Entry Size check failed.\n");
+ goto fail;
+ }
+
+ if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
+ goto fail;
+
+ /* Check the GUID Partition Entry Array CRC */
+ crc = efi_crc32((const unsigned char *) (*ptes),
+ le32_to_cpu((*gpt)->num_partition_entries) *
+ le32_to_cpu((*gpt)->sizeof_partition_entry));
+
+ if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
+ pr_debug("GUID Partitition Entry Array CRC check failed.\n");
+ goto fail_ptes;
+ }
+
+ /* We're done, all's well */
+ return 1;
+
+ fail_ptes:
+ kfree(*ptes);
+ *ptes = NULL;
+ fail:
+ kfree(*gpt);
+ *gpt = NULL;
+ return 0;
+}
+
+/**
+ * is_pte_valid() - tests one PTE for validity
+ * @pte is the pte to check
+ * @lastlba is last lba of the disk
+ *
+ * Description: returns 1 if valid, 0 on error.
+ */
+static inline int
+is_pte_valid(const gpt_entry *pte, const u64 lastlba)
+{
+ if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
+ le64_to_cpu(pte->starting_lba) > lastlba ||
+ le64_to_cpu(pte->ending_lba) > lastlba)
+ return 0;
+ return 1;
+}
+
+/**
+ * compare_gpts() - Search disk for valid GPT headers and PTEs
+ * @pgpt is the primary GPT header
+ * @agpt is the alternate GPT header
+ * @lastlba is the last LBA number
+ * Description: Returns nothing. Sanity checks pgpt and agpt fields
+ * and prints warnings on discrepancies.
+ *
+ */
+static void
+compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
+{
+ int error_found = 0;
+ if (!pgpt || !agpt)
+ return;
+ if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
+ printk(KERN_WARNING
+ "GPT:Primary header LBA != Alt. header alternate_lba\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(pgpt->my_lba),
+ (unsigned long long)le64_to_cpu(agpt->alternate_lba));
+ error_found++;
+ }
+ if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
+ printk(KERN_WARNING
+ "GPT:Primary header alternate_lba != Alt. header my_lba\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
+ (unsigned long long)le64_to_cpu(agpt->my_lba));
+ error_found++;
+ }
+ if (le64_to_cpu(pgpt->first_usable_lba) !=
+ le64_to_cpu(agpt->first_usable_lba)) {
+ printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
+ (unsigned long long)le64_to_cpu(agpt->first_usable_lba));
+ error_found++;
+ }
+ if (le64_to_cpu(pgpt->last_usable_lba) !=
+ le64_to_cpu(agpt->last_usable_lba)) {
+ printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
+ (unsigned long long)le64_to_cpu(agpt->last_usable_lba));
+ error_found++;
+ }
+ if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
+ printk(KERN_WARNING "GPT:disk_guids don't match.\n");
+ error_found++;
+ }
+ if (le32_to_cpu(pgpt->num_partition_entries) !=
+ le32_to_cpu(agpt->num_partition_entries)) {
+ printk(KERN_WARNING "GPT:num_partition_entries don't match: "
+ "0x%x != 0x%x\n",
+ le32_to_cpu(pgpt->num_partition_entries),
+ le32_to_cpu(agpt->num_partition_entries));
+ error_found++;
+ }
+ if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
+ le32_to_cpu(agpt->sizeof_partition_entry)) {
+ printk(KERN_WARNING
+ "GPT:sizeof_partition_entry values don't match: "
+ "0x%x != 0x%x\n",
+ le32_to_cpu(pgpt->sizeof_partition_entry),
+ le32_to_cpu(agpt->sizeof_partition_entry));
+ error_found++;
+ }
+ if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
+ le32_to_cpu(agpt->partition_entry_array_crc32)) {
+ printk(KERN_WARNING
+ "GPT:partition_entry_array_crc32 values don't match: "
+ "0x%x != 0x%x\n",
+ le32_to_cpu(pgpt->partition_entry_array_crc32),
+ le32_to_cpu(agpt->partition_entry_array_crc32));
+ error_found++;
+ }
+ if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
+ printk(KERN_WARNING
+ "GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(pgpt->alternate_lba),
+ (unsigned long long)lastlba);
+ error_found++;
+ }
+
+ if (le64_to_cpu(agpt->my_lba) != lastlba) {
+ printk(KERN_WARNING
+ "GPT:Alternate GPT header not at the end of the disk.\n");
+ printk(KERN_WARNING "GPT:%lld != %lld\n",
+ (unsigned long long)le64_to_cpu(agpt->my_lba),
+ (unsigned long long)lastlba);
+ error_found++;
+ }
+
+ if (error_found)
+ printk(KERN_WARNING
+ "GPT: Use GNU Parted to correct GPT errors.\n");
+ return;
+}
+
+/**
+ * find_valid_gpt() - Search disk for valid GPT headers and PTEs
+ * @state
+ * @gpt is a GPT header ptr, filled on return.
+ * @ptes is a PTEs ptr, filled on return.
+ * Description: Returns 1 if valid, 0 on error.
+ * If valid, returns pointers to newly allocated GPT header and PTEs.
+ * Validity depends on PMBR being valid (or being overridden by the
+ * 'gpt' kernel command line option) and finding either the Primary
+ * GPT header and PTEs valid, or the Alternate GPT header and PTEs
+ * valid. If the Primary GPT header is not valid, the Alternate GPT header
+ * is not checked unless the 'gpt' kernel command line option is passed.
+ * This protects against devices which misreport their size, and forces
+ * the user to decide to use the Alternate GPT.
+ */
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+ gpt_entry **ptes)
+{
+ int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
+ gpt_header *pgpt = NULL, *agpt = NULL;
+ gpt_entry *pptes = NULL, *aptes = NULL;
+ legacy_mbr *legacymbr;
+ u64 lastlba;
+
+ if (!ptes)
+ return 0;
+
+ lastlba = last_lba(state->bdev);
+ if (!force_gpt) {
+ /* This will be added to the EFI Spec. per Intel after v1.02. */
+ legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
+ if (legacymbr) {
+ read_lba(state, 0, (u8 *) legacymbr,
+ sizeof (*legacymbr));
+ good_pmbr = is_pmbr_valid(legacymbr);
+ kfree(legacymbr);
+ }
+ if (!good_pmbr)
+ goto fail;
+ }
+
+ good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
+ &pgpt, &pptes);
+ if (good_pgpt)
+ good_agpt = is_gpt_valid(state,
+ le64_to_cpu(pgpt->alternate_lba),
+ &agpt, &aptes);
+ if (!good_agpt && force_gpt)
+ good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
+
+ /* The obviously unsuccessful case */
+ if (!good_pgpt && !good_agpt)
+ goto fail;
+
+ compare_gpts(pgpt, agpt, lastlba);
+
+ /* The good cases */
+ if (good_pgpt) {
+ *gpt = pgpt;
+ *ptes = pptes;
+ kfree(agpt);
+ kfree(aptes);
+ if (!good_agpt) {
+ printk(KERN_WARNING
+ "Alternate GPT is invalid, "
+ "using primary GPT.\n");
+ }
+ return 1;
+ }
+ else if (good_agpt) {
+ *gpt = agpt;
+ *ptes = aptes;
+ kfree(pgpt);
+ kfree(pptes);
+ printk(KERN_WARNING
+ "Primary GPT is invalid, using alternate GPT.\n");
+ return 1;
+ }
+
+ fail:
+ kfree(pgpt);
+ kfree(agpt);
+ kfree(pptes);
+ kfree(aptes);
+ *gpt = NULL;
+ *ptes = NULL;
+ return 0;
+}
+
+/**
+ * efi_partition(struct parsed_partitions *state)
+ * @state
+ *
+ * Description: called from check.c, if the disk contains GPT
+ * partitions, sets up partition entries in the kernel.
+ *
+ * If the first block on the disk is a legacy MBR,
+ * it will get handled by msdos_partition().
+ * If it's a Protective MBR, we'll handle it here.
+ *
+ * We do not create a Linux partition for GPT, but
+ * only for the actual data partitions.
+ * Returns:
+ * -1 if unable to read the partition table
+ * 0 if this isn't our partition table
+ * 1 if successful
+ *
+ */
+int efi_partition(struct parsed_partitions *state)
+{
+ gpt_header *gpt = NULL;
+ gpt_entry *ptes = NULL;
+ u32 i;
+ unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+ u8 unparsed_guid[37];
+
+ if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
+ kfree(gpt);
+ kfree(ptes);
+ return 0;
+ }
+
+ pr_debug("GUID Partition Table is valid! Yea!\n");
+
+ for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
+ struct partition_meta_info *info;
+ unsigned label_count = 0;
+ unsigned label_max;
+ u64 start = le64_to_cpu(ptes[i].starting_lba);
+ u64 size = le64_to_cpu(ptes[i].ending_lba) -
+ le64_to_cpu(ptes[i].starting_lba) + 1ULL;
+
+ if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
+ continue;
+
+ put_partition(state, i+1, start * ssz, size * ssz);
+
+ /* If this is a RAID volume, tell md */
+ if (!efi_guidcmp(ptes[i].partition_type_guid,
+ PARTITION_LINUX_RAID_GUID))
+ state->parts[i + 1].flags = ADDPART_FLAG_RAID;
+
+ info = &state->parts[i + 1].info;
+ /* Instead of doing a manual swap to big endian, reuse the
+ * common ASCII hex format as the interim.
+ */
+ efi_guid_unparse(&ptes[i].unique_partition_guid, unparsed_guid);
+ part_pack_uuid(unparsed_guid, info->uuid);
+
+ /* Naively convert UTF16-LE to 7 bits. */
+ label_max = min(sizeof(info->volname) - 1,
+ sizeof(ptes[i].partition_name));
+ info->volname[label_max] = 0;
+ while (label_count < label_max) {
+ u8 c = ptes[i].partition_name[label_count] & 0xff;
+ if (c && !isprint(c))
+ c = '!';
+ info->volname[label_count] = c;
+ label_count++;
+ }
+ state->parts[i + 1].has_info = true;
+ }
+ kfree(ptes);
+ kfree(gpt);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+}
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
new file mode 100644
index 000000000000..b69ab729558f
--- /dev/null
+++ b/block/partitions/efi.h
@@ -0,0 +1,134 @@
+/************************************************************
+ * EFI GUID Partition Table
+ * Per Intel EFI Specification v1.02
+ * http://developer.intel.com/technology/efi/efi.htm
+ *
+ * By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
+ * Copyright 2000,2001 Dell Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ ************************************************************/
+
+#ifndef FS_PART_EFI_H_INCLUDED
+#define FS_PART_EFI_H_INCLUDED
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/string.h>
+#include <linux/efi.h>
+
+#define MSDOS_MBR_SIGNATURE 0xaa55
+#define EFI_PMBR_OSTYPE_EFI 0xEF
+#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
+
+#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
+#define GPT_HEADER_REVISION_V1 0x00010000
+#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
+
+#define PARTITION_SYSTEM_GUID \
+ EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
+ 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
+#define LEGACY_MBR_PARTITION_GUID \
+ EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
+ 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
+#define PARTITION_MSFT_RESERVED_GUID \
+ EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
+ 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
+#define PARTITION_BASIC_DATA_GUID \
+ EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
+ 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
+#define PARTITION_LINUX_RAID_GUID \
+ EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
+ 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
+#define PARTITION_LINUX_SWAP_GUID \
+ EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
+ 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
+#define PARTITION_LINUX_LVM_GUID \
+ EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
+ 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
+
+typedef struct _gpt_header {
+ __le64 signature;
+ __le32 revision;
+ __le32 header_size;
+ __le32 header_crc32;
+ __le32 reserved1;
+ __le64 my_lba;
+ __le64 alternate_lba;
+ __le64 first_usable_lba;
+ __le64 last_usable_lba;
+ efi_guid_t disk_guid;
+ __le64 partition_entry_lba;
+ __le32 num_partition_entries;
+ __le32 sizeof_partition_entry;
+ __le32 partition_entry_array_crc32;
+
+ /* The rest of the logical block is reserved by UEFI and must be zero.
+ * EFI standard handles this by:
+ *
+ * uint8_t reserved2[ BlockSize - 92 ];
+ */
+} __attribute__ ((packed)) gpt_header;
+
+typedef struct _gpt_entry_attributes {
+ u64 required_to_function:1;
+ u64 reserved:47;
+ u64 type_guid_specific:16;
+} __attribute__ ((packed)) gpt_entry_attributes;
+
+typedef struct _gpt_entry {
+ efi_guid_t partition_type_guid;
+ efi_guid_t unique_partition_guid;
+ __le64 starting_lba;
+ __le64 ending_lba;
+ gpt_entry_attributes attributes;
+ efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
+} __attribute__ ((packed)) gpt_entry;
+
+typedef struct _legacy_mbr {
+ u8 boot_code[440];
+ __le32 unique_mbr_signature;
+ __le16 unknown;
+ struct partition partition_record[4];
+ __le16 signature;
+} __attribute__ ((packed)) legacy_mbr;
+
+/* Functions */
+extern int efi_partition(struct parsed_partitions *state);
+
+#endif
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * --------------------------------------------------------------------------
+ * Local variables:
+ * c-indent-level: 4
+ * c-brace-imaginary-offset: 0
+ * c-brace-offset: -4
+ * c-argdecl-indent: 4
+ * c-label-offset: -4
+ * c-continued-statement-offset: 4
+ * c-continued-brace-offset: 0
+ * indent-tabs-mode: nil
+ * tab-width: 8
+ * End:
+ */
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
new file mode 100644
index 000000000000..d513a07f44bb
--- /dev/null
+++ b/block/partitions/ibm.c
@@ -0,0 +1,275 @@
+/*
+ * File...........: linux/fs/partitions/ibm.c
+ * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
+ * Volker Sameske <sameske@de.ibm.com>
+ * Bugreports.to..: <Linux390@de.ibm.com>
+ * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/hdreg.h>
+#include <linux/slab.h>
+#include <asm/dasd.h>
+#include <asm/ebcdic.h>
+#include <asm/uaccess.h>
+#include <asm/vtoc.h>
+
+#include "check.h"
+#include "ibm.h"
+
+/*
+ * compute the block number from a
+ * cyl-cyl-head-head structure
+ */
+static sector_t
+cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
+
+ sector_t cyl;
+ __u16 head;
+
+ /*decode cylinder and heads for large volumes */
+ cyl = ptr->hh & 0xFFF0;
+ cyl <<= 12;
+ cyl |= ptr->cc;
+ head = ptr->hh & 0x000F;
+ return cyl * geo->heads * geo->sectors +
+ head * geo->sectors;
+}
+
+/*
+ * compute the block number from a
+ * cyl-cyl-head-head-block structure
+ */
+static sector_t
+cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
+
+ sector_t cyl;
+ __u16 head;
+
+ /*decode cylinder and heads for large volumes */
+ cyl = ptr->hh & 0xFFF0;
+ cyl <<= 12;
+ cyl |= ptr->cc;
+ head = ptr->hh & 0x000F;
+ return cyl * geo->heads * geo->sectors +
+ head * geo->sectors +
+ ptr->b;
+}
+
+/*
+ */
+int ibm_partition(struct parsed_partitions *state)
+{
+ struct block_device *bdev = state->bdev;
+ int blocksize, res;
+ loff_t i_size, offset, size, fmt_size;
+ dasd_information2_t *info;
+ struct hd_geometry *geo;
+ char type[5] = {0,};
+ char name[7] = {0,};
+ union label_t {
+ struct vtoc_volume_label_cdl vol;
+ struct vtoc_volume_label_ldl lnx;
+ struct vtoc_cms_label cms;
+ } *label;
+ unsigned char *data;
+ Sector sect;
+ sector_t labelsect;
+ char tmp[64];
+
+ res = 0;
+ blocksize = bdev_logical_block_size(bdev);
+ if (blocksize <= 0)
+ goto out_exit;
+ i_size = i_size_read(bdev->bd_inode);
+ if (i_size == 0)
+ goto out_exit;
+
+ info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
+ if (info == NULL)
+ goto out_exit;
+ geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
+ if (geo == NULL)
+ goto out_nogeo;
+ label = kmalloc(sizeof(union label_t), GFP_KERNEL);
+ if (label == NULL)
+ goto out_nolab;
+
+ if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0 ||
+ ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
+ goto out_freeall;
+
+ /*
+ * Special case for FBA disks: label sector does not depend on
+ * blocksize.
+ */
+ if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+ (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+ labelsect = info->label_block;
+ else
+ labelsect = info->label_block * (blocksize >> 9);
+
+ /*
+ * Get volume label, extract name and type.
+ */
+ data = read_part_sector(state, labelsect, &sect);
+ if (data == NULL)
+ goto out_readerr;
+
+ memcpy(label, data, sizeof(union label_t));
+ put_dev_sector(sect);
+
+ if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
+ strncpy(type, label->vol.vollbl, 4);
+ strncpy(name, label->vol.volid, 6);
+ } else {
+ strncpy(type, label->lnx.vollbl, 4);
+ strncpy(name, label->lnx.volid, 6);
+ }
+ EBCASC(type, 4);
+ EBCASC(name, 6);
+
+ res = 1;
+
+ /*
+ * Three different formats: LDL, CDL and unformated disk
+ *
+ * identified by info->format
+ *
+ * unformated disks we do not have to care about
+ */
+ if (info->format == DASD_FORMAT_LDL) {
+ if (strncmp(type, "CMS1", 4) == 0) {
+ /*
+ * VM style CMS1 labeled disk
+ */
+ blocksize = label->cms.block_size;
+ if (label->cms.disk_offset != 0) {
+ snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ /* disk is reserved minidisk */
+ offset = label->cms.disk_offset;
+ size = (label->cms.block_count - 1)
+ * (blocksize >> 9);
+ } else {
+ snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ offset = (info->label_block + 1);
+ size = label->cms.block_count
+ * (blocksize >> 9);
+ }
+ put_partition(state, 1, offset*(blocksize >> 9),
+ size-offset*(blocksize >> 9));
+ } else {
+ if (strncmp(type, "LNX1", 4) == 0) {
+ snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ if (label->lnx.ldl_version == 0xf2) {
+ fmt_size = label->lnx.formatted_blocks
+ * (blocksize >> 9);
+ } else if (!strcmp(info->type, "ECKD")) {
+ /* formated w/o large volume support */
+ fmt_size = geo->cylinders * geo->heads
+ * geo->sectors * (blocksize >> 9);
+ } else {
+ /* old label and no usable disk geometry
+ * (e.g. DIAG) */
+ fmt_size = i_size >> 9;
+ }
+ size = i_size >> 9;
+ if (fmt_size < size)
+ size = fmt_size;
+ offset = (info->label_block + 1);
+ } else {
+ /* unlabeled disk */
+ strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
+ size = i_size >> 9;
+ offset = (info->label_block + 1);
+ }
+ put_partition(state, 1, offset*(blocksize >> 9),
+ size-offset*(blocksize >> 9));
+ }
+ } else if (info->format == DASD_FORMAT_CDL) {
+ /*
+ * New style CDL formatted disk
+ */
+ sector_t blk;
+ int counter;
+
+ /*
+ * check if VOL1 label is available
+ * if not, something is wrong, skipping partition detection
+ */
+ if (strncmp(type, "VOL1", 4) == 0) {
+ snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ /*
+ * get block number and read then go through format1
+ * labels
+ */
+ blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
+ counter = 0;
+ data = read_part_sector(state, blk * (blocksize/512),
+ &sect);
+ while (data != NULL) {
+ struct vtoc_format1_label f1;
+
+ memcpy(&f1, data,
+ sizeof(struct vtoc_format1_label));
+ put_dev_sector(sect);
+
+ /* skip FMT4 / FMT5 / FMT7 labels */
+ if (f1.DS1FMTID == _ascebc['4']
+ || f1.DS1FMTID == _ascebc['5']
+ || f1.DS1FMTID == _ascebc['7']
+ || f1.DS1FMTID == _ascebc['9']) {
+ blk++;
+ data = read_part_sector(state,
+ blk * (blocksize/512), &sect);
+ continue;
+ }
+
+ /* only FMT1 and 8 labels valid at this point */
+ if (f1.DS1FMTID != _ascebc['1'] &&
+ f1.DS1FMTID != _ascebc['8'])
+ break;
+
+ /* OK, we got valid partition data */
+ offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
+ size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
+ offset + geo->sectors;
+ if (counter >= state->limit)
+ break;
+ put_partition(state, counter + 1,
+ offset * (blocksize >> 9),
+ size * (blocksize >> 9));
+ counter++;
+ blk++;
+ data = read_part_sector(state,
+ blk * (blocksize/512), &sect);
+ }
+
+ if (!data)
+ /* Are we not supposed to report this ? */
+ goto out_readerr;
+ } else
+ printk(KERN_WARNING "Warning, expected Label VOL1 not "
+ "found, treating as CDL formated Disk");
+
+ }
+
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ goto out_freeall;
+
+
+out_readerr:
+ res = -1;
+out_freeall:
+ kfree(label);
+out_nolab:
+ kfree(geo);
+out_nogeo:
+ kfree(info);
+out_exit:
+ return res;
+}
diff --git a/block/partitions/ibm.h b/block/partitions/ibm.h
new file mode 100644
index 000000000000..08fb0804a812
--- /dev/null
+++ b/block/partitions/ibm.h
@@ -0,0 +1 @@
+int ibm_partition(struct parsed_partitions *);
diff --git a/block/partitions/karma.c b/block/partitions/karma.c
new file mode 100644
index 000000000000..0ea19312706b
--- /dev/null
+++ b/block/partitions/karma.c
@@ -0,0 +1,57 @@
+/*
+ * fs/partitions/karma.c
+ * Rio Karma partition info.
+ *
+ * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
+ * based on osf.c
+ */
+
+#include "check.h"
+#include "karma.h"
+
+int karma_partition(struct parsed_partitions *state)
+{
+ int i;
+ int slot = 1;
+ Sector sect;
+ unsigned char *data;
+ struct disklabel {
+ u8 d_reserved[270];
+ struct d_partition {
+ __le32 p_res;
+ u8 p_fstype;
+ u8 p_res2[3];
+ __le32 p_offset;
+ __le32 p_size;
+ } d_partitions[2];
+ u8 d_blank[208];
+ __le16 d_magic;
+ } __attribute__((packed)) *label;
+ struct d_partition *p;
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+
+ label = (struct disklabel *)data;
+ if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ p = label->d_partitions;
+ for (i = 0 ; i < 2; i++, p++) {
+ if (slot == state->limit)
+ break;
+
+ if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
+ put_partition(state, slot, le32_to_cpu(p->p_offset),
+ le32_to_cpu(p->p_size));
+ }
+ slot++;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+}
+
diff --git a/block/partitions/karma.h b/block/partitions/karma.h
new file mode 100644
index 000000000000..c764b2e9df21
--- /dev/null
+++ b/block/partitions/karma.h
@@ -0,0 +1,8 @@
+/*
+ * fs/partitions/karma.h
+ */
+
+#define KARMA_LABEL_MAGIC 0xAB56
+
+int karma_partition(struct parsed_partitions *state);
+
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
new file mode 100644
index 000000000000..e507cfbd044e
--- /dev/null
+++ b/block/partitions/ldm.c
@@ -0,0 +1,1567 @@
+/**
+ * ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
+ *
+ * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
+ * Copyright (c) 2001-2012 Anton Altaparmakov
+ * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
+ *
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option) any later
+ * version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program (in the main directory of the source in the file COPYING); if
+ * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ */
+
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/stringify.h>
+#include <linux/kernel.h>
+#include "ldm.h"
+#include "check.h"
+#include "msdos.h"
+
+/**
+ * ldm_debug/info/error/crit - Output an error message
+ * @f: A printf format string containing the message
+ * @...: Variables to substitute into @f
+ *
+ * ldm_debug() writes a DEBUG level message to the syslog but only if the
+ * driver was compiled with debug enabled. Otherwise, the call turns into a NOP.
+ */
+#ifndef CONFIG_LDM_DEBUG
+#define ldm_debug(...) do {} while (0)
+#else
+#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a)
+#endif
+
+#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a)
+#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a)
+#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a)
+
+static __printf(3, 4)
+void _ldm_printk(const char *level, const char *function, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start (args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%s%s(): %pV\n", level, function, &vaf);
+
+ va_end(args);
+}
+
+/**
+ * ldm_parse_hexbyte - Convert a ASCII hex number to a byte
+ * @src: Pointer to at least 2 characters to convert.
+ *
+ * Convert a two character ASCII hex string to a number.
+ *
+ * Return: 0-255 Success, the byte was parsed correctly
+ * -1 Error, an invalid character was supplied
+ */
+static int ldm_parse_hexbyte (const u8 *src)
+{
+ unsigned int x; /* For correct wrapping */
+ int h;
+
+ /* high part */
+ x = h = hex_to_bin(src[0]);
+ if (h < 0)
+ return -1;
+
+ /* low part */
+ h = hex_to_bin(src[1]);
+ if (h < 0)
+ return -1;
+
+ return (x << 4) + h;
+}
+
+/**
+ * ldm_parse_guid - Convert GUID from ASCII to binary
+ * @src: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba
+ * @dest: Memory block to hold binary GUID (16 bytes)
+ *
+ * N.B. The GUID need not be NULL terminated.
+ *
+ * Return: 'true' @dest contains binary GUID
+ * 'false' @dest contents are undefined
+ */
+static bool ldm_parse_guid (const u8 *src, u8 *dest)
+{
+ static const int size[] = { 4, 2, 2, 2, 6 };
+ int i, j, v;
+
+ if (src[8] != '-' || src[13] != '-' ||
+ src[18] != '-' || src[23] != '-')
+ return false;
+
+ for (j = 0; j < 5; j++, src++)
+ for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
+ if ((v = ldm_parse_hexbyte (src)) < 0)
+ return false;
+
+ return true;
+}
+
+/**
+ * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure
+ * @data: Raw database PRIVHEAD structure loaded from the device
+ * @ph: In-memory privhead structure in which to return parsed information
+ *
+ * This parses the LDM database PRIVHEAD structure supplied in @data and
+ * sets up the in-memory privhead structure @ph with the obtained information.
+ *
+ * Return: 'true' @ph contains the PRIVHEAD data
+ * 'false' @ph contents are undefined
+ */
+static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
+{
+ bool is_vista = false;
+
+ BUG_ON(!data || !ph);
+ if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
+ ldm_error("Cannot find PRIVHEAD structure. LDM database is"
+ " corrupt. Aborting.");
+ return false;
+ }
+ ph->ver_major = get_unaligned_be16(data + 0x000C);
+ ph->ver_minor = get_unaligned_be16(data + 0x000E);
+ ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
+ ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
+ ph->config_start = get_unaligned_be64(data + 0x012B);
+ ph->config_size = get_unaligned_be64(data + 0x0133);
+ /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
+ if (ph->ver_major == 2 && ph->ver_minor == 12)
+ is_vista = true;
+ if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) {
+ ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d."
+ " Aborting.", ph->ver_major, ph->ver_minor);
+ return false;
+ }
+ ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major,
+ ph->ver_minor, is_vista ? "Vista" : "2000/XP");
+ if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */
+ /* Warn the user and continue, carefully. */
+ ldm_info("Database is normally %u bytes, it claims to "
+ "be %llu bytes.", LDM_DB_SIZE,
+ (unsigned long long)ph->config_size);
+ }
+ if ((ph->logical_disk_size == 0) || (ph->logical_disk_start +
+ ph->logical_disk_size > ph->config_start)) {
+ ldm_error("PRIVHEAD disk size doesn't match real disk size");
+ return false;
+ }
+ if (!ldm_parse_guid(data + 0x0030, ph->disk_id)) {
+ ldm_error("PRIVHEAD contains an invalid GUID.");
+ return false;
+ }
+ ldm_debug("Parsed PRIVHEAD successfully.");
+ return true;
+}
+
+/**
+ * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure
+ * @data: Raw database TOCBLOCK structure loaded from the device
+ * @toc: In-memory toc structure in which to return parsed information
+ *
+ * This parses the LDM Database TOCBLOCK (table of contents) structure supplied
+ * in @data and sets up the in-memory tocblock structure @toc with the obtained
+ * information.
+ *
+ * N.B. The *_start and *_size values returned in @toc are not range-checked.
+ *
+ * Return: 'true' @toc contains the TOCBLOCK data
+ * 'false' @toc contents are undefined
+ */
+static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
+{
+ BUG_ON (!data || !toc);
+
+ if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
+ ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
+ return false;
+ }
+ strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
+ toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
+ toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
+ toc->bitmap1_size = get_unaligned_be64(data + 0x36);
+
+ if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
+ sizeof (toc->bitmap1_name)) != 0) {
+ ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.",
+ TOC_BITMAP1, toc->bitmap1_name);
+ return false;
+ }
+ strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
+ toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
+ toc->bitmap2_start = get_unaligned_be64(data + 0x50);
+ toc->bitmap2_size = get_unaligned_be64(data + 0x58);
+ if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
+ sizeof (toc->bitmap2_name)) != 0) {
+ ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
+ TOC_BITMAP2, toc->bitmap2_name);
+ return false;
+ }
+ ldm_debug ("Parsed TOCBLOCK successfully.");
+ return true;
+}
+
+/**
+ * ldm_parse_vmdb - Read the LDM Database VMDB structure
+ * @data: Raw database VMDB structure loaded from the device
+ * @vm: In-memory vmdb structure in which to return parsed information
+ *
+ * This parses the LDM Database VMDB structure supplied in @data and sets up
+ * the in-memory vmdb structure @vm with the obtained information.
+ *
+ * N.B. The *_start, *_size and *_seq values will be range-checked later.
+ *
+ * Return: 'true' @vm contains VMDB info
+ * 'false' @vm contents are undefined
+ */
+static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
+{
+ BUG_ON (!data || !vm);
+
+ if (MAGIC_VMDB != get_unaligned_be32(data)) {
+ ldm_crit ("Cannot find the VMDB, database may be corrupt.");
+ return false;
+ }
+
+ vm->ver_major = get_unaligned_be16(data + 0x12);
+ vm->ver_minor = get_unaligned_be16(data + 0x14);
+ if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
+ ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
+ "Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
+ return false;
+ }
+
+ vm->vblk_size = get_unaligned_be32(data + 0x08);
+ if (vm->vblk_size == 0) {
+ ldm_error ("Illegal VBLK size");
+ return false;
+ }
+
+ vm->vblk_offset = get_unaligned_be32(data + 0x0C);
+ vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
+
+ ldm_debug ("Parsed VMDB successfully.");
+ return true;
+}
+
+/**
+ * ldm_compare_privheads - Compare two privhead objects
+ * @ph1: First privhead
+ * @ph2: Second privhead
+ *
+ * This compares the two privhead structures @ph1 and @ph2.
+ *
+ * Return: 'true' Identical
+ * 'false' Different
+ */
+static bool ldm_compare_privheads (const struct privhead *ph1,
+ const struct privhead *ph2)
+{
+ BUG_ON (!ph1 || !ph2);
+
+ return ((ph1->ver_major == ph2->ver_major) &&
+ (ph1->ver_minor == ph2->ver_minor) &&
+ (ph1->logical_disk_start == ph2->logical_disk_start) &&
+ (ph1->logical_disk_size == ph2->logical_disk_size) &&
+ (ph1->config_start == ph2->config_start) &&
+ (ph1->config_size == ph2->config_size) &&
+ !memcmp (ph1->disk_id, ph2->disk_id, GUID_SIZE));
+}
+
+/**
+ * ldm_compare_tocblocks - Compare two tocblock objects
+ * @toc1: First toc
+ * @toc2: Second toc
+ *
+ * This compares the two tocblock structures @toc1 and @toc2.
+ *
+ * Return: 'true' Identical
+ * 'false' Different
+ */
+static bool ldm_compare_tocblocks (const struct tocblock *toc1,
+ const struct tocblock *toc2)
+{
+ BUG_ON (!toc1 || !toc2);
+
+ return ((toc1->bitmap1_start == toc2->bitmap1_start) &&
+ (toc1->bitmap1_size == toc2->bitmap1_size) &&
+ (toc1->bitmap2_start == toc2->bitmap2_start) &&
+ (toc1->bitmap2_size == toc2->bitmap2_size) &&
+ !strncmp (toc1->bitmap1_name, toc2->bitmap1_name,
+ sizeof (toc1->bitmap1_name)) &&
+ !strncmp (toc1->bitmap2_name, toc2->bitmap2_name,
+ sizeof (toc1->bitmap2_name)));
+}
+
+/**
+ * ldm_validate_privheads - Compare the primary privhead with its backups
+ * @state: Partition check state including device holding the LDM Database
+ * @ph1: Memory struct to fill with ph contents
+ *
+ * Read and compare all three privheads from disk.
+ *
+ * The privheads on disk show the size and location of the main disk area and
+ * the configuration area (the database). The values are range-checked against
+ * @hd, which contains the real size of the disk.
+ *
+ * Return: 'true' Success
+ * 'false' Error
+ */
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+ struct privhead *ph1)
+{
+ static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
+ struct privhead *ph[3] = { ph1 };
+ Sector sect;
+ u8 *data;
+ bool result = false;
+ long num_sects;
+ int i;
+
+ BUG_ON (!state || !ph1);
+
+ ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
+ ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
+ if (!ph[1] || !ph[2]) {
+ ldm_crit ("Out of memory.");
+ goto out;
+ }
+
+ /* off[1 & 2] are relative to ph[0]->config_start */
+ ph[0]->config_start = 0;
+
+ /* Read and parse privheads */
+ for (i = 0; i < 3; i++) {
+ data = read_part_sector(state, ph[0]->config_start + off[i],
+ &sect);
+ if (!data) {
+ ldm_crit ("Disk read failed.");
+ goto out;
+ }
+ result = ldm_parse_privhead (data, ph[i]);
+ put_dev_sector (sect);
+ if (!result) {
+ ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */
+ if (i < 2)
+ goto out; /* Already logged */
+ else
+ break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */
+ }
+ }
+
+ num_sects = state->bdev->bd_inode->i_size >> 9;
+
+ if ((ph[0]->config_start > num_sects) ||
+ ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
+ ldm_crit ("Database extends beyond the end of the disk.");
+ goto out;
+ }
+
+ if ((ph[0]->logical_disk_start > ph[0]->config_start) ||
+ ((ph[0]->logical_disk_start + ph[0]->logical_disk_size)
+ > ph[0]->config_start)) {
+ ldm_crit ("Disk and database overlap.");
+ goto out;
+ }
+
+ if (!ldm_compare_privheads (ph[0], ph[1])) {
+ ldm_crit ("Primary and backup PRIVHEADs don't match.");
+ goto out;
+ }
+ /* FIXME ignore this for now
+ if (!ldm_compare_privheads (ph[0], ph[2])) {
+ ldm_crit ("Primary and backup PRIVHEADs don't match.");
+ goto out;
+ }*/
+ ldm_debug ("Validated PRIVHEADs successfully.");
+ result = true;
+out:
+ kfree (ph[1]);
+ kfree (ph[2]);
+ return result;
+}
+
+/**
+ * ldm_validate_tocblocks - Validate the table of contents and its backups
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
+ * @ldb: Cache of the database structures
+ *
+ * Find and compare the four tables of contents of the LDM Database stored on
+ * @state->bdev and return the parsed information into @toc1.
+ *
+ * The offsets and sizes of the configs are range-checked against a privhead.
+ *
+ * Return: 'true' @toc1 contains validated TOCBLOCK info
+ * 'false' @toc1 contents are undefined
+ */
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
+{
+ static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
+ struct tocblock *tb[4];
+ struct privhead *ph;
+ Sector sect;
+ u8 *data;
+ int i, nr_tbs;
+ bool result = false;
+
+ BUG_ON(!state || !ldb);
+ ph = &ldb->ph;
+ tb[0] = &ldb->toc;
+ tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
+ if (!tb[1]) {
+ ldm_crit("Out of memory.");
+ goto err;
+ }
+ tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1]));
+ tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2]));
+ /*
+ * Try to read and parse all four TOCBLOCKs.
+ *
+ * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so
+ * skip any that fail as long as we get at least one valid TOCBLOCK.
+ */
+ for (nr_tbs = i = 0; i < 4; i++) {
+ data = read_part_sector(state, base + off[i], &sect);
+ if (!data) {
+ ldm_error("Disk read failed for TOCBLOCK %d.", i);
+ continue;
+ }
+ if (ldm_parse_tocblock(data, tb[nr_tbs]))
+ nr_tbs++;
+ put_dev_sector(sect);
+ }
+ if (!nr_tbs) {
+ ldm_crit("Failed to find a valid TOCBLOCK.");
+ goto err;
+ }
+ /* Range check the TOCBLOCK against a privhead. */
+ if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) ||
+ ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) >
+ ph->config_size)) {
+ ldm_crit("The bitmaps are out of range. Giving up.");
+ goto err;
+ }
+ /* Compare all loaded TOCBLOCKs. */
+ for (i = 1; i < nr_tbs; i++) {
+ if (!ldm_compare_tocblocks(tb[0], tb[i])) {
+ ldm_crit("TOCBLOCKs 0 and %d do not match.", i);
+ goto err;
+ }
+ }
+ ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs);
+ result = true;
+err:
+ kfree(tb[1]);
+ return result;
+}
+
+/**
+ * ldm_validate_vmdb - Read the VMDB and validate it
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @bdev, of the database
+ * @ldb: Cache of the database structures
+ *
+ * Find the vmdb of the LDM Database stored on @bdev and return the parsed
+ * information in @ldb.
+ *
+ * Return: 'true' @ldb contains validated VBDB info
+ * 'false' @ldb contents are undefined
+ */
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
+{
+ Sector sect;
+ u8 *data;
+ bool result = false;
+ struct vmdb *vm;
+ struct tocblock *toc;
+
+ BUG_ON (!state || !ldb);
+
+ vm = &ldb->vm;
+ toc = &ldb->toc;
+
+ data = read_part_sector(state, base + OFF_VMDB, &sect);
+ if (!data) {
+ ldm_crit ("Disk read failed.");
+ return false;
+ }
+
+ if (!ldm_parse_vmdb (data, vm))
+ goto out; /* Already logged */
+
+ /* Are there uncommitted transactions? */
+ if (get_unaligned_be16(data + 0x10) != 0x01) {
+ ldm_crit ("Database is not in a consistent state. Aborting.");
+ goto out;
+ }
+
+ if (vm->vblk_offset != 512)
+ ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset);
+
+ /*
+ * The last_vblkd_seq can be before the end of the vmdb, just make sure
+ * it is not out of bounds.
+ */
+ if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) {
+ ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. "
+ "Database is corrupt. Aborting.");
+ goto out;
+ }
+
+ result = true;
+out:
+ put_dev_sector (sect);
+ return result;
+}
+
+
+/**
+ * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
+ * @state: Partition check state including device holding the LDM Database
+ *
+ * This function provides a weak test to decide whether the device is a dynamic
+ * disk or not. It looks for an MS-DOS-style partition table containing at
+ * least one partition of type 0x42 (formerly SFS, now used by Windows for
+ * dynamic disks).
+ *
+ * N.B. The only possible error can come from the read_part_sector and that is
+ * only likely to happen if the underlying device is strange. If that IS
+ * the case we should return zero to let someone else try.
+ *
+ * Return: 'true' @state->bdev is a dynamic disk
+ * 'false' @state->bdev is not a dynamic disk, or an error occurred
+ */
+static bool ldm_validate_partition_table(struct parsed_partitions *state)
+{
+ Sector sect;
+ u8 *data;
+ struct partition *p;
+ int i;
+ bool result = false;
+
+ BUG_ON(!state);
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data) {
+ ldm_info ("Disk read failed.");
+ return false;
+ }
+
+ if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC))
+ goto out;
+
+ p = (struct partition*)(data + 0x01BE);
+ for (i = 0; i < 4; i++, p++)
+ if (SYS_IND (p) == LDM_PARTITION) {
+ result = true;
+ break;
+ }
+
+ if (result)
+ ldm_debug ("Found W2K dynamic disk partition type.");
+
+out:
+ put_dev_sector (sect);
+ return result;
+}
+
+/**
+ * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id
+ * @ldb: Cache of the database structures
+ *
+ * The LDM Database contains a list of all partitions on all dynamic disks.
+ * The primary PRIVHEAD, at the beginning of the physical disk, tells us
+ * the GUID of this disk. This function searches for the GUID in a linked
+ * list of vblk's.
+ *
+ * Return: Pointer, A matching vblk was found
+ * NULL, No match, or an error
+ */
+static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
+{
+ struct list_head *item;
+
+ BUG_ON (!ldb);
+
+ list_for_each (item, &ldb->v_disk) {
+ struct vblk *v = list_entry (item, struct vblk, list);
+ if (!memcmp (v->vblk.disk.disk_id, ldb->ph.disk_id, GUID_SIZE))
+ return v;
+ }
+
+ return NULL;
+}
+
+/**
+ * ldm_create_data_partitions - Create data partitions for this device
+ * @pp: List of the partitions parsed so far
+ * @ldb: Cache of the database structures
+ *
+ * The database contains ALL the partitions for ALL disk groups, so we need to
+ * filter out this specific disk. Using the disk's object id, we can find all
+ * the partitions in the database that belong to this disk.
+ *
+ * Add each partition in our database, to the parsed_partitions structure.
+ *
+ * N.B. This function creates the partitions in the order it finds partition
+ * objects in the linked list.
+ *
+ * Return: 'true' Partition created
+ * 'false' Error, probably a range checking problem
+ */
+static bool ldm_create_data_partitions (struct parsed_partitions *pp,
+ const struct ldmdb *ldb)
+{
+ struct list_head *item;
+ struct vblk *vb;
+ struct vblk *disk;
+ struct vblk_part *part;
+ int part_num = 1;
+
+ BUG_ON (!pp || !ldb);
+
+ disk = ldm_get_disk_objid (ldb);
+ if (!disk) {
+ ldm_crit ("Can't find the ID of this disk in the database.");
+ return false;
+ }
+
+ strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
+
+ /* Create the data partitions */
+ list_for_each (item, &ldb->v_part) {
+ vb = list_entry (item, struct vblk, list);
+ part = &vb->vblk.part;
+
+ if (part->disk_id != disk->obj_id)
+ continue;
+
+ put_partition (pp, part_num, ldb->ph.logical_disk_start +
+ part->start, part->size);
+ part_num++;
+ }
+
+ strlcat(pp->pp_buf, "\n", PAGE_SIZE);
+ return true;
+}
+
+
+/**
+ * ldm_relative - Calculate the next relative offset
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @base: Size of the previous fixed width fields
+ * @offset: Cumulative size of the previous variable-width fields
+ *
+ * Because many of the VBLK fields are variable-width, it's necessary
+ * to calculate each offset based on the previous one and the length
+ * of the field it pointed to.
+ *
+ * Return: -1 Error, the calculated offset exceeded the size of the buffer
+ * n OK, a range-checked offset into buffer
+ */
+static int ldm_relative(const u8 *buffer, int buflen, int base, int offset)
+{
+
+ base += offset;
+ if (!buffer || offset < 0 || base > buflen) {
+ if (!buffer)
+ ldm_error("!buffer");
+ if (offset < 0)
+ ldm_error("offset (%d) < 0", offset);
+ if (base > buflen)
+ ldm_error("base (%d) > buflen (%d)", base, buflen);
+ return -1;
+ }
+ if (base + buffer[base] >= buflen) {
+ ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base,
+ buffer[base], buflen);
+ return -1;
+ }
+ return buffer[base] + offset + 1;
+}
+
+/**
+ * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order
+ * @block: Pointer to the variable-width number to convert
+ *
+ * Large numbers in the LDM Database are often stored in a packed format. Each
+ * number is prefixed by a one byte width marker. All numbers in the database
+ * are stored in big-endian byte order. This function reads one of these
+ * numbers and returns the result
+ *
+ * N.B. This function DOES NOT perform any range checking, though the most
+ * it will read is eight bytes.
+ *
+ * Return: n A number
+ * 0 Zero, or an error occurred
+ */
+static u64 ldm_get_vnum (const u8 *block)
+{
+ u64 tmp = 0;
+ u8 length;
+
+ BUG_ON (!block);
+
+ length = *block++;
+
+ if (length && length <= 8)
+ while (length--)
+ tmp = (tmp << 8) | *block++;
+ else
+ ldm_error ("Illegal length %d.", length);
+
+ return tmp;
+}
+
+/**
+ * ldm_get_vstr - Read a length-prefixed string into a buffer
+ * @block: Pointer to the length marker
+ * @buffer: Location to copy string to
+ * @buflen: Size of the output buffer
+ *
+ * Many of the strings in the LDM Database are not NULL terminated. Instead
+ * they are prefixed by a one byte length marker. This function copies one of
+ * these strings into a buffer.
+ *
+ * N.B. This function DOES NOT perform any range checking on the input.
+ * If the buffer is too small, the output will be truncated.
+ *
+ * Return: 0, Error and @buffer contents are undefined
+ * n, String length in characters (excluding NULL)
+ * buflen-1, String was truncated.
+ */
+static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen)
+{
+ int length;
+
+ BUG_ON (!block || !buffer);
+
+ length = block[0];
+ if (length >= buflen) {
+ ldm_error ("Truncating string %d -> %d.", length, buflen);
+ length = buflen - 1;
+ }
+ memcpy (buffer, block + 1, length);
+ buffer[length] = 0;
+ return length;
+}
+
+
+/**
+ * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Component object (version 3) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Component VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len;
+ struct vblk_comp *comp;
+
+ BUG_ON (!buffer || !vb);
+
+ r_objid = ldm_relative (buffer, buflen, 0x18, 0);
+ r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
+ r_vstate = ldm_relative (buffer, buflen, 0x18, r_name);
+ r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate);
+ r_parent = ldm_relative (buffer, buflen, 0x2D, r_child);
+
+ if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) {
+ r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent);
+ r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe);
+ len = r_cols;
+ } else {
+ r_stripe = 0;
+ r_cols = 0;
+ len = r_parent;
+ }
+ if (len < 0)
+ return false;
+
+ len += VBLK_SIZE_CMP3;
+ if (len != get_unaligned_be32(buffer + 0x14))
+ return false;
+
+ comp = &vb->vblk.comp;
+ ldm_get_vstr (buffer + 0x18 + r_name, comp->state,
+ sizeof (comp->state));
+ comp->type = buffer[0x18 + r_vstate];
+ comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate);
+ comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child);
+ comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0;
+
+ return true;
+}
+
+/**
+ * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Disk Group object (version 3) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Disk Group VBLK
+ * 'false' @vb contents are not defined
+ */
+static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, r_diskid, r_id1, r_id2, len;
+ struct vblk_dgrp *dgrp;
+
+ BUG_ON (!buffer || !vb);
+
+ r_objid = ldm_relative (buffer, buflen, 0x18, 0);
+ r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
+ r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
+
+ if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) {
+ r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid);
+ r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1);
+ len = r_id2;
+ } else {
+ r_id1 = 0;
+ r_id2 = 0;
+ len = r_diskid;
+ }
+ if (len < 0)
+ return false;
+
+ len += VBLK_SIZE_DGR3;
+ if (len != get_unaligned_be32(buffer + 0x14))
+ return false;
+
+ dgrp = &vb->vblk.dgrp;
+ ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id,
+ sizeof (dgrp->disk_id));
+ return true;
+}
+
+/**
+ * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Disk Group object (version 4) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Disk Group VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
+{
+ char buf[64];
+ int r_objid, r_name, r_id1, r_id2, len;
+ struct vblk_dgrp *dgrp;
+
+ BUG_ON (!buffer || !vb);
+
+ r_objid = ldm_relative (buffer, buflen, 0x18, 0);
+ r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
+
+ if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) {
+ r_id1 = ldm_relative (buffer, buflen, 0x44, r_name);
+ r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1);
+ len = r_id2;
+ } else {
+ r_id1 = 0;
+ r_id2 = 0;
+ len = r_name;
+ }
+ if (len < 0)
+ return false;
+
+ len += VBLK_SIZE_DGR4;
+ if (len != get_unaligned_be32(buffer + 0x14))
+ return false;
+
+ dgrp = &vb->vblk.dgrp;
+
+ ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
+ return true;
+}
+
+/**
+ * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Disk object (version 3) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Disk VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, r_diskid, r_altname, len;
+ struct vblk_disk *disk;
+
+ BUG_ON (!buffer || !vb);
+
+ r_objid = ldm_relative (buffer, buflen, 0x18, 0);
+ r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
+ r_diskid = ldm_relative (buffer, buflen, 0x18, r_name);
+ r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid);
+ len = r_altname;
+ if (len < 0)
+ return false;
+
+ len += VBLK_SIZE_DSK3;
+ if (len != get_unaligned_be32(buffer + 0x14))
+ return false;
+
+ disk = &vb->vblk.disk;
+ ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
+ sizeof (disk->alt_name));
+ if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
+ return false;
+
+ return true;
+}
+
+/**
+ * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Disk object (version 4) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Disk VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, len;
+ struct vblk_disk *disk;
+
+ BUG_ON (!buffer || !vb);
+
+ r_objid = ldm_relative (buffer, buflen, 0x18, 0);
+ r_name = ldm_relative (buffer, buflen, 0x18, r_objid);
+ len = r_name;
+ if (len < 0)
+ return false;
+
+ len += VBLK_SIZE_DSK4;
+ if (len != get_unaligned_be32(buffer + 0x14))
+ return false;
+
+ disk = &vb->vblk.disk;
+ memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
+ return true;
+}
+
+/**
+ * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Partition object (version 3) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Partition VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
+ struct vblk_part *part;
+
+ BUG_ON(!buffer || !vb);
+ r_objid = ldm_relative(buffer, buflen, 0x18, 0);
+ if (r_objid < 0) {
+ ldm_error("r_objid %d < 0", r_objid);
+ return false;
+ }
+ r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
+ if (r_name < 0) {
+ ldm_error("r_name %d < 0", r_name);
+ return false;
+ }
+ r_size = ldm_relative(buffer, buflen, 0x34, r_name);
+ if (r_size < 0) {
+ ldm_error("r_size %d < 0", r_size);
+ return false;
+ }
+ r_parent = ldm_relative(buffer, buflen, 0x34, r_size);
+ if (r_parent < 0) {
+ ldm_error("r_parent %d < 0", r_parent);
+ return false;
+ }
+ r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent);
+ if (r_diskid < 0) {
+ ldm_error("r_diskid %d < 0", r_diskid);
+ return false;
+ }
+ if (buffer[0x12] & VBLK_FLAG_PART_INDEX) {
+ r_index = ldm_relative(buffer, buflen, 0x34, r_diskid);
+ if (r_index < 0) {
+ ldm_error("r_index %d < 0", r_index);
+ return false;
+ }
+ len = r_index;
+ } else {
+ r_index = 0;
+ len = r_diskid;
+ }
+ if (len < 0) {
+ ldm_error("len %d < 0", len);
+ return false;
+ }
+ len += VBLK_SIZE_PRT3;
+ if (len > get_unaligned_be32(buffer + 0x14)) {
+ ldm_error("len %d > BE32(buffer + 0x14) %d", len,
+ get_unaligned_be32(buffer + 0x14));
+ return false;
+ }
+ part = &vb->vblk.part;
+ part->start = get_unaligned_be64(buffer + 0x24 + r_name);
+ part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
+ part->size = ldm_get_vnum(buffer + 0x34 + r_name);
+ part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
+ part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
+ if (vb->flags & VBLK_FLAG_PART_INDEX)
+ part->partnum = buffer[0x35 + r_diskid];
+ else
+ part->partnum = 0;
+ return true;
+}
+
+/**
+ * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure
+ * @buffer: Block of data being worked on
+ * @buflen: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK Volume object (version 5) into a vblk structure.
+ *
+ * Return: 'true' @vb contains a Volume VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
+{
+ int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size;
+ int r_id1, r_id2, r_size2, r_drive, len;
+ struct vblk_volu *volu;
+
+ BUG_ON(!buffer || !vb);
+ r_objid = ldm_relative(buffer, buflen, 0x18, 0);
+ if (r_objid < 0) {
+ ldm_error("r_objid %d < 0", r_objid);
+ return false;
+ }
+ r_name = ldm_relative(buffer, buflen, 0x18, r_objid);
+ if (r_name < 0) {
+ ldm_error("r_name %d < 0", r_name);
+ return false;
+ }
+ r_vtype = ldm_relative(buffer, buflen, 0x18, r_name);
+ if (r_vtype < 0) {
+ ldm_error("r_vtype %d < 0", r_vtype);
+ return false;
+ }
+ r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype);
+ if (r_disable_drive_letter < 0) {
+ ldm_error("r_disable_drive_letter %d < 0",
+ r_disable_drive_letter);
+ return false;
+ }
+ r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter);
+ if (r_child < 0) {
+ ldm_error("r_child %d < 0", r_child);
+ return false;
+ }
+ r_size = ldm_relative(buffer, buflen, 0x3D, r_child);
+ if (r_size < 0) {
+ ldm_error("r_size %d < 0", r_size);
+ return false;
+ }
+ if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) {
+ r_id1 = ldm_relative(buffer, buflen, 0x52, r_size);
+ if (r_id1 < 0) {
+ ldm_error("r_id1 %d < 0", r_id1);
+ return false;
+ }
+ } else
+ r_id1 = r_size;
+ if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) {
+ r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1);
+ if (r_id2 < 0) {
+ ldm_error("r_id2 %d < 0", r_id2);
+ return false;
+ }
+ } else
+ r_id2 = r_id1;
+ if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) {
+ r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2);
+ if (r_size2 < 0) {
+ ldm_error("r_size2 %d < 0", r_size2);
+ return false;
+ }
+ } else
+ r_size2 = r_id2;
+ if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
+ r_drive = ldm_relative(buffer, buflen, 0x52, r_size2);
+ if (r_drive < 0) {
+ ldm_error("r_drive %d < 0", r_drive);
+ return false;
+ }
+ } else
+ r_drive = r_size2;
+ len = r_drive;
+ if (len < 0) {
+ ldm_error("len %d < 0", len);
+ return false;
+ }
+ len += VBLK_SIZE_VOL5;
+ if (len > get_unaligned_be32(buffer + 0x14)) {
+ ldm_error("len %d > BE32(buffer + 0x14) %d", len,
+ get_unaligned_be32(buffer + 0x14));
+ return false;
+ }
+ volu = &vb->vblk.volu;
+ ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type,
+ sizeof(volu->volume_type));
+ memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter,
+ sizeof(volu->volume_state));
+ volu->size = ldm_get_vnum(buffer + 0x3D + r_child);
+ volu->partition_type = buffer[0x41 + r_size];
+ memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid));
+ if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) {
+ ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint,
+ sizeof(volu->drive_hint));
+ }
+ return true;
+}
+
+/**
+ * ldm_parse_vblk - Read a raw VBLK object into a vblk structure
+ * @buf: Block of data being worked on
+ * @len: Size of the block of data
+ * @vb: In-memory vblk in which to return information
+ *
+ * Read a raw VBLK object into a vblk structure. This function just reads the
+ * information common to all VBLK types, then delegates the rest of the work to
+ * helper functions: ldm_parse_*.
+ *
+ * Return: 'true' @vb contains a VBLK
+ * 'false' @vb contents are not defined
+ */
+static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
+{
+ bool result = false;
+ int r_objid;
+
+ BUG_ON (!buf || !vb);
+
+ r_objid = ldm_relative (buf, len, 0x18, 0);
+ if (r_objid < 0) {
+ ldm_error ("VBLK header is corrupt.");
+ return false;
+ }
+
+ vb->flags = buf[0x12];
+ vb->type = buf[0x13];
+ vb->obj_id = ldm_get_vnum (buf + 0x18);
+ ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name));
+
+ switch (vb->type) {
+ case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break;
+ case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break;
+ case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break;
+ case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break;
+ case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break;
+ case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break;
+ case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break;
+ }
+
+ if (result)
+ ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.",
+ (unsigned long long) vb->obj_id, vb->type);
+ else
+ ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).",
+ (unsigned long long) vb->obj_id, vb->type);
+
+ return result;
+}
+
+
+/**
+ * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database
+ * @data: Raw VBLK to add to the database
+ * @len: Size of the raw VBLK
+ * @ldb: Cache of the database structures
+ *
+ * The VBLKs are sorted into categories. Partitions are also sorted by offset.
+ *
+ * N.B. This function does not check the validity of the VBLKs.
+ *
+ * Return: 'true' The VBLK was added
+ * 'false' An error occurred
+ */
+static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
+{
+ struct vblk *vb;
+ struct list_head *item;
+
+ BUG_ON (!data || !ldb);
+
+ vb = kmalloc (sizeof (*vb), GFP_KERNEL);
+ if (!vb) {
+ ldm_crit ("Out of memory.");
+ return false;
+ }
+
+ if (!ldm_parse_vblk (data, len, vb)) {
+ kfree(vb);
+ return false; /* Already logged */
+ }
+
+ /* Put vblk into the correct list. */
+ switch (vb->type) {
+ case VBLK_DGR3:
+ case VBLK_DGR4:
+ list_add (&vb->list, &ldb->v_dgrp);
+ break;
+ case VBLK_DSK3:
+ case VBLK_DSK4:
+ list_add (&vb->list, &ldb->v_disk);
+ break;
+ case VBLK_VOL5:
+ list_add (&vb->list, &ldb->v_volu);
+ break;
+ case VBLK_CMP3:
+ list_add (&vb->list, &ldb->v_comp);
+ break;
+ case VBLK_PRT3:
+ /* Sort by the partition's start sector. */
+ list_for_each (item, &ldb->v_part) {
+ struct vblk *v = list_entry (item, struct vblk, list);
+ if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) &&
+ (v->vblk.part.start > vb->vblk.part.start)) {
+ list_add_tail (&vb->list, &v->list);
+ return true;
+ }
+ }
+ list_add_tail (&vb->list, &ldb->v_part);
+ break;
+ }
+ return true;
+}
+
+/**
+ * ldm_frag_add - Add a VBLK fragment to a list
+ * @data: Raw fragment to be added to the list
+ * @size: Size of the raw fragment
+ * @frags: Linked list of VBLK fragments
+ *
+ * Fragmented VBLKs may not be consecutive in the database, so they are placed
+ * in a list so they can be pieced together later.
+ *
+ * Return: 'true' Success, the VBLK was added to the list
+ * 'false' Error, a problem occurred
+ */
+static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
+{
+ struct frag *f;
+ struct list_head *item;
+ int rec, num, group;
+
+ BUG_ON (!data || !frags);
+
+ if (size < 2 * VBLK_SIZE_HEAD) {
+ ldm_error("Value of size is to small.");
+ return false;
+ }
+
+ group = get_unaligned_be32(data + 0x08);
+ rec = get_unaligned_be16(data + 0x0C);
+ num = get_unaligned_be16(data + 0x0E);
+ if ((num < 1) || (num > 4)) {
+ ldm_error ("A VBLK claims to have %d parts.", num);
+ return false;
+ }
+ if (rec >= num) {
+ ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
+ return false;
+ }
+
+ list_for_each (item, frags) {
+ f = list_entry (item, struct frag, list);
+ if (f->group == group)
+ goto found;
+ }
+
+ f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL);
+ if (!f) {
+ ldm_crit ("Out of memory.");
+ return false;
+ }
+
+ f->group = group;
+ f->num = num;
+ f->rec = rec;
+ f->map = 0xFF << num;
+
+ list_add_tail (&f->list, frags);
+found:
+ if (rec >= f->num) {
+ ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
+ return false;
+ }
+ if (f->map & (1 << rec)) {
+ ldm_error ("Duplicate VBLK, part %d.", rec);
+ f->map &= 0x7F; /* Mark the group as broken */
+ return false;
+ }
+ f->map |= (1 << rec);
+ if (!rec)
+ memcpy(f->data, data, VBLK_SIZE_HEAD);
+ data += VBLK_SIZE_HEAD;
+ size -= VBLK_SIZE_HEAD;
+ memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size);
+ return true;
+}
+
+/**
+ * ldm_frag_free - Free a linked list of VBLK fragments
+ * @list: Linked list of fragments
+ *
+ * Free a linked list of VBLK fragments
+ *
+ * Return: none
+ */
+static void ldm_frag_free (struct list_head *list)
+{
+ struct list_head *item, *tmp;
+
+ BUG_ON (!list);
+
+ list_for_each_safe (item, tmp, list)
+ kfree (list_entry (item, struct frag, list));
+}
+
+/**
+ * ldm_frag_commit - Validate fragmented VBLKs and add them to the database
+ * @frags: Linked list of VBLK fragments
+ * @ldb: Cache of the database structures
+ *
+ * Now that all the fragmented VBLKs have been collected, they must be added to
+ * the database for later use.
+ *
+ * Return: 'true' All the fragments we added successfully
+ * 'false' One or more of the fragments we invalid
+ */
+static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
+{
+ struct frag *f;
+ struct list_head *item;
+
+ BUG_ON (!frags || !ldb);
+
+ list_for_each (item, frags) {
+ f = list_entry (item, struct frag, list);
+
+ if (f->map != 0xFF) {
+ ldm_error ("VBLK group %d is incomplete (0x%02x).",
+ f->group, f->map);
+ return false;
+ }
+
+ if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb))
+ return false; /* Already logged */
+ }
+ return true;
+}
+
+/**
+ * ldm_get_vblks - Read the on-disk database of VBLKs into memory
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
+ * @ldb: Cache of the database structures
+ *
+ * To use the information from the VBLKs, they need to be read from the disk,
+ * unpacked and validated. We cache them in @ldb according to their type.
+ *
+ * Return: 'true' All the VBLKs were read successfully
+ * 'false' An error occurred
+ */
+static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
+ struct ldmdb *ldb)
+{
+ int size, perbuf, skip, finish, s, v, recs;
+ u8 *data = NULL;
+ Sector sect;
+ bool result = false;
+ LIST_HEAD (frags);
+
+ BUG_ON(!state || !ldb);
+
+ size = ldb->vm.vblk_size;
+ perbuf = 512 / size;
+ skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */
+ finish = (size * ldb->vm.last_vblk_seq) >> 9;
+
+ for (s = skip; s < finish; s++) { /* For each sector */
+ data = read_part_sector(state, base + OFF_VMDB + s, &sect);
+ if (!data) {
+ ldm_crit ("Disk read failed.");
+ goto out;
+ }
+
+ for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */
+ if (MAGIC_VBLK != get_unaligned_be32(data)) {
+ ldm_error ("Expected to find a VBLK.");
+ goto out;
+ }
+
+ recs = get_unaligned_be16(data + 0x0E); /* Number of records */
+ if (recs == 1) {
+ if (!ldm_ldmdb_add (data, size, ldb))
+ goto out; /* Already logged */
+ } else if (recs > 1) {
+ if (!ldm_frag_add (data, size, &frags))
+ goto out; /* Already logged */
+ }
+ /* else Record is not in use, ignore it. */
+ }
+ put_dev_sector (sect);
+ data = NULL;
+ }
+
+ result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */
+out:
+ if (data)
+ put_dev_sector (sect);
+ ldm_frag_free (&frags);
+
+ return result;
+}
+
+/**
+ * ldm_free_vblks - Free a linked list of vblk's
+ * @lh: Head of a linked list of struct vblk
+ *
+ * Free a list of vblk's and free the memory used to maintain the list.
+ *
+ * Return: none
+ */
+static void ldm_free_vblks (struct list_head *lh)
+{
+ struct list_head *item, *tmp;
+
+ BUG_ON (!lh);
+
+ list_for_each_safe (item, tmp, lh)
+ kfree (list_entry (item, struct vblk, list));
+}
+
+
+/**
+ * ldm_partition - Find out whether a device is a dynamic disk and handle it
+ * @state: Partition check state including device holding the LDM Database
+ *
+ * This determines whether the device @bdev is a dynamic disk and if so creates
+ * the partitions necessary in the gendisk structure pointed to by @hd.
+ *
+ * We create a dummy device 1, which contains the LDM database, and then create
+ * each partition described by the LDM database in sequence as devices 2+. For
+ * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
+ * and so on: the actual data containing partitions.
+ *
+ * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
+ * 0 Success, @state->bdev is not a dynamic disk
+ * -1 An error occurred before enough information had been read
+ * Or @state->bdev is a dynamic disk, but it may be corrupted
+ */
+int ldm_partition(struct parsed_partitions *state)
+{
+ struct ldmdb *ldb;
+ unsigned long base;
+ int result = -1;
+
+ BUG_ON(!state);
+
+ /* Look for signs of a Dynamic Disk */
+ if (!ldm_validate_partition_table(state))
+ return 0;
+
+ ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
+ if (!ldb) {
+ ldm_crit ("Out of memory.");
+ goto out;
+ }
+
+ /* Parse and check privheads. */
+ if (!ldm_validate_privheads(state, &ldb->ph))
+ goto out; /* Already logged */
+
+ /* All further references are relative to base (database start). */
+ base = ldb->ph.config_start;
+
+ /* Parse and check tocs and vmdb. */
+ if (!ldm_validate_tocblocks(state, base, ldb) ||
+ !ldm_validate_vmdb(state, base, ldb))
+ goto out; /* Already logged */
+
+ /* Initialize vblk lists in ldmdb struct */
+ INIT_LIST_HEAD (&ldb->v_dgrp);
+ INIT_LIST_HEAD (&ldb->v_disk);
+ INIT_LIST_HEAD (&ldb->v_volu);
+ INIT_LIST_HEAD (&ldb->v_comp);
+ INIT_LIST_HEAD (&ldb->v_part);
+
+ if (!ldm_get_vblks(state, base, ldb)) {
+ ldm_crit ("Failed to read the VBLKs from the database.");
+ goto cleanup;
+ }
+
+ /* Finally, create the data partition devices. */
+ if (ldm_create_data_partitions(state, ldb)) {
+ ldm_debug ("Parsed LDM database successfully.");
+ result = 1;
+ }
+ /* else Already logged */
+
+cleanup:
+ ldm_free_vblks (&ldb->v_dgrp);
+ ldm_free_vblks (&ldb->v_disk);
+ ldm_free_vblks (&ldb->v_volu);
+ ldm_free_vblks (&ldb->v_comp);
+ ldm_free_vblks (&ldb->v_part);
+out:
+ kfree (ldb);
+ return result;
+}
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
new file mode 100644
index 000000000000..374242c0971a
--- /dev/null
+++ b/block/partitions/ldm.h
@@ -0,0 +1,215 @@
+/**
+ * ldm - Part of the Linux-NTFS project.
+ *
+ * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
+ *
+ * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS source
+ * in the file COPYING); if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _FS_PT_LDM_H_
+#define _FS_PT_LDM_H_
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/genhd.h>
+#include <linux/fs.h>
+#include <asm/unaligned.h>
+#include <asm/byteorder.h>
+
+struct parsed_partitions;
+
+/* Magic numbers in CPU format. */
+#define MAGIC_VMDB 0x564D4442 /* VMDB */
+#define MAGIC_VBLK 0x56424C4B /* VBLK */
+#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
+#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
+
+/* The defined vblk types. */
+#define VBLK_VOL5 0x51 /* Volume, version 5 */
+#define VBLK_CMP3 0x32 /* Component, version 3 */
+#define VBLK_PRT3 0x33 /* Partition, version 3 */
+#define VBLK_DSK3 0x34 /* Disk, version 3 */
+#define VBLK_DSK4 0x44 /* Disk, version 4 */
+#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
+#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
+
+/* vblk flags indicating extra information will be present */
+#define VBLK_FLAG_COMP_STRIPE 0x10
+#define VBLK_FLAG_PART_INDEX 0x08
+#define VBLK_FLAG_DGR3_IDS 0x08
+#define VBLK_FLAG_DGR4_IDS 0x08
+#define VBLK_FLAG_VOLU_ID1 0x08
+#define VBLK_FLAG_VOLU_ID2 0x20
+#define VBLK_FLAG_VOLU_SIZE 0x80
+#define VBLK_FLAG_VOLU_DRIVE 0x02
+
+/* size of a vblk's static parts */
+#define VBLK_SIZE_HEAD 16
+#define VBLK_SIZE_CMP3 22 /* Name and version */
+#define VBLK_SIZE_DGR3 12
+#define VBLK_SIZE_DGR4 44
+#define VBLK_SIZE_DSK3 12
+#define VBLK_SIZE_DSK4 45
+#define VBLK_SIZE_PRT3 28
+#define VBLK_SIZE_VOL5 58
+
+/* component types */
+#define COMP_STRIPE 0x01 /* Stripe-set */
+#define COMP_BASIC 0x02 /* Basic disk */
+#define COMP_RAID 0x03 /* Raid-set */
+
+/* Other constants. */
+#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
+
+#define OFF_PRIV1 6 /* Offset of the first privhead
+ relative to the start of the
+ device in sectors */
+
+/* Offsets to structures within the LDM Database in sectors. */
+#define OFF_PRIV2 1856 /* Backup private headers. */
+#define OFF_PRIV3 2047
+
+#define OFF_TOCB1 1 /* Tables of contents. */
+#define OFF_TOCB2 2
+#define OFF_TOCB3 2045
+#define OFF_TOCB4 2046
+
+#define OFF_VMDB 17 /* List of partitions. */
+
+#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
+
+#define TOC_BITMAP1 "config" /* Names of the two defined */
+#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
+
+/* Borrowed from msdos.c */
+#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
+
+struct frag { /* VBLK Fragment handling */
+ struct list_head list;
+ u32 group;
+ u8 num; /* Total number of records */
+ u8 rec; /* This is record number n */
+ u8 map; /* Which portions are in use */
+ u8 data[0];
+};
+
+/* In memory LDM database structures. */
+
+#define GUID_SIZE 16
+
+struct privhead { /* Offsets and sizes are in sectors. */
+ u16 ver_major;
+ u16 ver_minor;
+ u64 logical_disk_start;
+ u64 logical_disk_size;
+ u64 config_start;
+ u64 config_size;
+ u8 disk_id[GUID_SIZE];
+};
+
+struct tocblock { /* We have exactly two bitmaps. */
+ u8 bitmap1_name[16];
+ u64 bitmap1_start;
+ u64 bitmap1_size;
+ u8 bitmap2_name[16];
+ u64 bitmap2_start;
+ u64 bitmap2_size;
+};
+
+struct vmdb { /* VMDB: The database header */
+ u16 ver_major;
+ u16 ver_minor;
+ u32 vblk_size;
+ u32 vblk_offset;
+ u32 last_vblk_seq;
+};
+
+struct vblk_comp { /* VBLK Component */
+ u8 state[16];
+ u64 parent_id;
+ u8 type;
+ u8 children;
+ u16 chunksize;
+};
+
+struct vblk_dgrp { /* VBLK Disk Group */
+ u8 disk_id[64];
+};
+
+struct vblk_disk { /* VBLK Disk */
+ u8 disk_id[GUID_SIZE];
+ u8 alt_name[128];
+};
+
+struct vblk_part { /* VBLK Partition */
+ u64 start;
+ u64 size; /* start, size and vol_off in sectors */
+ u64 volume_offset;
+ u64 parent_id;
+ u64 disk_id;
+ u8 partnum;
+};
+
+struct vblk_volu { /* VBLK Volume */
+ u8 volume_type[16];
+ u8 volume_state[16];
+ u8 guid[16];
+ u8 drive_hint[4];
+ u64 size;
+ u8 partition_type;
+};
+
+struct vblk_head { /* VBLK standard header */
+ u32 group;
+ u16 rec;
+ u16 nrec;
+};
+
+struct vblk { /* Generalised VBLK */
+ u8 name[64];
+ u64 obj_id;
+ u32 sequence;
+ u8 flags;
+ u8 type;
+ union {
+ struct vblk_comp comp;
+ struct vblk_dgrp dgrp;
+ struct vblk_disk disk;
+ struct vblk_part part;
+ struct vblk_volu volu;
+ } vblk;
+ struct list_head list;
+};
+
+struct ldmdb { /* Cache of the database */
+ struct privhead ph;
+ struct tocblock toc;
+ struct vmdb vm;
+ struct list_head v_dgrp;
+ struct list_head v_disk;
+ struct list_head v_volu;
+ struct list_head v_comp;
+ struct list_head v_part;
+};
+
+int ldm_partition(struct parsed_partitions *state);
+
+#endif /* _FS_PT_LDM_H_ */
+
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
new file mode 100644
index 000000000000..11f688bd76c5
--- /dev/null
+++ b/block/partitions/mac.c
@@ -0,0 +1,134 @@
+/*
+ * fs/partitions/mac.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ */
+
+#include <linux/ctype.h>
+#include "check.h"
+#include "mac.h"
+
+#ifdef CONFIG_PPC_PMAC
+#include <asm/machdep.h>
+extern void note_bootable_part(dev_t dev, int part, int goodness);
+#endif
+
+/*
+ * Code to understand MacOS partition tables.
+ */
+
+static inline void mac_fix_string(char *stg, int len)
+{
+ int i;
+
+ for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
+ stg[i] = 0;
+}
+
+int mac_partition(struct parsed_partitions *state)
+{
+ Sector sect;
+ unsigned char *data;
+ int slot, blocks_in_map;
+ unsigned secsize;
+#ifdef CONFIG_PPC_PMAC
+ int found_root = 0;
+ int found_root_goodness = 0;
+#endif
+ struct mac_partition *part;
+ struct mac_driver_desc *md;
+
+ /* Get 0th block and look at the first partition map entry. */
+ md = read_part_sector(state, 0, &sect);
+ if (!md)
+ return -1;
+ if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ secsize = be16_to_cpu(md->block_size);
+ put_dev_sector(sect);
+ data = read_part_sector(state, secsize/512, &sect);
+ if (!data)
+ return -1;
+ part = (struct mac_partition *) (data + secsize%512);
+ if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
+ put_dev_sector(sect);
+ return 0; /* not a MacOS disk */
+ }
+ blocks_in_map = be32_to_cpu(part->map_count);
+ if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
+ for (slot = 1; slot <= blocks_in_map; ++slot) {
+ int pos = slot * secsize;
+ put_dev_sector(sect);
+ data = read_part_sector(state, pos/512, &sect);
+ if (!data)
+ return -1;
+ part = (struct mac_partition *) (data + pos%512);
+ if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
+ break;
+ put_partition(state, slot,
+ be32_to_cpu(part->start_block) * (secsize/512),
+ be32_to_cpu(part->block_count) * (secsize/512));
+
+ if (!strnicmp(part->type, "Linux_RAID", 10))
+ state->parts[slot].flags = ADDPART_FLAG_RAID;
+#ifdef CONFIG_PPC_PMAC
+ /*
+ * If this is the first bootable partition, tell the
+ * setup code, in case it wants to make this the root.
+ */
+ if (machine_is(powermac)) {
+ int goodness = 0;
+
+ mac_fix_string(part->processor, 16);
+ mac_fix_string(part->name, 32);
+ mac_fix_string(part->type, 32);
+
+ if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
+ && strcasecmp(part->processor, "powerpc") == 0)
+ goodness++;
+
+ if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
+ || (strnicmp(part->type, "Linux", 5) == 0
+ && strcasecmp(part->type, "Linux_swap") != 0)) {
+ int i, l;
+
+ goodness++;
+ l = strlen(part->name);
+ if (strcmp(part->name, "/") == 0)
+ goodness++;
+ for (i = 0; i <= l - 4; ++i) {
+ if (strnicmp(part->name + i, "root",
+ 4) == 0) {
+ goodness += 2;
+ break;
+ }
+ }
+ if (strnicmp(part->name, "swap", 4) == 0)
+ goodness--;
+ }
+
+ if (goodness > found_root_goodness) {
+ found_root = slot;
+ found_root_goodness = goodness;
+ }
+ }
+#endif /* CONFIG_PPC_PMAC */
+ }
+#ifdef CONFIG_PPC_PMAC
+ if (found_root_goodness)
+ note_bootable_part(state->bdev->bd_dev, found_root,
+ found_root_goodness);
+#endif
+
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+}
diff --git a/block/partitions/mac.h b/block/partitions/mac.h
new file mode 100644
index 000000000000..3c7d98436380
--- /dev/null
+++ b/block/partitions/mac.h
@@ -0,0 +1,44 @@
+/*
+ * fs/partitions/mac.h
+ */
+
+#define MAC_PARTITION_MAGIC 0x504d
+
+/* type field value for A/UX or other Unix partitions */
+#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
+
+struct mac_partition {
+ __be16 signature; /* expected to be MAC_PARTITION_MAGIC */
+ __be16 res1;
+ __be32 map_count; /* # blocks in partition map */
+ __be32 start_block; /* absolute starting block # of partition */
+ __be32 block_count; /* number of blocks in partition */
+ char name[32]; /* partition name */
+ char type[32]; /* string type description */
+ __be32 data_start; /* rel block # of first data block */
+ __be32 data_count; /* number of data blocks */
+ __be32 status; /* partition status bits */
+ __be32 boot_start;
+ __be32 boot_size;
+ __be32 boot_load;
+ __be32 boot_load2;
+ __be32 boot_entry;
+ __be32 boot_entry2;
+ __be32 boot_cksum;
+ char processor[16]; /* identifies ISA of boot */
+ /* there is more stuff after this that we don't need */
+};
+
+#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
+
+#define MAC_DRIVER_MAGIC 0x4552
+
+/* Driver descriptor structure, in block 0 */
+struct mac_driver_desc {
+ __be16 signature; /* expected to be MAC_DRIVER_MAGIC */
+ __be16 block_size;
+ __be32 block_count;
+ /* ... more stuff */
+};
+
+int mac_partition(struct parsed_partitions *state);
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
new file mode 100644
index 000000000000..5f79a6677c69
--- /dev/null
+++ b/block/partitions/msdos.c
@@ -0,0 +1,552 @@
+/*
+ * fs/partitions/msdos.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ * Copyright (C) 1991-1998 Linus Torvalds
+ *
+ * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
+ * in the early extended-partition checks and added DM partitions
+ *
+ * Support for DiskManager v6.0x added by Mark Lord,
+ * with information provided by OnTrack. This now works for linux fdisk
+ * and LILO, as well as loadlin and bootln. Note that disks other than
+ * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
+ *
+ * More flexible handling of extended partitions - aeb, 950831
+ *
+ * Check partition table on IDE disks for common CHS translations
+ *
+ * Re-organised Feb 1998 Russell King
+ */
+#include <linux/msdos_fs.h>
+
+#include "check.h"
+#include "msdos.h"
+#include "efi.h"
+
+/*
+ * Many architectures don't like unaligned accesses, while
+ * the nr_sects and start_sect partition table entries are
+ * at a 2 (mod 4) address.
+ */
+#include <asm/unaligned.h>
+
+#define SYS_IND(p) get_unaligned(&p->sys_ind)
+
+static inline sector_t nr_sects(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->start_sect);
+}
+
+static inline int is_extended_partition(struct partition *p)
+{
+ return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
+ SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
+ SYS_IND(p) == LINUX_EXTENDED_PARTITION);
+}
+
+#define MSDOS_LABEL_MAGIC1 0x55
+#define MSDOS_LABEL_MAGIC2 0xAA
+
+static inline int
+msdos_magic_present(unsigned char *p)
+{
+ return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
+}
+
+/* Value is EBCDIC 'IBMA' */
+#define AIX_LABEL_MAGIC1 0xC9
+#define AIX_LABEL_MAGIC2 0xC2
+#define AIX_LABEL_MAGIC3 0xD4
+#define AIX_LABEL_MAGIC4 0xC1
+static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
+{
+ struct partition *pt = (struct partition *) (p + 0x1be);
+ Sector sect;
+ unsigned char *d;
+ int slot, ret = 0;
+
+ if (!(p[0] == AIX_LABEL_MAGIC1 &&
+ p[1] == AIX_LABEL_MAGIC2 &&
+ p[2] == AIX_LABEL_MAGIC3 &&
+ p[3] == AIX_LABEL_MAGIC4))
+ return 0;
+ /* Assume the partition table is valid if Linux partitions exists */
+ for (slot = 1; slot <= 4; slot++, pt++) {
+ if (pt->sys_ind == LINUX_SWAP_PARTITION ||
+ pt->sys_ind == LINUX_RAID_PARTITION ||
+ pt->sys_ind == LINUX_DATA_PARTITION ||
+ pt->sys_ind == LINUX_LVM_PARTITION ||
+ is_extended_partition(pt))
+ return 0;
+ }
+ d = read_part_sector(state, 7, &sect);
+ if (d) {
+ if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
+ ret = 1;
+ put_dev_sector(sect);
+ };
+ return ret;
+}
+
+/*
+ * Create devices for each logical partition in an extended partition.
+ * The logical partitions form a linked list, with each entry being
+ * a partition table with two entries. The first entry
+ * is the real data partition (with a start relative to the partition
+ * table start). The second is a pointer to the next logical partition
+ * (with a start relative to the entire extended partition).
+ * We do not create a Linux partition for the partition tables, but
+ * only for the actual data partitions.
+ */
+
+static void parse_extended(struct parsed_partitions *state,
+ sector_t first_sector, sector_t first_size)
+{
+ struct partition *p;
+ Sector sect;
+ unsigned char *data;
+ sector_t this_sector, this_size;
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ int loopct = 0; /* number of links followed
+ without finding a data partition */
+ int i;
+
+ this_sector = first_sector;
+ this_size = first_size;
+
+ while (1) {
+ if (++loopct > 100)
+ return;
+ if (state->next == state->limit)
+ return;
+ data = read_part_sector(state, this_sector, &sect);
+ if (!data)
+ return;
+
+ if (!msdos_magic_present(data + 510))
+ goto done;
+
+ p = (struct partition *) (data + 0x1be);
+
+ /*
+ * Usually, the first entry is the real data partition,
+ * the 2nd entry is the next extended partition, or empty,
+ * and the 3rd and 4th entries are unused.
+ * However, DRDOS sometimes has the extended partition as
+ * the first entry (when the data partition is empty),
+ * and OS/2 seems to use all four entries.
+ */
+
+ /*
+ * First process the data partition(s)
+ */
+ for (i=0; i<4; i++, p++) {
+ sector_t offs, size, next;
+ if (!nr_sects(p) || is_extended_partition(p))
+ continue;
+
+ /* Check the 3rd and 4th entries -
+ these sometimes contain random garbage */
+ offs = start_sect(p)*sector_size;
+ size = nr_sects(p)*sector_size;
+ next = this_sector + offs;
+ if (i >= 2) {
+ if (offs + size > this_size)
+ continue;
+ if (next < first_sector)
+ continue;
+ if (next + size > first_sector + first_size)
+ continue;
+ }
+
+ put_partition(state, state->next, next, size);
+ if (SYS_IND(p) == LINUX_RAID_PARTITION)
+ state->parts[state->next].flags = ADDPART_FLAG_RAID;
+ loopct = 0;
+ if (++state->next == state->limit)
+ goto done;
+ }
+ /*
+ * Next, process the (first) extended partition, if present.
+ * (So far, there seems to be no reason to make
+ * parse_extended() recursive and allow a tree
+ * of extended partitions.)
+ * It should be a link to the next logical partition.
+ */
+ p -= 4;
+ for (i=0; i<4; i++, p++)
+ if (nr_sects(p) && is_extended_partition(p))
+ break;
+ if (i == 4)
+ goto done; /* nothing left to do */
+
+ this_sector = first_sector + start_sect(p) * sector_size;
+ this_size = nr_sects(p) * sector_size;
+ put_dev_sector(sect);
+ }
+done:
+ put_dev_sector(sect);
+}
+
+/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
+ indicates linux swap. Be careful before believing this is Solaris. */
+
+static void parse_solaris_x86(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_SOLARIS_X86_PARTITION
+ Sector sect;
+ struct solaris_x86_vtoc *v;
+ int i;
+ short max_nparts;
+
+ v = read_part_sector(state, offset + 1, &sect);
+ if (!v)
+ return;
+ if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
+ put_dev_sector(sect);
+ return;
+ }
+ {
+ char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
+
+ snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ if (le32_to_cpu(v->v_version) != 1) {
+ char tmp[64];
+
+ snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
+ le32_to_cpu(v->v_version));
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ put_dev_sector(sect);
+ return;
+ }
+ /* Ensure we can handle previous case of VTOC with 8 entries gracefully */
+ max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
+ for (i=0; i<max_nparts && state->next<state->limit; i++) {
+ struct solaris_x86_slice *s = &v->v_slice[i];
+ char tmp[3 + 10 + 1 + 1];
+
+ if (s->s_size == 0)
+ continue;
+ snprintf(tmp, sizeof(tmp), " [s%d]", i);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ /* solaris partitions are relative to current MS-DOS
+ * one; must add the offset of the current partition */
+ put_partition(state, state->next++,
+ le32_to_cpu(s->s_start)+offset,
+ le32_to_cpu(s->s_size));
+ }
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, " >\n", PAGE_SIZE);
+#endif
+}
+
+#if defined(CONFIG_BSD_DISKLABEL)
+/*
+ * Create devices for BSD partitions listed in a disklabel, under a
+ * dos-like partition. See parse_extended() for more information.
+ */
+static void parse_bsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin, char *flavour,
+ int max_partitions)
+{
+ Sector sect;
+ struct bsd_disklabel *l;
+ struct bsd_partition *p;
+ char tmp[64];
+
+ l = read_part_sector(state, offset + 1, &sect);
+ if (!l)
+ return;
+ if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
+ put_dev_sector(sect);
+ return;
+ }
+
+ snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+
+ if (le16_to_cpu(l->d_npartitions) < max_partitions)
+ max_partitions = le16_to_cpu(l->d_npartitions);
+ for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
+ sector_t bsd_start, bsd_size;
+
+ if (state->next == state->limit)
+ break;
+ if (p->p_fstype == BSD_FS_UNUSED)
+ continue;
+ bsd_start = le32_to_cpu(p->p_offset);
+ bsd_size = le32_to_cpu(p->p_size);
+ if (offset == bsd_start && size == bsd_size)
+ /* full parent partition, we have it already */
+ continue;
+ if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
+ strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
+ continue;
+ }
+ put_partition(state, state->next++, bsd_start, bsd_size);
+ }
+ put_dev_sector(sect);
+ if (le16_to_cpu(l->d_npartitions) > max_partitions) {
+ snprintf(tmp, sizeof(tmp), " (ignored %d more)",
+ le16_to_cpu(l->d_npartitions) - max_partitions);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ strlcat(state->pp_buf, " >\n", PAGE_SIZE);
+}
+#endif
+
+static void parse_freebsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_BSD_DISKLABEL
+ parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
+#endif
+}
+
+static void parse_netbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_BSD_DISKLABEL
+ parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
+#endif
+}
+
+static void parse_openbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_BSD_DISKLABEL
+ parse_bsd(state, offset, size, origin, "openbsd",
+ OPENBSD_MAXPARTITIONS);
+#endif
+}
+
+/*
+ * Create devices for Unixware partitions listed in a disklabel, under a
+ * dos-like partition. See parse_extended() for more information.
+ */
+static void parse_unixware(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_UNIXWARE_DISKLABEL
+ Sector sect;
+ struct unixware_disklabel *l;
+ struct unixware_slice *p;
+
+ l = read_part_sector(state, offset + 29, &sect);
+ if (!l)
+ return;
+ if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
+ le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
+ put_dev_sector(sect);
+ return;
+ }
+ {
+ char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
+
+ snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ p = &l->vtoc.v_slice[1];
+ /* I omit the 0th slice as it is the same as whole disk. */
+ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
+ if (state->next == state->limit)
+ break;
+
+ if (p->s_label != UNIXWARE_FS_UNUSED)
+ put_partition(state, state->next++,
+ le32_to_cpu(p->start_sect),
+ le32_to_cpu(p->nr_sects));
+ p++;
+ }
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, " >\n", PAGE_SIZE);
+#endif
+}
+
+/*
+ * Minix 2.0.0/2.0.2 subpartition support.
+ * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
+ * Rajeev V. Pillai <rajeevvp@yahoo.com>
+ */
+static void parse_minix(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
+{
+#ifdef CONFIG_MINIX_SUBPARTITION
+ Sector sect;
+ unsigned char *data;
+ struct partition *p;
+ int i;
+
+ data = read_part_sector(state, offset, &sect);
+ if (!data)
+ return;
+
+ p = (struct partition *)(data + 0x1be);
+
+ /* The first sector of a Minix partition can have either
+ * a secondary MBR describing its subpartitions, or
+ * the normal boot sector. */
+ if (msdos_magic_present (data + 510) &&
+ SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
+ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
+
+ snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
+ if (state->next == state->limit)
+ break;
+ /* add each partition in use */
+ if (SYS_IND(p) == MINIX_PARTITION)
+ put_partition(state, state->next++,
+ start_sect(p), nr_sects(p));
+ }
+ strlcat(state->pp_buf, " >\n", PAGE_SIZE);
+ }
+ put_dev_sector(sect);
+#endif /* CONFIG_MINIX_SUBPARTITION */
+}
+
+static struct {
+ unsigned char id;
+ void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
+} subtypes[] = {
+ {FREEBSD_PARTITION, parse_freebsd},
+ {NETBSD_PARTITION, parse_netbsd},
+ {OPENBSD_PARTITION, parse_openbsd},
+ {MINIX_PARTITION, parse_minix},
+ {UNIXWARE_PARTITION, parse_unixware},
+ {SOLARIS_X86_PARTITION, parse_solaris_x86},
+ {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
+ {0, NULL},
+};
+
+int msdos_partition(struct parsed_partitions *state)
+{
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ Sector sect;
+ unsigned char *data;
+ struct partition *p;
+ struct fat_boot_sector *fb;
+ int slot;
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+ if (!msdos_magic_present(data + 510)) {
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ if (aix_magic_present(state, data)) {
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
+ return 0;
+ }
+
+ /*
+ * Now that the 55aa signature is present, this is probably
+ * either the boot sector of a FAT filesystem or a DOS-type
+ * partition table. Reject this in case the boot indicator
+ * is not 0 or 0x80.
+ */
+ p = (struct partition *) (data + 0x1be);
+ for (slot = 1; slot <= 4; slot++, p++) {
+ if (p->boot_ind != 0 && p->boot_ind != 0x80) {
+ /*
+ * Even without a valid boot inidicator value
+ * its still possible this is valid FAT filesystem
+ * without a partition table.
+ */
+ fb = (struct fat_boot_sector *) data;
+ if (slot == 1 && fb->reserved && fb->fats
+ && fat_valid_media(fb->media)) {
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+ } else {
+ put_dev_sector(sect);
+ return 0;
+ }
+ }
+ }
+
+#ifdef CONFIG_EFI_PARTITION
+ p = (struct partition *) (data + 0x1be);
+ for (slot = 1 ; slot <= 4 ; slot++, p++) {
+ /* If this is an EFI GPT disk, msdos should ignore it. */
+ if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ }
+#endif
+ p = (struct partition *) (data + 0x1be);
+
+ /*
+ * Look for partitions in two passes:
+ * First find the primary and DOS-type extended partitions.
+ * On the second pass look inside *BSD, Unixware and Solaris partitions.
+ */
+
+ state->next = 5;
+ for (slot = 1 ; slot <= 4 ; slot++, p++) {
+ sector_t start = start_sect(p)*sector_size;
+ sector_t size = nr_sects(p)*sector_size;
+ if (!size)
+ continue;
+ if (is_extended_partition(p)) {
+ /*
+ * prevent someone doing mkfs or mkswap on an
+ * extended partition, but leave room for LILO
+ * FIXME: this uses one logical sector for > 512b
+ * sector, although it may not be enough/proper.
+ */
+ sector_t n = 2;
+ n = min(size, max(sector_size, n));
+ put_partition(state, slot, start, n);
+
+ strlcat(state->pp_buf, " <", PAGE_SIZE);
+ parse_extended(state, start, size);
+ strlcat(state->pp_buf, " >", PAGE_SIZE);
+ continue;
+ }
+ put_partition(state, slot, start, size);
+ if (SYS_IND(p) == LINUX_RAID_PARTITION)
+ state->parts[slot].flags = ADDPART_FLAG_RAID;
+ if (SYS_IND(p) == DM6_PARTITION)
+ strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
+ if (SYS_IND(p) == EZD_PARTITION)
+ strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
+ }
+
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+ /* second pass - output for each on a separate line */
+ p = (struct partition *) (0x1be + data);
+ for (slot = 1 ; slot <= 4 ; slot++, p++) {
+ unsigned char id = SYS_IND(p);
+ int n;
+
+ if (!nr_sects(p))
+ continue;
+
+ for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
+ ;
+
+ if (!subtypes[n].parse)
+ continue;
+ subtypes[n].parse(state, start_sect(p) * sector_size,
+ nr_sects(p) * sector_size, slot);
+ }
+ put_dev_sector(sect);
+ return 1;
+}
diff --git a/block/partitions/msdos.h b/block/partitions/msdos.h
new file mode 100644
index 000000000000..38c781c490b3
--- /dev/null
+++ b/block/partitions/msdos.h
@@ -0,0 +1,8 @@
+/*
+ * fs/partitions/msdos.h
+ */
+
+#define MSDOS_LABEL_MAGIC 0xAA55
+
+int msdos_partition(struct parsed_partitions *state);
+
diff --git a/block/partitions/osf.c b/block/partitions/osf.c
new file mode 100644
index 000000000000..764b86a01965
--- /dev/null
+++ b/block/partitions/osf.c
@@ -0,0 +1,86 @@
+/*
+ * fs/partitions/osf.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ *
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ */
+
+#include "check.h"
+#include "osf.h"
+
+#define MAX_OSF_PARTITIONS 18
+
+int osf_partition(struct parsed_partitions *state)
+{
+ int i;
+ int slot = 1;
+ unsigned int npartitions;
+ Sector sect;
+ unsigned char *data;
+ struct disklabel {
+ __le32 d_magic;
+ __le16 d_type,d_subtype;
+ u8 d_typename[16];
+ u8 d_packname[16];
+ __le32 d_secsize;
+ __le32 d_nsectors;
+ __le32 d_ntracks;
+ __le32 d_ncylinders;
+ __le32 d_secpercyl;
+ __le32 d_secprtunit;
+ __le16 d_sparespertrack;
+ __le16 d_sparespercyl;
+ __le32 d_acylinders;
+ __le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
+ __le32 d_headswitch, d_trkseek, d_flags;
+ __le32 d_drivedata[5];
+ __le32 d_spare[5];
+ __le32 d_magic2;
+ __le16 d_checksum;
+ __le16 d_npartitions;
+ __le32 d_bbsize, d_sbsize;
+ struct d_partition {
+ __le32 p_size;
+ __le32 p_offset;
+ __le32 p_fsize;
+ u8 p_fstype;
+ u8 p_frag;
+ __le16 p_cpg;
+ } d_partitions[MAX_OSF_PARTITIONS];
+ } * label;
+ struct d_partition * partition;
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+
+ label = (struct disklabel *) (data+64);
+ partition = label->d_partitions;
+ if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ npartitions = le16_to_cpu(label->d_npartitions);
+ if (npartitions > MAX_OSF_PARTITIONS) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ for (i = 0 ; i < npartitions; i++, partition++) {
+ if (slot == state->limit)
+ break;
+ if (le32_to_cpu(partition->p_size))
+ put_partition(state, slot,
+ le32_to_cpu(partition->p_offset),
+ le32_to_cpu(partition->p_size));
+ slot++;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+}
diff --git a/block/partitions/osf.h b/block/partitions/osf.h
new file mode 100644
index 000000000000..20ed2315ec16
--- /dev/null
+++ b/block/partitions/osf.h
@@ -0,0 +1,7 @@
+/*
+ * fs/partitions/osf.h
+ */
+
+#define DISKLABELMAGIC (0x82564557UL)
+
+int osf_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
new file mode 100644
index 000000000000..ea8a86dceaf4
--- /dev/null
+++ b/block/partitions/sgi.c
@@ -0,0 +1,82 @@
+/*
+ * fs/partitions/sgi.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ */
+
+#include "check.h"
+#include "sgi.h"
+
+struct sgi_disklabel {
+ __be32 magic_mushroom; /* Big fat spliff... */
+ __be16 root_part_num; /* Root partition number */
+ __be16 swap_part_num; /* Swap partition number */
+ s8 boot_file[16]; /* Name of boot file for ARCS */
+ u8 _unused0[48]; /* Device parameter useless crapola.. */
+ struct sgi_volume {
+ s8 name[8]; /* Name of volume */
+ __be32 block_num; /* Logical block number */
+ __be32 num_bytes; /* How big, in bytes */
+ } volume[15];
+ struct sgi_partition {
+ __be32 num_blocks; /* Size in logical blocks */
+ __be32 first_block; /* First logical block */
+ __be32 type; /* Type of this partition */
+ } partitions[16];
+ __be32 csum; /* Disk label checksum */
+ __be32 _unused1; /* Padding */
+};
+
+int sgi_partition(struct parsed_partitions *state)
+{
+ int i, csum;
+ __be32 magic;
+ int slot = 1;
+ unsigned int start, blocks;
+ __be32 *ui, cs;
+ Sector sect;
+ struct sgi_disklabel *label;
+ struct sgi_partition *p;
+ char b[BDEVNAME_SIZE];
+
+ label = read_part_sector(state, 0, &sect);
+ if (!label)
+ return -1;
+ p = &label->partitions[0];
+ magic = label->magic_mushroom;
+ if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
+ /*printk("Dev %s SGI disklabel: bad magic %08x\n",
+ bdevname(bdev, b), be32_to_cpu(magic));*/
+ put_dev_sector(sect);
+ return 0;
+ }
+ ui = ((__be32 *) (label + 1)) - 1;
+ for(csum = 0; ui >= ((__be32 *) label);) {
+ cs = *ui--;
+ csum += be32_to_cpu(cs);
+ }
+ if(csum) {
+ printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
+ bdevname(state->bdev, b));
+ put_dev_sector(sect);
+ return 0;
+ }
+ /* All SGI disk labels have 16 partitions, disks under Linux only
+ * have 15 minor's. Luckily there are always a few zero length
+ * partitions which we don't care about so we never overflow the
+ * current_minor.
+ */
+ for(i = 0; i < 16; i++, p++) {
+ blocks = be32_to_cpu(p->num_blocks);
+ start = be32_to_cpu(p->first_block);
+ if (blocks) {
+ put_partition(state, slot, start, blocks);
+ if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
+ state->parts[slot].flags = ADDPART_FLAG_RAID;
+ }
+ slot++;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+}
diff --git a/block/partitions/sgi.h b/block/partitions/sgi.h
new file mode 100644
index 000000000000..b9553ebdd5a9
--- /dev/null
+++ b/block/partitions/sgi.h
@@ -0,0 +1,8 @@
+/*
+ * fs/partitions/sgi.h
+ */
+
+extern int sgi_partition(struct parsed_partitions *state);
+
+#define SGI_LABEL_MAGIC 0x0be5a941
+
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
new file mode 100644
index 000000000000..b5b6fcfb3d36
--- /dev/null
+++ b/block/partitions/sun.c
@@ -0,0 +1,122 @@
+/*
+ * fs/partitions/sun.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ *
+ * Copyright (C) 1991-1998 Linus Torvalds
+ * Re-organised Feb 1998 Russell King
+ */
+
+#include "check.h"
+#include "sun.h"
+
+int sun_partition(struct parsed_partitions *state)
+{
+ int i;
+ __be16 csum;
+ int slot = 1;
+ __be16 *ush;
+ Sector sect;
+ struct sun_disklabel {
+ unsigned char info[128]; /* Informative text string */
+ struct sun_vtoc {
+ __be32 version; /* Layout version */
+ char volume[8]; /* Volume name */
+ __be16 nparts; /* Number of partitions */
+ struct sun_info { /* Partition hdrs, sec 2 */
+ __be16 id;
+ __be16 flags;
+ } infos[8];
+ __be16 padding; /* Alignment padding */
+ __be32 bootinfo[3]; /* Info needed by mboot */
+ __be32 sanity; /* To verify vtoc sanity */
+ __be32 reserved[10]; /* Free space */
+ __be32 timestamp[8]; /* Partition timestamp */
+ } vtoc;
+ __be32 write_reinstruct; /* sectors to skip, writes */
+ __be32 read_reinstruct; /* sectors to skip, reads */
+ unsigned char spare[148]; /* Padding */
+ __be16 rspeed; /* Disk rotational speed */
+ __be16 pcylcount; /* Physical cylinder count */
+ __be16 sparecyl; /* extra sects per cylinder */
+ __be16 obs1; /* gap1 */
+ __be16 obs2; /* gap2 */
+ __be16 ilfact; /* Interleave factor */
+ __be16 ncyl; /* Data cylinder count */
+ __be16 nacyl; /* Alt. cylinder count */
+ __be16 ntrks; /* Tracks per cylinder */
+ __be16 nsect; /* Sectors per track */
+ __be16 obs3; /* bhead - Label head offset */
+ __be16 obs4; /* ppart - Physical Partition */
+ struct sun_partition {
+ __be32 start_cylinder;
+ __be32 num_sectors;
+ } partitions[8];
+ __be16 magic; /* Magic number */
+ __be16 csum; /* Label xor'd checksum */
+ } * label;
+ struct sun_partition *p;
+ unsigned long spc;
+ char b[BDEVNAME_SIZE];
+ int use_vtoc;
+ int nparts;
+
+ label = read_part_sector(state, 0, &sect);
+ if (!label)
+ return -1;
+
+ p = label->partitions;
+ if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
+/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
+ bdevname(bdev, b), be16_to_cpu(label->magic)); */
+ put_dev_sector(sect);
+ return 0;
+ }
+ /* Look at the checksum */
+ ush = ((__be16 *) (label+1)) - 1;
+ for (csum = 0; ush >= ((__be16 *) label);)
+ csum ^= *ush--;
+ if (csum) {
+ printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
+ bdevname(state->bdev, b));
+ put_dev_sector(sect);
+ return 0;
+ }
+
+ /* Check to see if we can use the VTOC table */
+ use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
+ (be32_to_cpu(label->vtoc.version) == 1) &&
+ (be16_to_cpu(label->vtoc.nparts) <= 8));
+
+ /* Use 8 partition entries if not specified in validated VTOC */
+ nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
+
+ /*
+ * So that old Linux-Sun partitions continue to work,
+ * alow the VTOC to be used under the additional condition ...
+ */
+ use_vtoc = use_vtoc || !(label->vtoc.sanity ||
+ label->vtoc.version || label->vtoc.nparts);
+ spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
+ for (i = 0; i < nparts; i++, p++) {
+ unsigned long st_sector;
+ unsigned int num_sectors;
+
+ st_sector = be32_to_cpu(p->start_cylinder) * spc;
+ num_sectors = be32_to_cpu(p->num_sectors);
+ if (num_sectors) {
+ put_partition(state, slot, st_sector, num_sectors);
+ state->parts[slot].flags = 0;
+ if (use_vtoc) {
+ if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
+ state->parts[slot].flags |= ADDPART_FLAG_RAID;
+ else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
+ state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
+ }
+ }
+ slot++;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+}
diff --git a/block/partitions/sun.h b/block/partitions/sun.h
new file mode 100644
index 000000000000..2424baa8319f
--- /dev/null
+++ b/block/partitions/sun.h
@@ -0,0 +1,8 @@
+/*
+ * fs/partitions/sun.h
+ */
+
+#define SUN_LABEL_MAGIC 0xDABE
+#define SUN_VTOC_SANITY 0x600DDEEE
+
+int sun_partition(struct parsed_partitions *state);
diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c
new file mode 100644
index 000000000000..9627ccffc1c4
--- /dev/null
+++ b/block/partitions/sysv68.c
@@ -0,0 +1,95 @@
+/*
+ * fs/partitions/sysv68.c
+ *
+ * Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
+ */
+
+#include "check.h"
+#include "sysv68.h"
+
+/*
+ * Volume ID structure: on first 256-bytes sector of disk
+ */
+
+struct volumeid {
+ u8 vid_unused[248];
+ u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
+};
+
+/*
+ * config block: second 256-bytes sector on disk
+ */
+
+struct dkconfig {
+ u8 ios_unused0[128];
+ __be32 ios_slcblk; /* Slice table block number */
+ __be16 ios_slccnt; /* Number of entries in slice table */
+ u8 ios_unused1[122];
+};
+
+/*
+ * combined volumeid and dkconfig block
+ */
+
+struct dkblk0 {
+ struct volumeid dk_vid;
+ struct dkconfig dk_ios;
+};
+
+/*
+ * Slice Table Structure
+ */
+
+struct slice {
+ __be32 nblocks; /* slice size (in blocks) */
+ __be32 blkoff; /* block offset of slice */
+};
+
+
+int sysv68_partition(struct parsed_partitions *state)
+{
+ int i, slices;
+ int slot = 1;
+ Sector sect;
+ unsigned char *data;
+ struct dkblk0 *b;
+ struct slice *slice;
+ char tmp[64];
+
+ data = read_part_sector(state, 0, &sect);
+ if (!data)
+ return -1;
+
+ b = (struct dkblk0 *)data;
+ if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
+ put_dev_sector(sect);
+ return 0;
+ }
+ slices = be16_to_cpu(b->dk_ios.ios_slccnt);
+ i = be32_to_cpu(b->dk_ios.ios_slcblk);
+ put_dev_sector(sect);
+
+ data = read_part_sector(state, i, &sect);
+ if (!data)
+ return -1;
+
+ slices -= 1; /* last slice is the whole disk */
+ snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ slice = (struct slice *)data;
+ for (i = 0; i < slices; i++, slice++) {
+ if (slot == state->limit)
+ break;
+ if (be32_to_cpu(slice->nblocks)) {
+ put_partition(state, slot,
+ be32_to_cpu(slice->blkoff),
+ be32_to_cpu(slice->nblocks));
+ snprintf(tmp, sizeof(tmp), "(s%u)", i);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+ }
+ slot++;
+ }
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ put_dev_sector(sect);
+ return 1;
+}
diff --git a/block/partitions/sysv68.h b/block/partitions/sysv68.h
new file mode 100644
index 000000000000..bf2f5ffa97ac
--- /dev/null
+++ b/block/partitions/sysv68.h
@@ -0,0 +1 @@
+extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c
new file mode 100644
index 000000000000..8dbaf9f77a99
--- /dev/null
+++ b/block/partitions/ultrix.c
@@ -0,0 +1,48 @@
+/*
+ * fs/partitions/ultrix.c
+ *
+ * Code extracted from drivers/block/genhd.c
+ *
+ * Re-organised Jul 1999 Russell King
+ */
+
+#include "check.h"
+#include "ultrix.h"
+
+int ultrix_partition(struct parsed_partitions *state)
+{
+ int i;
+ Sector sect;
+ unsigned char *data;
+ struct ultrix_disklabel {
+ s32 pt_magic; /* magic no. indicating part. info exits */
+ s32 pt_valid; /* set by driver if pt is current */
+ struct pt_info {
+ s32 pi_nblocks; /* no. of sectors */
+ u32 pi_blkoff; /* block offset for start */
+ } pt_part[8];
+ } *label;
+
+#define PT_MAGIC 0x032957 /* Partition magic number */
+#define PT_VALID 1 /* Indicates if struct is valid */
+
+ data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
+ if (!data)
+ return -1;
+
+ label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
+
+ if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
+ for (i=0; i<8; i++)
+ if (label->pt_part[i].pi_nblocks)
+ put_partition(state, i+1,
+ label->pt_part[i].pi_blkoff,
+ label->pt_part[i].pi_nblocks);
+ put_dev_sector(sect);
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+ return 1;
+ } else {
+ put_dev_sector(sect);
+ return 0;
+ }
+}
diff --git a/block/partitions/ultrix.h b/block/partitions/ultrix.h
new file mode 100644
index 000000000000..a3cc00b2bded
--- /dev/null
+++ b/block/partitions/ultrix.h
@@ -0,0 +1,5 @@
+/*
+ * fs/partitions/ultrix.h
+ */
+
+int ultrix_partition(struct parsed_partitions *state);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index fbdf0d802ec4..260fa80ef575 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -24,6 +24,7 @@
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/cdrom.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
#include <asm/uaccess.h>
@@ -690,6 +691,57 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
}
EXPORT_SYMBOL(scsi_cmd_ioctl);
+int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
+{
+ if (bd && bd == bd->bd_contains)
+ return 0;
+
+ /* Actually none of these is particularly useful on a partition,
+ * but they are safe.
+ */
+ switch (cmd) {
+ case SCSI_IOCTL_GET_IDLUN:
+ case SCSI_IOCTL_GET_BUS_NUMBER:
+ case SCSI_IOCTL_GET_PCI:
+ case SCSI_IOCTL_PROBE_HOST:
+ case SG_GET_VERSION_NUM:
+ case SG_SET_TIMEOUT:
+ case SG_GET_TIMEOUT:
+ case SG_GET_RESERVED_SIZE:
+ case SG_SET_RESERVED_SIZE:
+ case SG_EMULATED_HOST:
+ return 0;
+ case CDROM_GET_CAPABILITY:
+ /* Keep this until we remove the printk below. udev sends it
+ * and we do not want to spam dmesg about it. CD-ROMs do
+ * not have partitions, so we get here only for disks.
+ */
+ return -ENOIOCTLCMD;
+ default:
+ break;
+ }
+
+ /* In particular, rule out all resets and host-specific ioctls. */
+ printk_ratelimited(KERN_WARNING
+ "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
+
+ return capable(CAP_SYS_RAWIO) ? 0 : -ENOIOCTLCMD;
+}
+EXPORT_SYMBOL(scsi_verify_blk_ioctl);
+
+int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
+ unsigned int cmd, void __user *arg)
+{
+ int ret;
+
+ ret = scsi_verify_blk_ioctl(bd, cmd);
+ if (ret < 0)
+ return ret;
+
+ return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
+}
+EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
+
static int __init blk_scsi_ioctl_init(void)
{
blk_set_cmd_filter_defaults(&blk_default_cmd_filter);