summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/ceph/osd_client.h 1
-rw-r--r-- include/linux/ceph/osdmap.h 2
-rw-r--r-- include/linux/ceph/rados.h 4
-rw-r--r-- include/linux/crush/crush.h 2
-rw-r--r-- net/ceph/crush/mapper.c 2
-rw-r--r-- net/ceph/messenger.c 6
-rw-r--r-- net/ceph/osd_client.c 14
-rw-r--r-- net/ceph/osdmap.c 60
8 files changed, 53 insertions, 38 deletions
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index c6d96a5f46fd..adf670ecaf94 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -148,6 +148,7 @@ struct ceph_osd_request_target {
int size;
int min_size;
bool sort_bitwise;
+ bool recovery_deletes;
unsigned int flags; /* CEPH_OSD_FLAG_* */
bool paused;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index a0996cb9faed..af3444a5bfdd 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -272,6 +272,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
u32 new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
const struct ceph_pg *pgid);
bool ceph_osds_changed(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 385db08bb8b2..b8281feda9c7 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -158,6 +158,10 @@ extern const char *ceph_osd_state_name(int s);
#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
+#define CEPH_OSDMAP_REQUIRE_JEWEL (1<<16) /* require jewel for booting osds */
+#define CEPH_OSDMAP_REQUIRE_KRAKEN (1<<17) /* require kraken for booting osds */
+#define CEPH_OSDMAP_REQUIRE_LUMINOUS (1<<18) /* require l for booting osds */
+#define CEPH_OSDMAP_RECOVERY_DELETES (1<<19) /* deletes performed during recovery instead of peering */
/*
* The error code to return when an OSD can't handle a write
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 92e165d417a6..07eed95e10c7 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -193,7 +193,7 @@ struct crush_choose_arg {
struct crush_choose_arg_map {
#ifdef __KERNEL__
struct rb_node node;
- u64 choose_args_index;
+ s64 choose_args_index;
#endif
struct crush_choose_arg *args; /*!< replacement for each bucket
in the crushmap */
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 746b145bfd11..417df675c71b 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,7 @@ static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
const struct crush_choose_arg *arg,
int position)
{
- if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+ if (!arg || !arg->weight_set)
return bucket->item_weights;
if (position >= arg->weight_set_size)
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b7cc615d42ef..a67298c7e0cd 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1287,10 +1287,10 @@ static void prepare_write_message(struct ceph_connection *con)
if (m->needs_out_seq) {
m->hdr.seq = cpu_to_le64(++con->out_seq);
m->needs_out_seq = false;
- }
- if (con->ops->reencode_message)
- con->ops->reencode_message(m);
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(m);
+ }
dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type),
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 901bb8221366..dcfbdd74dfd1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1337,6 +1337,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
bool legacy_change;
bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
+ bool recovery_deletes = ceph_osdmap_flag(osdc,
+ CEPH_OSDMAP_RECOVERY_DELETES);
enum calc_target_result ct_res;
int ret;
@@ -1399,6 +1401,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
pi->pg_num,
t->sort_bitwise,
sort_bitwise,
+ t->recovery_deletes,
+ recovery_deletes,
&last_pgid))
force_resend = true;
@@ -1421,6 +1425,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->pg_num = pi->pg_num;
t->pg_num_mask = pi->pg_num_mask;
t->sort_bitwise = sort_bitwise;
+ t->recovery_deletes = recovery_deletes;
t->osd = acting.primary;
}
@@ -1918,10 +1923,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
}
ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
- BUG_ON(p != end - 8); /* space for features */
+ BUG_ON(p > end - 8); /* space for features */
msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
/* front_len is finalized in encode_request_finish() */
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
msg->hdr.data_len = cpu_to_le32(data_len);
/*
* The header "data_off" is a hint to the receiver allowing it
@@ -1937,11 +1944,12 @@ static void encode_request_partial(struct ceph_osd_request *req,
static void encode_request_finish(struct ceph_msg *msg)
{
void *p = msg->front.iov_base;
+ void *const partial_end = p + msg->front.iov_len;
void *const end = p + msg->front_alloc_len;
if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
/* luminous OSD -- encode features and be done */
- p = end - 8;
+ p = partial_end;
ceph_encode_64(&p, msg->con->peer_features);
} else {
struct {
@@ -1984,7 +1992,7 @@ static void encode_request_finish(struct ceph_msg *msg)
oid_len = p - oid;
tail = p;
- tail_len = (end - p) - 8;
+ tail_len = partial_end - p;
p = msg->front.iov_base;
ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 64ae9f89773a..f358d0bfa76b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -295,6 +295,10 @@ static int decode_choose_args(void **p, void *end, struct crush_map *c)
ret = decode_choose_arg(p, end, arg);
if (ret)
goto fail;
+
+ if (arg->ids_size &&
+ arg->ids_size != c->buckets[bucket_index]->size)
+ goto e_inval;
}
insert_choose_arg_map(&c->choose_args, arg_map);
@@ -2078,6 +2082,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
u32 new_pg_num,
bool old_sort_bitwise,
bool new_sort_bitwise,
+ bool old_recovery_deletes,
+ bool new_recovery_deletes,
const struct ceph_pg *pgid)
{
return !osds_equal(old_acting, new_acting) ||
@@ -2085,7 +2091,8 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
old_size != new_size ||
old_min_size != new_min_size ||
ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
- old_sort_bitwise != new_sort_bitwise;
+ old_sort_bitwise != new_sort_bitwise ||
+ old_recovery_deletes != new_recovery_deletes;
}
static int calc_pg_rank(int osd, const struct ceph_osds *acting)
@@ -2301,10 +2308,17 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
}
}
+/*
+ * Magic value used for a "default" fallback choose_args, used if the
+ * crush_choose_arg_map passed to do_crush() does not exist. If this
+ * also doesn't exist, fall back to canonical weights.
+ */
+#define CEPH_DEFAULT_CHOOSE_ARGS -1
+
static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max,
const __u32 *weight, int weight_max,
- u64 choose_args_index)
+ s64 choose_args_index)
{
struct crush_choose_arg_map *arg_map;
int r;
@@ -2313,6 +2327,9 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
choose_args_index);
+ if (!arg_map)
+ arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+ CEPH_DEFAULT_CHOOSE_ARGS);
mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
@@ -2423,40 +2440,23 @@ static void apply_upmap(struct ceph_osdmap *osdmap,
for (i = 0; i < pg->pg_upmap.len; i++)
raw->osds[i] = pg->pg_upmap.osds[i];
raw->size = pg->pg_upmap.len;
- return;
+ /* check and apply pg_upmap_items, if any */
}
pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
if (pg) {
- /*
- * Note: this approach does not allow a bidirectional swap,
- * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
- */
- for (i = 0; i < pg->pg_upmap_items.len; i++) {
- int from = pg->pg_upmap_items.from_to[i][0];
- int to = pg->pg_upmap_items.from_to[i][1];
- int pos = -1;
- bool exists = false;
-
- /* make sure replacement doesn't already appear */
- for (j = 0; j < raw->size; j++) {
- int osd = raw->osds[j];
-
- if (osd == to) {
- exists = true;
+ for (i = 0; i < raw->size; i++) {
+ for (j = 0; j < pg->pg_upmap_items.len; j++) {
+ int from = pg->pg_upmap_items.from_to[j][0];
+ int to = pg->pg_upmap_items.from_to[j][1];
+
+ if (from == raw->osds[i]) {
+ if (!(to != CRUSH_ITEM_NONE &&
+ to < osdmap->max_osd &&
+ osdmap->osd_weight[to] == 0))
+ raw->osds[i] = to;
break;
}
- /* ignore mapping if target is marked out */
- if (osd == from && pos < 0 &&
- !(to != CRUSH_ITEM_NONE &&
- to < osdmap->max_osd &&
- osdmap->osd_weight[to] == 0)) {
- pos = j;
- }
- }
- if (!exists && pos >= 0) {
- raw->osds[pos] = to;
- return;
}
}
}