drivers/md/dm-raid.c
/*
 * Copyright (C) 2010-2011 Neil Brown
 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/slab.h>
#include <linux/module.h>

#include "md.h"
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"

#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "raid"

/*
 * The following flags are used by dm-raid.c to set up the array state.
 * They must be cleared before md_run is called.
 */
#define FirstUse 10		/* rdev flag */

struct raid_dev {
	/*
	 * Two DM devices, one to hold metadata and one to hold the
	 * actual data/parity.  The reason for this is to not confuse
	 * ti->len and give more flexibility in altering size and
	 * characteristics.
	 *
	 * While it is possible for this device to be associated
	 * with a different physical device than the data_dev, it
	 * is intended for it to be the same.
	 *    |--------- Physical Device ---------|
	 *    |- meta_dev -|------ data_dev ------|
	 */
	struct dm_dev *meta_dev;
	struct dm_dev *data_dev;
	struct md_rdev rdev;
};

/*
 * Flags for rs->print_flags field.
 */
#define DMPF_SYNC              0x1
#define DMPF_NOSYNC            0x2
#define DMPF_REBUILD           0x4
#define DMPF_DAEMON_SLEEP      0x8
#define DMPF_MIN_RECOVERY_RATE 0x10
#define DMPF_MAX_RECOVERY_RATE 0x20
#define DMPF_MAX_WRITE_BEHIND  0x40
#define DMPF_STRIPE_CACHE      0x80
#define DMPF_REGION_SIZE       0x100
#define DMPF_RAID10_COPIES     0x200
#define DMPF_RAID10_FORMAT     0x400

struct raid_set {
	struct dm_target *ti;

	uint32_t bitmap_loaded;
	uint32_t print_flags;

	struct mddev md;
	struct raid_type *raid_type;
	struct dm_target_callbacks callbacks;

	struct raid_dev dev[0];
};

/* Supported raid types and properties. */
static struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const unsigned level;		/* RAID level. */
	const unsigned algorithm;	/* RAID algorithm. */
} raid_types[] = {
	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
	{"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
	{"raid5_la", "RAID5 (left asymmetric)",         1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
	{"raid5_ra", "RAID5 (right asymmetric)",        1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
	{"raid5_ls", "RAID5 (left symmetric)",          1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
	{"raid5_rs", "RAID5 (right symmetric)",         1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
	{"raid6_zr", "RAID6 (zero restart)",            2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
	{"raid6_nr", "RAID6 (N restart)",               2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
	{"raid6_nc", "RAID6 (N continue)",              2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};

static unsigned raid10_md_layout_to_copies(int layout)
{
	return layout & 0xFF;
}

static int raid10_format_to_md_layout(char *format, unsigned copies)
{
	/* 1 "far" copy, and 'copies' "near" copies */
	return (1 << 8) | (copies & 0xFF);
}
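
/*
 * MD's raid10 layout word encodes the geometry: bits 7:0 hold the
 * number of "near" copies, bits 15:8 the number of "far" copies, and
 * bit 16 selects the "offset" variant.  The two helpers above only
 * handle the "near" format this target supports, which is why the
 * 'format' argument is currently unused.
 */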

static struct raid_type *get_raid_type(char *name)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(raid_types); i++)
		if (!strcmp(raid_types[i].name, name))
			return &raid_types[i];

	return NULL;
}

static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
{
	unsigned i;
	struct raid_set *rs;

	if (raid_devs <= raid_type->parity_devs) {
		ti->error = "Insufficient number of devices";
		return ERR_PTR(-EINVAL);
	}

	rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
	if (!rs) {
		ti->error = "Cannot allocate raid context";
		return ERR_PTR(-ENOMEM);
	}

	mddev_init(&rs->md);

	rs->ti = ti;
	rs->raid_type = raid_type;
	rs->md.raid_disks = raid_devs;
	rs->md.level = raid_type->level;
	rs->md.new_level = rs->md.level;
	rs->md.layout = raid_type->algorithm;
	rs->md.new_layout = rs->md.layout;
	rs->md.delta_disks = 0;
	rs->md.recovery_cp = 0;

	for (i = 0; i < raid_devs; i++)
		md_rdev_init(&rs->dev[i].rdev);

	/*
	 * Remaining items to be initialized by further RAID params:
	 *  rs->md.persistent
	 *  rs->md.external
	 *  rs->md.chunk_sectors
	 *  rs->md.new_chunk_sectors
	 *  rs->md.dev_sectors
	 */

	return rs;
}

static void context_free(struct raid_set *rs)
{
	int i;

	for (i = 0; i < rs->md.raid_disks; i++) {
		if (rs->dev[i].meta_dev)
			dm_put_device(rs->ti, rs->dev[i].meta_dev);
		md_rdev_clear(&rs->dev[i].rdev);
		if (rs->dev[i].data_dev)
			dm_put_device(rs->ti, rs->dev[i].data_dev);
	}

	kfree(rs);
}

/*
 * For every device we have two words
 *  <meta_dev>: meta device name or '-' if missing
 *  <data_dev>: data device name or '-' if missing
 *
 * The following are permitted:
 *    - -
 *    - <data_dev>
 *    <meta_dev> <data_dev>
 *
 * The following is not allowed:
 *    <meta_dev> -
 *
 * This code parses those words.  If there is a failure,
 * the caller must use context_free to unwind the operations.
 */
static int dev_parms(struct raid_set *rs, char **argv)
{
	int i;
	int rebuild = 0;
	int metadata_available = 0;
	int ret = 0;

	for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
		rs->dev[i].rdev.raid_disk = i;

		rs->dev[i].meta_dev = NULL;
		rs->dev[i].data_dev = NULL;

		/*
		 * There are no offsets, since there is a separate device
		 * for data and metadata.
		 */
		rs->dev[i].rdev.data_offset = 0;
		rs->dev[i].rdev.mddev = &rs->md;

		if (strcmp(argv[0], "-")) {
			ret = dm_get_device(rs->ti, argv[0],
					    dm_table_get_mode(rs->ti->table),
					    &rs->dev[i].meta_dev);
			if (ret) {
				rs->ti->error = "RAID metadata device lookup failure";
				return ret;
			}

			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
			if (!rs->dev[i].rdev.sb_page)
				return -ENOMEM;
		}

		if (!strcmp(argv[1], "-")) {
			if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
			    (!rs->dev[i].rdev.recovery_offset)) {
				rs->ti->error = "Drive designated for rebuild not specified";
				return -EINVAL;
			}

			if (rs->dev[i].meta_dev) {
				rs->ti->error = "No data device supplied with metadata device";
				return -EINVAL;
			}

			continue;
		}

		ret = dm_get_device(rs->ti, argv[1],
				    dm_table_get_mode(rs->ti->table),
				    &rs->dev[i].data_dev);
		if (ret) {
			rs->ti->error = "RAID device lookup failure";
			return ret;
		}

		if (rs->dev[i].meta_dev) {
			metadata_available = 1;
			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
		}
		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
			rebuild++;
	}

	if (metadata_available) {
		rs->md.external = 0;
		rs->md.persistent = 1;
		rs->md.major_version = 2;
	} else if (rebuild && !rs->md.recovery_cp) {
		/*
		 * Without metadata, we will not be able to tell if the array
		 * is in-sync or not - we must assume it is not.  Therefore,
		 * it is impossible to rebuild a drive.
		 *
		 * Even if there is metadata, the on-disk information may
		 * indicate that the array is not in-sync and it will then
		 * fail at that time.
		 *
		 * User could specify 'nosync' option if desperate.
		 */
		DMERR("Unable to rebuild drive while array is not in-sync");
		rs->ti->error = "Unable to rebuild drive while array is not in-sync";
		return -EINVAL;
	}

	return 0;
}

/*
 * validate_region_size
 * @rs
 * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
 *
 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
 *
 * Returns: 0 on success, -EINVAL on failure.
 */
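/*
 * Worked example: for a 100 GiB target (209715200 sectors),
 * min_region_size = 209715200 >> 21 = 100 sectors.  Since that is
 * below the 8192-sector (4MiB) floor, the 4MiB default is chosen,
 * giving 209715200 / 8192 = 25600 bitmap regions - well under the
 * 2^21 limit imposed by the MD bitmap code.
 */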
static int validate_region_size(struct raid_set *rs, unsigned long region_size)
{
	unsigned long min_region_size = rs->ti->len / (1 << 21);

	if (!region_size) {
		/*
		 * Choose a reasonable default.  All figures in sectors.
		 */
		if (min_region_size > (1 << 13)) {
			region_size = min_region_size;
			DMINFO("Choosing default region size of %lu sectors",
			       region_size);
		} else {
			DMINFO("Choosing default region size of 4MiB");
			region_size = 1 << 13; /* sectors */
		}
	} else {
		/*
		 * Validate user-supplied value.
		 */
		if (region_size > rs->ti->len) {
			rs->ti->error = "Supplied region size is too large";
			return -EINVAL;
		}

		if (region_size < min_region_size) {
			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
			      region_size, min_region_size);
			rs->ti->error = "Supplied region size is too small";
			return -EINVAL;
		}

		if (!is_power_of_2(region_size)) {
			rs->ti->error = "Region size is not a power of 2";
			return -EINVAL;
		}

		if (region_size < rs->md.chunk_sectors) {
			rs->ti->error = "Region size is smaller than the chunk size";
			return -EINVAL;
		}
	}

	/*
	 * Convert sectors to bytes.
	 */
	rs->md.bitmap_info.chunksize = (region_size << 9);

	return 0;
}

/*
 * Possible arguments are...
 *	<chunk_size> [optional_args]
 *
 * Argument definitions
 *    <chunk_size>			The number of sectors per disk that
 *					will form the "stripe"
 *    [[no]sync]			Force or prevent recovery of the
 *					entire array
 *    [rebuild <idx>]			Rebuild the drive indicated by the index
 *    [daemon_sleep <ms>]		Time between bitmap daemon work to
 *					clear bits
 *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
 *    [write_mostly <idx>]		Indicate a write mostly drive via index
 *    [max_write_behind <sectors>]	See '--write-behind=' (man mdadm)
 *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
 *    [region_size <sectors>]		Defines granularity of bitmap
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
 *    [raid10_format <near>]		Layout algorithm.  (Default: near)
 */
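/*
 * Example (illustrative): a raid5_ls set could pass
 * "128 region_size 1024" as its raid params - a 64KiB chunk plus one
 * key/value pair, so num_raid_params would be 3.
 */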
static int parse_raid_params(struct raid_set *rs, char **argv,
			     unsigned num_raid_params)
{
	char *raid10_format = "near";
	unsigned raid10_copies = 2;
	unsigned i, rebuild_cnt = 0;
	unsigned long value, region_size = 0;
	sector_t sectors_per_dev = rs->ti->len;
	sector_t max_io_len;
	char *key;

	/*
	 * First, parse the in-order required arguments
	 * "chunk_size" is the only argument of this type.
	 */
	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
		rs->ti->error = "Bad chunk size";
		return -EINVAL;
	} else if (rs->raid_type->level == 1) {
		if (value)
			DMERR("Ignoring chunk size parameter for RAID 1");
		value = 0;
	} else if (!is_power_of_2(value)) {
		rs->ti->error = "Chunk size must be a power of 2";
		return -EINVAL;
	} else if (value < 8) {
		rs->ti->error = "Chunk size value is too small";
		return -EINVAL;
	}

	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
	argv++;
	num_raid_params--;

	/*
	 * We set each individual device as In_sync with a completed
	 * 'recovery_offset'.  If there has been a device failure or
	 * replacement then one of the following cases applies:
	 *
	 *   1) User specifies 'rebuild'.
	 *      - Device is reset when param is read.
	 *   2) A new device is supplied.
	 *      - No matching superblock found, resets device.
	 *   3) Device failure was transient and returns on reload.
	 *      - Failure noticed, resets device for bitmap replay.
	 *   4) Device hadn't completed recovery after previous failure.
	 *      - Superblock is read and overrides recovery_offset.
	 *
	 * What is found in the superblocks of the devices is always
	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
	 */
	for (i = 0; i < rs->md.raid_disks; i++) {
		set_bit(In_sync, &rs->dev[i].rdev.flags);
		rs->dev[i].rdev.recovery_offset = MaxSector;
	}

	/*
	 * Second, parse the unordered optional arguments
	 */
	for (i = 0; i < num_raid_params; i++) {
		if (!strcasecmp(argv[i], "nosync")) {
			rs->md.recovery_cp = MaxSector;
			rs->print_flags |= DMPF_NOSYNC;
			continue;
		}
		if (!strcasecmp(argv[i], "sync")) {
			rs->md.recovery_cp = 0;
			rs->print_flags |= DMPF_SYNC;
			continue;
		}

		/* The rest of the optional arguments come in key/value pairs */
		if ((i + 1) >= num_raid_params) {
			rs->ti->error = "Wrong number of raid parameters given";
			return -EINVAL;
		}

		key = argv[i++];

		/* Parameters that take a string value are checked here. */
		if (!strcasecmp(key, "raid10_format")) {
			if (rs->raid_type->level != 10) {
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			if (strcmp("near", argv[i])) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return -EINVAL;
			}
			raid10_format = argv[i];
			rs->print_flags |= DMPF_RAID10_FORMAT;
			continue;
		}

		if (strict_strtoul(argv[i], 10, &value) < 0) {
			rs->ti->error = "Bad numerical argument given in raid params";
			return -EINVAL;
		}

		/* Parameters that take a numeric value are checked here */
		if (!strcasecmp(key, "rebuild")) {
			rebuild_cnt++;

			switch (rs->raid_type->level) {
			case 1:
				if (rebuild_cnt >= rs->md.raid_disks) {
					rs->ti->error = "Too many rebuild devices specified";
					return -EINVAL;
				}
				break;
			case 4:
			case 5:
			case 6:
				if (rebuild_cnt > rs->raid_type->parity_devs) {
					rs->ti->error = "Too many rebuild devices specified for given RAID type";
					return -EINVAL;
				}
				break;
			case 10:
			default:
				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
				rs->ti->error = "Rebuild not supported for this RAID type";
				return -EINVAL;
			}

			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid rebuild index given";
				return -EINVAL;
			}
			clear_bit(In_sync, &rs->dev[value].rdev.flags);
			rs->dev[value].rdev.recovery_offset = 0;
			rs->print_flags |= DMPF_REBUILD;
		} else if (!strcasecmp(key, "write_mostly")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "write_mostly option is only valid for RAID1";
				return -EINVAL;
			}
			if (value >= rs->md.raid_disks) {
				rs->ti->error = "Invalid write_mostly drive index given";
				return -EINVAL;
			}
			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
		} else if (!strcasecmp(key, "max_write_behind")) {
			if (rs->raid_type->level != 1) {
				rs->ti->error = "max_write_behind option is only valid for RAID1";
				return -EINVAL;
			}
			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;
			if (value > COUNTER_MAX) {
				rs->ti->error = "Max write-behind limit out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.max_write_behind = value;
		} else if (!strcasecmp(key, "daemon_sleep")) {
			rs->print_flags |= DMPF_DAEMON_SLEEP;
			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
				rs->ti->error = "daemon sleep period out of range";
				return -EINVAL;
			}
			rs->md.bitmap_info.daemon_sleep = value;
		} else if (!strcasecmp(key, "stripe_cache")) {
			rs->print_flags |= DMPF_STRIPE_CACHE;

			/*
			 * In device-mapper, we specify things in sectors, but
			 * MD records this value in kB
			 */
			value /= 2;

			if ((rs->raid_type->level != 5) &&
			    (rs->raid_type->level != 6)) {
				rs->ti->error = "Inappropriate argument: stripe_cache";
				return -EINVAL;
			}
			if (raid5_set_cache_size(&rs->md, (int)value)) {
				rs->ti->error = "Bad stripe_cache size";
				return -EINVAL;
			}
		} else if (!strcasecmp(key, "min_recovery_rate")) {
			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "min_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_min = (int)value;
		} else if (!strcasecmp(key, "max_recovery_rate")) {
			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
			if (value > INT_MAX) {
				rs->ti->error = "max_recovery_rate out of range";
				return -EINVAL;
			}
			rs->md.sync_speed_max = (int)value;
		} else if (!strcasecmp(key, "region_size")) {
			rs->print_flags |= DMPF_REGION_SIZE;
			region_size = value;
		} else if (!strcasecmp(key, "raid10_copies") &&
			   (rs->raid_type->level == 10)) {
			if ((value < 2) || (value > 0xFF)) {
				rs->ti->error = "Bad value for 'raid10_copies'";
				return -EINVAL;
			}
			rs->print_flags |= DMPF_RAID10_COPIES;
			raid10_copies = value;
		} else {
			DMERR("Unable to parse RAID parameter: %s", key);
			rs->ti->error = "Unable to parse RAID parameters";
			return -EINVAL;
		}
	}

	if (validate_region_size(rs, region_size))
		return -EINVAL;

	if (rs->md.chunk_sectors)
		max_io_len = rs->md.chunk_sectors;
	else
		max_io_len = region_size;

	if (dm_set_target_max_io_len(rs->ti, max_io_len))
		return -EINVAL;

	if (rs->raid_type->level == 10) {
		if (raid10_copies > rs->md.raid_disks) {
			rs->ti->error = "Not enough devices to satisfy specification";
			return -EINVAL;
		}

		/* (Len * #mirrors) / #devices */
		sectors_per_dev = rs->ti->len * raid10_copies;
		sector_div(sectors_per_dev, rs->md.raid_disks);

		rs->md.layout = raid10_format_to_md_layout(raid10_format,
							   raid10_copies);
		rs->md.new_layout = rs->md.layout;
	} else if ((rs->raid_type->level > 1) &&
		   sector_div(sectors_per_dev,
			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
		rs->ti->error = "Target length not divisible by number of data devices";
		return -EINVAL;
	}
	rs->md.dev_sectors = sectors_per_dev;

	/* Assume there are no metadata devices until the drives are parsed */
	rs->md.persistent = 0;
	rs->md.external = 1;

	return 0;
}

static void do_table_event(struct work_struct *ws)
{
	struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);

	dm_table_event(rs->ti->table);
}

static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
{
	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);

	if (rs->raid_type->level == 1)
		return md_raid1_congested(&rs->md, bits);

	if (rs->raid_type->level == 10)
		return md_raid10_congested(&rs->md, bits);

	return md_raid5_congested(&rs->md, bits);
}

/*
 * This structure is never routinely used by userspace, unlike md superblocks.
 * Devices with this superblock should only ever be accessed via device-mapper.
 */
#define DM_RAID_MAGIC 0x64526D44
struct dm_raid_superblock {
	__le32 magic;		/* "DmRd" */
	__le32 features;	/* Used to indicate possible future changes */

	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
	__le32 array_position;	/* The position of this drive in the array */

	__le64 events;		/* Incremented by md when superblock updated */
	__le64 failed_devices;	/* Bit field of devices to indicate failures */

	/*
	 * This offset tracks the progress of the repair or replacement of
	 * an individual drive.
	 */
	__le64 disk_recovery_offset;

	/*
	 * This offset tracks the progress of the initial array
	 * synchronisation/parity calculation.
	 */
	__le64 array_resync_offset;

	/*
	 * RAID characteristics
	 */
	__le32 level;
	__le32 layout;
	__le32 stripe_sectors;

	__u8 pad[452];		/* Round struct to 512 bytes. */
				/* Always set to 0 when writing. */
} __packed;
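
/*
 * On-disk layout of the metadata device, as set up by super_load and
 * super_validate below: the superblock occupies the first 512 bytes
 * (sb_start = 0, sb_size = sizeof(struct dm_raid_superblock)), and the
 * write-intent bitmap starts at a 4096-byte offset.
 */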

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	BUG_ON(!rdev->sb_page);

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
		DMERR("Failed to read superblock of device at position %d",
		      rdev->raid_disk);
		md_error(rdev->mddev, rdev);
		return -EINVAL;
	}

	rdev->sb_loaded = 1;

	return 0;
}

static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	int i;
	uint64_t failed_devices;
	struct dm_raid_superblock *sb;
	struct raid_set *rs = container_of(mddev, struct raid_set, md);

	sb = page_address(rdev->sb_page);
	failed_devices = le64_to_cpu(sb->failed_devices);

	for (i = 0; i < mddev->raid_disks; i++)
		if (!rs->dev[i].data_dev ||
		    test_bit(Faulty, &(rs->dev[i].rdev.flags)))
			failed_devices |= (1ULL << i);

	memset(sb, 0, sizeof(*sb));

	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
	sb->features = cpu_to_le32(0);	/* No features yet */

	sb->num_devices = cpu_to_le32(mddev->raid_disks);
	sb->array_position = cpu_to_le32(rdev->raid_disk);

	sb->events = cpu_to_le64(mddev->events);
	sb->failed_devices = cpu_to_le64(failed_devices);

	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);

	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
}

/*
 * super_load
 *
 * This function creates a superblock if one is not found on the device
 * and will decide which superblock to use if there's a choice.
 *
 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
 */
static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
{
	int ret;
	struct dm_raid_superblock *sb;
	struct dm_raid_superblock *refsb;
	uint64_t events_sb, events_refsb;

	rdev->sb_start = 0;
	rdev->sb_size = sizeof(*sb);

	ret = read_disk_sb(rdev, rdev->sb_size);
	if (ret)
		return ret;

	sb = page_address(rdev->sb_page);

	/*
	 * Two cases that we want to write new superblocks and rebuild:
	 * 1) New device (no matching magic number)
	 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
	 */
	if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
	    (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
		super_sync(rdev->mddev, rdev);

		set_bit(FirstUse, &rdev->flags);

		/* Force writing of superblocks to disk */
		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);

		/* Any superblock is better than none, choose that if given */
		return refdev ? 0 : 1;
	}

	if (!refdev)
		return 1;

	events_sb = le64_to_cpu(sb->events);

	refsb = page_address(refdev->sb_page);
	events_refsb = le64_to_cpu(refsb->events);

	return (events_sb > events_refsb) ? 1 : 0;
}

static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
{
	int role;
	struct raid_set *rs = container_of(mddev, struct raid_set, md);
	uint64_t events_sb;
	uint64_t failed_devices;
	struct dm_raid_superblock *sb;
	uint32_t new_devs = 0;
	uint32_t rebuilds = 0;
	struct md_rdev *r;
	struct dm_raid_superblock *sb2;

	sb = page_address(rdev->sb_page);
	events_sb = le64_to_cpu(sb->events);
	failed_devices = le64_to_cpu(sb->failed_devices);

	/*
	 * Initialise to 1 if this is a new superblock.
	 */
	mddev->events = events_sb ? : 1;

	/*
	 * Reshaping is not currently allowed
	 */
	if ((le32_to_cpu(sb->level) != mddev->level) ||
	    (le32_to_cpu(sb->layout) != mddev->layout) ||
	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	/* We can only change the number of devices in RAID1 right now */
	if ((rs->raid_type->level != 1) &&
	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
		DMERR("Reshaping arrays not yet supported.");
		return -EINVAL;
	}

	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);

	/*
	 * During load, we set FirstUse if a new superblock was written.
	 * There are two reasons we might not have a superblock:
	 * 1) The array is brand new - in which case, all of the
	 *    devices must have their In_sync bit set.  Also,
	 *    recovery_cp must be 0, unless forced.
	 * 2) This is a new device being added to an old array
	 *    and the new device needs to be rebuilt - in which
	 *    case the In_sync bit will /not/ be set and
	 *    recovery_cp must be MaxSector.
	 */
	rdev_for_each(r, mddev) {
		if (!test_bit(In_sync, &r->flags)) {
			DMINFO("Device %d specified for rebuild: "
			       "Clearing superblock", r->raid_disk);
			rebuilds++;
		} else if (test_bit(FirstUse, &r->flags))
			new_devs++;
	}

	if (!rebuilds) {
		if (new_devs == mddev->raid_disks) {
			DMINFO("Superblocks created for new array");
			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
		} else if (new_devs) {
			DMERR("New device injected "
			      "into existing array without 'rebuild' "
			      "parameter specified");
			return -EINVAL;
		}
	} else if (new_devs) {
		DMERR("'rebuild' devices cannot be "
		      "injected into an array with other first-time devices");
		return -EINVAL;
	} else if (mddev->recovery_cp != MaxSector) {
		DMERR("'rebuild' specified while array is not in-sync");
		return -EINVAL;
	}

	/*
	 * Now we set the Faulty bit for those devices that are
	 * recorded in the superblock as failed.
	 */
	rdev_for_each(r, mddev) {
		if (!r->sb_page)
			continue;
		sb2 = page_address(r->sb_page);
		sb2->failed_devices = 0;

		/*
		 * Check for any device re-ordering.
		 */
		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
			role = le32_to_cpu(sb2->array_position);
			if (role != r->raid_disk) {
				if (rs->raid_type->level != 1) {
					rs->ti->error = "Cannot change device "
						"positions in RAID array";
					return -EINVAL;
				}
				DMINFO("RAID1 device #%d now at position #%d",
				       role, r->raid_disk);
			}

			/*
			 * Partial recovery is performed on
			 * returning failed devices.
			 */
			if (failed_devices & (1ULL << role))
				set_bit(Faulty, &r->flags);
		}
	}

	return 0;
}

static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct dm_raid_superblock *sb = page_address(rdev->sb_page);

	/*
	 * If mddev->events is not set, we know we have not yet initialized
	 * the array.
	 */
	if (!mddev->events && super_init_validation(mddev, rdev))
		return -EINVAL;

	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
	if (!test_bit(FirstUse, &rdev->flags)) {
		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
		if (rdev->recovery_offset != MaxSector)
			clear_bit(In_sync, &rdev->flags);
	}

	/*
	 * If a device comes back, set it as not In_sync and no longer faulty.
	 */
	if (test_bit(Faulty, &rdev->flags)) {
		clear_bit(Faulty, &rdev->flags);
		clear_bit(In_sync, &rdev->flags);
		rdev->saved_raid_disk = rdev->raid_disk;
		rdev->recovery_offset = 0;
	}

	clear_bit(FirstUse, &rdev->flags);

	return 0;
}

/*
 * Analyse superblocks and select the freshest.
 */
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
	int ret;
	unsigned redundancy = 0;
	struct raid_dev *dev;
	struct md_rdev *rdev, *tmp, *freshest;
	struct mddev *mddev = &rs->md;

	switch (rs->raid_type->level) {
	case 1:
		redundancy = rs->md.raid_disks - 1;
		break;
	case 4:
	case 5:
	case 6:
		redundancy = rs->raid_type->parity_devs;
		break;
	case 10:
		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
		break;
	default:
		ti->error = "Unknown RAID type";
		return -EINVAL;
	}

	freshest = NULL;
	rdev_for_each_safe(rdev, tmp, mddev) {
		if (!rdev->meta_bdev)
			continue;

		ret = super_load(rdev, freshest);

		switch (ret) {
		case 1:
			freshest = rdev;
			break;
		case 0:
			break;
		default:
			dev = container_of(rdev, struct raid_dev, rdev);
			if (redundancy--) {
				if (dev->meta_dev)
					dm_put_device(ti, dev->meta_dev);

				dev->meta_dev = NULL;
				rdev->meta_bdev = NULL;

				if (rdev->sb_page)
					put_page(rdev->sb_page);

				rdev->sb_page = NULL;

				rdev->sb_loaded = 0;

				/*
				 * We might be able to salvage the data device
				 * even though the meta device has failed.  For
				 * now, we behave as though '- -' had been
				 * set for this device in the table.
				 */
				if (dev->data_dev)
					dm_put_device(ti, dev->data_dev);

				dev->data_dev = NULL;
				rdev->bdev = NULL;

				list_del(&rdev->same_set);

				continue;
			}
			ti->error = "Failed to load superblock";
			return ret;
		}
	}

	if (!freshest)
		return 0;

	/*
	 * Validation of the freshest device provides the source of
	 * validation for the remaining devices.
	 */
	ti->error = "Unable to assemble array: Invalid superblocks";
	if (super_validate(mddev, freshest))
		return -EINVAL;

	rdev_for_each(rdev, mddev)
		if ((rdev != freshest) && super_validate(mddev, rdev))
			return -EINVAL;

	return 0;
}

/*
 * Construct a RAID1, RAID10, or RAID4/5/6 mapping:
 * Args:
 *	<raid_type> <#raid_params> <raid_params>		\
 *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
 *
 * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
 * details on possible <raid_params>.
 */
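/*
 * Illustrative table line (device names and sizes are hypothetical):
 *
 *   0 3905945600 raid raid5_ls 3 64 region_size 1024 \
 *       3 - /dev/sda1 - /dev/sdb1 - /dev/sdc1
 *
 * i.e. three raid params ("64 region_size 1024") followed by three
 * metadata/data device pairs, here with no metadata devices.
 */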
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int ret;
	struct raid_type *rt;
	unsigned long num_raid_params, num_raid_devs;
	struct raid_set *rs = NULL;

	/* Must have at least <raid_type> <#raid_params> */
	if (argc < 2) {
		ti->error = "Too few arguments";
		return -EINVAL;
	}

	/* raid type */
	rt = get_raid_type(argv[0]);
	if (!rt) {
		ti->error = "Unrecognised raid_type";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* number of RAID parameters */
	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
		ti->error = "Cannot understand number of RAID parameters";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* Skip over RAID params for now and find out # of devices */
	if (num_raid_params + 1 > argc) {
		ti->error = "Arguments do not agree with counts given";
		return -EINVAL;
	}

	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
	    (num_raid_devs >= INT_MAX)) {
		ti->error = "Cannot understand number of raid devices";
		return -EINVAL;
	}

	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
	if (ret)
		goto bad;

	ret = -EINVAL;

	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
	argv += num_raid_params + 1;

	if (argc != (num_raid_devs * 2)) {
		ti->error = "Number of supplied RAID devices does not match the count given";
		goto bad;
	}

	ret = dev_parms(rs, argv);
	if (ret)
		goto bad;

	rs->md.sync_super = super_sync;
	ret = analyse_superblocks(ti, rs);
	if (ret)
		goto bad;

	INIT_WORK(&rs->md.event_work, do_table_event);
	ti->private = rs;
	ti->num_flush_requests = 1;

	mutex_lock(&rs->md.reconfig_mutex);
	ret = md_run(&rs->md);
	rs->md.in_sync = 0; /* Assume already marked dirty */
	mutex_unlock(&rs->md.reconfig_mutex);

	if (ret) {
		ti->error = "Failed to run raid array";
		goto bad;
	}

	if (ti->len != rs->md.array_sectors) {
		ti->error = "Array size does not match requested target length";
		ret = -EINVAL;
		goto size_mismatch;
	}
	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	mddev_suspend(&rs->md);
	return 0;

size_mismatch:
	md_stop(&rs->md);
bad:
	context_free(rs);

	return ret;
}

static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	list_del_init(&rs->callbacks.list);
	md_stop(&rs->md);
	context_free(rs);
}

static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	mddev->pers->make_request(mddev, bio);

	return DM_MAPIO_SUBMITTED;
}

static int raid_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	struct raid_set *rs = ti->private;
	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
	unsigned sz = 0;
	int i, array_in_sync = 0;
	sector_t sync;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);

		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
			sync = rs->md.curr_resync_completed;
		else
			sync = rs->md.recovery_cp;

		if (sync >= rs->md.resync_max_sectors) {
			array_in_sync = 1;
			sync = rs->md.resync_max_sectors;
		} else {
			/*
			 * The array may be doing an initial sync, or it may
			 * be rebuilding individual components.  If all the
			 * devices are In_sync, then it is the array that is
			 * being initialized.
			 */
			for (i = 0; i < rs->md.raid_disks; i++)
				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
					array_in_sync = 1;
		}
		/*
		 * Status characters:
		 *  'D' = Dead/Failed device
		 *  'a' = Alive but not in-sync
		 *  'A' = Alive and in-sync
		 */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
				DMEMIT("D");
			else if (!array_in_sync ||
				 !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT("a");
			else
				DMEMIT("A");
		}

		/*
		 * In-sync ratio:
		 *  The in-sync ratio shows the progress of:
		 *   - Initializing the array
		 *   - Rebuilding a subset of devices of the array
		 *  The user can distinguish between the two by referring
		 *  to the status characters.
		 */
		DMEMIT(" %llu/%llu",
		       (unsigned long long) sync,
		       (unsigned long long) rs->md.resync_max_sectors);

		break;
	case STATUSTYPE_TABLE:
		/* The string you would use to construct this array */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2; /* for rebuilds */
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2;
		}

		raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
			raid_param_cnt--;

		DMEMIT("%s %u %u", rs->raid_type->name,
		       raid_param_cnt, rs->md.chunk_sectors);

		if ((rs->print_flags & DMPF_SYNC) &&
		    (rs->md.recovery_cp == MaxSector))
			DMEMIT(" sync");
		if (rs->print_flags & DMPF_NOSYNC)
			DMEMIT(" nosync");

		for (i = 0; i < rs->md.raid_disks; i++)
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT(" rebuild %u", i);

		if (rs->print_flags & DMPF_DAEMON_SLEEP)
			DMEMIT(" daemon_sleep %lu",
			       rs->md.bitmap_info.daemon_sleep);

		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);

		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);

		for (i = 0; i < rs->md.raid_disks; i++)
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				DMEMIT(" write_mostly %u", i);

		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
			DMEMIT(" max_write_behind %lu",
			       rs->md.bitmap_info.max_write_behind);

		if (rs->print_flags & DMPF_STRIPE_CACHE) {
			struct r5conf *conf = rs->md.private;

			/* convert from kiB to sectors */
			DMEMIT(" stripe_cache %d",
			       conf ? conf->max_nr_stripes * 2 : 0);
		}

		if (rs->print_flags & DMPF_REGION_SIZE)
			DMEMIT(" region_size %lu",
			       rs->md.bitmap_info.chunksize >> 9);

		if (rs->print_flags & DMPF_RAID10_COPIES)
			DMEMIT(" raid10_copies %u",
			       raid10_md_layout_to_copies(rs->md.layout));

		if (rs->print_flags & DMPF_RAID10_FORMAT)
			DMEMIT(" raid10_format near");

		DMEMIT(" %d", rs->md.raid_disks);
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (rs->dev[i].meta_dev)
				DMEMIT(" %s", rs->dev[i].meta_dev->name);
			else
				DMEMIT(" -");

			if (rs->dev[i].data_dev)
				DMEMIT(" %s", rs->dev[i].data_dev->name);
			else
				DMEMIT(" -");
		}
	}

	return 0;
}
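
/*
 * Illustrative STATUSTYPE_INFO output for a three-device raid5_ls set
 * part-way through its initial resync (the numbers are hypothetical):
 *
 *   raid5_ls 3 aaa 10240/1952972800
 *
 * All devices report 'a' because the array itself, not an individual
 * device, is still being initialized.
 */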

static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
{
	struct raid_set *rs = ti->private;
	unsigned i;
	int ret = 0;

	for (i = 0; !ret && i < rs->md.raid_disks; i++)
		if (rs->dev[i].data_dev)
			ret = fn(ti,
				 rs->dev[i].data_dev,
				 0, /* No offset on data devs */
				 rs->md.dev_sectors,
				 data);

	return ret;
}

static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;
	unsigned chunk_size = rs->md.chunk_sectors << 9;
	struct r5conf *conf = rs->md.private;

	blk_limits_io_min(limits, chunk_size);
	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}

static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	md_stop_writes(&rs->md);
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	mddev_suspend(&rs->md);
}

static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
	if (!rs->bitmap_loaded) {
		bitmap_load(&rs->md);
		rs->bitmap_loaded = 1;
	}

	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
	mddev_resume(&rs->md);
}

static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 3, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.status = raid_status,
	.iterate_devices = raid_iterate_devices,
	.io_hints = raid_io_hints,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
};

static int __init dm_raid_init(void)
{
	return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid1/raid10/raid4/raid5/raid6 target");
MODULE_ALIAS("dm-raid1");
MODULE_ALIAS("dm-raid10");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");