drbd: Improvements in sanitize_state()
[cascardo/linux.git] / drivers / block / drbd / drbd_main.c
1 /*
2    drbd.c
3
4    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10    Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11    from Logicworks, Inc. for making SDP replication support possible.
12
13    drbd is free software; you can redistribute it and/or modify
14    it under the terms of the GNU General Public License as published by
15    the Free Software Foundation; either version 2, or (at your option)
16    any later version.
17
18    drbd is distributed in the hope that it will be useful,
19    but WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21    GNU General Public License for more details.
22
23    You should have received a copy of the GNU General Public License
24    along with drbd; see the file COPYING.  If not, write to
25    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
26
27  */
28
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
33 #include <net/sock.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
36 #include <linux/fs.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
40 #include <linux/mm.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
48
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
52
53 #include <linux/drbd_limits.h>
54 #include "drbd_int.h"
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57 #include "drbd_vli.h"
58
59 struct after_state_chg_work {
60         struct drbd_work w;
61         union drbd_state os;
62         union drbd_state ns;
63         enum chg_state_flags flags;
64         struct completion *done;
65 };
66
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
71
72 int drbd_init(void);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77                            union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82
83 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
84               "Lars Ellenberg <lars@linbit.com>");
85 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
86 MODULE_VERSION(REL_VERSION);
87 MODULE_LICENSE("GPL");
88 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)");
89 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
90
91 #include <linux/moduleparam.h>
92 /* allow_open_on_secondary */
93 MODULE_PARM_DESC(allow_oos, "DONT USE!");
94 /* thanks to these macros, if compiled into the kernel (not as a module),
95  * this becomes the boot parameter drbd.minor_count (see the example below) */
96 module_param(minor_count, uint, 0444);
97 module_param(disable_sendpage, bool, 0644);
98 module_param(allow_oos, bool, 0);
99 module_param(cn_idx, uint, 0444);
100 module_param(proc_details, int, 0644);
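/* Illustrative example only, not part of the driver: with the module_param()
 * declarations above, these knobs can be set at module load time, e.g.
 *
 *	modprobe drbd minor_count=64 usermode_helper=/usr/sbin/drbdadm
 *
 * or, when drbd is built into the kernel, as boot parameters, e.g.
 *
 *	drbd.minor_count=64
 *
 * The concrete values and helper path shown here are made up for illustration.
 */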
101
102 #ifdef CONFIG_DRBD_FAULT_INJECTION
103 int enable_faults;
104 int fault_rate;
105 static int fault_count;
106 int fault_devs;
107 /* bitmap of enabled faults */
108 module_param(enable_faults, int, 0664);
109 /* fault rate % value - applies to all enabled faults */
110 module_param(fault_rate, int, 0664);
111 /* count of faults inserted */
112 module_param(fault_count, int, 0664);
113 /* bitmap of devices to insert faults on */
114 module_param(fault_devs, int, 0644);
115 #endif
116
117 /* module parameter, defined */
118 unsigned int minor_count = 32;
119 int disable_sendpage;
120 int allow_oos;
121 unsigned int cn_idx = CN_IDX_DRBD;
122 int proc_details;       /* Detail level in /proc/drbd */
123
124 /* Module parameter for setting the user mode helper program
125  * to run. Default is /sbin/drbdadm */
126 char usermode_helper[80] = "/sbin/drbdadm";
127
128 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
129
130 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
131  * as member "struct gendisk *vdisk;"
132  */
133 struct drbd_conf **minor_table;
134
135 struct kmem_cache *drbd_request_cache;
136 struct kmem_cache *drbd_ee_cache;       /* epoch entries */
137 struct kmem_cache *drbd_bm_ext_cache;   /* bitmap extents */
138 struct kmem_cache *drbd_al_ext_cache;   /* activity log extents */
139 mempool_t *drbd_request_mempool;
140 mempool_t *drbd_ee_mempool;
141
142 /* I do not use a standard mempool, because:
143    1) I want to hand out the pre-allocated objects first.
144    2) I want to be able to interrupt sleeping allocation with a signal.
145    Note: This is a singly linked list, the next pointer is the private
146          member of struct page.
147  */
148 struct page *drbd_pp_pool;
149 spinlock_t   drbd_pp_lock;
150 int          drbd_pp_vacant;
151 wait_queue_head_t drbd_pp_wait;
152
153 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
154
155 static const struct block_device_operations drbd_ops = {
156         .owner =   THIS_MODULE,
157         .open =    drbd_open,
158         .release = drbd_release,
159 };
160
161 #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
162
163 #ifdef __CHECKER__
164 /* When checking with sparse, and this is an inline function, sparse will
165    give tons of false positives. When this is a real function, sparse works.
166  */
167 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
168 {
169         int io_allowed;
170
171         atomic_inc(&mdev->local_cnt);
172         io_allowed = (mdev->state.disk >= mins);
173         if (!io_allowed) {
174                 if (atomic_dec_and_test(&mdev->local_cnt))
175                         wake_up(&mdev->misc_wait);
176         }
177         return io_allowed;
178 }
179
180 #endif
181
182 /**
183  * DOC: The transfer log
184  *
185  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
186  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
187  * of the list. There is always at least one &struct drbd_tl_epoch object.
188  *
189  * Each &struct drbd_tl_epoch has a circular double linked list of requests
190  * attached.
191  */
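/* A minimal sketch of walking that structure, for illustration only; the
 * handle_request() callback is hypothetical and not part of the driver.
 * _tl_restart() below has exactly this shape, with per-request bookkeeping
 * added:
 *
 *	struct drbd_tl_epoch *b;
 *	struct drbd_request *req;
 *
 *	for (b = mdev->oldest_tle; b; b = b->next)	 // singly linked epochs
 *		list_for_each_entry(req, &b->requests, tl_requests)
 *			handle_request(req);		 // hypothetical callback
 */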
192 static int tl_init(struct drbd_conf *mdev)
193 {
194         struct drbd_tl_epoch *b;
195
196         /* during device minor initialization, we may well use GFP_KERNEL */
197         b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
198         if (!b)
199                 return 0;
200         INIT_LIST_HEAD(&b->requests);
201         INIT_LIST_HEAD(&b->w.list);
202         b->next = NULL;
203         b->br_number = 4711;
204         b->n_writes = 0;
205         b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
206
207         mdev->oldest_tle = b;
208         mdev->newest_tle = b;
209         INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
210
211         mdev->tl_hash = NULL;
212         mdev->tl_hash_s = 0;
213
214         return 1;
215 }
216
217 static void tl_cleanup(struct drbd_conf *mdev)
218 {
219         D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
220         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
221         kfree(mdev->oldest_tle);
222         mdev->oldest_tle = NULL;
223         kfree(mdev->unused_spare_tle);
224         mdev->unused_spare_tle = NULL;
225         kfree(mdev->tl_hash);
226         mdev->tl_hash = NULL;
227         mdev->tl_hash_s = 0;
228 }
229
230 /**
231  * _tl_add_barrier() - Adds a barrier to the transfer log
232  * @mdev:       DRBD device.
233  * @new:        Barrier to be added before the current head of the TL.
234  *
235  * The caller must hold the req_lock.
236  */
237 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
238 {
239         struct drbd_tl_epoch *newest_before;
240
241         INIT_LIST_HEAD(&new->requests);
242         INIT_LIST_HEAD(&new->w.list);
243         new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
244         new->next = NULL;
245         new->n_writes = 0;
246
247         newest_before = mdev->newest_tle;
248         /* never send a barrier number == 0, because that is special-cased
249          * when using TCQ for our write ordering code */
250         new->br_number = (newest_before->br_number+1) ?: 1;
251         if (mdev->newest_tle != new) {
252                 mdev->newest_tle->next = new;
253                 mdev->newest_tle = new;
254         }
255 }
256
257 /**
258  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
259  * @mdev:       DRBD device.
260  * @barrier_nr: Expected identifier of the DRBD write barrier packet.
261  * @set_size:   Expected number of requests before that barrier.
262  *
263  * In case the passed barrier_nr or set_size does not match the oldest
264  * &struct drbd_tl_epoch objects this function will cause a termination
265  * of the connection.
266  */
267 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
268                        unsigned int set_size)
269 {
270         struct drbd_tl_epoch *b, *nob; /* next old barrier */
271         struct list_head *le, *tle;
272         struct drbd_request *r;
273
274         spin_lock_irq(&mdev->req_lock);
275
276         b = mdev->oldest_tle;
277
278         /* first some paranoia code */
279         if (b == NULL) {
280                 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
281                         barrier_nr);
282                 goto bail;
283         }
284         if (b->br_number != barrier_nr) {
285                 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
286                         barrier_nr, b->br_number);
287                 goto bail;
288         }
289         if (b->n_writes != set_size) {
290                 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
291                         barrier_nr, set_size, b->n_writes);
292                 goto bail;
293         }
294
295         /* Clean up list of requests processed during current epoch */
296         list_for_each_safe(le, tle, &b->requests) {
297                 r = list_entry(le, struct drbd_request, tl_requests);
298                 _req_mod(r, barrier_acked);
299         }
300         /* There could be requests on the list waiting for completion
301            of the write to the local disk. To avoid corruption of the
302            slab's data structures we have to remove the list's head.
303
304            Also there could have been a barrier ack out of sequence, overtaking
305            the write acks - which would be a bug and violating write ordering.
306            To not deadlock in case we lose connection while such requests are
307            still pending, we need some way to find them for the
308            _req_mod(connection_lost_while_pending).
309
310            These have been list_move'd to the out_of_sequence_requests list in
311            _req_mod(, barrier_acked) above.
312            */
313         list_del_init(&b->requests);
314
315         nob = b->next;
316         if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
317                 _tl_add_barrier(mdev, b);
318                 if (nob)
319                         mdev->oldest_tle = nob;
320                 /* if nob == NULL, b was the only barrier and becomes the new
321                    barrier. Therefore mdev->oldest_tle already points to b */
322         } else {
323                 D_ASSERT(nob != NULL);
324                 mdev->oldest_tle = nob;
325                 kfree(b);
326         }
327
328         spin_unlock_irq(&mdev->req_lock);
329         dec_ap_pending(mdev);
330
331         return;
332
333 bail:
334         spin_unlock_irq(&mdev->req_lock);
335         drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
336 }
337
338 /**
339  * _tl_restart() - Walks the transfer log, and applies an action to all requests
340  * @mdev:       DRBD device.
341  * @what:       The action/event to perform with all request objects
342  *
343  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
344  * restart_frozen_disk_io.
345  */
346 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
347 {
348         struct drbd_tl_epoch *b, *tmp, **pn;
349         struct list_head *le, *tle, carry_reads;
350         struct drbd_request *req;
351         int rv, n_writes, n_reads;
352
353         b = mdev->oldest_tle;
354         pn = &mdev->oldest_tle;
355         while (b) {
356                 n_writes = 0;
357                 n_reads = 0;
358                 INIT_LIST_HEAD(&carry_reads);
359                 list_for_each_safe(le, tle, &b->requests) {
360                         req = list_entry(le, struct drbd_request, tl_requests);
361                         rv = _req_mod(req, what);
362
363                         n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
364                         n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
365                 }
366                 tmp = b->next;
367
368                 if (n_writes) {
369                         if (what == resend) {
370                                 b->n_writes = n_writes;
371                                 if (b->w.cb == NULL) {
372                                         b->w.cb = w_send_barrier;
373                                         inc_ap_pending(mdev);
374                                         set_bit(CREATE_BARRIER, &mdev->flags);
375                                 }
376
377                                 drbd_queue_work(&mdev->data.work, &b->w);
378                         }
379                         pn = &b->next;
380                 } else {
381                         if (n_reads)
382                                 list_add(&carry_reads, &b->requests);
383                         /* there could still be requests on that ring list,
384                          * in case local io is still pending */
385                         list_del(&b->requests);
386
387                         /* dec_ap_pending corresponding to queue_barrier.
388                          * the newest barrier may not have been queued yet,
389                          * in which case w.cb is still NULL. */
390                         if (b->w.cb != NULL)
391                                 dec_ap_pending(mdev);
392
393                         if (b == mdev->newest_tle) {
394                                 /* recycle, but reinit! */
395                                 D_ASSERT(tmp == NULL);
396                                 INIT_LIST_HEAD(&b->requests);
397                                 list_splice(&carry_reads, &b->requests);
398                                 INIT_LIST_HEAD(&b->w.list);
399                                 b->w.cb = NULL;
400                                 b->br_number = net_random();
401                                 b->n_writes = 0;
402
403                                 *pn = b;
404                                 break;
405                         }
406                         *pn = tmp;
407                         kfree(b);
408                 }
409                 b = tmp;
410                 list_splice(&carry_reads, &b->requests);
411         }
412 }
413
414
415 /**
416  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
417  * @mdev:       DRBD device.
418  *
419  * This is called after the connection to the peer was lost. The storage covered
420  * by the requests on the transfer log gets marked as out of sync. Called from the
421  * receiver thread and the worker thread.
422  */
423 void tl_clear(struct drbd_conf *mdev)
424 {
425         struct list_head *le, *tle;
426         struct drbd_request *r;
427
428         spin_lock_irq(&mdev->req_lock);
429
430         _tl_restart(mdev, connection_lost_while_pending);
431
432         /* we expect this list to be empty. */
433         D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
434
435         /* but just in case, clean it up anyway! */
436         list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
437                 r = list_entry(le, struct drbd_request, tl_requests);
438                 /* It would be nice to complete outside of spinlock.
439                  * But this is easier for now. */
440                 _req_mod(r, connection_lost_while_pending);
441         }
442
443         /* ensure bit indicating barrier is required is clear */
444         clear_bit(CREATE_BARRIER, &mdev->flags);
445
446         memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
447
448         spin_unlock_irq(&mdev->req_lock);
449 }
450
451 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
452 {
453         spin_lock_irq(&mdev->req_lock);
454         _tl_restart(mdev, what);
455         spin_unlock_irq(&mdev->req_lock);
456 }
457
458 /**
459  * cl_wide_st_chg() - TRUE if the state change is a cluster wide one
460  * @mdev:       DRBD device.
461  * @os:         old (current) state.
462  * @ns:         new (wanted) state.
463  */
464 static int cl_wide_st_chg(struct drbd_conf *mdev,
465                           union drbd_state os, union drbd_state ns)
466 {
467         return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
468                  ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
469                   (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
470                   (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
471                   (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
472                 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
473                 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
474 }
475
476 int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
477                       union drbd_state mask, union drbd_state val)
478 {
479         unsigned long flags;
480         union drbd_state os, ns;
481         int rv;
482
483         spin_lock_irqsave(&mdev->req_lock, flags);
484         os = mdev->state;
485         ns.i = (os.i & ~mask.i) | val.i;
486         rv = _drbd_set_state(mdev, ns, f, NULL);
487         ns = mdev->state;
488         spin_unlock_irqrestore(&mdev->req_lock, flags);
489
490         return rv;
491 }
492
493 /**
494  * drbd_force_state() - Impose a change which happens outside our control on our state
495  * @mdev:       DRBD device.
496  * @mask:       mask of state bits to change.
497  * @val:        value of new state bits.
498  */
499 void drbd_force_state(struct drbd_conf *mdev,
500         union drbd_state mask, union drbd_state val)
501 {
502         drbd_change_state(mdev, CS_HARD, mask, val);
503 }
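/* Illustrative sketch only: the (mask, val) pair selects which fields of
 * union drbd_state to change.  The NS(field, value) macro (from drbd_int.h)
 * builds such a pair touching exactly one field, so a call like
 *
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 *
 * rewrites only .conn: the idiom  ns.i = (os.i & ~mask.i) | val.i  in
 * drbd_change_state() keeps every bit outside the mask at its old value.
 */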
504
505 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
506 static int is_valid_state_transition(struct drbd_conf *,
507                                      union drbd_state, union drbd_state);
508 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
509                                        union drbd_state ns, const char **warn_sync_abort);
510 int drbd_send_state_req(struct drbd_conf *,
511                         union drbd_state, union drbd_state);
512
513 static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev,
514                                     union drbd_state mask, union drbd_state val)
515 {
516         union drbd_state os, ns;
517         unsigned long flags;
518         int rv;
519
520         if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
521                 return SS_CW_SUCCESS;
522
523         if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
524                 return SS_CW_FAILED_BY_PEER;
525
526         rv = 0;
527         spin_lock_irqsave(&mdev->req_lock, flags);
528         os = mdev->state;
529         ns.i = (os.i & ~mask.i) | val.i;
530         ns = sanitize_state(mdev, os, ns, NULL);
531
532         if (!cl_wide_st_chg(mdev, os, ns))
533                 rv = SS_CW_NO_NEED;
534         if (!rv) {
535                 rv = is_valid_state(mdev, ns);
536                 if (rv == SS_SUCCESS) {
537                         rv = is_valid_state_transition(mdev, ns, os);
538                         if (rv == SS_SUCCESS)
539                                 rv = 0; /* cont waiting, otherwise fail. */
540                 }
541         }
542         spin_unlock_irqrestore(&mdev->req_lock, flags);
543
544         return rv;
545 }
546
547 /**
548  * drbd_req_state() - Perform a possibly cluster-wide state change
549  * @mdev:       DRBD device.
550  * @mask:       mask of state bits to change.
551  * @val:        value of new state bits.
552  * @f:          flags
553  *
554  * Should not be called directly, use drbd_request_state() or
555  * _drbd_request_state().
556  */
557 static int drbd_req_state(struct drbd_conf *mdev,
558                           union drbd_state mask, union drbd_state val,
559                           enum chg_state_flags f)
560 {
561         struct completion done;
562         unsigned long flags;
563         union drbd_state os, ns;
564         int rv;
565
566         init_completion(&done);
567
568         if (f & CS_SERIALIZE)
569                 mutex_lock(&mdev->state_mutex);
570
571         spin_lock_irqsave(&mdev->req_lock, flags);
572         os = mdev->state;
573         ns.i = (os.i & ~mask.i) | val.i;
574         ns = sanitize_state(mdev, os, ns, NULL);
575
576         if (cl_wide_st_chg(mdev, os, ns)) {
577                 rv = is_valid_state(mdev, ns);
578                 if (rv == SS_SUCCESS)
579                         rv = is_valid_state_transition(mdev, ns, os);
580                 spin_unlock_irqrestore(&mdev->req_lock, flags);
581
582                 if (rv < SS_SUCCESS) {
583                         if (f & CS_VERBOSE)
584                                 print_st_err(mdev, os, ns, rv);
585                         goto abort;
586                 }
587
588                 drbd_state_lock(mdev);
589                 if (!drbd_send_state_req(mdev, mask, val)) {
590                         drbd_state_unlock(mdev);
591                         rv = SS_CW_FAILED_BY_PEER;
592                         if (f & CS_VERBOSE)
593                                 print_st_err(mdev, os, ns, rv);
594                         goto abort;
595                 }
596
597                 wait_event(mdev->state_wait,
598                         (rv = _req_st_cond(mdev, mask, val)));
599
600                 if (rv < SS_SUCCESS) {
601                         drbd_state_unlock(mdev);
602                         if (f & CS_VERBOSE)
603                                 print_st_err(mdev, os, ns, rv);
604                         goto abort;
605                 }
606                 spin_lock_irqsave(&mdev->req_lock, flags);
607                 os = mdev->state;
608                 ns.i = (os.i & ~mask.i) | val.i;
609                 rv = _drbd_set_state(mdev, ns, f, &done);
610                 drbd_state_unlock(mdev);
611         } else {
612                 rv = _drbd_set_state(mdev, ns, f, &done);
613         }
614
615         spin_unlock_irqrestore(&mdev->req_lock, flags);
616
617         if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
618                 D_ASSERT(current != mdev->worker.task);
619                 wait_for_completion(&done);
620         }
621
622 abort:
623         if (f & CS_SERIALIZE)
624                 mutex_unlock(&mdev->state_mutex);
625
626         return rv;
627 }
628
629 /**
630  * _drbd_request_state() - Request a state change (with flags)
631  * @mdev:       DRBD device.
632  * @mask:       mask of state bits to change.
633  * @val:        value of new state bits.
634  * @f:          flags
635  *
636  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
637  * flag, or when logging of failed state change requests is not desired.
638  */
639 int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
640                         union drbd_state val,   enum chg_state_flags f)
641 {
642         int rv;
643
644         wait_event(mdev->state_wait,
645                    (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
646
647         return rv;
648 }
649
650 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
651 {
652         dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
653             name,
654             drbd_conn_str(ns.conn),
655             drbd_role_str(ns.role),
656             drbd_role_str(ns.peer),
657             drbd_disk_str(ns.disk),
658             drbd_disk_str(ns.pdsk),
659             is_susp(ns) ? 's' : 'r',
660             ns.aftr_isp ? 'a' : '-',
661             ns.peer_isp ? 'p' : '-',
662             ns.user_isp ? 'u' : '-'
663             );
664 }
665
666 void print_st_err(struct drbd_conf *mdev,
667         union drbd_state os, union drbd_state ns, int err)
668 {
669         if (err == SS_IN_TRANSIENT_STATE)
670                 return;
671         dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
672         print_st(mdev, " state", os);
673         print_st(mdev, "wanted", ns);
674 }
675
676
677 #define drbd_peer_str drbd_role_str
678 #define drbd_pdsk_str drbd_disk_str
679
680 #define drbd_susp_str(A)     ((A) ? "1" : "0")
681 #define drbd_aftr_isp_str(A) ((A) ? "1" : "0")
682 #define drbd_peer_isp_str(A) ((A) ? "1" : "0")
683 #define drbd_user_isp_str(A) ((A) ? "1" : "0")
684
685 #define PSC(A) \
686         ({ if (ns.A != os.A) { \
687                 pbp += sprintf(pbp, #A "( %s -> %s ) ", \
688                               drbd_##A##_str(os.A), \
689                               drbd_##A##_str(ns.A)); \
690         } })
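/* Illustration only: PSC(role) expands (roughly) to
 *
 *	if (ns.role != os.role)
 *		pbp += sprintf(pbp, "role( %s -> %s ) ",
 *			       drbd_role_str(os.role), drbd_role_str(ns.role));
 *
 * so a Secondary -> Primary transition appends "role( Secondary -> Primary ) "
 * to the one-line state change summary printed by __drbd_set_state() below.
 */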
691
692 /**
693  * is_valid_state() - Returns an SS_ error code if ns is not valid
694  * @mdev:       DRBD device.
695  * @ns:         State to consider.
696  */
697 static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
698 {
699         /* See drbd_state_sw_errors in drbd_strings.c */
700
701         enum drbd_fencing_p fp;
702         int rv = SS_SUCCESS;
703
704         fp = FP_DONT_CARE;
705         if (get_ldev(mdev)) {
706                 fp = mdev->ldev->dc.fencing;
707                 put_ldev(mdev);
708         }
709
710         if (get_net_conf(mdev)) {
711                 if (!mdev->net_conf->two_primaries &&
712                     ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
713                         rv = SS_TWO_PRIMARIES;
714                 put_net_conf(mdev);
715         }
716
717         if (rv <= 0)
718                 /* already found a reason to abort */;
719         else if (ns.role == R_SECONDARY && mdev->open_cnt)
720                 rv = SS_DEVICE_IN_USE;
721
722         else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
723                 rv = SS_NO_UP_TO_DATE_DISK;
724
725         else if (fp >= FP_RESOURCE &&
726                  ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
727                 rv = SS_PRIMARY_NOP;
728
729         else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
730                 rv = SS_NO_UP_TO_DATE_DISK;
731
732         else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
733                 rv = SS_NO_LOCAL_DISK;
734
735         else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
736                 rv = SS_NO_REMOTE_DISK;
737
738         else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
739                 rv = SS_NO_UP_TO_DATE_DISK;
740
741         else if ((ns.conn == C_CONNECTED ||
742                   ns.conn == C_WF_BITMAP_S ||
743                   ns.conn == C_SYNC_SOURCE ||
744                   ns.conn == C_PAUSED_SYNC_S) &&
745                   ns.disk == D_OUTDATED)
746                 rv = SS_CONNECTED_OUTDATES;
747
748         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
749                  (mdev->sync_conf.verify_alg[0] == 0))
750                 rv = SS_NO_VERIFY_ALG;
751
752         else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
753                   mdev->agreed_pro_version < 88)
754                 rv = SS_NOT_SUPPORTED;
755
756         return rv;
757 }
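/* Example only: a request to become R_PRIMARY while disconnected with a
 * merely D_CONSISTENT local disk, e.g.
 *	ns = { .role = R_PRIMARY, .conn = C_STANDALONE, .disk = D_CONSISTENT },
 * is refused with SS_NO_UP_TO_DATE_DISK by the rule above that checks
 * (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE);
 * the first matching rule in the if/else chain determines the returned error.
 */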
758
759 /**
760  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
761  * @mdev:       DRBD device.
762  * @ns:         new state.
763  * @os:         old state.
764  */
765 static int is_valid_state_transition(struct drbd_conf *mdev,
766                                      union drbd_state ns, union drbd_state os)
767 {
768         int rv = SS_SUCCESS;
769
770         if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
771             os.conn > C_CONNECTED)
772                 rv = SS_RESYNC_RUNNING;
773
774         if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
775                 rv = SS_ALREADY_STANDALONE;
776
777         if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
778                 rv = SS_IS_DISKLESS;
779
780         if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
781                 rv = SS_NO_NET_CONFIG;
782
783         if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
784                 rv = SS_LOWER_THAN_OUTDATED;
785
786         if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
787                 rv = SS_IN_TRANSIENT_STATE;
788
789         if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
790                 rv = SS_IN_TRANSIENT_STATE;
791
792         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
793                 rv = SS_NEED_CONNECTION;
794
795         if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
796             ns.conn != os.conn && os.conn > C_CONNECTED)
797                 rv = SS_RESYNC_RUNNING;
798
799         if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
800             os.conn < C_CONNECTED)
801                 rv = SS_NEED_CONNECTION;
802
803         return rv;
804 }
805
806 /**
807  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
808  * @mdev:       DRBD device.
809  * @os:         old state.
810  * @ns:         new state.
811  * @warn_sync_abort:
812  *
813  * When we lose the connection, we have to set the state of the peer's disk (pdsk)
814  * to D_UNKNOWN. This rule and many more along those lines are in this function.
815  */
816 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
817                                        union drbd_state ns, const char **warn_sync_abort)
818 {
819         enum drbd_fencing_p fp;
820         enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
821
822         fp = FP_DONT_CARE;
823         if (get_ldev(mdev)) {
824                 fp = mdev->ldev->dc.fencing;
825                 put_ldev(mdev);
826         }
827
828         /* Do not let a network error state configure a device's network part */
829         if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
830             os.conn <= C_DISCONNECTING)
831                 ns.conn = os.conn;
832
833         /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
834          * If you try to go into some Sync* state, that shall fail (elsewhere). */
835         if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
836             ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
837                 ns.conn = os.conn;
838
839         /* we cannot fail (again) if we already detached */
840         if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
841                 ns.disk = D_DISKLESS;
842
843         /* if we are only D_ATTACHING yet,
844          * we can (and should) go directly to D_DISKLESS. */
845         if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
846                 ns.disk = D_DISKLESS;
847
848         /* After C_DISCONNECTING only C_STANDALONE may follow */
849         if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
850                 ns.conn = os.conn;
851
852         if (ns.conn < C_CONNECTED) {
853                 ns.peer_isp = 0;
854                 ns.peer = R_UNKNOWN;
855                 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
856                         ns.pdsk = D_UNKNOWN;
857         }
858
859         /* Clear the aftr_isp when becoming unconfigured */
860         if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
861                 ns.aftr_isp = 0;
862
863         /* Abort resync if a disk fails/detaches */
864         if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
865             (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
866                 if (warn_sync_abort)
867                         *warn_sync_abort =
868                                 os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
869                                 "Online-verify" : "Resync";
870                 ns.conn = C_CONNECTED;
871         }
872
873         /* Connection breaks down before we finished "Negotiating" */
874         if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
875             get_ldev_if_state(mdev, D_NEGOTIATING)) {
876                 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
877                         ns.disk = mdev->new_state_tmp.disk;
878                         ns.pdsk = mdev->new_state_tmp.pdsk;
879                 } else {
880                         dev_alert(DEV, "Connection lost while negotiating, no data!\n");
881                         ns.disk = D_DISKLESS;
882                         ns.pdsk = D_UNKNOWN;
883                 }
884                 put_ldev(mdev);
885         }
886
887         /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
888         if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
889                 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
890                         ns.disk = D_UP_TO_DATE;
891                 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
892                         ns.pdsk = D_UP_TO_DATE;
893         }
894
895         /* Implications of the connection state on the disk states */
896         disk_min = D_DISKLESS;
897         disk_max = D_UP_TO_DATE;
898         pdsk_min = D_INCONSISTENT;
899         pdsk_max = D_UNKNOWN;
900         switch ((enum drbd_conns)ns.conn) {
901         case C_WF_BITMAP_T:
902         case C_PAUSED_SYNC_T:
903         case C_STARTING_SYNC_T:
904         case C_WF_SYNC_UUID:
905         case C_BEHIND:
906                 disk_min = D_INCONSISTENT;
907                 disk_max = D_OUTDATED;
908                 pdsk_min = D_UP_TO_DATE;
909                 pdsk_max = D_UP_TO_DATE;
910                 break;
911         case C_VERIFY_S:
912         case C_VERIFY_T:
913                 disk_min = D_UP_TO_DATE;
914                 disk_max = D_UP_TO_DATE;
915                 pdsk_min = D_UP_TO_DATE;
916                 pdsk_max = D_UP_TO_DATE;
917                 break;
918         case C_CONNECTED:
919                 disk_min = D_DISKLESS;
920                 disk_max = D_UP_TO_DATE;
921                 pdsk_min = D_DISKLESS;
922                 pdsk_max = D_UP_TO_DATE;
923                 break;
924         case C_WF_BITMAP_S:
925         case C_PAUSED_SYNC_S:
926         case C_STARTING_SYNC_S:
927         case C_AHEAD:
928                 disk_min = D_UP_TO_DATE;
929                 disk_max = D_UP_TO_DATE;
930                 pdsk_min = D_INCONSISTENT;
931                 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
932                 break;
933         case C_SYNC_TARGET:
934                 disk_min = D_INCONSISTENT;
935                 disk_max = D_INCONSISTENT;
936                 pdsk_min = D_UP_TO_DATE;
937                 pdsk_max = D_UP_TO_DATE;
938                 break;
939         case C_SYNC_SOURCE:
940                 disk_min = D_UP_TO_DATE;
941                 disk_max = D_UP_TO_DATE;
942                 pdsk_min = D_INCONSISTENT;
943                 pdsk_max = D_INCONSISTENT;
944                 break;
945         case C_STANDALONE:
946         case C_DISCONNECTING:
947         case C_UNCONNECTED:
948         case C_TIMEOUT:
949         case C_BROKEN_PIPE:
950         case C_NETWORK_FAILURE:
951         case C_PROTOCOL_ERROR:
952         case C_TEAR_DOWN:
953         case C_WF_CONNECTION:
954         case C_WF_REPORT_PARAMS:
955         case C_MASK:
956                 break;
957         }
958         if (ns.disk > disk_max)
959                 ns.disk = disk_max;
960
961         if (ns.disk < disk_min) {
962                 dev_warn(DEV, "Implicitly set disk from %s to %s\n",
963                          drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
964                 ns.disk = disk_min;
965         }
966         if (ns.pdsk > pdsk_max)
967                 ns.pdsk = pdsk_max;
968
969         if (ns.pdsk < pdsk_min) {
970                 dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
971                          drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
972                 ns.pdsk = pdsk_min;
973         }
974
975         if (fp == FP_STONITH &&
976             (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
977             !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
978                 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
979
980         if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
981             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
982             !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
983                 ns.susp_nod = 1; /* Suspend IO while no up-to-date data is accessible, neither locally nor on the peer */
984
985         if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
986                 if (ns.conn == C_SYNC_SOURCE)
987                         ns.conn = C_PAUSED_SYNC_S;
988                 if (ns.conn == C_SYNC_TARGET)
989                         ns.conn = C_PAUSED_SYNC_T;
990         } else {
991                 if (ns.conn == C_PAUSED_SYNC_S)
992                         ns.conn = C_SYNC_SOURCE;
993                 if (ns.conn == C_PAUSED_SYNC_T)
994                         ns.conn = C_SYNC_TARGET;
995         }
996
997         return ns;
998 }
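/* Worked example for the clamping table in sanitize_state(), illustration
 * only: a requested state of
 *	{ conn = C_SYNC_TARGET, disk = D_UP_TO_DATE, pdsk = D_UNKNOWN }
 * becomes
 *	{ conn = C_SYNC_TARGET, disk = D_INCONSISTENT, pdsk = D_UP_TO_DATE },
 * since C_SYNC_TARGET sets disk_min = disk_max = D_INCONSISTENT and
 * pdsk_min = pdsk_max = D_UP_TO_DATE: disk is silently clamped down to
 * disk_max, while pdsk is raised to pdsk_min with an "Implicitly set pdsk"
 * warning.
 */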
999
1000 /* helper for __drbd_set_state */
1001 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1002 {
1003         if (mdev->agreed_pro_version < 90)
1004                 mdev->ov_start_sector = 0;
1005         mdev->rs_total = drbd_bm_bits(mdev);
1006         mdev->ov_position = 0;
1007         if (cs == C_VERIFY_T) {
1008                 /* starting online verify from an arbitrary position
1009                  * does not fit well into the existing protocol.
1010                  * on C_VERIFY_T, we initialize ov_left and friends
1011                  * implicitly in receive_DataRequest once the
1012                  * first P_OV_REQUEST is received */
1013                 mdev->ov_start_sector = ~(sector_t)0;
1014         } else {
1015                 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1016                 if (bit >= mdev->rs_total) {
1017                         mdev->ov_start_sector =
1018                                 BM_BIT_TO_SECT(mdev->rs_total - 1);
1019                         mdev->rs_total = 1;
1020                 } else
1021                         mdev->rs_total -= bit;
1022                 mdev->ov_position = mdev->ov_start_sector;
1023         }
1024         mdev->ov_left = mdev->rs_total;
1025 }
1026
1027 static void drbd_resume_al(struct drbd_conf *mdev)
1028 {
1029         if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1030                 dev_info(DEV, "Resumed AL updates\n");
1031 }
1032
1033 /**
1034  * __drbd_set_state() - Set a new DRBD state
1035  * @mdev:       DRBD device.
1036  * @ns:         new state.
1037  * @flags:      Flags
1038  * @done:       Optional completion, that will get completed after the after_state_ch() finished
1039  *
1040  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1041  */
1042 int __drbd_set_state(struct drbd_conf *mdev,
1043                     union drbd_state ns, enum chg_state_flags flags,
1044                     struct completion *done)
1045 {
1046         union drbd_state os;
1047         int rv = SS_SUCCESS;
1048         const char *warn_sync_abort = NULL;
1049         struct after_state_chg_work *ascw;
1050
1051         os = mdev->state;
1052
1053         ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1054
1055         if (ns.i == os.i)
1056                 return SS_NOTHING_TO_DO;
1057
1058         if (!(flags & CS_HARD)) {
1059                 /*  pre-state-change checks ; only look at ns  */
1060                 /* See drbd_state_sw_errors in drbd_strings.c */
1061
1062                 rv = is_valid_state(mdev, ns);
1063                 if (rv < SS_SUCCESS) {
1064                         /* If the old state was illegal as well, then let
1065                            this happen...*/
1066
1067                         if (is_valid_state(mdev, os) == rv)
1068                                 rv = is_valid_state_transition(mdev, ns, os);
1069                 } else
1070                         rv = is_valid_state_transition(mdev, ns, os);
1071         }
1072
1073         if (rv < SS_SUCCESS) {
1074                 if (flags & CS_VERBOSE)
1075                         print_st_err(mdev, os, ns, rv);
1076                 return rv;
1077         }
1078
1079         if (warn_sync_abort)
1080                 dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1081
1082         {
1083                 char *pbp, pb[300];
1084                 pbp = pb;
1085                 *pbp = 0;
1086                 PSC(role);
1087                 PSC(peer);
1088                 PSC(conn);
1089                 PSC(disk);
1090                 PSC(pdsk);
1091                 if (is_susp(ns) != is_susp(os))
1092                         pbp += sprintf(pbp, "susp( %s -> %s ) ",
1093                                        drbd_susp_str(is_susp(os)),
1094                                        drbd_susp_str(is_susp(ns)));
1095                 PSC(aftr_isp);
1096                 PSC(peer_isp);
1097                 PSC(user_isp);
1098                 dev_info(DEV, "%s\n", pb);
1099         }
1100
1101         /* solve the race between becoming unconfigured,
1102          * worker doing the cleanup, and
1103          * admin reconfiguring us:
1104          * on (re)configure, first set CONFIG_PENDING,
1105          * then wait for a potentially exiting worker,
1106          * start the worker, and schedule one no_op.
1107          * then proceed with configuration.
1108          */
1109         if (ns.disk == D_DISKLESS &&
1110             ns.conn == C_STANDALONE &&
1111             ns.role == R_SECONDARY &&
1112             !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1113                 set_bit(DEVICE_DYING, &mdev->flags);
1114
1115         /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1116          * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1117          * drbd_ldev_destroy() won't happen before our corresponding
1118          * after_state_ch works run, where we put_ldev again. */
1119         if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1120             (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1121                 atomic_inc(&mdev->local_cnt);
1122
1123         mdev->state = ns;
1124         wake_up(&mdev->misc_wait);
1125         wake_up(&mdev->state_wait);
1126
1127         /* aborted verify run. log the last position */
1128         if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1129             ns.conn < C_CONNECTED) {
1130                 mdev->ov_start_sector =
1131                         BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1132                 dev_info(DEV, "Online Verify reached sector %llu\n",
1133                         (unsigned long long)mdev->ov_start_sector);
1134         }
1135
1136         if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1137             (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1138                 dev_info(DEV, "Syncer continues.\n");
1139                 mdev->rs_paused += (long)jiffies
1140                                   -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1141                 if (ns.conn == C_SYNC_TARGET)
1142                         mod_timer(&mdev->resync_timer, jiffies);
1143         }
1144
1145         if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1146             (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1147                 dev_info(DEV, "Resync suspended\n");
1148                 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1149         }
1150
1151         if (os.conn == C_CONNECTED &&
1152             (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1153                 unsigned long now = jiffies;
1154                 int i;
1155
1156                 set_ov_position(mdev, ns.conn);
1157                 mdev->rs_start = now;
1158                 mdev->rs_last_events = 0;
1159                 mdev->rs_last_sect_ev = 0;
1160                 mdev->ov_last_oos_size = 0;
1161                 mdev->ov_last_oos_start = 0;
1162
1163                 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1164                         mdev->rs_mark_left[i] = mdev->ov_left;
1165                         mdev->rs_mark_time[i] = now;
1166                 }
1167
1168                 drbd_rs_controller_reset(mdev);
1169
1170                 if (ns.conn == C_VERIFY_S) {
1171                         dev_info(DEV, "Starting Online Verify from sector %llu\n",
1172                                         (unsigned long long)mdev->ov_position);
1173                         mod_timer(&mdev->resync_timer, jiffies);
1174                 }
1175         }
1176
1177         if (get_ldev(mdev)) {
1178                 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1179                                                  MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1180                                                  MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1181
1182                 if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1183                         mdf |= MDF_CRASHED_PRIMARY;
1184                 if (mdev->state.role == R_PRIMARY ||
1185                     (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1186                         mdf |= MDF_PRIMARY_IND;
1187                 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1188                         mdf |= MDF_CONNECTED_IND;
1189                 if (mdev->state.disk > D_INCONSISTENT)
1190                         mdf |= MDF_CONSISTENT;
1191                 if (mdev->state.disk > D_OUTDATED)
1192                         mdf |= MDF_WAS_UP_TO_DATE;
1193                 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1194                         mdf |= MDF_PEER_OUT_DATED;
1195                 if (mdf != mdev->ldev->md.flags) {
1196                         mdev->ldev->md.flags = mdf;
1197                         drbd_md_mark_dirty(mdev);
1198                 }
1199                 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1200                         drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1201                 put_ldev(mdev);
1202         }
1203
1204         /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1205         if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1206             os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1207                 set_bit(CONSIDER_RESYNC, &mdev->flags);
1208
1209         /* Receiver should clean up itself */
1210         if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1211                 drbd_thread_stop_nowait(&mdev->receiver);
1212
1213         /* Now the receiver finished cleaning up itself, it should die */
1214         if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1215                 drbd_thread_stop_nowait(&mdev->receiver);
1216
1217         /* Upon network failure, we need to restart the receiver. */
1218         if (os.conn > C_TEAR_DOWN &&
1219             ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1220                 drbd_thread_restart_nowait(&mdev->receiver);
1221
1222         /* Resume AL writing if we get a connection */
1223         if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1224                 drbd_resume_al(mdev);
1225
1226         ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1227         if (ascw) {
1228                 ascw->os = os;
1229                 ascw->ns = ns;
1230                 ascw->flags = flags;
1231                 ascw->w.cb = w_after_state_ch;
1232                 ascw->done = done;
1233                 drbd_queue_work(&mdev->data.work, &ascw->w);
1234         } else {
1235                 dev_warn(DEV, "Could not kmalloc an ascw\n");
1236         }
1237
1238         return rv;
1239 }
1240
1241 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1242 {
1243         struct after_state_chg_work *ascw =
1244                 container_of(w, struct after_state_chg_work, w);
1245         after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1246         if (ascw->flags & CS_WAIT_COMPLETE) {
1247                 D_ASSERT(ascw->done != NULL);
1248                 complete(ascw->done);
1249         }
1250         kfree(ascw);
1251
1252         return 1;
1253 }
1254
1255 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1256 {
1257         if (rv) {
1258                 dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1259                 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1260                 return;
1261         }
1262
1263         switch (mdev->state.conn) {
1264         case C_STARTING_SYNC_T:
1265                 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1266                 break;
1267         case C_STARTING_SYNC_S:
1268                 drbd_start_resync(mdev, C_SYNC_SOURCE);
1269                 break;
1270         }
1271 }
1272
1273 /**
1274  * after_state_ch() - Perform after state change actions that may sleep
1275  * @mdev:       DRBD device.
1276  * @os:         old state.
1277  * @ns:         new state.
1278  * @flags:      Flags
1279  */
1280 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1281                            union drbd_state ns, enum chg_state_flags flags)
1282 {
1283         enum drbd_fencing_p fp;
1284         enum drbd_req_event what = nothing;
1285         union drbd_state nsm = (union drbd_state){ .i = -1 };
1286
1287         if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1288                 clear_bit(CRASHED_PRIMARY, &mdev->flags);
1289                 if (mdev->p_uuid)
1290                         mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1291         }
1292
1293         fp = FP_DONT_CARE;
1294         if (get_ldev(mdev)) {
1295                 fp = mdev->ldev->dc.fencing;
1296                 put_ldev(mdev);
1297         }
1298
1299         /* Inform userspace about the change... */
1300         drbd_bcast_state(mdev, ns);
1301
1302         if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1303             (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1304                 drbd_khelper(mdev, "pri-on-incon-degr");
1305
1306         /* Here we have the actions that are performed after a
1307            state change. This function might sleep */
1308
1309         nsm.i = -1;
1310         if (ns.susp_nod) {
1311                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1312                         if (ns.conn == C_CONNECTED)
1313                                 what = resend, nsm.susp_nod = 0;
1314                         else /* ns.conn > C_CONNECTED */
1315                                 dev_err(DEV, "Unexpected Resync going on!\n");
1316                 }
1317
1318                 if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1319                         what = restart_frozen_disk_io, nsm.susp_nod = 0;
1320
1321         }
1322
1323         if (ns.susp_fen) {
1324                 /* case1: The outdate peer handler is successful: */
1325                 if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1326                         tl_clear(mdev);
1327                         if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1328                                 drbd_uuid_new_current(mdev);
1329                                 clear_bit(NEW_CUR_UUID, &mdev->flags);
1330                         }
1331                         spin_lock_irq(&mdev->req_lock);
1332                         _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1333                         spin_unlock_irq(&mdev->req_lock);
1334                 }
1335                 /* case2: The connection was established again: */
1336                 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1337                         clear_bit(NEW_CUR_UUID, &mdev->flags);
1338                         what = resend;
1339                         nsm.susp_fen = 0;
1340                 }
1341         }
1342
1343         if (what != nothing) {
1344                 spin_lock_irq(&mdev->req_lock);
1345                 _tl_restart(mdev, what);
1346                 nsm.i &= mdev->state.i;
1347                 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1348                 spin_unlock_irq(&mdev->req_lock);
1349         }
1350
1351         /* Do not change the order of the if above and the two below... */
1352         if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1353                 drbd_send_uuids(mdev);
1354                 drbd_send_state(mdev);
1355         }
1356         if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S)
1357                 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)");
1358
1359         /* Lost contact to peer's copy of the data */
1360         if ((os.pdsk >= D_INCONSISTENT &&
1361              os.pdsk != D_UNKNOWN &&
1362              os.pdsk != D_OUTDATED)
1363         &&  (ns.pdsk < D_INCONSISTENT ||
1364              ns.pdsk == D_UNKNOWN ||
1365              ns.pdsk == D_OUTDATED)) {
1366                 if (get_ldev(mdev)) {
1367                         if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1368                             mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1369                                 if (is_susp(mdev->state)) {
1370                                         set_bit(NEW_CUR_UUID, &mdev->flags);
1371                                 } else {
1372                                         drbd_uuid_new_current(mdev);
1373                                         drbd_send_uuids(mdev);
1374                                 }
1375                         }
1376                         put_ldev(mdev);
1377                 }
1378         }
1379
1380         if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1381                 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1382                         drbd_uuid_new_current(mdev);
1383                         drbd_send_uuids(mdev);
1384                 }
1385
1386                 /* D_DISKLESS Peer becomes secondary */
1387                 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1388                         drbd_al_to_on_disk_bm(mdev);
1389                 put_ldev(mdev);
1390         }
1391
1392         /* Last part of the attaching process ... */
1393         if (ns.conn >= C_CONNECTED &&
1394             os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1395                 drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1396                 drbd_send_uuids(mdev);
1397                 drbd_send_state(mdev);
1398         }
1399
1400         /* We want to pause/continue resync, tell peer. */
1401         if (ns.conn >= C_CONNECTED &&
1402              ((os.aftr_isp != ns.aftr_isp) ||
1403               (os.user_isp != ns.user_isp)))
1404                 drbd_send_state(mdev);
1405
1406         /* In case one of the isp bits got set, suspend other devices. */
1407         if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1408             (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1409                 suspend_other_sg(mdev);
1410
1411         /* Make sure the peer gets informed about state changes (ISP bits)
1412            that may have happened while we were in WFReportParams. */
1413         if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1414                 drbd_send_state(mdev);
1415
1416         if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1417                 drbd_send_state(mdev);
1418
1419         /* We are in the process of starting a full sync... */
1420         if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1421             (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1422                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync");
1423
1424         /* We are invalidating ourselves... */
1425         if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1426             os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1427                 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
1428
1429         /* first half of local IO error, failure to attach,
1430          * or administrative detach */
1431         if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1432                 enum drbd_io_error_p eh;
1433                 int was_io_error;
1434                 /* corresponding get_ldev was in __drbd_set_state, to serialize
1435                  * our cleanup here with the transition to D_DISKLESS,
1436                  * so it is safe to dereference ldev here. */
1437                 eh = mdev->ldev->dc.on_io_error;
1438                 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1439
1440                 /* current state still has to be D_FAILED,
1441                  * there is only one way out: to D_DISKLESS,
1442                  * and that may only happen after our put_ldev below. */
1443                 if (mdev->state.disk != D_FAILED)
1444                         dev_err(DEV,
1445                                 "ASSERT FAILED: disk is %s during detach\n",
1446                                 drbd_disk_str(mdev->state.disk));
1447
1448                 if (drbd_send_state(mdev))
1449                         dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1450                 else
1451                         dev_err(DEV, "Sending state for detaching disk failed\n");
1452
1453                 drbd_rs_cancel_all(mdev);
1454
1455                 /* In case we want to get something to stable storage still,
1456                  * this may be the last chance.
1457                  * Following put_ldev may transition to D_DISKLESS. */
1458                 drbd_md_sync(mdev);
1459                 put_ldev(mdev);
1460
1461                 if (was_io_error && eh == EP_CALL_HELPER)
1462                         drbd_khelper(mdev, "local-io-error");
1463         }
1464
1465         /* second half of local IO error, failure to attach,
1466          * or administrative detach,
1467          * after local_cnt references have reached zero again */
1468         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1469                 /* We must still be diskless,
1470                  * re-attach has to be serialized with this! */
1471                 if (mdev->state.disk != D_DISKLESS)
1472                         dev_err(DEV,
1473                                 "ASSERT FAILED: disk is %s while going diskless\n",
1474                                 drbd_disk_str(mdev->state.disk));
1475
1476                 mdev->rs_total = 0;
1477                 mdev->rs_failed = 0;
1478                 atomic_set(&mdev->rs_pending_cnt, 0);
1479
1480                 if (drbd_send_state(mdev))
1481                         dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1482                 else
1483                         dev_err(DEV, "Sending state for being diskless failed\n");
1484                 /* corresponding get_ldev in __drbd_set_state
1485                  * this may finally trigger drbd_ldev_destroy. */
1486                 put_ldev(mdev);
1487         }
1488
1489         /* Disks got bigger while they were detached */
1490         if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1491             test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1492                 if (ns.conn == C_CONNECTED)
1493                         resync_after_online_grow(mdev);
1494         }
1495
1496         /* A resync finished or aborted, wake paused devices... */
1497         if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1498             (os.peer_isp && !ns.peer_isp) ||
1499             (os.user_isp && !ns.user_isp))
1500                 resume_next_sg(mdev);
1501
1502         /* sync target done with resync.  Explicitly notify peer, even though
1503          * it should (at least for non-empty resyncs) already know itself. */
1504         if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1505                 drbd_send_state(mdev);
1506
1507         /* free tl_hash if we got thawed and are C_STANDALONE */
1508         if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1509                 drbd_free_tl_hash(mdev);
1510
1511         /* Upon network connection, we need to start the receiver */
1512         if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1513                 drbd_thread_start(&mdev->receiver);
1514
1515         /* Terminate worker thread if we are unconfigured - it will be
1516            restarted as needed... */
1517         if (ns.disk == D_DISKLESS &&
1518             ns.conn == C_STANDALONE &&
1519             ns.role == R_SECONDARY) {
1520                 if (os.aftr_isp != ns.aftr_isp)
1521                         resume_next_sg(mdev);
1522                 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1523                 if (test_bit(DEVICE_DYING, &mdev->flags))
1524                         drbd_thread_stop_nowait(&mdev->worker);
1525         }
1526
1527         drbd_md_sync(mdev);
1528 }
1529
1530
1531 static int drbd_thread_setup(void *arg)
1532 {
1533         struct drbd_thread *thi = (struct drbd_thread *) arg;
1534         struct drbd_conf *mdev = thi->mdev;
1535         unsigned long flags;
1536         int retval;
1537
1538 restart:
1539         retval = thi->function(thi);
1540
1541         spin_lock_irqsave(&thi->t_lock, flags);
1542
1543         /* if the receiver has been "Exiting", the last thing it did
1544          * was set the conn state to "StandAlone",
1545          * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1546          * and receiver thread will be "started".
1547          * drbd_thread_start needs to set "Restarting" in that case.
1548          * t_state check and assignment needs to be within the same spinlock,
1549          * so either thread_start sees Exiting, and can remap to Restarting,
1550          * or thread_start sees None, and can proceed as normal.
1551          */
1552
1553         if (thi->t_state == Restarting) {
1554                 dev_info(DEV, "Restarting %s\n", current->comm);
1555                 thi->t_state = Running;
1556                 spin_unlock_irqrestore(&thi->t_lock, flags);
1557                 goto restart;
1558         }
1559
1560         thi->task = NULL;
1561         thi->t_state = None;
1562         smp_mb();
1563         complete(&thi->stop);
1564         spin_unlock_irqrestore(&thi->t_lock, flags);
1565
1566         dev_info(DEV, "Terminating %s\n", current->comm);
1567
1568         /* Release mod reference taken when thread was started */
1569         module_put(THIS_MODULE);
1570         return retval;
1571 }
1572
1573 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1574                       int (*func) (struct drbd_thread *))
1575 {
1576         spin_lock_init(&thi->t_lock);
1577         thi->task    = NULL;
1578         thi->t_state = None;
1579         thi->function = func;
1580         thi->mdev = mdev;
1581 }
1582
1583 int drbd_thread_start(struct drbd_thread *thi)
1584 {
1585         struct drbd_conf *mdev = thi->mdev;
1586         struct task_struct *nt;
1587         unsigned long flags;
1588
1589         const char *me =
1590                 thi == &mdev->receiver ? "receiver" :
1591                 thi == &mdev->asender  ? "asender"  :
1592                 thi == &mdev->worker   ? "worker"   : "NONSENSE";
1593
1594         /* is used from state engine doing drbd_thread_stop_nowait,
1595          * while holding the req lock irqsave */
1596         spin_lock_irqsave(&thi->t_lock, flags);
1597
1598         switch (thi->t_state) {
1599         case None:
1600                 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1601                                 me, current->comm, current->pid);
1602
1603                 /* Get ref on module for thread - this is released when thread exits */
1604                 if (!try_module_get(THIS_MODULE)) {
1605                         dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1606                         spin_unlock_irqrestore(&thi->t_lock, flags);
1607                         return FALSE;
1608                 }
1609
1610                 init_completion(&thi->stop);
1611                 D_ASSERT(thi->task == NULL);
1612                 thi->reset_cpu_mask = 1;
1613                 thi->t_state = Running;
1614                 spin_unlock_irqrestore(&thi->t_lock, flags);
1615                 flush_signals(current); /* otherwise we may get -ERESTARTNOINTR */
1616
1617                 nt = kthread_create(drbd_thread_setup, (void *) thi,
1618                                     "drbd%d_%s", mdev_to_minor(mdev), me);
1619
1620                 if (IS_ERR(nt)) {
1621                         dev_err(DEV, "Couldn't start thread\n");
1622
1623                         module_put(THIS_MODULE);
1624                         return FALSE;
1625                 }
1626                 spin_lock_irqsave(&thi->t_lock, flags);
1627                 thi->task = nt;
1628                 thi->t_state = Running;
1629                 spin_unlock_irqrestore(&thi->t_lock, flags);
1630                 wake_up_process(nt);
1631                 break;
1632         case Exiting:
1633                 thi->t_state = Restarting;
1634                 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1635                                 me, current->comm, current->pid);
1636                 /* fall through */
1637         case Running:
1638         case Restarting:
1639         default:
1640                 spin_unlock_irqrestore(&thi->t_lock, flags);
1641                 break;
1642         }
1643
1644         return TRUE;
1645 }
1646
1647
1648 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1649 {
1650         unsigned long flags;
1651
1652         enum drbd_thread_state ns = restart ? Restarting : Exiting;
1653
1654         /* may be called from state engine, holding the req lock irqsave */
1655         spin_lock_irqsave(&thi->t_lock, flags);
1656
1657         if (thi->t_state == None) {
1658                 spin_unlock_irqrestore(&thi->t_lock, flags);
1659                 if (restart)
1660                         drbd_thread_start(thi);
1661                 return;
1662         }
1663
1664         if (thi->t_state != ns) {
1665                 if (thi->task == NULL) {
1666                         spin_unlock_irqrestore(&thi->t_lock, flags);
1667                         return;
1668                 }
1669
1670                 thi->t_state = ns;
1671                 smp_mb();
1672                 init_completion(&thi->stop);
1673                 if (thi->task != current)
1674                         force_sig(DRBD_SIGKILL, thi->task);
1675
1676         }
1677
1678         spin_unlock_irqrestore(&thi->t_lock, flags);
1679
1680         if (wait)
1681                 wait_for_completion(&thi->stop);
1682 }
1683
1684 #ifdef CONFIG_SMP
1685 /**
1686  * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1687  * @mdev:       DRBD device.
1688  *
1689  * Forces all threads of a device onto the same CPU. This is beneficial for
1690  * DRBD's performance. May be overridden by the user's configuration.
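      * For example (purely illustrative of the modulo mapping below): with
      * four online CPUs, minor 0 is pinned to the first online CPU, minor 1
      * to the second, and minor 5 wraps around to the second again (5 % 4 == 1).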
1691  */
1692 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1693 {
1694         int ord, cpu;
1695
1696         /* user override. */
1697         if (cpumask_weight(mdev->cpu_mask))
1698                 return;
1699
1700         ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1701         for_each_online_cpu(cpu) {
1702                 if (ord-- == 0) {
1703                         cpumask_set_cpu(cpu, mdev->cpu_mask);
1704                         return;
1705                 }
1706         }
1707         /* should not be reached */
1708         cpumask_setall(mdev->cpu_mask);
1709 }
1710
1711 /**
1712  * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1713  * @mdev:       DRBD device.
1714  *
1715  * Call this in the "main loop" of _all_ threads; no mutex is needed, since
1716  * current won't die prematurely.
1717  */
1718 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1719 {
1720         struct task_struct *p = current;
1721         struct drbd_thread *thi =
1722                 p == mdev->asender.task  ? &mdev->asender  :
1723                 p == mdev->receiver.task ? &mdev->receiver :
1724                 p == mdev->worker.task   ? &mdev->worker   :
1725                 NULL;
1726         ERR_IF(thi == NULL)
1727                 return;
1728         if (!thi->reset_cpu_mask)
1729                 return;
1730         thi->reset_cpu_mask = 0;
1731         set_cpus_allowed_ptr(p, mdev->cpu_mask);
1732 }
1733 #endif
1734
1735 /* the appropriate socket mutex must be held already */
1736 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1737                           enum drbd_packets cmd, struct p_header80 *h,
1738                           size_t size, unsigned msg_flags)
1739 {
1740         int sent, ok;
1741
1742         ERR_IF(!h) return FALSE;
1743         ERR_IF(!size) return FALSE;
1744
1745         h->magic   = BE_DRBD_MAGIC;
1746         h->command = cpu_to_be16(cmd);
1747         h->length  = cpu_to_be16(size-sizeof(struct p_header80));
1748
1749         sent = drbd_send(mdev, sock, h, size, msg_flags);
1750
1751         ok = (sent == size);
1752         if (!ok)
1753                 dev_err(DEV, "short sent %s size=%d sent=%d\n",
1754                     cmdname(cmd), (int)size, sent);
1755         return ok;
1756 }
1757
1758 /* don't pass the socket. we may only look at it
1759  * when we hold the appropriate socket mutex.
1760  */
1761 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1762                   enum drbd_packets cmd, struct p_header80 *h, size_t size)
1763 {
1764         int ok = 0;
1765         struct socket *sock;
1766
1767         if (use_data_socket) {
1768                 mutex_lock(&mdev->data.mutex);
1769                 sock = mdev->data.socket;
1770         } else {
1771                 mutex_lock(&mdev->meta.mutex);
1772                 sock = mdev->meta.socket;
1773         }
1774
1775         /* drbd_disconnect() could have called drbd_free_sock()
1776          * while we were waiting in down()... */
1777         if (likely(sock != NULL))
1778                 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1779
1780         if (use_data_socket)
1781                 mutex_unlock(&mdev->data.mutex);
1782         else
1783                 mutex_unlock(&mdev->meta.mutex);
1784         return ok;
1785 }
1786
1787 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1788                    size_t size)
1789 {
1790         struct p_header80 h;
1791         int ok;
1792
1793         h.magic   = BE_DRBD_MAGIC;
1794         h.command = cpu_to_be16(cmd);
1795         h.length  = cpu_to_be16(size);
1796
1797         if (!drbd_get_data_sock(mdev))
1798                 return 0;
1799
1800         ok = (sizeof(h) ==
1801                 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1802         ok = ok && (size ==
1803                 drbd_send(mdev, mdev->data.socket, data, size, 0));
1804
1805         drbd_put_data_sock(mdev);
1806
1807         return ok;
1808 }
1809
1810 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1811 {
1812         struct p_rs_param_95 *p;
1813         struct socket *sock;
1814         int size, rv;
1815         const int apv = mdev->agreed_pro_version;
1816
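             /* The size of the on-wire parameter packet grows with the agreed
              * protocol version: <= 87 plain p_rs_param, 88 additionally carries
              * the verify_alg string, 89..94 use p_rs_param_89, >= 95 p_rs_param_95. */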
1817         size = apv <= 87 ? sizeof(struct p_rs_param)
1818                 : apv == 88 ? sizeof(struct p_rs_param)
1819                         + strlen(mdev->sync_conf.verify_alg) + 1
1820                 : apv <= 94 ? sizeof(struct p_rs_param_89)
1821                 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
1822
1823         /* used from admin command context and receiver/worker context.
1824          * to avoid kmalloc, grab the socket right here,
1825          * then use the pre-allocated sbuf there */
1826         mutex_lock(&mdev->data.mutex);
1827         sock = mdev->data.socket;
1828
1829         if (likely(sock != NULL)) {
1830                 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1831
1832                 p = &mdev->data.sbuf.rs_param_95;
1833
1834                 /* initialize verify_alg and csums_alg */
1835                 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1836
1837                 p->rate = cpu_to_be32(sc->rate);
1838                 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1839                 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1840                 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1841                 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1842
1843                 if (apv >= 88)
1844                         strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1845                 if (apv >= 89)
1846                         strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1847
1848                 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1849         } else
1850                 rv = 0; /* not ok */
1851
1852         mutex_unlock(&mdev->data.mutex);
1853
1854         return rv;
1855 }
1856
1857 int drbd_send_protocol(struct drbd_conf *mdev)
1858 {
1859         struct p_protocol *p;
1860         int size, cf, rv;
1861
1862         size = sizeof(struct p_protocol);
1863
1864         if (mdev->agreed_pro_version >= 87)
1865                 size += strlen(mdev->net_conf->integrity_alg) + 1;
1866
1867         /* we must not recurse into our own queue,
1868          * as that is blocked during handshake */
1869         p = kmalloc(size, GFP_NOIO);
1870         if (p == NULL)
1871                 return 0;
1872
1873         p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1874         p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1875         p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1876         p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1877         p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1878
1879         cf = 0;
1880         if (mdev->net_conf->want_lose)
1881                 cf |= CF_WANT_LOSE;
1882         if (mdev->net_conf->dry_run) {
1883                 if (mdev->agreed_pro_version >= 92)
1884                         cf |= CF_DRY_RUN;
1885                 else {
1886                         dev_err(DEV, "--dry-run is not supported by peer");
1887                         kfree(p);
1888                         return 0;
1889                 }
1890         }
1891         p->conn_flags    = cpu_to_be32(cf);
1892
1893         if (mdev->agreed_pro_version >= 87)
1894                 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1895
1896         rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1897                            (struct p_header80 *)p, size);
1898         kfree(p);
1899         return rv;
1900 }
1901
1902 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1903 {
1904         struct p_uuids p;
1905         int i;
1906
1907         if (!get_ldev_if_state(mdev, D_NEGOTIATING))
1908                 return 1;
1909
1910         for (i = UI_CURRENT; i < UI_SIZE; i++)
1911                 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
1912
1913         mdev->comm_bm_set = drbd_bm_total_weight(mdev);
1914         p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
1915         uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
1916         uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
1917         uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
1918         p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
1919
1920         put_ldev(mdev);
1921
1922         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
1923                              (struct p_header80 *)&p, sizeof(p));
1924 }
1925
1926 int drbd_send_uuids(struct drbd_conf *mdev)
1927 {
1928         return _drbd_send_uuids(mdev, 0);
1929 }
1930
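     /* The uuid_flags bits 1, 2 and 4 are filled in by _drbd_send_uuids() above;
      * the value 8 passed below presumably tells the peer that the initial full
      * sync is being skipped, as the function name suggests. */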
1931 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
1932 {
1933         return _drbd_send_uuids(mdev, 8);
1934 }
1935
1936
1937 int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
1938 {
1939         struct p_rs_uuid p;
1940
1941         p.uuid = cpu_to_be64(val);
1942
1943         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
1944                              (struct p_header80 *)&p, sizeof(p));
1945 }
1946
1947 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
1948 {
1949         struct p_sizes p;
1950         sector_t d_size, u_size;
1951         int q_order_type;
1952         int ok;
1953
1954         if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
1955                 D_ASSERT(mdev->ldev->backing_bdev);
1956                 d_size = drbd_get_max_capacity(mdev->ldev);
1957                 u_size = mdev->ldev->dc.disk_size;
1958                 q_order_type = drbd_queue_order_type(mdev);
1959                 put_ldev(mdev);
1960         } else {
1961                 d_size = 0;
1962                 u_size = 0;
1963                 q_order_type = QUEUE_ORDERED_NONE;
1964         }
1965
1966         p.d_size = cpu_to_be64(d_size);
1967         p.u_size = cpu_to_be64(u_size);
1968         p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
1969         p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
1970         p.queue_order_type = cpu_to_be16(q_order_type);
1971         p.dds_flags = cpu_to_be16(flags);
1972
1973         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
1974                            (struct p_header80 *)&p, sizeof(p));
1975         return ok;
1976 }
1977
1978 /**
1979  * drbd_send_state() - Sends the drbd state to the peer
1980  * @mdev:       DRBD device.
1981  */
1982 int drbd_send_state(struct drbd_conf *mdev)
1983 {
1984         struct socket *sock;
1985         struct p_state p;
1986         int ok = 0;
1987
1988         /* Grab state lock so we won't send state if we're in the middle
1989          * of a cluster wide state change on another thread */
1990         drbd_state_lock(mdev);
1991
1992         mutex_lock(&mdev->data.mutex);
1993
1994         p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
1995         sock = mdev->data.socket;
1996
1997         if (likely(sock != NULL)) {
1998                 ok = _drbd_send_cmd(mdev, sock, P_STATE,
1999                                     (struct p_header80 *)&p, sizeof(p), 0);
2000         }
2001
2002         mutex_unlock(&mdev->data.mutex);
2003
2004         drbd_state_unlock(mdev);
2005         return ok;
2006 }
2007
2008 int drbd_send_state_req(struct drbd_conf *mdev,
2009         union drbd_state mask, union drbd_state val)
2010 {
2011         struct p_req_state p;
2012
2013         p.mask    = cpu_to_be32(mask.i);
2014         p.val     = cpu_to_be32(val.i);
2015
2016         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2017                              (struct p_header80 *)&p, sizeof(p));
2018 }
2019
2020 int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
2021 {
2022         struct p_req_state_reply p;
2023
2024         p.retcode    = cpu_to_be32(retcode);
2025
2026         return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2027                              (struct p_header80 *)&p, sizeof(p));
2028 }
2029
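     /* Returns the number of compressed code bytes placed into p->code, 0 if RLE
      * is not used for this chunk (disabled, old peer, nothing to do, or not
      * worth it), or -1 if the bitmap apparently changed while being scanned. */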
2030 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2031         struct p_compressed_bm *p,
2032         struct bm_xfer_ctx *c)
2033 {
2034         struct bitstream bs;
2035         unsigned long plain_bits;
2036         unsigned long tmp;
2037         unsigned long rl;
2038         unsigned len;
2039         unsigned toggle;
2040         int bits;
2041
2042         /* may we use this feature? */
2043         if ((mdev->sync_conf.use_rle == 0) ||
2044                 (mdev->agreed_pro_version < 90))
2045                         return 0;
2046
2047         if (c->bit_offset >= c->bm_bits)
2048                 return 0; /* nothing to do. */
2049
2050         /* use at most this many bytes */
2051         bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2052         memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2053         /* plain bits covered in this code string */
2054         plain_bits = 0;
2055
2056         /* p->encoding & 0x80 stores whether the first run length is set.
2057          * bit offset is implicit.
2058          * start with toggle == 2 to be able to tell the first iteration */
2059         toggle = 2;
2060
2061         /* see how many plain bits we can stuff into one packet
2062          * using RLE and VLI. */
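             /* Illustrative sketch of the encoding (not an additional code path):
              * a bitmap prefix of 0,0,0,1,1,1,1,0,... goes out as the run lengths
              * 3,4,... with the start flag cleared, because the first run consists
              * of cleared bits; only the run lengths are VLI-encoded, never the
              * bit values themselves. */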
2063         do {
2064                 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2065                                     : _drbd_bm_find_next(mdev, c->bit_offset);
2066                 if (tmp == -1UL)
2067                         tmp = c->bm_bits;
2068                 rl = tmp - c->bit_offset;
2069
2070                 if (toggle == 2) { /* first iteration */
2071                         if (rl == 0) {
2072                                 /* the first checked bit was set,
2073                                  * store start value, */
2074                                 DCBP_set_start(p, 1);
2075                                 /* but skip encoding of zero run length */
2076                                 toggle = !toggle;
2077                                 continue;
2078                         }
2079                         DCBP_set_start(p, 0);
2080                 }
2081
2082                 /* paranoia: catch zero runlength.
2083                  * can only happen if bitmap is modified while we scan it. */
2084                 if (rl == 0) {
2085                         dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2086                             "t:%u bo:%lu\n", toggle, c->bit_offset);
2087                         return -1;
2088                 }
2089
2090                 bits = vli_encode_bits(&bs, rl);
2091                 if (bits == -ENOBUFS) /* buffer full */
2092                         break;
2093                 if (bits <= 0) {
2094                         dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2095                         return 0;
2096                 }
2097
2098                 toggle = !toggle;
2099                 plain_bits += rl;
2100                 c->bit_offset = tmp;
2101         } while (c->bit_offset < c->bm_bits);
2102
2103         len = bs.cur.b - p->code + !!bs.cur.bit;
2104
2105         if (plain_bits < (len << 3)) {
2106                 /* incompressible with this method.
2107                  * we need to rewind both word and bit position. */
2108                 c->bit_offset -= plain_bits;
2109                 bm_xfer_ctx_bit_to_word_offset(c);
2110                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2111                 return 0;
2112         }
2113
2114         /* RLE + VLI was able to compress it just fine.
2115          * update c->word_offset. */
2116         bm_xfer_ctx_bit_to_word_offset(c);
2117
2118         /* store pad_bits */
2119         DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2120
2121         return len;
2122 }
2123
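     /* Return values of send_bitmap_rle_or_plain(): OK means another packet
      * should follow, DONE means the whole bitmap has been transferred, and
      * FAILED means a send error; see the loop in _drbd_send_bitmap() below. */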
2124 enum { OK, FAILED, DONE }
2125 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2126         struct p_header80 *h, struct bm_xfer_ctx *c)
2127 {
2128         struct p_compressed_bm *p = (void*)h;
2129         unsigned long num_words;
2130         int len;
2131         int ok;
2132
2133         len = fill_bitmap_rle_bits(mdev, p, c);
2134
2135         if (len < 0)
2136                 return FAILED;
2137
2138         if (len) {
2139                 DCBP_set_code(p, RLE_VLI_Bits);
2140                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2141                         sizeof(*p) + len, 0);
2142
2143                 c->packets[0]++;
2144                 c->bytes[0] += sizeof(*p) + len;
2145
2146                 if (c->bit_offset >= c->bm_bits)
2147                         len = 0; /* DONE */
2148         } else {
2149                 /* was not compressible.
2150                  * send a buffer full of plain text bits instead. */
2151                 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2152                 len = num_words * sizeof(long);
2153                 if (len)
2154                         drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2155                 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2156                                    h, sizeof(struct p_header80) + len, 0);
2157                 c->word_offset += num_words;
2158                 c->bit_offset = c->word_offset * BITS_PER_LONG;
2159
2160                 c->packets[1]++;
2161                 c->bytes[1] += sizeof(struct p_header80) + len;
2162
2163                 if (c->bit_offset > c->bm_bits)
2164                         c->bit_offset = c->bm_bits;
2165         }
2166         ok = ok ? ((len == 0) ? DONE : OK) : FAILED;
2167
2168         if (ok == DONE)
2169                 INFO_bm_xfer_stats(mdev, "send", c);
2170         return ok;
2171 }
2172
2173 /* See the comment at receive_bitmap() */
2174 int _drbd_send_bitmap(struct drbd_conf *mdev)
2175 {
2176         struct bm_xfer_ctx c;
2177         struct p_header80 *p;
2178         int ret;
2179
2180         ERR_IF(!mdev->bitmap) return FALSE;
2181
2182         /* maybe we should use some per thread scratch page,
2183          * and allocate that during initial device creation? */
2184         p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2185         if (!p) {
2186                 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2187                 return FALSE;
2188         }
2189
2190         if (get_ldev(mdev)) {
2191                 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2192                         dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2193                         drbd_bm_set_all(mdev);
2194                         if (drbd_bm_write(mdev)) {
2195                                 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2196                                  * but otherwise process as per normal - need to tell other
2197                                  * side that a full resync is required! */
2198                                 dev_err(DEV, "Failed to write bitmap to disk!\n");
2199                         } else {
2200                                 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2201                                 drbd_md_sync(mdev);
2202                         }
2203                 }
2204                 put_ldev(mdev);
2205         }
2206
2207         c = (struct bm_xfer_ctx) {
2208                 .bm_bits = drbd_bm_bits(mdev),
2209                 .bm_words = drbd_bm_words(mdev),
2210         };
2211
2212         do {
2213                 ret = send_bitmap_rle_or_plain(mdev, p, &c);
2214         } while (ret == OK);
2215
2216         free_page((unsigned long) p);
2217         return (ret == DONE);
2218 }
2219
2220 int drbd_send_bitmap(struct drbd_conf *mdev)
2221 {
2222         int err;
2223
2224         if (!drbd_get_data_sock(mdev))
2225                 return -1;
2226         err = !_drbd_send_bitmap(mdev);
2227         drbd_put_data_sock(mdev);
2228         return err;
2229 }
2230
2231 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2232 {
2233         int ok;
2234         struct p_barrier_ack p;
2235
2236         p.barrier  = barrier_nr;
2237         p.set_size = cpu_to_be32(set_size);
2238
2239         if (mdev->state.conn < C_CONNECTED)
2240                 return FALSE;
2241         ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2242                         (struct p_header80 *)&p, sizeof(p));
2243         return ok;
2244 }
2245
2246 /**
2247  * _drbd_send_ack() - Sends an ack packet
2248  * @mdev:       DRBD device.
2249  * @cmd:        Packet command code.
2250  * @sector:     sector, needs to be in big endian byte order
2251  * @blksize:    size in byte, needs to be in big endian byte order
2252  * @block_id:   Id, big endian byte order
2253  */
2254 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2255                           u64 sector,
2256                           u32 blksize,
2257                           u64 block_id)
2258 {
2259         int ok;
2260         struct p_block_ack p;
2261
2262         p.sector   = sector;
2263         p.block_id = block_id;
2264         p.blksize  = blksize;
2265         p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2266
2267         if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2268                 return FALSE;
2269         ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2270                                 (struct p_header80 *)&p, sizeof(p));
2271         return ok;
2272 }
2273
2274 /* dp->sector and dp->block_id already/still in network byte order,
2275  * data_size is payload size according to dp->head,
2276  * and may need to be corrected for digest size. */
2277 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2278                      struct p_data *dp, int data_size)
2279 {
2280         data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2281                 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2282         return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2283                               dp->block_id);
2284 }
2285
2286 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2287                      struct p_block_req *rp)
2288 {
2289         return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2290 }
2291
2292 /**
2293  * drbd_send_ack() - Sends an ack packet
2294  * @mdev:       DRBD device.
2295  * @cmd:        Packet command code.
2296  * @e:          Epoch entry.
2297  */
2298 int drbd_send_ack(struct drbd_conf *mdev,
2299         enum drbd_packets cmd, struct drbd_epoch_entry *e)
2300 {
2301         return _drbd_send_ack(mdev, cmd,
2302                               cpu_to_be64(e->sector),
2303                               cpu_to_be32(e->size),
2304                               e->block_id);
2305 }
2306
2307 /* This function misuses the block_id field to signal whether the blocks
2308  * are in sync or not. */
2309 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2310                      sector_t sector, int blksize, u64 block_id)
2311 {
2312         return _drbd_send_ack(mdev, cmd,
2313                               cpu_to_be64(sector),
2314                               cpu_to_be32(blksize),
2315                               cpu_to_be64(block_id));
2316 }
2317
2318 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2319                        sector_t sector, int size, u64 block_id)
2320 {
2321         int ok;
2322         struct p_block_req p;
2323
2324         p.sector   = cpu_to_be64(sector);
2325         p.block_id = block_id;
2326         p.blksize  = cpu_to_be32(size);
2327
2328         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2329                                 (struct p_header80 *)&p, sizeof(p));
2330         return ok;
2331 }
2332
2333 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2334                             sector_t sector, int size,
2335                             void *digest, int digest_size,
2336                             enum drbd_packets cmd)
2337 {
2338         int ok;
2339         struct p_block_req p;
2340
2341         p.sector   = cpu_to_be64(sector);
2342         p.block_id = BE_DRBD_MAGIC + 0xbeef;
2343         p.blksize  = cpu_to_be32(size);
2344
2345         p.head.magic   = BE_DRBD_MAGIC;
2346         p.head.command = cpu_to_be16(cmd);
2347         p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2348
2349         mutex_lock(&mdev->data.mutex);
2350
2351         ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2352         ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2353
2354         mutex_unlock(&mdev->data.mutex);
2355
2356         return ok;
2357 }
2358
2359 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2360 {
2361         int ok;
2362         struct p_block_req p;
2363
2364         p.sector   = cpu_to_be64(sector);
2365         p.block_id = BE_DRBD_MAGIC + 0xbabe;
2366         p.blksize  = cpu_to_be32(size);
2367
2368         ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2369                            (struct p_header80 *)&p, sizeof(p));
2370         return ok;
2371 }
2372
2373 /* called on sndtimeo
2374  * returns FALSE if we should retry,
2375  * TRUE if we think connection is dead
2376  * TRUE if we think the connection is dead
2377 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2378 {
2379         int drop_it;
2380         /* long elapsed = (long)(jiffies - mdev->last_received); */
2381
2382         drop_it =   mdev->meta.socket == sock
2383                 || !mdev->asender.task
2384                 || get_t_state(&mdev->asender) != Running
2385                 || mdev->state.conn < C_CONNECTED;
2386
2387         if (drop_it)
2388                 return TRUE;
2389
2390         drop_it = !--mdev->ko_count;
2391         if (!drop_it) {
2392                 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2393                        current->comm, current->pid, mdev->ko_count);
2394                 request_ping(mdev);
2395         }
2396
2397         return drop_it; /* && (mdev->state == R_PRIMARY) */
2398 }
2399
2400 /* The idea of sendpage seems to be to put some kind of reference
2401  * to the page into the skb, and to hand it over to the NIC. In
2402  * this process get_page() gets called.
2403  *
2404  * As soon as the page was really sent over the network put_page()
2405  * gets called by some part of the network layer. [ NIC driver? ]
2406  *
2407  * [ get_page() / put_page() increment/decrement the count. If count
2408  *   reaches 0 the page will be freed. ]
2409  *
2410  * This works nicely with pages from FSs.
2411  * But this means that in protocol A we might signal IO completion too early!
2412  *
2413  * In order not to corrupt data during a resync we must make sure
2414  * that we do not reuse our own buffer pages (EEs) too early, therefore
2415  * we have the net_ee list.
2416  *
2417  * XFS still seems to have problems: it submits pages with page_count == 0!
2418  * As a workaround, we disable sendpage on pages
2419  * with page_count == 0 or PageSlab.
2420  */
2421 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2422                    int offset, size_t size, unsigned msg_flags)
2423 {
2424         int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2425         kunmap(page);
2426         if (sent == size)
2427                 mdev->send_cnt += size>>9;
2428         return sent == size;
2429 }
2430
2431 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2432                     int offset, size_t size, unsigned msg_flags)
2433 {
2434         mm_segment_t oldfs = get_fs();
2435         int sent, ok;
2436         int len = size;
2437
2438         /* e.g. XFS meta- & log-data is in slab pages, which have a
2439          * page_count of 0 and/or have PageSlab() set.
2440          * we cannot use send_page for those, as that does get_page();
2441          * put_page(); and would cause either a VM_BUG directly, or
2442          * __page_cache_release a page that would actually still be referenced
2443          * by someone, leading to some obscure delayed Oops somewhere else. */
2444         if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2445                 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2446
2447         msg_flags |= MSG_NOSIGNAL;
2448         drbd_update_congested(mdev);
2449         set_fs(KERNEL_DS);
2450         do {
2451                 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2452                                                         offset, len,
2453                                                         msg_flags);
2454                 if (sent == -EAGAIN) {
2455                         if (we_should_drop_the_connection(mdev,
2456                                                           mdev->data.socket))
2457                                 break;
2458                         else
2459                                 continue;
2460                 }
2461                 if (sent <= 0) {
2462                         dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2463                              __func__, (int)size, len, sent);
2464                         break;
2465                 }
2466                 len    -= sent;
2467                 offset += sent;
2468         } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2469         set_fs(oldfs);
2470         clear_bit(NET_CONGESTED, &mdev->flags);
2471
2472         ok = (len == 0);
2473         if (likely(ok))
2474                 mdev->send_cnt += size>>9;
2475         return ok;
2476 }
2477
2478 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2479 {
2480         struct bio_vec *bvec;
2481         int i;
2482         /* hint all but last page with MSG_MORE */
2483         __bio_for_each_segment(bvec, bio, i, 0) {
2484                 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2485                                      bvec->bv_offset, bvec->bv_len,
2486                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2487                         return 0;
2488         }
2489         return 1;
2490 }
2491
2492 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2493 {
2494         struct bio_vec *bvec;
2495         int i;
2496         /* hint all but last page with MSG_MORE */
2497         __bio_for_each_segment(bvec, bio, i, 0) {
2498                 if (!_drbd_send_page(mdev, bvec->bv_page,
2499                                      bvec->bv_offset, bvec->bv_len,
2500                                      i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2501                         return 0;
2502         }
2503         return 1;
2504 }
2505
2506 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2507 {
2508         struct page *page = e->pages;
2509         unsigned len = e->size;
2510         /* hint all but last page with MSG_MORE */
2511         page_chain_for_each(page) {
2512                 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2513                 if (!_drbd_send_page(mdev, page, 0, l,
2514                                 page_chain_next(page) ? MSG_MORE : 0))
2515                         return 0;
2516                 len -= l;
2517         }
2518         return 1;
2519 }
2520
2521 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2522 {
2523         if (mdev->agreed_pro_version >= 95)
2524                 return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2525                         (bi_rw & REQ_FUA ? DP_FUA : 0) |
2526                         (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2527                         (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2528         else
2529                 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2530 }
2531
2532 /* Used to send write requests
2533  * R_PRIMARY -> Peer    (P_DATA)
2534  */
2535 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2536 {
2537         int ok = 1;
2538         struct p_data p;
2539         unsigned int dp_flags = 0;
2540         void *dgb;
2541         int dgs;
2542
2543         if (!drbd_get_data_sock(mdev))
2544                 return 0;
2545
2546         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2547                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2548
2549         if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2550                 p.head.h80.magic   = BE_DRBD_MAGIC;
2551                 p.head.h80.command = cpu_to_be16(P_DATA);
2552                 p.head.h80.length  =
2553                         cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2554         } else {
2555                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2556                 p.head.h95.command = cpu_to_be16(P_DATA);
2557                 p.head.h95.length  =
2558                         cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2559         }
2560
2561         p.sector   = cpu_to_be64(req->sector);
2562         p.block_id = (unsigned long)req;
2563         p.seq_num  = cpu_to_be32(req->seq_num =
2564                                  atomic_add_return(1, &mdev->packet_seq));
2565
2566         dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2567
2568         if (mdev->state.conn >= C_SYNC_SOURCE &&
2569             mdev->state.conn <= C_PAUSED_SYNC_T)
2570                 dp_flags |= DP_MAY_SET_IN_SYNC;
2571
2572         p.dp_flags = cpu_to_be32(dp_flags);
2573         set_bit(UNPLUG_REMOTE, &mdev->flags);
2574         ok = (sizeof(p) ==
2575                 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2576         if (ok && dgs) {
2577                 dgb = mdev->int_dig_out;
2578                 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2579                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2580         }
2581         if (ok) {
2582                 /* For protocol A, we have to memcpy the payload into
2583                  * socket buffers, as we may complete right away
2584                  * as soon as we handed it over to tcp, at which point the data
2585                  * pages may become invalid.
2586                  *
2587                  * For data-integrity enabled, we copy it as well, so we can be
2588                  * sure that even if the bio pages may still be modified, it
2589                  * won't change the data on the wire, thus if the digest checks
2590                  * out ok after sending on this side, but does not fit on the
2591                  * receiving side, we sure have detected corruption elsewhere.
2592                  */
2593                 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2594                         ok = _drbd_send_bio(mdev, req->master_bio);
2595                 else
2596                         ok = _drbd_send_zc_bio(mdev, req->master_bio);
2597
2598                 /* double check digest, sometimes buffers have been modified in flight. */
2599                 if (dgs > 0 && dgs <= 64) {
2600                          /* 64 bytes (512 bits) is the largest digest size
2601                          * currently supported in kernel crypto. */
2602                         unsigned char digest[64];
2603                         drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2604                         if (memcmp(mdev->int_dig_out, digest, dgs)) {
2605                                 dev_warn(DEV,
2606                                         "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2607                                         (unsigned long long)req->sector, req->size);
2608                         }
2609                 } /* else if (dgs > 64) {
2610                      ... Be noisy about digest too large ...
2611                 } */
2612         }
2613
2614         drbd_put_data_sock(mdev);
2615
2616         return ok;
2617 }
2618
2619 /* answer packet, used to send data back for read requests:
2620  *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2621  *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2622  */
2623 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2624                     struct drbd_epoch_entry *e)
2625 {
2626         int ok;
2627         struct p_data p;
2628         void *dgb;
2629         int dgs;
2630
2631         dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2632                 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2633
2634         if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2635                 p.head.h80.magic   = BE_DRBD_MAGIC;
2636                 p.head.h80.command = cpu_to_be16(cmd);
2637                 p.head.h80.length  =
2638                         cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2639         } else {
2640                 p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2641                 p.head.h95.command = cpu_to_be16(cmd);
2642                 p.head.h95.length  =
2643                         cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2644         }
2645
2646         p.sector   = cpu_to_be64(e->sector);
2647         p.block_id = e->block_id;
2648         /* p.seq_num  = 0;    No sequence numbers here.. */
2649
2650         /* Only called by our kernel thread.
2651          * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2652          * in response to admin command or module unload.
2653          */
2654         if (!drbd_get_data_sock(mdev))
2655                 return 0;
2656
2657         ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2658         if (ok && dgs) {
2659                 dgb = mdev->int_dig_out;
2660                 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2661                 ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2662         }
2663         if (ok)
2664                 ok = _drbd_send_zc_ee(mdev, e);
2665
2666         drbd_put_data_sock(mdev);
2667
2668         return ok;
2669 }
2670
2671 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2672 {
2673         struct p_block_desc p;
2674
2675         p.sector  = cpu_to_be64(req->sector);
2676         p.blksize = cpu_to_be32(req->size);
2677
2678         return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2679 }
2680
2681 /*
2682   drbd_send distinguishes two cases:
2683
2684   Packets sent via the data socket "sock"
2685   and packets sent via the meta data socket "msock"
2686
2687                     sock                      msock
2688   -----------------+-------------------------+------------------------------
2689   timeout           conf.timeout / 2          conf.timeout / 2
2690   timeout action    send a ping via msock     Abort communication
2691                                               and close all sockets
2692 */
2693
2694 /*
2695  * you must have down()ed the appropriate [m]sock_mutex elsewhere!
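      * Returns the number of bytes actually sent (possibly less than size if
      * sending failed part way, and 0 on an immediate error), or -1000 if
      * sock is NULL.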
2696  */
2697 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2698               void *buf, size_t size, unsigned msg_flags)
2699 {
2700         struct kvec iov;
2701         struct msghdr msg;
2702         int rv, sent = 0;
2703
2704         if (!sock)
2705                 return -1000;
2706
2707         /* THINK  if (signal_pending) return ... ? */
2708
2709         iov.iov_base = buf;
2710         iov.iov_len  = size;
2711
2712         msg.msg_name       = NULL;
2713         msg.msg_namelen    = 0;
2714         msg.msg_control    = NULL;
2715         msg.msg_controllen = 0;
2716         msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2717
2718         if (sock == mdev->data.socket) {
2719                 mdev->ko_count = mdev->net_conf->ko_count;
2720                 drbd_update_congested(mdev);
2721         }
2722         do {
2723                 /* STRANGE
2724                  * tcp_sendmsg does _not_ use its size parameter at all ?
2725                  *
2726                  * -EAGAIN on timeout, -EINTR on signal.
2727                  */
2728 /* THINK
2729  * do we need to block DRBD_SIG if sock == &meta.socket ??
2730  * otherwise wake_asender() might interrupt some send_*Ack !
2731  */
2732                 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2733                 if (rv == -EAGAIN) {
2734                         if (we_should_drop_the_connection(mdev, sock))
2735                                 break;
2736                         else
2737                                 continue;
2738                 }
2739                 D_ASSERT(rv != 0);
2740                 if (rv == -EINTR) {
2741                         flush_signals(current);
2742                         rv = 0;
2743                 }
2744                 if (rv < 0)
2745                         break;
2746                 sent += rv;
2747                 iov.iov_base += rv;
2748                 iov.iov_len  -= rv;
2749         } while (sent < size);
2750
2751         if (sock == mdev->data.socket)
2752                 clear_bit(NET_CONGESTED, &mdev->flags);
2753
2754         if (rv <= 0) {
2755                 if (rv != -EAGAIN) {
2756                         dev_err(DEV, "%s_sendmsg returned %d\n",
2757                             sock == mdev->meta.socket ? "msock" : "sock",
2758                             rv);
2759                         drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2760                 } else
2761                         drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2762         }
2763
2764         return sent;
2765 }
2766
2767 static int drbd_open(struct block_device *bdev, fmode_t mode)
2768 {
2769         struct drbd_conf *mdev = bdev->bd_disk->private_data;
2770         unsigned long flags;
2771         int rv = 0;
2772
2773         mutex_lock(&drbd_main_mutex);
2774         spin_lock_irqsave(&mdev->req_lock, flags);
2775         /* to have a stable mdev->state.role
2776          * and no race with updating open_cnt */
2777
2778         if (mdev->state.role != R_PRIMARY) {
2779                 if (mode & FMODE_WRITE)
2780                         rv = -EROFS;
2781                 else if (!allow_oos)
2782                         rv = -EMEDIUMTYPE;
2783         }
2784
2785         if (!rv)
2786                 mdev->open_cnt++;
2787         spin_unlock_irqrestore(&mdev->req_lock, flags);
2788         mutex_unlock(&drbd_main_mutex);
2789
2790         return rv;
2791 }
2792
2793 static int drbd_release(struct gendisk *gd, fmode_t mode)
2794 {
2795         struct drbd_conf *mdev = gd->private_data;
2796         mutex_lock(&drbd_main_mutex);
2797         mdev->open_cnt--;
2798         mutex_unlock(&drbd_main_mutex);
2799         return 0;
2800 }
2801
2802 static void drbd_set_defaults(struct drbd_conf *mdev)
2803 {
2804         /* This way we get a compile error when sync_conf grows
2805            and we forget to initialize a new member here */
2806         mdev->sync_conf = (struct syncer_conf) {
2807                 /* .rate = */           DRBD_RATE_DEF,
2808                 /* .after = */          DRBD_AFTER_DEF,
2809                 /* .al_extents = */     DRBD_AL_EXTENTS_DEF,
2810                 /* .verify_alg = */     {}, 0,
2811                 /* .cpu_mask = */       {}, 0,
2812                 /* .csums_alg = */      {}, 0,
2813                 /* .use_rle = */        0,
2814                 /* .on_no_data = */     DRBD_ON_NO_DATA_DEF,
2815                 /* .c_plan_ahead = */   DRBD_C_PLAN_AHEAD_DEF,
2816                 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2817                 /* .c_fill_target = */  DRBD_C_FILL_TARGET_DEF,
2818                 /* .c_max_rate = */     DRBD_C_MAX_RATE_DEF,
2819                 /* .c_min_rate = */     DRBD_C_MIN_RATE_DEF
2820         };
2821
2822         /* Have to initialize it this way, because the bitfield layout
2823            differs between big-endian and little-endian machines */
2824         mdev->state = (union drbd_state) {
2825                 { .role = R_SECONDARY,
2826                   .peer = R_UNKNOWN,
2827                   .conn = C_STANDALONE,
2828                   .disk = D_DISKLESS,
2829                   .pdsk = D_UNKNOWN,
2830                   .susp = 0,
2831                   .susp_nod = 0,
2832                   .susp_fen = 0
2833                 } };
2834 }
2835
2836 void drbd_init_set_defaults(struct drbd_conf *mdev)
2837 {
2838         /* the kzalloc() in drbd_new_device() already zeroed most of this.
2839          * note: only assignments in here, no allocations */
2840
2841         drbd_set_defaults(mdev);
2842
2843         atomic_set(&mdev->ap_bio_cnt, 0);
2844         atomic_set(&mdev->ap_pending_cnt, 0);
2845         atomic_set(&mdev->rs_pending_cnt, 0);
2846         atomic_set(&mdev->unacked_cnt, 0);
2847         atomic_set(&mdev->local_cnt, 0);
2848         atomic_set(&mdev->net_cnt, 0);
2849         atomic_set(&mdev->packet_seq, 0);
2850         atomic_set(&mdev->pp_in_use, 0);
2851         atomic_set(&mdev->pp_in_use_by_net, 0);
2852         atomic_set(&mdev->rs_sect_in, 0);
2853         atomic_set(&mdev->rs_sect_ev, 0);
2854         atomic_set(&mdev->ap_in_flight, 0);
2855
2856         mutex_init(&mdev->md_io_mutex);
2857         mutex_init(&mdev->data.mutex);
2858         mutex_init(&mdev->meta.mutex);
2859         sema_init(&mdev->data.work.s, 0);
2860         sema_init(&mdev->meta.work.s, 0);
2861         mutex_init(&mdev->state_mutex);
2862
2863         spin_lock_init(&mdev->data.work.q_lock);
2864         spin_lock_init(&mdev->meta.work.q_lock);
2865
2866         spin_lock_init(&mdev->al_lock);
2867         spin_lock_init(&mdev->req_lock);
2868         spin_lock_init(&mdev->peer_seq_lock);
2869         spin_lock_init(&mdev->epoch_lock);
2870
2871         INIT_LIST_HEAD(&mdev->active_ee);
2872         INIT_LIST_HEAD(&mdev->sync_ee);
2873         INIT_LIST_HEAD(&mdev->done_ee);
2874         INIT_LIST_HEAD(&mdev->read_ee);
2875         INIT_LIST_HEAD(&mdev->net_ee);
2876         INIT_LIST_HEAD(&mdev->resync_reads);
2877         INIT_LIST_HEAD(&mdev->data.work.q);
2878         INIT_LIST_HEAD(&mdev->meta.work.q);
2879         INIT_LIST_HEAD(&mdev->resync_work.list);
2880         INIT_LIST_HEAD(&mdev->unplug_work.list);
2881         INIT_LIST_HEAD(&mdev->go_diskless.list);
2882         INIT_LIST_HEAD(&mdev->md_sync_work.list);
2883         INIT_LIST_HEAD(&mdev->start_resync_work.list);
2884         INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
2885
2886         mdev->resync_work.cb  = w_resync_inactive;
2887         mdev->unplug_work.cb  = w_send_write_hint;
2888         mdev->go_diskless.cb  = w_go_diskless;
2889         mdev->md_sync_work.cb = w_md_sync;
2890         mdev->bm_io_work.w.cb = w_bitmap_io;
2891         init_timer(&mdev->resync_timer);
2892         init_timer(&mdev->md_sync_timer);
2893         mdev->resync_timer.function = resync_timer_fn;
2894         mdev->resync_timer.data = (unsigned long) mdev;
2895         mdev->md_sync_timer.function = md_sync_timer_fn;
2896         mdev->md_sync_timer.data = (unsigned long) mdev;
2897
2898         init_waitqueue_head(&mdev->misc_wait);
2899         init_waitqueue_head(&mdev->state_wait);
2900         init_waitqueue_head(&mdev->net_cnt_wait);
2901         init_waitqueue_head(&mdev->ee_wait);
2902         init_waitqueue_head(&mdev->al_wait);
2903         init_waitqueue_head(&mdev->seq_wait);
2904
2905         drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
2906         drbd_thread_init(mdev, &mdev->worker, drbd_worker);
2907         drbd_thread_init(mdev, &mdev->asender, drbd_asender);
2908
2909         mdev->agreed_pro_version = PRO_VERSION_MAX;
2910         mdev->write_ordering = WO_bdev_flush;
2911         mdev->resync_wenr = LC_FREE;
2912 }
2913
2914 void drbd_mdev_cleanup(struct drbd_conf *mdev)
2915 {
2916         int i;
2917         if (mdev->receiver.t_state != None)
2918                 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2919                                 mdev->receiver.t_state);
2920
2921         /* no need to lock it, I'm the only thread alive */
2922         if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
2923                 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
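        /* Reset IO accounting and resync bookkeeping so the device can be
         * configured again from a clean slate. */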
2924         mdev->al_writ_cnt  =
2925         mdev->bm_writ_cnt  =
2926         mdev->read_cnt     =
2927         mdev->recv_cnt     =
2928         mdev->send_cnt     =
2929         mdev->writ_cnt     =
2930         mdev->p_size       =
2931         mdev->rs_start     =
2932         mdev->rs_total     =
2933         mdev->rs_failed    = 0;
2934         mdev->rs_last_events = 0;
2935         mdev->rs_last_sect_ev = 0;
2936         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2937                 mdev->rs_mark_left[i] = 0;
2938                 mdev->rs_mark_time[i] = 0;
2939         }
2940         D_ASSERT(mdev->net_conf == NULL);
2941
2942         drbd_set_my_capacity(mdev, 0);
2943         if (mdev->bitmap) {
2944                 /* maybe never allocated. */
2945                 drbd_bm_resize(mdev, 0, 1);
2946                 drbd_bm_cleanup(mdev);
2947         }
2948
2949         drbd_free_resources(mdev);
2950         clear_bit(AL_SUSPENDED, &mdev->flags);
2951
2952         /*
2953          * currently we call drbd_init_ee() only on module load, so
2954          * we may call drbd_release_ee() only on module unload!
2955          */
2956         D_ASSERT(list_empty(&mdev->active_ee));
2957         D_ASSERT(list_empty(&mdev->sync_ee));
2958         D_ASSERT(list_empty(&mdev->done_ee));
2959         D_ASSERT(list_empty(&mdev->read_ee));
2960         D_ASSERT(list_empty(&mdev->net_ee));
2961         D_ASSERT(list_empty(&mdev->resync_reads));
2962         D_ASSERT(list_empty(&mdev->data.work.q));
2963         D_ASSERT(list_empty(&mdev->meta.work.q));
2964         D_ASSERT(list_empty(&mdev->resync_work.list));
2965         D_ASSERT(list_empty(&mdev->unplug_work.list));
2966         D_ASSERT(list_empty(&mdev->go_diskless.list));
2967 }
2968
2969
2970 static void drbd_destroy_mempools(void)
2971 {
2972         struct page *page;
2973
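        /* The page pool is a singly linked list of pages, chained through
         * their page_private field; pop and free them one by one. */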
2974         while (drbd_pp_pool) {
2975                 page = drbd_pp_pool;
2976                 drbd_pp_pool = (struct page *)page_private(page);
2977                 __free_page(page);
2978                 drbd_pp_vacant--;
2979         }
2980
2981         /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
2982
2983         if (drbd_ee_mempool)
2984                 mempool_destroy(drbd_ee_mempool);
2985         if (drbd_request_mempool)
2986                 mempool_destroy(drbd_request_mempool);
2987         if (drbd_ee_cache)
2988                 kmem_cache_destroy(drbd_ee_cache);
2989         if (drbd_request_cache)
2990                 kmem_cache_destroy(drbd_request_cache);
2991         if (drbd_bm_ext_cache)
2992                 kmem_cache_destroy(drbd_bm_ext_cache);
2993         if (drbd_al_ext_cache)
2994                 kmem_cache_destroy(drbd_al_ext_cache);
2995
2996         drbd_ee_mempool      = NULL;
2997         drbd_request_mempool = NULL;
2998         drbd_ee_cache        = NULL;
2999         drbd_request_cache   = NULL;
3000         drbd_bm_ext_cache    = NULL;
3001         drbd_al_ext_cache    = NULL;
3002
3003         return;
3004 }
3005
3006 static int drbd_create_mempools(void)
3007 {
3008         struct page *page;
3009         const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3010         int i;
3011
3012         /* prepare our caches and mempools */
3013         drbd_request_mempool = NULL;
3014         drbd_ee_cache        = NULL;
3015         drbd_request_cache   = NULL;
3016         drbd_bm_ext_cache    = NULL;
3017         drbd_al_ext_cache    = NULL;
3018         drbd_pp_pool         = NULL;
3019
3020         /* caches */
3021         drbd_request_cache = kmem_cache_create(
3022                 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3023         if (drbd_request_cache == NULL)
3024                 goto Enomem;
3025
3026         drbd_ee_cache = kmem_cache_create(
3027                 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3028         if (drbd_ee_cache == NULL)
3029                 goto Enomem;
3030
3031         drbd_bm_ext_cache = kmem_cache_create(
3032                 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3033         if (drbd_bm_ext_cache == NULL)
3034                 goto Enomem;
3035
3036         drbd_al_ext_cache = kmem_cache_create(
3037                 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3038         if (drbd_al_ext_cache == NULL)
3039                 goto Enomem;
3040
3041         /* mempools */
3042         drbd_request_mempool = mempool_create(number,
3043                 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3044         if (drbd_request_mempool == NULL)
3045                 goto Enomem;
3046
3047         drbd_ee_mempool = mempool_create(number,
3048                 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3049         if (drbd_ee_mempool == NULL)
3050                 goto Enomem;
3051
3052         /* drbd's page pool */
3053         spin_lock_init(&drbd_pp_lock);
3054
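        /* Preallocate 'number' pages (one maximal BIO worth per configured
         * minor) and push them onto the drbd_pp_pool free list, chained
         * through page_private(). */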
3055         for (i = 0; i < number; i++) {
3056                 page = alloc_page(GFP_HIGHUSER);
3057                 if (!page)
3058                         goto Enomem;
3059                 set_page_private(page, (unsigned long)drbd_pp_pool);
3060                 drbd_pp_pool = page;
3061         }
3062         drbd_pp_vacant = number;
3063
3064         return 0;
3065
3066 Enomem:
3067         drbd_destroy_mempools(); /* in case we allocated some */
3068         return -ENOMEM;
3069 }
3070
3071 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3072         void *unused)
3073 {
3074         /* just so we have it.  you never know what interesting things we
3075          * might want to do here some day...
3076          */
3077
3078         return NOTIFY_DONE;
3079 }
3080
3081 static struct notifier_block drbd_notifier = {
3082         .notifier_call = drbd_notify_sys,
3083 };
3084
3085 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3086 {
3087         int rr;
3088
3089         rr = drbd_release_ee(mdev, &mdev->active_ee);
3090         if (rr)
3091                 dev_err(DEV, "%d EEs in active list found!\n", rr);
3092
3093         rr = drbd_release_ee(mdev, &mdev->sync_ee);
3094         if (rr)
3095                 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3096
3097         rr = drbd_release_ee(mdev, &mdev->read_ee);
3098         if (rr)
3099                 dev_err(DEV, "%d EEs in read list found!\n", rr);
3100
3101         rr = drbd_release_ee(mdev, &mdev->done_ee);
3102         if (rr)
3103                 dev_err(DEV, "%d EEs in done list found!\n", rr);
3104
3105         rr = drbd_release_ee(mdev, &mdev->net_ee);
3106         if (rr)
3107                 dev_err(DEV, "%d EEs in net list found!\n", rr);
3108 }
3109
3110 /* caution. no locking.
3111  * currently only used from module cleanup code. */
3112 static void drbd_delete_device(unsigned int minor)
3113 {
3114         struct drbd_conf *mdev = minor_to_mdev(minor);
3115
3116         if (!mdev)
3117                 return;
3118
3119         /* paranoia asserts */
3120         if (mdev->open_cnt != 0)
3121                 dev_err(DEV, "open_cnt = %d in %s:%u\n", mdev->open_cnt,
3122                                 __FILE__, __LINE__);
3123
3124         ERR_IF (!list_empty(&mdev->data.work.q)) {
3125                 struct list_head *lp;
3126                 list_for_each(lp, &mdev->data.work.q) {
3127                         dev_err(DEV, "lp = %p\n", lp);
3128                 }
3129         };
3130         /* end paranoia asserts */
3131
3132         del_gendisk(mdev->vdisk);
3133
3134         /* cleanup stuff that may have been allocated during
3135          * device (re-)configuration or state changes */
3136
3137         if (mdev->this_bdev)
3138                 bdput(mdev->this_bdev);
3139
3140         drbd_free_resources(mdev);
3141
3142         drbd_release_ee_lists(mdev);
3143
3144         /* should be freed on disconnect? */
3145         kfree(mdev->ee_hash);
3146         /*
3147         mdev->ee_hash_s = 0;
3148         mdev->ee_hash = NULL;
3149         */
3150
3151         lc_destroy(mdev->act_log);
3152         lc_destroy(mdev->resync);
3153
3154         kfree(mdev->p_uuid);
3155         /* mdev->p_uuid = NULL; */
3156
3157         kfree(mdev->int_dig_out);
3158         kfree(mdev->int_dig_in);
3159         kfree(mdev->int_dig_vv);
3160
3161         /* cleanup the rest that has been
3162          * allocated from drbd_new_device
3163          * and actually free the mdev itself */
3164         drbd_free_mdev(mdev);
3165 }
3166
3167 static void drbd_cleanup(void)
3168 {
3169         unsigned int i;
3170
3171         unregister_reboot_notifier(&drbd_notifier);
3172
3173         drbd_nl_cleanup();
3174
3175         if (minor_table) {
3176                 if (drbd_proc)
3177                         remove_proc_entry("drbd", NULL);
3178                 i = minor_count;
3179                 while (i--)
3180                         drbd_delete_device(i);
3181                 drbd_destroy_mempools();
3182         }
3183
3184         kfree(minor_table);
3185
3186         unregister_blkdev(DRBD_MAJOR, "drbd");
3187
3188         printk(KERN_INFO "drbd: module cleanup done.\n");
3189 }
3190
3191 /**
3192  * drbd_congested() - Callback for pdflush
3193  * @congested_data:     User data
3194  * @bdi_bits:           Bits pdflush is currently interested in
3195  *
3196  * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3197  */
3198 static int drbd_congested(void *congested_data, int bdi_bits)
3199 {
3200         struct drbd_conf *mdev = congested_data;
3201         struct request_queue *q;
3202         char reason = '-';
3203         int r = 0;
3204
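        /* Three possible congestion sources, encoded in congestion_reason:
         * 'd' = DRBD itself has frozen IO, 'b' = the local backing device,
         * 'n' = the network send path ('a' if both disk and network). */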
3205         if (!__inc_ap_bio_cond(mdev)) {
3206                 /* DRBD has frozen IO */
3207                 r = bdi_bits;
3208                 reason = 'd';
3209                 goto out;
3210         }
3211
3212         if (get_ldev(mdev)) {
3213                 q = bdev_get_queue(mdev->ldev->backing_bdev);
3214                 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3215                 put_ldev(mdev);
3216                 if (r)
3217                         reason = 'b';
3218         }
3219
3220         if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3221                 r |= (1 << BDI_async_congested);
3222                 reason = reason == 'b' ? 'a' : 'n';
3223         }
3224
3225 out:
3226         mdev->congestion_reason = reason;
3227         return r;
3228 }
3229
3230 struct drbd_conf *drbd_new_device(unsigned int minor)
3231 {
3232         struct drbd_conf *mdev;
3233         struct gendisk *disk;
3234         struct request_queue *q;
3235
3236         /* GFP_KERNEL, we are outside of all write-out paths */
3237         mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3238         if (!mdev)
3239                 return NULL;
3240         if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3241                 goto out_no_cpumask;
3242
3243         mdev->minor = minor;
3244
3245         drbd_init_set_defaults(mdev);
3246
3247         q = blk_alloc_queue(GFP_KERNEL);
3248         if (!q)
3249                 goto out_no_q;
3250         mdev->rq_queue = q;
3251         q->queuedata   = mdev;
3252
3253         disk = alloc_disk(1);
3254         if (!disk)
3255                 goto out_no_disk;
3256         mdev->vdisk = disk;
3257
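        /* A new device starts out as Secondary; keep the disk read-only
         * until it is promoted to Primary. */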
3258         set_disk_ro(disk, TRUE);
3259
3260         disk->queue = q;
3261         disk->major = DRBD_MAJOR;
3262         disk->first_minor = minor;
3263         disk->fops = &drbd_ops;
3264         sprintf(disk->disk_name, "drbd%d", minor);
3265         disk->private_data = mdev;
3266
3267         mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3268         /* we have no partitions. we contain only ourselves. */
3269         mdev->this_bdev->bd_contains = mdev->this_bdev;
3270
3271         q->backing_dev_info.congested_fn = drbd_congested;
3272         q->backing_dev_info.congested_data = mdev;
3273
3274         blk_queue_make_request(q, drbd_make_request_26);
3275         blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
3276         blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3277         blk_queue_merge_bvec(q, drbd_merge_bvec);
3278         q->queue_lock = &mdev->req_lock;
3279
3280         mdev->md_io_page = alloc_page(GFP_KERNEL);
3281         if (!mdev->md_io_page)
3282                 goto out_no_io_page;
3283
3284         if (drbd_bm_init(mdev))
3285                 goto out_no_bitmap;
3286         /* no need to lock access, we are still initializing this minor device. */
3287         if (!tl_init(mdev))
3288                 goto out_no_tl;
3289
3290         mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3291         if (!mdev->app_reads_hash)
3292                 goto out_no_app_reads;
3293
3294         mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3295         if (!mdev->current_epoch)
3296                 goto out_no_epoch;
3297
3298         INIT_LIST_HEAD(&mdev->current_epoch->list);
3299         mdev->epochs = 1;
3300
3301         return mdev;
3302
3303 /* out_whatever_else:
3304         kfree(mdev->current_epoch); */
3305 out_no_epoch:
3306         kfree(mdev->app_reads_hash);
3307 out_no_app_reads:
3308         tl_cleanup(mdev);
3309 out_no_tl:
3310         drbd_bm_cleanup(mdev);
3311 out_no_bitmap:
3312         __free_page(mdev->md_io_page);
3313 out_no_io_page:
3314         put_disk(disk);
3315 out_no_disk:
3316         blk_cleanup_queue(q);
3317 out_no_q:
3318         free_cpumask_var(mdev->cpu_mask);
3319 out_no_cpumask:
3320         kfree(mdev);
3321         return NULL;
3322 }
3323
3324 /* counterpart of drbd_new_device.
3325  * last part of drbd_delete_device. */
3326 void drbd_free_mdev(struct drbd_conf *mdev)
3327 {
3328         kfree(mdev->current_epoch);
3329         kfree(mdev->app_reads_hash);
3330         tl_cleanup(mdev);
3331         if (mdev->bitmap) /* should no longer be there. */
3332                 drbd_bm_cleanup(mdev);
3333         __free_page(mdev->md_io_page);
3334         put_disk(mdev->vdisk);
3335         blk_cleanup_queue(mdev->rq_queue);
3336         free_cpumask_var(mdev->cpu_mask);
3337         kfree(mdev);
3338 }
3339
3340
3341 int __init drbd_init(void)
3342 {
3343         int err;
3344
3345         if (sizeof(struct p_handshake) != 80) {
3346                 printk(KERN_ERR
3347                        "drbd: never change the size or layout "
3348                        "of the HandShake packet.\n");
3349                 return -EINVAL;
3350         }
3351
3352         if (1 > minor_count || minor_count > 255) {
3353                 printk(KERN_ERR
3354                         "drbd: invalid minor_count (%d)\n", minor_count);
3355 #ifdef MODULE
3356                 return -EINVAL;
3357 #else
3358                 minor_count = 8;
3359 #endif
3360         }
3361
3362         err = drbd_nl_init();
3363         if (err)
3364                 return err;
3365
3366         err = register_blkdev(DRBD_MAJOR, "drbd");
3367         if (err) {
3368                 printk(KERN_ERR
3369                        "drbd: unable to register block device major %d\n",
3370                        DRBD_MAJOR);
3371                 return err;
3372         }
3373
3374         register_reboot_notifier(&drbd_notifier);
3375
3376         /*
3377          * allocate all necessary structs
3378          */
3379         err = -ENOMEM;
3380
3381         init_waitqueue_head(&drbd_pp_wait);
3382
3383         drbd_proc = NULL; /* play safe for drbd_cleanup */
3384         minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3385                                 GFP_KERNEL);
3386         if (!minor_table)
3387                 goto Enomem;
3388
3389         err = drbd_create_mempools();
3390         if (err)
3391                 goto Enomem;
3392
3393         drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3394         if (!drbd_proc) {
3395                 printk(KERN_ERR "drbd: unable to register proc file\n");
3396                 goto Enomem;
3397         }
3398
3399         rwlock_init(&global_state_lock);
3400
3401         printk(KERN_INFO "drbd: initialized. "
3402                "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3403                API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3404         printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3405         printk(KERN_INFO "drbd: registered as block device major %d\n",
3406                 DRBD_MAJOR);
3407         printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3408
3409         return 0; /* Success! */
3410
3411 Enomem:
3412         drbd_cleanup();
3413         if (err == -ENOMEM)
3414                 /* currently always the case */
3415                 printk(KERN_ERR "drbd: ran out of memory\n");
3416         else
3417                 printk(KERN_ERR "drbd: initialization failure\n");
3418         return err;
3419 }
3420
3421 void drbd_free_bc(struct drbd_backing_dev *ldev)
3422 {
3423         if (ldev == NULL)
3424                 return;
3425
3426         blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3427         blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3428
3429         kfree(ldev);
3430 }
3431
3432 void drbd_free_sock(struct drbd_conf *mdev)
3433 {
3434         if (mdev->data.socket) {
3435                 mutex_lock(&mdev->data.mutex);
3436                 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3437                 sock_release(mdev->data.socket);
3438                 mdev->data.socket = NULL;
3439                 mutex_unlock(&mdev->data.mutex);
3440         }
3441         if (mdev->meta.socket) {
3442                 mutex_lock(&mdev->meta.mutex);
3443                 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3444                 sock_release(mdev->meta.socket);
3445                 mdev->meta.socket = NULL;
3446                 mutex_unlock(&mdev->meta.mutex);
3447         }
3448 }
3449
3450
3451 void drbd_free_resources(struct drbd_conf *mdev)
3452 {
3453         crypto_free_hash(mdev->csums_tfm);
3454         mdev->csums_tfm = NULL;
3455         crypto_free_hash(mdev->verify_tfm);
3456         mdev->verify_tfm = NULL;
3457         crypto_free_hash(mdev->cram_hmac_tfm);
3458         mdev->cram_hmac_tfm = NULL;
3459         crypto_free_hash(mdev->integrity_w_tfm);
3460         mdev->integrity_w_tfm = NULL;
3461         crypto_free_hash(mdev->integrity_r_tfm);
3462         mdev->integrity_r_tfm = NULL;
3463
3464         drbd_free_sock(mdev);
3465
3466         __no_warn(local,
3467                   drbd_free_bc(mdev->ldev);
3468                   mdev->ldev = NULL;);
3469 }
3470
3471 /* meta data management */
3472
3473 struct meta_data_on_disk {
3474         u64 la_size;           /* last agreed size. */
3475         u64 uuid[UI_SIZE];   /* UUIDs. */
3476         u64 device_uuid;
3477         u64 reserved_u64_1;
3478         u32 flags;             /* MDF */
3479         u32 magic;
3480         u32 md_size_sect;
3481         u32 al_offset;         /* offset to this block */
3482         u32 al_nr_extents;     /* important for restoring the AL */
3483               /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3484         u32 bm_offset;         /* offset to the bitmap, from here */
3485         u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3486         u32 reserved_u32[4];
3487
3488 } __packed;
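/* All multi-byte fields of struct meta_data_on_disk are stored big-endian
 * on disk; drbd_md_sync() and drbd_md_read() convert them with
 * cpu_to_be*() / be*_to_cpu(). */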
3489
3490 /**
3491  * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3492  * @mdev:       DRBD device.
3493  */
3494 void drbd_md_sync(struct drbd_conf *mdev)
3495 {
3496         struct meta_data_on_disk *buffer;
3497         sector_t sector;
3498         int i;
3499
3500         del_timer(&mdev->md_sync_timer);
3501         /* timer may be rearmed by drbd_md_mark_dirty() now. */
3502         if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3503                 return;
3504
3505         /* We use D_FAILED here, and not D_ATTACHING, because we try to write
3506          * the metadata even if we are detaching due to a disk failure! */
3507         if (!get_ldev_if_state(mdev, D_FAILED))
3508                 return;
3509
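        /* All meta data IO uses the single preallocated md_io_page;
         * md_io_mutex serializes access to it. */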
3510         mutex_lock(&mdev->md_io_mutex);
3511         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3512         memset(buffer, 0, 512);
3513
3514         buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3515         for (i = UI_CURRENT; i < UI_SIZE; i++)
3516                 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3517         buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3518         buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3519
3520         buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3521         buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3522         buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3523         buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3524         buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3525
3526         buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3527
3528         D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3529         sector = mdev->ldev->md.md_offset;
3530
3531         if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3532                 /* this was a best-effort attempt anyway ... */
3533                 dev_err(DEV, "meta data update failed!\n");
3534                 drbd_chk_io_error(mdev, 1, TRUE);
3535         }
3536
3537         /* Update mdev->ldev->md.la_size_sect,
3538          * since we just committed the new size to the on-disk metadata. */
3539         mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3540
3541         mutex_unlock(&mdev->md_io_mutex);
3542         put_ldev(mdev);
3543 }
3544
3545 /**
3546  * drbd_md_read() - Reads in the meta data super block
3547  * @mdev:       DRBD device.
3548  * @bdev:       Device from which the meta data should be read in.
3549  *
3550  * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case
3551  * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3552  */
3553 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3554 {
3555         struct meta_data_on_disk *buffer;
3556         int i, rv = NO_ERROR;
3557
3558         if (!get_ldev_if_state(mdev, D_ATTACHING))
3559                 return ERR_IO_MD_DISK;
3560
3561         mutex_lock(&mdev->md_io_mutex);
3562         buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3563
3564         if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3565                 /* NOTE: can't do normal error processing here, as this is
3566                    called BEFORE the disk is attached */
3567                 dev_err(DEV, "Error while reading metadata.\n");
3568                 rv = ERR_IO_MD_DISK;
3569                 goto err;
3570         }
3571
3572         if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3573                 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3574                 rv = ERR_MD_INVALID;
3575                 goto err;
3576         }
3577         if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3578                 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3579                     be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3580                 rv = ERR_MD_INVALID;
3581                 goto err;
3582         }
3583         if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3584                 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3585                     be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3586                 rv = ERR_MD_INVALID;
3587                 goto err;
3588         }
3589         if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3590                 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3591                     be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3592                 rv = ERR_MD_INVALID;
3593                 goto err;
3594         }
3595
3596         if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3597                 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3598                     be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3599                 rv = ERR_MD_INVALID;
3600                 goto err;
3601         }
3602
3603         bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3604         for (i = UI_CURRENT; i < UI_SIZE; i++)
3605                 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3606         bdev->md.flags = be32_to_cpu(buffer->flags);
3607         mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3608         bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3609
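        /* Guard against a bogus on-disk value: fewer than 7 activity log
         * extents is not usable, fall back to 127. */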
3610         if (mdev->sync_conf.al_extents < 7)
3611                 mdev->sync_conf.al_extents = 127;
3612
3613  err:
3614         mutex_unlock(&mdev->md_io_mutex);
3615         put_ldev(mdev);
3616
3617         return rv;
3618 }
3619
3620 static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
3621 {
3622         static char *uuid_str[UI_EXTENDED_SIZE] = {
3623                 [UI_CURRENT] = "CURRENT",
3624                 [UI_BITMAP] = "BITMAP",
3625                 [UI_HISTORY_START] = "HISTORY_START",
3626                 [UI_HISTORY_END] = "HISTORY_END",
3627                 [UI_SIZE] = "SIZE",
3628                 [UI_FLAGS] = "FLAGS",
3629         };
3630
3631         if (index >= UI_EXTENDED_SIZE) {
3632                 dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
3633                 return;
3634         }
3635
3636         dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
3637                  uuid_str[index],
3638                  (unsigned long long)mdev->ldev->md.uuid[index]);
3639 }
3640
3641
3642 /**
3643  * drbd_md_mark_dirty() - Mark meta data super block as dirty
3644  * @mdev:       DRBD device.
3645  *
3646  * Call this function if you change anything that should be written to
3647  * the meta-data super block. This function sets MD_DIRTY, and starts a
3648  * timer that ensures drbd_md_sync() gets called within five seconds (one second in DEBUG builds).
3649  */
3650 #ifdef DEBUG
3651 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3652 {
3653         if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3654                 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3655                 mdev->last_md_mark_dirty.line = line;
3656                 mdev->last_md_mark_dirty.func = func;
3657         }
3658 }
3659 #else
3660 void drbd_md_mark_dirty(struct drbd_conf *mdev)
3661 {
3662         if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3663                 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3664 }
3665 #endif
3666
3667 static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3668 {
3669         int i;
3670
3671         for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
3672                 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3673                 debug_drbd_uuid(mdev, i+1);
3674         }
3675 }
3676
3677 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3678 {
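        /* The least significant bit of the current UUID reflects the local
         * role at the time the UUID is set: 1 while Primary, 0 otherwise. */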
3679         if (idx == UI_CURRENT) {
3680                 if (mdev->state.role == R_PRIMARY)
3681                         val |= 1;
3682                 else
3683                         val &= ~((u64)1);
3684
3685                 drbd_set_ed_uuid(mdev, val);
3686         }
3687
3688         mdev->ldev->md.uuid[idx] = val;
3689         debug_drbd_uuid(mdev, idx);
3690         drbd_md_mark_dirty(mdev);
3691 }
3692
3693
3694 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3695 {
3696         if (mdev->ldev->md.uuid[idx]) {
3697                 drbd_uuid_move_history(mdev);
3698                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3699                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3700         }
3701         _drbd_uuid_set(mdev, idx, val);
3702 }
3703
3704 /**
3705  * drbd_uuid_new_current() - Creates a new current UUID
3706  * @mdev:       DRBD device.
3707  *
3708  * Creates a new current UUID, and rotates the old current UUID into
3709  * the bitmap slot. Causes an incremental resync upon next connect.
3710  */
3711 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3712 {
3713         u64 val;
3714
3715         dev_info(DEV, "Creating new current UUID\n");
3716         D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
3717         mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3718         debug_drbd_uuid(mdev, UI_BITMAP);
3719
3720         get_random_bytes(&val, sizeof(u64));
3721         _drbd_uuid_set(mdev, UI_CURRENT, val);
3722         /* get it to stable storage _now_ */
3723         drbd_md_sync(mdev);
3724 }
3725
3726 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3727 {
3728         if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3729                 return;
3730
3731         if (val == 0) {
3732                 drbd_uuid_move_history(mdev);
3733                 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3734                 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3735                 debug_drbd_uuid(mdev, UI_HISTORY_START);
3736                 debug_drbd_uuid(mdev, UI_BITMAP);
3737         } else {
3738                 if (mdev->ldev->md.uuid[UI_BITMAP])
3739                         dev_warn(DEV, "bm UUID already set");
3740
3741                 mdev->ldev->md.uuid[UI_BITMAP] = val;
3742                 mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
3743
3744                 debug_drbd_uuid(mdev, UI_BITMAP);
3745         }
3746         drbd_md_mark_dirty(mdev);
3747 }
3748
3749 /**
3750  * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3751  * @mdev:       DRBD device.
3752  *
3753  * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3754  */
3755 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3756 {
3757         int rv = -EIO;
3758
3759         if (get_ldev_if_state(mdev, D_ATTACHING)) {
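                /* Set and sync MDF_FULL_SYNC first: should we crash while
                 * writing out the bitmap, the flag still forces a full sync
                 * after the next attach.  It is cleared again only once the
                 * bitmap write has succeeded. */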
3760                 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3761                 drbd_md_sync(mdev);
3762                 drbd_bm_set_all(mdev);
3763
3764                 rv = drbd_bm_write(mdev);
3765
3766                 if (!rv) {
3767                         drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3768                         drbd_md_sync(mdev);
3769                 }
3770
3771                 put_ldev(mdev);
3772         }
3773
3774         return rv;
3775 }
3776
3777 /**
3778  * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3779  * @mdev:       DRBD device.
3780  *
3781  * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3782  */
3783 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3784 {
3785         int rv = -EIO;
3786
3787         drbd_resume_al(mdev);
3788         if (get_ldev_if_state(mdev, D_ATTACHING)) {
3789                 drbd_bm_clear_all(mdev);
3790                 rv = drbd_bm_write(mdev);
3791                 put_ldev(mdev);
3792         }
3793
3794         return rv;
3795 }
3796
3797 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3798 {
3799         struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3800         int rv;
3801
3802         D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3803
3804         drbd_bm_lock(mdev, work->why);
3805         rv = work->io_fn(mdev);
3806         drbd_bm_unlock(mdev);
3807
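        /* Bitmap IO is done: clear BITMAP_IO before waking up waiters on
         * misc_wait that re-check the flag; the barrier orders the bit
         * clear against the wake-up. */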
3808         clear_bit(BITMAP_IO, &mdev->flags);
3809         smp_mb__after_clear_bit();
3810         wake_up(&mdev->misc_wait);
3811
3812         if (work->done)
3813                 work->done(mdev, rv);
3814
3815         clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3816         work->why = NULL;
3817
3818         return 1;
3819 }
3820
3821 void drbd_ldev_destroy(struct drbd_conf *mdev)
3822 {
3823         lc_destroy(mdev->resync);
3824         mdev->resync = NULL;
3825         lc_destroy(mdev->act_log);
3826         mdev->act_log = NULL;
3827         __no_warn(local,
3828                 drbd_free_bc(mdev->ldev);
3829                 mdev->ldev = NULL;);
3830
3831         if (mdev->md_io_tmpp) {
3832                 __free_page(mdev->md_io_tmpp);
3833                 mdev->md_io_tmpp = NULL;
3834         }
3835         clear_bit(GO_DISKLESS, &mdev->flags);
3836 }
3837
3838 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3839 {
3840         D_ASSERT(mdev->state.disk == D_FAILED);
3841         /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3842          * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3843          * the protected members anymore, though, so once put_ldev reaches zero
3844          * again, it will be safe to free them. */
3845         drbd_force_state(mdev, NS(disk, D_DISKLESS));
3846         return 1;
3847 }
3848
3849 void drbd_go_diskless(struct drbd_conf *mdev)
3850 {
3851         D_ASSERT(mdev->state.disk == D_FAILED);
3852         if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3853                 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3854 }
3855
3856 /**
3857  * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3858  * @mdev:       DRBD device.
3859  * @io_fn:      IO callback to be called when bitmap IO is possible
3860  * @done:       callback to be called after the bitmap IO was performed
3861  * @why:        Descriptive text of the reason for doing the IO
3862  *
3863  * While IO on the bitmap happens, application IO is frozen; this ensures
3864  * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3865  * called from worker context. It MUST NOT be used while a previous such
3866  * work is still pending!
3867  */
3868 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3869                           int (*io_fn)(struct drbd_conf *),
3870                           void (*done)(struct drbd_conf *, int),
3871                           char *why)
3872 {
3873         D_ASSERT(current == mdev->worker.task);
3874
3875         D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
3876         D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
3877         D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
3878         if (mdev->bm_io_work.why)
3879                 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
3880                         why, mdev->bm_io_work.why);
3881
3882         mdev->bm_io_work.io_fn = io_fn;
3883         mdev->bm_io_work.done = done;
3884         mdev->bm_io_work.why = why;
3885
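        /* With req_lock held: flag that bitmap IO is wanted.  If no
         * application bios are in flight, queue the work right away;
         * otherwise it gets queued from the path that drops ap_bio_cnt
         * to zero. */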
3886         spin_lock_irq(&mdev->req_lock);
3887         set_bit(BITMAP_IO, &mdev->flags);
3888         if (atomic_read(&mdev->ap_bio_cnt) == 0) {
3889                 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
3890                         drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
3891         }
3892         spin_unlock_irq(&mdev->req_lock);
3893 }
3894
3895 /**
3896  * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3897  * @mdev:       DRBD device.
3898  * @io_fn:      IO callback to be called when bitmap IO is possible
3899  * @why:        Descriptive text of the reason for doing the IO
3900  *
3901  * Freezes application IO while the actual bitmap IO operation runs. This
3902  * function MAY NOT be called from worker context.
3903  */
3904 int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why)
3905 {
3906         int rv;
3907
3908         D_ASSERT(current != mdev->worker.task);
3909
3910         drbd_suspend_io(mdev);
3911
3912         drbd_bm_lock(mdev, why);
3913         rv = io_fn(mdev);
3914         drbd_bm_unlock(mdev);
3915
3916         drbd_resume_io(mdev);
3917
3918         return rv;
3919 }
3920
3921 void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3922 {
3923         if ((mdev->ldev->md.flags & flag) != flag) {
3924                 drbd_md_mark_dirty(mdev);
3925                 mdev->ldev->md.flags |= flag;
3926         }
3927 }
3928
3929 void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
3930 {
3931         if ((mdev->ldev->md.flags & flag) != 0) {
3932                 drbd_md_mark_dirty(mdev);
3933                 mdev->ldev->md.flags &= ~flag;
3934         }
3935 }
3936 int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3937 {
3938         return (bdev->md.flags & flag) != 0;
3939 }
3940
3941 static void md_sync_timer_fn(unsigned long data)
3942 {
3943         struct drbd_conf *mdev = (struct drbd_conf *) data;
3944
3945         drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
3946 }
3947
3948 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3949 {
3950         dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
3951 #ifdef DEBUG
3952         dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
3953                 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
3954 #endif
3955         drbd_md_sync(mdev);
3956         return 1;
3957 }
3958
3959 #ifdef CONFIG_DRBD_FAULT_INJECTION
3960 /* Fault insertion support including random number generator shamelessly
3961  * stolen from kernel/rcutorture.c */
3962 struct fault_random_state {
3963         unsigned long state;
3964         unsigned long count;
3965 };
3966
3967 #define FAULT_RANDOM_MULT 39916801  /* prime */
3968 #define FAULT_RANDOM_ADD        479001701 /* prime */
3969 #define FAULT_RANDOM_REFRESH 10000
3970
3971 /*
3972  * Crude but fast random-number generator.  Uses a linear congruential
3973  * generator, with occasional help from get_random_bytes().
3974  */
3975 static unsigned long
3976 _drbd_fault_random(struct fault_random_state *rsp)
3977 {
3978         long refresh;
3979
3980         if (!rsp->count--) {
3981                 get_random_bytes(&refresh, sizeof(refresh));
3982                 rsp->state += refresh;
3983                 rsp->count = FAULT_RANDOM_REFRESH;
3984         }
3985         rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3986         return swahw32(rsp->state);
3987 }
3988
3989 static char *
3990 _drbd_fault_str(unsigned int type) {
3991         static char *_faults[] = {
3992                 [DRBD_FAULT_MD_WR] = "Meta-data write",
3993                 [DRBD_FAULT_MD_RD] = "Meta-data read",
3994                 [DRBD_FAULT_RS_WR] = "Resync write",
3995                 [DRBD_FAULT_RS_RD] = "Resync read",
3996                 [DRBD_FAULT_DT_WR] = "Data write",
3997                 [DRBD_FAULT_DT_RD] = "Data read",
3998                 [DRBD_FAULT_DT_RA] = "Data read ahead",
3999                 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4000                 [DRBD_FAULT_AL_EE] = "EE allocation",
4001                 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4002         };
4003
4004         return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4005 }
4006
4007 unsigned int
4008 _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4009 {
4010         static struct fault_random_state rrs = {0, 0};
4011
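        /* Inject a fault when this minor is selected by the fault_devs
         * bitmask (0 selects all devices) and a pseudo-random roll falls
         * within the fault_rate percentage. */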
4012         unsigned int ret = (
4013                 (fault_devs == 0 ||
4014                         ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4015                 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4016
4017         if (ret) {
4018                 fault_count++;
4019
4020                 if (__ratelimit(&drbd_ratelimit_state))
4021                         dev_warn(DEV, "***Simulating %s failure\n",
4022                                 _drbd_fault_str(type));
4023         }
4024
4025         return ret;
4026 }
4027 #endif
4028
4029 const char *drbd_buildtag(void)
4030 {
4031         /* DRBD built from external sources carries a reference to the
4032            git hash of the source code here. */
4033
4034         static char buildtag[38] = "\0uilt-in";
4035
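        /* buildtag deliberately starts with a NUL byte: on first use it is
         * either filled with the module's srcversion, or turned into
         * "built-in" by restoring the leading 'b'. */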
4036         if (buildtag[0] == 0) {
4037 #ifdef CONFIG_MODULES
4038                 if (THIS_MODULE != NULL)
4039                         sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4040                 else
4041 #endif
4042                         buildtag[0] = 'b';
4043         }
4044
4045         return buildtag;
4046 }
4047
4048 module_init(drbd_init)
4049 module_exit(drbd_cleanup)
4050
4051 EXPORT_SYMBOL(drbd_conn_str);
4052 EXPORT_SYMBOL(drbd_role_str);
4053 EXPORT_SYMBOL(drbd_disk_str);
4054 EXPORT_SYMBOL(drbd_set_st_err_str);