x86/power/64: Fix hibernation return address corruption
[cascardo/linux.git] / drivers / staging / lustre / lustre / lmv / lmv_obd.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
19  *
20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21  * CA 95054 USA or visit www.sun.com if you need additional information or
22  * have any questions.
23  *
24  * GPL HEADER END
25  */
26 /*
27  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
28  * Use is subject to license terms.
29  *
30  * Copyright (c) 2011, 2015, Intel Corporation.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  */
36
37 #define DEBUG_SUBSYSTEM S_LMV
38 #include <linux/slab.h>
39 #include <linux/module.h>
40 #include <linux/init.h>
41 #include <linux/pagemap.h>
42 #include <linux/mm.h>
43 #include <asm/div64.h>
44 #include <linux/seq_file.h>
45 #include <linux/namei.h>
46 #include <linux/uaccess.h>
47
48 #include "../include/lustre/lustre_idl.h"
49 #include "../include/obd_support.h"
50 #include "../include/lustre_lib.h"
51 #include "../include/lustre_net.h"
52 #include "../include/obd_class.h"
53 #include "../include/lprocfs_status.h"
54 #include "../include/lustre_lite.h"
55 #include "../include/lustre_fid.h"
56 #include "../include/lustre_kernelcomm.h"
57 #include "lmv_internal.h"
58
59 static void lmv_activate_target(struct lmv_obd *lmv,
60                                 struct lmv_tgt_desc *tgt,
61                                 int activate)
62 {
63         if (tgt->ltd_active == activate)
64                 return;
65
66         tgt->ltd_active = activate;
67         lmv->desc.ld_active_tgt_count += (activate ? 1 : -1);
68 }
69
70 /**
71  * Error codes:
72  *
73  *  -EINVAL  : UUID can't be found in the LMV's target list
74  *  -ENOTCONN: The UUID is found, but the target connection is bad (!)
75  *  -EBADF   : The UUID is found, but the OBD of the wrong type (!)
76  */
77 static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
78                               int activate)
79 {
80         struct lmv_tgt_desc    *uninitialized_var(tgt);
81         struct obd_device      *obd;
82         int                  i;
83         int                  rc = 0;
84
85         CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n",
86                lmv, uuid->uuid, activate);
87
88         spin_lock(&lmv->lmv_lock);
89         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
90                 tgt = lmv->tgts[i];
91                 if (!tgt || !tgt->ltd_exp)
92                         continue;
93
94                 CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i,
95                        tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie);
96
97                 if (obd_uuid_equals(uuid, &tgt->ltd_uuid))
98                         break;
99         }
100
101         if (i == lmv->desc.ld_tgt_count) {
102                 rc = -EINVAL;
103                 goto out_lmv_lock;
104         }
105
106         obd = class_exp2obd(tgt->ltd_exp);
107         if (!obd) {
108                 rc = -ENOTCONN;
109                 goto out_lmv_lock;
110         }
111
112         CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n",
113                obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd,
114                obd->obd_type->typ_name, i);
115         LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0);
116
117         if (tgt->ltd_active == activate) {
118                 CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd,
119                        activate ? "" : "in");
120                 goto out_lmv_lock;
121         }
122
123         CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
124                activate ? "" : "in");
125         lmv_activate_target(lmv, tgt, activate);
126
127  out_lmv_lock:
128         spin_unlock(&lmv->lmv_lock);
129         return rc;
130 }
131
132 static struct obd_uuid *lmv_get_uuid(struct obd_export *exp)
133 {
134         struct lmv_obd *lmv = &exp->exp_obd->u.lmv;
135         struct lmv_tgt_desc *tgt = lmv->tgts[0];
136
137         return tgt ? obd_get_uuid(tgt->ltd_exp) : NULL;
138 }
139
140 static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
141                       enum obd_notify_event ev, void *data)
142 {
143         struct obd_connect_data *conn_data;
144         struct lmv_obd    *lmv = &obd->u.lmv;
145         struct obd_uuid  *uuid;
146         int                   rc = 0;
147
148         if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) {
149                 CERROR("unexpected notification of %s %s!\n",
150                        watched->obd_type->typ_name,
151                        watched->obd_name);
152                 return -EINVAL;
153         }
154
155         uuid = &watched->u.cli.cl_target_uuid;
156         if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
157                 /*
158                  * Set MDC as active before notifying the observer, so the
159                  * observer can use the MDC normally.
160                  */
161                 rc = lmv_set_mdc_active(lmv, uuid,
162                                         ev == OBD_NOTIFY_ACTIVE);
163                 if (rc) {
164                         CERROR("%sactivation of %s failed: %d\n",
165                                ev == OBD_NOTIFY_ACTIVE ? "" : "de",
166                                uuid->uuid, rc);
167                         return rc;
168                 }
169         } else if (ev == OBD_NOTIFY_OCD) {
170                 conn_data = &watched->u.cli.cl_import->imp_connect_data;
171                 /*
172                  * XXX: Make sure that ocd_connect_flags from all targets are
173                  * the same. Otherwise one of MDTs runs wrong version or
174                  * something like this.  --umka
175                  */
176                 obd->obd_self_export->exp_connect_data = *conn_data;
177         }
178 #if 0
179         else if (ev == OBD_NOTIFY_DISCON) {
180                 /*
181                  * For disconnect event, flush fld cache for failout MDS case.
182                  */
183                 fld_client_flush(&lmv->lmv_fld);
184         }
185 #endif
186         /*
187          * Pass the notification up the chain.
188          */
189         if (obd->obd_observer)
190                 rc = obd_notify(obd->obd_observer, watched, ev, data);
191
192         return rc;
193 }
194
195 /**
196  * This is fake connect function. Its purpose is to initialize lmv and say
197  * caller that everything is okay. Real connection will be performed later.
198  */
199 static int lmv_connect(const struct lu_env *env,
200                        struct obd_export **exp, struct obd_device *obd,
201                        struct obd_uuid *cluuid, struct obd_connect_data *data,
202                        void *localdata)
203 {
204         struct lmv_obd  *lmv = &obd->u.lmv;
205         struct lustre_handle  conn = { 0 };
206         int                 rc = 0;
207
208         /*
209          * We don't want to actually do the underlying connections more than
210          * once, so keep track.
211          */
212         lmv->refcount++;
213         if (lmv->refcount > 1) {
214                 *exp = NULL;
215                 return 0;
216         }
217
218         rc = class_connect(&conn, obd, cluuid);
219         if (rc) {
220                 CERROR("class_connection() returned %d\n", rc);
221                 return rc;
222         }
223
224         *exp = class_conn2export(&conn);
225         class_export_get(*exp);
226
227         lmv->exp = *exp;
228         lmv->connected = 0;
229         lmv->cluuid = *cluuid;
230
231         if (data)
232                 lmv->conn_data = *data;
233
234         lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds",
235                                                     &obd->obd_kobj);
236         /*
237          * All real clients should perform actual connection right away, because
238          * it is possible, that LMV will not have opportunity to connect targets
239          * and MDC stuff will be called directly, for instance while reading
240          * ../mdc/../kbytesfree procfs file, etc.
241          */
242         if (data && data->ocd_connect_flags & OBD_CONNECT_REAL)
243                 rc = lmv_check_connect(obd);
244
245         if (rc && lmv->lmv_tgts_kobj)
246                 kobject_put(lmv->lmv_tgts_kobj);
247
248         return rc;
249 }
250
251 static void lmv_set_timeouts(struct obd_device *obd)
252 {
253         struct lmv_obd  *lmv;
254         int                 i;
255
256         lmv = &obd->u.lmv;
257         if (lmv->server_timeout == 0)
258                 return;
259
260         if (lmv->connected == 0)
261                 return;
262
263         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
264                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
265
266                 tgt = lmv->tgts[i];
267                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
268                         continue;
269
270                 obd_set_info_async(NULL, tgt->ltd_exp, sizeof(KEY_INTERMDS),
271                                    KEY_INTERMDS, 0, NULL, NULL);
272         }
273 }
274
275 static int lmv_init_ea_size(struct obd_export *exp, int easize,
276                             int def_easize, int cookiesize, int def_cookiesize)
277 {
278         struct obd_device   *obd = exp->exp_obd;
279         struct lmv_obd      *lmv = &obd->u.lmv;
280         int               i;
281         int               rc = 0;
282         int               change = 0;
283
284         if (lmv->max_easize < easize) {
285                 lmv->max_easize = easize;
286                 change = 1;
287         }
288         if (lmv->max_def_easize < def_easize) {
289                 lmv->max_def_easize = def_easize;
290                 change = 1;
291         }
292         if (lmv->max_cookiesize < cookiesize) {
293                 lmv->max_cookiesize = cookiesize;
294                 change = 1;
295         }
296         if (lmv->max_def_cookiesize < def_cookiesize) {
297                 lmv->max_def_cookiesize = def_cookiesize;
298                 change = 1;
299         }
300         if (change == 0)
301                 return 0;
302
303         if (lmv->connected == 0)
304                 return 0;
305
306         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
307                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
308
309                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) {
310                         CWARN("%s: NULL export for %d\n", obd->obd_name, i);
311                         continue;
312                 }
313
314                 rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize,
315                                      cookiesize, def_cookiesize);
316                 if (rc) {
317                         CERROR("%s: obd_init_ea_size() failed on MDT target %d: rc = %d\n",
318                                obd->obd_name, i, rc);
319                         break;
320                 }
321         }
322         return rc;
323 }
324
325 #define MAX_STRING_SIZE 128
326
327 static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
328 {
329         struct lmv_obd    *lmv = &obd->u.lmv;
330         struct obd_uuid  *cluuid = &lmv->cluuid;
331         struct obd_uuid   lmv_mdc_uuid = { "LMV_MDC_UUID" };
332         struct obd_device       *mdc_obd;
333         struct obd_export       *mdc_exp;
334         struct lu_fld_target     target;
335         int                   rc;
336
337         mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME,
338                                         &obd->obd_uuid);
339         if (!mdc_obd) {
340                 CERROR("target %s not attached\n", tgt->ltd_uuid.uuid);
341                 return -EINVAL;
342         }
343
344         CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n",
345                mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
346                tgt->ltd_uuid.uuid, obd->obd_uuid.uuid, cluuid->uuid);
347
348         if (!mdc_obd->obd_set_up) {
349                 CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid);
350                 return -EINVAL;
351         }
352
353         rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid,
354                          &lmv->conn_data, NULL);
355         if (rc) {
356                 CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc);
357                 return rc;
358         }
359
360         /*
361          * Init fid sequence client for this mdc and add new fld target.
362          */
363         rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA);
364         if (rc)
365                 return rc;
366
367         target.ft_srv = NULL;
368         target.ft_exp = mdc_exp;
369         target.ft_idx = tgt->ltd_idx;
370
371         fld_client_add_target(&lmv->lmv_fld, &target);
372
373         rc = obd_register_observer(mdc_obd, obd);
374         if (rc) {
375                 obd_disconnect(mdc_exp);
376                 CERROR("target %s register_observer error %d\n",
377                        tgt->ltd_uuid.uuid, rc);
378                 return rc;
379         }
380
381         if (obd->obd_observer) {
382                 /*
383                  * Tell the observer about the new target.
384                  */
385                 rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
386                                 OBD_NOTIFY_ACTIVE,
387                                 (void *)(tgt - lmv->tgts[0]));
388                 if (rc) {
389                         obd_disconnect(mdc_exp);
390                         return rc;
391                 }
392         }
393
394         tgt->ltd_active = 1;
395         tgt->ltd_exp = mdc_exp;
396         lmv->desc.ld_active_tgt_count++;
397
398         md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize,
399                         lmv->max_cookiesize, lmv->max_def_cookiesize);
400
401         CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n",
402                mdc_obd->obd_name, mdc_obd->obd_uuid.uuid,
403                atomic_read(&obd->obd_refcount));
404
405         if (lmv->lmv_tgts_kobj)
406                 /* Even if we failed to create the link, that's fine */
407                 rc = sysfs_create_link(lmv->lmv_tgts_kobj, &mdc_obd->obd_kobj,
408                                        mdc_obd->obd_name);
409         return 0;
410 }
411
412 static void lmv_del_target(struct lmv_obd *lmv, int index)
413 {
414         if (!lmv->tgts[index])
415                 return;
416
417         kfree(lmv->tgts[index]);
418         lmv->tgts[index] = NULL;
419         return;
420 }
421
422 static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
423                           __u32 index, int gen)
424 {
425         struct lmv_obd      *lmv = &obd->u.lmv;
426         struct lmv_tgt_desc *tgt;
427         int               rc = 0;
428
429         CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index);
430
431         mutex_lock(&lmv->lmv_init_mutex);
432
433         if (lmv->desc.ld_tgt_count == 0) {
434                 struct obd_device *mdc_obd;
435
436                 mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME,
437                                                 &obd->obd_uuid);
438                 if (!mdc_obd) {
439                         mutex_unlock(&lmv->lmv_init_mutex);
440                         CERROR("%s: Target %s not attached: rc = %d\n",
441                                obd->obd_name, uuidp->uuid, -EINVAL);
442                         return -EINVAL;
443                 }
444         }
445
446         if ((index < lmv->tgts_size) && lmv->tgts[index]) {
447                 tgt = lmv->tgts[index];
448                 CERROR("%s: UUID %s already assigned at LOV target index %d: rc = %d\n",
449                        obd->obd_name,
450                        obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST);
451                 mutex_unlock(&lmv->lmv_init_mutex);
452                 return -EEXIST;
453         }
454
455         if (index >= lmv->tgts_size) {
456                 /* We need to reallocate the lmv target array. */
457                 struct lmv_tgt_desc **newtgts, **old = NULL;
458                 __u32 newsize = 1;
459                 __u32 oldsize = 0;
460
461                 while (newsize < index + 1)
462                         newsize <<= 1;
463                 newtgts = kcalloc(newsize, sizeof(*newtgts), GFP_NOFS);
464                 if (!newtgts) {
465                         mutex_unlock(&lmv->lmv_init_mutex);
466                         return -ENOMEM;
467                 }
468
469                 if (lmv->tgts_size) {
470                         memcpy(newtgts, lmv->tgts,
471                                sizeof(*newtgts) * lmv->tgts_size);
472                         old = lmv->tgts;
473                         oldsize = lmv->tgts_size;
474                 }
475
476                 lmv->tgts = newtgts;
477                 lmv->tgts_size = newsize;
478                 smp_rmb();
479                 kfree(old);
480
481                 CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts,
482                        lmv->tgts_size);
483         }
484
485         tgt = kzalloc(sizeof(*tgt), GFP_NOFS);
486         if (!tgt) {
487                 mutex_unlock(&lmv->lmv_init_mutex);
488                 return -ENOMEM;
489         }
490
491         mutex_init(&tgt->ltd_fid_mutex);
492         tgt->ltd_idx = index;
493         tgt->ltd_uuid = *uuidp;
494         tgt->ltd_active = 0;
495         lmv->tgts[index] = tgt;
496         if (index >= lmv->desc.ld_tgt_count)
497                 lmv->desc.ld_tgt_count = index + 1;
498
499         if (lmv->connected) {
500                 rc = lmv_connect_mdc(obd, tgt);
501                 if (rc) {
502                         spin_lock(&lmv->lmv_lock);
503                         lmv->desc.ld_tgt_count--;
504                         memset(tgt, 0, sizeof(*tgt));
505                         spin_unlock(&lmv->lmv_lock);
506                 } else {
507                         int easize = sizeof(struct lmv_stripe_md) +
508                                 lmv->desc.ld_tgt_count * sizeof(struct lu_fid);
509                         lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
510                 }
511         }
512
513         mutex_unlock(&lmv->lmv_init_mutex);
514         return rc;
515 }
516
517 int lmv_check_connect(struct obd_device *obd)
518 {
519         struct lmv_obd       *lmv = &obd->u.lmv;
520         struct lmv_tgt_desc  *tgt;
521         int                i;
522         int                rc;
523         int                easize;
524
525         if (lmv->connected)
526                 return 0;
527
528         mutex_lock(&lmv->lmv_init_mutex);
529         if (lmv->connected) {
530                 mutex_unlock(&lmv->lmv_init_mutex);
531                 return 0;
532         }
533
534         if (lmv->desc.ld_tgt_count == 0) {
535                 mutex_unlock(&lmv->lmv_init_mutex);
536                 CERROR("%s: no targets configured.\n", obd->obd_name);
537                 return -EINVAL;
538         }
539
540         LASSERT(lmv->tgts);
541
542         if (!lmv->tgts[0]) {
543                 mutex_unlock(&lmv->lmv_init_mutex);
544                 CERROR("%s: no target configured for index 0.\n",
545                        obd->obd_name);
546                 return -EINVAL;
547         }
548
549         CDEBUG(D_CONFIG, "Time to connect %s to %s\n",
550                lmv->cluuid.uuid, obd->obd_name);
551
552         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
553                 tgt = lmv->tgts[i];
554                 if (!tgt)
555                         continue;
556                 rc = lmv_connect_mdc(obd, tgt);
557                 if (rc)
558                         goto out_disc;
559         }
560
561         lmv_set_timeouts(obd);
562         class_export_put(lmv->exp);
563         lmv->connected = 1;
564         easize = lmv_get_easize(lmv);
565         lmv_init_ea_size(obd->obd_self_export, easize, 0, 0, 0);
566         mutex_unlock(&lmv->lmv_init_mutex);
567         return 0;
568
569  out_disc:
570         while (i-- > 0) {
571                 int rc2;
572
573                 tgt = lmv->tgts[i];
574                 if (!tgt)
575                         continue;
576                 tgt->ltd_active = 0;
577                 if (tgt->ltd_exp) {
578                         --lmv->desc.ld_active_tgt_count;
579                         rc2 = obd_disconnect(tgt->ltd_exp);
580                         if (rc2) {
581                                 CERROR("LMV target %s disconnect on MDC idx %d: error %d\n",
582                                        tgt->ltd_uuid.uuid, i, rc2);
583                         }
584                 }
585         }
586         class_disconnect(lmv->exp);
587         mutex_unlock(&lmv->lmv_init_mutex);
588         return rc;
589 }
590
591 static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
592 {
593         struct lmv_obd   *lmv = &obd->u.lmv;
594         struct obd_device      *mdc_obd;
595         int                  rc;
596
597         mdc_obd = class_exp2obd(tgt->ltd_exp);
598
599         if (mdc_obd) {
600                 mdc_obd->obd_force = obd->obd_force;
601                 mdc_obd->obd_fail = obd->obd_fail;
602                 mdc_obd->obd_no_recov = obd->obd_no_recov;
603
604                 if (lmv->lmv_tgts_kobj)
605                         sysfs_remove_link(lmv->lmv_tgts_kobj,
606                                           mdc_obd->obd_name);
607         }
608
609         rc = obd_fid_fini(tgt->ltd_exp->exp_obd);
610         if (rc)
611                 CERROR("Can't finalize fids factory\n");
612
613         CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n",
614                tgt->ltd_exp->exp_obd->obd_name,
615                tgt->ltd_exp->exp_obd->obd_uuid.uuid);
616
617         obd_register_observer(tgt->ltd_exp->exp_obd, NULL);
618         rc = obd_disconnect(tgt->ltd_exp);
619         if (rc) {
620                 if (tgt->ltd_active) {
621                         CERROR("Target %s disconnect error %d\n",
622                                tgt->ltd_uuid.uuid, rc);
623                 }
624         }
625
626         lmv_activate_target(lmv, tgt, 0);
627         tgt->ltd_exp = NULL;
628         return 0;
629 }
630
631 static int lmv_disconnect(struct obd_export *exp)
632 {
633         struct obd_device     *obd = class_exp2obd(exp);
634         struct lmv_obd  *lmv = &obd->u.lmv;
635         int                 rc;
636         int                 i;
637
638         if (!lmv->tgts)
639                 goto out_local;
640
641         /*
642          * Only disconnect the underlying layers on the final disconnect.
643          */
644         lmv->refcount--;
645         if (lmv->refcount != 0)
646                 goto out_local;
647
648         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
649                 if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp)
650                         continue;
651
652                 lmv_disconnect_mdc(obd, lmv->tgts[i]);
653         }
654
655         if (lmv->lmv_tgts_kobj)
656                 kobject_put(lmv->lmv_tgts_kobj);
657
658 out_local:
659         /*
660          * This is the case when no real connection is established by
661          * lmv_check_connect().
662          */
663         if (!lmv->connected)
664                 class_export_put(exp);
665         rc = class_disconnect(exp);
666         if (lmv->refcount == 0)
667                 lmv->connected = 0;
668         return rc;
669 }
670
671 static int lmv_fid2path(struct obd_export *exp, int len, void *karg,
672                         void __user *uarg)
673 {
674         struct obd_device       *obddev = class_exp2obd(exp);
675         struct lmv_obd          *lmv = &obddev->u.lmv;
676         struct getinfo_fid2path *gf;
677         struct lmv_tgt_desc     *tgt;
678         struct getinfo_fid2path *remote_gf = NULL;
679         int                     remote_gf_size = 0;
680         int                     rc;
681
682         gf = (struct getinfo_fid2path *)karg;
683         tgt = lmv_find_target(lmv, &gf->gf_fid);
684         if (IS_ERR(tgt))
685                 return PTR_ERR(tgt);
686
687 repeat_fid2path:
688         rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg);
689         if (rc != 0 && rc != -EREMOTE)
690                 goto out_fid2path;
691
692         /* If remote_gf != NULL, it means just building the
693          * path on the remote MDT, copy this path segment to gf
694          */
695         if (remote_gf) {
696                 struct getinfo_fid2path *ori_gf;
697                 char *ptr;
698
699                 ori_gf = (struct getinfo_fid2path *)karg;
700                 if (strlen(ori_gf->gf_path) +
701                     strlen(gf->gf_path) > ori_gf->gf_pathlen) {
702                         rc = -EOVERFLOW;
703                         goto out_fid2path;
704                 }
705
706                 ptr = ori_gf->gf_path;
707
708                 memmove(ptr + strlen(gf->gf_path) + 1, ptr,
709                         strlen(ori_gf->gf_path));
710
711                 strncpy(ptr, gf->gf_path, strlen(gf->gf_path));
712                 ptr += strlen(gf->gf_path);
713                 *ptr = '/';
714         }
715
716         CDEBUG(D_INFO, "%s: get path %s "DFID" rec: %llu ln: %u\n",
717                tgt->ltd_exp->exp_obd->obd_name,
718                gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno,
719                gf->gf_linkno);
720
721         if (rc == 0)
722                 goto out_fid2path;
723
724         /* sigh, has to go to another MDT to do path building further */
725         if (!remote_gf) {
726                 remote_gf_size = sizeof(*remote_gf) + PATH_MAX;
727                 remote_gf = kzalloc(remote_gf_size, GFP_NOFS);
728                 if (!remote_gf) {
729                         rc = -ENOMEM;
730                         goto out_fid2path;
731                 }
732                 remote_gf->gf_pathlen = PATH_MAX;
733         }
734
735         if (!fid_is_sane(&gf->gf_fid)) {
736                 CERROR("%s: invalid FID "DFID": rc = %d\n",
737                        tgt->ltd_exp->exp_obd->obd_name,
738                        PFID(&gf->gf_fid), -EINVAL);
739                 rc = -EINVAL;
740                 goto out_fid2path;
741         }
742
743         tgt = lmv_find_target(lmv, &gf->gf_fid);
744         if (IS_ERR(tgt)) {
745                 rc = -EINVAL;
746                 goto out_fid2path;
747         }
748
749         remote_gf->gf_fid = gf->gf_fid;
750         remote_gf->gf_recno = -1;
751         remote_gf->gf_linkno = -1;
752         memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen);
753         gf = remote_gf;
754         goto repeat_fid2path;
755
756 out_fid2path:
757         kfree(remote_gf);
758         return rc;
759 }
760
761 static int lmv_hsm_req_count(struct lmv_obd *lmv,
762                              const struct hsm_user_request *hur,
763                              const struct lmv_tgt_desc *tgt_mds)
764 {
765         int                     i, nr = 0;
766         struct lmv_tgt_desc    *curr_tgt;
767
768         /* count how many requests must be sent to the given target */
769         for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
770                 curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid);
771                 if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid))
772                         nr++;
773         }
774         return nr;
775 }
776
777 static void lmv_hsm_req_build(struct lmv_obd *lmv,
778                               struct hsm_user_request *hur_in,
779                               const struct lmv_tgt_desc *tgt_mds,
780                               struct hsm_user_request *hur_out)
781 {
782         int                     i, nr_out;
783         struct lmv_tgt_desc    *curr_tgt;
784
785         /* build the hsm_user_request for the given target */
786         hur_out->hur_request = hur_in->hur_request;
787         nr_out = 0;
788         for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) {
789                 curr_tgt = lmv_find_target(lmv,
790                                            &hur_in->hur_user_item[i].hui_fid);
791                 if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) {
792                         hur_out->hur_user_item[nr_out] =
793                                 hur_in->hur_user_item[i];
794                         nr_out++;
795                 }
796         }
797         hur_out->hur_request.hr_itemcount = nr_out;
798         memcpy(hur_data(hur_out), hur_data(hur_in),
799                hur_in->hur_request.hr_data_len);
800 }
801
802 static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len,
803                                  struct lustre_kernelcomm *lk,
804                                  void __user *uarg)
805 {
806         int rc = 0;
807         __u32 i;
808
809         /* unregister request (call from llapi_hsm_copytool_fini) */
810         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
811                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
812
813                 if (!tgt || !tgt->ltd_exp)
814                         continue;
815
816                 /* best effort: try to clean as much as possible
817                  * (continue on error)
818                  */
819                 obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
820         }
821
822         /* Whatever the result, remove copytool from kuc groups.
823          * Unreached coordinators will get EPIPE on next requests
824          * and will unregister automatically.
825          */
826         rc = libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
827
828         return rc;
829 }
830
831 static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len,
832                                struct lustre_kernelcomm *lk, void __user *uarg)
833 {
834         struct file *filp;
835         __u32 i, j;
836         int err, rc = 0;
837         bool any_set = false;
838         struct kkuc_ct_data kcd = { 0 };
839
840         /* All or nothing: try to register to all MDS.
841          * In case of failure, unregister from previous MDS,
842          * except if it because of inactive target.
843          */
844         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
845                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
846
847                 if (!tgt || !tgt->ltd_exp)
848                         continue;
849
850                 err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
851                 if (err) {
852                         if (tgt->ltd_active) {
853                                 /* permanent error */
854                                 CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n",
855                                        tgt->ltd_uuid.uuid, i, cmd, err);
856                                 rc = err;
857                                 lk->lk_flags |= LK_FLG_STOP;
858                                 /* unregister from previous MDS */
859                                 for (j = 0; j < i; j++) {
860                                         tgt = lmv->tgts[j];
861
862                                         if (!tgt || !tgt->ltd_exp)
863                                                 continue;
864                                         obd_iocontrol(cmd, tgt->ltd_exp, len,
865                                                       lk, uarg);
866                                 }
867                                 return rc;
868                         }
869                         /* else: transient error.
870                          * kuc will register to the missing MDT when it is back
871                          */
872                 } else {
873                         any_set = true;
874                 }
875         }
876
877         if (!any_set)
878                 /* no registration done: return error */
879                 return -ENOTCONN;
880
881         /* at least one registration done, with no failure */
882         filp = fget(lk->lk_wfd);
883         if (!filp)
884                 return -EBADF;
885
886         kcd.kcd_magic = KKUC_CT_DATA_MAGIC;
887         kcd.kcd_uuid = lmv->cluuid;
888         kcd.kcd_archive = lk->lk_data;
889
890         rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group,
891                                    &kcd, sizeof(kcd));
892         if (rc) {
893                 if (filp)
894                         fput(filp);
895         }
896
897         return rc;
898 }
899
900 static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
901                          int len, void *karg, void __user *uarg)
902 {
903         struct obd_device    *obddev = class_exp2obd(exp);
904         struct lmv_obd       *lmv = &obddev->u.lmv;
905         struct lmv_tgt_desc *tgt = NULL;
906         int                i = 0;
907         int                rc = 0;
908         int                set = 0;
909         int                count = lmv->desc.ld_tgt_count;
910
911         if (count == 0)
912                 return -ENOTTY;
913
914         switch (cmd) {
915         case IOC_OBD_STATFS: {
916                 struct obd_ioctl_data *data = karg;
917                 struct obd_device *mdc_obd;
918                 struct obd_statfs stat_buf = {0};
919                 __u32 index;
920
921                 memcpy(&index, data->ioc_inlbuf2, sizeof(__u32));
922                 if (index >= count)
923                         return -ENODEV;
924
925                 tgt = lmv->tgts[index];
926                 if (!tgt || !tgt->ltd_active)
927                         return -ENODATA;
928
929                 mdc_obd = class_exp2obd(tgt->ltd_exp);
930                 if (!mdc_obd)
931                         return -EINVAL;
932
933                 /* copy UUID */
934                 if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd),
935                                  min((int)data->ioc_plen2,
936                                      (int)sizeof(struct obd_uuid))))
937                         return -EFAULT;
938
939                 rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf,
940                                 cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
941                                 0);
942                 if (rc)
943                         return rc;
944                 if (copy_to_user(data->ioc_pbuf1, &stat_buf,
945                                  min((int)data->ioc_plen1,
946                                      (int)sizeof(stat_buf))))
947                         return -EFAULT;
948                 break;
949         }
950         case OBD_IOC_QUOTACTL: {
951                 struct if_quotactl *qctl = karg;
952                 struct obd_quotactl *oqctl;
953
954                 if (qctl->qc_valid == QC_MDTIDX) {
955                         if (count <= qctl->qc_idx)
956                                 return -EINVAL;
957
958                         tgt = lmv->tgts[qctl->qc_idx];
959                         if (!tgt || !tgt->ltd_exp)
960                                 return -EINVAL;
961                 } else if (qctl->qc_valid == QC_UUID) {
962                         for (i = 0; i < count; i++) {
963                                 tgt = lmv->tgts[i];
964                                 if (!tgt)
965                                         continue;
966                                 if (!obd_uuid_equals(&tgt->ltd_uuid,
967                                                      &qctl->obd_uuid))
968                                         continue;
969
970                                 if (!tgt->ltd_exp)
971                                         return -EINVAL;
972
973                                 break;
974                         }
975                 } else {
976                         return -EINVAL;
977                 }
978
979                 if (i >= count)
980                         return -EAGAIN;
981
982                 LASSERT(tgt && tgt->ltd_exp);
983                 oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
984                 if (!oqctl)
985                         return -ENOMEM;
986
987                 QCTL_COPY(oqctl, qctl);
988                 rc = obd_quotactl(tgt->ltd_exp, oqctl);
989                 if (rc == 0) {
990                         QCTL_COPY(qctl, oqctl);
991                         qctl->qc_valid = QC_MDTIDX;
992                         qctl->obd_uuid = tgt->ltd_uuid;
993                 }
994                 kfree(oqctl);
995                 break;
996         }
997         case OBD_IOC_CHANGELOG_SEND:
998         case OBD_IOC_CHANGELOG_CLEAR: {
999                 struct ioc_changelog *icc = karg;
1000
1001                 if (icc->icc_mdtindex >= count)
1002                         return -ENODEV;
1003
1004                 tgt = lmv->tgts[icc->icc_mdtindex];
1005                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
1006                         return -ENODEV;
1007                 rc = obd_iocontrol(cmd, tgt->ltd_exp, sizeof(*icc), icc, NULL);
1008                 break;
1009         }
1010         case LL_IOC_GET_CONNECT_FLAGS: {
1011                 tgt = lmv->tgts[0];
1012
1013                 if (!tgt || !tgt->ltd_exp)
1014                         return -ENODATA;
1015                 rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1016                 break;
1017         }
1018         case OBD_IOC_FID2PATH: {
1019                 rc = lmv_fid2path(exp, len, karg, uarg);
1020                 break;
1021         }
1022         case LL_IOC_HSM_STATE_GET:
1023         case LL_IOC_HSM_STATE_SET:
1024         case LL_IOC_HSM_ACTION: {
1025                 struct md_op_data       *op_data = karg;
1026
1027                 tgt = lmv_find_target(lmv, &op_data->op_fid1);
1028                 if (IS_ERR(tgt))
1029                         return PTR_ERR(tgt);
1030
1031                 if (!tgt->ltd_exp)
1032                         return -EINVAL;
1033
1034                 rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1035                 break;
1036         }
1037         case LL_IOC_HSM_PROGRESS: {
1038                 const struct hsm_progress_kernel *hpk = karg;
1039
1040                 tgt = lmv_find_target(lmv, &hpk->hpk_fid);
1041                 if (IS_ERR(tgt))
1042                         return PTR_ERR(tgt);
1043                 rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1044                 break;
1045         }
1046         case LL_IOC_HSM_REQUEST: {
1047                 struct hsm_user_request *hur = karg;
1048                 unsigned int reqcount = hur->hur_request.hr_itemcount;
1049
1050                 if (reqcount == 0)
1051                         return 0;
1052
1053                 /* if the request is about a single fid
1054                  * or if there is a single MDS, no need to split
1055                  * the request.
1056                  */
1057                 if (reqcount == 1 || count == 1) {
1058                         tgt = lmv_find_target(lmv,
1059                                               &hur->hur_user_item[0].hui_fid);
1060                         if (IS_ERR(tgt))
1061                                 return PTR_ERR(tgt);
1062                         rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1063                 } else {
1064                         /* split fid list to their respective MDS */
1065                         for (i = 0; i < count; i++) {
1066                                 unsigned int            nr, reqlen;
1067                                 int                     rc1;
1068                                 struct hsm_user_request *req;
1069
1070                                 tgt = lmv->tgts[i];
1071                                 if (!tgt || !tgt->ltd_exp)
1072                                         continue;
1073
1074                                 nr = lmv_hsm_req_count(lmv, hur, tgt);
1075                                 if (nr == 0) /* nothing for this MDS */
1076                                         continue;
1077
1078                                 /* build a request with fids for this MDS */
1079                                 reqlen = offsetof(typeof(*hur),
1080                                                   hur_user_item[nr])
1081                                          + hur->hur_request.hr_data_len;
1082                                 req = libcfs_kvzalloc(reqlen, GFP_NOFS);
1083                                 if (!req)
1084                                         return -ENOMEM;
1085
1086                                 lmv_hsm_req_build(lmv, hur, tgt, req);
1087
1088                                 rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen,
1089                                                     req, uarg);
1090                                 if (rc1 != 0 && rc == 0)
1091                                         rc = rc1;
1092                                 kvfree(req);
1093                         }
1094                 }
1095                 break;
1096         }
1097         case LL_IOC_LOV_SWAP_LAYOUTS: {
1098                 struct md_op_data       *op_data = karg;
1099                 struct lmv_tgt_desc     *tgt1, *tgt2;
1100
1101                 tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
1102                 if (IS_ERR(tgt1))
1103                         return PTR_ERR(tgt1);
1104
1105                 tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
1106                 if (IS_ERR(tgt2))
1107                         return PTR_ERR(tgt2);
1108
1109                 if (!tgt1->ltd_exp || !tgt2->ltd_exp)
1110                         return -EINVAL;
1111
1112                 /* only files on same MDT can have their layouts swapped */
1113                 if (tgt1->ltd_idx != tgt2->ltd_idx)
1114                         return -EPERM;
1115
1116                 rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg);
1117                 break;
1118         }
1119         case LL_IOC_HSM_CT_START: {
1120                 struct lustre_kernelcomm *lk = karg;
1121
1122                 if (lk->lk_flags & LK_FLG_STOP)
1123                         rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg);
1124                 else
1125                         rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg);
1126                 break;
1127         }
1128         default:
1129                 for (i = 0; i < count; i++) {
1130                         struct obd_device *mdc_obd;
1131                         int err;
1132
1133                         tgt = lmv->tgts[i];
1134                         if (!tgt || !tgt->ltd_exp)
1135                                 continue;
1136                         /* ll_umount_begin() sets force flag but for lmv, not
1137                          * mdc. Let's pass it through
1138                          */
1139                         mdc_obd = class_exp2obd(tgt->ltd_exp);
1140                         mdc_obd->obd_force = obddev->obd_force;
1141                         err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg);
1142                         if (err == -ENODATA && cmd == OBD_IOC_POLL_QUOTACHECK) {
1143                                 return err;
1144                         } else if (err) {
1145                                 if (tgt->ltd_active) {
1146                                         CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n",
1147                                                tgt->ltd_uuid.uuid, i, cmd, err);
1148                                         if (!rc)
1149                                                 rc = err;
1150                                 }
1151                         } else {
1152                                 set = 1;
1153                         }
1154                 }
1155                 if (!set && !rc)
1156                         rc = -EIO;
1157         }
1158         return rc;
1159 }
1160
1161 /**
1162  * This is _inode_ placement policy function (not name).
1163  */
1164 static int lmv_placement_policy(struct obd_device *obd,
1165                                 struct md_op_data *op_data, u32 *mds)
1166 {
1167         struct lmv_obd    *lmv = &obd->u.lmv;
1168
1169         LASSERT(mds);
1170
1171         if (lmv->desc.ld_tgt_count == 1) {
1172                 *mds = 0;
1173                 return 0;
1174         }
1175
1176         /**
1177          * If stripe_offset is provided during setdirstripe
1178          * (setdirstripe -i xx), xx MDS will be chosen.
1179          */
1180         if (op_data->op_cli_flags & CLI_SET_MEA) {
1181                 struct lmv_user_md *lum;
1182
1183                 lum = (struct lmv_user_md *)op_data->op_data;
1184                 if (lum->lum_type == LMV_STRIPE_TYPE &&
1185                     lum->lum_stripe_offset != -1) {
1186                         if (lum->lum_stripe_offset >= lmv->desc.ld_tgt_count) {
1187                                 CERROR("%s: Stripe_offset %d > MDT count %d: rc = %d\n",
1188                                        obd->obd_name,
1189                                        lum->lum_stripe_offset,
1190                                        lmv->desc.ld_tgt_count, -ERANGE);
1191                                 return -ERANGE;
1192                         }
1193                         *mds = lum->lum_stripe_offset;
1194                         return 0;
1195                 }
1196         }
1197
1198         /* Allocate new fid on target according to operation type and parent
1199          * home mds.
1200          */
1201         *mds = op_data->op_mds;
1202         return 0;
1203 }
1204
1205 int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds)
1206 {
1207         struct lmv_tgt_desc     *tgt;
1208         int                      rc;
1209
1210         tgt = lmv_get_target(lmv, mds);
1211         if (IS_ERR(tgt))
1212                 return PTR_ERR(tgt);
1213
1214         /*
1215          * New seq alloc and FLD setup should be atomic. Otherwise we may find
1216          * on server that seq in new allocated fid is not yet known.
1217          */
1218         mutex_lock(&tgt->ltd_fid_mutex);
1219
1220         if (tgt->ltd_active == 0 || !tgt->ltd_exp) {
1221                 rc = -ENODEV;
1222                 goto out;
1223         }
1224
1225         /*
1226          * Asking underlaying tgt layer to allocate new fid.
1227          */
1228         rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
1229         if (rc > 0) {
1230                 LASSERT(fid_is_sane(fid));
1231                 rc = 0;
1232         }
1233
1234 out:
1235         mutex_unlock(&tgt->ltd_fid_mutex);
1236         return rc;
1237 }
1238
1239 int lmv_fid_alloc(struct obd_export *exp, struct lu_fid *fid,
1240                   struct md_op_data *op_data)
1241 {
1242         struct obd_device     *obd = class_exp2obd(exp);
1243         struct lmv_obd  *lmv = &obd->u.lmv;
1244         u32                    mds = 0;
1245         int                 rc;
1246
1247         LASSERT(op_data);
1248         LASSERT(fid);
1249
1250         rc = lmv_placement_policy(obd, op_data, &mds);
1251         if (rc) {
1252                 CERROR("Can't get target for allocating fid, rc %d\n",
1253                        rc);
1254                 return rc;
1255         }
1256
1257         rc = __lmv_fid_alloc(lmv, fid, mds);
1258         if (rc) {
1259                 CERROR("Can't alloc new fid, rc %d\n", rc);
1260                 return rc;
1261         }
1262
1263         return rc;
1264 }
1265
1266 static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
1267 {
1268         struct lmv_obd       *lmv = &obd->u.lmv;
1269         struct lprocfs_static_vars  lvars = { NULL };
1270         struct lmv_desc     *desc;
1271         int                      rc;
1272
1273         if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
1274                 CERROR("LMV setup requires a descriptor\n");
1275                 return -EINVAL;
1276         }
1277
1278         desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1);
1279         if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) {
1280                 CERROR("Lmv descriptor size wrong: %d > %d\n",
1281                        (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1));
1282                 return -EINVAL;
1283         }
1284
1285         lmv->tgts = kcalloc(32, sizeof(*lmv->tgts), GFP_NOFS);
1286         if (!lmv->tgts)
1287                 return -ENOMEM;
1288         lmv->tgts_size = 32;
1289
1290         obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid);
1291         lmv->desc.ld_tgt_count = 0;
1292         lmv->desc.ld_active_tgt_count = 0;
1293         lmv->max_cookiesize = 0;
1294         lmv->max_def_easize = 0;
1295         lmv->max_easize = 0;
1296         lmv->lmv_placement = PLACEMENT_CHAR_POLICY;
1297
1298         spin_lock_init(&lmv->lmv_lock);
1299         mutex_init(&lmv->lmv_init_mutex);
1300
1301         lprocfs_lmv_init_vars(&lvars);
1302
1303         lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars);
1304         rc = ldebugfs_seq_create(obd->obd_debugfs_entry, "target_obd",
1305                                  0444, &lmv_proc_target_fops, obd);
1306         if (rc)
1307                 CWARN("%s: error adding LMV target_obd file: rc = %d\n",
1308                       obd->obd_name, rc);
1309         rc = fld_client_init(&lmv->lmv_fld, obd->obd_name,
1310                              LUSTRE_CLI_FLD_HASH_DHT);
1311         if (rc) {
1312                 CERROR("Can't init FLD, err %d\n", rc);
1313                 goto out;
1314         }
1315
1316         return 0;
1317
1318 out:
1319         return rc;
1320 }
1321
1322 static int lmv_cleanup(struct obd_device *obd)
1323 {
1324         struct lmv_obd   *lmv = &obd->u.lmv;
1325
1326         fld_client_fini(&lmv->lmv_fld);
1327         if (lmv->tgts) {
1328                 int i;
1329
1330                 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1331                         if (!lmv->tgts[i])
1332                                 continue;
1333                         lmv_del_target(lmv, i);
1334                 }
1335                 kfree(lmv->tgts);
1336                 lmv->tgts_size = 0;
1337         }
1338         return 0;
1339 }
1340
1341 static int lmv_process_config(struct obd_device *obd, u32 len, void *buf)
1342 {
1343         struct lustre_cfg       *lcfg = buf;
1344         struct obd_uuid         obd_uuid;
1345         int                     gen;
1346         __u32                   index;
1347         int                     rc;
1348
1349         switch (lcfg->lcfg_command) {
1350         case LCFG_ADD_MDC:
1351                 /* modify_mdc_tgts add 0:lustre-clilmv  1:lustre-MDT0000_UUID
1352                  * 2:0  3:1  4:lustre-MDT0000-mdc_UUID
1353                  */
1354                 if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) {
1355                         rc = -EINVAL;
1356                         goto out;
1357                 }
1358
1359                 obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
1360
1361                 if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1) {
1362                         rc = -EINVAL;
1363                         goto out;
1364                 }
1365                 if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) {
1366                         rc = -EINVAL;
1367                         goto out;
1368                 }
1369                 rc = lmv_add_target(obd, &obd_uuid, index, gen);
1370                 goto out;
1371         default:
1372                 CERROR("Unknown command: %d\n", lcfg->lcfg_command);
1373                 rc = -EINVAL;
1374                 goto out;
1375         }
1376 out:
1377         return rc;
1378 }
1379
1380 static int lmv_statfs(const struct lu_env *env, struct obd_export *exp,
1381                       struct obd_statfs *osfs, __u64 max_age, __u32 flags)
1382 {
1383         struct obd_device     *obd = class_exp2obd(exp);
1384         struct lmv_obd  *lmv = &obd->u.lmv;
1385         struct obd_statfs     *temp;
1386         int                 rc = 0;
1387         int                 i;
1388
1389         rc = lmv_check_connect(obd);
1390         if (rc)
1391                 return rc;
1392
1393         temp = kzalloc(sizeof(*temp), GFP_NOFS);
1394         if (!temp)
1395                 return -ENOMEM;
1396
1397         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1398                 if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp)
1399                         continue;
1400
1401                 rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp,
1402                                 max_age, flags);
1403                 if (rc) {
1404                         CERROR("can't stat MDS #%d (%s), error %d\n", i,
1405                                lmv->tgts[i]->ltd_exp->exp_obd->obd_name,
1406                                rc);
1407                         goto out_free_temp;
1408                 }
1409
1410                 if (i == 0) {
1411                         *osfs = *temp;
1412                         /* If the statfs is from mount, it will needs
1413                          * retrieve necessary information from MDT0.
1414                          * i.e. mount does not need the merged osfs
1415                          * from all of MDT.
1416                          * And also clients can be mounted as long as
1417                          * MDT0 is in service
1418                          */
1419                         if (flags & OBD_STATFS_FOR_MDT0)
1420                                 goto out_free_temp;
1421                 } else {
1422                         osfs->os_bavail += temp->os_bavail;
1423                         osfs->os_blocks += temp->os_blocks;
1424                         osfs->os_ffree += temp->os_ffree;
1425                         osfs->os_files += temp->os_files;
1426                 }
1427         }
1428
1429 out_free_temp:
1430         kfree(temp);
1431         return rc;
1432 }
1433
1434 static int lmv_getstatus(struct obd_export *exp,
1435                          struct lu_fid *fid)
1436 {
1437         struct obd_device    *obd = exp->exp_obd;
1438         struct lmv_obd       *lmv = &obd->u.lmv;
1439         int                rc;
1440
1441         rc = lmv_check_connect(obd);
1442         if (rc)
1443                 return rc;
1444
1445         rc = md_getstatus(lmv->tgts[0]->ltd_exp, fid);
1446         return rc;
1447 }
1448
1449 static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid,
1450                         u64 valid, const char *name,
1451                         const char *input, int input_size, int output_size,
1452                         int flags, struct ptlrpc_request **request)
1453 {
1454         struct obd_device      *obd = exp->exp_obd;
1455         struct lmv_obd   *lmv = &obd->u.lmv;
1456         struct lmv_tgt_desc    *tgt;
1457         int                  rc;
1458
1459         rc = lmv_check_connect(obd);
1460         if (rc)
1461                 return rc;
1462
1463         tgt = lmv_find_target(lmv, fid);
1464         if (IS_ERR(tgt))
1465                 return PTR_ERR(tgt);
1466
1467         rc = md_getxattr(tgt->ltd_exp, fid, valid, name, input,
1468                          input_size, output_size, flags, request);
1469
1470         return rc;
1471 }
1472
1473 static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
1474                         u64 valid, const char *name,
1475                         const char *input, int input_size, int output_size,
1476                         int flags, __u32 suppgid,
1477                         struct ptlrpc_request **request)
1478 {
1479         struct obd_device      *obd = exp->exp_obd;
1480         struct lmv_obd   *lmv = &obd->u.lmv;
1481         struct lmv_tgt_desc    *tgt;
1482         int                  rc;
1483
1484         rc = lmv_check_connect(obd);
1485         if (rc)
1486                 return rc;
1487
1488         tgt = lmv_find_target(lmv, fid);
1489         if (IS_ERR(tgt))
1490                 return PTR_ERR(tgt);
1491
1492         rc = md_setxattr(tgt->ltd_exp, fid, valid, name, input,
1493                          input_size, output_size, flags, suppgid,
1494                          request);
1495
1496         return rc;
1497 }
1498
1499 static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data,
1500                        struct ptlrpc_request **request)
1501 {
1502         struct obd_device       *obd = exp->exp_obd;
1503         struct lmv_obd    *lmv = &obd->u.lmv;
1504         struct lmv_tgt_desc     *tgt;
1505         int                   rc;
1506
1507         rc = lmv_check_connect(obd);
1508         if (rc)
1509                 return rc;
1510
1511         tgt = lmv_find_target(lmv, &op_data->op_fid1);
1512         if (IS_ERR(tgt))
1513                 return PTR_ERR(tgt);
1514
1515         if (op_data->op_flags & MF_GET_MDT_IDX) {
1516                 op_data->op_mds = tgt->ltd_idx;
1517                 return 0;
1518         }
1519
1520         rc = md_getattr(tgt->ltd_exp, op_data, request);
1521
1522         return rc;
1523 }
1524
1525 static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid)
1526 {
1527         struct obd_device   *obd = exp->exp_obd;
1528         struct lmv_obd      *lmv = &obd->u.lmv;
1529         int               i;
1530         int               rc;
1531
1532         rc = lmv_check_connect(obd);
1533         if (rc)
1534                 return rc;
1535
1536         CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
1537
1538         /*
1539          * With DNE every object can have two locks in different namespaces:
1540          * lookup lock in space of MDT storing direntry and update/open lock in
1541          * space of MDT storing inode.
1542          */
1543         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1544                 if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp)
1545                         continue;
1546                 md_null_inode(lmv->tgts[i]->ltd_exp, fid);
1547         }
1548
1549         return 0;
1550 }
1551
1552 static int lmv_find_cbdata(struct obd_export *exp, const struct lu_fid *fid,
1553                            ldlm_iterator_t it, void *data)
1554 {
1555         struct obd_device   *obd = exp->exp_obd;
1556         struct lmv_obd      *lmv = &obd->u.lmv;
1557         int               i;
1558         int               rc;
1559
1560         rc = lmv_check_connect(obd);
1561         if (rc)
1562                 return rc;
1563
1564         CDEBUG(D_INODE, "CBDATA for "DFID"\n", PFID(fid));
1565
1566         /*
1567          * With DNE every object can have two locks in different namespaces:
1568          * lookup lock in space of MDT storing direntry and update/open lock in
1569          * space of MDT storing inode.
1570          */
1571         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
1572                 if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp)
1573                         continue;
1574                 rc = md_find_cbdata(lmv->tgts[i]->ltd_exp, fid, it, data);
1575                 if (rc)
1576                         return rc;
1577         }
1578
1579         return rc;
1580 }
1581
1582 static int lmv_close(struct obd_export *exp, struct md_op_data *op_data,
1583                      struct md_open_data *mod, struct ptlrpc_request **request)
1584 {
1585         struct obd_device     *obd = exp->exp_obd;
1586         struct lmv_obd  *lmv = &obd->u.lmv;
1587         struct lmv_tgt_desc   *tgt;
1588         int                 rc;
1589
1590         rc = lmv_check_connect(obd);
1591         if (rc)
1592                 return rc;
1593
1594         tgt = lmv_find_target(lmv, &op_data->op_fid1);
1595         if (IS_ERR(tgt))
1596                 return PTR_ERR(tgt);
1597
1598         CDEBUG(D_INODE, "CLOSE "DFID"\n", PFID(&op_data->op_fid1));
1599         rc = md_close(tgt->ltd_exp, op_data, mod, request);
1600         return rc;
1601 }
1602
1603 struct lmv_tgt_desc
1604 *lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
1605                 struct lu_fid *fid)
1606 {
1607         struct lmv_tgt_desc *tgt;
1608
1609         tgt = lmv_find_target(lmv, fid);
1610         if (IS_ERR(tgt))
1611                 return tgt;
1612
1613         op_data->op_mds = tgt->ltd_idx;
1614
1615         return tgt;
1616 }
1617
1618 static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
1619                       const void *data, int datalen, int mode, __u32 uid,
1620                       __u32 gid, cfs_cap_t cap_effective, __u64 rdev,
1621                       struct ptlrpc_request **request)
1622 {
1623         struct obd_device       *obd = exp->exp_obd;
1624         struct lmv_obd    *lmv = &obd->u.lmv;
1625         struct lmv_tgt_desc     *tgt;
1626         int                   rc;
1627
1628         rc = lmv_check_connect(obd);
1629         if (rc)
1630                 return rc;
1631
1632         if (!lmv->desc.ld_active_tgt_count)
1633                 return -EIO;
1634
1635         tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1636         if (IS_ERR(tgt))
1637                 return PTR_ERR(tgt);
1638
1639         rc = lmv_fid_alloc(exp, &op_data->op_fid2, op_data);
1640         if (rc)
1641                 return rc;
1642
1643         CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n",
1644                op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
1645                op_data->op_mds);
1646
1647         op_data->op_flags |= MF_MDC_CANCEL_FID1;
1648         rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid,
1649                        cap_effective, rdev, request);
1650
1651         if (rc == 0) {
1652                 if (!*request)
1653                         return rc;
1654                 CDEBUG(D_INODE, "Created - "DFID"\n", PFID(&op_data->op_fid2));
1655         }
1656         return rc;
1657 }
1658
1659 static int lmv_done_writing(struct obd_export *exp,
1660                             struct md_op_data *op_data,
1661                             struct md_open_data *mod)
1662 {
1663         struct obd_device     *obd = exp->exp_obd;
1664         struct lmv_obd  *lmv = &obd->u.lmv;
1665         struct lmv_tgt_desc   *tgt;
1666         int                 rc;
1667
1668         rc = lmv_check_connect(obd);
1669         if (rc)
1670                 return rc;
1671
1672         tgt = lmv_find_target(lmv, &op_data->op_fid1);
1673         if (IS_ERR(tgt))
1674                 return PTR_ERR(tgt);
1675
1676         rc = md_done_writing(tgt->ltd_exp, op_data, mod);
1677         return rc;
1678 }
1679
1680 static int
1681 lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
1682                    struct lookup_intent *it, struct md_op_data *op_data,
1683                    struct lustre_handle *lockh, void *lmm, int lmmsize,
1684                    __u64 extra_lock_flags)
1685 {
1686         struct ptlrpc_request      *req = it->d.lustre.it_data;
1687         struct obd_device         *obd = exp->exp_obd;
1688         struct lmv_obd       *lmv = &obd->u.lmv;
1689         struct lustre_handle    plock;
1690         struct lmv_tgt_desc     *tgt;
1691         struct md_op_data         *rdata;
1692         struct lu_fid          fid1;
1693         struct mdt_body     *body;
1694         int                      rc = 0;
1695         int                      pmode;
1696
1697         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1698
1699         if (!(body->valid & OBD_MD_MDS))
1700                 return 0;
1701
1702         CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
1703                LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
1704
1705         /*
1706          * We got LOOKUP lock, but we really need attrs.
1707          */
1708         pmode = it->d.lustre.it_lock_mode;
1709         LASSERT(pmode != 0);
1710         memcpy(&plock, lockh, sizeof(plock));
1711         it->d.lustre.it_lock_mode = 0;
1712         it->d.lustre.it_data = NULL;
1713         fid1 = body->fid1;
1714
1715         ptlrpc_req_finished(req);
1716
1717         tgt = lmv_find_target(lmv, &fid1);
1718         if (IS_ERR(tgt)) {
1719                 rc = PTR_ERR(tgt);
1720                 goto out;
1721         }
1722
1723         rdata = kzalloc(sizeof(*rdata), GFP_NOFS);
1724         if (!rdata) {
1725                 rc = -ENOMEM;
1726                 goto out;
1727         }
1728
1729         rdata->op_fid1 = fid1;
1730         rdata->op_bias = MDS_CROSS_REF;
1731
1732         rc = md_enqueue(tgt->ltd_exp, einfo, it, rdata, lockh,
1733                         lmm, lmmsize, NULL, extra_lock_flags);
1734         kfree(rdata);
1735 out:
1736         ldlm_lock_decref(&plock, pmode);
1737         return rc;
1738 }
1739
1740 static int
1741 lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
1742             struct lookup_intent *it, struct md_op_data *op_data,
1743             struct lustre_handle *lockh, void *lmm, int lmmsize,
1744             struct ptlrpc_request **req, __u64 extra_lock_flags)
1745 {
1746         struct obd_device       *obd = exp->exp_obd;
1747         struct lmv_obd     *lmv = &obd->u.lmv;
1748         struct lmv_tgt_desc      *tgt;
1749         int                    rc;
1750
1751         rc = lmv_check_connect(obd);
1752         if (rc)
1753                 return rc;
1754
1755         CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID"\n",
1756                LL_IT2STR(it), PFID(&op_data->op_fid1));
1757
1758         tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1759         if (IS_ERR(tgt))
1760                 return PTR_ERR(tgt);
1761
1762         CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
1763                LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
1764
1765         rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
1766                         lmm, lmmsize, req, extra_lock_flags);
1767
1768         if (rc == 0 && it && it->it_op == IT_OPEN) {
1769                 rc = lmv_enqueue_remote(exp, einfo, it, op_data, lockh,
1770                                         lmm, lmmsize, extra_lock_flags);
1771         }
1772         return rc;
1773 }
1774
1775 static int
1776 lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data,
1777                  struct ptlrpc_request **request)
1778 {
1779         struct ptlrpc_request   *req = NULL;
1780         struct obd_device       *obd = exp->exp_obd;
1781         struct lmv_obd    *lmv = &obd->u.lmv;
1782         struct lmv_tgt_desc     *tgt;
1783         struct mdt_body  *body;
1784         int                   rc;
1785
1786         rc = lmv_check_connect(obd);
1787         if (rc)
1788                 return rc;
1789
1790         tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1791         if (IS_ERR(tgt))
1792                 return PTR_ERR(tgt);
1793
1794         CDEBUG(D_INODE, "GETATTR_NAME for %*s on "DFID" -> mds #%d\n",
1795                op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
1796                tgt->ltd_idx);
1797
1798         rc = md_getattr_name(tgt->ltd_exp, op_data, request);
1799         if (rc != 0)
1800                 return rc;
1801
1802         body = req_capsule_server_get(&(*request)->rq_pill,
1803                                       &RMF_MDT_BODY);
1804
1805         if (body->valid & OBD_MD_MDS) {
1806                 struct lu_fid rid = body->fid1;
1807
1808                 CDEBUG(D_INODE, "Request attrs for "DFID"\n",
1809                        PFID(&rid));
1810
1811                 tgt = lmv_find_target(lmv, &rid);
1812                 if (IS_ERR(tgt)) {
1813                         ptlrpc_req_finished(*request);
1814                         return PTR_ERR(tgt);
1815                 }
1816
1817                 op_data->op_fid1 = rid;
1818                 op_data->op_valid |= OBD_MD_FLCROSSREF;
1819                 op_data->op_namelen = 0;
1820                 op_data->op_name = NULL;
1821                 rc = md_getattr_name(tgt->ltd_exp, op_data, &req);
1822                 ptlrpc_req_finished(*request);
1823                 *request = req;
1824         }
1825
1826         return rc;
1827 }
1828
/*
 * Map a single MF_MDC_CANCEL_FIDn flag value to the address of the
 * matching op_fidN member of @op_data; evaluates to NULL for any other
 * value of @fl.
 *
 * Both arguments are parenthesized so expressions such as "flags & X" or
 * "ptr + 0" expand correctly. @fl may be evaluated up to four times, so
 * do not pass an expression with side effects.
 */
#define md_op_data_fid(op_data, fl)				\
	((fl) == MF_MDC_CANCEL_FID1 ? &(op_data)->op_fid1 :	\
	 (fl) == MF_MDC_CANCEL_FID2 ? &(op_data)->op_fid2 :	\
	 (fl) == MF_MDC_CANCEL_FID3 ? &(op_data)->op_fid3 :	\
	 (fl) == MF_MDC_CANCEL_FID4 ? &(op_data)->op_fid4 :	\
	 NULL)
1835
1836 static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
1837                             int op_tgt, enum ldlm_mode mode, int bits,
1838                             int flag)
1839 {
1840         struct lu_fid     *fid = md_op_data_fid(op_data, flag);
1841         struct obd_device      *obd = exp->exp_obd;
1842         struct lmv_obd   *lmv = &obd->u.lmv;
1843         struct lmv_tgt_desc    *tgt;
1844         ldlm_policy_data_t      policy = { {0} };
1845         int                  rc = 0;
1846
1847         if (!fid_is_sane(fid))
1848                 return 0;
1849
1850         tgt = lmv_find_target(lmv, fid);
1851         if (IS_ERR(tgt))
1852                 return PTR_ERR(tgt);
1853
1854         if (tgt->ltd_idx != op_tgt) {
1855                 CDEBUG(D_INODE, "EARLY_CANCEL on "DFID"\n", PFID(fid));
1856                 policy.l_inodebits.bits = bits;
1857                 rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
1858                                       mode, LCF_ASYNC, NULL);
1859         } else {
1860                 CDEBUG(D_INODE,
1861                        "EARLY_CANCEL skip operation target %d on "DFID"\n",
1862                        op_tgt, PFID(fid));
1863                 op_data->op_flags |= flag;
1864                 rc = 0;
1865         }
1866
1867         return rc;
1868 }
1869
1870 /*
1871  * llite passes fid of an target inode in op_data->op_fid1 and id of directory in
1872  * op_data->op_fid2
1873  */
1874 static int lmv_link(struct obd_export *exp, struct md_op_data *op_data,
1875                     struct ptlrpc_request **request)
1876 {
1877         struct obd_device       *obd = exp->exp_obd;
1878         struct lmv_obd    *lmv = &obd->u.lmv;
1879         struct lmv_tgt_desc     *tgt;
1880         int                   rc;
1881
1882         rc = lmv_check_connect(obd);
1883         if (rc)
1884                 return rc;
1885
1886         LASSERT(op_data->op_namelen != 0);
1887
1888         CDEBUG(D_INODE, "LINK "DFID":%*s to "DFID"\n",
1889                PFID(&op_data->op_fid2), op_data->op_namelen,
1890                op_data->op_name, PFID(&op_data->op_fid1));
1891
1892         op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
1893         op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
1894         op_data->op_cap = cfs_curproc_cap_pack();
1895         tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
1896         if (IS_ERR(tgt))
1897                 return PTR_ERR(tgt);
1898
1899         /*
1900          * Cancel UPDATE lock on child (fid1).
1901          */
1902         op_data->op_flags |= MF_MDC_CANCEL_FID2;
1903         rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
1904                               MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1);
1905         if (rc != 0)
1906                 return rc;
1907
1908         rc = md_link(tgt->ltd_exp, op_data, request);
1909
1910         return rc;
1911 }
1912
1913 static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data,
1914                       const char *old, int oldlen, const char *new, int newlen,
1915                       struct ptlrpc_request **request)
1916 {
1917         struct obd_device       *obd = exp->exp_obd;
1918         struct lmv_obd    *lmv = &obd->u.lmv;
1919         struct lmv_tgt_desc     *src_tgt;
1920         struct lmv_tgt_desc     *tgt_tgt;
1921         int                     rc;
1922
1923         LASSERT(oldlen != 0);
1924
1925         CDEBUG(D_INODE, "RENAME %*s in "DFID" to %*s in "DFID"\n",
1926                oldlen, old, PFID(&op_data->op_fid1),
1927                newlen, new, PFID(&op_data->op_fid2));
1928
1929         rc = lmv_check_connect(obd);
1930         if (rc)
1931                 return rc;
1932
1933         op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
1934         op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
1935         op_data->op_cap = cfs_curproc_cap_pack();
1936         src_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
1937         if (IS_ERR(src_tgt))
1938                 return PTR_ERR(src_tgt);
1939
1940         tgt_tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
1941         if (IS_ERR(tgt_tgt))
1942                 return PTR_ERR(tgt_tgt);
1943         /*
1944          * LOOKUP lock on src child (fid3) should also be cancelled for
1945          * src_tgt in mdc_rename.
1946          */
1947         op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
1948
1949         /*
1950          * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
1951          * own target.
1952          */
1953         rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
1954                               LCK_EX, MDS_INODELOCK_UPDATE,
1955                               MF_MDC_CANCEL_FID2);
1956
1957         /*
1958          * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
1959          */
1960         if (rc == 0) {
1961                 rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
1962                                       LCK_EX, MDS_INODELOCK_LOOKUP,
1963                                       MF_MDC_CANCEL_FID4);
1964         }
1965
1966         /*
1967          * Cancel all the locks on tgt child (fid4).
1968          */
1969         if (rc == 0)
1970                 rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
1971                                       LCK_EX, MDS_INODELOCK_FULL,
1972                                       MF_MDC_CANCEL_FID4);
1973
1974         if (rc == 0)
1975                 rc = md_rename(src_tgt->ltd_exp, op_data, old, oldlen,
1976                                new, newlen, request);
1977         return rc;
1978 }
1979
1980 static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
1981                        void *ea, int ealen, void *ea2, int ea2len,
1982                        struct ptlrpc_request **request,
1983                        struct md_open_data **mod)
1984 {
1985         struct obd_device       *obd = exp->exp_obd;
1986         struct lmv_obd    *lmv = &obd->u.lmv;
1987         struct lmv_tgt_desc     *tgt;
1988         int                   rc;
1989
1990         rc = lmv_check_connect(obd);
1991         if (rc)
1992                 return rc;
1993
1994         CDEBUG(D_INODE, "SETATTR for "DFID", valid 0x%x\n",
1995                PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
1996
1997         op_data->op_flags |= MF_MDC_CANCEL_FID1;
1998         tgt = lmv_find_target(lmv, &op_data->op_fid1);
1999         if (IS_ERR(tgt))
2000                 return PTR_ERR(tgt);
2001
2002         rc = md_setattr(tgt->ltd_exp, op_data, ea, ealen, ea2,
2003                         ea2len, request, mod);
2004
2005         return rc;
2006 }
2007
2008 static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
2009                     struct ptlrpc_request **request)
2010 {
2011         struct obd_device        *obd = exp->exp_obd;
2012         struct lmv_obd      *lmv = &obd->u.lmv;
2013         struct lmv_tgt_desc       *tgt;
2014         int                     rc;
2015
2016         rc = lmv_check_connect(obd);
2017         if (rc)
2018                 return rc;
2019
2020         tgt = lmv_find_target(lmv, fid);
2021         if (IS_ERR(tgt))
2022                 return PTR_ERR(tgt);
2023
2024         rc = md_sync(tgt->ltd_exp, fid, request);
2025         return rc;
2026 }
2027
2028 /*
2029  * Adjust a set of pages, each page containing an array of lu_dirpages,
2030  * so that each page can be used as a single logical lu_dirpage.
2031  *
2032  * A lu_dirpage is laid out as follows, where s = ldp_hash_start,
2033  * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a
2034  * struct lu_dirent.  It has size up to LU_PAGE_SIZE. The ldp_hash_end
2035  * value is used as a cookie to request the next lu_dirpage in a
2036  * directory listing that spans multiple pages (two in this example):
2037  *   ________
2038  *  |   |
2039  * .|--------v-------   -----.
2040  * |s|e|f|p|ent|ent| ... |ent|
2041  * '--|--------------   -----'   Each CFS_PAGE contains a single
2042  *    '------.             lu_dirpage.
2043  * .---------v-------   -----.
2044  * |s|e|f|p|ent| 0 | ... | 0 |
2045  * '-----------------   -----'
2046  *
2047  * However, on hosts where the native VM page size (PAGE_SIZE) is
2048  * larger than LU_PAGE_SIZE, a single host page may contain multiple
2049  * lu_dirpages. After reading the lu_dirpages from the MDS, the
2050  * ldp_hash_end of the first lu_dirpage refers to the one immediately
2051  * after it in the same CFS_PAGE (arrows simplified for brevity, but
2052  * in general e0==s1, e1==s2, etc.):
2053  *
2054  * .--------------------   -----.
2055  * |s0|e0|f0|p|ent|ent| ... |ent|
2056  * |---v----------------   -----|
2057  * |s1|e1|f1|p|ent|ent| ... |ent|
2058  * |---v----------------   -----|  Here, each CFS_PAGE contains
2059  *           ...                 multiple lu_dirpages.
2060  * |---v----------------   -----|
2061  * |s'|e'|f'|p|ent|ent| ... |ent|
2062  * '---|----------------   -----'
2063  *     v
2064  * .----------------------------.
2065  * |    next CFS_PAGE       |
2066  *
2067  * This structure is transformed into a single logical lu_dirpage as follows:
2068  *
2069  * - Replace e0 with e' so the request for the next lu_dirpage gets the page
2070  *   labeled 'next CFS_PAGE'.
2071  *
2072  * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether
2073  *   a hash collision with the next page exists.
2074  *
2075  * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span
2076  *   to the first entry of the next lu_dirpage.
2077  */
#if PAGE_SIZE > LU_PAGE_SIZE
/*
 * Walk @ncfspgs host pages holding @nlupgs lu_dirpages in total and make
 * every host page look like one logical lu_dirpage.
 *
 * For each host page the lu_dirpages inside it are visited in order. The
 * last entry of each visited lu_dirpage gets its lde_reclen (asserted to
 * be 0 beforehand) set to the distance to the first entry of the following
 * lu_dirpage. The hash end and flags of the last lu_dirpage seen in the
 * host page are remembered; afterwards the hash end is stored into the
 * first lu_dirpage and its LDF_COLLIDE bit is replaced by the remembered
 * one. @nlupgs is counted down across all host pages and must have reached
 * exactly 0 at the end.
 */
static void lmv_adjust_dirpages(struct page **pages, int ncfspgs, int nlupgs)
{
	int pg;

	for (pg = 0; pg < ncfspgs; pg++) {
		struct lu_dirpage *cur = kmap(pages[pg]);
		struct lu_dirpage *head = cur;
		struct lu_dirent *last = NULL;
		struct lu_dirent *scan;
		__u64 end_hash = cur->ldp_hash_end;
		__u32 end_flags = cur->ldp_flags;

		for (;;) {
			/* One lu_dirpage is consumed per pass. */
			if (--nlupgs <= 0)
				break;

			/* Find the final entry of the current lu_dirpage. */
			last = NULL;
			for (scan = lu_dirent_start(cur); scan;
			     scan = lu_dirent_next(scan))
				last = scan;

			/* Advance to next lu_dirpage. */
			cur = (struct lu_dirpage *)((char *)cur + LU_PAGE_SIZE);

			/* Check if we've reached the end of the CFS_PAGE. */
			if (((unsigned long)cur & ~PAGE_MASK) == 0)
				break;

			/* Save the hash and flags of this lu_dirpage. */
			end_hash = cur->ldp_hash_end;
			end_flags = cur->ldp_flags;

			/* Check if lu_dirpage contains no entries. */
			if (!last)
				break;

			/* Enlarge the end entry lde_reclen from 0 to
			 * first entry of next lu_dirpage.
			 */
			LASSERT(le16_to_cpu(last->lde_reclen) == 0);
			last->lde_reclen =
				cpu_to_le16((char *)(cur->ldp_entries) -
					    (char *)last);
		}

		head->ldp_hash_end = end_hash;
		head->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE);
		head->ldp_flags |= end_flags & cpu_to_le32(LDF_COLLIDE);

		kunmap(pages[pg]);
	}
	LASSERTF(nlupgs == 0, "left = %d", nlupgs);
}
#else
/* Host pages are no larger than lu_dirpages: nothing to adjust. */
#define lmv_adjust_dirpages(pages, ncfspgs, nlupgs) do {} while (0)
#endif	/* PAGE_SIZE > LU_PAGE_SIZE */
2132
2133 static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
2134                         struct page **pages, struct ptlrpc_request **request)
2135 {
2136         struct obd_device       *obd = exp->exp_obd;
2137         struct lmv_obd          *lmv = &obd->u.lmv;
2138         __u64                   offset = op_data->op_offset;
2139         int                     rc;
2140         int                     ncfspgs; /* pages read in PAGE_SIZE */
2141         int                     nlupgs; /* pages read in LU_PAGE_SIZE */
2142         struct lmv_tgt_desc     *tgt;
2143
2144         rc = lmv_check_connect(obd);
2145         if (rc)
2146                 return rc;
2147
2148         CDEBUG(D_INODE, "READPAGE at %#llx from "DFID"\n",
2149                offset, PFID(&op_data->op_fid1));
2150
2151         tgt = lmv_find_target(lmv, &op_data->op_fid1);
2152         if (IS_ERR(tgt))
2153                 return PTR_ERR(tgt);
2154
2155         rc = md_readpage(tgt->ltd_exp, op_data, pages, request);
2156         if (rc != 0)
2157                 return rc;
2158
2159         ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1)
2160                  >> PAGE_SHIFT;
2161         nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
2162         LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
2163         LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
2164
2165         CDEBUG(D_INODE, "read %d(%d)/%d pages\n", ncfspgs, nlupgs,
2166                op_data->op_npages);
2167
2168         lmv_adjust_dirpages(pages, ncfspgs, nlupgs);
2169
2170         return rc;
2171 }
2172
2173 static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data,
2174                       struct ptlrpc_request **request)
2175 {
2176         struct obd_device       *obd = exp->exp_obd;
2177         struct lmv_obd    *lmv = &obd->u.lmv;
2178         struct lmv_tgt_desc     *tgt = NULL;
2179         struct mdt_body         *body;
2180         int                  rc;
2181
2182         rc = lmv_check_connect(obd);
2183         if (rc)
2184                 return rc;
2185 retry:
2186         /* Send unlink requests to the MDT where the child is located */
2187         if (likely(!fid_is_zero(&op_data->op_fid2)))
2188                 tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2);
2189         else
2190                 tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
2191         if (IS_ERR(tgt))
2192                 return PTR_ERR(tgt);
2193
2194         op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid());
2195         op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid());
2196         op_data->op_cap = cfs_curproc_cap_pack();
2197
2198         /*
2199          * If child's fid is given, cancel unused locks for it if it is from
2200          * another export than parent.
2201          *
2202          * LOOKUP lock for child (fid3) should also be cancelled on parent
2203          * tgt_tgt in mdc_unlink().
2204          */
2205         op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
2206
2207         /*
2208          * Cancel FULL locks on child (fid3).
2209          */
2210         rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
2211                               MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
2212
2213         if (rc != 0)
2214                 return rc;
2215
2216         CDEBUG(D_INODE, "unlink with fid="DFID"/"DFID" -> mds #%d\n",
2217                PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx);
2218
2219         rc = md_unlink(tgt->ltd_exp, op_data, request);
2220         if (rc != 0 && rc != -EREMOTE)
2221                 return rc;
2222
2223         body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY);
2224         if (!body)
2225                 return -EPROTO;
2226
2227         /* Not cross-ref case, just get out of here. */
2228         if (likely(!(body->valid & OBD_MD_MDS)))
2229                 return 0;
2230
2231         CDEBUG(D_INODE, "%s: try unlink to another MDT for "DFID"\n",
2232                exp->exp_obd->obd_name, PFID(&body->fid1));
2233
2234         /* This is a remote object, try remote MDT, Note: it may
2235          * try more than 1 time here, Considering following case
2236          * /mnt/lustre is root on MDT0, remote1 is on MDT1
2237          * 1. Initially A does not know where remote1 is, it send
2238          *    unlink RPC to MDT0, MDT0 return -EREMOTE, it will
2239          *    resend unlink RPC to MDT1 (retry 1st time).
2240          *
2241          * 2. During the unlink RPC in flight,
2242          *    client B mv /mnt/lustre/remote1 /mnt/lustre/remote2
2243          *    and create new remote1, but on MDT0
2244          *
2245          * 3. MDT1 get unlink RPC(from A), then do remote lock on
2246          *    /mnt/lustre, then lookup get fid of remote1, and find
2247          *    it is remote dir again, and replay -EREMOTE again.
2248          *
2249          * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times).
2250          *
2251          * In theory, it might try unlimited time here, but it should
2252          * be very rare case.
2253          */
2254         op_data->op_fid2 = body->fid1;
2255         ptlrpc_req_finished(*request);
2256         *request = NULL;
2257
2258         goto retry;
2259 }
2260
2261 static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
2262 {
2263         struct lmv_obd *lmv = &obd->u.lmv;
2264
2265         switch (stage) {
2266         case OBD_CLEANUP_EARLY:
2267                 /* XXX: here should be calling obd_precleanup() down to
2268                  * stack.
2269                  */
2270                 break;
2271         case OBD_CLEANUP_EXPORTS:
2272                 fld_client_debugfs_fini(&lmv->lmv_fld);
2273                 lprocfs_obd_cleanup(obd);
2274                 break;
2275         default:
2276                 break;
2277         }
2278         return 0;
2279 }
2280
2281 static int lmv_get_info(const struct lu_env *env, struct obd_export *exp,
2282                         __u32 keylen, void *key, __u32 *vallen, void *val,
2283                         struct lov_stripe_md *lsm)
2284 {
2285         struct obd_device       *obd;
2286         struct lmv_obd    *lmv;
2287         int                   rc = 0;
2288
2289         obd = class_exp2obd(exp);
2290         if (!obd) {
2291                 CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
2292                        exp->exp_handle.h_cookie);
2293                 return -EINVAL;
2294         }
2295
2296         lmv = &obd->u.lmv;
2297         if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) {
2298                 int i;
2299
2300                 rc = lmv_check_connect(obd);
2301                 if (rc)
2302                         return rc;
2303
2304                 LASSERT(*vallen == sizeof(__u32));
2305                 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2306                         struct lmv_tgt_desc *tgt = lmv->tgts[i];
2307
2308                         /*
2309                          * All tgts should be connected when this gets called.
2310                          */
2311                         if (!tgt || !tgt->ltd_exp)
2312                                 continue;
2313
2314                         if (!obd_get_info(env, tgt->ltd_exp, keylen, key,
2315                                           vallen, val, NULL))
2316                                 return 0;
2317                 }
2318                 return -EINVAL;
2319         } else if (KEY_IS(KEY_MAX_EASIZE) ||
2320                    KEY_IS(KEY_DEFAULT_EASIZE) ||
2321                    KEY_IS(KEY_CONN_DATA)) {
2322                 rc = lmv_check_connect(obd);
2323                 if (rc)
2324                         return rc;
2325
2326                 /*
2327                  * Forwarding this request to first MDS, it should know LOV
2328                  * desc.
2329                  */
2330                 rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key,
2331                                   vallen, val, NULL);
2332                 if (!rc && KEY_IS(KEY_CONN_DATA))
2333                         exp->exp_connect_data = *(struct obd_connect_data *)val;
2334                 return rc;
2335         } else if (KEY_IS(KEY_TGT_COUNT)) {
2336                 *((int *)val) = lmv->desc.ld_tgt_count;
2337                 return 0;
2338         }
2339
2340         CDEBUG(D_IOCTL, "Invalid key\n");
2341         return -EINVAL;
2342 }
2343
2344 static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp,
2345                               u32 keylen, void *key, u32 vallen,
2346                               void *val, struct ptlrpc_request_set *set)
2347 {
2348         struct lmv_tgt_desc    *tgt;
2349         struct obd_device      *obd;
2350         struct lmv_obd   *lmv;
2351         int rc = 0;
2352
2353         obd = class_exp2obd(exp);
2354         if (!obd) {
2355                 CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n",
2356                        exp->exp_handle.h_cookie);
2357                 return -EINVAL;
2358         }
2359         lmv = &obd->u.lmv;
2360
2361         if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX)) {
2362                 int i, err = 0;
2363
2364                 for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2365                         tgt = lmv->tgts[i];
2366
2367                         if (!tgt || !tgt->ltd_exp)
2368                                 continue;
2369
2370                         err = obd_set_info_async(env, tgt->ltd_exp,
2371                                                  keylen, key, vallen, val, set);
2372                         if (err && rc == 0)
2373                                 rc = err;
2374                 }
2375
2376                 return rc;
2377         }
2378
2379         return -EINVAL;
2380 }
2381
2382 static int lmv_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
2383                       struct lov_stripe_md *lsm)
2384 {
2385         struct obd_device        *obd = class_exp2obd(exp);
2386         struct lmv_obd      *lmv = &obd->u.lmv;
2387         struct lmv_stripe_md      *meap;
2388         struct lmv_stripe_md      *lsmp;
2389         int                     mea_size;
2390         int                     i;
2391
2392         mea_size = lmv_get_easize(lmv);
2393         if (!lmmp)
2394                 return mea_size;
2395
2396         if (*lmmp && !lsm) {
2397                 kvfree(*lmmp);
2398                 *lmmp = NULL;
2399                 return 0;
2400         }
2401
2402         if (!*lmmp) {
2403                 *lmmp = libcfs_kvzalloc(mea_size, GFP_NOFS);
2404                 if (!*lmmp)
2405                         return -ENOMEM;
2406         }
2407
2408         if (!lsm)
2409                 return mea_size;
2410
2411         lsmp = (struct lmv_stripe_md *)lsm;
2412         meap = (struct lmv_stripe_md *)*lmmp;
2413
2414         if (lsmp->mea_magic != MEA_MAGIC_LAST_CHAR &&
2415             lsmp->mea_magic != MEA_MAGIC_ALL_CHARS)
2416                 return -EINVAL;
2417
2418         meap->mea_magic = cpu_to_le32(lsmp->mea_magic);
2419         meap->mea_count = cpu_to_le32(lsmp->mea_count);
2420         meap->mea_master = cpu_to_le32(lsmp->mea_master);
2421
2422         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2423                 meap->mea_ids[i] = lsmp->mea_ids[i];
2424                 fid_cpu_to_le(&meap->mea_ids[i], &lsmp->mea_ids[i]);
2425         }
2426
2427         return mea_size;
2428 }
2429
2430 static int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
2431                         struct lov_mds_md *lmm, int lmm_size)
2432 {
2433         struct obd_device         *obd = class_exp2obd(exp);
2434         struct lmv_stripe_md      **tmea = (struct lmv_stripe_md **)lsmp;
2435         struct lmv_stripe_md       *mea = (struct lmv_stripe_md *)lmm;
2436         struct lmv_obd       *lmv = &obd->u.lmv;
2437         int                      mea_size;
2438         int                      i;
2439         __u32                  magic;
2440
2441         mea_size = lmv_get_easize(lmv);
2442         if (!lsmp)
2443                 return mea_size;
2444
2445         if (*lsmp && !lmm) {
2446                 kvfree(*tmea);
2447                 *lsmp = NULL;
2448                 return 0;
2449         }
2450
2451         LASSERT(mea_size == lmm_size);
2452
2453         *tmea = libcfs_kvzalloc(mea_size, GFP_NOFS);
2454         if (!*tmea)
2455                 return -ENOMEM;
2456
2457         if (!lmm)
2458                 return mea_size;
2459
2460         if (mea->mea_magic == MEA_MAGIC_LAST_CHAR ||
2461             mea->mea_magic == MEA_MAGIC_ALL_CHARS ||
2462             mea->mea_magic == MEA_MAGIC_HASH_SEGMENT) {
2463                 magic = le32_to_cpu(mea->mea_magic);
2464         } else {
2465                 /*
2466                  * Old mea is not handled here.
2467                  */
2468                 CERROR("Old not supportable EA is found\n");
2469                 LBUG();
2470         }
2471
2472         (*tmea)->mea_magic = magic;
2473         (*tmea)->mea_count = le32_to_cpu(mea->mea_count);
2474         (*tmea)->mea_master = le32_to_cpu(mea->mea_master);
2475
2476         for (i = 0; i < (*tmea)->mea_count; i++) {
2477                 (*tmea)->mea_ids[i] = mea->mea_ids[i];
2478                 fid_le_to_cpu(&(*tmea)->mea_ids[i], &(*tmea)->mea_ids[i]);
2479         }
2480         return mea_size;
2481 }
2482
2483 static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
2484                              ldlm_policy_data_t *policy, enum ldlm_mode mode,
2485                              enum ldlm_cancel_flags flags, void *opaque)
2486 {
2487         struct obd_device       *obd = exp->exp_obd;
2488         struct lmv_obd    *lmv = &obd->u.lmv;
2489         int                   rc = 0;
2490         int                   err;
2491         int                   i;
2492
2493         LASSERT(fid);
2494
2495         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2496                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
2497
2498                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
2499                         continue;
2500
2501                 err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags,
2502                                        opaque);
2503                 if (!rc)
2504                         rc = err;
2505         }
2506         return rc;
2507 }
2508
2509 static int lmv_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data,
2510                              __u64 *bits)
2511 {
2512         struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
2513         struct lmv_tgt_desc *tgt = lmv->tgts[0];
2514         int                   rc;
2515
2516         if (!tgt || !tgt->ltd_exp)
2517                 return -EINVAL;
2518
2519         rc = md_set_lock_data(tgt->ltd_exp, lockh, data, bits);
2520         return rc;
2521 }
2522
2523 static enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags,
2524                                      const struct lu_fid *fid,
2525                                      enum ldlm_type type,
2526                                      ldlm_policy_data_t *policy,
2527                                      enum ldlm_mode mode,
2528                                      struct lustre_handle *lockh)
2529 {
2530         struct obd_device       *obd = exp->exp_obd;
2531         struct lmv_obd    *lmv = &obd->u.lmv;
2532         enum ldlm_mode        rc;
2533         int                   i;
2534
2535         CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
2536
2537         /*
2538          * With CMD every object can have two locks in different namespaces:
2539          * lookup lock in space of mds storing direntry and update/open lock in
2540          * space of mds storing inode. Thus we check all targets, not only that
2541          * one fid was created in.
2542          */
2543         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2544                 struct lmv_tgt_desc *tgt = lmv->tgts[i];
2545
2546                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
2547                         continue;
2548
2549                 rc = md_lock_match(tgt->ltd_exp, flags, fid, type, policy, mode,
2550                                    lockh);
2551                 if (rc)
2552                         return rc;
2553         }
2554
2555         return 0;
2556 }
2557
2558 static int lmv_get_lustre_md(struct obd_export *exp,
2559                              struct ptlrpc_request *req,
2560                              struct obd_export *dt_exp,
2561                              struct obd_export *md_exp,
2562                              struct lustre_md *md)
2563 {
2564         struct lmv_obd    *lmv = &exp->exp_obd->u.lmv;
2565         struct lmv_tgt_desc *tgt = lmv->tgts[0];
2566
2567         if (!tgt || !tgt->ltd_exp)
2568                 return -EINVAL;
2569         return md_get_lustre_md(tgt->ltd_exp, req, dt_exp, md_exp, md);
2570 }
2571
2572 static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md)
2573 {
2574         struct obd_device       *obd = exp->exp_obd;
2575         struct lmv_obd    *lmv = &obd->u.lmv;
2576         struct lmv_tgt_desc *tgt = lmv->tgts[0];
2577
2578         if (md->mea)
2579                 obd_free_memmd(exp, (void *)&md->mea);
2580         if (!tgt || !tgt->ltd_exp)
2581                 return -EINVAL;
2582         return md_free_lustre_md(tgt->ltd_exp, md);
2583 }
2584
2585 static int lmv_set_open_replay_data(struct obd_export *exp,
2586                                     struct obd_client_handle *och,
2587                                     struct lookup_intent *it)
2588 {
2589         struct obd_device       *obd = exp->exp_obd;
2590         struct lmv_obd    *lmv = &obd->u.lmv;
2591         struct lmv_tgt_desc     *tgt;
2592
2593         tgt = lmv_find_target(lmv, &och->och_fid);
2594         if (IS_ERR(tgt))
2595                 return PTR_ERR(tgt);
2596
2597         return md_set_open_replay_data(tgt->ltd_exp, och, it);
2598 }
2599
2600 static int lmv_clear_open_replay_data(struct obd_export *exp,
2601                                       struct obd_client_handle *och)
2602 {
2603         struct obd_device       *obd = exp->exp_obd;
2604         struct lmv_obd    *lmv = &obd->u.lmv;
2605         struct lmv_tgt_desc     *tgt;
2606
2607         tgt = lmv_find_target(lmv, &och->och_fid);
2608         if (IS_ERR(tgt))
2609                 return PTR_ERR(tgt);
2610
2611         return md_clear_open_replay_data(tgt->ltd_exp, och);
2612 }
2613
2614 static int lmv_get_remote_perm(struct obd_export *exp,
2615                                const struct lu_fid *fid,
2616                                __u32 suppgid, struct ptlrpc_request **request)
2617 {
2618         struct obd_device       *obd = exp->exp_obd;
2619         struct lmv_obd    *lmv = &obd->u.lmv;
2620         struct lmv_tgt_desc     *tgt;
2621         int                   rc;
2622
2623         rc = lmv_check_connect(obd);
2624         if (rc)
2625                 return rc;
2626
2627         tgt = lmv_find_target(lmv, fid);
2628         if (IS_ERR(tgt))
2629                 return PTR_ERR(tgt);
2630
2631         rc = md_get_remote_perm(tgt->ltd_exp, fid, suppgid, request);
2632         return rc;
2633 }
2634
2635 static int lmv_intent_getattr_async(struct obd_export *exp,
2636                                     struct md_enqueue_info *minfo,
2637                                     struct ldlm_enqueue_info *einfo)
2638 {
2639         struct md_op_data       *op_data = &minfo->mi_data;
2640         struct obd_device       *obd = exp->exp_obd;
2641         struct lmv_obd    *lmv = &obd->u.lmv;
2642         struct lmv_tgt_desc     *tgt = NULL;
2643         int                   rc;
2644
2645         rc = lmv_check_connect(obd);
2646         if (rc)
2647                 return rc;
2648
2649         tgt = lmv_find_target(lmv, &op_data->op_fid1);
2650         if (IS_ERR(tgt))
2651                 return PTR_ERR(tgt);
2652
2653         rc = md_intent_getattr_async(tgt->ltd_exp, minfo, einfo);
2654         return rc;
2655 }
2656
2657 static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it,
2658                                struct lu_fid *fid, __u64 *bits)
2659 {
2660         struct obd_device       *obd = exp->exp_obd;
2661         struct lmv_obd    *lmv = &obd->u.lmv;
2662         struct lmv_tgt_desc     *tgt;
2663         int                   rc;
2664
2665         rc = lmv_check_connect(obd);
2666         if (rc)
2667                 return rc;
2668
2669         tgt = lmv_find_target(lmv, fid);
2670         if (IS_ERR(tgt))
2671                 return PTR_ERR(tgt);
2672
2673         rc = md_revalidate_lock(tgt->ltd_exp, it, fid, bits);
2674         return rc;
2675 }
2676
2677 /**
2678  * For lmv, only need to send request to master MDT, and the master MDT will
2679  * process with other slave MDTs. The only exception is Q_GETOQUOTA for which
2680  * we directly fetch data from the slave MDTs.
2681  */
2682 static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp,
2683                         struct obd_quotactl *oqctl)
2684 {
2685         struct obd_device   *obd = class_exp2obd(exp);
2686         struct lmv_obd      *lmv = &obd->u.lmv;
2687         struct lmv_tgt_desc *tgt = lmv->tgts[0];
2688         int               rc = 0, i;
2689         __u64           curspace, curinodes;
2690
2691         if (!tgt || !tgt->ltd_exp || !tgt->ltd_active ||
2692             !lmv->desc.ld_tgt_count) {
2693                 CERROR("master lmv inactive\n");
2694                 return -EIO;
2695         }
2696
2697         if (oqctl->qc_cmd != Q_GETOQUOTA) {
2698                 rc = obd_quotactl(tgt->ltd_exp, oqctl);
2699                 return rc;
2700         }
2701
2702         curspace = curinodes = 0;
2703         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2704                 int err;
2705
2706                 tgt = lmv->tgts[i];
2707
2708                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active)
2709                         continue;
2710
2711                 err = obd_quotactl(tgt->ltd_exp, oqctl);
2712                 if (err) {
2713                         CERROR("getquota on mdt %d failed. %d\n", i, err);
2714                         if (!rc)
2715                                 rc = err;
2716                 } else {
2717                         curspace += oqctl->qc_dqblk.dqb_curspace;
2718                         curinodes += oqctl->qc_dqblk.dqb_curinodes;
2719                 }
2720         }
2721         oqctl->qc_dqblk.dqb_curspace = curspace;
2722         oqctl->qc_dqblk.dqb_curinodes = curinodes;
2723
2724         return rc;
2725 }
2726
2727 static int lmv_quotacheck(struct obd_device *unused, struct obd_export *exp,
2728                           struct obd_quotactl *oqctl)
2729 {
2730         struct obd_device   *obd = class_exp2obd(exp);
2731         struct lmv_obd      *lmv = &obd->u.lmv;
2732         struct lmv_tgt_desc *tgt;
2733         int               i, rc = 0;
2734
2735         for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
2736                 int err;
2737
2738                 tgt = lmv->tgts[i];
2739                 if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) {
2740                         CERROR("lmv idx %d inactive\n", i);
2741                         return -EIO;
2742                 }
2743
2744                 err = obd_quotacheck(tgt->ltd_exp, oqctl);
2745                 if (err && !rc)
2746                         rc = err;
2747         }
2748
2749         return rc;
2750 }
2751
2752 static struct obd_ops lmv_obd_ops = {
2753         .owner          = THIS_MODULE,
2754         .setup          = lmv_setup,
2755         .cleanup        = lmv_cleanup,
2756         .precleanup     = lmv_precleanup,
2757         .process_config = lmv_process_config,
2758         .connect        = lmv_connect,
2759         .disconnect     = lmv_disconnect,
2760         .statfs         = lmv_statfs,
2761         .get_info       = lmv_get_info,
2762         .set_info_async = lmv_set_info_async,
2763         .packmd         = lmv_packmd,
2764         .unpackmd       = lmv_unpackmd,
2765         .notify         = lmv_notify,
2766         .get_uuid       = lmv_get_uuid,
2767         .iocontrol      = lmv_iocontrol,
2768         .quotacheck     = lmv_quotacheck,
2769         .quotactl       = lmv_quotactl
2770 };
2771
2772 static struct md_ops lmv_md_ops = {
2773         .getstatus              = lmv_getstatus,
2774         .null_inode             = lmv_null_inode,
2775         .find_cbdata            = lmv_find_cbdata,
2776         .close                  = lmv_close,
2777         .create                 = lmv_create,
2778         .done_writing           = lmv_done_writing,
2779         .enqueue                = lmv_enqueue,
2780         .getattr                = lmv_getattr,
2781         .getxattr               = lmv_getxattr,
2782         .getattr_name           = lmv_getattr_name,
2783         .intent_lock            = lmv_intent_lock,
2784         .link                   = lmv_link,
2785         .rename                 = lmv_rename,
2786         .setattr                = lmv_setattr,
2787         .setxattr               = lmv_setxattr,
2788         .sync                   = lmv_sync,
2789         .readpage               = lmv_readpage,
2790         .unlink                 = lmv_unlink,
2791         .init_ea_size           = lmv_init_ea_size,
2792         .cancel_unused          = lmv_cancel_unused,
2793         .set_lock_data          = lmv_set_lock_data,
2794         .lock_match             = lmv_lock_match,
2795         .get_lustre_md          = lmv_get_lustre_md,
2796         .free_lustre_md         = lmv_free_lustre_md,
2797         .set_open_replay_data   = lmv_set_open_replay_data,
2798         .clear_open_replay_data = lmv_clear_open_replay_data,
2799         .get_remote_perm        = lmv_get_remote_perm,
2800         .intent_getattr_async   = lmv_intent_getattr_async,
2801         .revalidate_lock        = lmv_revalidate_lock
2802 };
2803
2804 static int __init lmv_init(void)
2805 {
2806         struct lprocfs_static_vars lvars;
2807         int                     rc;
2808
2809         lprocfs_lmv_init_vars(&lvars);
2810
2811         rc = class_register_type(&lmv_obd_ops, &lmv_md_ops,
2812                                  LUSTRE_LMV_NAME, NULL);
2813         return rc;
2814 }
2815
2816 static void lmv_exit(void)
2817 {
2818         class_unregister_type(LUSTRE_LMV_NAME);
2819 }
2820
2821 MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>");
2822 MODULE_DESCRIPTION("Lustre Logical Metadata Volume");
2823 MODULE_VERSION(LUSTRE_VERSION_STRING);
2824 MODULE_LICENSE("GPL");
2825
2826 module_init(lmv_init);
2827 module_exit(lmv_exit);